From 85ef98d9f1cfe4c4d0e9a975060fe55bc0810a72 Mon Sep 17 00:00:00 2001 From: ColinLee Date: Thu, 18 Jun 2026 11:29:32 +0800 Subject: [PATCH] update doc. --- src/.vuepress/sidebar/v2.x/en.ts | 14 + src/.vuepress/sidebar/v2.x/zh.ts | 14 + .../develop/DataFrame/TsFileDataFrame.md | 287 ++++++++++ .../InterfaceDefinition-C.md | 253 +++++++-- .../InterfaceDefinition-CPP.md | 497 +++++++----------- .../InterfaceDefinition-Python.md | 301 ++++++----- src/UserGuide/develop/Tools/Tsfile-CLI.md | 183 +++++++ src/UserGuide/develop/Tools/Tsfile-Viewer.md | 93 ++++ .../latest/DataFrame/TsFileDataFrame.md | 287 ++++++++++ .../InterfaceDefinition-C.md | 253 +++++++-- .../InterfaceDefinition-CPP.md | 497 +++++++----------- .../InterfaceDefinition-Python.md | 301 ++++++----- src/UserGuide/latest/Tools/Tsfile-CLI.md | 183 +++++++ src/UserGuide/latest/Tools/Tsfile-Viewer.md | 93 ++++ .../develop/DataFrame/TsFileDataFrame.md | 270 ++++++++++ .../InterfaceDefinition-C.md | 208 +++++++- .../InterfaceDefinition-CPP.md | 458 +++++++--------- .../InterfaceDefinition-Python.md | 259 ++++----- src/zh/UserGuide/develop/Tools/Tsfile-CLI.md | 165 ++++++ .../UserGuide/develop/Tools/Tsfile-Viewer.md | 89 ++++ .../latest/DataFrame/TsFileDataFrame.md | 270 ++++++++++ .../InterfaceDefinition-C.md | 209 +++++++- .../InterfaceDefinition-CPP.md | 459 +++++++--------- .../InterfaceDefinition-Python.md | 259 ++++----- src/zh/UserGuide/latest/Tools/Tsfile-CLI.md | 165 ++++++ .../UserGuide/latest/Tools/Tsfile-Viewer.md | 89 ++++ 26 files changed, 4323 insertions(+), 1833 deletions(-) create mode 100644 src/UserGuide/develop/DataFrame/TsFileDataFrame.md create mode 100644 src/UserGuide/develop/Tools/Tsfile-CLI.md create mode 100644 src/UserGuide/develop/Tools/Tsfile-Viewer.md create mode 100644 src/UserGuide/latest/DataFrame/TsFileDataFrame.md create mode 100644 src/UserGuide/latest/Tools/Tsfile-CLI.md create mode 100644 src/UserGuide/latest/Tools/Tsfile-Viewer.md create mode 100644 
src/zh/UserGuide/develop/DataFrame/TsFileDataFrame.md create mode 100644 src/zh/UserGuide/develop/Tools/Tsfile-CLI.md create mode 100644 src/zh/UserGuide/develop/Tools/Tsfile-Viewer.md create mode 100644 src/zh/UserGuide/latest/DataFrame/TsFileDataFrame.md create mode 100644 src/zh/UserGuide/latest/Tools/Tsfile-CLI.md create mode 100644 src/zh/UserGuide/latest/Tools/Tsfile-Viewer.md diff --git a/src/.vuepress/sidebar/v2.x/en.ts b/src/.vuepress/sidebar/v2.x/en.ts index aaa701e4f..c7ae96653 100644 --- a/src/.vuepress/sidebar/v2.x/en.ts +++ b/src/.vuepress/sidebar/v2.x/en.ts @@ -60,6 +60,20 @@ export const enSidebar = { { text: 'InterfaceDefinition-Python', link: 'InterfaceDefinition-Python' }, ], }, + { + text: 'TsFileDataFrame', + collapsible: true, + link: 'DataFrame/TsFileDataFrame', + }, + { + text: 'Tools', + collapsible: true, + prefix: 'Tools/', + children: [ + { text: 'tsfile-cli', link: 'Tsfile-CLI' }, + { text: 'tsfile-viewer', link: 'Tsfile-Viewer' }, + ], + }, /* { text: 'Ecosystem Integration', collapsible: true, diff --git a/src/.vuepress/sidebar/v2.x/zh.ts b/src/.vuepress/sidebar/v2.x/zh.ts index dd0ec7556..f1a40e158 100644 --- a/src/.vuepress/sidebar/v2.x/zh.ts +++ b/src/.vuepress/sidebar/v2.x/zh.ts @@ -60,6 +60,20 @@ export const zhSidebar = { { text: '接口定义-Python', link: 'InterfaceDefinition-Python' }, ], }, + { + text: 'TsFileDataFrame', + collapsible: true, + link: 'DataFrame/TsFileDataFrame', + }, + { + text: '工具', + collapsible: true, + prefix: 'Tools/', + children: [ + { text: 'tsfile-cli', link: 'Tsfile-CLI' }, + { text: 'tsfile-viewer', link: 'Tsfile-Viewer' }, + ], + }, /* { text: '生态集成', collapsible: true, diff --git a/src/UserGuide/develop/DataFrame/TsFileDataFrame.md b/src/UserGuide/develop/DataFrame/TsFileDataFrame.md new file mode 100644 index 000000000..b643f0225 --- /dev/null +++ b/src/UserGuide/develop/DataFrame/TsFileDataFrame.md @@ -0,0 +1,287 @@ + +# TsFileDataFrame + +`TsFileDataFrame` lets you read the time series inside one or 
more TsFiles the +same way you would work with a pandas DataFrame — without having to care about +the underlying file format or data-loading details. It is part of the Python +package (`pip install tsfile`). + +## Quick start + +```python +from tsfile import TsFileDataFrame + +df = TsFileDataFrame("table_data/") # load every .tsfile under the directory +print(df) # browse all series (metadata only) + +ts = df["weather.Beijing.humidity"] # pick one series (lazy handle) +window = ts[20:100] # slice by row index -> np.ndarray + +data = df.loc[start:end, [ # align multiple series on timestamps + "weather.Beijing.temperature", + "weather.Beijing.humidity", +]] +data.values # -> np.ndarray, shape = (N, 2) +``` + +## Core types + +`TsFileDataFrame` is built around three types: + +- **`TsFileDataFrame`** — the entry point. It loads one or more TsFiles and + exposes a unified view. Construction only scans metadata; **no values are read**. +- **`Timeseries`** — a lazy handle to a single series, obtained from `df[...]`. + It carries the series' metadata but reads nothing until you index it by row. +- **`AlignedTimeseries`** — the result of aligning several series on a common + time axis, obtained from `df.loc[...]`. It reads the requested range of the + requested series into memory at once. + +### TsFileDataFrame + +In the table below, `df` is a `TsFileDataFrame` instance, created with +`df = TsFileDataFrame(paths)`. 
+ +| Example | Operation | Returns | +|---|---|---| +| `TsFileDataFrame(paths)` | Load a file / list of files / directory | `TsFileDataFrame` | +| `len(df)` | Number of time series | `int` | +| `df.list_timeseries("weather")` | Series names, optionally filtered by prefix | `List[str]` | +| `df["weather.Beijing.humidity"]`, `df[0]`, `df[-1]` | One series | `Timeseries` | +| `df["city"]` | A metadata column (a tag / `field` / `start_time` / `end_time` / `count`) | `pandas.Series` | +| `df[0:3]`, `df[[0, 2, 5]]` | A subset view | `TsFileDataFrame` | +| `df[df["city"] == "Beijing"]` | Filter by a metadata column | `TsFileDataFrame` | +| `df.loc[start:end, series_list]` | Timestamp-aligned query | `AlignedTimeseries` | +| `df.show(max_rows=20)` / `print(df)` | Formatted metadata table | — | +| `df.close()` | Release file handles | — | + +### Timeseries + +In the table below, `ts` is a `Timeseries`, obtained from `ts = df[...]`. + +| Example | Operation | Returns | +|---|---|---| +| `ts.name` | Series name | `str` | +| `len(ts)` | Number of points | `int` | +| `ts.stats` | Series statistics | `dict` (`start_time`, `end_time`, `count`) | +| `ts[20]` | Single value | `float` (or `None` if null) | +| `ts[20:100]` | Row-range slice | `np.ndarray` | +| `ts.timestamps` | Timestamp array | `np.ndarray` | + +### AlignedTimeseries + +In the table below, `data` is an `AlignedTimeseries`, obtained from +`data = df.loc[...]`. 
+ +| Example | Operation | Returns | +|---|---|---| +| `data.timestamps` | Timestamp array | `np.ndarray` | +| `data.values` | Value matrix | `np.ndarray`, shape `(N, M)` | +| `data.series_names` | Series names | `List[str]` | +| `data.shape` | Shape `(N, M)` — N timestamps, M series | `tuple` | +| `len(data)` | Number of rows | `int` | +| `data[0]`, `data[0:10]`, `data[0, 1]` | Row / element indexing | `np.ndarray` / scalar | +| `data.show(50)` / `print(data)` | Formatted output (auto-truncated) | — | + +## Series names + +A series is uniquely identified by its **series name**, a string formed by +joining the **table name**, the **tag-column values**, and the **field name** +with `.`, in that order: + +```text +{table_name}.{tag_value_1}.{tag_value_2}...{field_name} +``` + +`list_timeseries()` returns series names; name-based indexing (`df[...]`) and +series selection in `df.loc[...]` both take a series name. + +Examples: + +- `weather.Beijing.humidity` — table `weather`, tag `Beijing`, field `humidity` +- `sensor.s1.pressure` — table `sensor`, tag `s1`, field `pressure` + +> A series name can be obtained from `list_timeseries()` and need not be +> constructed by hand; a series may also be selected by integer index (`df[0]`) +> or metadata filter (`df[df["city"] == "Beijing"]`). + +## Loading + +A path may be a single file, a list of files, or a directory: + +```python +from tsfile import TsFileDataFrame + +df = TsFileDataFrame(["data/weather.tsfile", "data/sensor.tsfile"]) +df = TsFileDataFrame("data/") # recursively find every .tsfile under the directory +print(df) +``` + +Construction only scans metadata; actual values are not read. When several files +are loaded, metadata is scanned in parallel. + +If several files contain the **same series** (e.g. daily shards of +`weather.Beijing.humidity`), they are merged into one continuous series. 
For +duplicate timestamps only the first is kept — this is not an expected situation, +so deduplicate during preprocessing to avoid metadata distortion. + +### Displaying a DataFrame + +`print(df)` (and `df.show(max_rows=...)`) prints series metadata, head/tail +truncated when large. The header is: + +```text +index │ table │ <tag_1> │ ... │ field │ start_time │ end_time │ count +``` + +For devices with different numbers of tags the tag values are left-aligned and +shorter ones are padded with `None` at the end. + +```text +TsFileDataFrame(table model, 972 time series, 5 files) + table ps_id sn frac field start_time end_time count + 0 pvf 10 30100194A00234H00572 1 pac 2024-04-02 00:00:00 2024-10-28 23:45:00 20160 + 1 pvf 10 30100194A00234H00572 1 tenmeterswindspeed 2024-04-02 00:00:00 2024-10-28 23:45:00 20160 +... +``` + +### Closing + +A `with` block closes file handles automatically; you can also close manually: + +```python +with TsFileDataFrame("data/") as df: + ... # handles released on exit + +tsdf = TsFileDataFrame("data/") +tsdf.close() # or close it yourself +``` + +## Browsing series + +`list_timeseries(path_prefix="")` lists the series names in the loaded files, +optionally filtered by a prefix. Calling it with no argument returns all series. + +```python +>>> df.list_timeseries("weather") +['weather.Beijing.humidity', 'weather.Beijing.temperature', + 'weather.Shanghai.humidity', 'weather.Shanghai.temperature'] +>>> df.list_timeseries("weather.Beijing") +['weather.Beijing.humidity', 'weather.Beijing.temperature'] +``` + +To inspect metadata such as start/end time and count, print the DataFrame (or a +subset of it) — see [Displaying a DataFrame](#displaying-a-dataframe). 
+ +## Selecting series + +`df[...]` returns a lazy `Timeseries` handle (no data read) or a subset view: + +```python +ts = df["weather.Beijing.humidity"] # by name +ts = df[0] # by index (negative indices allowed) + +sub_df = df[0:3] # slice -> TsFileDataFrame (view) +sub_df = df[[0, 2, 5]] # integer list -> TsFileDataFrame (view) +sub_df = df[df["city"] == "Beijing"] # metadata filter -> TsFileDataFrame (view) +``` + +```text +>>> df["weather.Beijing.humidity"] +Timeseries('weather.Beijing.humidity', count=2880, start=2026-01-27 00:00:00, end=2026-02-05 23:55:00) +``` + +Series metadata is served from cache (no I/O): + +```python +>>> ts = df["weather.Beijing.humidity"] +>>> ts.name +'weather.Beijing.humidity' +>>> len(ts) +2880 +>>> ts.stats +{'start_time': 1769443200000, 'end_time': 1770306900000, 'count': 2880} +``` + +## Reading data + +Indexing a `Timeseries` by row triggers the actual file read: + +```python +val = ts[20] # -> float +window = ts[20:100] # -> np.ndarray, shape = (80,) +last_ten = ts[-10:] # -> np.ndarray +sampled = ts[::2] # -> np.ndarray (strided sampling) +ts.timestamps[20:100] # -> the timestamps for those rows, np.ndarray +``` + +```text +>>> ts[20] +46.1 +>>> ts[20:100] +array([46.1 , 41.72, 52.94, ..., 76.3 , 84.35]) +>>> ts.timestamps[20:100] +array([1769449200000, 1769449500000, ..., 1769472900000]) +``` + +## Timestamp-aligned queries + +When you need several series strictly aligned on one time axis, use `.loc`: + +```python +data = df.loc[start_time:end_time, [ + "weather.Beijing.humidity", + "weather.Beijing.temperature", + "sensor.s1.pressure", +]] +``` + +The returned `AlignedTimeseries` aligns all series to the **union** of their +timestamps and fills missing positions with `NaN`: + +```python +data.timestamps # np.ndarray, millisecond timestamps +data.values # np.ndarray, shape = (N, 3) +data.series_names # ["weather.Beijing.humidity", ...] 
+data.shape # (N, 3) +data[0:10] # first 10 rows, np.ndarray shape = (10, 3) +data.show(50) # show up to 50 rows +``` + +Series may be given by name or by index, mixed freely: + +```python +df.loc[start_time:end_time, [0, 1, 4]] +df.loc[start_time:end_time, [0, "weather.Beijing.temperature", 4]] +``` + +```text +>>> df.loc[1769616000000:1769702100000, +... ['weather.Beijing.temperature', 'weather.Beijing.humidity', 'sensor.s2.pressure']] +AlignedTimeseries(288 rows, 3 series) + timestamp weather.Beijing.temperature weather.Beijing.humidity sensor.s2.pressure +2026-01-29 00:00:00 29.12 92.87 NaN +2026-01-29 00:05:00 1.55 87.34 NaN +... +``` + +The pretty-printed view shows only value columns; to read the aligned timestamp +column use `df.loc[...].timestamps`. diff --git a/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md b/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md index 9b152991b..0927966c7 100644 --- a/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md +++ b/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md @@ -32,13 +32,46 @@ typedef enum { TS_DATATYPE_FLOAT = 3, TS_DATATYPE_DOUBLE = 4, TS_DATATYPE_TEXT = 5, - TS_DATATYPE_STRING = 11 + TS_DATATYPE_TIMESTAMP = 8, + TS_DATATYPE_DATE = 9, + TS_DATATYPE_BLOB = 10, + TS_DATATYPE_STRING = 11, + TS_DATATYPE_INVALID = 255 } TSDataType; -typedef enum column_category { TAG = 0, FIELD = 1 } ColumnCategory; - -// ColumnSchema: Represents the schema of a single column, +// Value encoding. +typedef enum { + TS_ENCODING_PLAIN = 0, + TS_ENCODING_DICTIONARY = 1, + TS_ENCODING_RLE = 2, + TS_ENCODING_TS_2DIFF = 4, + TS_ENCODING_GORILLA = 8, + TS_ENCODING_ZIGZAG = 9, + TS_ENCODING_SPRINTZ = 12, + TS_ENCODING_INVALID = 255 +} TSEncoding; + +// Compression type. LZ4 is the default. 
+typedef enum { + TS_COMPRESSION_UNCOMPRESSED = 0, + TS_COMPRESSION_SNAPPY = 1, + TS_COMPRESSION_GZIP = 2, + TS_COMPRESSION_LZO = 3, + TS_COMPRESSION_LZ4 = 7, + TS_COMPRESSION_INVALID = 255 +} CompressionType; + +typedef enum column_category { + TAG = 0, + FIELD = 1, + ATTRIBUTE = 2, + TIME = 3 +} ColumnCategory; + +// ColumnSchema: Represents the schema of a single column, // including its name, data type, and category. +// Encoding/compression for columns follow the global defaults +// (see "Configuration" below). typedef struct column_schema { char* column_name; TSDataType data_type; @@ -62,6 +95,9 @@ typedef struct result_set_meta_data { } ResultSetMetaData; ``` +> `ColumnSchema` does not carry encoding/compression — those follow the global +> defaults (see [Configuration](#configuration-encoding-compression)). + ## Write Interface @@ -88,7 +124,7 @@ void free_write_file(WriteFile* write_file); ### TsFile Writer Create/Close When creating a TsFile Writer, you need to specify WriteFile and TableSchema. You can use the memory_threshold parameter in -tsfile_writer_new_with_memory_threshold to limit the memory usage of the Writer during data writing, but in the current version, this parameter does not take effect. +tsfile_writer_new_with_memory_threshold to set a memory threshold. ```C /** @@ -260,6 +296,41 @@ ERRNO tsfile_writer_write(TsFileWriter writer, Tablet tablet); +## Configuration (encoding & compression) + +Columns are stored with the **global default** encoding and compression for their +data type (a `ColumnSchema` does not carry codec settings). Use the functions +below to change those defaults *before* creating a writer. + +Each setter returns `RET_OK` (0) on success, or `RET_NOT_SUPPORT` (40) for an +unsupported data-type/encoding or compression combination. + +```C +/* Default value encoding per data type, and default compression. 
*/ +int set_datatype_encoding(uint8_t data_type, uint8_t encoding); +int set_global_compression(uint8_t compression); +uint8_t get_datatype_encoding(uint8_t data_type); +uint8_t get_global_compression(); + +/* Time column (the time data type is fixed to INT64). */ +int set_global_time_encoding(uint8_t encoding); +int set_global_time_compression(uint8_t compression); +uint8_t get_global_time_encoding(); +uint8_t get_global_time_compression(); +``` + +Allowed values: encoding accepts `PLAIN` for `BOOLEAN`; `PLAIN`/`TS_2DIFF`/ +`GORILLA`/`ZIGZAG`/`RLE`/`SPRINTZ` for `INT32`/`INT64`/`DATE`; +`PLAIN`/`TS_2DIFF`/`GORILLA`/`SPRINTZ` for `FLOAT`/`DOUBLE`; +`PLAIN`/`DICTIONARY` for `STRING`/`TEXT`. Compression accepts `UNCOMPRESSED`, +`SNAPPY`, `GZIP`, `LZO`, or `LZ4`. + +```C +// e.g. write every column with LZ4 compression +ERRNO code = set_global_compression(TS_COMPRESSION_LZ4); +if (code != RET_OK) { /* handle unsupported value */ } +``` + ## Read Interface ### TsFile Reader Create/Close @@ -291,20 +362,22 @@ ERRNO tsfile_reader_close(TsFileReader reader); -### Query table/get next/query by row +### Query table/get next ```C + /** - * @brief Queries data from the specified table and columns within a given time range. + * @brief Query data from the specific table and columns within time range. * - * @param reader [in] A valid TsFileReader handle obtained by tsfile_reader_new(). - * @param table_name [in] Name of the target table, which must exist in the TsFile. - * @param columns [in] Array of column names to be queried. - * @param column_num [in] Number of columns in the column name array. + * @param reader [in] Valid TsFileReader handle from tsfile_reader_new(). + * @param table_name [in] Target table name. Must exist in the TsFile. + * @param columns [in] Array of column names to fetch. + * @param column_num [in] Number of columns in array. * @param start_time [in] Start timestamp. 
- * @param end_time [in] End timestamp, which must be greater than or equal to start_time. - * @param err_code [out] Returns RET_OK(0) on success, otherwise returns an error code defined in errno_define_c.h. - * @return ResultSet Handle of the query result set. Must be released by free_tsfile_result_set() after use. + * @param end_time [in] End timestamp. Must be ≥ start_time. + * @param err_code [out] RET_OK(0) on success, or error code in errno_define_c.h. + * @return ResultSet Query results handle. Must be freed with + * free_tsfile_result_set(). */ ResultSet tsfile_query_table(TsFileReader reader, const char* table_name, char** columns, uint32_t column_num, @@ -312,61 +385,149 @@ ResultSet tsfile_query_table(TsFileReader reader, const char* table_name, ERRNO* err_code); /** - * @brief Checks and retrieves the next row of data in the result set. + * @brief Check and fetch the next row in the ResultSet. * - * @param result_set [in] A valid ResultSet handle. - * @param error_code [out] Returns RET_OK(0) on success, otherwise returns an error code defined in errno_define_c.h. - * @return bool - true: Next row exists, false: Reached the end or an error occurred. + * @param result_set [in] Valid ResultSet handle. + * @param error_code [out] RET_OK(0) on success, or error code in errno_define_c.h. + * @return bool - true: Row available, false: End of data or error. */ bool tsfile_result_set_next(ResultSet result_set, ERRNO* error_code); /** - * @brief Releases the resources of the result set. + * @brief Free the result set. * - * @param result_set [in] Pointer to a valid ResultSet handle. + * @param result_set [in] Pointer to a valid ResultSet handle. */ void free_tsfile_result_set(ResultSet* result_set); +``` + + + +### Filtering by tag + +**TAG columns** form the device identity (a joint primary +key) — their values are what distinguish one device from another within a table. 
+A *tag filter* restricts a query to the devices whose TAG values match a +predicate, so you read only the devices you care about. Build a filter from the +reader, pass it to one of the table-query functions below, then release it with +`tsfile_tag_filter_free()`. + +```C +// Opaque handle to a tag filter. Build it with the functions below. +typedef void* TagFilterHandle; + +// Comparison operators for a single-column TAG predicate. +typedef enum { + TAG_FILTER_EQ = 0, // column == value + TAG_FILTER_NEQ = 1, // column != value + TAG_FILTER_LT = 2, // column < value + TAG_FILTER_LTEQ = 3, // column <= value + TAG_FILTER_GT = 4, // column > value + TAG_FILTER_GTEQ = 5, // column >= value + TAG_FILTER_REGEXP = 6, // column matches the regex value + TAG_FILTER_NOT_REGEXP = 7, // column does not match the regex value +} TagFilterOp; /** - * @brief Queries time-series data by row (tree model), supporting offset and row count limitation + * @brief Create a single-column TAG predicate: `<column_name> <op> <value>`. * - * @param reader [in] A valid TsFileReader handle obtained by tsfile_reader_new() - * @param device_ids [in] Array of device IDs - * @param device_ids_len [in] Number of device IDs - * @param measurement_names [in] Array of measurement (sensor) names - * @param measurement_names_len [in] Number of measurement names - * @param offset [in] Number of starting rows to skip (must be >= 0) - * @param limit [in] Maximum number of rows to return, < 0 means no limitation - * @param err_code [out] Error code, returns E_OK(0) on success - * @return Returns ResultSet handle on success, NULL on failure + * @param reader [in] Valid TsFileReader handle. + * @param table_name [in] Table whose schema defines the TAG columns. + * @param column_name [in] Name of the TAG column to filter on. + * @param value [in] Comparison value (TAG columns are STRING). + * @param op [in] Comparison operator (TagFilterOp). + * @param err_code [out] RET_OK(0) on success, or error code in errno_define_c.h. 
+ * @return TagFilterHandle on success; NULL on failure. */ -ResultSet tsfile_reader_query_tree_by_row(TsFileReader reader, - char** device_ids, int device_ids_len, - char** measurement_names, - int measurement_names_len, int offset, - int limit, ERRNO* err_code); +TagFilterHandle tsfile_tag_filter_create(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value, TagFilterOp op, + ERRNO* err_code); /** - * @brief Queries table model data by row, supporting offset and row count limitation pushdown + * @brief Create a range predicate: lower <= column <= upper + * (pass is_not = true for NOT BETWEEN). + */ +TagFilterHandle tsfile_tag_filter_between(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* lower, const char* upper, + bool is_not, ERRNO* err_code); + +// Combine predicates. AND/OR/NOT take ownership of their children; free the root only. +TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left, TagFilterHandle right); +TagFilterHandle tsfile_tag_filter_or(TagFilterHandle left, TagFilterHandle right); +TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter); + +// Free a tag filter and all of its children. +void tsfile_tag_filter_free(TagFilterHandle filter); +``` + +### Table queries with tag filter, paging and batching + +These query functions accept an optional `tag_filter` (pass `NULL` +for no filtering) and a `batch_size` (`<= 0` returns rows one by one; `> 0` +returns a block of that size). + +```C +/** + * @brief Query a table by row, with offset/limit pushdown and an optional tag filter. 
* - * @param reader [in] A valid TsFileReader handle obtained by tsfile_reader_new() - * @param table_name [in] Name of the target table - * @param column_names [in] Array of column names to be queried - * @param column_names_len [in] Number of columns to be queried - * @param offset [in] Number of starting rows to skip (must be >= 0) - * @param limit [in] Maximum number of rows to return, < 0 means no limitation - * @param tag_filter [in] Tag filter handle - * @param batch_size [in] Batch size for data query - * @param err_code [out] Error code, returns E_OK(0) on success - * @return Returns ResultSet handle on success, NULL on failure + * @param reader [in] Valid TsFileReader handle. + * @param table_name [in] Target table name. + * @param column_names [in] Requested column names. + * @param column_names_len [in] Number of requested columns. + * @param offset [in] Leading rows to skip (>= 0). + * @param limit [in] Max rows to return; < 0 means unlimited. + * @param tag_filter [in] TAG predicate, or NULL for no filtering. + * @param batch_size [in] <= 0 row-by-row; > 0 block size. + * @param err_code [out] RET_OK(0) on success, or error code. + * @return ResultSet handle; NULL on failure. Free with free_tsfile_result_set(). */ ResultSet tsfile_reader_query_table_by_row( TsFileReader reader, const char* table_name, char** column_names, int column_names_len, int offset, int limit, TagFilterHandle tag_filter, int batch_size, ERRNO* err_code); + +/** + * @brief Query a table within a time range, with an optional tag filter and batching. + * + * @param batch_size <= 0 row-by-row return; > 0 returns a TsBlock of that size. + */ +ResultSet tsfile_query_table_batch(TsFileReader reader, const char* table_name, + char** columns, uint32_t column_num, + Timestamp start_time, Timestamp end_time, + TagFilterHandle tag_filter, int batch_size, + ERRNO* err_code); + +/** + * @brief Query a table with a tag filter (time range + TAG predicate). 
+ * + * @param batch_size <= 0 row-by-row return; > 0 returns a TsBlock of that size. + */ +ResultSet tsfile_query_table_with_tag_filter( + TsFileReader reader, const char* table_name, char** columns, + uint32_t column_num, Timestamp start_time, Timestamp end_time, + TagFilterHandle tag_filter, int batch_size, ERRNO* err_code); ``` +Example — read `temperature` only for devices whose `region` TAG equals +`shanghai`: + +```C +ERRNO ec = RET_OK; +TagFilterHandle f = tsfile_tag_filter_create( + reader, "weather", "region", "shanghai", TAG_FILTER_EQ, &ec); +char* cols[] = {"temperature"}; +ResultSet rs = tsfile_reader_query_table_by_row( + reader, "weather", cols, 1, /*offset*/ 0, /*limit*/ -1, f, /*batch*/ 0, &ec); + +// ... iterate rs with tsfile_result_set_next(), then release: +free_tsfile_result_set(&rs); +tsfile_tag_filter_free(f); +``` ### Get Data from result set diff --git a/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md b/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md index 101fcdfbf..00c2bd142 100644 --- a/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md +++ b/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md @@ -20,6 +20,67 @@ --> # Interface Definitions - C++ +## Data Types, Encoding and Compression + +These enumerations are shared by the read and write interfaces. The numeric +codes are also the values stored on disk. + +```cpp +// Supported measurement/column data types. +enum TSDataType : uint8_t { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + FLOAT = 3, + DOUBLE = 4, + TEXT = 5, + TIMESTAMP = 8, + DATE = 9, + BLOB = 10, + STRING = 11, +}; + +// Value encoding. See the table below for which encodings apply to which types. +enum TSEncoding : uint8_t { + PLAIN = 0, + DICTIONARY = 1, + RLE = 2, + TS_2DIFF = 4, + GORILLA = 8, + ZIGZAG = 9, + SPRINTZ = 12, +}; + +// Compression type. 
SNAPPY/GZIP/LZO/LZ4 depend on build options; LZ4 is the default. +enum CompressionType : uint8_t { + UNCOMPRESSED = 0, + SNAPPY = 1, + GZIP = 2, + LZO = 3, + LZ4 = 7, +}; + +// Column role within a table schema. +enum class ColumnCategory { TAG = 0, FIELD = 1, ATTRIBUTE = 2, TIME = 3 }; +``` + +Encodings applicable to each data type: + +| Encoding | Applicable types | +|---|---| +| `PLAIN` | all types | +| `DICTIONARY` | `TEXT`, `STRING` | +| `RLE` | `INT32`, `INT64`, `TIMESTAMP`, `DATE` | +| `TS_2DIFF` | `INT32`, `INT64`, `TIMESTAMP`, `DATE`, `FLOAT`, `DOUBLE` | +| `GORILLA` | `INT32`, `INT64`, `TIMESTAMP`, `DATE`, `FLOAT`, `DOUBLE` | +| `ZIGZAG` | `INT32`, `INT64` | +| `SPRINTZ` | `INT32`, `INT64`, `FLOAT`, `DOUBLE` | + +Default value encoding per type: `BOOLEAN → PLAIN`, `INT32 / INT64 → TS_2DIFF`, +`FLOAT / DOUBLE → GORILLA`, `TEXT / STRING / BLOB → PLAIN`. The default +compression is `LZ4`. See [Configuring encoding and compression](#configuring-encoding-and-compression) +for how to override these. + ## Write Interface ### TsFileTableWriter @@ -27,92 +88,53 @@ Used to write data to tsfile ```cpp -namespace storage { -class RestorableTsFileIOWriter; - /** - * @brief Supports writing structured table data to TsFile according to the specified table schema + * @brief Facilitates writing structured table data into a TsFile with a specified schema. * - * The TsFileTableWriter class is used to write structured data (especially suitable for time-series data) - * to TsFile optimized for efficient storage and querying. - * Users can define the structure of the table to be written, add data rows according to the structure, - * and serialize the data into TsFile. - * Meanwhile, this class provides the ability to limit memory usage during the writing process. + * The TsFileTableWriter class is designed to write structured data, particularly suitable for time-series data, + * into a file optimized for efficient storage and retrieval (referred to as TsFile here). 
It allows users to define + * the schema of the tables they want to write, add rows of data according to that schema, and serialize this data + * into a TsFile. Additionally, it provides options to limit memory usage during the writing process. */ class TsFileTableWriter { public: /** - * TsFileTableWriter is used to write table data to the target file according to the specified table schema, - * and can optionally limit the memory usage. - * - * @param writer_file Target file for writing table data, cannot be a null pointer - * @param table_schema Used to construct the table structure and define the schema of the table to be written - * @param memory_threshold Optional parameter. When the written data volume exceeds this threshold, - * data will be automatically flushed to disk. The default value is 128MB - */ - template - explicit TsFileTableWriter(storage::WriteFile* writer_file, T* table_schema, - uint64_t memory_threshold = 128 * 1024 * 1024) { - static_assert(!std::is_same::value, - "table_schema cannot be nullptr"); - tsfile_writer_ = std::make_shared(); - tsfile_writer_->init(writer_file); - tsfile_writer_->set_generate_table_schema(false); - - // Perform a deep copy. The source TableSchema object may be allocated on the stack/heap - auto table_schema_ptr = std::make_shared(*table_schema); - error_number = tsfile_writer_->register_table(table_schema_ptr); - exclusive_table_name_ = table_schema->get_table_name(); - common::g_config_value_.chunk_group_size_threshold_ = memory_threshold; - } - - /** - * Constructs TsFileTableWriter from a restorable TsFileIOWriter, - * supporting appending table data after failure recovery. - * The schema is read from the recovered file without additional TableSchema input. 
- * - * @param restorable_writer Recovered I/O writer; cannot be a null pointer, - * and must be opened in truncate mode to ensure can_write() returns true - * @param memory_threshold Optional memory threshold for cached data - */ - explicit TsFileTableWriter( - storage::RestorableTsFileIOWriter* restorable_writer, - uint64_t memory_threshold = 128 * 1024 * 1024); - - /** - * Registers a table schema with the writer + * TsFileTableWriter is used to write table data into a target file with the given schema, + * optionally limiting the memory usage. * - * @param table_schema The table schema to be registered, cannot be a null pointer - * @return Returns 0 on success, non-zero error code on failure + * @param writer_file Target file where the table data will be written. Must not be null. + * @param table_schema Used to construct table structures. Defines the schema of the table + * being written. + * @param memory_threshold Optional parameter. When the size of written + * data exceeds this value, the data will be automatically flushed to the + * disk. Default value is 128MB. */ - int register_table(const std::shared_ptr& table_schema); - + TsFileTableWriter(WriteFile* writer_file, + TableSchema* table_schema, + uint64_t memory_threshold = 128 * 1024 * 1024); + ~TsFileTableWriter(); /** - * Writes the specified Tablet data to the target file according to the table schema + * Writes the given tablet data into the target file according to the schema. * - * @param tablet Tablet containing the data to be written, cannot be a null pointer - * @return Returns 0 on success, non-zero error code on failure + * @param tablet The tablet containing the data to be written. Must not be null. + * @return Returns 0 on success, or a non-zero error code on failure. */ - int write_table(Tablet& tablet) const; - + int write_table(const Tablet& tablet); /** - * Flushes all cached data to the underlying storage medium to ensure all data is persisted. 
- * This method guarantees that all pending data is written to disk. + * Flushes any buffered data to the underlying storage medium, ensuring all data is written out. + * This method ensures that all pending writes are persisted. * - * @return Returns 0 on success, non-zero error code on failure + * @return Returns 0 on success, or a non-zero error code on failure. */ int flush(); - /** - * Closes the writer and releases all resources it occupies. - * No subsequent operations should be performed on the current instance after calling this method. + * Closes the writer and releases any resources held by it. + * After calling this method, no further operations should be performed on this instance. * - * @return Returns 0 on success, non-zero error code on failure + * @return Returns 0 on success, or a non-zero error code on failure. */ int close(); }; - -} // namespace storage ``` ### TableSchema @@ -150,44 +172,41 @@ class TableSchema { struct ColumnSchema { std::string column_name_; common::TSDataType data_type_; + common::CompressionType compression_; + common::TSEncoding encoding_; ColumnCategory column_category_; /** - * @brief Constructs a ColumnSchema object with the given parameters. + * @brief Constructs a ColumnSchema with explicit compression and encoding. * * @param column_name The name of the column. Must be a non-empty string. - * This name is used to identify the column within the table. - * @param data_type The data type of the measurement, such as INT32, DOUBLE, TEXT, etc. - * This determines how the data will be stored and interpreted. - * @param column_category The category of the column indicating its role or type - * within the schema, e.g., FIELD, TAG. - * Defaults to ColumnCategory::FIELD if not specified. - * @note It is the responsibility of the caller to ensure that `column_name` is not empty. + * @param data_type The data type of the column (INT32, DOUBLE, TEXT, ...). + * @param compression The compression applied to the column's chunks. 
+ * @param encoding The encoding applied to the column's values. + * @param column_category The role of the column (FIELD, TAG, ...). Defaults to FIELD. */ ColumnSchema(std::string column_name, common::TSDataType data_type, - ColumnCategory column_category = ColumnCategory::FIELD) : column_name_(std::move(column_name)), - data_type_(data_type), - column_category_(column_category) { - } -}; + common::CompressionType compression, common::TSEncoding encoding, + ColumnCategory column_category = ColumnCategory::FIELD); -/** - * @brief Represents the data type of a measurement. - * - * This enumeration defines the supported data types for measurements in the system. - */ -enum TSDataType : uint8_t { - BOOLEAN = 0, - INT32 = 1, - INT64 = 2, - FLOAT = 3, - DOUBLE = 4, - TEXT = 5, - STRING = 11 + /** + * @brief Constructs a ColumnSchema using the engine's default encoding and + * compression for the given data type. + * + * @param column_name The name of the column. Must be a non-empty string. + * @param data_type The data type of the column. + * @param column_category The role of the column. Defaults to FIELD. + */ + ColumnSchema(std::string column_name, common::TSDataType data_type, + ColumnCategory column_category = ColumnCategory::FIELD); }; - ``` +> `TAG` columns are the device identifier (joint primary key); their data type is +> always `STRING`. `FIELD` columns hold the measured values. The encoding and +> compression you set on a `ColumnSchema` apply to that column when written; the +> two-argument constructor falls back to the per-type defaults. + ### Tablet Write column memory structure @@ -253,159 +272,117 @@ public: }; ``` -### RestorableTsFileIOWriter -> V2.3.1 - -```cpp -namespace storage { -/** - * RestorableTsFileIOWriter is used to open a TsFile and perform optional recovery operations on it. - * Inherits from TsFileIOWriter and supports continuous writing after file recovery. 
- * - * (1) If the TsFile was closed normally: has_crashed()=false, can_write()=false - * - * (2) If the TsFile is incomplete / the program crashed: has_crashed()=true, - * can_write()=true. The writer will truncate the corrupted data and allow further writing. - * - * Implemented based on standard C++11, uses RAII and smart pointers to avoid memory leaks. - */ -class RestorableTsFileIOWriter : public TsFileIOWriter { - public: - RestorableTsFileIOWriter(); +### Configuring encoding and compression - /** - * Opens a TsFile for recovery / appending data. - * Uses O_RDWR|O_CREAT mode without O_TRUNC, so the original file content is preserved. - * - * @param file_path Path of the TsFile - * @param truncate_corrupted If true, truncate the corrupted data; - * If false, do not truncate (the incomplete file remains unchanged) - * @return E_OK on success, error code on failure - */ - int open(const std::string& file_path, bool truncate_corrupted = true); +Encoding and compression are chosen **per data type**: each type has a default +(see the table above). You can change those defaults, or pass an explicit +encoding/compression on a schema. - /** - * Closes the file - */ - void close(); -}; +**1. On a schema.** Pass an explicit encoding and compression when you build a +`ColumnSchema`: -} // namespace storage +```cpp +// Store column "temperature" as TS_2DIFF + LZ4. +common::ColumnSchema col("temperature", common::INT64, + common::LZ4, common::TS_2DIFF, + common::ColumnCategory::FIELD); ``` +**2. Per-type defaults.** Change the defaults *before* creating a writer; they then +apply to any column whose schema does not specify its own encoding/compression. +These helpers live in `common`/`storage` and validate their arguments (returning +`E_NOT_SUPPORT` for an unsupported combination): +```cpp +// Default value encoding per data type and default compression. 
+int common::set_datatype_encoding(uint8_t data_type, uint8_t encoding); +int common::set_global_compression(uint8_t compression); +uint8_t common::get_datatype_encoding(uint8_t data_type); +uint8_t common::get_global_compression(); + +// Time-column encoding/compression (the data type is fixed to INT64). +int common::set_global_time_encoding(uint8_t encoding); +int common::set_global_time_compression(uint8_t compression); +``` ## Read Interface ### Tsfile Reader -use to execute query in tsfile and return value by ResultSet. ```cpp -namespace storage { /** - * @brief TsFileReader provides the ability to query all files with the .tsfile suffix + * @brief TsfileReader provides the ability to query all files with the suffix + * .tsfile * - * TsFileReader is designed specifically for querying .tsfile files, supporting both tree-model queries and table-model queries. - * It also supports querying metadata such as table schemas (TableSchema) and time-series schemas (TimeseriesSchema). + * TsfileReader is designed to query .tsfile files. It accepts table-model + * queries and supports querying metadata such as TableSchema. */ class TsFileReader { public: TsFileReader(); + ~TsFileReader(); /** - * @brief Opens a TsFile + * @brief open the tsfile * - * @param file_path Path of the TsFile to be opened - * @return 0 on success, non-zero error code on failure + * @param file_path the path of the tsfile which will be opened + * @return Returns 0 on success, or a non-zero error code on failure. */ - int open(const std::string& file_path); + int open(const std::string &file_path); /** - * @brief Closes the TsFile. This method should be called after queries are completed. + * @brief close the tsfile, this method should be called after the + * query is finished * - * @return 0 on success, non-zero error code on failure + * @return Returns 0 on success, or a non-zero error code on failure. */ int close(); /** - * @brief Queries the TsFile using a query expression. 
Users can construct custom query expressions for execution. - * - * @param [in] qe Query expression - * @param [out] ret_qds Result set - * @return 0 on success, non-zero error code on failure - */ - int query(storage::QueryExpression* qe, ResultSet*& ret_qds); - /** - * @brief Queries the TsFile by path list, start time, and end time. - * This method is used for tree-model queries on TsFile. + * @brief query the tsfile by the query expression. Users can construct + * their own query expressions to query tsfile * - * @param [in] path_list Path list - * @param [in] start_time Start timestamp - * @param [in] end_time End timestamp - * @param [out] result_set Result set - * @return 0 on success, non-zero error code on failure + * @param [in] qe the query expression + * @param [out] ret_qds the result set + * @return Returns 0 on success, or a non-zero error code on failure. */ - int query(std::vector& path_list, int64_t start_time, - int64_t end_time, ResultSet*& result_set); + int query(storage::QueryExpression *qe, ResultSet *&ret_qds); /** - * @brief Queries the TsFile by table name, column names, start time, and end time. - * This method is used for table-model queries on TsFile. + * @brief query the tsfile by the table name, columns names, start time + * and end time. 
* - * @param [in] table_name Table name - * @param [in] columns_names List of column names - * @param [in] start_time Start timestamp - * @param [in] end_time End timestamp - * @param [out] result_set Result set - * @param [in] batch_size ≤ 0 for row-by-row mode; - * > 0 to return TsBlock chunks of the specified size - * @return 0 on success, non-zero error code on failure + * @param [in] table_name the table name + * @param [in] columns_names the columns names + * @param [in] start_time the start time + * @param [in] end_time the end time + * @param [out] result_set the result set */ - int query(const std::string& table_name, - const std::vector& columns_names, int64_t start_time, - int64_t end_time, ResultSet*& result_set, int batch_size = -1); + int query(const std::string &table_name, + const std::vector &columns_names, int64_t start_time, + int64_t end_time, ResultSet *&result_set); /** - * @brief Queries the TsFile by table name, column names, start time, end time, and tag filter conditions. - * This method is used for table-model queries on TsFile. + * @brief query the tsfile by the table name, columns names, start time + * and end time, tag filter. 
* - * @param [in] table_name Table name - * @param [in] columns_names List of column names - * @param [in] start_time Start timestamp - * @param [in] end_time End timestamp - * @param [in] tag_filter Tag filter condition - * @param [out] result_set Result set - * @param [in] batch_size Batch reading size - * @return 0 on success, non-zero error code on failure + * @param [in] table_name the table name + * @param [in] columns_names the columns names + * @param [in] start_time the start time + * @param [in] end_time the end time + * @param [in] tag_filter the tag filter + * @param [out] result_set the result set */ int query(const std::string& table_name, const std::vector& columns_names, int64_t start_time, - int64_t end_time, ResultSet*& result_set, Filter* tag_filter, - int batch_size = 0); + int64_t end_time, ResultSet*& result_set, Filter* tag_filter); /** - * @brief Queries tree-model time-series data by row with offset and row limit. + * @brief query a table by row, with offset/limit pushdown and an optional + * tag filter. * - * @param path_list Full paths to query (device.measurement) - * @param offset Number of starting rows to skip (>= 0) - * @param limit Maximum number of rows to return; no limit if < 0 - * @param[out] result_set Result set to store query results - * @return 0 on success, non-zero error code on failure - */ - int queryByRow(std::vector& path_list, int offset, int limit, - ResultSet*& result_set); - - /** - * @brief Queries table-model data by row with pushed-down offset and row limit. - * - * For dense devices (all columns have the same row count), - * offset/limit are pushed down to the data block/page level via SSI, - * skipping entire blocks/pages without decoding. - * For sparse devices, offset/limit take effect during row merging. - * Entire devices can be skipped directly if their total rows fall within the offset range. 
- * - * @param table_name Table name to query - * @param column_names Column names to query - * @param offset Number of starting rows to skip (>= 0) - * @param limit Maximum number of rows to return; no limit if < 0 - * @param[out] result_set Result set to store query results - * @param tag_filter Optional tag filter condition for filtering data by tag columns - * @param batch_size Batch reading size - * @return 0 on success, non-zero error code on failure + * @param [in] table_name the table name + * @param [in] column_names the column names + * @param [in] offset leading rows to skip (>= 0) + * @param [in] limit max rows to return; < 0 means unlimited + * @param [out] result_set the result set + * @param [in] tag_filter optional tag filter built with TagFilterBuilder, or nullptr + * @param [in] batch_size <= 0 returns rows one by one; > 0 returns blocks of that size + * @return Returns 0 on success, or a non-zero error code on failure. */ int queryByRow(const std::string& table_name, const std::vector& column_names, int offset, @@ -413,115 +390,37 @@ class TsFileReader { Filter* tag_filter = nullptr, int batch_size = 0); /** - * @brief Performs a table query on the tree model. - * - * @param measurement_names List of measurement names - * @param start_time Start timestamp - * @param end_time End timestamp - * @param result_set Result set - * @return 0 on success, non-zero error code on failure - */ - int query_table_on_tree(const std::vector& measurement_names, - int64_t start_time, int64_t end_time, - ResultSet*& result_set); - /** - * @brief Destroys the result set. This method should be called after the query is completed and the result set is no longer used. - * - * @param qds Result set object - */ - void destroy_query_data_set(ResultSet* qds); - /** - * @brief Reads time-series data by device ID and measurement names. 
- * - * @param device_id Device ID - * @param measurement_name List of measurement names - * @return Result set object - */ - ResultSet* read_timeseries( - const std::shared_ptr& device_id, - const std::vector& measurement_name); - /** - * @brief Gets all devices in the TsFile for a specified table. - * - * @param table_name Table name - * @return List of device IDs - */ - std::vector> get_all_devices( - std::string table_name); - - /** - * @brief Gets all device IDs in the TsFile. - * - * @return List of device IDs - */ - std::vector> get_all_device_ids(); - - /** - * @brief Gets all device IDs in the file (functionally identical to get_all_device_ids). + * @brief destroy the result set, this method should be called after the + * query is finished and result_set is no longer used * - * @return List of devices + * @param qds the result set */ - std::vector> get_all_devices(); - - /** - * @brief Gets time-series schemas by device ID and measurement names. - * - * @param [in] device_id Device ID - * @param [out] result List of measurement schemas - * @return 0 on success, non-zero error code on failure - */ - int get_timeseries_schema(std::shared_ptr device_id, - std::vector& result); - + void destroy_query_data_set(ResultSet *qds); /** - * @brief Gets time-series metadata for specified devices. - * - * Only devices existing in the file are included in the result. - * Returns an empty map if the device ID list is empty. + * @brief get the table schema by the table name * - * @param device_ids List of devices to query - * @return Mapping: Device ID -> List of time-series metadata (existing entries only) - */ - DeviceTimeseriesMetadataMap get_timeseries_metadata( - const std::vector>& device_ids); - - /** - * @brief Gets time-series metadata for all devices in the file. - * - * @return Mapping: Device ID -> List of time-series metadata - */ - DeviceTimeseriesMetadataMap get_timeseries_metadata(); - - /** - * @brief Gets the table schema by table name. 
- * - * @param table_name Table name - * @return Shared pointer to the table schema + * @param table_name the table name + * @return std::shared_ptr the table schema */ std::shared_ptr get_table_schema( - const std::string& table_name); + const std::string &table_name); /** - * @brief Gets all table schemas in the TsFile. + * @brief get all table schemas in the tsfile * - * @return List of table schemas + * @return std::vector> the table schema list */ std::vector> get_all_table_schemas(); }; ``` ### ResultSet -A collection of query.Support iterator to get data, and directly through the column name or index to get specific data. ```cpp /** * @brief ResultSet is the query result of the TsfileReader. It provides access * to the results. * * ResultSet is a virtual class. Convert it to the corresponding implementation - * class when used - * @note When using the tree model and the filter is a global time filter, - * it should be cast as QDSWithoutTimeGenerator. - * @note When using the tree model and the filter is not a global time filter, - * it should be QDSWithTimeGenerator. - * @note If the query uses the table model, the cast should be TableResultSet + * class when used. + * @note The concrete type is TableResultSet. */ class ResultSet { public: @@ -557,6 +456,7 @@ class ResultSet { */ template T get_value(const std::string& column_name); + /** * @brief Get the value of the column by column index * * @param column_index the index of the column starting from 1 @@ -586,7 +486,6 @@ class ResultSet { }; ``` ### ResultMeta -user can obtain the metadata from ResultSetMetadata, including all columnnames and data types. When a user uses a table model, the first columndefaults to the time column. 
```cpp /** * @brief metadata of result set diff --git a/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md b/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md index f74354b0d..11a09b8f8 100644 --- a/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md +++ b/src/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md @@ -39,15 +39,41 @@ class TSDataType(IntEnum): BLOB = 10 STRING = 11 +class TSEncoding(IntEnum): + """ + Value encoding accepted by the writer. The comment after each + member lists the data types it can be used with. + """ + PLAIN = 0 # all types + DICTIONARY = 1 # STRING, TEXT + RLE = 2 # INT32, INT64, TIMESTAMP, DATE + TS_2DIFF = 4 # INT32, INT64, TIMESTAMP, DATE, FLOAT, DOUBLE + GORILLA = 8 # INT32, INT64, TIMESTAMP, DATE, FLOAT, DOUBLE + ZIGZAG = 9 # INT32, INT64 + SPRINTZ = 12 # INT32, INT64, FLOAT, DOUBLE + +class Compressor(IntEnum): + """ + Compression accepted by the writer. The default is LZ4. + """ + UNCOMPRESSED = 0 + SNAPPY = 1 + GZIP = 2 + LZO = 3 + LZ4 = 7 + class ColumnCategory(IntEnum): """ Enumeration of column categories in TsFile. - TAG: Represents a tag column, used for metadata. - FIELD: Represents a field column, used for storing actual data values. + TAG: a tag column (part of the device identifier / joint primary key). + FIELD: a field column, holding the measured values. + ATTRIBUTE / TIME: reserved column roles. """ TAG = 0 FIELD = 1 + ATTRIBUTE = 2 + TIME = 3 class ColumnSchema: """Defines schema for a table column (name, datatype, category).""" @@ -95,9 +121,11 @@ class TsFileTableWriter: """ :param path: The path of tsfile, will create if it doesn't exist. :param table_schema: describes the schema of the tables want to write. + :param memory_threshold: bytes buffered before an automatic flush (default 128MB). :return: no return value. 
""" - def __init__(self, path: str, table_schema: TableSchema) + def __init__(self, path: str, table_schema: TableSchema, + memory_threshold: int = 128 * 1024 * 1024) """ @@ -106,21 +134,38 @@ class TsFileTableWriter: :return: no return value. """ def write_table(self, tablet: Tablet) - + + """ + Write a pandas DataFrame into the table. Column encoding/compression follow + the table schema (or the engine defaults). + :param dataframe: the data to write. + :return: no return value. + """ + def write_dataframe(self, dataframe: pandas.DataFrame) + + """ + Flush buffered data to disk. + :return: no return value. + """ + def flush(self) + """ Close TsFileTableWriter and flush data automatically. :return: no return value. """ def close(self) + # Usable as a context manager: + # with TsFileTableWriter(path, schema) as w: + # w.write_table(tablet) + def __enter__(self) + def __exit__(self, exc_type, exc_val, exc_tb) ``` ### Tablet definition -You can use Tablet to insert data into TsFile in batches. - ```Python class Tablet(object) """ @@ -140,6 +185,49 @@ class Tablet(object) ``` +### dataframe_to_tsfile + +```python +def dataframe_to_tsfile(dataframe: pd.DataFrame, + file_path: str, + table_name: Optional[str] = None, + time_column: Optional[str] = None, + tag_column: Optional[list[str]] = None) + """ + Write a pandas DataFrame to a TsFile. + + :param dataframe: the data to write. + :param file_path: destination .tsfile path. + :param table_name: output table name. + :param time_column: name of the column to use as the timestamp column. + :param tag_column: names of the columns to treat as TAG columns. + """ +``` + +## Configuration + +Global write defaults — the default per-type encodings, the default compression, +and the time-column encoding/compression — are exposed as a single dictionary. +Change them **before** creating a writer. 
+ +```python +from tsfile import get_tsfile_config, set_tsfile_config +from tsfile import TSEncoding, Compressor + +cfg = get_tsfile_config() # -> dict of all config values +# e.g. cfg["default_compression_type_"], cfg["int64_encoding_type_"], +# cfg["time_encoding_type_"], cfg["time_compress_type_"], ... + +set_tsfile_config({ + "default_compression_type_": Compressor.LZ4, + "int64_encoding_type_": TSEncoding.TS_2DIFF, +}) +``` + +`set_tsfile_config` validates each value and only updates the keys you pass. +Encoding/compression values are `TSEncoding` / `Compressor` members; the same +type-vs-encoding restrictions as the C++ API apply. + ## Read Interface ### TsFileReader @@ -147,141 +235,71 @@ class Tablet(object) ```python class TsFileReader: """ - Query table data and time-series data from TsFile, providing standardized file reading and query interfaces. - Supports full core capabilities including table model query, tree model query, metadata acquisition, and resource management. - """ - - def __init__(self, pathname: str): - """ - Initialize the TsFile reader for the specified path, complete file loading and underlying reader initialization, - and maintain all active query result sets to ensure all result sets are invalidated synchronously when the reader is closed. - - :param pathname: Full path of the TsFile to be read - :return: No return value - """ - - def query_table(self, table_name: str, column_names: List[str], - start_time: int = np.iinfo(np.int64).min, - end_time: int = np.iinfo(np.int64).max, - tag_filter: Optional[object] = None, - batch_size: int = 0) -> object: - """ - Perform time-range query on the specified table and columns, supporting tag filtering and batch reading mode. - Adapts to both row-by-row return and fixed-size data block return modes to meet reading requirements in different scenarios. 
- - :param table_name: Name of the target table to query, case-insensitive - :param column_names: List of target column names to retrieve; all columns are queried by default if empty - :param start_time: Start timestamp of the query range, default is the minimum value of int64 type - :param end_time: End timestamp of the query range, default is the maximum value of int64 type - :param tag_filter: Optional parameter, filter conditions based on tag columns, supporting equality, range, and logical combination filters - :param batch_size: Batch reading size; row-by-row mode is enabled when ≤ 0, data blocks are returned by the specified size when > 0 - :return: Encapsulated query result set handler for traversing data, reading data, and obtaining metadata - """ - - def query_table_on_tree(self, column_names: List[str], - start_time: int = np.iinfo(np.int64).min, - end_time: int = np.iinfo(np.int64).max) -> object: - """ - Perform table query on the tree model structure, adapted for query scenarios of native tree-structured time-series data. - Query directly based on measurement names without specifying a table name; path names are case-sensitive. - - :param column_names: List of measurement names to query, corresponding to node paths in the tree structure - :param start_time: Start timestamp of the query range, default is the minimum value of int64 type - :param end_time: End timestamp of the query range, default is the maximum value of int64 type - :return: Result set handler corresponding to the tree model query - """ - - def query_tree_by_row(self, device_ids: List[str], measurement_names: List[str], - offset: int = 0, limit: int = -1) -> object: - """ - Query tree model time-series data by row with pagination, supporting offset skipping and maximum return row limit. - Adapted for large data volume pagination reading to avoid memory overflow caused by loading excessive data at once. 
- - :param device_ids: List of device IDs to query, cannot be empty - :param measurement_names: List of measurement names to query, cannot be empty - :param offset: Number of starting rows to skip, starting from 0 by default - :param limit: Maximum number of rows to return; no limit if less than 0 - :return: Result set handler for tree model pagination query - """ - - def query_table_by_row(self, table_name: str, column_names: List[str], - offset: int = 0, limit: int = -1, - tag_filter: Optional[object] = None, - batch_size: int = 0) -> object: - """ - Query table model data by row with pagination, supporting offset and row limit pushdown, and can be used with tag filtering. - Invalid data can be skipped at the data block level for dense devices, greatly improving pagination query efficiency. - - :param table_name: Name of the target table to query - :param column_names: List of column names to query - :param offset: Number of starting rows to skip, starting from 0 by default - :param limit: Maximum number of rows to return; no limit if less than 0 - :param tag_filter: Optional parameter, tag filter condition to filter device data that meets the criteria - :param batch_size: Batch reading size, adapted to the underlying data block reading logic - :return: Result set handler for table model pagination query - """ - - def query_timeseries(self, device_name: str, sensor_list: List[str], - start_time: int = 0, end_time: int = 0) -> object: - """ - Perform time-range time-series data query for a single specified device. - Adapted for precise query scenarios of a single device with multiple sensors, simplifying query invocation logic. 
- - :param device_name: Name/path of the target device - :param sensor_list: List of sensor (measurement) names to query - :param start_time: Query start timestamp; starts from the earliest time of the file by default if 0 - :param end_time: Query end timestamp; ends at the latest time of the file by default if 0 - :return: Result set handler for single-device time-series query - """ - - def get_table_schema(self, table_name: str) -> object: - """ - Get the complete schema information of the specified table, including full metadata such as column names, data types, tag columns, and time-series constraints. - Used to verify the legality of query fields in advance and parse data structures. - - :param table_name: Name of the target table - :return: Schema information object of the corresponding table, containing full configuration of the table structure - """ - - def get_all_table_schemas(self) -> Dict[str, object]: - """ - Get schema information of all tables in the current TsFile. - Traverse all data table structures in the file with one click without querying table by table. - - :return: Dictionary structure, key is table name, value is schema information object of the corresponding table - """ - - def get_all_timeseries_schemas(self) -> List[object]: - """ - Get schema information of all time-series in the TsFile. - Covers field, type, and constraint information of full time-series data in both tree model and table model. - - :return: List of all time-series schema information - """ - - def get_all_devices(self) -> List[str]: - """ - Get identification information of all devices in the TsFile. - Can traverse all devices in the file, adapted for full-device statistics and batch query pre-operations. 
- - :return: List composed of all device IDs/device paths - """ - - def get_timeseries_metadata(self, device_ids: Optional[List[str]] = None) -> Dict[str, object]: - """ - Get time-series metadata of specified devices, including data storage segments, field constraints, data ranges, etc. - Returns metadata of all devices by default if no device ID is passed, returns an empty dictionary if an empty list is passed. - - :param device_ids: Optional parameter, list of device IDs to query metadata for - :return: Dictionary structure, key is device path, value is time-series metadata group of the corresponding device - """ - - def close(self) -> None: - """ - Close the TsFile reader, release underlying file handles and memory resources. - Mark all current active query result sets as invalid and prohibit subsequent data reading operations. - No query or metadata acquisition operations can be performed after closing; the reader needs to be reinitialized. - """ + Query table data from a TsFile. + """ + + """ + Initialize a TsFile reader for the specified file path. + :param pathname: The path to the TsFile. + :return: no return value. + """ + def __init__(self, pathname) + + + """ + Executes a time range query on the specified table and columns. + + :param table_name: The name of the table to query. + :param column_names: A list of column names to retrieve. + :param start_time: The start time of the query range (default: minimum int64 value). + :param end_time: The end time of the query range (default: maximum int64 value). + :return: A query result set handler. + """ + def query_table(self, table_name : str, column_names : List[str], + start_time : int = np.iinfo(np.int64).min, + end_time: int = np.iinfo(np.int64).max) -> ResultSet + + """ + Execute a table query by row, with offset/limit pushdown and an optional + tag filter. A TAG predicate restricts the query to the devices whose + TAG-column values match. 
Build a filter with the helpers in tsfile.tag_filter + (tag_eq, tag_neq, tag_lt, tag_lteq, tag_gt, tag_gteq, tag_between, ...) and + combine filters with &, | and ~. + + :param table_name: The name of the table to query. + :param column_names: A list of column names to retrieve. + :param offset: Number of leading rows to skip (default 0). + :param limit: Maximum number of rows to return; < 0 means unlimited. + :param tag_filter: Optional tag predicate (TagFilter), or None for no filtering. + :param batch_size: <= 0 returns rows one by one; > 0 returns blocks of that size. + :return: A query result set handler. + """ + def query_table_by_row(self, table_name : str, column_names : List[str], + offset : int = 0, limit : int = -1, + tag_filter = None, batch_size : int = 0) -> ResultSet + + """ + Retrieves the schema of the specified table. + + :param table_name: The name of the table. + :return: The schema of the specified table. + """ + def get_table_schema(self, table_name : str)-> TableSchema + + + """ + Retrieves the schemas of all tables in the TsFile. + + :return: A dictionary mapping table names to their schemas. + """ + def get_all_table_schemas(self) ->dict[str, TableSchema] + + + """ + Closes the TsFile reader. If the reader has active result sets, they will be invalidated. + """ + def close(self) + ``` ### ResultSet @@ -388,7 +406,6 @@ def to_dataframe(file_path: str, Read data from a TsFile and convert it into a Pandas DataFrame or an iterator of DataFrames. - This function supports both table-model and tree-model TsFiles. Users can filter data by table name, column names, time range, and maximum number of rows. 
diff --git a/src/UserGuide/develop/Tools/Tsfile-CLI.md b/src/UserGuide/develop/Tools/Tsfile-CLI.md new file mode 100644 index 000000000..d335c1bb9 --- /dev/null +++ b/src/UserGuide/develop/Tools/Tsfile-CLI.md @@ -0,0 +1,183 @@ + +# tsfile-cli + +`tsfile-cli` is a single, pipe-friendly C++ command-line tool for inspecting +**and** importing Apache TsFile (`.tsfile`) files from the shell. Read commands print data to **stdout** and +diagnostics to **stderr**, so they compose with `awk`, `jq`, `sort`, and friends; +the `write` command imports CSV/TSV into a new `.tsfile`. It is built on the +public `TsFileReader` and `TsFileTableWriter` APIs. + +## Building from source + +The CLI is part of the C++ module. Build it with the Maven wrapper, which +downloads a pinned CMake and compiles the whole C++ module (the `libtsfile` +shared library + the `tsfile-cli` executable) for you. + +**Prerequisites:** a JDK (8+) to run Maven, and a C++11 compiler (GCC / Clang). +The third-party C++ dependencies (Snappy, LZ4, LZOKAY, Zlib, …) are bundled under +`cpp/third_party/` and built automatically. + +From the repository root: + +```bash +./mvnw clean package -P with-cpp +``` + +This produces, under `cpp/target/build/`: + +| Artifact | Path | +|---|---| +| CLI executable | `cpp/target/build/bin/tsfile-cli` | +| Shared library | `cpp/target/build/lib/libtsfile.so` (Linux) — `libtsfile.dylib` on macOS | + +`tsfile-cli` is dynamically linked against `libtsfile`. Run it **in place** by its +full path and it finds the library automatically: + +```bash +cpp/target/build/bin/tsfile-cli --version # -> tsfile-cli (Apache TsFile C++) +cpp/target/build/bin/tsfile-cli --help +``` + +To run the binary from **somewhere else** (e.g. after copying it out of the build +tree), the dynamic loader must be able to find `libtsfile.so`. 
Either point the +loader at the build's `lib/` directory, or copy the library to a standard +location: + +```bash +# point the loader at the build's lib directory (Linux; macOS uses DYLD_LIBRARY_PATH) +export LD_LIBRARY_PATH=/path/to/cpp/target/build/lib:$LD_LIBRARY_PATH + +# — or — copy the library to a system library path +sudo cp cpp/target/build/lib/libtsfile.so /usr/local/lib/ && sudo ldconfig +``` + +## Usage + +```text +tsfile-cli <command> [options] <file> +tsfile-cli --help | --version | help +``` + +Exit codes: `0` success, `1` usage/argument error, `2` file open/corrupt, +`3` query/runtime error. + +### Reading + +| Command | Description | +|---|---| +| `ls` | List devices (tree model) or tables (table model), one name per line | +| `schema` | Per-series `target, measurement, datatype, encoding, compression` | +| `meta` | File summary: model, device/table/series counts, time range, file size | +| `stats` | Per-series `count, start_time, end_time, min, max, first, last, sum` | +| `count` | Per-series row counts plus a `total` row (from statistics, no page scan) | +| `head` | First N rows (default 10; use `-n`) | +| `cat` | All matching rows, streamed (`table` format buffers to align columns) | +| `sample` | Reproducible reservoir sample (default 10; `-n`, `--seed`) | + +The metadata commands (`ls` / `schema` / `meta` / `stats` / `count`) answer most +questions **without decoding data pages**. 
+ +Shared options: + +| Option | Meaning | +|---|---| +| `-f, --format csv\|tsv\|json\|table` | Output format; defaults to `table` on a TTY, `tsv` when piped | +| `-d, --device <name>` / `-t, --table <name>` | Scope to one device / table (mutually exclusive) | +| `-m, --measurements a,b,c` | Column projection (`schema`, `stats`, `count`, `head`, `cat`, `sample`) | +| `-n, --limit N` / `--offset N` | Max rows / rows to skip (`head`, `cat`; `--offset` not valid for `sample`) | +| `--start <ms>` / `--end <ms>` | Inclusive epoch-millisecond time range (`head`, `cat`, `sample`) | +| `--seed N` | Reproducible sampling seed (`sample` only) | +| `--tag-filter C OP V` / `--tag-between C L U` / `--tag-not-between C L U` | Table TAG predicate for `head`, `cat`, `sample`; `OP` is `eq`, `neq`, `lt`, `lteq`, `gt`, `gteq`, `regexp`, or `not-regexp` | +| `--no-header` | Omit the header row | +| `--model tree\|table` | Force the model (otherwise auto-detected) | + +`json` output is NDJSON (one object per line; numbers/booleans bare, other values +quoted, nulls as `null`; non-finite floats — NaN/Inf — become `null`). CSV output +follows RFC 4180. Timestamps are raw epoch milliseconds. The `table` format +buffers all rows in memory to align columns, so prefer `csv`/`tsv`/`json` when +dumping large files. + +```bash +BIN=cpp/target/build/bin/tsfile-cli +$BIN ls -f tsv data.tsfile # list tables / devices +$BIN meta data.tsfile # quick file overview +$BIN count -t table1 -f tsv data.tsfile # row counts, no page scan +$BIN cat -t table1 --tag-filter device eq dev_1 -m temp -f tsv data.tsfile +$BIN cat -m temp,humidity --start 1700000000000 -f csv data.tsfile | head +$BIN sample -m temp -n 20 --seed 42 -f json data.tsfile | jq . +``` + +> For a table-model file, the row commands (`head` / `cat` / `sample`) query the +> **first** table unless you pass `-t <name>`. `count` covers all tables. 
+ +### Writing (import) + +`tsfile-cli write` imports CSV/TSV rows into a **new table-model** `.tsfile` (the +output is overwritten). The first input column is the timestamp (epoch +milliseconds); the remaining columns are declared explicitly with `--columns` — +there is no type inference. + +Timestamps must be **strictly increasing per device**, where a device is +identified by its `tag` column values (rows that share the same tags form one +device's timeline). Rows for different tag combinations may freely interleave and +reuse timestamps. Out-of-order input is rejected with the offending line number, +and a failed import leaves no output file behind. `--output` must differ from the +input file. + +```text +tsfile-cli write --table <name> --columns <spec> -o <output> \ + [-f csv|tsv] [--no-header] [--header-match] [-v] [<input> | -] +``` + +`--columns` is a comma-separated list of `name:TYPE:category`, where `category` +(case-insensitive) is `tag` or `field` and `TYPE` (case-insensitive) is one of +`BOOLEAN, INT32, INT64, FLOAT, DOUBLE, STRING, TEXT, TIMESTAMP, DATE, BLOB` — for +example `--columns "id1:STRING:tag,s1:INT64:field"`. `DATE` cells are written as +`YYYY-MM-DD`; `TIMESTAMP` cells as epoch milliseconds. Each column is stored with +the engine's default encoding and compression for its type. + +| Option | Meaning | +|---|---| +| `--table <name>` | Output table name (lower-cased) | +| `--columns <spec>` | Ordered data columns (excludes the leading timestamp column) | +| `-o, --output <output>` | Output `.tsfile` (required; overwritten) | +| `<input>` / `-` | Input file, or `-` / omitted for stdin | +| `-f csv\|tsv` | Input delimiter (default csv; `json` / `table` are rejected) | +| `--no-header` | Input has no header row (default: first line is a header and is skipped) | +| `--header-match` | Validate header names against `--columns` | +| `-v, --verbose` | Print `wrote N rows to <output>` to stderr (otherwise silent on success) | + +An empty cell is written as null. 
The command is silent on success (Unix-style); +pass `-v` for a one-line summary. + +```bash +# round-trip through a pipe +printf 'time,id1,s1\n0,dev,0\n1,dev,10\n' \ + | tsfile-cli write --table t1 --columns "id1:STRING:tag,s1:INT64:field" -o out.tsfile - +tsfile-cli count -f tsv out.tsfile # -> t1.dev s1 2 +``` + +## Using the skill with an AI assistant + +`cpp/tools/skills/tsfile-cli/SKILL.md` is a machine-readable reference that +documents how to drive `tsfile-cli`. AI coding assistants that support skills can +load it to help you inspect and import `.tsfile` files. diff --git a/src/UserGuide/develop/Tools/Tsfile-Viewer.md b/src/UserGuide/develop/Tools/Tsfile-Viewer.md new file mode 100644 index 000000000..c4f2a4b7c --- /dev/null +++ b/src/UserGuide/develop/Tools/Tsfile-Viewer.md @@ -0,0 +1,93 @@ + +# tsfile-viewer + +[Apache TsFile Viewer](https://github.com/apache/tsfile-viewer) is a web-based +application for browsing and analyzing TsFile data in your browser. It pairs a +Spring Boot backend (which reads `.tsfile` files via the Apache TsFile library) +with a Vue 3 frontend that renders metadata, paginated tables, and interactive +charts. + +- **Repository:** <https://github.com/apache/tsfile-viewer> +- **License:** Apache-2.0 + +## Features + +- **File browsing and upload** — open `.tsfile` files from the UI. +- **Metadata display** — schema, devices, and measurements. +- **Paginated data tables** with filtering by time range, devices, measurements, + and value range. +- **Interactive charts** (ECharts) with multi-series overlay and aggregation. +- **Both data models** — supports tree-model and table-model TsFiles. +- **Export** — data as CSV or JSON; charts as PNG or SVG. +- **Performance** — chunk-level reading and metadata caching. 
+ +## Requirements + +| Component | Version | +|---|---| +| JDK | 17 or 21 (LTS) | +| Maven | 3.9+ | +| Node.js | `^20.19.0 \|\| >=22.12.0` | +| pnpm | latest | +| Apache TsFile | 2.3.0 (bundled dependency) | + +## Get the source + +Clone the repository, then build and run it as shown below: + +```bash +git clone https://github.com/apache/tsfile-viewer.git +cd tsfile-viewer +``` + +## Running from source (development) + +Run the backend and frontend in two terminals. + +**Backend** (Spring Boot): + +```bash +cd backend +mvn spring-boot:run +``` + +**Frontend** (Vue + Vite dev server): + +```bash +cd frontend +pnpm install +pnpm dev +``` + +Then open the dev UI at . + +## Building and running a production bundle + +Build a self-contained distribution, then launch the packaged jar (the frontend +is served by the backend): + +```bash +./build-dist.sh +java -jar backend/target/tsfile-viewer-*.jar +``` + +Open the app at . diff --git a/src/UserGuide/latest/DataFrame/TsFileDataFrame.md b/src/UserGuide/latest/DataFrame/TsFileDataFrame.md new file mode 100644 index 000000000..b643f0225 --- /dev/null +++ b/src/UserGuide/latest/DataFrame/TsFileDataFrame.md @@ -0,0 +1,287 @@ + +# TsFileDataFrame + +`TsFileDataFrame` lets you read the time series inside one or more TsFiles the +same way you would work with a pandas DataFrame — without having to care about +the underlying file format or data-loading details. It is part of the Python +package (`pip install tsfile`). 
+ +## Quick start + +```python +from tsfile import TsFileDataFrame + +df = TsFileDataFrame("table_data/") # load every .tsfile under the directory +print(df) # browse all series (metadata only) + +ts = df["weather.Beijing.humidity"] # pick one series (lazy handle) +window = ts[20:100] # slice by row index -> np.ndarray + +data = df.loc[start:end, [ # align multiple series on timestamps + "weather.Beijing.temperature", + "weather.Beijing.humidity", +]] +data.values # -> np.ndarray, shape = (N, 2) +``` + +## Core types + +`TsFileDataFrame` is built around three types: + +- **`TsFileDataFrame`** — the entry point. It loads one or more TsFiles and + exposes a unified view. Construction only scans metadata; **no values are read**. +- **`Timeseries`** — a lazy handle to a single series, obtained from `df[...]`. + It carries the series' metadata but reads nothing until you index it by row. +- **`AlignedTimeseries`** — the result of aligning several series on a common + time axis, obtained from `df.loc[...]`. It reads the requested range of the + requested series into memory at once. + +### TsFileDataFrame + +In the table below, `df` is a `TsFileDataFrame` instance, created with +`df = TsFileDataFrame(paths)`. 
+ +| Example | Operation | Returns | +|---|---|---| +| `TsFileDataFrame(paths)` | Load a file / list of files / directory | `TsFileDataFrame` | +| `len(df)` | Number of time series | `int` | +| `df.list_timeseries("weather")` | Series names, optionally filtered by prefix | `List[str]` | +| `df["weather.Beijing.humidity"]`, `df[0]`, `df[-1]` | One series | `Timeseries` | +| `df["city"]` | A metadata column (a tag / `field` / `start_time` / `end_time` / `count`) | `pandas.Series` | +| `df[0:3]`, `df[[0, 2, 5]]` | A subset view | `TsFileDataFrame` | +| `df[df["city"] == "Beijing"]` | Filter by a metadata column | `TsFileDataFrame` | +| `df.loc[start:end, series_list]` | Timestamp-aligned query | `AlignedTimeseries` | +| `df.show(max_rows=20)` / `print(df)` | Formatted metadata table | — | +| `df.close()` | Release file handles | — | + +### Timeseries + +In the table below, `ts` is a `Timeseries`, obtained from `ts = df[...]`. + +| Example | Operation | Returns | +|---|---|---| +| `ts.name` | Series name | `str` | +| `len(ts)` | Number of points | `int` | +| `ts.stats` | Series statistics | `dict` (`start_time`, `end_time`, `count`) | +| `ts[20]` | Single value | `float` (or `None` if null) | +| `ts[20:100]` | Row-range slice | `np.ndarray` | +| `ts.timestamps` | Timestamp array | `np.ndarray` | + +### AlignedTimeseries + +In the table below, `data` is an `AlignedTimeseries`, obtained from +`data = df.loc[...]`. 
+ +| Example | Operation | Returns | +|---|---|---| +| `data.timestamps` | Timestamp array | `np.ndarray` | +| `data.values` | Value matrix | `np.ndarray`, shape `(N, M)` | +| `data.series_names` | Series names | `List[str]` | +| `data.shape` | Shape `(N, M)` — N timestamps, M series | `tuple` | +| `len(data)` | Number of rows | `int` | +| `data[0]`, `data[0:10]`, `data[0, 1]` | Row / element indexing | `np.ndarray` / scalar | +| `data.show(50)` / `print(data)` | Formatted output (auto-truncated) | — | + +## Series names + +A series is uniquely identified by its **series name**, a string formed by +joining the **table name**, the **tag-column values**, and the **field name** +with `.`, in that order: + +```text +{table_name}.{tag_value_1}.{tag_value_2}...{field_name} +``` + +`list_timeseries()` returns series names; name-based indexing (`df[...]`) and +series selection in `df.loc[...]` both take a series name. + +Examples: + +- `weather.Beijing.humidity` — table `weather`, tag `Beijing`, field `humidity` +- `sensor.s1.pressure` — table `sensor`, tag `s1`, field `pressure` + +> A series name can be obtained from `list_timeseries()` and need not be +> constructed by hand; a series may also be selected by integer index (`df[0]`) +> or metadata filter (`df[df["city"] == "Beijing"]`). + +## Loading + +A path may be a single file, a list of files, or a directory: + +```python +from tsfile import TsFileDataFrame + +df = TsFileDataFrame(["data/weather.tsfile", "data/sensor.tsfile"]) +df = TsFileDataFrame("data/") # recursively find every .tsfile under the directory +print(df) +``` + +Construction only scans metadata; actual values are not read. When several files +are loaded, metadata is scanned in parallel. + +If several files contain the **same series** (e.g. daily shards of +`weather.Beijing.humidity`), they are merged into one continuous series. 
For +duplicate timestamps only the first is kept — this is not an expected situation, +so deduplicate during preprocessing to avoid metadata distortion. + +### Displaying a DataFrame + +`print(df)` (and `df.show(max_rows=...)`) prints series metadata, head/tail +truncated when large. The header is: + +```text +index │ table │ │ ... │ field │ start_time │ end_time │ count +``` + +For devices with different numbers of tags the tag values are left-aligned and +shorter ones are padded with `None` at the end. + +```text +TsFileDataFrame(table model, 972 time series, 5 files) + table ps_id sn frac field start_time end_time count + 0 pvf 10 30100194A00234H00572 1 pac 2024-04-02 00:00:00 2024-10-28 23:45:00 20160 + 1 pvf 10 30100194A00234H00572 1 tenmeterswindspeed 2024-04-02 00:00:00 2024-10-28 23:45:00 20160 +... +``` + +### Closing + +A `with` block closes file handles automatically; you can also close manually: + +```python +with TsFileDataFrame("data/") as df: + ... # handles released on exit + +tsdf = TsFileDataFrame("data/") +tsdf.close() # or close it yourself +``` + +## Browsing series + +`list_timeseries(path_prefix="")` lists the series names in the loaded files, +optionally filtered by a prefix. Calling it with no argument returns all series. + +```python +>>> df.list_timeseries("weather") +['weather.Beijing.humidity', 'weather.Beijing.temperature', + 'weather.Shanghai.humidity', 'weather.Shanghai.temperature'] +>>> df.list_timeseries("weather.Beijing") +['weather.Beijing.humidity', 'weather.Beijing.temperature'] +``` + +To inspect metadata such as start/end time and count, print the DataFrame (or a +subset of it) — see [Displaying a DataFrame](#displaying-a-dataframe). 
+ +## Selecting series + +`df[...]` returns a lazy `Timeseries` handle (no data read) or a subset view: + +```python +ts = df["weather.Beijing.humidity"] # by name +ts = df[0] # by index (negative indices allowed) + +sub_df = df[0:3] # slice -> TsFileDataFrame (view) +sub_df = df[[0, 2, 5]] # integer list -> TsFileDataFrame (view) +sub_df = df[df["city"] == "Beijing"] # metadata filter -> TsFileDataFrame (view) +``` + +```text +>>> df["weather.Beijing.humidity"] +Timeseries('weather.Beijing.humidity', count=2880, start=2026-01-27 00:00:00, end=2026-02-05 23:55:00) +``` + +Series metadata is served from cache (no I/O): + +```python +>>> ts = df["weather.Beijing.humidity"] +>>> ts.name +'weather.Beijing.humidity' +>>> len(ts) +2880 +>>> ts.stats +{'start_time': 1769443200000, 'end_time': 1770306900000, 'count': 2880} +``` + +## Reading data + +Indexing a `Timeseries` by row triggers the actual file read: + +```python +val = ts[20] # -> float +window = ts[20:100] # -> np.ndarray, shape = (80,) +last_ten = ts[-10:] # -> np.ndarray +sampled = ts[::2] # -> np.ndarray (strided sampling) +ts.timestamps[20:100] # -> the timestamps for those rows, np.ndarray +``` + +```text +>>> ts[20] +46.1 +>>> ts[20:100] +array([46.1 , 41.72, 52.94, ..., 76.3 , 84.35]) +>>> ts.timestamps[20:100] +array([1769449200000, 1769449500000, ..., 1769472900000]) +``` + +## Timestamp-aligned queries + +When you need several series strictly aligned on one time axis, use `.loc`: + +```python +data = df.loc[start_time:end_time, [ + "weather.Beijing.humidity", + "weather.Beijing.temperature", + "sensor.s1.pressure", +]] +``` + +The returned `AlignedTimeseries` aligns all series to the **union** of their +timestamps and fills missing positions with `NaN`: + +```python +data.timestamps # np.ndarray, millisecond timestamps +data.values # np.ndarray, shape = (N, 3) +data.series_names # ["weather.Beijing.humidity", ...] 
+data.shape # (N, 3) +data[0:10] # first 10 rows, np.ndarray shape = (10, 3) +data.show(50) # show up to 50 rows +``` + +Series may be given by name or by index, mixed freely: + +```python +df.loc[start_time:end_time, [0, 1, 4]] +df.loc[start_time:end_time, [0, "weather.Beijing.temperature", 4]] +``` + +```text +>>> df.loc[1769616000000:1769702100000, +... ['weather.Beijing.temperature', 'weather.Beijing.humidity', 'sensor.s2.pressure']] +AlignedTimeseries(288 rows, 3 series) + timestamp weather.Beijing.temperature weather.Beijing.humidity sensor.s2.pressure +2026-01-29 00:00:00 29.12 92.87 NaN +2026-01-29 00:05:00 1.55 87.34 NaN +... +``` + +The pretty-printed view shows only value columns; to read the aligned timestamp +column use `df.loc[...].timestamps`. diff --git a/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md b/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md index 9b152991b..0927966c7 100644 --- a/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md +++ b/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md @@ -32,13 +32,46 @@ typedef enum { TS_DATATYPE_FLOAT = 3, TS_DATATYPE_DOUBLE = 4, TS_DATATYPE_TEXT = 5, - TS_DATATYPE_STRING = 11 + TS_DATATYPE_TIMESTAMP = 8, + TS_DATATYPE_DATE = 9, + TS_DATATYPE_BLOB = 10, + TS_DATATYPE_STRING = 11, + TS_DATATYPE_INVALID = 255 } TSDataType; -typedef enum column_category { TAG = 0, FIELD = 1 } ColumnCategory; - -// ColumnSchema: Represents the schema of a single column, +// Value encoding. +typedef enum { + TS_ENCODING_PLAIN = 0, + TS_ENCODING_DICTIONARY = 1, + TS_ENCODING_RLE = 2, + TS_ENCODING_TS_2DIFF = 4, + TS_ENCODING_GORILLA = 8, + TS_ENCODING_ZIGZAG = 9, + TS_ENCODING_SPRINTZ = 12, + TS_ENCODING_INVALID = 255 +} TSEncoding; + +// Compression type. LZ4 is the default. 
+typedef enum { + TS_COMPRESSION_UNCOMPRESSED = 0, + TS_COMPRESSION_SNAPPY = 1, + TS_COMPRESSION_GZIP = 2, + TS_COMPRESSION_LZO = 3, + TS_COMPRESSION_LZ4 = 7, + TS_COMPRESSION_INVALID = 255 +} CompressionType; + +typedef enum column_category { + TAG = 0, + FIELD = 1, + ATTRIBUTE = 2, + TIME = 3 +} ColumnCategory; + +// ColumnSchema: Represents the schema of a single column, // including its name, data type, and category. +// Encoding/compression for columns follow the global defaults +// (see "Configuration" below). typedef struct column_schema { char* column_name; TSDataType data_type; @@ -62,6 +95,9 @@ typedef struct result_set_meta_data { } ResultSetMetaData; ``` +> `ColumnSchema` does not carry encoding/compression — those follow the global +> defaults (see [Configuration](#configuration-encoding--compression)). + ## Write Interface @@ -88,7 +124,7 @@ void free_write_file(WriteFile* write_file); ### TsFile Writer Create/Close When creating a TsFile Writer, you need to specify WriteFile and TableSchema. You can use the memory_threshold parameter in -tsfile_writer_new_with_memory_threshold to limit the memory usage of the Writer during data writing, but in the current version, this parameter does not take effect. +tsfile_writer_new_with_memory_threshold to set a memory threshold. ```C /** @@ -260,6 +296,41 @@ ERRNO tsfile_writer_write(TsFileWriter writer, Tablet tablet); +## Configuration (encoding & compression) + +Columns are stored with the **global default** encoding and compression for their +data type (a `ColumnSchema` does not carry codec settings). Change those +defaults *before* creating a writer with the functions below. + +Each setter returns `RET_OK` (0) on success, or `RET_NOT_SUPPORT` (40) for an +unsupported data-type/encoding or compression combination. + +```C +/* Default value encoding per data type, and default compression. 
*/ +int set_datatype_encoding(uint8_t data_type, uint8_t encoding); +int set_global_compression(uint8_t compression); +uint8_t get_datatype_encoding(uint8_t data_type); +uint8_t get_global_compression(); + +/* Time column (the time data type is fixed to INT64). */ +int set_global_time_encoding(uint8_t encoding); +int set_global_time_compression(uint8_t compression); +uint8_t get_global_time_encoding(); +uint8_t get_global_time_compression(); +``` + +Allowed values: encoding accepts `PLAIN` for `BOOLEAN`; `PLAIN`/`TS_2DIFF`/ +`GORILLA`/`ZIGZAG`/`RLE`/`SPRINTZ` for `INT32`/`INT64`/`DATE`; +`PLAIN`/`TS_2DIFF`/`GORILLA`/`SPRINTZ` for `FLOAT`/`DOUBLE`; +`PLAIN`/`DICTIONARY` for `STRING`/`TEXT`. Compression accepts `UNCOMPRESSED`, +`SNAPPY`, `GZIP`, `LZO`, or `LZ4`. + +```C +// e.g. write every column with LZ4 compression +ERRNO code = set_global_compression(TS_COMPRESSION_LZ4); +if (code != RET_OK) { /* handle unsupported value */ } +``` + ## Read Interface ### TsFile Reader Create/Close @@ -291,20 +362,22 @@ ERRNO tsfile_reader_close(TsFileReader reader); -### Query table/get next/query by row +### Query table/get next ```C + /** - * @brief Queries data from the specified table and columns within a given time range. + * @brief Query data from the specific table and columns within time range. * - * @param reader [in] A valid TsFileReader handle obtained by tsfile_reader_new(). - * @param table_name [in] Name of the target table, which must exist in the TsFile. - * @param columns [in] Array of column names to be queried. - * @param column_num [in] Number of columns in the column name array. + * @param reader [in] Valid TsFileReader handle from tsfile_reader_new(). + * @param table_name [in] Target table name. Must exist in the TsFile. + * @param columns [in] Array of column names to fetch. + * @param column_num [in] Number of columns in array. * @param start_time [in] Start timestamp. 
- * @param end_time [in] End timestamp, which must be greater than or equal to start_time. - * @param err_code [out] Returns RET_OK(0) on success, otherwise returns an error code defined in errno_define_c.h. - * @return ResultSet Handle of the query result set. Must be released by free_tsfile_result_set() after use. + * @param end_time [in] End timestamp. Must be ≥ start_time. + * @param err_code [out] RET_OK(0) on success, or error code in errno_define_c.h. + * @return ResultSet Query results handle. Must be freed with + * free_tsfile_result_set(). */ ResultSet tsfile_query_table(TsFileReader reader, const char* table_name, char** columns, uint32_t column_num, @@ -312,61 +385,149 @@ ResultSet tsfile_query_table(TsFileReader reader, const char* table_name, ERRNO* err_code); /** - * @brief Checks and retrieves the next row of data in the result set. + * @brief Check and fetch the next row in the ResultSet. * - * @param result_set [in] A valid ResultSet handle. - * @param error_code [out] Returns RET_OK(0) on success, otherwise returns an error code defined in errno_define_c.h. - * @return bool - true: Next row exists, false: Reached the end or an error occurred. + * @param result_set [in] Valid ResultSet handle. + * @param error_code [out] RET_OK(0) on success, or error code in errno_define_c.h. + * @return bool - true: Row available, false: End of data or error. */ bool tsfile_result_set_next(ResultSet result_set, ERRNO* error_code); /** - * @brief Releases the resources of the result set. + * @brief Free the result set and release its resources. * - * @param result_set [in] Pointer to a valid ResultSet handle. + * @param result_set [in] Pointer to a valid ResultSet handle. */ void free_tsfile_result_set(ResultSet* result_set); +``` + + + +### Filtering by tag + +**TAG columns** form the device identity (a joint primary +key) — their values are what distinguish one device from another within a table. 
+A *tag filter* restricts a query to the devices whose TAG values match a +predicate, so you read only the devices you care about. Build a filter from the +reader, pass it to one of the table-query functions below, then release it with +`tsfile_tag_filter_free()`. + +```C +// Opaque handle to a tag filter. Build it with the functions below. +typedef void* TagFilterHandle; + +// Comparison operators for a single-column TAG predicate. +typedef enum { + TAG_FILTER_EQ = 0, // column == value + TAG_FILTER_NEQ = 1, // column != value + TAG_FILTER_LT = 2, // column < value + TAG_FILTER_LTEQ = 3, // column <= value + TAG_FILTER_GT = 4, // column > value + TAG_FILTER_GTEQ = 5, // column >= value + TAG_FILTER_REGEXP = 6, // column matches the regex value + TAG_FILTER_NOT_REGEXP = 7, // column does not match the regex value +} TagFilterOp; /** - * @brief Queries time-series data by row (tree model), supporting offset and row count limitation + * @brief Create a single-column TAG predicate: `<column> <op> <value>`. * - * @param reader [in] A valid TsFileReader handle obtained by tsfile_reader_new() - * @param device_ids [in] Array of device IDs - * @param device_ids_len [in] Number of device IDs - * @param measurement_names [in] Array of measurement (sensor) names - * @param measurement_names_len [in] Number of measurement names - * @param offset [in] Number of starting rows to skip (must be >= 0) - * @param limit [in] Maximum number of rows to return, < 0 means no limitation - * @param err_code [out] Error code, returns E_OK(0) on success - * @return Returns ResultSet handle on success, NULL on failure + * @param reader [in] Valid TsFileReader handle. + * @param table_name [in] Table whose schema defines the TAG columns. + * @param column_name [in] Name of the TAG column to filter on. + * @param value [in] Comparison value (TAG columns are STRING). + * @param op [in] Comparison operator (TagFilterOp). + * @param err_code [out] RET_OK(0) on success, or error code in errno_define_c.h. 
+ * @return TagFilterHandle on success; NULL on failure. */ -ResultSet tsfile_reader_query_tree_by_row(TsFileReader reader, - char** device_ids, int device_ids_len, - char** measurement_names, - int measurement_names_len, int offset, - int limit, ERRNO* err_code); +TagFilterHandle tsfile_tag_filter_create(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value, TagFilterOp op, + ERRNO* err_code); /** - * @brief Queries table model data by row, supporting offset and row count limitation pushdown + * @brief Create a range predicate: lower <= column <= upper + * (pass is_not = true for NOT BETWEEN). + */ +TagFilterHandle tsfile_tag_filter_between(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* lower, const char* upper, + bool is_not, ERRNO* err_code); + +// Combine predicates. AND/OR/NOT take ownership of their children; free the root only. +TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left, TagFilterHandle right); +TagFilterHandle tsfile_tag_filter_or(TagFilterHandle left, TagFilterHandle right); +TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter); + +// Free a tag filter and all of its children. +void tsfile_tag_filter_free(TagFilterHandle filter); +``` + +### Table queries with tag filter, paging and batching + +These query functions accept an optional `tag_filter` (pass `NULL` +for no filtering) and a `batch_size` (`<= 0` returns rows one by one; `> 0` +returns a block of that size). + +```C +/** + * @brief Query a table by row, with offset/limit pushdown and an optional tag filter. 
* - * @param reader [in] A valid TsFileReader handle obtained by tsfile_reader_new() - * @param table_name [in] Name of the target table - * @param column_names [in] Array of column names to be queried - * @param column_names_len [in] Number of columns to be queried - * @param offset [in] Number of starting rows to skip (must be >= 0) - * @param limit [in] Maximum number of rows to return, < 0 means no limitation - * @param tag_filter [in] Tag filter handle - * @param batch_size [in] Batch size for data query - * @param err_code [out] Error code, returns E_OK(0) on success - * @return Returns ResultSet handle on success, NULL on failure + * @param reader [in] Valid TsFileReader handle. + * @param table_name [in] Target table name. + * @param column_names [in] Requested column names. + * @param column_names_len [in] Number of requested columns. + * @param offset [in] Leading rows to skip (>= 0). + * @param limit [in] Max rows to return; < 0 means unlimited. + * @param tag_filter [in] TAG predicate, or NULL for no filtering. + * @param batch_size [in] <= 0 row-by-row; > 0 block size. + * @param err_code [out] RET_OK(0) on success, or error code. + * @return ResultSet handle; NULL on failure. Free with free_tsfile_result_set(). */ ResultSet tsfile_reader_query_table_by_row( TsFileReader reader, const char* table_name, char** column_names, int column_names_len, int offset, int limit, TagFilterHandle tag_filter, int batch_size, ERRNO* err_code); + +/** + * @brief Query a table within a time range, with an optional tag filter and batching. + * + * @param batch_size <= 0 row-by-row return; > 0 returns a TsBlock of that size. + */ +ResultSet tsfile_query_table_batch(TsFileReader reader, const char* table_name, + char** columns, uint32_t column_num, + Timestamp start_time, Timestamp end_time, + TagFilterHandle tag_filter, int batch_size, + ERRNO* err_code); + +/** + * @brief Query a table with a tag filter (time range + TAG predicate). 
+ * + * @param batch_size <= 0 row-by-row return; > 0 returns a TsBlock of that size. + */ +ResultSet tsfile_query_table_with_tag_filter( + TsFileReader reader, const char* table_name, char** columns, + uint32_t column_num, Timestamp start_time, Timestamp end_time, + TagFilterHandle tag_filter, int batch_size, ERRNO* err_code); ``` +Example — read `temperature` only for devices whose `region` TAG equals +`shanghai`: + +```C +ERRNO ec = RET_OK; +TagFilterHandle f = tsfile_tag_filter_create( + reader, "weather", "region", "shanghai", TAG_FILTER_EQ, &ec); +char* cols[] = {"temperature"}; +ResultSet rs = tsfile_reader_query_table_by_row( + reader, "weather", cols, 1, /*offset*/ 0, /*limit*/ -1, f, /*batch*/ 0, &ec); + +// ... iterate rs with tsfile_result_set_next(), then release: +free_tsfile_result_set(&rs); +tsfile_tag_filter_free(f); +``` ### Get Data from result set diff --git a/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md b/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md index 101fcdfbf..00c2bd142 100644 --- a/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md +++ b/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md @@ -20,6 +20,67 @@ --> # Interface Definitions - C++ +## Data Types, Encoding and Compression + +These enumerations are shared by the read and write interfaces. The numeric +codes are also the values stored on disk. + +```cpp +// Supported measurement/column data types. +enum TSDataType : uint8_t { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + FLOAT = 3, + DOUBLE = 4, + TEXT = 5, + TIMESTAMP = 8, + DATE = 9, + BLOB = 10, + STRING = 11, +}; + +// Value encoding. See the table below for which encodings apply to which types. +enum TSEncoding : uint8_t { + PLAIN = 0, + DICTIONARY = 1, + RLE = 2, + TS_2DIFF = 4, + GORILLA = 8, + ZIGZAG = 9, + SPRINTZ = 12, +}; + +// Compression type. 
SNAPPY/GZIP/LZO/LZ4 depend on build options; LZ4 is the default. +enum CompressionType : uint8_t { + UNCOMPRESSED = 0, + SNAPPY = 1, + GZIP = 2, + LZO = 3, + LZ4 = 7, +}; + +// Column role within a table schema. +enum class ColumnCategory { TAG = 0, FIELD = 1, ATTRIBUTE = 2, TIME = 3 }; +``` + +Encodings applicable to each data type: + +| Encoding | Applicable types | +|---|---| +| `PLAIN` | all types | +| `DICTIONARY` | `TEXT`, `STRING` | +| `RLE` | `INT32`, `INT64`, `TIMESTAMP`, `DATE` | +| `TS_2DIFF` | `INT32`, `INT64`, `TIMESTAMP`, `DATE`, `FLOAT`, `DOUBLE` | +| `GORILLA` | `INT32`, `INT64`, `TIMESTAMP`, `DATE`, `FLOAT`, `DOUBLE` | +| `ZIGZAG` | `INT32`, `INT64` | +| `SPRINTZ` | `INT32`, `INT64`, `FLOAT`, `DOUBLE` | + +Default value encoding per type: `BOOLEAN → PLAIN`, `INT32 / INT64 → TS_2DIFF`, +`FLOAT / DOUBLE → GORILLA`, `TEXT / STRING / BLOB → PLAIN`. The default +compression is `LZ4`. See [Configuring encoding and compression](#configuring-encoding-and-compression) +for how to override these. + ## Write Interface ### TsFileTableWriter @@ -27,92 +88,53 @@ Used to write data to tsfile ```cpp -namespace storage { -class RestorableTsFileIOWriter; - /** - * @brief Supports writing structured table data to TsFile according to the specified table schema + * @brief Facilitates writing structured table data into a TsFile with a specified schema. * - * The TsFileTableWriter class is used to write structured data (especially suitable for time-series data) - * to TsFile optimized for efficient storage and querying. - * Users can define the structure of the table to be written, add data rows according to the structure, - * and serialize the data into TsFile. - * Meanwhile, this class provides the ability to limit memory usage during the writing process. + * The TsFileTableWriter class is designed to write structured data, particularly suitable for time-series data, + * into a file optimized for efficient storage and retrieval (referred to as TsFile here). 
It allows users to define + * the schema of the tables they want to write, add rows of data according to that schema, and serialize this data + * into a TsFile. Additionally, it provides options to limit memory usage during the writing process. */ class TsFileTableWriter { public: /** - * TsFileTableWriter is used to write table data to the target file according to the specified table schema, - * and can optionally limit the memory usage. - * - * @param writer_file Target file for writing table data, cannot be a null pointer - * @param table_schema Used to construct the table structure and define the schema of the table to be written - * @param memory_threshold Optional parameter. When the written data volume exceeds this threshold, - * data will be automatically flushed to disk. The default value is 128MB - */ - template - explicit TsFileTableWriter(storage::WriteFile* writer_file, T* table_schema, - uint64_t memory_threshold = 128 * 1024 * 1024) { - static_assert(!std::is_same::value, - "table_schema cannot be nullptr"); - tsfile_writer_ = std::make_shared(); - tsfile_writer_->init(writer_file); - tsfile_writer_->set_generate_table_schema(false); - - // Perform a deep copy. The source TableSchema object may be allocated on the stack/heap - auto table_schema_ptr = std::make_shared(*table_schema); - error_number = tsfile_writer_->register_table(table_schema_ptr); - exclusive_table_name_ = table_schema->get_table_name(); - common::g_config_value_.chunk_group_size_threshold_ = memory_threshold; - } - - /** - * Constructs TsFileTableWriter from a restorable TsFileIOWriter, - * supporting appending table data after failure recovery. - * The schema is read from the recovered file without additional TableSchema input. 
- * - * @param restorable_writer Recovered I/O writer; cannot be a null pointer, - * and must be opened in truncate mode to ensure can_write() returns true - * @param memory_threshold Optional memory threshold for cached data - */ - explicit TsFileTableWriter( - storage::RestorableTsFileIOWriter* restorable_writer, - uint64_t memory_threshold = 128 * 1024 * 1024); - - /** - * Registers a table schema with the writer + * TsFileTableWriter is used to write table data into a target file with the given schema, + * optionally limiting the memory usage. * - * @param table_schema The table schema to be registered, cannot be a null pointer - * @return Returns 0 on success, non-zero error code on failure + * @param writer_file Target file where the table data will be written. Must not be null. + * @param table_schema Used to construct table structures. Defines the schema of the table + * being written. + * @param memory_threshold Optional parameter. When the size of written + * data exceeds this value, the data will be automatically flushed to the + * disk. Default value is 128MB. */ - int register_table(const std::shared_ptr& table_schema); - + TsFileTableWriter(WriteFile* writer_file, + TableSchema* table_schema, + uint64_t memory_threshold = 128 * 1024 * 1024); + ~TsFileTableWriter(); /** - * Writes the specified Tablet data to the target file according to the table schema + * Writes the given tablet data into the target file according to the schema. * - * @param tablet Tablet containing the data to be written, cannot be a null pointer - * @return Returns 0 on success, non-zero error code on failure + * @param tablet The tablet containing the data to be written. Must not be null. + * @return Returns 0 on success, or a non-zero error code on failure. */ - int write_table(Tablet& tablet) const; - + int write_table(const Tablet& tablet); /** - * Flushes all cached data to the underlying storage medium to ensure all data is persisted. 
- * This method guarantees that all pending data is written to disk. + * Flushes any buffered data to the underlying storage medium, ensuring all data is written out. + * This method ensures that all pending writes are persisted. * - * @return Returns 0 on success, non-zero error code on failure + * @return Returns 0 on success, or a non-zero error code on failure. */ int flush(); - /** - * Closes the writer and releases all resources it occupies. - * No subsequent operations should be performed on the current instance after calling this method. + * Closes the writer and releases any resources held by it. + * After calling this method, no further operations should be performed on this instance. * - * @return Returns 0 on success, non-zero error code on failure + * @return Returns 0 on success, or a non-zero error code on failure. */ int close(); }; - -} // namespace storage ``` ### TableSchema @@ -150,44 +172,41 @@ class TableSchema { struct ColumnSchema { std::string column_name_; common::TSDataType data_type_; + common::CompressionType compression_; + common::TSEncoding encoding_; ColumnCategory column_category_; /** - * @brief Constructs a ColumnSchema object with the given parameters. + * @brief Constructs a ColumnSchema with explicit compression and encoding. * * @param column_name The name of the column. Must be a non-empty string. - * This name is used to identify the column within the table. - * @param data_type The data type of the measurement, such as INT32, DOUBLE, TEXT, etc. - * This determines how the data will be stored and interpreted. - * @param column_category The category of the column indicating its role or type - * within the schema, e.g., FIELD, TAG. - * Defaults to ColumnCategory::FIELD if not specified. - * @note It is the responsibility of the caller to ensure that `column_name` is not empty. + * @param data_type The data type of the column (INT32, DOUBLE, TEXT, ...). + * @param compression The compression applied to the column's chunks. 
+ * @param encoding The encoding applied to the column's values. + * @param column_category The role of the column (FIELD, TAG, ...). Defaults to FIELD. */ ColumnSchema(std::string column_name, common::TSDataType data_type, - ColumnCategory column_category = ColumnCategory::FIELD) : column_name_(std::move(column_name)), - data_type_(data_type), - column_category_(column_category) { - } -}; + common::CompressionType compression, common::TSEncoding encoding, + ColumnCategory column_category = ColumnCategory::FIELD); -/** - * @brief Represents the data type of a measurement. - * - * This enumeration defines the supported data types for measurements in the system. - */ -enum TSDataType : uint8_t { - BOOLEAN = 0, - INT32 = 1, - INT64 = 2, - FLOAT = 3, - DOUBLE = 4, - TEXT = 5, - STRING = 11 + /** + * @brief Constructs a ColumnSchema using the engine's default encoding and + * compression for the given data type. + * + * @param column_name The name of the column. Must be a non-empty string. + * @param data_type The data type of the column. + * @param column_category The role of the column. Defaults to FIELD. + */ + ColumnSchema(std::string column_name, common::TSDataType data_type, + ColumnCategory column_category = ColumnCategory::FIELD); }; - ``` +> `TAG` columns are the device identifier (joint primary key); their data type is +> always `STRING`. `FIELD` columns hold the measured values. The encoding and +> compression you set on a `ColumnSchema` apply to that column when written; the +> two-argument constructor falls back to the per-type defaults. + ### Tablet Write column memory structure @@ -253,159 +272,117 @@ public: }; ``` -### RestorableTsFileIOWriter -> V2.3.1 - -```cpp -namespace storage { -/** - * RestorableTsFileIOWriter is used to open a TsFile and perform optional recovery operations on it. - * Inherits from TsFileIOWriter and supports continuous writing after file recovery. 
- * - * (1) If the TsFile was closed normally: has_crashed()=false, can_write()=false - * - * (2) If the TsFile is incomplete / the program crashed: has_crashed()=true, - * can_write()=true. The writer will truncate the corrupted data and allow further writing. - * - * Implemented based on standard C++11, uses RAII and smart pointers to avoid memory leaks. - */ -class RestorableTsFileIOWriter : public TsFileIOWriter { - public: - RestorableTsFileIOWriter(); +### Configuring encoding and compression - /** - * Opens a TsFile for recovery / appending data. - * Uses O_RDWR|O_CREAT mode without O_TRUNC, so the original file content is preserved. - * - * @param file_path Path of the TsFile - * @param truncate_corrupted If true, truncate the corrupted data; - * If false, do not truncate (the incomplete file remains unchanged) - * @return E_OK on success, error code on failure - */ - int open(const std::string& file_path, bool truncate_corrupted = true); +Encoding and compression are chosen **per data type**: each type has a default +(see the table above). You can change those defaults, or pass an explicit +encoding/compression on a schema. - /** - * Closes the file - */ - void close(); -}; +**1. On a schema.** Pass an explicit encoding and compression when you build a +`ColumnSchema`: -} // namespace storage +```cpp +// Store column "temperature" as TS_2DIFF + LZ4. +common::ColumnSchema col("temperature", common::INT64, + common::LZ4, common::TS_2DIFF, + common::ColumnCategory::FIELD); ``` +**2. Per-type defaults.** Change the defaults *before* creating a writer; they then +apply to any column whose schema does not specify its own encoding/compression. +These helpers live in `common`/`storage` and validate their arguments (returning +`E_NOT_SUPPORT` for an unsupported combination): +```cpp +// Default value encoding per data type and default compression. 
+int common::set_datatype_encoding(uint8_t data_type, uint8_t encoding); +int common::set_global_compression(uint8_t compression); +uint8_t common::get_datatype_encoding(uint8_t data_type); +uint8_t common::get_global_compression(); + +// Time-column encoding/compression (the data type is fixed to INT64). +int common::set_global_time_encoding(uint8_t encoding); +int common::set_global_time_compression(uint8_t compression); +``` ## Read Interface ### Tsfile Reader -use to execute query in tsfile and return value by ResultSet. ```cpp -namespace storage { /** - * @brief TsFileReader provides the ability to query all files with the .tsfile suffix + * @brief TsfileReader provides the ability to query all files with the suffix + * .tsfile * - * TsFileReader is designed specifically for querying .tsfile files, supporting both tree-model queries and table-model queries. - * It also supports querying metadata such as table schemas (TableSchema) and time-series schemas (TimeseriesSchema). + * TsfileReader is designed to query .tsfile files. It accepts table-model + * queries and supports querying metadata such as TableSchema. */ class TsFileReader { public: TsFileReader(); + ~TsFileReader(); /** - * @brief Opens a TsFile + * @brief open the tsfile * - * @param file_path Path of the TsFile to be opened - * @return 0 on success, non-zero error code on failure + * @param file_path the path of the tsfile which will be opened + * @return Returns 0 on success, or a non-zero error code on failure. */ - int open(const std::string& file_path); + int open(const std::string &file_path); /** - * @brief Closes the TsFile. This method should be called after queries are completed. + * @brief close the tsfile, this method should be called after the + * query is finished * - * @return 0 on success, non-zero error code on failure + * @return Returns 0 on success, or a non-zero error code on failure. */ int close(); /** - * @brief Queries the TsFile using a query expression. 
Users can construct custom query expressions for execution. - * - * @param [in] qe Query expression - * @param [out] ret_qds Result set - * @return 0 on success, non-zero error code on failure - */ - int query(storage::QueryExpression* qe, ResultSet*& ret_qds); - /** - * @brief Queries the TsFile by path list, start time, and end time. - * This method is used for tree-model queries on TsFile. + * @brief query the tsfile by the query expression. Users can construct + * their own query expressions to query the tsfile * - * @param [in] path_list Path list - * @param [in] start_time Start timestamp - * @param [in] end_time End timestamp - * @param [out] result_set Result set - * @return 0 on success, non-zero error code on failure + * @param [in] qe the query expression + * @param [out] ret_qds the result set + * @return Returns 0 on success, or a non-zero error code on failure. */ - int query(std::vector& path_list, int64_t start_time, - int64_t end_time, ResultSet*& result_set); + int query(storage::QueryExpression *qe, ResultSet *&ret_qds); /** - * @brief Queries the TsFile by table name, column names, start time, and end time. - * This method is used for table-model queries on TsFile. + * @brief query the tsfile by the table name, column names, start time + * and end time.
* - * @param [in] table_name Table name - * @param [in] columns_names List of column names - * @param [in] start_time Start timestamp - * @param [in] end_time End timestamp - * @param [out] result_set Result set - * @param [in] batch_size ≤ 0 for row-by-row mode; - * > 0 to return TsBlock chunks of the specified size - * @return 0 on success, non-zero error code on failure + * @param [in] table_name the table name + * @param [in] columns_names the column names + * @param [in] start_time the start time + * @param [in] end_time the end time + * @param [out] result_set the result set */ - int query(const std::string& table_name, - const std::vector<std::string>& columns_names, int64_t start_time, - int64_t end_time, ResultSet*& result_set, int batch_size = -1); + int query(const std::string &table_name, + const std::vector<std::string> &columns_names, int64_t start_time, + int64_t end_time, ResultSet *&result_set); /** - * @brief Queries the TsFile by table name, column names, start time, end time, and tag filter conditions. - * This method is used for table-model queries on TsFile. + * @brief query the tsfile by the table name, column names, start time, + * end time and tag filter.
* - * @param [in] table_name Table name - * @param [in] columns_names List of column names - * @param [in] start_time Start timestamp - * @param [in] end_time End timestamp - * @param [in] tag_filter Tag filter condition - * @param [out] result_set Result set - * @param [in] batch_size Batch reading size - * @return 0 on success, non-zero error code on failure + * @param [in] table_name the table name + * @param [in] columns_names the columns names + * @param [in] start_time the start time + * @param [in] end_time the end time + * @param [in] tag_filter the tag filter + * @param [out] result_set the result set */ int query(const std::string& table_name, const std::vector& columns_names, int64_t start_time, - int64_t end_time, ResultSet*& result_set, Filter* tag_filter, - int batch_size = 0); + int64_t end_time, ResultSet*& result_set, Filter* tag_filter); /** - * @brief Queries tree-model time-series data by row with offset and row limit. + * @brief query a table by row, with offset/limit pushdown and an optional + * tag filter. * - * @param path_list Full paths to query (device.measurement) - * @param offset Number of starting rows to skip (>= 0) - * @param limit Maximum number of rows to return; no limit if < 0 - * @param[out] result_set Result set to store query results - * @return 0 on success, non-zero error code on failure - */ - int queryByRow(std::vector& path_list, int offset, int limit, - ResultSet*& result_set); - - /** - * @brief Queries table-model data by row with pushed-down offset and row limit. - * - * For dense devices (all columns have the same row count), - * offset/limit are pushed down to the data block/page level via SSI, - * skipping entire blocks/pages without decoding. - * For sparse devices, offset/limit take effect during row merging. - * Entire devices can be skipped directly if their total rows fall within the offset range. 
- * - * @param table_name Table name to query - * @param column_names Column names to query - * @param offset Number of starting rows to skip (>= 0) - * @param limit Maximum number of rows to return; no limit if < 0 - * @param[out] result_set Result set to store query results - * @param tag_filter Optional tag filter condition for filtering data by tag columns - * @param batch_size Batch reading size - * @return 0 on success, non-zero error code on failure + * @param [in] table_name the table name + * @param [in] column_names the column names + * @param [in] offset leading rows to skip (>= 0) + * @param [in] limit max rows to return; < 0 means unlimited + * @param [out] result_set the result set + * @param [in] tag_filter optional tag filter built with TagFilterBuilder, or nullptr + * @param [in] batch_size <= 0 returns rows one by one; > 0 returns blocks of that size + * @return Returns 0 on success, or a non-zero error code on failure. */ int queryByRow(const std::string& table_name, const std::vector& column_names, int offset, @@ -413,115 +390,37 @@ class TsFileReader { Filter* tag_filter = nullptr, int batch_size = 0); /** - * @brief Performs a table query on the tree model. - * - * @param measurement_names List of measurement names - * @param start_time Start timestamp - * @param end_time End timestamp - * @param result_set Result set - * @return 0 on success, non-zero error code on failure - */ - int query_table_on_tree(const std::vector& measurement_names, - int64_t start_time, int64_t end_time, - ResultSet*& result_set); - /** - * @brief Destroys the result set. This method should be called after the query is completed and the result set is no longer used. - * - * @param qds Result set object - */ - void destroy_query_data_set(ResultSet* qds); - /** - * @brief Reads time-series data by device ID and measurement names. 
- * - * @param device_id Device ID - * @param measurement_name List of measurement names - * @return Result set object - */ - ResultSet* read_timeseries( - const std::shared_ptr& device_id, - const std::vector& measurement_name); - /** - * @brief Gets all devices in the TsFile for a specified table. - * - * @param table_name Table name - * @return List of device IDs - */ - std::vector> get_all_devices( - std::string table_name); - - /** - * @brief Gets all device IDs in the TsFile. - * - * @return List of device IDs - */ - std::vector> get_all_device_ids(); - - /** - * @brief Gets all device IDs in the file (functionally identical to get_all_device_ids). + * @brief destroy the result set. This method should be called after the + * query is finished and the result set is no longer used * - * @return List of devices + * @param qds the result set */ - std::vector> get_all_devices(); - - /** - * @brief Gets time-series schemas by device ID and measurement names. - * - * @param [in] device_id Device ID - * @param [out] result List of measurement schemas - * @return 0 on success, non-zero error code on failure - */ - int get_timeseries_schema(std::shared_ptr device_id, - std::vector& result); - + void destroy_query_data_set(ResultSet *qds); /** - * @brief Gets time-series metadata for specified devices. - * - * Only devices existing in the file are included in the result. - * Returns an empty map if the device ID list is empty. + * @brief get the table schema by the table name * - * @param device_ids List of devices to query - * @return Mapping: Device ID -> List of time-series metadata (existing entries only) - */ - DeviceTimeseriesMetadataMap get_timeseries_metadata( - const std::vector>& device_ids); - - /** - * @brief Gets time-series metadata for all devices in the file. - * - * @return Mapping: Device ID -> List of time-series metadata - */ - DeviceTimeseriesMetadataMap get_timeseries_metadata(); - - /** - * @brief Gets the table schema by table name.
- * - * @param table_name Table name - * @return Shared pointer to the table schema + * @param table_name the table name + * @return std::shared_ptr<TableSchema> the table schema */ std::shared_ptr<TableSchema> get_table_schema( - const std::string& table_name); + const std::string &table_name); /** - * @brief Gets all table schemas in the TsFile. + * @brief get all table schemas in the tsfile * - * @return List of table schemas + * @return std::vector<std::shared_ptr<TableSchema>> the table schema list */ std::vector<std::shared_ptr<TableSchema>> get_all_table_schemas(); }; ``` ### ResultSet -A collection of query.Support iterator to get data, and directly through the column name or index to get specific data. ```cpp /** * @brief ResultSet is the query result of the TsfileReader. It provides access * to the results. * * ResultSet is a virtual class. Convert it to the corresponding implementation - * class when used - * @note When using the tree model and the filter is a global time filter, - * it should be cast as QDSWithoutTimeGenerator. - * @note When using the tree model and the filter is not a global time filter, - * it should be QDSWithTimeGenerator. - * @note If the query uses the table model, the cast should be TableResultSet + * class when used. + * @note The concrete type is TableResultSet. */ class ResultSet { public: @@ -557,6 +456,7 @@ class ResultSet { */ template <typename T> T get_value(const std::string& column_name); + /** * @brief Get the value of the column by column index * * @param column_index the index of the column starting from 1 @@ -586,7 +486,6 @@ class ResultSet { }; ``` ### ResultMeta -user can obtain the metadata from ResultSetMetadata, including all columnnames and data types. When a user uses a table model, the first columndefaults to the time column.
```cpp /** * @brief metadata of result set diff --git a/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md b/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md index f74354b0d..11a09b8f8 100644 --- a/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md +++ b/src/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md @@ -39,15 +39,41 @@ class TSDataType(IntEnum): BLOB = 10 STRING = 11 +class TSEncoding(IntEnum): + """ + Value encoding accepted by the writer. The comment after each + member lists the data types it can be used with. + """ + PLAIN = 0 # all types + DICTIONARY = 1 # STRING, TEXT + RLE = 2 # INT32, INT64, TIMESTAMP, DATE + TS_2DIFF = 4 # INT32, INT64, TIMESTAMP, DATE, FLOAT, DOUBLE + GORILLA = 8 # INT32, INT64, TIMESTAMP, DATE, FLOAT, DOUBLE + ZIGZAG = 9 # INT32, INT64 + SPRINTZ = 12 # INT32, INT64, FLOAT, DOUBLE + +class Compressor(IntEnum): + """ + Compression accepted by the writer. The default is LZ4. + """ + UNCOMPRESSED = 0 + SNAPPY = 1 + GZIP = 2 + LZO = 3 + LZ4 = 7 + class ColumnCategory(IntEnum): """ Enumeration of column categories in TsFile. - TAG: Represents a tag column, used for metadata. - FIELD: Represents a field column, used for storing actual data values. + TAG: a tag column (part of the device identifier / joint primary key). + FIELD: a field column, holding the measured values. + ATTRIBUTE / TIME: reserved column roles. """ TAG = 0 FIELD = 1 + ATTRIBUTE = 2 + TIME = 3 class ColumnSchema: """Defines schema for a table column (name, datatype, category).""" @@ -95,9 +121,11 @@ class TsFileTableWriter: """ :param path: The path of tsfile, will create if it doesn't exist. :param table_schema: describes the schema of the tables want to write. + :param memory_threshold: bytes buffered before an automatic flush (default 128MB). :return: no return value. 
""" - def __init__(self, path: str, table_schema: TableSchema) + def __init__(self, path: str, table_schema: TableSchema, + memory_threshold: int = 128 * 1024 * 1024) """ @@ -106,21 +134,38 @@ class TsFileTableWriter: :return: no return value. """ def write_table(self, tablet: Tablet) - + + """ + Write a pandas DataFrame into the table. Column encoding/compression follow + the table schema (or the engine defaults). + :param dataframe: the data to write. + :return: no return value. + """ + def write_dataframe(self, dataframe: pandas.DataFrame) + + """ + Flush buffered data to disk. + :return: no return value. + """ + def flush(self) + """ Close TsFileTableWriter and flush data automatically. :return: no return value. """ def close(self) + # Usable as a context manager: + # with TsFileTableWriter(path, schema) as w: + # w.write_table(tablet) + def __enter__(self) + def __exit__(self, exc_type, exc_val, exc_tb) ``` ### Tablet definition -You can use Tablet to insert data into TsFile in batches. - ```Python class Tablet(object) """ @@ -140,6 +185,49 @@ class Tablet(object) ``` +### dataframe_to_tsfile + +```python +def dataframe_to_tsfile(dataframe: pd.DataFrame, + file_path: str, + table_name: Optional[str] = None, + time_column: Optional[str] = None, + tag_column: Optional[list[str]] = None) + """ + Write a pandas DataFrame to a TsFile. + + :param dataframe: the data to write. + :param file_path: destination .tsfile path. + :param table_name: output table name. + :param time_column: name of the column to use as the timestamp column. + :param tag_column: names of the columns to treat as TAG columns. + """ +``` + +## Configuration + +Global write defaults — the default per-type encodings, the default compression, +and the time-column encoding/compression — are exposed as a single dictionary. +Change them **before** creating a writer. 
+ +```python +from tsfile import get_tsfile_config, set_tsfile_config +from tsfile import TSEncoding, Compressor + +cfg = get_tsfile_config() # -> dict of all config values +# e.g. cfg["default_compression_type_"], cfg["int64_encoding_type_"], +# cfg["time_encoding_type_"], cfg["time_compress_type_"], ... + +set_tsfile_config({ + "default_compression_type_": Compressor.LZ4, + "int64_encoding_type_": TSEncoding.TS_2DIFF, +}) +``` + +`set_tsfile_config` validates each value and only updates the keys you pass. +Encoding/compression values are `TSEncoding` / `Compressor` members; the same +type-vs-encoding restrictions as the C++ API apply. + ## Read Interface ### TsFileReader @@ -147,141 +235,71 @@ class Tablet(object) ```python class TsFileReader: """ - Query table data and time-series data from TsFile, providing standardized file reading and query interfaces. - Supports full core capabilities including table model query, tree model query, metadata acquisition, and resource management. - """ - - def __init__(self, pathname: str): - """ - Initialize the TsFile reader for the specified path, complete file loading and underlying reader initialization, - and maintain all active query result sets to ensure all result sets are invalidated synchronously when the reader is closed. - - :param pathname: Full path of the TsFile to be read - :return: No return value - """ - - def query_table(self, table_name: str, column_names: List[str], - start_time: int = np.iinfo(np.int64).min, - end_time: int = np.iinfo(np.int64).max, - tag_filter: Optional[object] = None, - batch_size: int = 0) -> object: - """ - Perform time-range query on the specified table and columns, supporting tag filtering and batch reading mode. - Adapts to both row-by-row return and fixed-size data block return modes to meet reading requirements in different scenarios. 
- - :param table_name: Name of the target table to query, case-insensitive - :param column_names: List of target column names to retrieve; all columns are queried by default if empty - :param start_time: Start timestamp of the query range, default is the minimum value of int64 type - :param end_time: End timestamp of the query range, default is the maximum value of int64 type - :param tag_filter: Optional parameter, filter conditions based on tag columns, supporting equality, range, and logical combination filters - :param batch_size: Batch reading size; row-by-row mode is enabled when ≤ 0, data blocks are returned by the specified size when > 0 - :return: Encapsulated query result set handler for traversing data, reading data, and obtaining metadata - """ - - def query_table_on_tree(self, column_names: List[str], - start_time: int = np.iinfo(np.int64).min, - end_time: int = np.iinfo(np.int64).max) -> object: - """ - Perform table query on the tree model structure, adapted for query scenarios of native tree-structured time-series data. - Query directly based on measurement names without specifying a table name; path names are case-sensitive. - - :param column_names: List of measurement names to query, corresponding to node paths in the tree structure - :param start_time: Start timestamp of the query range, default is the minimum value of int64 type - :param end_time: End timestamp of the query range, default is the maximum value of int64 type - :return: Result set handler corresponding to the tree model query - """ - - def query_tree_by_row(self, device_ids: List[str], measurement_names: List[str], - offset: int = 0, limit: int = -1) -> object: - """ - Query tree model time-series data by row with pagination, supporting offset skipping and maximum return row limit. - Adapted for large data volume pagination reading to avoid memory overflow caused by loading excessive data at once. 
- - :param device_ids: List of device IDs to query, cannot be empty - :param measurement_names: List of measurement names to query, cannot be empty - :param offset: Number of starting rows to skip, starting from 0 by default - :param limit: Maximum number of rows to return; no limit if less than 0 - :return: Result set handler for tree model pagination query - """ - - def query_table_by_row(self, table_name: str, column_names: List[str], - offset: int = 0, limit: int = -1, - tag_filter: Optional[object] = None, - batch_size: int = 0) -> object: - """ - Query table model data by row with pagination, supporting offset and row limit pushdown, and can be used with tag filtering. - Invalid data can be skipped at the data block level for dense devices, greatly improving pagination query efficiency. - - :param table_name: Name of the target table to query - :param column_names: List of column names to query - :param offset: Number of starting rows to skip, starting from 0 by default - :param limit: Maximum number of rows to return; no limit if less than 0 - :param tag_filter: Optional parameter, tag filter condition to filter device data that meets the criteria - :param batch_size: Batch reading size, adapted to the underlying data block reading logic - :return: Result set handler for table model pagination query - """ - - def query_timeseries(self, device_name: str, sensor_list: List[str], - start_time: int = 0, end_time: int = 0) -> object: - """ - Perform time-range time-series data query for a single specified device. - Adapted for precise query scenarios of a single device with multiple sensors, simplifying query invocation logic. 
- - :param device_name: Name/path of the target device - :param sensor_list: List of sensor (measurement) names to query - :param start_time: Query start timestamp; starts from the earliest time of the file by default if 0 - :param end_time: Query end timestamp; ends at the latest time of the file by default if 0 - :return: Result set handler for single-device time-series query - """ - - def get_table_schema(self, table_name: str) -> object: - """ - Get the complete schema information of the specified table, including full metadata such as column names, data types, tag columns, and time-series constraints. - Used to verify the legality of query fields in advance and parse data structures. - - :param table_name: Name of the target table - :return: Schema information object of the corresponding table, containing full configuration of the table structure - """ - - def get_all_table_schemas(self) -> Dict[str, object]: - """ - Get schema information of all tables in the current TsFile. - Traverse all data table structures in the file with one click without querying table by table. - - :return: Dictionary structure, key is table name, value is schema information object of the corresponding table - """ - - def get_all_timeseries_schemas(self) -> List[object]: - """ - Get schema information of all time-series in the TsFile. - Covers field, type, and constraint information of full time-series data in both tree model and table model. - - :return: List of all time-series schema information - """ - - def get_all_devices(self) -> List[str]: - """ - Get identification information of all devices in the TsFile. - Can traverse all devices in the file, adapted for full-device statistics and batch query pre-operations. 
- - :return: List composed of all device IDs/device paths - """ - - def get_timeseries_metadata(self, device_ids: Optional[List[str]] = None) -> Dict[str, object]: - """ - Get time-series metadata of specified devices, including data storage segments, field constraints, data ranges, etc. - Returns metadata of all devices by default if no device ID is passed, returns an empty dictionary if an empty list is passed. - - :param device_ids: Optional parameter, list of device IDs to query metadata for - :return: Dictionary structure, key is device path, value is time-series metadata group of the corresponding device - """ - - def close(self) -> None: - """ - Close the TsFile reader, release underlying file handles and memory resources. - Mark all current active query result sets as invalid and prohibit subsequent data reading operations. - No query or metadata acquisition operations can be performed after closing; the reader needs to be reinitialized. - """ + Query table data from a TsFile. + """ + + """ + Initialize a TsFile reader for the specified file path. + :param pathname: The path to the TsFile. + :return no return value. + """ + def __init__(self, pathname) + + + """ + Executes a time range query on the specified table and columns. + + :param table_name: The name of the table to query. + :param column_names: A list of column names to retrieve. + :param start_time: The start time of the query range (default: minimum int64 value). + :param end_time: The end time of the query range (default: maximum int64 value). + :return: A query result set handler. + """ + def query_table(self, table_name : str, column_names : List[str], + start_time : int = np.iinfo(np.int64).min, + end_time: int = np.iinfo(np.int64).max) -> ResultSet + + """ + Execute a table query by row, with offset/limit pushdown and an optional + tag filter. A TAG predicate restricts the query to the devices whose + TAG-column values match. 
Build a filter with the helpers in tsfile.tag_filter + (tag_eq, tag_neq, tag_lt, tag_lteq, tag_gt, tag_gteq, tag_between, ...) and + combine filters with &, | and ~. + + :param table_name: The name of the table to query. + :param column_names: A list of column names to retrieve. + :param offset: Number of leading rows to skip (default 0). + :param limit: Maximum number of rows to return; < 0 means unlimited. + :param tag_filter: Optional tag predicate (TagFilter), or None for no filtering. + :param batch_size: <= 0 returns rows one by one; > 0 returns blocks of that size. + :return: A query result set handler. + """ + def query_table_by_row(self, table_name : str, column_names : List[str], + offset : int = 0, limit : int = -1, + tag_filter = None, batch_size : int = 0) -> ResultSet + + """ + Retrieves the schema of the specified table. + + :param table_name: The name of the table. + :return: The schema of the specified table. + """ + def get_table_schema(self, table_name : str)-> TableSchema + + + """ + Retrieves the schemas of all tables in the TsFile. + + :return: A dictionary mapping table names to their schemas. + """ + def get_all_table_schemas(self) ->dict[str, TableSchema] + + + """ + Closes the TsFile reader. If the reader has active result sets, they will be invalidated. + """ + def close(self) + ``` ### ResultSet @@ -388,7 +406,6 @@ def to_dataframe(file_path: str, Read data from a TsFile and convert it into a Pandas DataFrame or an iterator of DataFrames. - This function supports both table-model and tree-model TsFiles. Users can filter data by table name, column names, time range, and maximum number of rows. 
diff --git a/src/UserGuide/latest/Tools/Tsfile-CLI.md b/src/UserGuide/latest/Tools/Tsfile-CLI.md new file mode 100644 index 000000000..d335c1bb9 --- /dev/null +++ b/src/UserGuide/latest/Tools/Tsfile-CLI.md @@ -0,0 +1,183 @@ + +# tsfile-cli + +`tsfile-cli` is a single, pipe-friendly C++ command-line tool for inspecting +**and** importing Apache TsFile (`.tsfile`) files from the shell. Read commands print data to **stdout** and +diagnostics to **stderr**, so they compose with `awk`, `jq`, `sort`, and friends; +the `write` command imports CSV/TSV into a new `.tsfile`. It is built on the +public `TsFileReader` and `TsFileTableWriter` APIs. + +## Building from source + +The CLI is part of the C++ module. Build it with the Maven wrapper, which +downloads a pinned CMake and compiles the whole C++ module (the `libtsfile` +shared library + the `tsfile-cli` executable) for you. + +**Prerequisites:** a JDK (8+) to run Maven, and a C++11 compiler (GCC / Clang). +The third-party C++ dependencies (Snappy, LZ4, LZOKAY, Zlib, …) are bundled under +`cpp/third_party/` and built automatically. + +From the repository root: + +```bash +./mvnw clean package -P with-cpp +``` + +This produces, under `cpp/target/build/`: + +| Artifact | Path | +|---|---| +| CLI executable | `cpp/target/build/bin/tsfile-cli` | +| Shared library | `cpp/target/build/lib/libtsfile.so` (Linux) — `libtsfile.dylib` on macOS | + +`tsfile-cli` is dynamically linked against `libtsfile`. Run it **in place** by its +full path and it finds the library automatically: + +```bash +cpp/target/build/bin/tsfile-cli --version # -> tsfile-cli (Apache TsFile C++) +cpp/target/build/bin/tsfile-cli --help +``` + +To run the binary from **somewhere else** (e.g. after copying it out of the build +tree), the dynamic loader must be able to find `libtsfile.so`. 
Either point the +loader at the build's `lib/` directory, or copy the library to a standard +location: + +```bash +# point the loader at the build's lib directory (Linux; macOS uses DYLD_LIBRARY_PATH) +export LD_LIBRARY_PATH=/path/to/cpp/target/build/lib:$LD_LIBRARY_PATH + +# — or — copy the library to a system library path +sudo cp cpp/target/build/lib/libtsfile.so /usr/local/lib/ && sudo ldconfig +``` + +## Usage + +```text +tsfile-cli <command> [options] <file> +tsfile-cli --help | --version | help +``` + +Exit codes: `0` success, `1` usage/argument error, `2` file open/corrupt, +`3` query/runtime error. + +### Reading + +| Command | Description | +|---|---| +| `ls` | List devices (tree model) or tables (table model), one name per line | +| `schema` | Per-series `target, measurement, datatype, encoding, compression` | +| `meta` | File summary: model, device/table/series counts, time range, file size | +| `stats` | Per-series `count, start_time, end_time, min, max, first, last, sum` | +| `count` | Per-series row counts plus a `total` row (from statistics, no page scan) | +| `head` | First N rows (default 10; use `-n`) | +| `cat` | All matching rows, streamed (`table` format buffers to align columns) | +| `sample` | Reproducible reservoir sample (default 10; `-n`, `--seed`) | + +The metadata commands (`ls` / `schema` / `meta` / `stats` / `count`) answer most +questions **without decoding data pages**. 
+ +Shared options: + +| Option | Meaning | +|---|---| +| `-f, --format csv\|tsv\|json\|table` | Output format; defaults to `table` on a TTY, `tsv` when piped | +| `-d, --device <name>` / `-t, --table <name>` | Scope to one device / table (mutually exclusive) | +| `-m, --measurements a,b,c` | Column projection (`schema`, `stats`, `count`, `head`, `cat`, `sample`) | +| `-n, --limit N` / `--offset N` | Max rows / rows to skip (`head`, `cat`; `--offset` not valid for `sample`) | +| `--start <ms>` / `--end <ms>` | Inclusive epoch-millisecond time range (`head`, `cat`, `sample`) | +| `--seed N` | Reproducible sampling seed (`sample` only) | +| `--tag-filter C OP V` / `--tag-between C L U` / `--tag-not-between C L U` | Table TAG predicate for `head`, `cat`, `sample`; `OP` is `eq`, `neq`, `lt`, `lteq`, `gt`, `gteq`, `regexp`, or `not-regexp` | +| `--no-header` | Omit the header row | +| `--model tree\|table` | Force the model (otherwise auto-detected) | + +`json` output is NDJSON (one object per line; numbers/booleans bare, other values +quoted, nulls as `null`; non-finite floats — NaN/Inf — become `null`). CSV output +follows RFC 4180. Timestamps are raw epoch milliseconds. The `table` format +buffers all rows in memory to align columns, so prefer `csv`/`tsv`/`json` when +dumping large files. + +```bash +BIN=cpp/target/build/bin/tsfile-cli +$BIN ls -f tsv data.tsfile # list tables / devices +$BIN meta data.tsfile # quick file overview +$BIN count -t table1 -f tsv data.tsfile # row counts, no page scan +$BIN cat -t table1 --tag-filter device eq dev_1 -m temp -f tsv data.tsfile +$BIN cat -m temp,humidity --start 1700000000000 -f csv data.tsfile | head +$BIN sample -m temp -n 20 --seed 42 -f json data.tsfile | jq . +``` + +> For a table-model file, the row commands (`head` / `cat` / `sample`) query the +> **first** table unless you pass `-t <table>
`. `count` covers all tables. + +### Writing (import) + +`tsfile-cli write` imports CSV/TSV rows into a **new table-model** `.tsfile` (the +output is overwritten). The first input column is the timestamp (epoch +milliseconds); the remaining columns are declared explicitly with `--columns` — +there is no type inference. + +Timestamps must be **strictly increasing per device**, where a device is +identified by its `tag` column values (rows that share the same tags form one +device's timeline). Rows for different tag combinations may freely interleave and +reuse timestamps. Out-of-order input is rejected with the offending line number, +and a failed import leaves no output file behind. `--output` must differ from the +input file. + +```text +tsfile-cli write --table --columns -o \ + [-f csv|tsv] [--no-header] [--header-match] [-v] [ | -] +``` + +`--columns` is a comma-separated list of `name:TYPE:category`, where `category` +(case-insensitive) is `tag` or `field` and `TYPE` (case-insensitive) is one of +`BOOLEAN, INT32, INT64, FLOAT, DOUBLE, STRING, TEXT, TIMESTAMP, DATE, BLOB` — for +example `--columns "id1:STRING:tag,s1:INT64:field"`. `DATE` cells are written as +`YYYY-MM-DD`; `TIMESTAMP` cells as epoch milliseconds. Each column is stored with +the engine's default encoding and compression for its type. + +| Option | Meaning | +|---|---| +| `--table ` | Output table name (lower-cased) | +| `--columns ` | Ordered data columns (excludes the leading timestamp column) | +| `-o, --output ` | Output `.tsfile` (required; overwritten) | +| `` / `-` | Input file, or `-` / omitted for stdin | +| `-f csv\|tsv` | Input delimiter (default csv; `json` / `table` are rejected) | +| `--no-header` | Input has no header row (default: first line is a header and is skipped) | +| `--header-match` | Validate header names against `--columns` | +| `-v, --verbose` | Print `wrote N rows to ` to stderr (otherwise silent on success) | + +An empty cell is written as null. 
The command is silent on success (Unix-style); +pass `-v` for a one-line summary. + +```bash +# round-trip through a pipe +printf 'time,id1,s1\n0,dev,0\n1,dev,10\n' \ + | tsfile-cli write --table t1 --columns "id1:STRING:tag,s1:INT64:field" -o out.tsfile - +tsfile-cli count -f tsv out.tsfile # -> t1.dev s1 2 +``` + +## Using the skill with an AI assistant + +`cpp/tools/skills/tsfile-cli/SKILL.md` is a machine-readable reference that +documents how to drive `tsfile-cli`. AI coding assistants that support skills can +load it to help you inspect and import `.tsfile` files. diff --git a/src/UserGuide/latest/Tools/Tsfile-Viewer.md b/src/UserGuide/latest/Tools/Tsfile-Viewer.md new file mode 100644 index 000000000..c4f2a4b7c --- /dev/null +++ b/src/UserGuide/latest/Tools/Tsfile-Viewer.md @@ -0,0 +1,93 @@ + +# tsfile-viewer + +[Apache TsFile Viewer](https://github.com/apache/tsfile-viewer) is a web-based +application for browsing and analyzing TsFile data in your browser. It pairs a +Spring Boot backend (which reads `.tsfile` files via the Apache TsFile library) +with a Vue 3 frontend that renders metadata, paginated tables, and interactive +charts. + +- **Repository:** <https://github.com/apache/tsfile-viewer> +- **License:** Apache-2.0 + +## Features + +- **File browsing and upload** — open `.tsfile` files from the UI. +- **Metadata display** — schema, devices, and measurements. +- **Paginated data tables** with filtering by time range, devices, measurements, + and value range. +- **Interactive charts** (ECharts) with multi-series overlay and aggregation. +- **Both data models** — supports tree-model and table-model TsFiles. +- **Export** — data as CSV or JSON; charts as PNG or SVG. +- **Performance** — chunk-level reading and metadata caching. 
+ +## Requirements + +| Component | Version | +|---|---| +| JDK | 17 or 21 (LTS) | +| Maven | 3.9+ | +| Node.js | `^20.19.0 \|\| >=22.12.0` | +| pnpm | latest | +| Apache TsFile | 2.3.0 (bundled dependency) | + +## Get the source + +Clone the repository, then build and run it as shown below: + +```bash +git clone https://github.com/apache/tsfile-viewer.git +cd tsfile-viewer +``` + +## Running from source (development) + +Run the backend and frontend in two terminals. + +**Backend** (Spring Boot): + +```bash +cd backend +mvn spring-boot:run +``` + +**Frontend** (Vue + Vite dev server): + +```bash +cd frontend +pnpm install +pnpm dev +``` + +Then open the dev UI at . + +## Building and running a production bundle + +Build a self-contained distribution, then launch the packaged jar (the frontend +is served by the backend): + +```bash +./build-dist.sh +java -jar backend/target/tsfile-viewer-*.jar +``` + +Open the app at . diff --git a/src/zh/UserGuide/develop/DataFrame/TsFileDataFrame.md b/src/zh/UserGuide/develop/DataFrame/TsFileDataFrame.md new file mode 100644 index 000000000..cff291dda --- /dev/null +++ b/src/zh/UserGuide/develop/DataFrame/TsFileDataFrame.md @@ -0,0 +1,270 @@ + +# TsFileDataFrame + +`TsFileDataFrame` 让你像操作 pandas DataFrame 一样读取一个或多个 TsFile 中的时序数据, +无需关心底层文件格式与数据加载细节。它是 Python 包的一部分(`pip install tsfile`)。 + +## 快速上手 + +```python +from tsfile import TsFileDataFrame + +df = TsFileDataFrame("table_data/") # 加载目录下所有 .tsfile +print(df) # 浏览所有序列(仅元数据) + +ts = df["weather.Beijing.humidity"] # 取一条序列(懒加载句柄) +window = ts[20:100] # 按行号切片 -> np.ndarray + +data = df.loc[start:end, [ # 按时间戳对齐多条序列 + "weather.Beijing.temperature", + "weather.Beijing.humidity", +]] +data.values # -> np.ndarray, shape = (N, 2) +``` + +## 核心类型 + +`TsFileDataFrame` 围绕三个核心类型: + +- **`TsFileDataFrame`**:入口对象,加载一至多个 TsFile 并提供统一视图。初始化时只扫描元数据, + **不读取实际数值**。 +- **`Timeseries`**:单条序列的懒加载句柄,通过 `df[...]` 获得。它携带序列元信息,但在按行号索引前 + 不读取任何数据。 +- **`AlignedTimeseries`**:多条序列在同一时间轴上的对齐结果,通过 
`df.loc[...]` 获得,会一次性将 + 指定时间范围内的多条序列读入内存。 + +### TsFileDataFrame + +下表中 `df` 是一个 `TsFileDataFrame` 实例,由 `df = TsFileDataFrame(paths)` 创建。 + +| 示例 | 操作 | 返回类型 | +|---|---|---| +| `TsFileDataFrame(paths)` | 加载文件 / 文件列表 / 目录 | `TsFileDataFrame` | +| `len(df)` | 时间序列总数 | `int` | +| `df.list_timeseries("weather")` | 获取序列名,可按前缀筛选 | `List[str]` | +| `df["weather.Beijing.humidity"]`、`df[0]`、`df[-1]` | 获取单条序列 | `Timeseries` | +| `df["city"]` | 获取某元数据列(标签 / `field` / `start_time` / `end_time` / `count`) | `pandas.Series` | +| `df[0:3]`、`df[[0, 2, 5]]` | 获取子集视图 | `TsFileDataFrame` | +| `df[df["city"] == "Beijing"]` | 按元数据列过滤 | `TsFileDataFrame` | +| `df.loc[start:end, series_list]` | 按时间戳对齐查询 | `AlignedTimeseries` | +| `df.show(max_rows=20)` / `print(df)` | 格式化元数据表格 | — | +| `df.close()` | 释放文件句柄 | — | + +### Timeseries + +下表中 `ts` 是一条 `Timeseries`,由 `ts = df[...]` 获得。 + +| 示例 | 操作 | 返回类型 | +|---|---|---| +| `ts.name` | 序列名 | `str` | +| `len(ts)` | 序列点数 | `int` | +| `ts.stats` | 序列统计信息 | `dict`(`start_time`、`end_time`、`count`) | +| `ts[20]` | 单值读取 | `float`(空值为 `None`) | +| `ts[20:100]` | 行范围切片 | `np.ndarray` | +| `ts.timestamps` | 时间戳数组 | `np.ndarray` | + +### AlignedTimeseries + +下表中 `data` 是一个 `AlignedTimeseries`,由 `data = df.loc[...]` 获得。 + +| 示例 | 操作 | 返回类型 | +|---|---|---| +| `data.timestamps` | 时间戳数组 | `np.ndarray` | +| `data.values` | 值矩阵 | `np.ndarray`,shape `(N, M)` | +| `data.series_names` | 序列名列表 | `List[str]` | +| `data.shape` | 形状 `(N, M)`——N 为时间戳数,M 为序列数 | `tuple` | +| `len(data)` | 行数 | `int` | +| `data[0]`、`data[0:10]`、`data[0, 1]` | 行 / 元素索引 | `np.ndarray` / 标量 | +| `data.show(50)` / `print(data)` | 格式化输出(自动截断) | — | + +## 序列名 + +TsFileDataFrame 以**序列名**(一个字符串)作为序列的唯一标识。序列名由 **表名**、**各标签列的取值**、 +**字段名** 三部分按此顺序经 `.` 连接构成: + +```text +{表名}.{标签值1}.{标签值2}...{字段名} +``` + +`list_timeseries()` 返回的即为序列名;按名称索引(`df[...]`)与 `df.loc[...]` 中的序列选择均以序列名为参数。 + +示例: + +- `weather.Beijing.humidity` — 表 `weather`,标签 `Beijing`,字段 `humidity` +- `sensor.s1.pressure` — 表 
`sensor`,标签 `s1`,字段 `pressure` + +> 序列名可由 `list_timeseries()` 获取,无需手工构造;亦可改用整数索引(`df[0]`)或元数据过滤 +> (`df[df["city"] == "Beijing"]`)选择序列。 + +## 加载 + +路径可以是单个文件、文件列表或目录: + +```python +from tsfile import TsFileDataFrame + +df = TsFileDataFrame(["data/weather.tsfile", "data/sensor.tsfile"]) +df = TsFileDataFrame("data/") # 递归查找目录下所有 .tsfile +print(df) +``` + +初始化时只扫描元数据,不读取实际数值。加载多个文件时会并行扫描元数据。 + +如果多个文件包含 **同名序列**(如按日分片的 `weather.Beijing.humidity`),会自动合并为一条连续序列。 +对于重复时间戳仅保留第一条——这并非预期情况,请在预处理阶段去重,以免造成元数据失真。 + +### DataFrame 的展示 + +`print(df)`(以及 `df.show(max_rows=...)`)打印序列元信息,数据量大时头尾截断。表头为: + +```text +index │ table │ │ ... │ field │ start_time │ end_time │ count +``` + +对于标签数量不同的设备:标签值按左对齐,较短的在末尾补 `None`。 + +```text +TsFileDataFrame(table model, 972 time series, 5 files) + table ps_id sn frac field start_time end_time count + 0 pvf 10 30100194A00234H00572 1 pac 2024-04-02 00:00:00 2024-10-28 23:45:00 20160 + 1 pvf 10 30100194A00234H00572 1 tenmeterswindspeed 2024-04-02 00:00:00 2024-10-28 23:45:00 20160 +... +``` + +### 关闭 + +`with` 语句会自动释放文件句柄,也可以手动关闭: + +```python +with TsFileDataFrame("data/") as df: + ... 
# 退出后自动关闭 + +tsdf = TsFileDataFrame("data/") +tsdf.close() # 也可以自己关闭 +``` + +## 浏览序列 + +`list_timeseries(path_prefix="")` 列出已加载文件中的序列名,可按前缀筛选;不传参返回全部序列。 + +```python +>>> df.list_timeseries("weather") +['weather.Beijing.humidity', 'weather.Beijing.temperature', + 'weather.Shanghai.humidity', 'weather.Shanghai.temperature'] +>>> df.list_timeseries("weather.Beijing") +['weather.Beijing.humidity', 'weather.Beijing.temperature'] +``` + +若需查看起止时间、点数等元信息,可打印 DataFrame(或其子集)——见[DataFrame 的展示](#dataframe-的展示)。 + +## 选取序列 + +`df[...]` 返回懒加载的 `Timeseries` 句柄(不触发读取),或返回子集视图: + +```python +ts = df["weather.Beijing.humidity"] # 按名称 +ts = df[0] # 按索引(支持负索引) + +sub_df = df[0:3] # 切片 -> TsFileDataFrame(视图) +sub_df = df[[0, 2, 5]] # 整数列表 -> TsFileDataFrame(视图) +sub_df = df[df["city"] == "Beijing"] # 按元数据过滤 -> TsFileDataFrame(视图) +``` + +```text +>>> df["weather.Beijing.humidity"] +Timeseries('weather.Beijing.humidity', count=2880, start=2026-01-27 00:00:00, end=2026-02-05 23:55:00) +``` + +序列元信息从缓存读取(无 I/O): + +```python +>>> ts = df["weather.Beijing.humidity"] +>>> ts.name +'weather.Beijing.humidity' +>>> len(ts) +2880 +>>> ts.stats +{'start_time': 1769443200000, 'end_time': 1770306900000, 'count': 2880} +``` + +## 读取数据 + +对 `Timeseries` 按行号索引时才触发实际的文件读取: + +```python +val = ts[20] # -> float +window = ts[20:100] # -> np.ndarray, shape = (80,) +last_ten = ts[-10:] # -> np.ndarray +sampled = ts[::2] # -> np.ndarray(步长采样) +ts.timestamps[20:100] # -> 对应行号的时间戳, np.ndarray +``` + +```text +>>> ts[20] +46.1 +>>> ts[20:100] +array([46.1 , 41.72, 52.94, ..., 76.3 , 84.35]) +>>> ts.timestamps[20:100] +array([1769449200000, 1769449500000, ..., 1769472900000]) +``` + +## 多序列对齐查询 + +当需要多条序列在同一时间轴上严格对齐时,使用 `.loc`: + +```python +data = df.loc[start_time:end_time, [ + "weather.Beijing.humidity", + "weather.Beijing.temperature", + "sensor.s1.pressure", +]] +``` + +返回的 `AlignedTimeseries` 将所有序列对齐到时间戳的 **并集**,缺失位置填充 `NaN`: + +```python +data.timestamps # np.ndarray,毫秒时间戳 +data.values # np.ndarray, 
shape = (N, 3) +data.series_names # ["weather.Beijing.humidity", ...] +data.shape # (N, 3) +data[0:10] # 前 10 行, np.ndarray shape = (10, 3) +data.show(50) # 最多显示 50 行 +``` + +序列可按名称或索引指定,并可混用: + +```python +df.loc[start_time:end_time, [0, 1, 4]] +df.loc[start_time:end_time, [0, "weather.Beijing.temperature", 4]] +``` + +```text +>>> df.loc[1769616000000:1769702100000, +... ['weather.Beijing.temperature', 'weather.Beijing.humidity', 'sensor.s2.pressure']] +AlignedTimeseries(288 rows, 3 series) + timestamp weather.Beijing.temperature weather.Beijing.humidity sensor.s2.pressure +2026-01-29 00:00:00 29.12 92.87 NaN +2026-01-29 00:05:00 1.55 87.34 NaN +... +``` + +该美化视图仅展示值列;如需读取对齐后的时间戳列,请使用 `df.loc[...].timestamps`。 diff --git a/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md b/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md index c08b06d1b..5c0e8a0e7 100644 --- a/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md +++ b/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md @@ -32,12 +32,44 @@ typedef enum { TS_DATATYPE_FLOAT = 3, TS_DATATYPE_DOUBLE = 4, TS_DATATYPE_TEXT = 5, - TS_DATATYPE_STRING = 11 + TS_DATATYPE_TIMESTAMP = 8, + TS_DATATYPE_DATE = 9, + TS_DATATYPE_BLOB = 10, + TS_DATATYPE_STRING = 11, + TS_DATATYPE_INVALID = 255 } TSDataType; -typedef enum column_category { TAG = 0, FIELD = 1 } ColumnCategory; +// 值编码 +typedef enum { + TS_ENCODING_PLAIN = 0, + TS_ENCODING_DICTIONARY = 1, + TS_ENCODING_RLE = 2, + TS_ENCODING_TS_2DIFF = 4, + TS_ENCODING_GORILLA = 8, + TS_ENCODING_ZIGZAG = 9, + TS_ENCODING_SPRINTZ = 12, + TS_ENCODING_INVALID = 255 +} TSEncoding; + +// 压缩类型,默认值为 LZ4。 +typedef enum { + TS_COMPRESSION_UNCOMPRESSED = 0, + TS_COMPRESSION_SNAPPY = 1, + TS_COMPRESSION_GZIP = 2, + TS_COMPRESSION_LZO = 3, + TS_COMPRESSION_LZ4 = 7, + TS_COMPRESSION_INVALID = 255 +} CompressionType; + +typedef enum column_category { + TAG = 0, + FIELD = 1, + 
ATTRIBUTE = 2, + TIME = 3 +} ColumnCategory; // ColumnSchema:表示单个列的模式,包括列名、数据类型和分类。 +// 列的编码/压缩遵循全局默认值(见下文“配置”)。 typedef struct column_schema { char* column_name; TSDataType data_type; @@ -59,6 +91,8 @@ typedef struct result_set_meta_data { } ResultSetMetaData; ``` +> `ColumnSchema` 不携带编码/压缩——它们遵循全局默认值(见[配置](#配置编码与压缩))。 + ## 写入接口 ### 创建/关闭 TsFile 写入文件 @@ -250,6 +284,38 @@ ERRNO tsfile_writer_write(TsFileWriter writer, Tablet tablet); +## 配置(编码与压缩) + +列按其数据类型的 **全局默认** 编码与压缩存储(`ColumnSchema` 不携带编解码设置)。 +请在创建写入器 *之前* 用下列函数修改这些默认值。 + +每个 setter 成功返回 `RET_OK`(0),遇到不支持的数据类型/编码或压缩组合返回 `RET_NOT_SUPPORT`(40)。 + +```C +/* 按数据类型的默认值编码,以及默认压缩。 */ +int set_datatype_encoding(uint8_t data_type, uint8_t encoding); +int set_global_compression(uint8_t compression); +uint8_t get_datatype_encoding(uint8_t data_type); +uint8_t get_global_compression(); + +/* 时间列(时间数据类型固定为 INT64)。 */ +int set_global_time_encoding(uint8_t encoding); +int set_global_time_compression(uint8_t compression); +uint8_t get_global_time_encoding(); +uint8_t get_global_time_compression(); +``` + +允许的取值:编码方面,`BOOLEAN` 仅 `PLAIN`;`INT32`/`INT64`/`DATE` 为 +`PLAIN`/`TS_2DIFF`/`GORILLA`/`ZIGZAG`/`RLE`/`SPRINTZ`;`FLOAT`/`DOUBLE` 为 +`PLAIN`/`TS_2DIFF`/`GORILLA`/`SPRINTZ`;`STRING`/`TEXT` 为 `PLAIN`/`DICTIONARY`。 +压缩可取 `UNCOMPRESSED`、`SNAPPY`、`GZIP`、`LZO`、`LZ4`。 + +```C +// 例如:所有列均以 LZ4 压缩写入 +ERRNO code = set_global_compression(TS_COMPRESSION_LZ4); +if (code != RET_OK) { /* 处理不支持的取值 */ } +``` + ## 读取接口 ### TsFile Reader 创建/关闭 @@ -277,7 +343,7 @@ ERRNO tsfile_reader_close(TsFileReader reader); -### 查询表 / 获取下一行 / 按行查询 +### 查询表 / 获取下一行 ```C @@ -313,45 +379,129 @@ bool tsfile_result_set_next(ResultSet result_set, ERRNO* error_code); * @param result_set [输入] 有效的 ResultSet 句柄指针。 */ void free_tsfile_result_set(ResultSet* result_set); +``` + + + +### 按标签过滤 + +**标签列(TAG)** 构成设备的唯一标识(联合主键)——正是它们的取值在一个表内 +区分不同的设备。*标签过滤器* 把查询限定到标签取值满足条件的设备,从而只读取你关心的设备数据。 +用 reader 构造一个过滤器,传给下文的表查询函数,用完再用 `tsfile_tag_filter_free()` 释放。 + +```C +// 
标签过滤器的不透明句柄,用下面的函数构造。 +typedef void* TagFilterHandle; + +// 单列标签谓词的比较运算符。 +typedef enum { + TAG_FILTER_EQ = 0, // 列 == 值 + TAG_FILTER_NEQ = 1, // 列 != 值 + TAG_FILTER_LT = 2, // 列 < 值 + TAG_FILTER_LTEQ = 3, // 列 <= 值 + TAG_FILTER_GT = 4, // 列 > 值 + TAG_FILTER_GTEQ = 5, // 列 >= 值 + TAG_FILTER_REGEXP = 6, // 列匹配正则 值 + TAG_FILTER_NOT_REGEXP = 7, // 列不匹配正则 值 +} TagFilterOp; /** - * @brief 按行查询时间序列数据(树模型),支持偏移量与行数限制 - * - * @param reader [in] 有效的 TsFileReader 句柄,通过 tsfile_reader_new() 获取 - * @param device_ids [in] 设备 ID 数组 - * @param device_ids_len [in] 设备 ID 的数量 - * @param measurement_names [in] 测量项(传感器)名称数组 - * @param measurement_names_len [in] 测量项名称的数量 - * @param offset [in] 需要跳过的起始行数(必须 >= 0) - * @param limit [in] 最多返回的行数,< 0 表示不限制 - * @param err_code [out] 错误码,成功返回 E_OK(0) - * @return 成功返回结果集 ResultSet 句柄,失败返回 NULL + * @brief 创建单列标签谓词:` `。 + * + * @param reader [输入] 有效的 TsFileReader 句柄。 + * @param table_name [输入] 其 schema 定义了这些标签列的表名。 + * @param column_name [输入] 要过滤的标签列名。 + * @param value [输入] 比较值(标签列为 STRING 类型)。 + * @param op [输入] 比较运算符(TagFilterOp)。 + * @param err_code [输出] 成功返回 RET_OK(0),否则返回 errno_define_c.h 中的错误码。 + * @return 成功返回 TagFilterHandle,失败返回 NULL。 */ -ResultSet tsfile_reader_query_tree_by_row(TsFileReader reader, - char** device_ids, int device_ids_len, - char** measurement_names, - int measurement_names_len, int offset, - int limit, ERRNO* err_code); +TagFilterHandle tsfile_tag_filter_create(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value, TagFilterOp op, + ERRNO* err_code); + +/** + * @brief 创建范围谓词:lower <= 列 <= upper(is_not 为 true 表示 NOT BETWEEN)。 + */ +TagFilterHandle tsfile_tag_filter_between(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* lower, const char* upper, + bool is_not, ERRNO* err_code); + +// 组合谓词。AND/OR/NOT 会接管其子节点的所有权,只需释放根节点。 +TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left, TagFilterHandle right); +TagFilterHandle 
tsfile_tag_filter_or(TagFilterHandle left, TagFilterHandle right); +TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter); + +// 释放标签过滤器及其全部子节点。 +void tsfile_tag_filter_free(TagFilterHandle filter); +``` +### 带标签过滤、分页与分批的表查询 + +下列查询函数接受一个可选的 `tag_filter`(传 `NULL` 表示不过滤)和 `batch_size` +(`<= 0` 逐行返回;`> 0` 按该大小返回数据块)。 + +```C /** - * @brief 按行查询表模型数据,支持偏移量与行数限制下推 - * - * @param reader [in] 有效的 TsFileReader 句柄,通过 tsfile_reader_new() 获取 - * @param table_name [in] 目标表名 - * @param column_names [in] 要查询的列名数组 - * @param column_names_len [in] 要查询的列数量 - * @param offset [in] 需要跳过的起始行数(必须 >= 0) - * @param limit [in] 最多返回的行数,< 0 表示不限制 - * @param err_code [out] 错误码,成功返回 E_OK(0) - * @return 成功返回结果集 ResultSet 句柄,失败返回 NULL + * @brief 按行查询表,支持偏移量/行数限制下推与可选的标签过滤。 + * + * @param reader [输入] 有效的 TsFileReader 句柄。 + * @param table_name [输入] 目标表名。 + * @param column_names [输入] 要查询的列名数组。 + * @param column_names_len [输入] 要查询的列数量。 + * @param offset [输入] 需要跳过的起始行数(>= 0)。 + * @param limit [输入] 最多返回行数;< 0 表示不限制。 + * @param tag_filter [输入] 标签谓词,NULL 表示不过滤。 + * @param batch_size [输入] <= 0 逐行;> 0 数据块大小。 + * @param err_code [输出] 成功返回 RET_OK(0),否则返回错误码。 + * @return 成功返回 ResultSet 句柄,失败返回 NULL。用 free_tsfile_result_set() 释放。 */ ResultSet tsfile_reader_query_table_by_row( TsFileReader reader, const char* table_name, char** column_names, int column_names_len, int offset, int limit, TagFilterHandle tag_filter, int batch_size, ERRNO* err_code); + +/** + * @brief 在时间范围内查询表,支持可选的标签过滤与分批。 + * + * @param batch_size <= 0 逐行返回;> 0 返回该大小的 TsBlock。 + */ +ResultSet tsfile_query_table_batch(TsFileReader reader, const char* table_name, + char** columns, uint32_t column_num, + Timestamp start_time, Timestamp end_time, + TagFilterHandle tag_filter, int batch_size, + ERRNO* err_code); + +/** + * @brief 带标签过滤的表查询(时间范围 + 标签谓词)。 + * + * @param batch_size <= 0 逐行返回;> 0 返回该大小的 TsBlock。 + */ +ResultSet tsfile_query_table_with_tag_filter( + TsFileReader reader, const char* table_name, char** columns, + uint32_t 
column_num, Timestamp start_time, Timestamp end_time, + TagFilterHandle tag_filter, int batch_size, ERRNO* err_code); ``` +示例——只读取 `region` 标签等于 `shanghai` 的设备的 `temperature`: + +```C +ERRNO ec = RET_OK; +TagFilterHandle f = tsfile_tag_filter_create( + reader, "weather", "region", "shanghai", TAG_FILTER_EQ, &ec); +char* cols[] = {"temperature"}; +ResultSet rs = tsfile_reader_query_table_by_row( + reader, "weather", cols, 1, /*offset*/ 0, /*limit*/ -1, f, /*batch*/ 0, &ec); + +// ... 用 tsfile_result_set_next() 遍历 rs,然后释放: +free_tsfile_result_set(&rs); +tsfile_tag_filter_free(f); +``` ### 从结果集中获取数据 diff --git a/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md b/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md index bc43211d1..078a17cb8 100644 --- a/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md +++ b/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md @@ -20,6 +20,65 @@ --> # 接口定义 - C++ +## 数据类型、编码与压缩 + +下列枚举为读写接口共用。其数值编码同时也是磁盘上存储的取值。 + +```cpp +// 支持的测点/列数据类型。 +enum TSDataType : uint8_t { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + FLOAT = 3, + DOUBLE = 4, + TEXT = 5, + TIMESTAMP = 8, + DATE = 9, + BLOB = 10, + STRING = 11, +}; + +// 值编码。各编码适用于哪些类型见下表。 +enum TSEncoding : uint8_t { + PLAIN = 0, + DICTIONARY = 1, + RLE = 2, + TS_2DIFF = 4, + GORILLA = 8, + ZIGZAG = 9, + SPRINTZ = 12, +}; + +// 压缩类型。SNAPPY/GZIP/LZO/LZ4 取决于构建选项;默认压缩为 LZ4。 +enum CompressionType : uint8_t { + UNCOMPRESSED = 0, + SNAPPY = 1, + GZIP = 2, + LZO = 3, + LZ4 = 7, +}; + +// 列在表 schema 内的角色。 +enum class ColumnCategory { TAG = 0, FIELD = 1, ATTRIBUTE = 2, TIME = 3 }; +``` + +各数据类型适用的编码: + +| 编码 | 适用类型 | +|---|---| +| `PLAIN` | 所有类型 | +| `DICTIONARY` | `TEXT`、`STRING` | +| `RLE` | `INT32`、`INT64`、`TIMESTAMP`、`DATE` | +| `TS_2DIFF` | `INT32`、`INT64`、`TIMESTAMP`、`DATE`、`FLOAT`、`DOUBLE` | +| `GORILLA` | `INT32`、`INT64`、`TIMESTAMP`、`DATE`、`FLOAT`、`DOUBLE` | +| `ZIGZAG` | 
`INT32`、`INT64` | +| `SPRINTZ` | `INT32`、`INT64`、`FLOAT`、`DOUBLE` | + +各类型的默认值编码:`BOOLEAN → PLAIN`、`INT32 / INT64 → TS_2DIFF`、 +`FLOAT / DOUBLE → GORILLA`、`TEXT / STRING / BLOB → PLAIN`。默认压缩为 `LZ4`。 +覆盖方式见[配置编码与压缩](#配置编码与压缩)。 + ## 写入接口 ### TsFileTableWriter @@ -27,89 +86,55 @@ 用于写入 TsFile. ```cpp -namespace storage { -class RestorableTsFileIOWriter; - /** - * @brief 支持按照指定表结构,将结构化表数据写入 TsFile 文件 + * @brief 用于将结构化表格数据写入具有指定模式的 TsFile。 * - * TsFileTableWriter 类用于将结构化数据(特别适用于时序数据) - * 写入专为高效存储与查询优化的 TsFile 文件。 - * 使用者可定义待写入表的结构,按照该结构添加数据行, - * 并将数据序列化为 TsFile。 - * 同时,该类提供写入过程中的内存使用限制能力。 + * TsFileTableWriter 类被设计用于写入结构化数据,特别适合时序数据, + * 数据将被写入一种为高效存储与检索优化的文件格式(即 TsFile)。该类允许用户定义 + * 所需写入表的模式,按照该模式添加数据行,并将这些数据序列化写入 TsFile。 + * 此外,还提供了在写入过程中限制内存使用的选项。 */ class TsFileTableWriter { public: /** - * TsFileTableWriter 用于根据指定的表结构,将表数据写入目标文件, - * 并可选择性地限制内存使用量。 + * TsFileTableWriter 用于将表格数据写入具有指定模式的目标文件, + * 可选地限制内存使用。 * - * @param writer_file 表数据的目标写入文件,不能为空指针 - * @param table_schema 用于构建表结构,定义待写入表的 schema - * @param memory_threshold 可选参数。当已写入数据量超过该阈值时, - * 数据将自动刷新到磁盘。默认值为 128MB + * @param writer_file 要写入表数据的目标文件。不能为空。 + * @param table_schema 用于构造表结构,定义正在写入表的模式。 + * @param memory_threshold 可选参数。当写入数据的大小超过该值时, + * 数据将自动刷新到磁盘。默认值为 128MB。 */ - template - explicit TsFileTableWriter(storage::WriteFile* writer_file, T* table_schema, - uint64_t memory_threshold = 128 * 1024 * 1024) { - static_assert(!std::is_same::value, - "table_schema cannot be nullptr"); - tsfile_writer_ = std::make_shared(); - tsfile_writer_->init(writer_file); - tsfile_writer_->set_generate_table_schema(false); - - // 执行深拷贝。源 TableSchema 对象可能分配在栈/堆上 - auto table_schema_ptr = std::make_shared(*table_schema); - error_number = tsfile_writer_->register_table(table_schema_ptr); - exclusive_table_name_ = table_schema->get_table_name(); - common::g_config_value_.chunk_group_size_threshold_ = memory_threshold; - } + TsFileTableWriter(WriteFile* writer_file, + TableSchema* table_schema, + uint64_t memory_threshold = 128 
* 1024 * 1024); + ~TsFileTableWriter(); /** - * 通过可恢复的 TsFileIOWriter 构建 TsFileTableWriter, - * 支持在故障恢复后追加表数据。Schema 从已恢复的文件中读取, - * 无需额外传入 TableSchema。 + * 将给定的 Tablet 数据按照表的模式写入目标文件。 * - * @param restorable_writer 已恢复的 I/O 写入器;不能为空指针, - * 且必须以截断模式打开,保证 can_write() 返回 true - * @param memory_threshold 可选的缓存数据内存阈值 + * @param tablet 包含待写入数据的 Tablet。不能为空。 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ - explicit TsFileTableWriter( - storage::RestorableTsFileIOWriter* restorable_writer, - uint64_t memory_threshold = 128 * 1024 * 1024); + int write_table(const Tablet& tablet); /** - * 向写入器注册表结构 - * - * @param table_schema 待注册的表结构,不能为空指针 - * @return 成功返回 0,失败返回非零错误码 - */ - int register_table(const std::shared_ptr& table_schema); - /** - * 根据表结构,将指定的 Tablet 数据写入目标文件 + * 将所有缓冲数据刷新到底层存储介质,确保所有数据都已写出。 + * 此方法确保所有未完成的写入操作被持久化。 * - * @param tablet 包含待写入数据的 Tablet,不能为空指针 - * @return 成功返回 0,失败返回非零错误码 - */ - int write_table(Tablet& tablet) const; - /** - * 将所有缓存数据刷新到底层存储介质,确保所有数据都被持久化。 - * 该方法保证所有待写入数据都被落盘。 - * - * @return 成功返回 0,失败返回非零错误码 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ + int flush(); /** * 关闭写入器并释放其占用的所有资源。 - * 调用该方法后,不应对当前实例执行任何后续操作。 + * 调用此方法后,不应再对该实例执行任何操作。 * - * @return 成功返回 0,失败返回非零错误码 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ + int close(); }; - -} // namespace storage ``` ### TableSchema @@ -147,46 +172,41 @@ class TableSchema { struct ColumnSchema { std::string column_name_; common::TSDataType data_type_; + common::CompressionType compression_; + common::TSEncoding encoding_; ColumnCategory column_category_; - /** - * @brief 使用给定参数构造一个 ColumnSchema 对象。 + /** + * @brief 使用显式的压缩与编码构造 ColumnSchema。 * * @param column_name 列的名称,必须为非空字符串。 - * 此名称用于在表中标识该列。 - * @param data_type 该列的数据类型,例如 INT32、DOUBLE、TEXT 等。 - * 数据类型决定了数据的存储与解释方式。 - * @param column_category 列的类别,用于标识其在模式中的角色或类型, - * 例如 FIELD(字段)、TAG(标签)。 - * 如果未指定,默认为 ColumnCategory::FIELD。 - * @note 调用者有责任确保 `column_name` 非空。 + * @param data_type 该列的数据类型(INT32、DOUBLE、TEXT 等)。 + * @param 
compression 该列 chunk 使用的压缩方式。 + * @param encoding 该列值使用的编码方式。 + * @param column_category 列的类别(FIELD、TAG 等),默认为 FIELD。 */ ColumnSchema(std::string column_name, common::TSDataType data_type, - ColumnCategory column_category = ColumnCategory::FIELD) : column_name_(std::move(column_name)), - data_type_(data_type), - column_category_(column_category) { - } -}; + common::CompressionType compression, common::TSEncoding encoding, + ColumnCategory column_category = ColumnCategory::FIELD); -/** - * @brief Represents the data type of a measurement. - * - * This enumeration defines the supported data types for measurements in the system. - */ -enum TSDataType : uint8_t { - BOOLEAN = 0, - INT32 = 1, - INT64 = 2, - FLOAT = 3, - DOUBLE = 4, - TEXT = 5, - STRING = 11 + /** + * @brief 使用引擎对该数据类型的默认编码与压缩构造 ColumnSchema。 + * + * @param column_name 列的名称,必须为非空字符串。 + * @param data_type 该列的数据类型。 + * @param column_category 列的类别,默认为 FIELD。 + */ + ColumnSchema(std::string column_name, common::TSDataType data_type, + ColumnCategory column_category = ColumnCategory::FIELD); }; - ``` +> `TAG` 列是设备的唯一标识(联合主键),数据类型固定为 `STRING`;`FIELD` 列存储测量值。 +> 在 `ColumnSchema` 上设置的编码与压缩会在写入时作用于该列;双参数构造函数则回退到按类型的默认值。 + ### Tablet + ```cpp /** * @brief 表示用于插入到表中的数据行集合及其相关元数据。 @@ -251,151 +271,110 @@ public: }; ``` -### RestorableTsFileIOWriter -> V2.3.1 +### 配置编码与压缩 -```cpp -namespace storage { -/** - * RestorableTsFileIOWriter 用于打开 TsFile 并对其进行可选的恢复操作 - * 继承自 TsFileIOWriter,支持在文件恢复后继续写入 - * - * (1) 若 TsFile 正常关闭:has_crashed()=false,can_write()=false - * - * (2) 若 TsFile 不完整/程序崩溃:has_crashed()=true, - * can_write()=true,写入器会截断损坏数据并允许继续写入 - * - * 基于标准 C++11 实现,通过 RAII 和智能指针避免内存泄漏 - */ -class RestorableTsFileIOWriter : public TsFileIOWriter { - public: - RestorableTsFileIOWriter(); +编码与压缩 **按数据类型** 选取:每种类型都有默认值(见上表)。你可以修改这些默认值, +也可以在 schema 上传入显式的编码/压缩。 - /** - * 打开 TsFile 用于恢复/追加写入 - * 使用 O_RDWR|O_CREAT 模式,不使用 O_TRUNC,因此会保留文件原有内容 - * - * @param file_path TsFile 文件路径 - * @param truncate_corrupted 若为 true,则截断损坏的数据; 
- * 若为 false,则不截断(不完整文件保持原样) - * @return 成功返回 E_OK,失败返回错误码 - */ - int open(const std::string& file_path, bool truncate_corrupted = true); - - /** - * 关闭文件 - */ - void close(); -}; +**1. 在 schema 上指定**:在构造 `ColumnSchema` 时传入显式的编码与压缩: -} // namespace storage +```cpp +// 将 "temperature" 列以 TS_2DIFF + LZ4 存储。 +common::ColumnSchema col("temperature", common::INT64, + common::LZ4, common::TS_2DIFF, + common::ColumnCategory::FIELD); ``` +**2. 按类型的默认值**:在创建写入器 *之前* 修改默认值;它们会作用于所有未在 schema 中 +指定自身编码/压缩的列。这些函数位于 `common`/`storage` 命名空间,会校验参数(不支持的组合返回 +`E_NOT_SUPPORT`): + +```cpp +// 按数据类型的默认值编码,以及默认压缩。 +int common::set_datatype_encoding(uint8_t data_type, uint8_t encoding); +int common::set_global_compression(uint8_t compression); +uint8_t common::get_datatype_encoding(uint8_t data_type); +uint8_t common::get_global_compression(); + +// 时间列的编码/压缩(数据类型固定为 INT64)。 +int common::set_global_time_encoding(uint8_t encoding); +int common::set_global_time_compression(uint8_t compression); +``` ## 读取接口 ### Tsfile Reader ```cpp /** - * @brief TsFileReader 提供查询所有后缀为 .tsfile 的文件的能力 + * @brief TsFileReader 提供了查询所有以 .tsfile 为后缀的文件的能力。 * - * TsFileReader 专为查询 .tsfile 文件设计,支持树模型查询和表模型查询, - * 同时支持查询表结构(TableSchema)、时间序列结构(TimeseriesSchema)等元数据。 + * TsFileReader 旨在用于查询 .tsfile 文件,支持表模型查询, + * 并支持查询元数据信息,如 TableSchema。 */ + class TsFileReader { public: TsFileReader(); + ~TsFileReader(); /** - * @brief 打开 tsfile 文件 + * @brief 打开 tsfile 文件。 * - * @param file_path 待打开的 tsfile 文件路径 - * @return 成功返回0,失败返回非零错误码 + * @param file_path 要打开的 tsfile 文件路径。 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ - int open(const std::string& file_path); + + int open(const std::string &file_path); /** - * @brief 关闭 tsfile 文件,该方法应在查询完成后调用 + * @brief 关闭 tsfile,查询完成后应调用此方法。 * - * @return 成功返回0,失败返回非零错误码 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ int close(); /** - * @brief 通过查询表达式查询 tsfile 文件,用户可自行构造查询表达式进行查询 + * @brief 通过查询表达式对 tsfile 进行查询,用户可以自行构造查询表达式来查询 tsfile。 * - * @param [in] qe 查询表达式 - * @param 
[out] ret_qds 结果集 - * @return 成功返回0,失败返回非零错误码 + * @param [in] qe 查询表达式。 + * @param [out] ret_qds 查询结果集。 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ - int query(storage::QueryExpression* qe, ResultSet*& ret_qds); + int query(storage::QueryExpression *qe, ResultSet *&ret_qds); /** - * @brief 通过路径列表、起始时间和结束时间查询 tsfile 文件 - * 该方法用于树模型下的 tsfile 文件查询 + * @brief 通过表名、列名、起始时间和结束时间查询 tsfile。 * - * @param [in] path_list 路径列表 - * @param [in] start_time 起始时间 - * @param [in] end_time 结束时间 - * @param [out] result_set 结果集 + * @param [in] table_name 表名。 + * @param [in] columns_names 列名列表。 + * @param [in] start_time 起始时间。 + * @param [in] end_time 结束时间。 + * @param [out] result_set 查询结果集。 */ - int query(std::vector& path_list, int64_t start_time, - int64_t end_time, ResultSet*& result_set); + int query(const std::string &table_name, + const std::vector &columns_names, int64_t start_time, + int64_t end_time, ResultSet *&result_set); + /** - * @brief 通过表名、列名、起始时间和结束时间查询 tsfile 文件 - * 该方法用于表模型下的 tsfile 文件查询 + * @brief 通过表名、列名、开始时间、结束时间和标签过滤器查询 tsfile。 * * @param [in] table_name 表名 - * @param [in] columns_names 列名列表 - * @param [in] start_time 起始时间 + * @param [in] columns_names 列名 + * @param [in] start_time 开始时间 * @param [in] end_time 结束时间 + * @param [in] tag_filter 标签过滤器 * @param [out] result_set 结果集 - * @param [in] batch_size 小于等于0表示逐行返回模式, - * 大于0表示按指定大小返回TsBlock数据块 */ int query(const std::string& table_name, const std::vector& columns_names, int64_t start_time, - int64_t end_time, ResultSet*& result_set, int batch_size = -1); + int64_t end_time, ResultSet*& result_set, Filter* tag_filter); /** - * @brief 通过表名、列名、起始时间、结束时间和标签过滤条件查询 tsfile 文件 - * 该方法用于表模型下的 tsfile 文件查询 + * @brief 按行查询表,支持偏移量/行数限制下推与可选的标签过滤。 * * @param [in] table_name 表名 - * @param [in] columns_names 列名列表 - * @param [in] start_time 起始时间 - * @param [in] end_time 结束时间 - * @param [in] tag_filter 标签过滤条件 + * @param [in] column_names 列名 + * @param [in] offset 需要跳过的起始行数(>= 0) + * @param [in] limit 最多返回行数;< 0 表示不限制 * 
@param [out] result_set 结果集 - */ - int query(const std::string& table_name, - const std::vector& columns_names, int64_t start_time, - int64_t end_time, ResultSet*& result_set, Filter* tag_filter, - int batch_size = 0); - - /** - * @brief 基于偏移量和限制条数,按行查询树模型时间序列数据 - * - * @param path_list 待查询的完整路径(设备.测量项) - * @param offset 需要跳过的起始行数(>=0) - * @param limit 最大返回行数,小于0表示无限制 - * @param[out] result_set 存储查询结果的结果集 - * @return 成功返回0,失败返回非零错误码 - */ - int queryByRow(std::vector& path_list, int offset, int limit, - ResultSet*& result_set); - - /** - * @brief 基于偏移量和限制条数下推,按行查询表模型数据 - * - * 对于密集型设备(所有列行数相同), - * 偏移量/限制条数会通过SSI下推至数据块/数据页级别, - * 无需解码即可跳过整个数据块/数据页。 - * 对于稀疏型设备,偏移量/限制条数在行合并阶段生效。 - * 当设备总行数处于偏移量范围内时,可直接跳过整个设备。 - * - * @param table_name 待查询的表名 - * @param column_names 待查询的列名 - * @param offset 需要跳过的起始行数(>=0) - * @param limit 最大返回行数,小于0表示无限制 - * @param[out] result_set 存储查询结果的结果集 - * @param tag_filter 可选的标签过滤条件,用于按标签列过滤数据 - * @return 成功返回0,失败返回非零错误码 + * @param [in] tag_filter 可选标签过滤器(用 TagFilterBuilder 构造),或 nullptr + * @param [in] batch_size <= 0 逐行返回;> 0 按该大小返回数据块 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ int queryByRow(const std::string& table_name, const std::vector& column_names, int offset, @@ -403,96 +382,23 @@ class TsFileReader { Filter* tag_filter = nullptr, int batch_size = 0); /** - * @brief 在树模型上执行表查询 + * @brief 销毁结果集,该方法应在查询完成并使用完 result_set 后调用。 * - * @param measurement_names 测量项名称列表 - * @param star_time 起始时间 - * @param end_time 结束时间 - * @param result_set 结果集 + * @param qds 查询结果集。 */ - int query_table_on_tree(const std::vector& measurement_names, - int64_t star_time, int64_t end_time, - ResultSet*& result_set); - /** - * @brief 销毁结果集,该方法应在查询完成、使用完结果集后调用 - * - * @param qds 结果集对象 - */ - void destroy_query_data_set(ResultSet* qds); - /** - * @brief 根据设备ID和测量项名称读取时间序列数据 - * - * @param device_id 设备ID - * @param measurement_name 测量项名称列表 - * @return 结果集对象 - */ - ResultSet* read_timeseries( - const std::shared_ptr& device_id, - const std::vector& 
measurement_name); - /** - * @brief 获取 tsfile 文件中的所有设备 - * - * @param table_name 表名 - * @return 设备ID列表 - */ - std::vector> get_all_devices( - std::string table_name); - - /** - * @brief 获取 tsfile 文件中的所有设备 - * - * @return 设备ID列表 - */ - std::vector> get_all_device_ids(); - - /** - * @brief 获取文件中的所有设备ID(与get_all_device_ids功能一致) - * - * @return 设备列表 - */ - std::vector> get_all_devices(); - - /** - * @brief 根据设备ID和测量项名称获取时间序列结构 - * - * @param [in] device_id 设备ID - * @param [out] result 测量项结构列表 - * @return 成功返回0,失败返回非零错误码 - */ - int get_timeseries_schema(std::shared_ptr device_id, - std::vector& result); - - /** - * @brief 获取指定设备的时间序列元数据 - * - * 仅文件中存在的设备会被包含在结果中 - * 若设备ID列表为空,返回空映射表 - * - * @param device_ids 待查询的设备列表 - * @return 映射关系:设备ID -> 时间序列元数据列表(仅包含存在的数据) - */ - DeviceTimeseriesMetadataMap get_timeseries_metadata( - const std::vector>& device_ids); - - /** - * @brief 获取文件中所有设备的时间序列元数据 - * - * @return 映射关系:设备ID -> 时间序列元数据列表 - */ - DeviceTimeseriesMetadataMap get_timeseries_metadata(); - + void destroy_query_data_set(ResultSet *qds); /** - * @brief 根据表名获取表结构 + * @brief 根据表名获取表的模式信息。 * - * @param table_name 表名 - * @return 表结构智能指针 + * @param table_name 表名。 + * @return std::shared_ptr 表的模式信息。 */ std::shared_ptr get_table_schema( - const std::string& table_name); + const std::string &table_name); /** - * @brief 获取 tsfile 文件中的所有表结构 + * @brief 获取 tsfile 中所有表的模式信息。 * - * @return 表结构列表 + * @return std::vector> 表模式信息列表。 */ std::vector> get_all_table_schemas(); }; @@ -503,9 +409,7 @@ class TsFileReader { * @brief ResultSet 是 TsFileReader 的查询结果集,用于访问查询结果。 * * ResultSet 是一个虚类,使用时应转换为相应的实现类。 - * @note 当使用树模型且过滤器是全局时间过滤器时,应转换为 QDSWithoutTimeGenerator。 - * @note 当使用树模型但过滤器不是全局时间过滤器时,应转换为 QDSWithTimeGenerator。 - * @note 如果查询使用的是表模型,则应转换为 TableResultSet。 + * @note 结果集的具体类型为 TableResultSet。 */ class ResultSet { public: diff --git a/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md 
b/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md index 906ca0112..00aae7fb2 100644 --- a/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md +++ b/src/zh/UserGuide/develop/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md @@ -38,14 +38,39 @@ class TSDataType(IntEnum): BLOB = 10 STRING = 11 +class TSEncoding(IntEnum): + """ + 写入器支持的值编码。每个成员后的注释列出其可用于哪些数据类型。 + """ + PLAIN = 0 # 所有类型 + DICTIONARY = 1 # STRING、TEXT + RLE = 2 # INT32、INT64、TIMESTAMP、DATE + TS_2DIFF = 4 # INT32、INT64、TIMESTAMP、DATE、FLOAT、DOUBLE + GORILLA = 8 # INT32、INT64、TIMESTAMP、DATE、FLOAT、DOUBLE + ZIGZAG = 9 # INT32、INT64 + SPRINTZ = 12 # INT32、INT64、FLOAT、DOUBLE + +class Compressor(IntEnum): + """ + 写入器支持的压缩,默认值为 LZ4。 + """ + UNCOMPRESSED = 0 + SNAPPY = 1 + GZIP = 2 + LZO = 3 + LZ4 = 7 + class ColumnCategory(IntEnum): """ TsFile 中的列类别枚举。 - TAG:表示标签列,用于存储元数据。 - FIELD:表示测点列,用于存储实际数据值。 + TAG:标签列 + FIELD:测点列,存储测量值。 + ATTRIBUTE / TIME:保留的列角色。 """ TAG = 0 FIELD = 1 + ATTRIBUTE = 2 + TIME = 3 class ColumnSchema: """定义表中某一列的模式(名称、数据类型、类别)。""" @@ -92,9 +117,11 @@ class TsFileTableWriter: """ :param path: tsfile 文件路径,如果不存在则会创建。 :param table_schema: 描述要写入表的结构信息。 + :param memory_threshold: 触发自动刷盘前缓冲的字节数(默认 128MB)。 :return: 无返回值。 """ - def __init__(self, path: str, table_schema: TableSchema) + def __init__(self, path: str, table_schema: TableSchema, + memory_threshold: int = 128 * 1024 * 1024) """ 将一个 Tablet 写入 TsFile 中的表中。 @@ -103,12 +130,30 @@ class TsFileTableWriter: """ def write_table(self, tablet: Tablet) + """ + 将一个 pandas DataFrame 写入表中。列的编码/压缩遵循表 schema(或引擎默认值)。 + :param dataframe: 要写入的数据。 + :return: 无返回值。 + """ + def write_dataframe(self, dataframe: pandas.DataFrame) + + """ + 将缓冲数据刷新到磁盘。 + :return: 无返回值。 + """ + def flush(self) + """ 关闭 TsFileTableWriter,并自动刷新数据。 :return: 无返回值。 """ def close(self) + # 可作为上下文管理器使用: + # with TsFileTableWriter(path, schema) as w: + # w.write_table(tablet) + def __enter__(self) + def 
__exit__(self, exc_type, exc_val, exc_tb) ``` @@ -134,147 +179,116 @@ class Tablet(object) ``` -## 读取接口 - -### TsFileReader +### dataframe_to_tsfile ```python -class TsFileReader: +def dataframe_to_tsfile(dataframe: pd.DataFrame, + file_path: str, + table_name: Optional[str] = None, + time_column: Optional[str] = None, + tag_column: Optional[list[str]] = None) """ - 从 TsFile 中查询表格数据、时序数据,提供标准化的文件读取与查询接口, - 支持表模型查询、树模型查询、元数据获取、资源管控等全量核心能力。 + 将 pandas DataFrame 写入 TsFile。 + + :param dataframe: 要写入的数据。 + :param file_path: 目标 .tsfile 路径。 + :param table_name: 输出表名。 + :param time_column: 用作时间戳列的列名。 + :param tag_column: 作为 TAG 列处理的列名列表。 """ +``` - def __init__(self, pathname: str): - """ - 初始化指定路径的 TsFile 读取器,完成文件加载与底层读取器初始化, - 同时维护当前所有活跃的查询结果集,确保读取器关闭时同步失效所有结果集。 - :param pathname: 待读取的 TsFile 文件的完整路径 - :return: 无返回值 - """ +## 配置 - def query_table(self, table_name: str, column_names: List[str], - start_time: int = np.iinfo(np.int64).min, - end_time: int = np.iinfo(np.int64).max, - tag_filter: Optional[object] = None, - batch_size: int = 0) -> object: - """ - 对指定的表和列执行时间范围查询,支持标签过滤与批量读取模式。 - 可适配逐行返回与固定大小数据块返回两种模式,满足不同场景的读取需求。 - - :param table_name: 要查询的目标表名,不区分大小写 - :param column_names: 要检索的目标列名列表,为空时默认查询全列 - :param start_time: 查询范围的起始时间戳,默认值为 int64 类型最小值 - :param end_time: 查询范围的结束时间戳,默认值为 int64 类型最大值 - :param tag_filter: 可选参数,基于标签列的过滤条件,支持等值、范围、逻辑组合过滤 - :param batch_size: 批量读取大小,小于等于0时启用逐行返回模式,大于0时按指定大小返回数据块 - :return: 封装完成的查询结果集处理器,可用于遍历、读取数据、获取元数据 - """ +全局写入默认值——包括各类型的默认编码、默认压缩、时间列编码/压缩——以一个字典暴露。 +请在创建写入器 **之前** 修改它们。 - def query_table_on_tree(self, column_names: List[str], - start_time: int = np.iinfo(np.int64).min, - end_time: int = np.iinfo(np.int64).max) -> object: - """ - 在树模型结构上执行表查询,适配原生树结构时序数据的查询场景, - 直接基于测量项名称查询,无需指定表名,路径名称区分大小写。 +```python +from tsfile import get_tsfile_config, set_tsfile_config +from tsfile import TSEncoding, Compressor - :param column_names: 待查询的测量项名称列表,对应树结构中的节点路径 - :param start_time: 查询范围的起始时间戳,默认值为 int64 类型最小值 - :param end_time: 
查询范围的结束时间戳,默认值为 int64 类型最大值 - :return: 树模型查询对应的结果集处理器 - """ +cfg = get_tsfile_config() # -> 包含所有配置项的 dict +# 例如 cfg["default_compression_type_"]、cfg["int64_encoding_type_"]、 +# cfg["time_encoding_type_"]、cfg["time_compress_type_"] 等。 - def query_tree_by_row(self, device_ids: List[str], measurement_names: List[str], - offset: int = 0, limit: int = -1) -> object: - """ - 按行分页查询树模型时序数据,支持偏移量跳过、最大返回行数限制, - 适配大数据量分页读取场景,避免单次加载过多数据导致内存溢出。 - - :param device_ids: 待查询的设备ID列表,不能为空 - :param measurement_names: 待查询的测量项名称列表,不能为空 - :param offset: 需要跳过的起始行数,默认从0开始 - :param limit: 最大返回行数,小于0表示不限制返回行数 - :return: 树模型分页查询的结果集处理器 - """ +set_tsfile_config({ + "default_compression_type_": Compressor.LZ4, + "int64_encoding_type_": TSEncoding.TS_2DIFF, +}) +``` - def query_table_by_row(self, table_name: str, column_names: List[str], - offset: int = 0, limit: int = -1, - tag_filter: Optional[object] = None, - batch_size: int = 0) -> object: - """ - 按行分页查询表模型数据,支持偏移量与行数限制下推,可结合标签过滤使用, - 密集型设备可在数据块级别跳过无效数据,大幅提升分页查询效率。 - - :param table_name: 待查询的目标表名 - :param column_names: 待查询的列名列表 - :param offset: 需要跳过的起始行数,默认从0开始 - :param limit: 最大返回行数,小于0表示不限制返回行数 - :param tag_filter: 可选参数,标签过滤条件,过滤符合条件的设备数据 - :param batch_size: 批量读取大小,适配底层数据块读取逻辑 - :return: 表模型分页查询的结果集处理器 - """ +`set_tsfile_config` 会校验每个取值,且只更新你传入的键。编码/压缩取值为 `TSEncoding` / `Compressor` +成员;类型与编码的适配限制同 C++ 接口。 - def query_timeseries(self, device_name: str, sensor_list: List[str], - start_time: int = 0, end_time: int = 0) -> object: - """ - 针对单个指定设备,执行时间范围时序数据查询, - 适配单设备多传感器的精准查询场景,简化查询调用逻辑。 - - :param device_name: 目标设备的名称/路径 - :param sensor_list: 待查询的传感器(测量项)名称列表 - :param start_time: 查询起始时间戳,为0时默认从文件最早时间开始 - :param end_time: 查询结束时间戳,为0时默认到文件最晚时间结束 - :return: 单设备时序查询的结果集处理器 - """ +## 读取接口 - def get_table_schema(self, table_name: str) -> object: - """ - 获取指定表的完整模式信息,包含列名、数据类型、标签列、时序约束等全量元数据, - 用于提前校验查询字段合法性、解析数据结构。 +### TsFileReader - :param table_name: 目标表名 - :return: 对应表的模式信息对象,包含表结构全量配置 - """ +```python +class TsFileReader: + """ + 从 
TsFile 中查询表格数据。 + """ - def get_all_table_schemas(self) -> Dict[str, object]: - """ - 获取当前 TsFile 文件中所有表的模式信息, - 一键遍历文件内全部数据表结构,无需逐个表查询。 + """ + 初始化指定路径的 TsFile 读取器。 + :param pathname: TsFile 文件的路径。 + :return: 无返回值。 + """ + def __init__(self, pathname) - :return: 字典结构,key为表名,value为对应表的模式信息对象 - """ + """ + 对指定的表和列执行时间范围查询。 - def get_all_timeseries_schemas(self) -> List[object]: - """ - 获取 TsFile 内所有时序序列的模式信息, - 覆盖树模型、表模型全量时序数据的字段、类型、约束信息。 + :param table_name: 要查询的表名。 + :param column_names: 要检索的列名列表。 + :param start_time: 查询范围的起始时间(默认:int64 最小值)。 + :param end_time: 查询范围的结束时间(默认:int64 最大值)。 + :return: 查询结果集处理器。 + """ + def query_table(self, table_name : str, column_names : List[str], + start_time : int = np.iinfo(np.int64).min, + end_time: int = np.iinfo(np.int64).max) -> ResultSet - :return: 所有时序模式信息组成的列表 - """ + """ + 按行查询表,支持偏移量/行数限制下推与可选的标签过滤。标签谓词把查询限定到 + 标签列取值满足条件的设备。用 tsfile.tag_filter 中的辅助函数构造过滤器 + (tag_eq、tag_neq、tag_lt、tag_lteq、tag_gt、tag_gteq、tag_between 等), + 并用 &、| 和 ~ 组合。 - def get_all_devices(self) -> List[str]: - """ - 获取 TsFile 文件内所有设备的标识信息, - 可遍历文件内全部设备,适配全设备统计、批量查询前置操作。 + :param table_name: 要查询的表名。 + :param column_names: 要检索的列名列表。 + :param offset: 需要跳过的起始行数(默认 0)。 + :param limit: 最多返回的行数;< 0 表示不限制。 + :param tag_filter: 可选的标签谓词(TagFilter),None 表示不过滤。 + :param batch_size: <= 0 逐行返回;> 0 按该大小返回数据块。 + :return: 查询结果集处理器。 + """ + def query_table_by_row(self, table_name : str, column_names : List[str], + offset : int = 0, limit : int = -1, + tag_filter = None, batch_size : int = 0) -> ResultSet - :return: 所有设备ID/设备路径组成的列表 - """ + """ + 获取指定表的模式信息。 - def get_timeseries_metadata(self, device_ids: Optional[List[str]] = None) -> Dict[str, object]: - """ - 获取指定设备的时序元数据,包含数据存储分段、字段约束、数据范围等信息, - 不传设备ID时默认返回全设备元数据,传入空列表返回空字典。 + :param table_name: 表名。 + :return: 指定表的模式信息。 + """ + def get_table_schema(self, table_name : str) -> TableSchema + + """ + 获取 TsFile 中所有表的模式信息。 + + :return: 一个将表名映射到其模式的字典。 + """ + def get_all_table_schemas(self) -> dict[str, TableSchema] + + 
""" + 关闭 TsFile 读取器。如果读取器中有活动的结果集,它们将失效。 + """ + def close(self) - :param device_ids: 可选参数,待查询元数据的设备ID列表 - :return: 字典结构,key为设备路径,value为对应设备的时序元数据组 - """ - def close(self) -> None: - """ - 关闭 TsFile 读取器,释放底层文件句柄、内存资源, - 同时将当前所有活跃的查询结果集标记为失效,禁止后续数据读取操作。 - 关闭后不可再次执行查询、元数据获取操作,需重新初始化读取器。 - """ ``` ### ResultSet @@ -371,7 +385,6 @@ def to_dataframe(file_path: str, 从 TsFile 中读取数据,并将其转换为 Pandas DataFrame 或 DataFrame 迭代器。 - 该函数同时支持表模型(table-model)和树模型(tree-model)的 TsFile。 用户可以通过表名、列名、时间范围以及最大行数对数据进行过滤。 Parameters diff --git a/src/zh/UserGuide/develop/Tools/Tsfile-CLI.md b/src/zh/UserGuide/develop/Tools/Tsfile-CLI.md new file mode 100644 index 000000000..f08f24e2b --- /dev/null +++ b/src/zh/UserGuide/develop/Tools/Tsfile-CLI.md @@ -0,0 +1,165 @@ + +# tsfile-cli + +`tsfile-cli` 是一个单一、对管道友好的 C++ 命令行工具,用于在 shell 中检视 **并** 导入 Apache +TsFile(`.tsfile`)文件。读取类命令将数据打印到 +**stdout**、诊断信息打印到 **stderr**,因此可与 `awk`、`jq`、`sort` 等组合使用;`write` 命令 +将 CSV/TSV 导入为新的 `.tsfile`。它构建于公开的 `TsFileReader` 与 `TsFileTableWriter` 接口之上。 + +## 从源码构建 + +CLI 是 C++ 模块的一部分。用 Maven 包装器构建即可——它会下载固定版本的 CMake,为你编译整个 +C++ 模块(`libtsfile` 共享库 + `tsfile-cli` 可执行文件)。 + +**前置条件**:用于运行 Maven 的 JDK(8+),以及 C++11 编译器(GCC / Clang)。第三方 C++ 依赖 +(Snappy、LZ4、LZOKAY、Zlib 等)已捆绑在 `cpp/third_party/` 下并自动构建。 + +在仓库根目录执行: + +```bash +./mvnw clean package -P with-cpp +``` + +会在 `cpp/target/build/` 下生成: + +| 产物 | 路径 | +|---|---| +| CLI 可执行文件 | `cpp/target/build/bin/tsfile-cli` | +| 共享库 | `cpp/target/build/lib/libtsfile.so`(Linux)——macOS 为 `libtsfile.dylib` | + +`tsfile-cli` 动态链接 `libtsfile`。用完整路径 **就地** 运行即可自动找到该库: + +```bash +cpp/target/build/bin/tsfile-cli --version # -> tsfile-cli (Apache TsFile C++) +cpp/target/build/bin/tsfile-cli --help +``` + +若要在 **别处** 运行该二进制(例如把它从构建目录拷出来后),动态加载器必须能找到 `libtsfile.so`。 +要么让加载器指向构建目录下的 `lib/`,要么把该库拷到标准位置: + +```bash +# 让加载器指向构建目录的 lib(Linux;macOS 用 DYLD_LIBRARY_PATH) +export LD_LIBRARY_PATH=/path/to/cpp/target/build/lib:$LD_LIBRARY_PATH + +# —— 或者 —— 把库拷到系统库路径 +sudo cp 
cpp/target/build/lib/libtsfile.so /usr/local/lib/ && sudo ldconfig +``` + +## 使用方式 + +```text +tsfile-cli <command> [options] <file> +tsfile-cli --help | --version | help +``` + +退出码:`0` 成功,`1` 用法/参数错误,`2` 文件打开/损坏,`3` 查询/运行时错误。 + +### 读取 + +| 命令 | 说明 | +|---|---| +| `ls` | 列出设备(树模型)或表(表模型),每行一个名称 | +| `schema` | 每序列的 `target, measurement, datatype, encoding, compression` | +| `meta` | 文件概要:模型、设备/表/序列数、时间范围、文件大小 | +| `stats` | 每序列的 `count, start_time, end_time, min, max, first, last, sum` | +| `count` | 每序列行数及一行 `total`(来自统计信息,不扫描 page) | +| `head` | 前 N 行(默认 10;用 `-n`) | +| `cat` | 所有匹配行,流式输出(`table` 格式会缓冲以对齐列) | +| `sample` | 可复现的蓄水池抽样(默认 10;`-n`、`--seed`) | + +元数据类命令(`ls` / `schema` / `meta` / `stats` / `count`)无需解码数据即可回答大多数问题。 + +通用选项: + +| 选项 | 含义 | +|---|---| +| `-f, --format csv\|tsv\|json\|table` | 输出格式;TTY 下默认 `table`,管道下默认 `tsv` | +| `-d, --device <name>` / `-t, --table <name>` | 限定到一个设备 / 表(互斥) | +| `-m, --measurements a,b,c` | 列投影(`schema`、`stats`、`count`、`head`、`cat`、`sample`) | +| `-n, --limit N` / `--offset N` | 最大行数 / 跳过行数(`head`、`cat`;`--offset` 不适用于 `sample`) | +| `--start <ms>` / `--end <ms>` | 闭区间的毫秒时间范围(`head`、`cat`、`sample`) | +| `--seed N` | 可复现抽样种子(仅 `sample`) | +| `--tag-filter C OP V` / `--tag-between C L U` / `--tag-not-between C L U` | `head`、`cat`、`sample` 的表标签谓词;`OP` 为 `eq`、`neq`、`lt`、`lteq`、`gt`、`gteq`、`regexp`、`not-regexp` | +| `--no-header` | 不输出表头行 | +| `--model tree\|table` | 强制指定模型(否则自动检测) | + +`json` 输出为 NDJSON(每行一个对象;数字/布尔裸输出,其他值加引号,空值为 `null`;非有限浮点数 +——NaN/Inf——变为 `null`)。CSV 输出遵循 RFC 4180。时间戳为原始毫秒时间戳。`table` 格式会在内存中 +缓冲所有行以对齐列,因此导出大文件时优先用 `csv`/`tsv`/`json`。 + +```bash +BIN=cpp/target/build/bin/tsfile-cli +$BIN ls -f tsv data.tsfile # 列出表 / 设备 +$BIN meta data.tsfile # 快速文件概览 +$BIN count -t table1 -f tsv data.tsfile # 行数,不扫描 page +$BIN cat -t table1 --tag-filter device eq dev_1 -m temp -f tsv data.tsfile +$BIN cat -m temp,humidity --start 1700000000000 -f csv data.tsfile | head +$BIN sample -m temp -n 20 --seed 42 -f json data.tsfile | jq .
+``` + +> 对于表模型文件,行命令(`head` / `cat` / `sample`)在不指定 `-t <table>` 时只查询 **第一个** 表; +> `count` 覆盖所有表。 + +### 写入(导入) + +`tsfile-cli write` 将 CSV/TSV 行导入为一个 **新的表模型** `.tsfile`(输出会被覆盖)。输入的第一列是 +时间戳(毫秒);其余列通过 `--columns` 显式声明——不做类型推断。 + +时间戳必须 **按设备严格递增**,设备由其 `tag` 列取值标识(共享相同标签的行构成同一设备的时间线)。 +不同标签组合的行可自由交错并复用时间戳。乱序输入会被拒绝并报出错行号,导入失败不会留下输出文件。 +`--output` 必须与输入文件不同。 + +```text +tsfile-cli write --table <name> --columns <spec> -o <file> \ + [-f csv|tsv] [--no-header] [--header-match] [-v] [<input> | -] +``` + +`--columns` 是逗号分隔的 `name:TYPE:category` 列表,其中 `category`(不区分大小写)为 `tag` 或 +`field`,`TYPE`(不区分大小写)为 `BOOLEAN, INT32, INT64, FLOAT, DOUBLE, STRING, TEXT, TIMESTAMP, +DATE, BLOB` 之一——例如 `--columns "id1:STRING:tag,s1:INT64:field"`。`DATE` 单元格写作 +`YYYY-MM-DD`,`TIMESTAMP` 单元格为毫秒。每列按其类型的引擎默认编码与压缩存储。 + +| 选项 | 含义 | +|---|---| +| `--table <name>` | 输出表名(会转小写) | +| `--columns <spec>` | 有序数据列(不含开头的时间戳列) | +| `-o, --output <file>` | 输出 `.tsfile`(必填;会被覆盖) | +| `<input>` / `-` | 输入文件,或 `-` / 省略表示 stdin | +| `-f csv\|tsv` | 输入分隔符(默认 csv;`json` / `table` 被拒绝) | +| `--no-header` | 输入无表头行(默认首行为表头并跳过) | +| `--header-match` | 校验表头名是否与 `--columns` 一致 | +| `-v, --verbose` | 向 stderr 打印 `wrote N rows to <file>`(否则成功时静默) | + +空单元格写为 null。成功时静默(Unix 风格),用 `-v` 打印一行摘要。 + +```bash +# 通过管道往返 +printf 'time,id1,s1\n0,dev,0\n1,dev,10\n' \ + | tsfile-cli write --table t1 --columns "id1:STRING:tag,s1:INT64:field" -o out.tsfile - +tsfile-cli count -f tsv out.tsfile # -> t1.dev s1 2 +``` + +## 与 AI 助手配合使用 skill + +`cpp/tools/skills/tsfile-cli/SKILL.md` 是一份机器可读的参考,描述了如何 +正确驱动 `tsfile-cli`。支持 skill 的 AI 编码助手可以加载它,从而辅助检视与导入 `.tsfile` +文件。 diff --git a/src/zh/UserGuide/develop/Tools/Tsfile-Viewer.md b/src/zh/UserGuide/develop/Tools/Tsfile-Viewer.md new file mode 100644 index 000000000..f613bc84a --- /dev/null +++ b/src/zh/UserGuide/develop/Tools/Tsfile-Viewer.md @@ -0,0 +1,89 @@ + +# tsfile-viewer + +[Apache TsFile Viewer](https://github.com/apache/tsfile-viewer) 是一个基于 Web 的应用, +用于在浏览器中浏览与分析 TsFile 数据。它由一个 Spring Boot 后端(通过 Apache TsFile 库读取 +`.tsfile` 文件)与一个 Vue 3 前端(渲染元数据、分页表格与交互式图表)组成。 + +- **仓库**:<https://github.com/apache/tsfile-viewer> +- **许可证**:Apache-2.0 +
+## 功能 + +- **文件浏览与上传**——在界面中打开 `.tsfile` 文件。 +- **元数据展示**——schema、设备与测点。 +- **分页数据表格**,支持按时间范围、设备、测点、数值范围过滤。 +- **交互式图表**(ECharts),支持多序列叠加与聚合。 +- **支持两种数据模型**——树模型与表模型 TsFile。 +- **导出**——数据导出为 CSV 或 JSON;图表导出为 PNG 或 SVG。 +- **性能**——chunk 级读取与元数据缓存。 + +## 环境要求 + +| 组件 | 版本 | +|---|---| +| JDK | 17 或 21(LTS) | +| Maven | 3.9+ | +| Node.js | `^20.19.0 \|\| >=22.12.0` | +| pnpm | 最新版 | +| Apache TsFile | 2.3.0(捆绑依赖) | + +## 获取源码 + +克隆仓库,然后按下文构建并运行: + +```bash +git clone https://github.com/apache/tsfile-viewer.git +cd tsfile-viewer +``` + +## 从源码运行(开发模式) + +在两个终端分别运行后端与前端。 + +**后端**(Spring Boot): + +```bash +cd backend +mvn spring-boot:run +``` + +**前端**(Vue + Vite 开发服务器): + +```bash +cd frontend +pnpm install +pnpm dev +``` + +然后打开开发界面 。 + +## 构建并运行生产包 + +构建一个自包含的发行包,再启动打好的 jar(前端由后端提供服务): + +```bash +./build-dist.sh +java -jar backend/target/tsfile-viewer-*.jar +``` + +打开应用 。 diff --git a/src/zh/UserGuide/latest/DataFrame/TsFileDataFrame.md b/src/zh/UserGuide/latest/DataFrame/TsFileDataFrame.md new file mode 100644 index 000000000..cff291dda --- /dev/null +++ b/src/zh/UserGuide/latest/DataFrame/TsFileDataFrame.md @@ -0,0 +1,270 @@ + +# TsFileDataFrame + +`TsFileDataFrame` 让你像操作 pandas DataFrame 一样读取一个或多个 TsFile 中的时序数据, +无需关心底层文件格式与数据加载细节。它是 Python 包的一部分(`pip install tsfile`)。 + +## 快速上手 + +```python +from tsfile import TsFileDataFrame + +df = TsFileDataFrame("table_data/") # 加载目录下所有 .tsfile +print(df) # 浏览所有序列(仅元数据) + +ts = df["weather.Beijing.humidity"] # 取一条序列(懒加载句柄) +window = ts[20:100] # 按行号切片 -> np.ndarray + +data = df.loc[start:end, [ # 按时间戳对齐多条序列 + "weather.Beijing.temperature", + "weather.Beijing.humidity", +]] +data.values # -> np.ndarray, shape = (N, 2) +``` + +## 核心类型 + +`TsFileDataFrame` 围绕三个核心类型: + +- **`TsFileDataFrame`**:入口对象,加载一至多个 TsFile 并提供统一视图。初始化时只扫描元数据, + **不读取实际数值**。 +- **`Timeseries`**:单条序列的懒加载句柄,通过 `df[...]` 获得。它携带序列元信息,但在按行号索引前 + 不读取任何数据。 +- **`AlignedTimeseries`**:多条序列在同一时间轴上的对齐结果,通过 `df.loc[...]` 获得,会一次性将 + 指定时间范围内的多条序列读入内存。 + +### 
TsFileDataFrame + +下表中 `df` 是一个 `TsFileDataFrame` 实例,由 `df = TsFileDataFrame(paths)` 创建。 + +| 示例 | 操作 | 返回类型 | +|---|---|---| +| `TsFileDataFrame(paths)` | 加载文件 / 文件列表 / 目录 | `TsFileDataFrame` | +| `len(df)` | 时间序列总数 | `int` | +| `df.list_timeseries("weather")` | 获取序列名,可按前缀筛选 | `List[str]` | +| `df["weather.Beijing.humidity"]`、`df[0]`、`df[-1]` | 获取单条序列 | `Timeseries` | +| `df["city"]` | 获取某元数据列(标签 / `field` / `start_time` / `end_time` / `count`) | `pandas.Series` | +| `df[0:3]`、`df[[0, 2, 5]]` | 获取子集视图 | `TsFileDataFrame` | +| `df[df["city"] == "Beijing"]` | 按元数据列过滤 | `TsFileDataFrame` | +| `df.loc[start:end, series_list]` | 按时间戳对齐查询 | `AlignedTimeseries` | +| `df.show(max_rows=20)` / `print(df)` | 格式化元数据表格 | — | +| `df.close()` | 释放文件句柄 | — | + +### Timeseries + +下表中 `ts` 是一条 `Timeseries`,由 `ts = df[...]` 获得。 + +| 示例 | 操作 | 返回类型 | +|---|---|---| +| `ts.name` | 序列名 | `str` | +| `len(ts)` | 序列点数 | `int` | +| `ts.stats` | 序列统计信息 | `dict`(`start_time`、`end_time`、`count`) | +| `ts[20]` | 单值读取 | `float`(空值为 `None`) | +| `ts[20:100]` | 行范围切片 | `np.ndarray` | +| `ts.timestamps` | 时间戳数组 | `np.ndarray` | + +### AlignedTimeseries + +下表中 `data` 是一个 `AlignedTimeseries`,由 `data = df.loc[...]` 获得。 + +| 示例 | 操作 | 返回类型 | +|---|---|---| +| `data.timestamps` | 时间戳数组 | `np.ndarray` | +| `data.values` | 值矩阵 | `np.ndarray`,shape `(N, M)` | +| `data.series_names` | 序列名列表 | `List[str]` | +| `data.shape` | 形状 `(N, M)`——N 为时间戳数,M 为序列数 | `tuple` | +| `len(data)` | 行数 | `int` | +| `data[0]`、`data[0:10]`、`data[0, 1]` | 行 / 元素索引 | `np.ndarray` / 标量 | +| `data.show(50)` / `print(data)` | 格式化输出(自动截断) | — | + +## 序列名 + +TsFileDataFrame 以**序列名**(一个字符串)作为序列的唯一标识。序列名由 **表名**、**各标签列的取值**、 +**字段名** 三部分按此顺序经 `.` 连接构成: + +```text +{表名}.{标签值1}.{标签值2}...{字段名} +``` + +`list_timeseries()` 返回的即为序列名;按名称索引(`df[...]`)与 `df.loc[...]` 中的序列选择均以序列名为参数。 + +示例: + +- `weather.Beijing.humidity` — 表 `weather`,标签 `Beijing`,字段 `humidity` +- `sensor.s1.pressure` — 表 `sensor`,标签 `s1`,字段 `pressure` + +> 序列名可由 
`list_timeseries()` 获取,无需手工构造;亦可改用整数索引(`df[0]`)或元数据过滤 +> (`df[df["city"] == "Beijing"]`)选择序列。 + +## 加载 + +路径可以是单个文件、文件列表或目录: + +```python +from tsfile import TsFileDataFrame + +df = TsFileDataFrame(["data/weather.tsfile", "data/sensor.tsfile"]) +df = TsFileDataFrame("data/") # 递归查找目录下所有 .tsfile +print(df) +``` + +初始化时只扫描元数据,不读取实际数值。加载多个文件时会并行扫描元数据。 + +如果多个文件包含 **同名序列**(如按日分片的 `weather.Beijing.humidity`),会自动合并为一条连续序列。 +对于重复时间戳仅保留第一条——这并非预期情况,请在预处理阶段去重,以免造成元数据失真。 + +### DataFrame 的展示 + +`print(df)`(以及 `df.show(max_rows=...)`)打印序列元信息,数据量大时头尾截断。表头为: + +```text +index │ table │ │ ... │ field │ start_time │ end_time │ count +``` + +对于标签数量不同的设备:标签值按左对齐,较短的在末尾补 `None`。 + +```text +TsFileDataFrame(table model, 972 time series, 5 files) + table ps_id sn frac field start_time end_time count + 0 pvf 10 30100194A00234H00572 1 pac 2024-04-02 00:00:00 2024-10-28 23:45:00 20160 + 1 pvf 10 30100194A00234H00572 1 tenmeterswindspeed 2024-04-02 00:00:00 2024-10-28 23:45:00 20160 +... +``` + +### 关闭 + +`with` 语句会自动释放文件句柄,也可以手动关闭: + +```python +with TsFileDataFrame("data/") as df: + ... 
# 退出后自动关闭 + +tsdf = TsFileDataFrame("data/") +tsdf.close() # 也可以自己关闭 +``` + +## 浏览序列 + +`list_timeseries(path_prefix="")` 列出已加载文件中的序列名,可按前缀筛选;不传参返回全部序列。 + +```python +>>> df.list_timeseries("weather") +['weather.Beijing.humidity', 'weather.Beijing.temperature', + 'weather.Shanghai.humidity', 'weather.Shanghai.temperature'] +>>> df.list_timeseries("weather.Beijing") +['weather.Beijing.humidity', 'weather.Beijing.temperature'] +``` + +若需查看起止时间、点数等元信息,可打印 DataFrame(或其子集)——见[DataFrame 的展示](#dataframe-的展示)。 + +## 选取序列 + +`df[...]` 返回懒加载的 `Timeseries` 句柄(不触发读取),或返回子集视图: + +```python +ts = df["weather.Beijing.humidity"] # 按名称 +ts = df[0] # 按索引(支持负索引) + +sub_df = df[0:3] # 切片 -> TsFileDataFrame(视图) +sub_df = df[[0, 2, 5]] # 整数列表 -> TsFileDataFrame(视图) +sub_df = df[df["city"] == "Beijing"] # 按元数据过滤 -> TsFileDataFrame(视图) +``` + +```text +>>> df["weather.Beijing.humidity"] +Timeseries('weather.Beijing.humidity', count=2880, start=2026-01-27 00:00:00, end=2026-02-05 23:55:00) +``` + +序列元信息从缓存读取(无 I/O): + +```python +>>> ts = df["weather.Beijing.humidity"] +>>> ts.name +'weather.Beijing.humidity' +>>> len(ts) +2880 +>>> ts.stats +{'start_time': 1769443200000, 'end_time': 1770306900000, 'count': 2880} +``` + +## 读取数据 + +对 `Timeseries` 按行号索引时才触发实际的文件读取: + +```python +val = ts[20] # -> float +window = ts[20:100] # -> np.ndarray, shape = (80,) +last_ten = ts[-10:] # -> np.ndarray +sampled = ts[::2] # -> np.ndarray(步长采样) +ts.timestamps[20:100] # -> 对应行号的时间戳, np.ndarray +``` + +```text +>>> ts[20] +46.1 +>>> ts[20:100] +array([46.1 , 41.72, 52.94, ..., 76.3 , 84.35]) +>>> ts.timestamps[20:100] +array([1769449200000, 1769449500000, ..., 1769472900000]) +``` + +## 多序列对齐查询 + +当需要多条序列在同一时间轴上严格对齐时,使用 `.loc`: + +```python +data = df.loc[start_time:end_time, [ + "weather.Beijing.humidity", + "weather.Beijing.temperature", + "sensor.s1.pressure", +]] +``` + +返回的 `AlignedTimeseries` 将所有序列对齐到时间戳的 **并集**,缺失位置填充 `NaN`: + +```python +data.timestamps # np.ndarray,毫秒时间戳 +data.values # np.ndarray, 
shape = (N, 3) +data.series_names # ["weather.Beijing.humidity", ...] +data.shape # (N, 3) +data[0:10] # 前 10 行, np.ndarray shape = (10, 3) +data.show(50) # 最多显示 50 行 +``` + +序列可按名称或索引指定,并可混用: + +```python +df.loc[start_time:end_time, [0, 1, 4]] +df.loc[start_time:end_time, [0, "weather.Beijing.temperature", 4]] +``` + +```text +>>> df.loc[1769616000000:1769702100000, +... ['weather.Beijing.temperature', 'weather.Beijing.humidity', 'sensor.s2.pressure']] +AlignedTimeseries(288 rows, 3 series) + timestamp weather.Beijing.temperature weather.Beijing.humidity sensor.s2.pressure +2026-01-29 00:00:00 29.12 92.87 NaN +2026-01-29 00:05:00 1.55 87.34 NaN +... +``` + +该美化视图仅展示值列;如需读取对齐后的时间戳列,请使用 `df.loc[...].timestamps`。 diff --git a/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md b/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md index fc22b5782..5c0e8a0e7 100644 --- a/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md +++ b/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-C.md @@ -32,12 +32,44 @@ typedef enum { TS_DATATYPE_FLOAT = 3, TS_DATATYPE_DOUBLE = 4, TS_DATATYPE_TEXT = 5, - TS_DATATYPE_STRING = 11 + TS_DATATYPE_TIMESTAMP = 8, + TS_DATATYPE_DATE = 9, + TS_DATATYPE_BLOB = 10, + TS_DATATYPE_STRING = 11, + TS_DATATYPE_INVALID = 255 } TSDataType; -typedef enum column_category { TAG = 0, FIELD = 1 } ColumnCategory; +// 值编码 +typedef enum { + TS_ENCODING_PLAIN = 0, + TS_ENCODING_DICTIONARY = 1, + TS_ENCODING_RLE = 2, + TS_ENCODING_TS_2DIFF = 4, + TS_ENCODING_GORILLA = 8, + TS_ENCODING_ZIGZAG = 9, + TS_ENCODING_SPRINTZ = 12, + TS_ENCODING_INVALID = 255 +} TSEncoding; + +// 压缩类型,默认值为 LZ4。 +typedef enum { + TS_COMPRESSION_UNCOMPRESSED = 0, + TS_COMPRESSION_SNAPPY = 1, + TS_COMPRESSION_GZIP = 2, + TS_COMPRESSION_LZO = 3, + TS_COMPRESSION_LZ4 = 7, + TS_COMPRESSION_INVALID = 255 +} CompressionType; + +typedef enum column_category { + TAG = 0, + FIELD = 1, + 
ATTRIBUTE = 2, + TIME = 3 +} ColumnCategory; // ColumnSchema:表示单个列的模式,包括列名、数据类型和分类。 +// 列的编码/压缩遵循全局默认值(见下文“配置”)。 typedef struct column_schema { char* column_name; TSDataType data_type; @@ -59,6 +91,8 @@ typedef struct result_set_meta_data { } ResultSetMetaData; ``` +> `ColumnSchema` 不携带编码/压缩——它们遵循全局默认值(见[配置](#配置编码与压缩))。 + ## 写入接口 ### 创建/关闭 TsFile 写入文件 @@ -250,6 +284,38 @@ ERRNO tsfile_writer_write(TsFileWriter writer, Tablet tablet); +## 配置(编码与压缩) + +列按其数据类型的 **全局默认** 编码与压缩存储(`ColumnSchema` 不携带编解码设置)。 +请在创建写入器 *之前* 用下列函数修改这些默认值。 + +每个 setter 成功返回 `RET_OK`(0),遇到不支持的数据类型/编码或压缩组合返回 `RET_NOT_SUPPORT`(40)。 + +```C +/* 按数据类型的默认值编码,以及默认压缩。 */ +int set_datatype_encoding(uint8_t data_type, uint8_t encoding); +int set_global_compression(uint8_t compression); +uint8_t get_datatype_encoding(uint8_t data_type); +uint8_t get_global_compression(); + +/* 时间列(时间数据类型固定为 INT64)。 */ +int set_global_time_encoding(uint8_t encoding); +int set_global_time_compression(uint8_t compression); +uint8_t get_global_time_encoding(); +uint8_t get_global_time_compression(); +``` + +允许的取值:编码方面,`BOOLEAN` 仅 `PLAIN`;`INT32`/`INT64`/`DATE` 为 +`PLAIN`/`TS_2DIFF`/`GORILLA`/`ZIGZAG`/`RLE`/`SPRINTZ`;`FLOAT`/`DOUBLE` 为 +`PLAIN`/`TS_2DIFF`/`GORILLA`/`SPRINTZ`;`STRING`/`TEXT` 为 `PLAIN`/`DICTIONARY`。 +压缩可取 `UNCOMPRESSED`、`SNAPPY`、`GZIP`、`LZO`、`LZ4`。 + +```C +// 例如:所有列均以 LZ4 压缩写入 +ERRNO code = set_global_compression(TS_COMPRESSION_LZ4); +if (code != RET_OK) { /* 处理不支持的取值 */ } +``` + ## 读取接口 ### TsFile Reader 创建/关闭 @@ -277,9 +343,10 @@ ERRNO tsfile_reader_close(TsFileReader reader); -### 查询表 / 获取下一行 / 按行查询 +### 查询表 / 获取下一行 ```C + /** * @brief 在指定时间范围内,从指定表和列中查询数据。 * @@ -312,45 +379,129 @@ bool tsfile_result_set_next(ResultSet result_set, ERRNO* error_code); * @param result_set [输入] 有效的 ResultSet 句柄指针。 */ void free_tsfile_result_set(ResultSet* result_set); +``` + + + +### 按标签过滤 + +**标签列(TAG)** 构成设备的唯一标识(联合主键)——正是它们的取值在一个表内 +区分不同的设备。*标签过滤器* 把查询限定到标签取值满足条件的设备,从而只读取你关心的设备数据。 +用 reader 构造一个过滤器,传给下文的表查询函数,用完再用 
`tsfile_tag_filter_free()` 释放。 + +```C +// 标签过滤器的不透明句柄,用下面的函数构造。 +typedef void* TagFilterHandle; + +// 单列标签谓词的比较运算符。 +typedef enum { + TAG_FILTER_EQ = 0, // 列 == 值 + TAG_FILTER_NEQ = 1, // 列 != 值 + TAG_FILTER_LT = 2, // 列 < 值 + TAG_FILTER_LTEQ = 3, // 列 <= 值 + TAG_FILTER_GT = 4, // 列 > 值 + TAG_FILTER_GTEQ = 5, // 列 >= 值 + TAG_FILTER_REGEXP = 6, // 列匹配正则 值 + TAG_FILTER_NOT_REGEXP = 7, // 列不匹配正则 值 +} TagFilterOp; /** - * @brief 按行查询时间序列数据(树模型),支持偏移量与行数限制 - * - * @param reader [in] 有效的 TsFileReader 句柄,通过 tsfile_reader_new() 获取 - * @param device_ids [in] 设备 ID 数组 - * @param device_ids_len [in] 设备 ID 的数量 - * @param measurement_names [in] 测量项(传感器)名称数组 - * @param measurement_names_len [in] 测量项名称的数量 - * @param offset [in] 需要跳过的起始行数(必须 >= 0) - * @param limit [in] 最多返回的行数,< 0 表示不限制 - * @param err_code [out] 错误码,成功返回 E_OK(0) - * @return 成功返回结果集 ResultSet 句柄,失败返回 NULL + * @brief 创建单列标签谓词:`<column> <op> <value>`。 + * + * @param reader [输入] 有效的 TsFileReader 句柄。 + * @param table_name [输入] 其 schema 定义了这些标签列的表名。 + * @param column_name [输入] 要过滤的标签列名。 + * @param value [输入] 比较值(标签列为 STRING 类型)。 + * @param op [输入] 比较运算符(TagFilterOp)。 + * @param err_code [输出] 成功返回 RET_OK(0),否则返回 errno_define_c.h 中的错误码。 + * @return 成功返回 TagFilterHandle,失败返回 NULL。 */ -ResultSet tsfile_reader_query_tree_by_row(TsFileReader reader, - char** device_ids, int device_ids_len, - char** measurement_names, - int measurement_names_len, int offset, - int limit, ERRNO* err_code); +TagFilterHandle tsfile_tag_filter_create(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value, TagFilterOp op, + ERRNO* err_code); /** - * @brief 按行查询表模型数据,支持偏移量与行数限制下推 - * - * @param reader [in] 有效的 TsFileReader 句柄,通过 tsfile_reader_new() 获取 - * @param table_name [in] 目标表名 - * @param column_names [in] 要查询的列名数组 - * @param column_names_len [in] 要查询的列数量 - * @param offset [in] 需要跳过的起始行数(必须 >= 0) - * @param limit [in] 最多返回的行数,< 0 表示不限制 - * @param err_code [out] 错误码,成功返回 E_OK(0) - * @return 成功返回结果集 ResultSet 句柄,失败返回 NULL + * 
@brief 创建范围谓词:lower <= 列 <= upper(is_not 为 true 表示 NOT BETWEEN)。 + */ +TagFilterHandle tsfile_tag_filter_between(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* lower, const char* upper, + bool is_not, ERRNO* err_code); + +// 组合谓词。AND/OR/NOT 会接管其子节点的所有权,只需释放根节点。 +TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left, TagFilterHandle right); +TagFilterHandle tsfile_tag_filter_or(TagFilterHandle left, TagFilterHandle right); +TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter); + +// 释放标签过滤器及其全部子节点。 +void tsfile_tag_filter_free(TagFilterHandle filter); +``` + +### 带标签过滤、分页与分批的表查询 + +下列查询函数接受一个可选的 `tag_filter`(传 `NULL` 表示不过滤)和 `batch_size` +(`<= 0` 逐行返回;`> 0` 按该大小返回数据块)。 + +```C +/** + * @brief 按行查询表,支持偏移量/行数限制下推与可选的标签过滤。 + * + * @param reader [输入] 有效的 TsFileReader 句柄。 + * @param table_name [输入] 目标表名。 + * @param column_names [输入] 要查询的列名数组。 + * @param column_names_len [输入] 要查询的列数量。 + * @param offset [输入] 需要跳过的起始行数(>= 0)。 + * @param limit [输入] 最多返回行数;< 0 表示不限制。 + * @param tag_filter [输入] 标签谓词,NULL 表示不过滤。 + * @param batch_size [输入] <= 0 逐行;> 0 数据块大小。 + * @param err_code [输出] 成功返回 RET_OK(0),否则返回错误码。 + * @return 成功返回 ResultSet 句柄,失败返回 NULL。用 free_tsfile_result_set() 释放。 */ ResultSet tsfile_reader_query_table_by_row( TsFileReader reader, const char* table_name, char** column_names, int column_names_len, int offset, int limit, TagFilterHandle tag_filter, int batch_size, ERRNO* err_code); + +/** + * @brief 在时间范围内查询表,支持可选的标签过滤与分批。 + * + * @param batch_size <= 0 逐行返回;> 0 返回该大小的 TsBlock。 + */ +ResultSet tsfile_query_table_batch(TsFileReader reader, const char* table_name, + char** columns, uint32_t column_num, + Timestamp start_time, Timestamp end_time, + TagFilterHandle tag_filter, int batch_size, + ERRNO* err_code); + +/** + * @brief 带标签过滤的表查询(时间范围 + 标签谓词)。 + * + * @param batch_size <= 0 逐行返回;> 0 返回该大小的 TsBlock。 + */ +ResultSet tsfile_query_table_with_tag_filter( + TsFileReader reader, const char* table_name, char** 
columns, + uint32_t column_num, Timestamp start_time, Timestamp end_time, + TagFilterHandle tag_filter, int batch_size, ERRNO* err_code); ``` +示例——只读取 `region` 标签等于 `shanghai` 的设备的 `temperature`: + +```C +ERRNO ec = RET_OK; +TagFilterHandle f = tsfile_tag_filter_create( + reader, "weather", "region", "shanghai", TAG_FILTER_EQ, &ec); +char* cols[] = {"temperature"}; +ResultSet rs = tsfile_reader_query_table_by_row( + reader, "weather", cols, 1, /*offset*/ 0, /*limit*/ -1, f, /*batch*/ 0, &ec); + +// ... 用 tsfile_result_set_next() 遍历 rs,然后释放: +free_tsfile_result_set(&rs); +tsfile_tag_filter_free(f); +``` ### 从结果集中获取数据 diff --git a/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md b/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md index b85c7900d..078a17cb8 100644 --- a/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md +++ b/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-CPP.md @@ -20,6 +20,65 @@ --> # 接口定义 - C++ +## 数据类型、编码与压缩 + +下列枚举为读写接口共用。其数值编码同时也是磁盘上存储的取值。 + +```cpp +// 支持的测点/列数据类型。 +enum TSDataType : uint8_t { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + FLOAT = 3, + DOUBLE = 4, + TEXT = 5, + TIMESTAMP = 8, + DATE = 9, + BLOB = 10, + STRING = 11, +}; + +// 值编码。各编码适用于哪些类型见下表。 +enum TSEncoding : uint8_t { + PLAIN = 0, + DICTIONARY = 1, + RLE = 2, + TS_2DIFF = 4, + GORILLA = 8, + ZIGZAG = 9, + SPRINTZ = 12, +}; + +// 压缩类型。SNAPPY/GZIP/LZO/LZ4 取决于构建选项;默认压缩为 LZ4。 +enum CompressionType : uint8_t { + UNCOMPRESSED = 0, + SNAPPY = 1, + GZIP = 2, + LZO = 3, + LZ4 = 7, +}; + +// 列在表 schema 内的角色。 +enum class ColumnCategory { TAG = 0, FIELD = 1, ATTRIBUTE = 2, TIME = 3 }; +``` + +各数据类型适用的编码: + +| 编码 | 适用类型 | +|---|---| +| `PLAIN` | 所有类型 | +| `DICTIONARY` | `TEXT`、`STRING` | +| `RLE` | `INT32`、`INT64`、`TIMESTAMP`、`DATE` | +| `TS_2DIFF` | `INT32`、`INT64`、`TIMESTAMP`、`DATE`、`FLOAT`、`DOUBLE` | +| `GORILLA` | `INT32`、`INT64`、`TIMESTAMP`、`DATE`、`FLOAT`、`DOUBLE` 
| +| `ZIGZAG` | `INT32`、`INT64` | +| `SPRINTZ` | `INT32`、`INT64`、`FLOAT`、`DOUBLE` | + +各类型的默认值编码:`BOOLEAN → PLAIN`、`INT32 / INT64 → TS_2DIFF`、 +`FLOAT / DOUBLE → GORILLA`、`TEXT / STRING / BLOB → PLAIN`。默认压缩为 `LZ4`。 +覆盖方式见[配置编码与压缩](#配置编码与压缩)。 + ## 写入接口 ### TsFileTableWriter @@ -27,89 +86,55 @@ 用于写入 TsFile. ```cpp -namespace storage { -class RestorableTsFileIOWriter; - /** - * @brief 支持按照指定表结构,将结构化表数据写入 TsFile 文件 + * @brief 用于将结构化表格数据写入具有指定模式的 TsFile。 * - * TsFileTableWriter 类用于将结构化数据(特别适用于时序数据) - * 写入专为高效存储与查询优化的 TsFile 文件。 - * 使用者可定义待写入表的结构,按照该结构添加数据行, - * 并将数据序列化为 TsFile。 - * 同时,该类提供写入过程中的内存使用限制能力。 + * TsFileTableWriter 类被设计用于写入结构化数据,特别适合时序数据, + * 数据将被写入一种为高效存储与检索优化的文件格式(即 TsFile)。该类允许用户定义 + * 所需写入表的模式,按照该模式添加数据行,并将这些数据序列化写入 TsFile。 + * 此外,还提供了在写入过程中限制内存使用的选项。 */ class TsFileTableWriter { public: /** - * TsFileTableWriter 用于根据指定的表结构,将表数据写入目标文件, - * 并可选择性地限制内存使用量。 + * TsFileTableWriter 用于将表格数据写入具有指定模式的目标文件, + * 可选地限制内存使用。 * - * @param writer_file 表数据的目标写入文件,不能为空指针 - * @param table_schema 用于构建表结构,定义待写入表的 schema - * @param memory_threshold 可选参数。当已写入数据量超过该阈值时, - * 数据将自动刷新到磁盘。默认值为 128MB + * @param writer_file 要写入表数据的目标文件。不能为空。 + * @param table_schema 用于构造表结构,定义正在写入表的模式。 + * @param memory_threshold 可选参数。当写入数据的大小超过该值时, + * 数据将自动刷新到磁盘。默认值为 128MB。 */ - template - explicit TsFileTableWriter(storage::WriteFile* writer_file, T* table_schema, - uint64_t memory_threshold = 128 * 1024 * 1024) { - static_assert(!std::is_same::value, - "table_schema cannot be nullptr"); - tsfile_writer_ = std::make_shared(); - tsfile_writer_->init(writer_file); - tsfile_writer_->set_generate_table_schema(false); - - // 执行深拷贝。源 TableSchema 对象可能分配在栈/堆上 - auto table_schema_ptr = std::make_shared(*table_schema); - error_number = tsfile_writer_->register_table(table_schema_ptr); - exclusive_table_name_ = table_schema->get_table_name(); - common::g_config_value_.chunk_group_size_threshold_ = memory_threshold; - } + TsFileTableWriter(WriteFile* writer_file, + TableSchema* table_schema, + uint64_t 
memory_threshold = 128 * 1024 * 1024); + ~TsFileTableWriter(); /** - * 通过可恢复的 TsFileIOWriter 构建 TsFileTableWriter, - * 支持在故障恢复后追加表数据。Schema 从已恢复的文件中读取, - * 无需额外传入 TableSchema。 + * 将给定的 Tablet 数据按照表的模式写入目标文件。 * - * @param restorable_writer 已恢复的 I/O 写入器;不能为空指针, - * 且必须以截断模式打开,保证 can_write() 返回 true - * @param memory_threshold 可选的缓存数据内存阈值 + * @param tablet 包含待写入数据的 Tablet。不能为空。 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ - explicit TsFileTableWriter( - storage::RestorableTsFileIOWriter* restorable_writer, - uint64_t memory_threshold = 128 * 1024 * 1024); + int write_table(const Tablet& tablet); /** - * 向写入器注册表结构 - * - * @param table_schema 待注册的表结构,不能为空指针 - * @return 成功返回 0,失败返回非零错误码 - */ - int register_table(const std::shared_ptr& table_schema); - /** - * 根据表结构,将指定的 Tablet 数据写入目标文件 + * 将所有缓冲数据刷新到底层存储介质,确保所有数据都已写出。 + * 此方法确保所有未完成的写入操作被持久化。 * - * @param tablet 包含待写入数据的 Tablet,不能为空指针 - * @return 成功返回 0,失败返回非零错误码 - */ - int write_table(Tablet& tablet) const; - /** - * 将所有缓存数据刷新到底层存储介质,确保所有数据都被持久化。 - * 该方法保证所有待写入数据都被落盘。 - * - * @return 成功返回 0,失败返回非零错误码 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ + int flush(); /** * 关闭写入器并释放其占用的所有资源。 - * 调用该方法后,不应对当前实例执行任何后续操作。 + * 调用此方法后,不应再对该实例执行任何操作。 * - * @return 成功返回 0,失败返回非零错误码 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ + int close(); }; - -} // namespace storage ``` ### TableSchema @@ -147,46 +172,41 @@ class TableSchema { struct ColumnSchema { std::string column_name_; common::TSDataType data_type_; + common::CompressionType compression_; + common::TSEncoding encoding_; ColumnCategory column_category_; - /** - * @brief 使用给定参数构造一个 ColumnSchema 对象。 + /** + * @brief 使用显式的压缩与编码构造 ColumnSchema。 * * @param column_name 列的名称,必须为非空字符串。 - * 此名称用于在表中标识该列。 - * @param data_type 该列的数据类型,例如 INT32、DOUBLE、TEXT 等。 - * 数据类型决定了数据的存储与解释方式。 - * @param column_category 列的类别,用于标识其在模式中的角色或类型, - * 例如 FIELD(字段)、TAG(标签)。 - * 如果未指定,默认为 ColumnCategory::FIELD。 - * @note 调用者有责任确保 `column_name` 非空。 + * @param data_type 
该列的数据类型(INT32、DOUBLE、TEXT 等)。 + * @param compression 该列 chunk 使用的压缩方式。 + * @param encoding 该列值使用的编码方式。 + * @param column_category 列的类别(FIELD、TAG 等),默认为 FIELD。 */ ColumnSchema(std::string column_name, common::TSDataType data_type, - ColumnCategory column_category = ColumnCategory::FIELD) : column_name_(std::move(column_name)), - data_type_(data_type), - column_category_(column_category) { - } -}; + common::CompressionType compression, common::TSEncoding encoding, + ColumnCategory column_category = ColumnCategory::FIELD); -/** - * @brief Represents the data type of a measurement. - * - * This enumeration defines the supported data types for measurements in the system. - */ -enum TSDataType : uint8_t { - BOOLEAN = 0, - INT32 = 1, - INT64 = 2, - FLOAT = 3, - DOUBLE = 4, - TEXT = 5, - STRING = 11 + /** + * @brief 使用引擎对该数据类型的默认编码与压缩构造 ColumnSchema。 + * + * @param column_name 列的名称,必须为非空字符串。 + * @param data_type 该列的数据类型。 + * @param column_category 列的类别,默认为 FIELD。 + */ + ColumnSchema(std::string column_name, common::TSDataType data_type, + ColumnCategory column_category = ColumnCategory::FIELD); }; - ``` +> `TAG` 列是设备的唯一标识(联合主键),数据类型固定为 `STRING`;`FIELD` 列存储测量值。 +> 在 `ColumnSchema` 上设置的编码与压缩会在写入时作用于该列;双参数构造函数则回退到按类型的默认值。 + ### Tablet + ```cpp /** * @brief 表示用于插入到表中的数据行集合及其相关元数据。 @@ -251,151 +271,110 @@ public: }; ``` -### RestorableTsFileIOWriter -> V2.3.1 - -```cpp -namespace storage { -/** - * RestorableTsFileIOWriter 用于打开 TsFile 并对其进行可选的恢复操作 - * 继承自 TsFileIOWriter,支持在文件恢复后继续写入 - * - * (1) 若 TsFile 正常关闭:has_crashed()=false,can_write()=false - * - * (2) 若 TsFile 不完整/程序崩溃:has_crashed()=true, - * can_write()=true,写入器会截断损坏数据并允许继续写入 - * - * 基于标准 C++11 实现,通过 RAII 和智能指针避免内存泄漏 - */ -class RestorableTsFileIOWriter : public TsFileIOWriter { - public: - RestorableTsFileIOWriter(); +### 配置编码与压缩 - /** - * 打开 TsFile 用于恢复/追加写入 - * 使用 O_RDWR|O_CREAT 模式,不使用 O_TRUNC,因此会保留文件原有内容 - * - * @param file_path TsFile 文件路径 - * @param truncate_corrupted 若为 true,则截断损坏的数据; - * 若为 false,则不截断(不完整文件保持原样) 
- * @return 成功返回 E_OK,失败返回错误码 - */ - int open(const std::string& file_path, bool truncate_corrupted = true); +编码与压缩 **按数据类型** 选取:每种类型都有默认值(见上表)。你可以修改这些默认值, +也可以在 schema 上传入显式的编码/压缩。 - /** - * 关闭文件 - */ - void close(); -}; +**1. 在 schema 上指定**:在构造 `ColumnSchema` 时传入显式的编码与压缩: -} // namespace storage +```cpp +// 将 "temperature" 列以 TS_2DIFF + LZ4 存储。 +common::ColumnSchema col("temperature", common::INT64, + common::LZ4, common::TS_2DIFF, + common::ColumnCategory::FIELD); ``` +**2. 按类型的默认值**:在创建写入器 *之前* 修改默认值;它们会作用于所有未在 schema 中 +指定自身编码/压缩的列。这些函数位于 `common`/`storage` 命名空间,会校验参数(不支持的组合返回 +`E_NOT_SUPPORT`): + +```cpp +// 按数据类型的默认值编码,以及默认压缩。 +int common::set_datatype_encoding(uint8_t data_type, uint8_t encoding); +int common::set_global_compression(uint8_t compression); +uint8_t common::get_datatype_encoding(uint8_t data_type); +uint8_t common::get_global_compression(); + +// 时间列的编码/压缩(数据类型固定为 INT64)。 +int common::set_global_time_encoding(uint8_t encoding); +int common::set_global_time_compression(uint8_t compression); +``` ## 读取接口 ### Tsfile Reader ```cpp /** - * @brief TsFileReader 提供查询所有后缀为 .tsfile 的文件的能力 + * @brief TsFileReader 提供了查询所有以 .tsfile 为后缀的文件的能力。 * - * TsFileReader 专为查询 .tsfile 文件设计,支持树模型查询和表模型查询, - * 同时支持查询表结构(TableSchema)、时间序列结构(TimeseriesSchema)等元数据。 + * TsFileReader 旨在用于查询 .tsfile 文件,支持表模型查询, + * 并支持查询元数据信息,如 TableSchema。 */ + class TsFileReader { public: TsFileReader(); + ~TsFileReader(); /** - * @brief 打开 tsfile 文件 + * @brief 打开 tsfile 文件。 * - * @param file_path 待打开的 tsfile 文件路径 - * @return 成功返回0,失败返回非零错误码 + * @param file_path 要打开的 tsfile 文件路径。 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ - int open(const std::string& file_path); + + int open(const std::string &file_path); /** - * @brief 关闭 tsfile 文件,该方法应在查询完成后调用 + * @brief 关闭 tsfile,查询完成后应调用此方法。 * - * @return 成功返回0,失败返回非零错误码 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ int close(); /** - * @brief 通过查询表达式查询 tsfile 文件,用户可自行构造查询表达式进行查询 + * @brief 通过查询表达式对 tsfile 进行查询,用户可以自行构造查询表达式来查询 
tsfile。 * - * @param [in] qe 查询表达式 - * @param [out] ret_qds 结果集 - * @return 成功返回0,失败返回非零错误码 + * @param [in] qe 查询表达式。 + * @param [out] ret_qds 查询结果集。 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ - int query(storage::QueryExpression* qe, ResultSet*& ret_qds); + int query(storage::QueryExpression *qe, ResultSet *&ret_qds); /** - * @brief 通过路径列表、起始时间和结束时间查询 tsfile 文件 - * 该方法用于树模型下的 tsfile 文件查询 + * @brief 通过表名、列名、起始时间和结束时间查询 tsfile。 * - * @param [in] path_list 路径列表 - * @param [in] start_time 起始时间 - * @param [in] end_time 结束时间 - * @param [out] result_set 结果集 + * @param [in] table_name 表名。 + * @param [in] columns_names 列名列表。 + * @param [in] start_time 起始时间。 + * @param [in] end_time 结束时间。 + * @param [out] result_set 查询结果集。 */ - int query(std::vector& path_list, int64_t start_time, - int64_t end_time, ResultSet*& result_set); + int query(const std::string &table_name, + const std::vector &columns_names, int64_t start_time, + int64_t end_time, ResultSet *&result_set); + /** - * @brief 通过表名、列名、起始时间和结束时间查询 tsfile 文件 - * 该方法用于表模型下的 tsfile 文件查询 + * @brief 通过表名、列名、开始时间、结束时间和标签过滤器查询 tsfile。 * * @param [in] table_name 表名 - * @param [in] columns_names 列名列表 - * @param [in] start_time 起始时间 + * @param [in] columns_names 列名 + * @param [in] start_time 开始时间 * @param [in] end_time 结束时间 + * @param [in] tag_filter 标签过滤器 * @param [out] result_set 结果集 - * @param [in] batch_size 小于等于0表示逐行返回模式, - * 大于0表示按指定大小返回TsBlock数据块 */ int query(const std::string& table_name, const std::vector& columns_names, int64_t start_time, - int64_t end_time, ResultSet*& result_set, int batch_size = -1); + int64_t end_time, ResultSet*& result_set, Filter* tag_filter); /** - * @brief 通过表名、列名、起始时间、结束时间和标签过滤条件查询 tsfile 文件 - * 该方法用于表模型下的 tsfile 文件查询 + * @brief 按行查询表,支持偏移量/行数限制下推与可选的标签过滤。 * * @param [in] table_name 表名 - * @param [in] columns_names 列名列表 - * @param [in] start_time 起始时间 - * @param [in] end_time 结束时间 - * @param [in] tag_filter 标签过滤条件 + * @param [in] column_names 列名 + * @param [in] offset 需要跳过的起始行数(>= 
0) + * @param [in] limit 最多返回行数;< 0 表示不限制 * @param [out] result_set 结果集 - */ - int query(const std::string& table_name, - const std::vector& columns_names, int64_t start_time, - int64_t end_time, ResultSet*& result_set, Filter* tag_filter, - int batch_size = 0); - - /** - * @brief 基于偏移量和限制条数,按行查询树模型时间序列数据 - * - * @param path_list 待查询的完整路径(设备.测量项) - * @param offset 需要跳过的起始行数(>=0) - * @param limit 最大返回行数,小于0表示无限制 - * @param[out] result_set 存储查询结果的结果集 - * @return 成功返回0,失败返回非零错误码 - */ - int queryByRow(std::vector& path_list, int offset, int limit, - ResultSet*& result_set); - - /** - * @brief 基于偏移量和限制条数下推,按行查询表模型数据 - * - * 对于密集型设备(所有列行数相同), - * 偏移量/限制条数会通过SSI下推至数据块/数据页级别, - * 无需解码即可跳过整个数据块/数据页。 - * 对于稀疏型设备,偏移量/限制条数在行合并阶段生效。 - * 当设备总行数处于偏移量范围内时,可直接跳过整个设备。 - * - * @param table_name 待查询的表名 - * @param column_names 待查询的列名 - * @param offset 需要跳过的起始行数(>=0) - * @param limit 最大返回行数,小于0表示无限制 - * @param[out] result_set 存储查询结果的结果集 - * @param tag_filter 可选的标签过滤条件,用于按标签列过滤数据 - * @return 成功返回0,失败返回非零错误码 + * @param [in] tag_filter 可选标签过滤器(用 TagFilterBuilder 构造),或 nullptr + * @param [in] batch_size <= 0 逐行返回;> 0 按该大小返回数据块 + * @return 成功时返回 0,失败时返回 errno_define.h 中的非零错误码。 */ int queryByRow(const std::string& table_name, const std::vector& column_names, int offset, @@ -403,110 +382,34 @@ class TsFileReader { Filter* tag_filter = nullptr, int batch_size = 0); /** - * @brief 在树模型上执行表查询 + * @brief 销毁结果集,该方法应在查询完成并使用完 result_set 后调用。 * - * @param measurement_names 测量项名称列表 - * @param star_time 起始时间 - * @param end_time 结束时间 - * @param result_set 结果集 + * @param qds 查询结果集。 */ - int query_table_on_tree(const std::vector& measurement_names, - int64_t star_time, int64_t end_time, - ResultSet*& result_set); + void destroy_query_data_set(ResultSet *qds); /** - * @brief 销毁结果集,该方法应在查询完成、使用完结果集后调用 + * @brief 根据表名获取表的模式信息。 * - * @param qds 结果集对象 - */ - void destroy_query_data_set(ResultSet* qds); - /** - * @brief 根据设备ID和测量项名称读取时间序列数据 - * - * @param device_id 设备ID - * @param measurement_name 测量项名称列表 - * 
@return 结果集对象 - */ - ResultSet* read_timeseries( - const std::shared_ptr& device_id, - const std::vector& measurement_name); - /** - * @brief 获取 tsfile 文件中的所有设备 - * - * @param table_name 表名 - * @return 设备ID列表 - */ - std::vector> get_all_devices( - std::string table_name); - - /** - * @brief 获取 tsfile 文件中的所有设备 - * - * @return 设备ID列表 - */ - std::vector> get_all_device_ids(); - - /** - * @brief 获取文件中的所有设备ID(与get_all_device_ids功能一致) - * - * @return 设备列表 - */ - std::vector> get_all_devices(); - - /** - * @brief 根据设备ID和测量项名称获取时间序列结构 - * - * @param [in] device_id 设备ID - * @param [out] result 测量项结构列表 - * @return 成功返回0,失败返回非零错误码 - */ - int get_timeseries_schema(std::shared_ptr device_id, - std::vector& result); - - /** - * @brief 获取指定设备的时间序列元数据 - * - * 仅文件中存在的设备会被包含在结果中 - * 若设备ID列表为空,返回空映射表 - * - * @param device_ids 待查询的设备列表 - * @return 映射关系:设备ID -> 时间序列元数据列表(仅包含存在的数据) - */ - DeviceTimeseriesMetadataMap get_timeseries_metadata( - const std::vector>& device_ids); - - /** - * @brief 获取文件中所有设备的时间序列元数据 - * - * @return 映射关系:设备ID -> 时间序列元数据列表 - */ - DeviceTimeseriesMetadataMap get_timeseries_metadata(); - - /** - * @brief 根据表名获取表结构 - * - * @param table_name 表名 - * @return 表结构智能指针 + * @param table_name 表名。 + * @return std::shared_ptr 表的模式信息。 */ std::shared_ptr get_table_schema( - const std::string& table_name); + const std::string &table_name); /** - * @brief 获取 tsfile 文件中的所有表结构 + * @brief 获取 tsfile 中所有表的模式信息。 * - * @return 表结构列表 + * @return std::vector> 表模式信息列表。 */ std::vector> get_all_table_schemas(); }; ``` - ### ResultSet ```cpp /** * @brief ResultSet 是 TsFileReader 的查询结果集,用于访问查询结果。 * * ResultSet 是一个虚类,使用时应转换为相应的实现类。 - * @note 当使用树模型且过滤器是全局时间过滤器时,应转换为 QDSWithoutTimeGenerator。 - * @note 当使用树模型但过滤器不是全局时间过滤器时,应转换为 QDSWithTimeGenerator。 - * @note 如果查询使用的是表模型,则应转换为 TableResultSet。 + * @note 结果集的具体类型为 TableResultSet。 */ class ResultSet { public: diff --git a/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md 
b/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md index 906ca0112..00aae7fb2 100644 --- a/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md +++ b/src/zh/UserGuide/latest/QuickStart/InterfaceDefinition/InterfaceDefinition-Python.md @@ -38,14 +38,39 @@ class TSDataType(IntEnum): BLOB = 10 STRING = 11 +class TSEncoding(IntEnum): + """ + 写入器支持的值编码。每个成员后的注释列出其可用于哪些数据类型。 + """ + PLAIN = 0 # 所有类型 + DICTIONARY = 1 # STRING、TEXT + RLE = 2 # INT32、INT64、TIMESTAMP、DATE + TS_2DIFF = 4 # INT32、INT64、TIMESTAMP、DATE、FLOAT、DOUBLE + GORILLA = 8 # INT32、INT64、TIMESTAMP、DATE、FLOAT、DOUBLE + ZIGZAG = 9 # INT32、INT64 + SPRINTZ = 12 # INT32、INT64、FLOAT、DOUBLE + +class Compressor(IntEnum): + """ + 写入器支持的压缩,默认值为 LZ4。 + """ + UNCOMPRESSED = 0 + SNAPPY = 1 + GZIP = 2 + LZO = 3 + LZ4 = 7 + class ColumnCategory(IntEnum): """ TsFile 中的列类别枚举。 - TAG:表示标签列,用于存储元数据。 - FIELD:表示测点列,用于存储实际数据值。 + TAG:标签列 + FIELD:测点列,存储测量值。 + ATTRIBUTE / TIME:保留的列角色。 """ TAG = 0 FIELD = 1 + ATTRIBUTE = 2 + TIME = 3 class ColumnSchema: """定义表中某一列的模式(名称、数据类型、类别)。""" @@ -92,9 +117,11 @@ class TsFileTableWriter: """ :param path: tsfile 文件路径,如果不存在则会创建。 :param table_schema: 描述要写入表的结构信息。 + :param memory_threshold: 触发自动刷盘前缓冲的字节数(默认 128MB)。 :return: 无返回值。 """ - def __init__(self, path: str, table_schema: TableSchema) + def __init__(self, path: str, table_schema: TableSchema, + memory_threshold: int = 128 * 1024 * 1024) """ 将一个 Tablet 写入 TsFile 中的表中。 @@ -103,12 +130,30 @@ class TsFileTableWriter: """ def write_table(self, tablet: Tablet) + """ + 将一个 pandas DataFrame 写入表中。列的编码/压缩遵循表 schema(或引擎默认值)。 + :param dataframe: 要写入的数据。 + :return: 无返回值。 + """ + def write_dataframe(self, dataframe: pandas.DataFrame) + + """ + 将缓冲数据刷新到磁盘。 + :return: 无返回值。 + """ + def flush(self) + """ 关闭 TsFileTableWriter,并自动刷新数据。 :return: 无返回值。 """ def close(self) + # 可作为上下文管理器使用: + # with TsFileTableWriter(path, schema) as w: + # w.write_table(tablet) + def __enter__(self) + def 
__exit__(self, exc_type, exc_val, exc_tb) ``` @@ -134,147 +179,116 @@ class Tablet(object) ``` -## 读取接口 - -### TsFileReader +### dataframe_to_tsfile ```python -class TsFileReader: +def dataframe_to_tsfile(dataframe: pd.DataFrame, + file_path: str, + table_name: Optional[str] = None, + time_column: Optional[str] = None, + tag_column: Optional[list[str]] = None) """ - 从 TsFile 中查询表格数据、时序数据,提供标准化的文件读取与查询接口, - 支持表模型查询、树模型查询、元数据获取、资源管控等全量核心能力。 + 将 pandas DataFrame 写入 TsFile。 + + :param dataframe: 要写入的数据。 + :param file_path: 目标 .tsfile 路径。 + :param table_name: 输出表名。 + :param time_column: 用作时间戳列的列名。 + :param tag_column: 作为 TAG 列处理的列名列表。 """ +``` - def __init__(self, pathname: str): - """ - 初始化指定路径的 TsFile 读取器,完成文件加载与底层读取器初始化, - 同时维护当前所有活跃的查询结果集,确保读取器关闭时同步失效所有结果集。 - :param pathname: 待读取的 TsFile 文件的完整路径 - :return: 无返回值 - """ +## 配置 - def query_table(self, table_name: str, column_names: List[str], - start_time: int = np.iinfo(np.int64).min, - end_time: int = np.iinfo(np.int64).max, - tag_filter: Optional[object] = None, - batch_size: int = 0) -> object: - """ - 对指定的表和列执行时间范围查询,支持标签过滤与批量读取模式。 - 可适配逐行返回与固定大小数据块返回两种模式,满足不同场景的读取需求。 - - :param table_name: 要查询的目标表名,不区分大小写 - :param column_names: 要检索的目标列名列表,为空时默认查询全列 - :param start_time: 查询范围的起始时间戳,默认值为 int64 类型最小值 - :param end_time: 查询范围的结束时间戳,默认值为 int64 类型最大值 - :param tag_filter: 可选参数,基于标签列的过滤条件,支持等值、范围、逻辑组合过滤 - :param batch_size: 批量读取大小,小于等于0时启用逐行返回模式,大于0时按指定大小返回数据块 - :return: 封装完成的查询结果集处理器,可用于遍历、读取数据、获取元数据 - """ +全局写入默认值——包括各类型的默认编码、默认压缩、时间列编码/压缩——以一个字典暴露。 +请在创建写入器 **之前** 修改它们。 - def query_table_on_tree(self, column_names: List[str], - start_time: int = np.iinfo(np.int64).min, - end_time: int = np.iinfo(np.int64).max) -> object: - """ - 在树模型结构上执行表查询,适配原生树结构时序数据的查询场景, - 直接基于测量项名称查询,无需指定表名,路径名称区分大小写。 +```python +from tsfile import get_tsfile_config, set_tsfile_config +from tsfile import TSEncoding, Compressor - :param column_names: 待查询的测量项名称列表,对应树结构中的节点路径 - :param start_time: 查询范围的起始时间戳,默认值为 int64 类型最小值 - :param end_time: 
查询范围的结束时间戳,默认值为 int64 类型最大值 - :return: 树模型查询对应的结果集处理器 - """ +cfg = get_tsfile_config() # -> 包含所有配置项的 dict +# 例如 cfg["default_compression_type_"]、cfg["int64_encoding_type_"]、 +# cfg["time_encoding_type_"]、cfg["time_compress_type_"] 等。 - def query_tree_by_row(self, device_ids: List[str], measurement_names: List[str], - offset: int = 0, limit: int = -1) -> object: - """ - 按行分页查询树模型时序数据,支持偏移量跳过、最大返回行数限制, - 适配大数据量分页读取场景,避免单次加载过多数据导致内存溢出。 - - :param device_ids: 待查询的设备ID列表,不能为空 - :param measurement_names: 待查询的测量项名称列表,不能为空 - :param offset: 需要跳过的起始行数,默认从0开始 - :param limit: 最大返回行数,小于0表示不限制返回行数 - :return: 树模型分页查询的结果集处理器 - """ +set_tsfile_config({ + "default_compression_type_": Compressor.LZ4, + "int64_encoding_type_": TSEncoding.TS_2DIFF, +}) +``` - def query_table_by_row(self, table_name: str, column_names: List[str], - offset: int = 0, limit: int = -1, - tag_filter: Optional[object] = None, - batch_size: int = 0) -> object: - """ - 按行分页查询表模型数据,支持偏移量与行数限制下推,可结合标签过滤使用, - 密集型设备可在数据块级别跳过无效数据,大幅提升分页查询效率。 - - :param table_name: 待查询的目标表名 - :param column_names: 待查询的列名列表 - :param offset: 需要跳过的起始行数,默认从0开始 - :param limit: 最大返回行数,小于0表示不限制返回行数 - :param tag_filter: 可选参数,标签过滤条件,过滤符合条件的设备数据 - :param batch_size: 批量读取大小,适配底层数据块读取逻辑 - :return: 表模型分页查询的结果集处理器 - """ +`set_tsfile_config` 会校验每个取值,且只更新你传入的键。编码/压缩取值为 `TSEncoding` / `Compressor` +成员;类型与编码的适配限制同 C++ 接口。 - def query_timeseries(self, device_name: str, sensor_list: List[str], - start_time: int = 0, end_time: int = 0) -> object: - """ - 针对单个指定设备,执行时间范围时序数据查询, - 适配单设备多传感器的精准查询场景,简化查询调用逻辑。 - - :param device_name: 目标设备的名称/路径 - :param sensor_list: 待查询的传感器(测量项)名称列表 - :param start_time: 查询起始时间戳,为0时默认从文件最早时间开始 - :param end_time: 查询结束时间戳,为0时默认到文件最晚时间结束 - :return: 单设备时序查询的结果集处理器 - """ +## 读取接口 - def get_table_schema(self, table_name: str) -> object: - """ - 获取指定表的完整模式信息,包含列名、数据类型、标签列、时序约束等全量元数据, - 用于提前校验查询字段合法性、解析数据结构。 +### TsFileReader - :param table_name: 目标表名 - :return: 对应表的模式信息对象,包含表结构全量配置 - """ +```python +class TsFileReader: + """ + 从 
TsFile 中查询表格数据。 + """ - def get_all_table_schemas(self) -> Dict[str, object]: - """ - 获取当前 TsFile 文件中所有表的模式信息, - 一键遍历文件内全部数据表结构,无需逐个表查询。 + """ + 初始化指定路径的 TsFile 读取器。 + :param pathname: TsFile 文件的路径。 + :return: 无返回值。 + """ + def __init__(self, pathname) - :return: 字典结构,key为表名,value为对应表的模式信息对象 - """ + """ + 对指定的表和列执行时间范围查询。 - def get_all_timeseries_schemas(self) -> List[object]: - """ - 获取 TsFile 内所有时序序列的模式信息, - 覆盖树模型、表模型全量时序数据的字段、类型、约束信息。 + :param table_name: 要查询的表名。 + :param column_names: 要检索的列名列表。 + :param start_time: 查询范围的起始时间(默认:int64 最小值)。 + :param end_time: 查询范围的结束时间(默认:int64 最大值)。 + :return: 查询结果集处理器。 + """ + def query_table(self, table_name : str, column_names : List[str], + start_time : int = np.iinfo(np.int64).min, + end_time: int = np.iinfo(np.int64).max) -> ResultSet - :return: 所有时序模式信息组成的列表 - """ + """ + 按行查询表,支持偏移量/行数限制下推与可选的标签过滤。标签谓词把查询限定到 + 标签列取值满足条件的设备。用 tsfile.tag_filter 中的辅助函数构造过滤器 + (tag_eq、tag_neq、tag_lt、tag_lteq、tag_gt、tag_gteq、tag_between 等), + 并用 &、| 和 ~ 组合。 - def get_all_devices(self) -> List[str]: - """ - 获取 TsFile 文件内所有设备的标识信息, - 可遍历文件内全部设备,适配全设备统计、批量查询前置操作。 + :param table_name: 要查询的表名。 + :param column_names: 要检索的列名列表。 + :param offset: 需要跳过的起始行数(默认 0)。 + :param limit: 最多返回的行数;< 0 表示不限制。 + :param tag_filter: 可选的标签谓词(TagFilter),None 表示不过滤。 + :param batch_size: <= 0 逐行返回;> 0 按该大小返回数据块。 + :return: 查询结果集处理器。 + """ + def query_table_by_row(self, table_name : str, column_names : List[str], + offset : int = 0, limit : int = -1, + tag_filter = None, batch_size : int = 0) -> ResultSet - :return: 所有设备ID/设备路径组成的列表 - """ + """ + 获取指定表的模式信息。 - def get_timeseries_metadata(self, device_ids: Optional[List[str]] = None) -> Dict[str, object]: - """ - 获取指定设备的时序元数据,包含数据存储分段、字段约束、数据范围等信息, - 不传设备ID时默认返回全设备元数据,传入空列表返回空字典。 + :param table_name: 表名。 + :return: 指定表的模式信息。 + """ + def get_table_schema(self, table_name : str) -> TableSchema + + """ + 获取 TsFile 中所有表的模式信息。 + + :return: 一个将表名映射到其模式的字典。 + """ + def get_all_table_schemas(self) -> dict[str, TableSchema] + + 
""" + 关闭 TsFile 读取器。如果读取器中有活动的结果集,它们将失效。 + """ + def close(self) - :param device_ids: 可选参数,待查询元数据的设备ID列表 - :return: 字典结构,key为设备路径,value为对应设备的时序元数据组 - """ - def close(self) -> None: - """ - 关闭 TsFile 读取器,释放底层文件句柄、内存资源, - 同时将当前所有活跃的查询结果集标记为失效,禁止后续数据读取操作。 - 关闭后不可再次执行查询、元数据获取操作,需重新初始化读取器。 - """ ``` ### ResultSet @@ -371,7 +385,6 @@ def to_dataframe(file_path: str, 从 TsFile 中读取数据,并将其转换为 Pandas DataFrame 或 DataFrame 迭代器。 - 该函数同时支持表模型(table-model)和树模型(tree-model)的 TsFile。 用户可以通过表名、列名、时间范围以及最大行数对数据进行过滤。 Parameters diff --git a/src/zh/UserGuide/latest/Tools/Tsfile-CLI.md b/src/zh/UserGuide/latest/Tools/Tsfile-CLI.md new file mode 100644 index 000000000..f08f24e2b --- /dev/null +++ b/src/zh/UserGuide/latest/Tools/Tsfile-CLI.md @@ -0,0 +1,165 @@ + +# tsfile-cli + +`tsfile-cli` 是一个单一、对管道友好的 C++ 命令行工具,用于在 shell 中检视 **并** 导入 Apache +TsFile(`.tsfile`)文件。读取类命令将数据打印到 +**stdout**、诊断信息打印到 **stderr**,因此可与 `awk`、`jq`、`sort` 等组合使用;`write` 命令 +将 CSV/TSV 导入为新的 `.tsfile`。它构建于公开的 `TsFileReader` 与 `TsFileTableWriter` 接口之上。 + +## 从源码构建 + +CLI 是 C++ 模块的一部分。用 Maven 包装器构建即可——它会下载固定版本的 CMake,为你编译整个 +C++ 模块(`libtsfile` 共享库 + `tsfile-cli` 可执行文件)。 + +**前置条件**:用于运行 Maven 的 JDK(8+),以及 C++11 编译器(GCC / Clang)。第三方 C++ 依赖 +(Snappy、LZ4、LZOKAY、Zlib 等)已捆绑在 `cpp/third_party/` 下并自动构建。 + +在仓库根目录执行: + +```bash +./mvnw clean package -P with-cpp +``` + +会在 `cpp/target/build/` 下生成: + +| 产物 | 路径 | +|---|---| +| CLI 可执行文件 | `cpp/target/build/bin/tsfile-cli` | +| 共享库 | `cpp/target/build/lib/libtsfile.so`(Linux)——macOS 为 `libtsfile.dylib` | + +`tsfile-cli` 动态链接 `libtsfile`。用完整路径 **就地** 运行即可自动找到该库: + +```bash +cpp/target/build/bin/tsfile-cli --version # -> tsfile-cli (Apache TsFile C++) +cpp/target/build/bin/tsfile-cli --help +``` + +若要在 **别处** 运行该二进制(例如把它从构建目录拷出来后),动态加载器必须能找到 `libtsfile.so`。 +要么让加载器指向构建目录下的 `lib/`,要么把该库拷到标准位置: + +```bash +# 让加载器指向构建目录的 lib(Linux;macOS 用 DYLD_LIBRARY_PATH) +export LD_LIBRARY_PATH=/path/to/cpp/target/build/lib:$LD_LIBRARY_PATH + +# —— 或者 —— 把库拷到系统库路径 +sudo cp 
cpp/target/build/lib/libtsfile.so /usr/local/lib/ && sudo ldconfig +``` + +## 使用方式 + +```text +tsfile-cli <command> [options] <file> +tsfile-cli --help | --version | help +``` + +退出码:`0` 成功,`1` 用法/参数错误,`2` 文件打开/损坏,`3` 查询/运行时错误。 + +### 读取 + +| 命令 | 说明 | +|---|---| +| `ls` | 列出设备(树模型)或表(表模型),每行一个名称 | +| `schema` | 每序列的 `target, measurement, datatype, encoding, compression` | +| `meta` | 文件概要:模型、设备/表/序列数、时间范围、文件大小 | +| `stats` | 每序列的 `count, start_time, end_time, min, max, first, last, sum` | +| `count` | 每序列行数及一行 `total`(来自统计信息,不扫描 page) | +| `head` | 前 N 行(默认 10;用 `-n`) | +| `cat` | 所有匹配行,流式输出(`table` 格式会缓冲以对齐列) | +| `sample` | 可复现的蓄水池抽样(默认 10;`-n`、`--seed`) | + +元数据类命令(`ls` / `schema` / `meta` / `stats` / `count`)无需解码数据即可回答大多数问题。 + +通用选项: + +| 选项 | 含义 | +|---|---| +| `-f, --format csv\|tsv\|json\|table` | 输出格式;TTY 下默认 `table`,管道下默认 `tsv` | +| `-d, --device <name>` / `-t, --table <name>` | 限定到一个设备 / 表(互斥) | +| `-m, --measurements a,b,c` | 列投影(`schema`、`stats`、`count`、`head`、`cat`、`sample`) | +| `-n, --limit N` / `--offset N` | 最大行数 / 跳过行数(`head`、`cat`;`--offset` 不适用于 `sample`) | +| `--start <ms>` / `--end <ms>` | 闭区间的毫秒时间范围(`head`、`cat`、`sample`) | +| `--seed N` | 可复现抽样种子(仅 `sample`) | +| `--tag-filter C OP V` / `--tag-between C L U` / `--tag-not-between C L U` | `head`、`cat`、`sample` 的表标签谓词;`OP` 为 `eq`、`neq`、`lt`、`lteq`、`gt`、`gteq`、`regexp`、`not-regexp` | +| `--no-header` | 不输出表头行 | +| `--model tree\|table` | 强制指定模型(否则自动检测) | + +`json` 输出为 NDJSON(每行一个对象;数字/布尔裸输出,其他值加引号,空值为 `null`;非有限浮点数 +——NaN/Inf——变为 `null`)。CSV 输出遵循 RFC 4180。时间戳为原始毫秒时间戳。`table` 格式会在内存中 +缓冲所有行以对齐列,因此导出大文件时优先用 `csv`/`tsv`/`json`。 + +```bash +BIN=cpp/target/build/bin/tsfile-cli +$BIN ls -f tsv data.tsfile # 列出表 / 设备 +$BIN meta data.tsfile # 快速文件概览 +$BIN count -t table1 -f tsv data.tsfile # 行数,不扫描 page +$BIN cat -t table1 --tag-filter device eq dev_1 -m temp -f tsv data.tsfile +$BIN cat -m temp,humidity --start 1700000000000 -f csv data.tsfile | head +$BIN sample -m temp -n 20 --seed 42 -f json data.tsfile | jq . 
+``` + +> 对于表模型文件,行命令(`head` / `cat` / `sample`)在不指定 `-t <table>` 时只查询 **第一个** 表;
+> `count` 覆盖所有表。 + +### 写入(导入) + +`tsfile-cli write` 将 CSV/TSV 行导入为一个 **新的表模型** `.tsfile`(输出会被覆盖)。输入的第一列是 +时间戳(毫秒);其余列通过 `--columns` 显式声明——不做类型推断。 + +时间戳必须 **按设备严格递增**,设备由其 `tag` 列取值标识(共享相同标签的行构成同一设备的时间线)。 +不同标签组合的行可自由交错并复用时间戳。乱序输入会被拒绝并报出错行号,导入失败不会留下输出文件。 +`--output` 必须与输入文件不同。 + +```text +tsfile-cli write --table <name> --columns <spec> -o <output> \ + [-f csv|tsv] [--no-header] [--header-match] [-v] [<input> | -] +``` + +`--columns` 是逗号分隔的 `name:TYPE:category` 列表,其中 `category`(不区分大小写)为 `tag` 或 +`field`,`TYPE`(不区分大小写)为 `BOOLEAN, INT32, INT64, FLOAT, DOUBLE, STRING, TEXT, TIMESTAMP, +DATE, BLOB` 之一——例如 `--columns "id1:STRING:tag,s1:INT64:field"`。`DATE` 单元格写作 +`YYYY-MM-DD`,`TIMESTAMP` 单元格为毫秒。每列按其类型的引擎默认编码与压缩存储。 + +| 选项 | 含义 | +|---|---| +| `--table <name>` | 输出表名(会转小写) | +| `--columns <spec>` | 有序数据列(不含开头的时间戳列) | +| `-o, --output <output>` | 输出 `.tsfile`(必填;会被覆盖) | +| `<input>` / `-` | 输入文件,或 `-` / 省略表示 stdin | +| `-f csv\|tsv` | 输入分隔符(默认 csv;`json` / `table` 被拒绝) | +| `--no-header` | 输入无表头行(默认首行为表头并跳过) | +| `--header-match` | 校验表头名是否与 `--columns` 一致 | +| `-v, --verbose` | 向 stderr 打印 `wrote N rows to <output>`(否则成功时静默) | + +空单元格写为 null。成功时静默(Unix 风格),用 `-v` 打印一行摘要。 + +```bash +# 通过管道往返 +printf 'time,id1,s1\n0,dev,0\n1,dev,10\n' \ + | tsfile-cli write --table t1 --columns "id1:STRING:tag,s1:INT64:field" -o out.tsfile - +tsfile-cli count -f tsv out.tsfile # -> t1.dev s1 2 +``` + +## 与 AI 助手配合使用 skill + +`cpp/tools/skills/tsfile-cli/SKILL.md` 是一份机器可读的参考,描述了如何 +正确驱动 `tsfile-cli`。支持 skill 的 AI 编码助手可以加载它,从而辅助检视与导入 `.tsfile` +文件。 diff --git a/src/zh/UserGuide/latest/Tools/Tsfile-Viewer.md b/src/zh/UserGuide/latest/Tools/Tsfile-Viewer.md new file mode 100644 index 000000000..f613bc84a --- /dev/null +++ b/src/zh/UserGuide/latest/Tools/Tsfile-Viewer.md @@ -0,0 +1,89 @@ + +# tsfile-viewer + +[Apache TsFile Viewer](https://github.com/apache/tsfile-viewer) 是一个基于 Web 的应用, +用于在浏览器中浏览与分析 TsFile 数据。它由一个 Spring Boot 后端(通过 Apache TsFile 库读取 +`.tsfile` 文件)与一个 Vue 3 前端(渲染元数据、分页表格与交互式图表)组成。 + +- **仓库**: <https://github.com/apache/tsfile-viewer> +- **许可证**:Apache-2.0 + +## 
功能 + +- **文件浏览与上传**——在界面中打开 `.tsfile` 文件。 +- **元数据展示**——schema、设备与测点。 +- **分页数据表格**,支持按时间范围、设备、测点、数值范围过滤。 +- **交互式图表**(ECharts),支持多序列叠加与聚合。 +- **支持两种数据模型**——树模型与表模型 TsFile。 +- **导出**——数据导出为 CSV 或 JSON;图表导出为 PNG 或 SVG。 +- **性能**——chunk 级读取与元数据缓存。 + +## 环境要求 + +| 组件 | 版本 | +|---|---| +| JDK | 17 或 21(LTS) | +| Maven | 3.9+ | +| Node.js | `^20.19.0 \|\| >=22.12.0` | +| pnpm | 最新版 | +| Apache TsFile | 2.3.0(捆绑依赖) | + +## 获取源码 + +克隆仓库,然后按下文构建并运行: + +```bash +git clone https://github.com/apache/tsfile-viewer.git +cd tsfile-viewer +``` + +## 从源码运行(开发模式) + +在两个终端分别运行后端与前端。 + +**后端**(Spring Boot): + +```bash +cd backend +mvn spring-boot:run +``` + +**前端**(Vue + Vite 开发服务器): + +```bash +cd frontend +pnpm install +pnpm dev +``` + +然后打开开发界面 。 + +## 构建并运行生产包 + +构建一个自包含的发行包,再启动打好的 jar(前端由后端提供服务): + +```bash +./build-dist.sh +java -jar backend/target/tsfile-viewer-*.jar +``` + +打开应用