From 14efc9029fdbb1008dee60b3e6ff6a3650f94abd Mon Sep 17 00:00:00 2001 From: jaogoy Date: Mon, 12 Jan 2026 15:17:44 +0800 Subject: [PATCH 01/20] Feat: Add StarRocks engine support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - **Add StarRocks engine support to SQLMesh** via StarRocks’ MySQL-compatible protocol. - Ship **engine adapter + docs + real integration tests** to ensure generated SQL works on StarRocks. - **User demand / adoption**: StarRocks is a common OLAP choice; SQLMesh users want to run the same model lifecycle (build, incremental maintenance, views/MVs) on StarRocks without bespoke SQL. - **Engine-specific semantics**: StarRocks differs from vanilla MySQL in DDL/DML constraints (e.g., key types, delete behavior, rename caveats). An adapter is needed to produce correct and predictable SQL. - **Confidence & maintainability**: Documenting config patterns + codifying behavior with integration tests prevents regressions and makes support “real” (not just “it parses”). - **Connectivity**: Connect through MySQL protocol (e.g., `pymysql`). 
- **Table creation / DDL**: - Key table types via `physical_properties`: **DUPLICATE KEY (default)**, **PRIMARY KEY (recommended for incremental)**, **UNIQUE KEY** - **Partitioning**: simple `partitioned_by` and advanced `partition_by` (complex expression partitioning) + optional initial `partitions` - **Distribution**: `distributed_by` structured form or string fallback (HASH / RANDOM; buckets required) - **Ordering**: `order_by` / `clustered_by` - **Generic PROPERTIES passthrough** (string key/value) - **Views**: - Regular views - **Materialized views** via `kind VIEW(materialized true)` with StarRocks-specific notes/constraints - **DML / maintenance**: - Insert/select/update basics - Delete behavior handled with StarRocks compatibility constraints (PRIMARY KEY tables recommended for robust deletes) - **Engine adapter**: `sqlmesh/core/engine_adapter/starrocks.py` - **Docs**: `docs/integrations/engines/starrocks.md` - **Integration tests**: `tests/core/engine_adapter/integration/test_integration_starrocks.py`, and `tests/core/engine_adapter/test_starrocks.py` - **Integration tests require a running StarRocks** instance. - Ran: - set `STARROCKS_HOST/PORT/USER/PASSWORD` - `pytest -m "starrocks and docker" tests/core/engine_adapter/integration/test_integration_starrocks.py` - **No sync MV support (currently)** - **No tuple IN**: `(c1, c2) IN ((v1, v2), ...)` - **No `SELECT ... FOR UPDATE`** - **RENAME caveat**: rename target can’t be qualified with a database name - **Changes are StarRocks-scoped** (adapter/docs/tests) and should not impact other engines. 
Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- .github/scripts/wait-for-db.sh | 28 + .readthedocs.yaml | 2 +- Makefile | 3 + docs/guides/configuration.md | 2 + docs/guides/connections.md | 1 + docs/integrations/engines/starrocks.md | 487 +++ docs/integrations/overview.md | 1 + mkdocs.yml | 1 + pyproject.toml | 2 + sqlmesh/core/config/__init__.py | 1 + sqlmesh/core/config/connection.py | 75 + sqlmesh/core/engine_adapter/__init__.py | 2 + sqlmesh/core/engine_adapter/starrocks.py | 3570 +++++++++++++++++ sqlmesh/core/snapshot/evaluator.py | 64 +- .../engine_adapter/integration/__init__.py | 11 +- .../engine_adapter/integration/config.yaml | 10 + .../integration/docker/compose.starrocks.yaml | 27 + .../integration/test_integration.py | 5 + .../integration/test_integration_starrocks.py | 2400 +++++++++++ tests/core/engine_adapter/test_starrocks.py | 1823 +++++++++ tests/core/test_connection_config.py | 57 + 21 files changed, 8565 insertions(+), 7 deletions(-) create mode 100644 docs/integrations/engines/starrocks.md create mode 100644 sqlmesh/core/engine_adapter/starrocks.py create mode 100644 tests/core/engine_adapter/integration/docker/compose.starrocks.yaml create mode 100644 tests/core/engine_adapter/integration/test_integration_starrocks.py create mode 100644 tests/core/engine_adapter/test_starrocks.py diff --git a/.github/scripts/wait-for-db.sh b/.github/scripts/wait-for-db.sh index 07502e3898..85f30dc7d7 100755 --- a/.github/scripts/wait-for-db.sh +++ b/.github/scripts/wait-for-db.sh @@ -50,6 +50,34 @@ spark_ready() { probe_port 15002 } +starrocks_ready() { + probe_port 9030 + + echo "Checking for 1 alive StarRocks backends..." + sleep 5 + + while true; do + echo "Checking StarRocks backends..." + ALIVE_BACKENDS=$(docker exec -i starrocks-fe mysql -h127.0.0.1 -P9030 -uroot -e "show backends \G" | grep -c "^ *Alive: true *$") + + # fallback value if failed to get number + if ! 
[[ "$ALIVE_BACKENDS" =~ ^[0-9]+$ ]]; then + echo "WARN: Unable to parse number of alive backends, got: '$ALIVE_BACKENDS'" + ALIVE_BACKENDS=0 + fi + + echo "Found $ALIVE_BACKENDS alive backends" + + if [ "$ALIVE_BACKENDS" -ge 1 ]; then + echo "StarRocks has 1 or more alive backends" + break + fi + + echo "Waiting for more backends to become alive..." + sleep 5 + done +} + trino_ready() { # Trino has a built-in healthcheck script, just call that docker compose -f tests/core/engine_adapter/integration/docker/compose.trino.yaml exec trino /bin/bash -c '/usr/lib/trino/bin/health-check' diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 68d856c589..ee4794538f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,7 +6,7 @@ build: python: "3.10" jobs: pre_build: - - pip install -e ".[athena,azuresql,bigframes,bigquery,clickhouse,databricks,dbt,dlt,gcppostgres,github,llm,mssql,mysql,mwaa,postgres,redshift,slack,snowflake,trino,web,risingwave]" + - pip install -e ".[athena,azuresql,bigframes,bigquery,clickhouse,databricks,dbt,dlt,gcppostgres,github,llm,mssql,mysql,mwaa,postgres,redshift,slack,snowflake,starrocks,trino,web,risingwave]" - make api-docs mkdocs: diff --git a/Makefile b/Makefile index 843beb0624..3892731352 100644 --- a/Makefile +++ b/Makefile @@ -212,6 +212,9 @@ trino-test: engine-trino-up risingwave-test: engine-risingwave-up pytest -n auto -m "risingwave" --reruns 3 --junitxml=test-results/junit-risingwave.xml +starrocks-test: engine-starrocks-up + pytest -n auto -m "starrocks" --reruns 3 --junitxml=test-results/junit-starrocks.xml + ################# # Cloud Engines # ################# diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md index 2f5f1f6e53..fd8e8dd8cd 100644 --- a/docs/guides/configuration.md +++ b/docs/guides/configuration.md @@ -920,6 +920,7 @@ These pages describe the connection configuration options for each execution eng * [GCP Postgres](../integrations/engines/gcp-postgres.md) * 
[Redshift](../integrations/engines/redshift.md) * [Snowflake](../integrations/engines/snowflake.md) +* [StarRocks](../integrations/engines/starrocks.md) * [Spark](../integrations/engines/spark.md) * [Trino](../integrations/engines/trino.md) @@ -952,6 +953,7 @@ Unsupported state engines, even for development: * [ClickHouse](../integrations/engines/clickhouse.md) * [Spark](../integrations/engines/spark.md) +* [StarRocks](../integrations/engines/starrocks.md) * [Trino](../integrations/engines/trino.md) This example gateway configuration uses Snowflake for the data warehouse connection and Postgres for the state backend connection: diff --git a/docs/guides/connections.md b/docs/guides/connections.md index e0dca0f7a4..bc763f3f5a 100644 --- a/docs/guides/connections.md +++ b/docs/guides/connections.md @@ -90,4 +90,5 @@ default_gateway: local_db * [Redshift](../integrations/engines/redshift.md) * [Snowflake](../integrations/engines/snowflake.md) * [Spark](../integrations/engines/spark.md) +* [StarRocks](../integrations/engines/starrocks.md) * [Trino](../integrations/engines/trino.md) diff --git a/docs/integrations/engines/starrocks.md b/docs/integrations/engines/starrocks.md new file mode 100644 index 0000000000..2749fb9d9a --- /dev/null +++ b/docs/integrations/engines/starrocks.md @@ -0,0 +1,487 @@ +# StarRocks + +## Overview + +[StarRocks](https://www.starrocks.io/) is a next-generation sub-second MPP OLAP database designed for real-time analytics. It provides high concurrency, low latency, and supports both batch and stream processing. + +SQLMesh supports StarRocks through its MySQL-compatible protocol, providing StarRocks-specific optimizations for table models, indexing, partitioning, and more. The adapter leverages StarRocks's strengths for analytical workloads with sensible defaults and advanced configuration support. 
+ +## Connection Configuration Example + +```yaml +starrocks: + connection: + type: starrocks + host: starrocks-fe # Frontend (FE) node address + port: 9030 # Query port (default: 9030) + user: starrocks_user + password: your_password + database: your_database + # Optional MySQL-compatible settings +``` + +## Quickstart + +### 1) A minimal table (DUPLICATE KEY default) + +```sql +MODEL ( + name user_events, + kind FULL, + physical_properties ( + distributed_by = RANDOM + ) +); + +SELECT + user_id, + event_time, + event_type +FROM source.user_events; +``` + +A `DUPLICATE KEY` table can usually be used as a `FULL` kind model. + +### 2) An incremental table (PRIMARY KEY recommended) + +```sql +MODEL ( + name user_events_inc, + kind INCREMENTAL_BY_TIME_RANGE( + time_column event_date + ), + physical_properties ( + primary_key = (user_id, event_date), + partition_by = (date_trunc('day', event_date)), + distributed_by = (kind=HASH, expressions=user_id, buckets=16) + ) +); + +SELECT + user_id, + event_date, + COUNT(*) AS cnt +FROM source.user_events +WHERE event_date BETWEEN @start_ds AND @end_ds +GROUP BY user_id, event_date; +``` + +## Table Types + +StarRocks supports four table types: **DUPLICATE KEY**, **PRIMARY KEY**, **UNIQUE KEY**, and **AGGREGATE KEY**. + +SQLMesh configures StarRocks table types via `physical_properties` (engine-specific table properties). + +> **Note**: StarRocks `AGGREGATE KEY` requires per-value-column aggregation functions, which SQLMesh model syntax **DOES NOT** currently support. Use `PRIMARY KEY` or `DUPLICATE KEY` instead. + +### DUPLICATE KEY Type (Default) + +If you do not set a key type, StarRocks creates a DUPLICATE KEY table by default. 
+ +**Example:** + +```sql +MODEL ( + name user_events, + kind FULL, + physical_properties ( + distributed_by = RANDOM + ) +); +``` + +### PRIMARY KEY Type + +For incremental models, **PRIMARY KEY tables are needed** (and effectively required for robust deletes) because StarRocks supports *weaker* `DELETE ... WHERE ...` on non-primary-key table types. + +SQLMesh will apply conservative `WHERE` transformations for compatibility (for example, converting `BETWEEN` to `>= AND <=`, removing boolean literals, and converting `DELETE ... WHERE TRUE` to `TRUNCATE TABLE`). To avoid limitations and keep incremental maintenance reliable, use a `PRIMARY KEY` table by setting `physical_properties.primary_key`. + +> SQLMesh currently does not support specifying `primary_key` as a model parameter. + +**Example (INCREMENTAL_BY_TIME_RANGE):** + +```sql +MODEL ( + name user_events, + kind INCREMENTAL_BY_TIME_RANGE( + time_column event_date + ), + physical_properties ( + primary_key = (user_id, event_date), + distributed_by = (kind=HASH, expressions=user_id, buckets=16) + ) +); + +SELECT + user_id, + event_date, + COUNT(*) AS cnt +FROM source.user_events +WHERE event_date BETWEEN @start_ds AND @end_ds +GROUP BY user_id, event_date; +``` + +### UNIQUE KEY Type + +You can create a UNIQUE KEY table by setting `physical_properties.unique_key`. In most incremental use cases, a PRIMARY KEY table is recommended instead. + +**Example:** + +```sql +MODEL ( + name user_events_unique, + kind FULL, + physical_properties ( + unique_key = (user_id, event_date), + distributed_by = (kind=HASH, expressions=user_id, buckets=16) + ) +); +``` + +## Table Properties + +This section documents StarRocks engine-specific table properties via `physical_properties (...)` (table properties). 
Most properties support: + +* **Structured form** (recommended): easier validation and clearer intent +* **String fallback**: for convenience or when you want to paste native StarRocks syntax quickly + +Most of the time, the value syntax is the same as, or similar to, the corresponding clause in StarRocks, especially for a **string** type value. + +When specifying **string** values, prefer **single quotes**. + +### Configuration Matrix + +| Property | Where | Recommended form | String fallback | Notes | +| --- | --- | --- | --- | --- | +| `primary_key` | `physical_properties` | `primary_key = (col1, col2)` | `primary_key = 'col1, col2'` | Required for PRIMARY KEY tables (recommended for incremental). | +| `duplicate_key` | `physical_properties` | `duplicate_key = (col1, col2)` | `duplicate_key = 'col1, col2'` | Explicitly sets DUPLICATE KEY table type. | +| `unique_key` | `physical_properties` | `unique_key = (col1, col2)` | `unique_key = 'col1, col2'` | Sets UNIQUE KEY table type. | +| `partitioned_by` / `partition_by` | `MODEL` / `physical_properties` | `partitioned_by (dt)` (model param) / `partition_by = RANGE(dt, region)` (table property) | `partition_by = 'RANGE(dt, region)'` | It's recommended to use `partition_by` in `physical_properties` for RANGE/LIST partitioning together with `partitions`. | +| `partitions` | `physical_properties` | `partitions = ('PARTITION ...', 'PARTITION ...')` | `partitions = 'PARTITION ...'` | Initial partitions; easiest to express as strings. When using RANGE or LIST partitioning, you need to specify initial `partitions`.
| +| `distributed_by` | `physical_properties` | `distributed_by = (kind=HASH, expressions=(c1, c2), buckets=10)` | `distributed_by = 'HASH(c1, c2) BUCKETS 10'` / `distributed_by = 'RANDOM'` | | +| `clustered_by` / `order_by` | `MODEL` / `physical_properties` | `clustered_by (col1, col2)` / `order_by = (col1, col2)` | `order_by = 'col1, col2'` | Ordering/clustering columns for query performance when they differ from the table key. | +| Other properties | `physical_properties` | Use strings (recommended) | Use strings | StarRocks `PROPERTIES` are string key/value pairs. | + +**Notes:** + +* You can use enum-like values without quotes (for example `HASH`, `RANDOM`, `IMMEDIATE`), but strings are also accepted (prefer single quotes). +* Aliases exist for convenience: use `partition_by` (table property) as an alias of `partitioned_by` (model parameter), and `order_by` ↔ `clustered_by`. +* Only a few properties can be set as model +parameters: `partitioned_by`, `clustered_by`. However, for +simplicity, it is recommended to use table properties +only. + +### Table Key Properties + +Table key properties accept multiple forms: + +* **Structured**: `col` or `(col1, col2, ...)` +* **String**: `'col'` or `'col1, col2'` + +**Syntax:** + +* Structured: `primary_key = col`, `primary_key = (col1, col2)`, `duplicate_key = (col2)` +* String: `primary_key = 'col1, col2'`, `unique_key = '(col2, col3)'`.
+ +#### PRIMARY KEY + +```sql +MODEL ( + name my_pk_table, + kind FULL, + physical_properties ( + primary_key = (id, ds), + distributed_by = (kind=HASH, expressions=id, buckets=10) + ) +); +``` + +#### DUPLICATE KEY + +```sql +MODEL ( + name my_dup_table, + kind FULL, + physical_properties ( + duplicate_key = (id, ds), + distributed_by = RANDOM + ) +); +``` + +#### UNIQUE KEY + +```sql +MODEL ( + name my_unique_table, + kind FULL, + physical_properties ( + unique_key = (id, ds), + distributed_by = (kind=HASH, expressions=id, buckets=10) + ) +); +``` + +### Partitioning + +StarRocks supports `RANGE` partitioning, `LIST` partitioning, and **expression partitioning**. + +You can specify partitioning either: + +* As a **model parameter**: `partitioned_by (...)` (good for simple expressions) +* As a **table property**: `physical_properties(partition_by=...)` (recommended when you need RANGE/LIST, or complex expressions) + +For `RANGE` and `LIST` partitioning, you generally need to provide initial `partitions` (pre-created partitions). For expression partitioning, `partitions` is usually not needed. + +#### `partitioned_by` / `partition_by` + +NOTE: + +* `partitioned_by (...)` can only be used as a model parameter (SQLMesh enforces this constraint). +* `partition_by` can be provided in `physical_properties` as table properties (for advanced partitioning). 
+ +**Syntax:** + +* Expression list: `partitioned_by (col)` / `partitioned_by (expr1, expr2)` + * a more complex example: `partition_by = (date_trunc('day', col2), col3)` +* RANGE/LIST: `partition_by = RANGE(col1, col2)` / `partition_by = LIST(col1, col2)` +* String fallback: `partition_by = 'RANGE(col1, col2)'` + +#### `partitions` + +**Syntax:** + +* Tuple of strings: `partitions = ('PARTITION ...', 'PARTITION ...')` +* Single string: `partitions = 'PARTITION ...'` + +#### Expression partitioning + +```sql +MODEL ( + name my_partitioned_model, + kind INCREMENTAL_BY_TIME_RANGE(time_column event_date), + partitioned_by (date_trunc('day', event_time), region), + physical_properties ( + primary_key = (user_id, event_date, region), + distributed_by = (kind=HASH, expressions=user_id, buckets=10) + ) +); +``` + +#### RANGE partitioning + +```sql +MODEL ( + name my_partitioned_model_advanced, + kind FULL, + physical_properties ( + partition_by = RANGE(event_time), + partitions = ( + 'PARTITION p20240101 VALUES [("2024-01-01"), ("2024-01-02"))', + 'PARTITION p20240102 VALUES [("2024-01-02"), ("2024-01-03"))' + ), + distributed_by = (kind=HASH, expressions=region, buckets=10) + ) +); +``` + +`LIST` partitioning is configured the same way as `RANGE` partitioning. + +### Distribution + +StarRocks supports both `HASH` and `RANDOM` distribution. You can use a structured value or a string. + +1. Structured type syntax: ```(kind=<kind> [, expressions=<columns>] [, buckets=<num>])``` + + * **kind**: `HASH` OR `RANDOM`. + * **expressions**: a single column or a tuple of columns, such as `col1` or `(col1, col2)`. (optional) + * **buckets**: bucket number. (optional) + +2. String form, for example `'HASH(id) BUCKETS 10'`, which is the same as the distribution clause in StarRocks's `CREATE TABLE`. +3. Or even a single enum-like value: `distributed_by = RANDOM`.
+ +#### HASH distribution + +Structured type (recommended): + +```sql +MODEL ( + name my_table, + kind FULL, + physical_properties ( + distributed_by = (kind=HASH, expressions=(user_id), buckets=10) + ) +); +``` + +#### RANDOM distribution + +Simple enum-like value: + +```sql +MODEL ( + name my_table_random, + kind FULL, + physical_properties ( + distributed_by = RANDOM + ) +); +``` + +#### String fallback + +A single string, which is the same as the clause in StarRocks's `CREATE TABLE`. + +```sql +MODEL ( + name my_table_string_dist, + kind FULL, + physical_properties ( + distributed_by = 'HASH(user_id) BUCKETS 10' + ) +); +``` + +### Ordering + +You can use `clustered_by` or `order_by` to specify the column ordering to optimize query performance when it differs from the table key. + +You can specify `clustered_by` both as a model parameter and a table property, but you can only specify `order_by` as a table property. + +**Syntax:** + +* Structured: `order_by = col` / `order_by = (col1, col2)` +* String fallback: `order_by = 'col1, col2'` + +```sql +MODEL ( + name my_ordered_table, + kind FULL, + physical_properties ( + order_by = (ds, id), + distributed_by = (kind=HASH, expressions=id, buckets=10) + ) +); +``` + +### Generic PROPERTIES + +Any additional properties in `physical_properties` are passed through as StarRocks `PROPERTIES`. Since StarRocks `PROPERTIES` values are typically strings, using strings is recommended. + +```sql +MODEL ( + name advanced_table, + kind FULL, + physical_properties ( + primary_key = (id), + distributed_by = (kind=HASH, expressions=id, buckets=8), + replication_num = '1', + storage_medium = 'SSD', + enable_persistent_index = 'true', + compression = 'LZ4' + ) +); +``` + +## Views and Materialized Views + +### Views + +StarRocks supports view `SECURITY` via `physical_properties.security`. + +**Syntax:** + +* `security = INVOKER` or `security = NONE`.
(optional) + +```sql +MODEL ( + name user_summary_view, + kind VIEW, + physical_properties ( + security = INVOKER + ) +); + +SELECT + user_id, + COUNT(*) AS event_count, + MAX(event_time) AS last_event_time +FROM user_events +GROUP BY user_id; +``` + +### Materialized Views (MV) + +SQLMesh uses `kind VIEW (materialized true)` to create materialized views. + +You can specify StarRocks MV refresh settings using the same `physical_properties` block. + +**Refresh properties:** + +* `refresh_moment`: `IMMEDIATE` or `DEFERRED` (optional) +* `refresh_scheme`: `MANUAL` or `ASYNC ...` (optional) + * Examples: `ASYNC`, `MANUAL`, `ASYNC START ("2024-01-01 00:00:00") EVERY (INTERVAL 5 MINUTE)` + * The syntax of the `ASYNC ...` clause is the same as in StarRocks. + +```sql +MODEL ( + name user_summary_mv, + kind VIEW ( + materialized true + ), + physical_properties ( + refresh_moment = DEFERRED, + refresh_scheme = 'ASYNC START ("2024-01-01 00:00:00") EVERY (INTERVAL 5 MINUTE)' + ) +); + +SELECT + user_id, + COUNT(*) AS event_count, + MAX(event_time) AS last_event_time +FROM user_events +GROUP BY user_id; +``` + +**Other properties:** + +You can specify `partitioning`, `distribution`, `order by` and `properties` the same as normal table properties. Note that only properties supported for MVs take effect; refer to the StarRocks documentation for MV creation. + +**Notes:** + +* If you create materialized views with `replace=true`, SQLMesh may drop and recreate the MV. When an MV is dropped, its data is removed and the MV must be refreshed/rebuilt again. +* There are some restrictions on `partitioning`; refer to the StarRocks documentation for the MV partitioning specification. +* StarRocks MV schema supports a column list but does **not** support explicit data types in that list. Column data types come from the `AS SELECT ...` query. +* If you create MVs from a dataframe via the Python API, provide `target_columns_to_types` (a `Dict[str, exp.DataType]`).
If you don't care about exact types, you can set all columns to `VARCHAR` as a fallback: + +```python +from sqlglot import exp + +target_columns_to_types = { + "col1": exp.DataType.build("VARCHAR"), + "col2": exp.DataType.build("VARCHAR"), +} +``` + +## Limitations + +* **No sync MV support (currently)**: synchronous materialized views are not supported yet. +* **No tuple IN**: StarRocks does not support `(c1, c2) IN ((v1, v2), ...)`. +* **No `SELECT ... FOR UPDATE`**: StarRocks is an OLAP database and does not support row locks; SQLMesh removes `FOR UPDATE` when executing SQLGlot expressions. +* **RENAME caveat**: `ALTER TABLE db.old RENAME db.new` is not supported; the `RENAME` target cannot be qualified with a database name. + +## Dependencies + +To use StarRocks with SQLMesh, install the required MySQL driver: + +```bash +pip install "sqlmesh[starrocks]" +# or +pip install pymysql +``` + +## Resources + +* [StarRocks Documentation](https://docs.starrocks.io/) +* [StarRocks Table Design Guide](https://docs.starrocks.io/docs/table_design/StarRocks_table_design/) +* [StarRocks SQL Reference](https://docs.starrocks.io/docs/sql-reference/sql-statements/data-definition/CREATE_TABLE/) diff --git a/docs/integrations/overview.md b/docs/integrations/overview.md index 94b9289d21..10525fecea 100644 --- a/docs/integrations/overview.md +++ b/docs/integrations/overview.md @@ -26,4 +26,5 @@ SQLMesh supports the following execution engines for running SQLMesh projects (e * [Redshift](./engines/redshift.md) (redshift) * [Snowflake](./engines/snowflake.md) (snowflake) * [Spark](./engines/spark.md) (spark) +* [StarRocks](./engines/starrocks.md) (starrocks) * [Trino](./engines/trino.md) (trino) diff --git a/mkdocs.yml b/mkdocs.yml index 86761de9d7..368fb6690a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -93,6 +93,7 @@ nav: - integrations/engines/risingwave.md - integrations/engines/snowflake.md - integrations/engines/spark.md + - integrations/engines/starrocks.md - 
integrations/engines/trino.md - Resources: - comparisons.md diff --git a/pyproject.toml b/pyproject.toml index f556092ffd..0f9a9e2c3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,6 +124,7 @@ snowflake = [ "snowflake-connector-python[pandas,secure-local-storage]", "snowflake-snowpark-python", ] +starrocks = ["pymysql"] trino = ["trino"] web = [ "fastapi==0.120.1", @@ -271,6 +272,7 @@ markers = [ "pyspark: test for PySpark that need to run separately from the other spark tests", "trino: test for Trino (all connectors)", "risingwave: test for Risingwave", + "starrocks: test for StarRocks", # Other "set_default_connection", diff --git a/sqlmesh/core/config/__init__.py b/sqlmesh/core/config/__init__.py index 42ed82c6e6..50d2d9a5a2 100644 --- a/sqlmesh/core/config/__init__.py +++ b/sqlmesh/core/config/__init__.py @@ -22,6 +22,7 @@ RedshiftConnectionConfig as RedshiftConnectionConfig, SnowflakeConnectionConfig as SnowflakeConnectionConfig, SparkConnectionConfig as SparkConnectionConfig, + StarRocksConnectionConfig as StarRocksConnectionConfig, TrinoConnectionConfig as TrinoConnectionConfig, parse_connection_config as parse_connection_config, ) diff --git a/sqlmesh/core/config/connection.py b/sqlmesh/core/config/connection.py index 343414eab2..7d79df03a9 100644 --- a/sqlmesh/core/config/connection.py +++ b/sqlmesh/core/config/connection.py @@ -58,6 +58,7 @@ "trino", # Nullable types are problematic "clickhouse", + "starrocks", } MOTHERDUCK_TOKEN_REGEX = re.compile(r"(\?|\&)(motherduck_token=)(\S*)") PASSWORD_REGEX = re.compile(r"(password=)(\S+)") @@ -2349,6 +2350,80 @@ def init(cursor: t.Any) -> None: return init +class StarRocksConnectionConfig(ConnectionConfig): + """Configuration for the StarRocks connection. + + StarRocks uses MySQL network protocol and is compatible with MySQL ecosystem tools, + JDBC/ODBC drivers, and various visualization tools. + + Args: + host: The hostname of the StarRocks FE (Frontend) node. + user: The StarRocks username. 
+ password: The StarRocks password. + port: The port number of the StarRocks FE node. Default is 9030. + database: The optional database name. + charset: The optional character set. TODO: may be not supported yet. + collation: The optional collation. TODO: may be not supported yet. + ssl_disabled: Whether to disable SSL connection. TODO: need to check it. + concurrent_tasks: The maximum number of tasks that can use this connection concurrently. + register_comments: Whether or not to register model comments with the SQL engine. + local_infile: Whether or not to allow local file access. + pre_ping: Whether or not to pre-ping the connection before starting a new transaction to ensure it is still alive. + """ + + host: str + user: str + password: str + port: t.Optional[int] = 9030 + database: t.Optional[str] = None + charset: t.Optional[str] = None + collation: t.Optional[str] = None + ssl_disabled: t.Optional[bool] = None + + concurrent_tasks: int = 4 + register_comments: bool = True + local_infile: bool = False + pre_ping: bool = True + + type_: t.Literal["starrocks"] = Field(alias="type", default="starrocks") + DIALECT: t.ClassVar[t.Literal["starrocks"]] = "starrocks" + DISPLAY_NAME: t.ClassVar[t.Literal["StarRocks"]] = "StarRocks" + DISPLAY_ORDER: t.ClassVar[t.Literal[19]] = 19 + + _engine_import_validator = _get_engine_import_validator("pymysql", "starrocks") + + @property + def _connection_kwargs_keys(self) -> t.Set[str]: + connection_keys = { + "host", + "user", + "password", + } + if self.port is not None: + connection_keys.add("port") + if self.database is not None: + connection_keys.add("database") + if self.charset is not None: + connection_keys.add("charset") + if self.collation is not None: + connection_keys.add("collation") + if self.ssl_disabled is not None: + connection_keys.add("ssl_disabled") + if self.local_infile is not None: + connection_keys.add("local_infile") + return connection_keys + + @property + def _engine_adapter(self) -> 
t.Type[EngineAdapter]: + return engine_adapter.StarRocksEngineAdapter + + @property + def _connection_factory(self) -> t.Callable: + from pymysql import connect + + return connect + + _CONNECTION_CONFIG_EXCLUDE: t.Set[t.Type[ConnectionConfig]] = { ConnectionConfig, # type: ignore[type-abstract] BaseDuckDBConnectionConfig, # type: ignore[type-abstract] diff --git a/sqlmesh/core/engine_adapter/__init__.py b/sqlmesh/core/engine_adapter/__init__.py index ab29885c7b..cb9db5ea77 100644 --- a/sqlmesh/core/engine_adapter/__init__.py +++ b/sqlmesh/core/engine_adapter/__init__.py @@ -16,6 +16,7 @@ from sqlmesh.core.engine_adapter.redshift import RedshiftEngineAdapter from sqlmesh.core.engine_adapter.snowflake import SnowflakeEngineAdapter from sqlmesh.core.engine_adapter.spark import SparkEngineAdapter +from sqlmesh.core.engine_adapter.starrocks import StarRocksEngineAdapter from sqlmesh.core.engine_adapter.trino import TrinoEngineAdapter from sqlmesh.core.engine_adapter.athena import AthenaEngineAdapter from sqlmesh.core.engine_adapter.risingwave import RisingwaveEngineAdapter @@ -37,6 +38,7 @@ "athena": AthenaEngineAdapter, "risingwave": RisingwaveEngineAdapter, "fabric": FabricEngineAdapter, + "starrocks": StarRocksEngineAdapter, } DIALECT_ALIASES = { diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py new file mode 100644 index 0000000000..b8ff61b158 --- /dev/null +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -0,0 +1,3570 @@ +from __future__ import annotations + +import logging +import re +import sqlglot +from sqlglot import exp +import typing as t + +from sqlmesh.core.engine_adapter.base import ( + InsertOverwriteStrategy, + get_source_columns_to_types, +) +from sqlmesh.core.engine_adapter.mixins import ( + ClusteredByMixin, + LogicalMergeMixin, + PandasNativeFetchDFSupportMixin, +) +from sqlmesh.core.engine_adapter.shared import ( + CommentCreationTable, + CommentCreationView, + DataObject, + DataObjectType, + 
set_catalog, + to_schema, +) +from sqlmesh.core.node import IntervalUnit +from sqlmesh.utils.errors import SQLMeshError + +if t.TYPE_CHECKING: + from sqlmesh.core._typing import SchemaName, TableName + from sqlmesh.core.engine_adapter._typing import QueryOrDF + +logger = logging.getLogger(__name__) + + +############################################################################### +# Declarative Type System for Property Validation and Normalization +############################################################################### +""" +Declarative type system for property validation and normalization. + +This module provides a declarative way to define property types with clear separation +between validation (type checking) and normalization (type conversion). +""" +Validated = t.Any # validated intermediate value (AST nodes, string, list...) +Normalized = t.Any # final normalized output + +# Allowed outputs for EnumType normalize / or general property outputs. +PROPERTY_OUTPUT_TYPES = { + "str", # "HASH" + "var", # exp.Var("ASYNC") + "identifier", # exp.Identifier + "literal", # exp.Literal.string("HASH") + "column", # exp.Column(this="HASH") + "ast_expr", # generic exp.Expression +} + + +# ============================================================ +# Fragment parser (robust-ish) +# ============================================================ +def parse_fragment(text: str) -> t.Union[exp.Expression, t.List[exp.Expression]]: + """ + Try to parse a DSL fragment into SQLGlot AST(s). + + Behavior: + 1. If parse_one succeeds, return the exp.Expression. + 2. If fails but text contains comma, split by commas and parse each part. + 3. If it's parenthesized like "(a, b)", parse and return exp.Tuple or list. + 4. If it's a simple token like "IDENT", return exp.Identifier. 
+ """ + if isinstance(text, exp.Expression): + return text + + if not isinstance(text, str): + raise TypeError("parse_fragment expects a string") + + s = text.strip() + try: + parsed = sqlglot.parse_one(s) + return parsed + except Exception: + raise ValueError(f"Unable to parse fragment: {s}") + + +# ============================================================ +# Base Type +# ============================================================ +class DeclarativeType: + """ + Base class for declarative type system. + + Design Philosophy: + ----------------- + - validate(value): Type checking only - returns validated intermediate value or None + - normalize(validated): Type conversion only - transforms to target output format + + Methods: + -------- + validate(value) -> Optional[Validated] + Check if value conforms to this type, maybe include some tiny different types + Returns: Validated intermediate value if valid, None otherwise. + + normalize(validated) -> Normalized + Convert validated intermediate value to final output format. + Returns: Normalized value in target format. + + __call__(value) -> Normalized + Convenience method: validate + normalize in one step. + """ + + def validate(self, value: t.Any) -> t.Optional[Validated]: + """Check if value conforms to this type. Return validated value or None. 
class LiteralType(DeclarativeType):
    """
    Validator for SQL literals.

    Accepted input:
    - an ``exp.Literal`` node
    - a string that ``parse_fragment`` turns into an ``exp.Literal``

    ``validate`` yields the ``exp.Literal`` (None when the value is not a
    literal); ``normalize`` shapes it according to ``normalized_type``.
    """

    def __init__(self, normalized_type: t.Optional[str] = None):
        """
        Args:
            normalized_type: Output form used by ``normalize``.
                - "str": return the literal's ``this`` value
                - None, "literal" or anything else: return the exp.Literal itself
        """
        self.normalized_type = normalized_type

    def validate(self, value: t.Any) -> t.Optional[exp.Literal]:
        """Return the value as an exp.Literal, or None when it is not a literal."""
        candidate = value
        if isinstance(candidate, str):
            # Strings are parsed first; an unparseable string is simply "not a literal".
            try:
                candidate = parse_fragment(candidate)
            except Exception:
                return None

        return candidate if isinstance(candidate, exp.Literal) else None

    def normalize(self, validated: exp.Literal) -> t.Union[exp.Literal, str]:
        """Return ``validated.this`` for "str" output, otherwise the literal node unchanged."""
        return validated.this if self.normalized_type == "str" else validated
class ColumnType(DeclarativeType):
    """
    Column type validator.

    Accepts:
    - exp.Column
    - String that ``parse_fragment`` parses into an exp.Column

    Validation: Returns exp.Column if valid, None otherwise.
    Normalization: Converts to target type based on normalized_type parameter.
    """

    def __init__(self, normalized_type: t.Optional[str] = None):
        """
        Args:
            normalized_type: Target type for normalization.
                - None: Keep as exp.Column (default)
                - "literal": Convert to exp.Literal.string()
                - "str": Convert to Python string
                - "identifier": Convert to exp.Identifier
                - "column": Keep as exp.Column
        """
        self.normalized_type = normalized_type

    def validate(self, value: t.Any) -> t.Optional[exp.Column]:
        """Check if value is a column type. Returns exp.Column or None."""
        # Strings are parsed first; an unparseable string is simply "not a column".
        if isinstance(value, str):
            try:
                value = parse_fragment(value)
            except Exception:
                return None

        if isinstance(value, exp.Column):
            return value

        return None

    def normalize(
        self, validated: exp.Column
    ) -> t.Union[exp.Column, exp.Identifier, exp.Literal, str]:
        """
        Convert the validated column to the form selected by ``normalized_type``.

        Returns:
            - "identifier": an exp.Identifier for the column name
            - "literal": exp.Literal.string() built from the column name node
            - "str": ``str()`` of the column name node
            - None / "column" / anything else: the exp.Column unchanged
        """
        if self.normalized_type == "identifier":
            inner = validated.this
            # The column name is usually already an exp.Identifier. Wrapping it again
            # would nest one Identifier inside another and drop its quoting flag, so
            # return a detached copy of the existing node instead.
            if isinstance(inner, exp.Identifier):
                return inner.copy()
            return exp.Identifier(this=inner)
        if self.normalized_type == "literal":
            return exp.Literal.string(validated.this)
        if self.normalized_type == "str":
            return str(validated.this)
        # None or "column" - keep as-is
        return validated
class EnumType(DeclarativeType):
    """
    Validator for values restricted to a fixed set of names.

    Accepted input types:
    - str
    - exp.Literal
    - exp.Var
    - exp.Identifier
    - exp.Column

    Parameters:
    -----------
    valid_values : t.Sequence[str]
        Allowed names (e.g., ["HASH", "RANDOM"])
    normalized_type : str
        Output form produced by normalize(); must be None or a member of
        PROPERTY_OUTPUT_TYPES:
        - "str": Python string (default)
        - "var": exp.Var
        - "identifier": exp.Identifier
        - "literal": exp.Literal.string()
        - "column": exp.Column
        - "ast_expr": generic expression (an exp.Identifier is produced)
    case_sensitive : bool
        Whether matching is case-sensitive (default: False)

    Validation: Returns the comparison form of the text (upper-cased unless
        case_sensitive) when it is one of the allowed names, None otherwise.
    Normalization: Wraps that text in the configured output form.

    Raises:
        ValueError: From the constructor when normalized_type is not recognised.
    """

    def __init__(
        self,
        valid_values: t.Sequence[str],
        normalized_type: str = "str",
        case_sensitive: bool = False,
    ):
        self.valid_values = list(valid_values)
        self.case_sensitive = bool(case_sensitive)
        self.normalized_type = normalized_type

        recognised = (
            self.normalized_type is None
            or self.normalized_type in PROPERTY_OUTPUT_TYPES
        )
        if not recognised:
            raise ValueError(
                f"normalized_type must be one of {PROPERTY_OUTPUT_TYPES}, got {self.normalized_type!r}"
            )

        # Comparison forms are computed once so validate() is a plain membership test.
        self._values_normalized = [
            self._normalize_text(name) for name in self.valid_values
        ]

    def _extract_text(self, value: t.Any) -> t.Optional[str]:
        """Return the textual name carried by ``value``, or None for unsupported types."""
        if isinstance(value, str):
            return value
        if isinstance(value, (exp.Literal, exp.Var)):
            return str(value.this)
        if not isinstance(value, (exp.Identifier, exp.Column)):
            return None

        # For Identifier/Column the payload may itself be another expression node.
        inner = value.this
        if isinstance(inner, str):
            return inner
        if hasattr(inner, "name"):
            return str(inner.name)
        return str(inner)

    def _normalize_text(self, text: str) -> str:
        """Return ``text`` in comparison form: unchanged if case-sensitive, else upper-cased."""
        if self.case_sensitive:
            return text
        return text.upper()

    def validate(self, value: t.Any) -> t.Optional[str]:
        """Return the comparison form of ``value`` if it is an allowed name, else None."""
        candidate = value
        if isinstance(candidate, str):
            try:
                parsed = parse_fragment(candidate)
            except Exception:
                # Unparseable text is still compared as a plain string below.
                parsed = None
            if isinstance(parsed, (exp.Identifier, exp.Literal, exp.Column)):
                candidate = parsed

        text = self._extract_text(candidate)
        if text is None:
            return None

        comparable = self._normalize_text(text)
        return comparable if comparable in self._values_normalized else None

    def normalize(self, validated: str) -> Normalized:
        """Wrap the validated name in the node type selected by ``normalized_type``."""
        builders = {
            "var": lambda name: exp.Var(this=name),
            "literal": lambda name: exp.Literal.string(name),
            "identifier": lambda name: exp.Identifier(this=name),
            "column": lambda name: exp.Column(this=name),
            "ast_expr": lambda name: exp.Identifier(this=name),
        }
        build = builders.get(self.normalized_type)
        # None, "str" and any unlisted value all fall back to the plain string.
        if build is None:
            return validated
        return build(validated)
Returns exp.Func/exp.Anonymous or None.""" + # Try parsing string first + if isinstance(value, str): + try: + value = parse_fragment(value) + except Exception: + return None + + # Check if it's a Func or Anonymous function + if isinstance(value, (exp.Func, exp.Anonymous)): + return value + + return None + + def normalize( + self, validated: t.Union[exp.Func, exp.Anonymous] + ) -> t.Union[exp.Func, exp.Anonymous]: + """Return function expression as-is (identity normalization).""" + return validated + + +# ============================================================ +# AnyOf (combinator) +# ============================================================ +class AnyOf(DeclarativeType): + """ + Union type - accepts first matching subtype. + + This is a combinator type that tries each subtype in order and accepts + the first one that validates successfully. + + Validation: Tries each subtype, returns (matched_type, validated_value) tuple. + Normalization: Uses the matched subtype's normalize method. 
class SequenceOf(DeclarativeType):
    """
    Validator for sequences whose elements each match one of several types.

    Every element is tried against the element types in declaration order and
    the first type that validates it wins (the same rule AnyOf applies to a
    single value). A single non-sequence value can optionally be promoted to a
    one-element sequence.

    Accepts:
    - exp.Tuple: (a, b, c)
    - exp.Array: [a, b, c]
    - exp.Paren: (a) or ((a, b))
    - Python list/tuple: [a, b] or (a, b)
    - String: parsed with parse_fragment, then treated as one of the above
    - Single element: a (only if allow_single=True, promoted to [a])

    Validation: Returns list of (matched_type, validated_value) tuples or None.
    Normalization: Returns the elements normalized by their matched type.

    Examples:
        # Single type
        SequenceOf(ColumnType())

        # Multiple types (union) - each element tries types in order
        SequenceOf(ColumnType(), IdentifierType(), LiteralType())

        # Allow single element
        SequenceOf(ColumnType(), allow_single=True)

        # Multiple types + allow single
        SequenceOf(ColumnType(), IdentifierType(), allow_single=True)
    """

    def __init__(
        self,
        *elem_types: DeclarativeType,
        allow_single: bool = False,
        output_as: str = "list",
    ):
        """
        Args:
            *elem_types: One or more element validators, tried in order per element.
            allow_single: Whether a lone element is promoted to a list. Default: False.
            output_as: "tuple" makes normalize() return a tuple; any other value
                yields a list. Default: "list".

        Raises:
            ValueError: If no element type is given.
        """
        if not elem_types:
            raise ValueError("SequenceOf requires at least one element type")

        self.elem_types: t.List[DeclarativeType] = list(elem_types)
        self.allow_single = allow_single
        self.output_as = output_as

    def validate(
        self, value: t.Any
    ) -> t.Optional[t.List[t.Tuple[DeclarativeType, Validated]]]:
        """Validate every element; return (matched_type, validated_value) pairs or None."""
        raw_items = self._extract_elements(value)
        if raw_items is None:
            return None

        matched_items: t.List[t.Tuple[DeclarativeType, Validated]] = []
        for raw in raw_items:
            for candidate_type in self.elem_types:
                outcome = candidate_type.validate(raw)
                if outcome is not None:
                    matched_items.append((candidate_type, outcome))
                    break
            else:
                # No element type accepted this item, so the whole sequence is rejected.
                return None

        return matched_items

    def normalize(
        self, validated: t.List[t.Tuple[DeclarativeType, Validated]]
    ) -> t.Union[t.List[Normalized], t.Tuple[Normalized, ...]]:
        """Normalize each element with the type that matched it during validation."""
        results = [matched.normalize(payload) for matched, payload in validated]
        return tuple(results) if self.output_as == "tuple" else results

    def _extract_elements(self, value: t.Any) -> t.Optional[t.List[t.Any]]:
        """
        Pull the raw elements out of any supported container representation.

        Returns the list of raw elements, or None when ``value`` is not a
        recognised container (and single-element promotion does not apply).
        """
        # Python containers are taken as-is, before any string parsing.
        if isinstance(value, (list, tuple)):
            return list(value)

        if isinstance(value, str):
            try:
                value = parse_fragment(value)
            except Exception:
                # An unparseable string is kept as a single raw string element, but
                # only when promotion is on and some element type is a StringType.
                accepts_plain_string = any(
                    isinstance(elem_type, StringType) for elem_type in self.elem_types
                )
                if self.allow_single and accepts_plain_string:
                    return [value]
                return None

        # SQL tuple "(a, b, c)" and SQL array "[a, b, c]" both expose .expressions.
        if isinstance(value, (exp.Tuple, exp.Array)):
            return list(value.expressions)

        # SQL paren: "(a)" holds one element, "((a, b))" holds a tuple.
        if isinstance(value, exp.Paren):
            wrapped = value.this
            if isinstance(wrapped, exp.Tuple):
                return list(wrapped.expressions)
            return [wrapped]

        # Any other single AST node is promoted only when allowed.
        if self.allow_single and isinstance(value, exp.Expression):
            return [value]

        return None
class StructuredTupleType(DeclarativeType):
    """
    Base class for validating tuples with typed fields.

    Subclasses define FIELDS dict to specify structure:

        FIELDS = {
            "field_name": Field(
                type=SomeType(),
                required=True,
                aliases=["alt_name1", "alt_name2"]
            ),
            ...
        }

    Validation Process:
    1. Parse tuple into key=value pairs (exp.EQ); non-EQ items are skipped
    2. Match keys against FIELDS (including aliases)
    3. Validate each field value with specified type
    4. Check required fields are present
    5. Handle unknown/invalid fields based on error flags

    Returns: Dict[str, Any] with canonical field names as keys

    Example:
        class DistributionTupleInputType(StructuredTupleType):
            FIELDS = {
                "kind": Field(type=EnumType(["HASH", "RANDOM"]), required=True),
                "columns": Field(type=SequenceOf(ColumnType())),
            }

    Args:
        error_on_unknown_field: If True, raise error when encountering unknown fields.
                               If False, silently skip unknown fields (default: True)
        error_on_invalid_field: If True, raise error when field value validation fails
                               or a required field is missing.
                               If False, return None for entire validation (default: True)
    """

    FIELDS: t.Dict[str, Field] = {}  # Subclasses override this

    def __init__(
        self, error_on_unknown_field: bool = True, error_on_invalid_field: bool = True
    ):
        self.error_on_unknown_field = error_on_unknown_field
        self.error_on_invalid_field = error_on_invalid_field

        # Build alias mapping: alias -> canonical_name
        self._alias_map: t.Dict[str, str] = {}
        for field_name, field_spec in self.FIELDS.items():
            # Map canonical name to itself
            self._alias_map[field_name] = field_name
            # Map aliases to canonical name
            for alias in field_spec.aliases:
                self._alias_map[alias] = field_name

    def validate(
        self, value: t.Any
    ) -> t.Optional[t.Dict[str, t.Tuple[DeclarativeType, Validated]]]:
        """
        Validate structured tuple.

        Returns: Dict mapping canonical field names to (matched_type, validated_value) tuples,
                 or None if validation fails.

        Raises:
            ValueError: If error_on_unknown_field=True and unknown field encountered
            ValueError: If error_on_invalid_field=True and a field value is invalid
                or a required field is missing
        """
        # Try parsing string first
        if isinstance(value, str):
            try:
                value = parse_fragment(value)
            except Exception:
                return None

        # Extract key=value pairs from tuple/paren/list
        pairs = self._extract_pairs(value)
        if pairs is None:
            return None

        # Validate each pair and build result dict
        result: t.Dict[str, t.Tuple[DeclarativeType, Validated]] = {}
        eq_type = EqType()

        for pair_expr in pairs:
            # Validate as EQ expression
            eq_validated = eq_type.validate(pair_expr)
            if eq_validated is None:
                continue  # Skip non-EQ expressions

            key, value_expr = eq_validated

            # Resolve alias to canonical name
            canonical_name = self._alias_map.get(key)
            if canonical_name is None:
                # Unknown field
                if self.error_on_unknown_field:
                    raise ValueError(
                        f"Unknown field '{key}' in {self.__class__.__name__}. "
                        f"Valid fields: {list(self.FIELDS.keys())}"
                    )
                # Skip unknown field
                continue

            # Get field spec
            field_spec = self.FIELDS[canonical_name]

            # Validate field value with specified type
            validated_value = field_spec.type.validate(value_expr)
            if validated_value is None:
                # Field validation failed
                if self.error_on_invalid_field:
                    raise ValueError(
                        f"Invalid value for field '{canonical_name}': {value_expr}. "
                        f"Expected type: {field_spec.type.__class__.__name__}, "
                        f"Actual type: {type(value_expr).__name__}"
                    )
                # Return None for entire validation
                return None

            # Store with canonical name
            result[canonical_name] = (field_spec.type, validated_value)

        # Check required fields
        for field_name, field_spec in self.FIELDS.items():
            if field_spec.required and field_name not in result:
                # Required field missing
                if self.error_on_invalid_field:
                    raise ValueError(
                        f"Required field '{field_name}' is missing in {self.__class__.__name__}"
                    )
                return None

        return result

    def normalize(
        self, validated: t.Dict[str, t.Tuple[DeclarativeType, Validated]]
    ) -> t.Dict[str, Normalized]:
        """
        Normalize validated fields.

        Returns: Dict mapping canonical field names to normalized values.
        """
        return {
            field_name: field_type.normalize(value)
            for field_name, (field_type, value) in validated.items()
        }

    def _extract_pairs(self, value: t.Any) -> t.Optional[t.List[t.Any]]:
        """
        Extract the list of candidate key=value expressions from a container.

        Each returned item should be an exp.EQ (key=value) or a string that
        parses to one; validate() skips anything else.

        Returns: List of raw items, or None when ``value`` is not a supported container.
        """
        # Python list: the items are the pairs themselves. A plain list has no
        # ``.expressions`` attribute, so it must not share the exp.Tuple branch.
        if isinstance(value, list):
            return list(value)

        # exp.Tuple: (a=1, b=2)
        if isinstance(value, exp.Tuple):
            return list(value.expressions)

        # exp.Paren: (a=1) or ((a=1, b=2))
        if isinstance(value, exp.Paren):
            inner = value.this
            if isinstance(inner, exp.Tuple):
                return list(inner.expressions)
            return [inner]

        return None
class DistributionTupleOutputType(StructuredTupleType):
    """
    Validator for normalized (output-side) distribution values.

    Normalized distribution values are dicts, so validate() checks a dict
    directly and hands any other input (tuple/string) to the parent class.
    The static factory methods build the unified ``kind``/``columns``/``buckets``
    dict from the other normalized shapes (enum string, function expression).
    """

    FIELDS = {
        "kind": Field(
            type=EnumType(["HASH", "RANDOM"]),
            required=True,
        ),
        "columns": Field(
            type=SequenceOf(ColumnType(), allow_single=False),
            required=False,
        ),
        "buckets": Field(
            type=LiteralType(),
            required=False,
        ),
    }

    def validate(self, value: t.Any) -> t.Optional[t.Dict[str, t.Any]]:
        """
        Validate a distribution value for OUTPUT validation.

        - dict: 'kind' must be present and valid; 'columns' and 'buckets' are
          checked against their field types only when present and not None.
          The same dict object is returned on success.
        - anything else: delegated to StructuredTupleType.validate().

        Returns: The dict if valid, None otherwise (for dict input).
        """
        if not isinstance(value, dict):
            # Tuple/string input goes through the generic structured-tuple path.
            return super().validate(value)

        kind = value.get("kind")
        if kind is None:
            return None
        if self.FIELDS["kind"].type.validate(kind) is None:
            return None

        # Optional fields: an absent key and an explicit None are both skipped.
        for optional_name in ("columns", "buckets"):
            present = value.get(optional_name)
            if present is None:
                continue
            if self.FIELDS[optional_name].type.validate(present) is None:
                return None

        return value

    # ============================================================
    # Factory methods for conversion from other normalized types
    # ============================================================

    @staticmethod
    def from_enum(
        enum_value: str, buckets: t.Optional[int] = None
    ) -> t.Dict[str, t.Any]:
        """
        Create distribution dict from EnumType normalized value.

        Args:
            enum_value: "RANDOM" (from EnumType); stored as 'kind' without checking
            buckets: Optional bucket count, stored as given

        Returns:
            Dict with kind/columns/buckets fields ('columns' is always empty)

        Example:
            >>> DistributionTupleOutputType.from_enum("RANDOM")
            {"kind": "RANDOM", "columns": [], "buckets": None}
        """
        return {"kind": enum_value, "columns": [], "buckets": buckets}

    @staticmethod
    def from_func(
        func: t.Union[exp.Func, exp.Anonymous], buckets: t.Optional[int] = None
    ) -> t.Dict[str, t.Any]:
        """
        Create distribution dict from FuncType normalized value.

        Args:
            func: HASH(id, dt) or RANDOM() (from FuncType)
            buckets: Optional bucket count, stored as given

        Returns:
            Dict with kind/columns/buckets fields

        Raises:
            ValueError: If the function name is neither HASH nor RANDOM

        Example:
            >>> func = parse_one("HASH(id, dt)")
            >>> DistributionTupleOutputType.from_func(func)
            {"kind": "HASH", "columns": [exp.Column("id"), exp.Column("dt")], "buckets": None}
        """
        if hasattr(func, "name"):
            func_name = func.name.upper()
        else:
            func_name = str(func.this).upper()

        if func_name == "RANDOM":
            return {"kind": "RANDOM", "columns": [], "buckets": buckets}

        if func_name != "HASH":
            raise ValueError(f"Unknown distribution function: {func_name}")

        # HASH(col1, col2, ...): a Column stored in `this` comes first, followed by
        # whatever the node keeps in `expressions`.
        leading = [func.this] if isinstance(func.this, exp.Column) else []
        columns = leading + list(func.expressions)
        return {"kind": "HASH", "columns": columns, "buckets": buckets}

    @staticmethod
    def to_unified_dict(
        normalized_value: t.Any, buckets: t.Optional[int] = None
    ) -> t.Dict[str, t.Any]:
        """
        Convert any normalized distribution value to unified dict format.

        Dispatches on the value's type:
        - dict: returned unchanged (``buckets`` is NOT applied and no keys are added)
        - str: from_enum(normalized_value, buckets)
        - exp.Func / exp.Anonymous: from_func(normalized_value, buckets)

        Args:
            normalized_value: Result from DistributedByInputSpec normalization
                (dict | str | exp.Func)
            buckets: Optional bucket count, used only by the str and function branches

        Returns:
            Dict describing the distribution

        Raises:
            TypeError: If value type is not supported

        Example:
            >>> # From a structured tuple (dict is passed through as-is)
            >>> DistributionTupleOutputType.to_unified_dict({"kind": "HASH", "columns": [...]})
            {"kind": "HASH", "columns": [...]}

            >>> # From EnumType
            >>> DistributionTupleOutputType.to_unified_dict("RANDOM")
            {"kind": "RANDOM", "columns": [], "buckets": None}

            >>> # From FuncType
            >>> DistributionTupleOutputType.to_unified_dict(parse_one("HASH(id)"))
            {"kind": "HASH", "columns": [exp.Column("id")], "buckets": None}
        """
        if isinstance(normalized_value, dict):
            return normalized_value
        if isinstance(normalized_value, str):
            return DistributionTupleOutputType.from_enum(normalized_value, buckets)
        if isinstance(normalized_value, (exp.Func, exp.Anonymous)):
            return DistributionTupleOutputType.from_func(normalized_value, buckets)

        raise TypeError(
            f"Cannot convert {type(normalized_value).__name__} to distribution dict. "
            f"Expected dict, str, or exp.Func/exp.Anonymous."
        )
+ allow_single=True, + ) + + # Partitions: List of partition definitions (strings) + # Accepts: + # - Single partition: 'PARTITION p1 VALUES LESS THAN ("2024-01-01")' + # - Multiple partitions: ('PARTITION p1 ...', 'PARTITION p2 ...') + # Note: Single string is auto-promoted to list + PartitionsInputSpec = SequenceOf( + StringType(), LiteralType(normalized_type="str"), allow_single=True + ) + + # Distribution: StarRocks distribution specification + # Accepts: + # - Structured tuple1: (kind='HASH', columns=(id, dt), buckets=10) + # - Structured tuple2: (kind='RANDOM') + # - String format: "HASH(id)", "RANDOM", or "(kind='HASH', columns=(id), buckets=10)" + # Note: Does NOT accept simple columns like id or (id, dt) + # And it can't directly accept "HASH(id) BUCKETS 10", you need to split it with "BUCKETS" to two parts. + DistributedByInputSpec = AnyOf( + DistributionTupleInputType(), # Try structured tuple first (most specific) + EnumType(["RANDOM"], normalized_type="str"), # "RANDOM" + FuncType(), # "HASH(id)", + ) + + # OrderBy: Simple ordering specification + # Accepts: + # - Single column: dt + # - Multiple columns: (dt, id, status) + OrderByInputSpec = GeneralColumnListInputSpec + + # Refresh scheme: Accepts various types, normalizes to string + # For properties like refresh_scheme, it can be a string, identifier, or column + RefreshSchemeInputSpec = AnyOf( + EnumType(["ASYNC", "MANUAL"], normalized_type="var"), + ColumnType(normalized_type="str"), # Columns → will be converted to string + IdentifierType(normalized_type="str"), # Identifiers → will be converted to string + LiteralType(normalized_type="str"), # Numbers and string literals → will be converted to string + StringType(), # Plain strings + ) + + # Generic property value: Accepts various types, normalizes to string + # For properties like replication_num, storage_medium, etc. 
+ # StarRocks PROPERTIES syntax requires all values to be strings: "value" + # So we normalize everything to string for consistent SQL generation + GenericPropertyInputSpec = AnyOf( + StringType(), # Plain strings + LiteralType(normalized_type="str"), # Numbers and string literals → will be converted to string + IdentifierType(normalized_type="str"), # Identifiers → will be converted to string + ColumnType(normalized_type="str"), # Columns → will be converted to string + ) + + """ + Input Property Specification for StarRocks + + This specification defines the validation and normalization rules for StarRocks properties. + Properties are specified in the physical_properties block of a SQLMesh model. + + Supported properties: + - partitioned_by / partition_by: Partition specification + - partitions: List of partition definitions + - distributed_by: Distribution specification (HASH/RANDOM with structured tuple or string) + - order_by: Ordering specification (simple column list) + - table key: + - primary_key: Primary key columns + - duplicate_key: Duplicate key columns (for DUPLICATE KEY table) + - unique_key: Unique key columns (for UNIQUE KEY table) + - aggregate_key: Aggregate key columns (for AGGREGATE KEY table) + - other properties: Any other properties not listed above will be treated as generic + string properties (e.g., replication_num, storage_medium, etc.) 
+ + Examples: + duplicate_key = dt # Single key + primary_key = (id, customer_id) # Multiple keys + + partitioned_by = col1 # Single column + partitioned_by = (col1, col2) # Multiple columns + partitioned_by = (col1, "col2") # Mixed (string will be parsed) + partitioned_by = date_trunc('day', col1) # Expression partition with single func + partitioned_by = (date_trunc('day', col1), col2) # Expression partition with multiple exprs + partitioned_by = RANGE(col1, col2) # RANGE partition + partitioned_by = LIST(region, status) # LIST partition + + distributed_by = (kind='HASH', columns=(id, dt), buckets=10) # Structured + distributed_by = (kind='RANDOM') # RANDOM distribution + distributed_by = "HASH(id)" # String format + distributed_by = "RANDOM" # String format + + order_by = dt # Single column + order_by = (dt, id, status) # Multiple columns + + replication_num = 3 # Generic property (auto-handled) + storage_medium = "SSD" # Generic property (auto-handled) + """ + PROPERTY_INPUT_SPECS: t.Dict[str, DeclarativeType] = { + # Table key properties + "primary_key": TableKeyInputSpec, + "duplicate_key": TableKeyInputSpec, + "unique_key": TableKeyInputSpec, + "aggregate_key": TableKeyInputSpec, + + # Partition-related properties + "partitioned_by": PartitionedByInputSpec, + "partitions": PartitionsInputSpec, + + # Distribution property + "distributed_by": DistributedByInputSpec, + + # Ordering property + "clustered_by": OrderByInputSpec, + + # View properties + # StarRocks syntax: SECURITY {NONE | INVOKER | DEFINER} + "security": EnumType(["NONE", "INVOKER", "DEFINER"], normalized_type="str"), + + # Materialized view refresh properties (StarRocks uses REFRESH ...) + # - refresh_moment: IMMEDIATE | DEFERRED + "refresh_moment": EnumType(["IMMEDIATE", "DEFERRED"], normalized_type="str"), + # - refresh_scheme: ASYNC | ASYNC [START (...) 
EVERY (INTERVAL ...)] | MANUAL + # it should be a string/literal if START/EVERY is present, other than ASYNC + "refresh_scheme": RefreshSchemeInputSpec, + + # Note: All other properties not listed here will be handled, an example here + "replication_num": GenericPropertyInputSpec, + } + + + # Default output spec for properties not in PROPERTY_OUTPUT_SPECS + GenericPropertyOutputSpec = StringType() + + """ + Output Property Specification for StarRocks after validation+normalization + + This specification describes the expected types after normalization. + For most properties, OUTPUT spec is the same as INPUT spec since normalization + preserves the diverse types (dict | str | exp.Func for distribution). + + Conversion to unified formats (e.g., all distributions → dict) happens separately + in the usage layer via factory methods like DistributionTupleInputType.to_unified_dict(). + + Expected Output Types (after normalization): + - table keys: List[exp.Expression] - columns + - partitioned_by: List[exp.Expression] - columns, functions + - partitions: List[str] - partition definition strings + - distributed_by: Dict | str | exp.Func - DistributionTupleInputType, EnumType, or FuncType output + - order_by: List[exp.Expression] - columns + - generic properties: str - normalized string values + """ + PROPERTY_OUTPUT_SPECS: t.Dict[str, DeclarativeType] = { + "primary_key": ( + GeneralColumnListOutputSpec := SequenceOf(ColumnType(), allow_single=False) + ), + "duplicate_key": GeneralColumnListOutputSpec, + "unique_key": GeneralColumnListOutputSpec, + "aggregate_key": GeneralColumnListOutputSpec, + "partitioned_by": SequenceOf(ColumnType(), FuncType(), allow_single=False), + "partitions": SequenceOf(StringType(), allow_single=False), + "distributed_by": AnyOf( + DistributionTupleOutputType(), # Try structured tuple first (most specific) + EnumType(["RANDOM"], normalized_type="str"), # "RANDOM" + FuncType(), # "HASH(id)", + ), # Still dict | str | exp.Func after normalize + 
"clustered_by": GeneralColumnListOutputSpec, + "security": EnumType(["NONE", "INVOKER", "DEFINER"], normalized_type="str"), + "refresh_moment": EnumType(["IMMEDIATE", "DEFERRED"], normalized_type="str"), + "refresh_scheme": AnyOf( + EnumType(["ASYNC", "MANUAL"], normalized_type="var"), + StringType(), + ), + # Generic properties use GenericPropertyOutputSpec, an example here + "replication_num": GenericPropertyOutputSpec, + } + + # ============================================================ + # Helper functions + # ============================================================ + + @staticmethod + def get_property_input_spec(property_name: str) -> DeclarativeType: + """ + Get the INPUT type validator for a property. + + Returns the specific type from PROPERTY_INPUT_SPECS if defined, + otherwise returns GenericPropertyInputSpec for unknown properties. + + This allows any property not explicitly defined to be treated + as a generic string property. + """ + return PropertySpecs.PROPERTY_INPUT_SPECS.get( + property_name, PropertySpecs.GenericPropertyInputSpec + ) + + @staticmethod + def get_property_output_spec(property_name: str) -> DeclarativeType: + """ + Get the OUTPUT type validator for a property. + + Returns the specific type from PROPERTY_OUTPUT_SPECS if defined, + otherwise returns GenericPropertyOutputSpec for unknown properties. + + This allows validating that normalized values conform to expected output types. + """ + return PropertySpecs.PROPERTY_OUTPUT_SPECS.get( + property_name, PropertySpecs.GenericPropertyOutputSpec + ) + + +# ============================================================ +# Property Validation Helpers +# ============================================================ +class PropertyValidator: + """ + Centralized property validation helpers for table properties. + + Provides reusable validation functions to avoid code duplication + and ensure consistent error messages across different property handlers. 
class PropertyValidator:
    """
    Centralized property validation helpers for table properties.

    Provides reusable validation functions to avoid code duplication
    and ensure consistent error messages across different property handlers.
    """

    TABLE_KEY_TYPES = {"primary_key", "duplicate_key", "unique_key", "aggregate_key"}

    # All important properties except generic properties
    IMPORTANT_PROPERTY_NAMES = {
        *TABLE_KEY_TYPES,
        "partitioned_by",
        "partitions",
        "distributed_by",
        "clustered_by",
    }

    # Centralized property alias configuration.
    # Maps canonical name -> set of valid aliases (sets, because check_at_most_one unions them).
    PROPERTY_ALIASES: t.Dict[str, t.Set[str]] = {
        "partitioned_by": {"partition_by"},
        "clustered_by": {"order_by"},
    }

    # Maps a group name -> set of property names of which at most one may be present.
    EXCLUSIVE_PROPERTY_NAME_MAP: t.Dict[str, t.Set[str]] = {
        "key_type": set(TABLE_KEY_TYPES),
        **PROPERTY_ALIASES,
    }

    # Centralized invalid property name configuration.
    # Maps canonical name -> list of invalid/deprecated names.
    INVALID_PROPERTY_NAME_MAP: t.Dict[str, t.List[str]] = {
        "partitioned_by": ["partition"],
        "distributed_by": ["distribution", "distribute"],
        "clustered_by": ["order", "ordering"],
    }

    @staticmethod
    def _is_fully_parenthesized(text: str) -> bool:
        """
        Return True when the opening parenthesis at position 0 is closed by the final character.

        A plain startswith/endswith check would accept '(a), (b)', where the two outer
        characters belong to different groups. Parentheses inside quoted sections are ignored.
        When the text never balances (malformed input) it is reported as wrapped so that the
        caller leaves it untouched.
        """
        if not (text.startswith("(") and text.endswith(")")):
            return False

        depth = 0
        open_quote: t.Optional[str] = None
        last_index = len(text) - 1
        for index, char in enumerate(text):
            if open_quote is not None:
                if char == open_quote:
                    open_quote = None
                continue
            if char in "'\"`":
                open_quote = char
            elif char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
                if depth == 0:
                    # The first group closes here; it wraps everything only if this is the end.
                    return index == last_index
        return True

    @staticmethod
    def ensure_parenthesized(value: t.Any) -> t.Any:
        """
        Ensure a string-like value is wrapped in parentheses so it can be parsed as a tuple.

        For inputs like 'id1, id2' the result is '(id1, id2)'. A value counts as already
        wrapped only if its first parenthesis is closed by its last character, so
        '(a), (b)' becomes '((a), (b))'.

        Args:
            value: Input value (string, expression, or other)

        Returns:
            - For str, string Literal, and quoted Column inputs: the string content, wrapped
              in parentheses unless it is blank or already wrapped
            - For other types: returned unchanged

        Example:
            ensure_parenthesized('id1, id2')                      -> '(id1, id2)'
            ensure_parenthesized('(id1, id2)')                    -> '(id1, id2)'
            ensure_parenthesized(exp.Literal.string('id1, id2'))  -> '(id1, id2)'
        """
        if isinstance(value, exp.Literal) and value.is_string:
            # Extract string content from a string literal
            text = value.this
        elif isinstance(value, exp.Column) and getattr(value.this, "quoted", False):
            # A quoted "column" is really a string the parser read as an identifier
            text = value.name
        elif isinstance(value, str):
            text = value
        else:
            return value

        stripped = text.strip()
        if not stripped or PropertyValidator._is_fully_parenthesized(stripped):
            return text

        return f"({stripped})"

    @staticmethod
    def validate_and_normalize_property(
        property_name: str, value: t.Any, preprocess_parentheses: bool = False
    ) -> t.Any:
        """
        Run the complete property pipeline driven by PropertySpecs.

        Steps:
            1. Optionally wrap string values in parentheses
            2. Look up the INPUT spec and validate the value
            3. Normalize the validated value
            4. Look up the OUTPUT spec and verify the normalized value against it

        Args:
            property_name: Name of the property
            value: The property value to validate
            preprocess_parentheses: If True, wrap string values in parentheses first

        Returns:
            The normalized value

        Raises:
            SQLMeshError: If the value fails input validation, or the normalized value
                does not match the output spec
        """
        logger.debug(
            "validate_and_normalize_property. value: %s, type: %s", value, type(value)
        )

        if preprocess_parentheses:
            value = PropertyValidator.ensure_parenthesized(value)

        input_spec = PropertySpecs.get_property_input_spec(property_name)
        if input_spec is None:
            raise SQLMeshError(f"Unknown property '{property_name}'.")

        validated = input_spec.validate(value)
        if validated is None:
            raise SQLMeshError(
                f"Invalid value type for property '{property_name}': {value!r}."
            )

        normalized = input_spec.normalize(validated)

        # A mismatch here means the INPUT and OUTPUT specs disagree with each other.
        output_spec = PropertySpecs.get_property_output_spec(property_name)
        if output_spec is not None and output_spec.validate(normalized) is None:
            raise SQLMeshError(
                f"Normalized value for property '{property_name}' doesn't match output spec: {normalized!r}."
            )

        return normalized

    @staticmethod
    def check_invalid_names(
        valid_name: str,
        invalid_names: t.List[str],
        table_properties: t.Dict[str, t.Any],
        suggestion: t.Optional[str] = None,
    ) -> None:
        """
        Raise if any invalid/deprecated property name is present in ``table_properties``.

        Args:
            valid_name: The correct property name
            invalid_names: Invalid/deprecated names to check for, in reporting order
            table_properties: Table properties dictionary to check
            suggestion: Optional custom hint that replaces the default "Use '<valid>' instead"

        Raises:
            SQLMeshError: For the first invalid name found

        Example:
            check_invalid_names("partitioned_by", ["partition"], {"partition": "dt"})
            raises SQLMeshError: Invalid property 'partition'. Use 'partitioned_by' instead.
        """
        for invalid_name in invalid_names:
            if invalid_name in table_properties:
                msg = suggestion or f"Use '{valid_name}' instead"
                raise SQLMeshError(f"Invalid property '{invalid_name}'. {msg}.")

    @classmethod
    def check_all_invalid_names(cls, table_properties: t.Dict[str, t.Any]) -> None:
        """
        Check every entry of INVALID_PROPERTY_NAME_MAP against ``table_properties``.

        Args:
            table_properties: Table properties dictionary to check

        Raises:
            SQLMeshError: If any invalid name is found
        """
        for valid_name, invalid_names in cls.INVALID_PROPERTY_NAME_MAP.items():
            cls.check_invalid_names(valid_name, invalid_names, table_properties)

    @staticmethod
    def check_at_most_one(
        property_name: str,
        property_description: str,
        table_properties: t.Dict[str, t.Any],
        exclusive_property_names: t.Optional[t.Iterable[str]] = None,
        parameter_value: t.Optional[t.Any] = None,
    ) -> t.Optional[str]:
        """
        Ensure at most one property from a mutually exclusive group is defined.

        Args:
            property_name: The canonical (or group) name
            property_description: Description of the property group (for error messages)
            table_properties: Table properties dictionary to check
            exclusive_property_names: Mutually exclusive property names. When omitted or
                empty, defaults to the EXCLUSIVE_PROPERTY_NAME_MAP entry for
                ``property_name`` plus ``property_name`` itself.
            parameter_value: Optional parameter value (takes priority over table_properties)

        Returns:
            Name of the single property present in ``table_properties``, or None if none is.
            When ``parameter_value`` is provided the result is always None.

        Raises:
            SQLMeshError: If several properties of the group are defined, or if
                ``parameter_value`` is given together with any of them

        Example:
            check_at_most_one(
                property_name="primary_key",
                property_description="key type",
                exclusive_property_names=["primary_key", "duplicate_key"],
                table_properties={"primary_key": "(id)", "duplicate_key": "(id)"},
            )
            raises SQLMeshError: Multiple key type properties defined:
            ['primary_key', 'duplicate_key']. Only one is allowed.
        """
        if not exclusive_property_names:
            exclusive_property_names = (
                PropertyValidator.EXCLUSIVE_PROPERTY_NAME_MAP.get(property_name, set())
                | {property_name}
            )

        # Sets have no stable iteration order; sort them so error messages are reproducible.
        # Ordered inputs (lists/tuples) keep the order chosen by the caller.
        if isinstance(exclusive_property_names, (set, frozenset)):
            candidate_names = sorted(exclusive_property_names)
        else:
            candidate_names = list(exclusive_property_names)

        present = [name for name in candidate_names if name in table_properties]

        # The explicit parameter has the highest priority and excludes every table property.
        if parameter_value is not None:
            if present:
                param_display = f"{property_name} (parameter)"
                raise SQLMeshError(
                    f"Conflicting {property_description} definitions: "
                    f"{param_display} provided along with table_properties {present}. "
                    f"Only one {property_description} is allowed."
                )
            return None

        if len(present) > 1:
            raise SQLMeshError(
                f"Multiple {property_description} properties defined: {present}. "
                f"Only one is allowed."
            )

        return present[0] if present else None
DELETE with subqueries is supported on PRIMARY KEY tables, but other key types still + need guard rails (no boolean literals, TRUNCATE for WHERE TRUE, etc.). + 3. Partitioning supports RANGE, LIST, and expression-based syntaxes. + + Implementation strategy: + - Override only where StarRocks syntax/behavior diverges from the base adapter. + - Keep the rest of the functionality delegated to the shared base implementation. + """ + + # ==================== Class Attributes (Declarative Configuration) ==================== + + DIALECT = "starrocks" + """SQLGlot dialect name for SQL generation""" + + DEFAULT_BATCH_SIZE = 10000 + """Default batch size for bulk operations""" + + SUPPORTS_TRANSACTIONS = False + """ + StarRocks does not support transactions for multiple DML statements. + - No BEGIN/COMMIT/ROLLBACK (only txn for multiple INSERT statements from v3.5) + - Operations are auto-committed + - Backfill uses partition-level atomicity + """ + + INSERT_OVERWRITE_STRATEGY = InsertOverwriteStrategy.DELETE_INSERT + """ + StarRocks does support INSERT OVERWRITE syntax (and dynamic overwrite from v3.5). + Use DELETE + INSERT pattern: + 1. DELETE FROM table WHERE condition + 2. INSERT INTO table SELECT ... + + Base class automatically handles this strategy without overriding insert methods. + + TODO: later, we can add support for INSERT OVERWRITE, even use Primary Key for beter performance + """ + + COMMENT_CREATION_TABLE = CommentCreationTable.IN_SCHEMA_DEF_CTAS + """Table comments are added in both CREATE TABLE statement and CTAS""" + + COMMENT_CREATION_VIEW = CommentCreationView.IN_SCHEMA_DEF_NO_COMMANDS + """View comments are added in CREATE VIEW statement""" + + SUPPORTS_MATERIALIZED_VIEWS = True + """StarRocks supports materialized views with refresh strategies""" + + SUPPORTS_MATERIALIZED_VIEW_SCHEMA = True + """ + StarRocks materialized views support specifying a column list, but the column definition is + limited (e.g. 
column name + comment, not full type definitions). We set this to True and + implement custom MV schema rendering in create_view/_create_materialized_view. + """ + + SUPPORTS_REPLACE_TABLE = False + """No REPLACE TABLE syntax; use DROP + CREATE instead""" + + SUPPORTS_CREATE_DROP_CATALOG = False + """StarRocks supports DROPing external catalogs. + TODO: whether it's external catalogs, or includes the internal catalog + """ + + SUPPORTS_INDEXES = True + """ + StarRocks supports PRIMARY KEY in CREATE TABLE, but NOT standalone CREATE INDEX. + + We set this to True to enable PRIMARY KEY generation in CREATE TABLE statements. + The create_index() method is overridden to prevent actual CREATE INDEX execution. + + Supported (defined in CREATE TABLE): + - PRIMARY KEY: Automatically creates sorted index + - INDEX clause: For bloom filter, bitmap, inverted indexes + NOT supported: + CREATE INDEX idx_name ON t (name); -- Will be skipped by create_index() + """ + + SUPPORTS_TUPLE_IN = False + """ + StarRocks does NOT support tuple IN syntax: (col1, col2) IN ((val1, val2), (val3, val4)) + + Instead, use OR with AND conditions: + (col1 = val1 AND col2 = val2) OR (col1 = val3 AND col2 = val4) + + This is automatically handled by snapshot_id_filter and snapshot_name_version_filter + in sqlmesh/core/state_sync/db/utils.py when SUPPORTS_TUPLE_IN = False. + """ + + MAX_TABLE_COMMENT_LENGTH = 2048 + """Maximum length for table comments""" + + MAX_COLUMN_COMMENT_LENGTH = 255 + """Maximum length for column comments""" + + MAX_IDENTIFIER_LENGTH = 64 + """Maximum length for table/column names""" + + # ==================== Schema Operations ==================== + # StarRocks supports CREATE/DROP SCHEMA the same as CREATE/DROP DATABSE. 
+ # So, no need to implement create_schema / drop_schema + + # ==================== Data Object Query ==================== + def _get_data_objects( + self, schema_name: SchemaName, object_names: t.Optional[t.Set[str]] = None + ) -> t.List[DataObject]: + """ + Returns all the data objects that exist in the given schema. + Uses information_schema tables which are compatible with MySQL protocol. + + StarRocks uses the MySQL-compatible information_schema layout, so the same query + works here. + Note: Materialized View is not reliably distinguished from View (both may appear as `VIEW`) + in information_schema.tables. We therefore best-effort detect MVs via + information_schema.materialized_views and upgrade matching objects to `materialized_view`. + + Args: + schema_name: The schema (database) to query + object_names: Optional set of specific table names to filter + + Returns: + List of DataObject instances representing tables and views + """ + schema_db = to_schema(schema_name).db + query = ( + exp.select( + exp.column("table_schema").as_("schema_name"), + exp.column("table_name").as_("name"), + exp.case(exp.column("table_type")) + .when( + exp.Literal.string("BASE TABLE"), + exp.Literal.string("table"), + ) + .when( + exp.Literal.string("VIEW"), + exp.Literal.string("view"), + ) + .else_("table_type") + .as_("type"), + ) + .from_(exp.table_("tables", db="information_schema")) + .where(exp.column("table_schema").eq(schema_db)) + ) + if object_names: + # StarRocks may treat information_schema table_name comparisons as case-sensitive. + # Use LOWER(table_name) to match case-insensitively. 
+ lowered_names = [name.lower() for name in object_names] + query = query.where( + exp.func("LOWER", exp.column("table_name")).isin(*lowered_names) + ) + + df = self.fetchdf(query) + objects = [ + DataObject( + schema=row.schema_name, + name=row.name, + type=DataObjectType.from_str(row.type), + ) + for row in df.itertuples() + ] + + # Best-effort upgrade of MV types using information_schema.materialized_views. + # If this fails (unsupported / permissions / version), fall back to information_schema.tables. + try: + mv_query = ( + exp.select( + exp.column("table_schema").as_("schema_name"), + exp.column("table_name").as_("name"), + ) + .from_(exp.table_("materialized_views", db="information_schema")) + .where(exp.column("table_schema").eq(schema_db)) + ) + if object_names: + lowered_names = [name.lower() for name in object_names] + mv_query = mv_query.where( + exp.func("LOWER", exp.column("table_name")).isin(*lowered_names) + ) + + mv_df = self.fetchdf(mv_query) + mv_names: t.Set[str] = {t.cast(str, r.name).lower() for r in mv_df.itertuples() if r.name} + + if mv_names: + for obj in objects: + if obj.name.lower() in mv_names: + obj.type = DataObjectType.MATERIALIZED_VIEW + except Exception: + logger.warning(f"[StarRocks] Failed to get materialized views from information_schema.materialized_views") + + return objects + + def create_index( + self, + table_name: TableName, + index_name: str, + columns: t.Tuple[str, ...], + exists: bool = True, + ) -> None: + """ + Override to prevent CREATE INDEX statements (not supported in StarRocks). + + StarRocks does not support standalone CREATE INDEX statements. + Indexes must be defined during CREATE TABLE using INDEX clause. + + Since SQLMesh state tables use PRIMARY KEY (which provides efficient indexing), + we simply log and skip additional index creation requests. + + This matches upstream StarRocks limitations and prevents accidental CREATE INDEX calls. 
+ """ + logger.warning( + f"[StarRocks] Skipping CREATE INDEX {index_name} on {table_name} - " + f"StarRocks does not support standalone CREATE INDEX statements. " + f"PRIMARY KEY provides equivalent indexing for columns: {columns}" + ) + return + + def _create_table_like( + self, + target_table_name: TableName, + source_table_name: TableName, + exists: bool, + **kwargs: t.Any, + ) -> None: + """Create a new table using StarRocks' native `CREATE TABLE ... LIKE ...` syntax. + + The base implementation re-creates the target table from `columns(source)` which can + lose non-column metadata. Using LIKE lets the engine preserve more of the original + table definition (engine-defined behavior). + """ + self.execute( + exp.Create( + this=exp.to_table(target_table_name), + kind="TABLE", + exists=exists, + properties=exp.Properties( + expressions=[ + exp.LikeProperty( + this=exp.to_table(source_table_name), + ), + ], + ), + ) + ) + + def delete_from( + self, + table_name: TableName, + where: t.Optional[t.Union[str, exp.Expression]] = None, + ) -> None: + """ + Delete from a table. + + StarRocks DELETE limitations by table type: + + PRIMARY KEY tables: + - Support complex WHERE conditions (subqueries, BETWEEN, etc.) + - No special handling needed + + Other table types (DUPLICATE/UNIQUE/AGGREGATE KEY): + - WHERE TRUE not supported → use TRUNCATE TABLE + - Boolean literals (TRUE/FALSE) not supported + - BETWEEN not supported → convert to >= AND <= + - Others not supported: + - CAST() not supported in WHERE + - Subqueries not supported + - ... + + But, I don't know what the table type is. 
+ + Args: + table_name: The table to delete from + where: The where clause to filter rows to delete + """ + # Parse where clause if it's a string + if isinstance(where, str): + from sqlglot import parse_one + + where: exp.Expression = parse_one(where, dialect=self.dialect) + + # If no where clause or WHERE TRUE, use TRUNCATE TABLE (for all table types) + if not where or where == exp.true(): + table_expr = ( + exp.to_table(table_name) if isinstance(table_name, str) else table_name + ) + logger.info( + f"Converting DELETE FROM {table_name} WHERE TRUE to TRUNCATE TABLE " + "(StarRocks does not support WHERE TRUE in DELETE)" + ) + self.execute( + f"TRUNCATE TABLE {table_expr.sql(dialect=self.dialect, identify=True)}" + ) + return + + # For non-PRIMARY KEY tables, apply WHERE clause restrictions + # Note: We conservatively apply restrictions to all tables since we can't easily + # determine table type at DELETE time. PRIMARY KEY tables will still work with + # simplified conditions, while non-PRIMARY KEY tables require them. + if isinstance(where, exp.Expression): + original_where = where + # Remove boolean literals (not supported in any table type) + where = self._where_clause_remove_boolean_literals(where) + # Convert BETWEEN to >= AND <= (required for DUPLICATE/UNIQUE/AGGREGATE KEY tables) + where = self._where_clause_convert_between_to_comparison(where) + + if where != original_where: + logger.debug( + f"Converted WHERE clause for StarRocks compatibility, table: {table_name}.\n" + f" Original: {original_where.sql(dialect=self.dialect)}\n" + f" Converted: {where.sql(dialect=self.dialect)}" + ) + + # Use parent implementation + super().delete_from(table_name, where) + + def _where_clause_remove_boolean_literals( + self, expression: exp.Expression + ) -> exp.Expression: + """ + Remove TRUE/FALSE boolean literals from WHERE expressions. + + StarRocks Limitation (except PRIMARY KEY tables): + Boolean literals (TRUE/FALSE) are not supported in WHERE clauses. 
+ + This method simplifies expressions: + - (condition) AND TRUE / TRUE AND (condition) → condition + - (condition) OR FALSE / FALSE OR (condition) → condition + - WHERE TRUE → 1=1 (though TRUNCATE is used instead) + - WHERE FALSE → 1=0 + + Args: + expression: The expression to clean + + Returns: + Cleaned expression without boolean literals + """ + + def transform(node: exp.Expression) -> exp.Expression: + # Handle standalone TRUE/FALSE at the top level + if node == exp.true(): + # Convert TRUE to 1=1 + return exp.EQ( + this=exp.Literal.number(1), expression=exp.Literal.number(1) + ) + elif node == exp.false(): # noqa: RET505 + # Convert FALSE to 1=0 + return exp.EQ( + this=exp.Literal.number(1), expression=exp.Literal.number(0) + ) + + # Handle AND expressions + elif isinstance(node, exp.And): + left = node.this + right = node.expression + + # Remove TRUE from AND + if left == exp.true(): + return right + if right == exp.true(): + return left + + # Handle OR expressions + elif isinstance(node, exp.Or): + left = node.this + right = node.expression + + # Remove FALSE from OR + if left == exp.false(): + return right + if right == exp.false(): + return left + + return node + + # Transform the expression tree + return expression.transform(transform, copy=True) + + def _where_clause_convert_between_to_comparison( + self, expression: exp.Expression + ) -> exp.Expression: + """ + Convert BETWEEN expressions to >= AND <= comparisons. + + StarRocks Limitation (DUPLICATE/UNIQUE/AGGREGATE KEY Tables): + BETWEEN is not supported in DELETE WHERE clauses for non-PRIMARY KEY tables. + + PRIMARY KEY tables support BETWEEN, but this conversion is safe for all table types + since the converted form (>= AND <=) is semantically equivalent. 
+ + This method converts: + - col BETWEEN a AND b → col >= a AND col <= b + + Args: + expression: The expression potentially containing BETWEEN + + Returns: + Expression with BETWEEN converted to comparisons + """ + + def transform(node: exp.Expression) -> exp.Expression: + if isinstance(node, exp.Between): + # Extract components: col BETWEEN low AND high + column = node.this # The column being tested + low = node.args.get("low") # Lower bound + high = node.args.get("high") # Upper bound + + if column and low and high: + # Build: column >= low AND column <= high + gte = exp.GTE(this=column.copy(), expression=low.copy()) + lte = exp.LTE(this=column.copy(), expression=high.copy()) + return exp.And(this=gte, expression=lte) + + return node + + # Transform the expression tree + return expression.transform(transform, copy=True) + + def execute( + self, + expressions: t.Union[str, exp.Expression, t.Sequence[exp.Expression]], + ignore_unsupported_errors: bool = False, + quote_identifiers: bool = True, + track_rows_processed: bool = False, + **kwargs: t.Any, + ) -> None: + """ + Override execute to strip FOR UPDATE from queries (not supported in StarRocks). + + StarRocks is an OLAP database and does not support row-level locking via + SELECT ... FOR UPDATE. This method removes lock expressions before execution. 
def _create_table_from_columns(
    self,
    table_name: TableName,
    target_columns_to_types: t.Dict[str, exp.DataType],
    primary_key: t.Optional[t.Tuple[str, ...]] = None,
    exists: bool = True,
    table_description: t.Optional[str] = None,
    column_descriptions: t.Optional[t.Dict[str, str]] = None,
    **kwargs: t.Any,
) -> None:
    """
    Create a StarRocks table from column definitions.

    Key handling:
        StarRocks requires the key clause (PRIMARY KEY, UNIQUE KEY,
        DUPLICATE KEY, AGGREGATE KEY) to follow the column list, and the key
        columns to be the leading columns of the table, in the same order as
        in the key clause. This method therefore:

        1. Resolves the active key type and its columns from the
           ``primary_key`` model parameter and ``table_properties`` by calling
           ``_extract_and_validate_key_columns``.
        2. Copies a ``primary_key`` model parameter into ``table_properties``
           so the properties builder sees one unified definition.
        3. Reorders the columns with ``_reorder_columns_for_key`` so the key
           columns come first.
        4. Delegates to the base implementation with ``primary_key=None``.
           The key clause is NOT emitted inside the schema; it is expected to
           be produced from ``table_properties`` by
           ``_build_table_key_property``.

    The ``table_properties`` dict supplied by the caller is never modified:
    a shallow copy is taken and forwarded instead. A missing or ``None``
    ``table_properties`` is treated as an empty dict.

    Args:
        table_name: Fully qualified table name.
        target_columns_to_types: Column definitions ``{name: DataType}``.
        primary_key: Primary key column names given as a model parameter.
        exists: Add an ``IF NOT EXISTS`` clause.
        table_description: Table comment.
        column_descriptions: Column comments ``{column_name: comment}``.
        **kwargs: Forwarded to the base implementation. ``table_properties``
            (the model's physical properties) is read here; any other entry,
            e.g. ``partitioned_by`` or ``clustered_by``, is passed through
            untouched.
    """
    # Work on a private copy: the dict inside kwargs is owned by the caller, and
    # injecting "primary_key" into it would leak into later calls that reuse the
    # same physical-properties mapping. `or {}` also tolerates an explicit None.
    table_properties: t.Dict[str, t.Any] = dict(kwargs.get("table_properties") or {})
    kwargs["table_properties"] = table_properties

    logger.debug(
        "_create_table_from_columns: table=%s, primary_key=%s (from model param), "
        "table_properties.keys=%s",
        table_name,
        primary_key,
        list(table_properties),
    )

    # Resolve the key definition BEFORE the model parameter is merged into
    # table_properties, so the helper sees the parameter and the properties as
    # two separate inputs.
    key_type, key_columns = self._extract_and_validate_key_columns(
        table_properties, primary_key
    )
    logger.debug(
        "_create_table_from_columns: extracted key_type=%s, key_columns=%s",
        key_type,
        key_columns,
    )

    # Normalize the model-parameter primary key into table_properties so the
    # properties builder can find it even when it was not declared in
    # physical_properties.
    if primary_key:
        table_properties["primary_key"] = primary_key
        logger.debug("_create_table_from_columns: unified primary_key into table_properties")
    elif key_type:
        logger.debug(
            "table key type '%s' may be handled in _build_table_key_property", key_type
        )

    # Every key type needs its key columns first in the column list.
    if key_columns:
        target_columns_to_types = self._reorder_columns_for_key(
            target_columns_to_types, key_columns, key_type or "key"
        )
        logger.debug("_create_table_from_columns: reordered columns for %s", key_type)

    # Do NOT hand primary_key to the base class: that would place it inside the
    # schema (column definitions), whereas StarRocks wants it in the properties
    # section after the columns.
    logger.debug(
        "_create_table_from_columns: NOT passing primary_key to base class (handled in _build_table_key_property)"
    )
    super()._create_table_from_columns(
        table_name=table_name,
        target_columns_to_types=target_columns_to_types,
        primary_key=None,  # StarRocks handles PRIMARY KEY in properties, not schema
        exists=exists,
        table_description=table_description,
        column_descriptions=column_descriptions,
        **kwargs,
    )
+ self.drop_data_object_on_type_mismatch( + self.get_data_object(view_name), DataObjectType.MATERIALIZED_VIEW + ) + self.drop_view(view_name, ignore_if_not_exists=True, materialized=True) + logger.debug( + f"Creating materialized view: {view_name}, materialized: {materialized}, " + f"materialized_properties: {materialized_properties}, " + f"view_properties: {view_properties}, create_kwargs: {create_kwargs}, " + ) + + return self._create_materialized_view( + view_name=view_name, + query_or_df=query_or_df, + target_columns_to_types=target_columns_to_types, + materialized_properties=materialized_properties, + table_description=table_description, + column_descriptions=column_descriptions, + view_properties=view_properties, + source_columns=source_columns, + **create_kwargs, + ) + + def _create_materialized_view( + self, + view_name: TableName, + query_or_df: QueryOrDF, + target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, + materialized_properties: t.Optional[t.Dict[str, t.Any]] = None, + table_description: t.Optional[str] = None, + column_descriptions: t.Optional[t.Dict[str, str]] = None, + view_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + source_columns: t.Optional[t.List[str]] = None, + **create_kwargs: t.Any, + ) -> None: + """ + Create a StarRocks materialized view. + + StarRocks MV schema supports a column list but does NOT support explicit data types in that list. + We therefore build a schema with column names + optional COMMENT only. 
+ """ + import pandas as pd + + query_or_df = self._native_df_to_pandas_df(query_or_df) + + if isinstance(query_or_df, pd.DataFrame): + values: t.List[t.Tuple[t.Any, ...]] = list( + query_or_df.itertuples(index=False, name=None) + ) + target_columns_to_types, source_columns = self._columns_to_types( + query_or_df, target_columns_to_types, source_columns + ) + if not target_columns_to_types: + raise SQLMeshError("columns_to_types must be provided for dataframes") + source_columns_to_types = get_source_columns_to_types( + target_columns_to_types, source_columns + ) + query_or_df = self._values_to_sql( + values, + source_columns_to_types, + batch_start=0, + batch_end=len(values), + ) + + source_queries, target_columns_to_types = self._get_source_queries_and_columns_to_types( + query_or_df, + target_columns_to_types, + batch_size=0, + target_table=view_name, + source_columns=source_columns, + ) + if len(source_queries) != 1: + raise SQLMeshError("Only one source query is supported for creating materialized views") + + target_table = exp.to_table(view_name) + schema: t.Union[exp.Table, exp.Schema] = self._build_materialized_view_schema_exp( + target_table, + target_columns_to_types=target_columns_to_types, + column_descriptions=column_descriptions, + ) + + # Pass model materialized properties through the existing properties builder + partitioned_by = None + clustered_by = None + partition_interval_unit = None + if materialized_properties: + partitioned_by = materialized_properties.get("partitioned_by") + clustered_by = materialized_properties.get("clustered_by") + partition_interval_unit = materialized_properties.get("partition_interval_unit") + logger.debug(f"Get info from materialized_properties: {materialized_properties}, " + f"partitioned_by: {partitioned_by}, " + f"clustered_by: {clustered_by}, " + f"partition_interval_unit: {partition_interval_unit}") + + properties_exp = self._build_table_properties_exp( + catalog_name=target_table.catalog, + 
def _build_materialized_view_schema_exp(
    self,
    table: exp.Table,
    *,
    target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    column_descriptions: t.Optional[t.Dict[str, str]] = None,
) -> t.Union[exp.Table, exp.Schema]:
    """
    Build the column list of a StarRocks materialized view.

    The resulting schema carries column names and, where a description is
    available, a COMMENT constraint. No data types are emitted.

    Args:
        table: Target table expression of the materialized view.
        target_columns_to_types: Preferred source of column names (only the
            keys are used).
        column_descriptions: Column comments; also the source of column names
            when ``target_columns_to_types`` is empty.

    Returns:
        ``table`` itself when no column names are known, otherwise an
        ``exp.Schema`` wrapping ``table`` with one ``exp.ColumnDef`` per column.
    """
    # Column names come from the type mapping first, the descriptions second.
    column_names: t.List[str] = list(target_columns_to_types or column_descriptions or ())
    if not column_names:
        return table

    comments_by_column = column_descriptions or {}

    def to_column_def(name: str) -> exp.Expression:
        # A column without a (non-empty) description gets no constraints at all.
        text = comments_by_column.get(name)
        comment_constraints: t.List[exp.ColumnConstraint] = (
            [
                exp.ColumnConstraint(
                    kind=exp.CommentColumnConstraint(
                        this=exp.Literal.string(self._truncate_column_comment(text))
                    )
                )
            ]
            if text
            else []
        )
        return exp.ColumnDef(this=exp.to_identifier(name), constraints=comment_constraints)

    column_defs: t.List[exp.Expression] = [to_column_def(name) for name in column_names]
    return exp.Schema(this=table, expressions=column_defs)
partition_interval_unit: t.Optional[IntervalUnit] = None, + clustered_by: t.Optional[t.List[exp.Expression]] = None, + table_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, + table_description: t.Optional[str] = None, + table_kind: t.Optional[str] = None, + **kwargs: t.Any, + ) -> t.Optional[exp.Properties]: + """ + Build table properties for StarRocks CREATE TABLE statement. + + Unified Model Parameter vs Physical Properties Handling: + This method receives both model parameters (partitioned_by, clustered_by) and + physical_properties (table_properties dict). Priority is handled as follows: + + 1. primary_key / partitioned_by / clustered_by (ORDER BY) + - Model parameter takes priority + - Falls back to physical_properties.xxx + - Handled in _build_partition_property + + 2. special for primary_key: + - Still need to be processed in _build_table_key_property + + 3. Other key types (duplicate_key, unique_key, aggregate_key): + - Only available via physical_properties + - Handled in _build_table_key_property + + Handles: + - Key constraints (PRIMARY KEY, DUPLICATE KEY, UNIQUE KEY) + - Partition expressions (RANGE/LIST/EXPRESSION) + - Distribution (HASH/RANDOM) + - Order by (clustering) + - Table comment + - Other properties (replication_num, storage_medium, etc.) 
+ + Args: + partitioned_by: Partition columns/expression from model parameter (takes priority) + clustered_by: Clustering columns from model parameter (takes priority) + table_properties: Dictionary containing physical_properties: + - primary_key/duplicate_key/unique_key/aggregate_key: Tuple/list of column names + - partitioned_by(partition_by): Partition definition (fallback) + - distributed_by: Tuple of EQ expressions (kind, expressions, buckets) or string + - clustered_by(order_by): Clustering definition (fallback) + - replication_num, storage_medium, etc.: Literal values + table_description: Table comment + """ + properties: t.List[exp.Expression] = [] + table_properties_copy = dict(table_properties) if table_properties else {} + logger.debug( + "_build_table_properties_exp: table_properties=%s", + table_properties.keys() if table_properties else [], + ) + + is_mv = table_kind == "MATERIALIZED_VIEW" + if is_mv: + # Required for CREATE MATERIALIZED VIEW (SQLGlot uses this property to switch the keyword) + properties.append(exp.MaterializedProperty()) + + # Validate all property names at once + PropertyValidator.check_all_invalid_names(table_properties_copy) + + # Check for mutually exclusive key types + # Note: primary_key is already set into table_properties if model param is set + active_key_type = PropertyValidator.check_at_most_one( + property_name="key_type", + property_description="key type", + table_properties=table_properties_copy, + ) + logger.debug( + "_build_table_properties_exp: active_key_type='%s'", active_key_type + ) + if is_mv and active_key_type: + raise SQLMeshError( + f"You can't specify the table type when the table is a materialized view. " + f"Current specified key type '{active_key_type}'." + ) + + # 0. 
Extract key columns for partition/distribution validation (read-only, don't pop yet) + key_type, key_columns = None, None + if active_key_type: + key_type = active_key_type + key_expr = table_properties_copy[key_type] + # Use validate_and_normalize_property to get List[exp.Column], then extract names + normalized = PropertyValidator.validate_and_normalize_property( + key_type, key_expr, preprocess_parentheses=True + ) + key_columns = tuple(col.name for col in normalized) + logger.debug( + "_build_table_properties_exp: key_type=%s, key_columns=%s", + key_type, + key_columns, + ) + + # 1. Handle key constraints (ALL types including PRIMARY KEY) + key_prop = self._build_table_key_property( + table_properties_copy, active_key_type + ) + if key_prop: + properties.append(key_prop) + logger.debug( + "_build_table_properties_exp: generated key_prop=%s", + type(key_prop).__name__, + ) + else: + logger.debug("_build_table_properties_exp: key_prop skipped (not defined)") + + # 2. Add table comment (it must be ahead of other properties except the talbe key/type) + if table_description: + properties.append( + exp.SchemaCommentProperty( + this=exp.Literal.string( + self._truncate_table_comment(table_description) + ) + ) + ) + + # 3. Handle partitioned_by (PARTITION BY RANGE/LIST/EXPRESSION) + partition_prop = self._build_partition_property( + partitioned_by, + partition_interval_unit, + target_columns_to_types, + catalog_name, + table_properties_copy, + key_type, + key_columns, + ) + if partition_prop: + properties.append(partition_prop) + logger.debug( + "_build_table_properties_exp: generated partition_prop=%s", + type(partition_prop).__name__, + ) + else: + logger.debug( + "_build_table_properties_exp: partition_prop skipped (not defined)" + ) + + # 4. 
def _build_view_properties_exp(
    self,
    view_properties: t.Optional[t.Dict[str, exp.Expression]] = None,
    table_description: t.Optional[str] = None,
    **kwargs: t.Any,
) -> t.Optional[exp.Properties]:
    """
    Assemble the properties of a StarRocks CREATE VIEW statement.

    Emitted in this order: the view comment (from ``table_description``), a
    SECURITY property (from the ``security`` entry of ``view_properties``,
    normalized through ``PropertyValidator``), then every remaining view
    property converted by ``_table_or_view_properties_to_expressions``.

    Args:
        view_properties: View properties; not modified by this method.
        table_description: View comment.
        **kwargs: Accepted for signature compatibility; unused here.

    Returns:
        ``exp.Properties`` holding the collected properties, or ``None`` when
        nothing was collected.
    """
    collected: t.List[exp.Expression] = []

    if table_description:
        comment_literal = exp.Literal.string(self._truncate_table_comment(table_description))
        collected.append(exp.SchemaCommentProperty(this=comment_literal))

    if view_properties:
        # "security" gets dedicated rendering; everything else is generic.
        security = view_properties.get("security")
        remaining = {
            name: value for name, value in view_properties.items() if name != "security"
        }
        if security is not None:
            security_text = PropertyValidator.validate_and_normalize_property(
                "security", security
            )
            # exp.SecurityProperty renders as `SECURITY ` (no '=')
            collected.append(exp.SecurityProperty(this=exp.Var(this=security_text)))

        collected.extend(self._table_or_view_properties_to_expressions(remaining))

    return exp.Properties(expressions=collected) if collected else None
+ + Handles: + - PRIMARY KEY + - DUPLICATE KEY + - UNIQUE KEY + - AGGREGATE KEY (when implemented) + + Args: + table_properties: Dictionary containing key definitions (will be modified) + active_key_type: The active key type or None + + Returns: + Key property expression for the active key type, or None + """ + if not active_key_type: + logger.debug("_build_table_key_property: no active_key_type, skipped") + return None + + logger.debug("_build_table_key_property: processing %s", active_key_type) + + # Configuration: key_name -> Property class (excluding primary_key) + KEY_PROPERTY_CLASSES: t.Dict[str, t.Type[exp.Expression]] = { + "primary_key": exp.PrimaryKey, + "duplicate_key": exp.DuplicateKeyProperty, + "unique_key": exp.UniqueKeyProperty, + # "aggregate_key": exp.AggregateKeyProperty, # Not implemented yet + } + + property_class = KEY_PROPERTY_CLASSES.get(active_key_type) + key_value = table_properties.pop(active_key_type, None) + if not property_class: + # Aggregate key requires special handling + if active_key_type == "aggregate_key": + raise SQLMeshError( + "AGGREGATE KEY tables are not currently supported. " + "AGGREGATE KEY requires specifying aggregation functions (SUM/MAX/MIN/REPLACE) " + "for value columns, which is not supported in the current model configuration syntax. " + "Please use PRIMARY KEY, UNIQUE KEY, or DUPLICATE KEY instead." 
def _build_partition_property(
    self,
    partitioned_by: t.Optional[t.List[exp.Expression]],
    partition_interval_unit: t.Optional["IntervalUnit"],
    target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]],
    catalog_name: t.Optional[str],
    table_properties: t.Dict[str, t.Any],
    key_type: t.Optional[str],
    key_columns: t.Optional[t.Tuple[str, ...]],
) -> t.Optional[exp.Expression]:
    """
    Build the PARTITION BY property of a table or materialized view.

    The partition definition is taken from the ``partitioned_by`` model
    parameter when it is given; otherwise from the entry of
    ``table_properties`` named by ``PropertyValidator.check_at_most_one``.
    The definition is parsed into a partition kind ("RANGE", "LIST" or
    ``None`` for expression partitioning) plus partition expressions, and the
    final property is produced by ``_build_partitioned_by_exp``.

    Args:
        partitioned_by: Partition expressions from the model parameter.
        partition_interval_unit: Forwarded to ``_build_partitioned_by_exp``.
        target_columns_to_types: Forwarded to ``_build_partitioned_by_exp``.
        catalog_name: Forwarded to ``_build_partitioned_by_exp``.
        table_properties: Physical properties. MODIFIED in place: the
            partition definition that gets used (when it comes from here) and
            the ``partitions`` entry are popped.
        key_type: Active table key type; only used in the warning message.
        key_columns: Table key columns; partition columns missing from them
            trigger a warning (not an error).

    Returns:
        The partition property expression, or ``None`` when no partition
        definition exists.
    """
    source_key = PropertyValidator.check_at_most_one(
        property_name="partitioned_by",
        property_description="partition definition",
        table_properties=table_properties,
        parameter_value=partitioned_by or None,
    )

    if partitioned_by:
        # The model parameter wins over anything in table_properties.
        logger.debug(
            "_build_partition_property: using partitioned_by from model param=%s",
            partitioned_by,
        )
    elif source_key:
        partitioned_by = table_properties.pop(source_key, None)
        logger.debug(
            "_build_partition_property: using partitioned_by from table_properties[%s]=%s",
            source_key,
            partitioned_by,
        )

    if not partitioned_by:
        logger.debug("_build_partition_property: no 'partitioned_by' defined, skipped")
        return None

    # Split the definition into its kind (RANGE/LIST/None) and its expressions.
    partition_kind, partition_cols = self._parse_partition_expressions(partitioned_by)
    logger.debug(
        "_build_partition_property: partition_kind=%s, partition_cols=%s",
        partition_kind,
        partition_cols,
    )

    def column_name_of(node: exp.Expression) -> t.Optional[str]:
        # Plain columns yield their name; function expressions are not
        # resolved to a column (None); anything else is used as rendered.
        if isinstance(node, exp.Column):
            return str(node.name)
        if isinstance(node, (exp.Anonymous, exp.Func)):
            return None  # not implemented
        return str(node)

    # Partition columns are expected to be part of the table key; a mismatch
    # is only reported, never rejected.
    if key_columns:
        named_partition_cols = set(map(column_name_of, partition_cols)) - {None}
        key_cols_set = set(key_columns)
        not_in_key = named_partition_cols - key_cols_set
        if not_in_key:
            logger.warning(
                f"[StarRocks] Partition columns {not_in_key} not in {key_type} columns {key_cols_set}. "
                "StarRocks requires partition columns to be part of the table key."
            )

    # Pre-created partition definitions only make sense for RANGE/LIST.
    partitions = table_properties.pop("partitions", None)
    if partitions:
        logger.debug("Pre-created partitions: %s", partitions)
        if partition_kind is not None:
            partitions = PropertyValidator.validate_and_normalize_property(
                "partitions", partitions
            )
        else:
            logger.warning(
                "[StarRocks] 'partitions' parameter is ignored for expression-based partitioning. "
                "Expression partitioning creates partitions automatically and does not support "
                "pre-created partition definitions."
            )
            partitions = None  # dropped for expression-based partitioning

    partition_property = self._build_partitioned_by_exp(
        partition_cols,
        partition_interval_unit=partition_interval_unit,
        target_columns_to_types=target_columns_to_types,
        catalog_name=catalog_name,
        partitions=partitions,
        partition_kind=partition_kind,
    )
    logger.debug("_build_partition_property: generated %s", partition_property)
    return partition_property
function arguments + for arg in norm_expr.expressions: + if isinstance(arg, exp.Column): + parsed_cols.append(arg) + else: + parsed_cols.append(exp.to_column(str(arg))) + continue + + # Check if it's a LIST expression (SQLGlot parses LIST(...) as exp.List) + if isinstance(norm_expr, exp.List): + partition_kind = "LIST" + # Extract column expressions from list items + for item in norm_expr.expressions: + if isinstance(item, exp.Column): + parsed_cols.append(item) + else: + parsed_cols.append(exp.to_column(str(item))) + continue + + # Regular column or other function (date_trunc, etc.) + parsed_cols.append(norm_expr) + + return partition_kind, parsed_cols + + def _build_partitioned_by_exp( + self, + partitioned_by: t.List[exp.Expression], + *, + partition_interval_unit: t.Optional["IntervalUnit"] = None, + target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, + catalog_name: t.Optional[str] = None, + **kwargs: t.Any, + ) -> t.Optional[ + t.Union[ + exp.PartitionedByProperty, + exp.PartitionByRangeProperty, + exp.PartitionByListProperty, + exp.Property, + ] + ]: + """ + Build StarRocks partitioning expression. + + - partition_kind: RANGE/LIST/None (passed via kwargs, None as expression partitioning) + - partitioned_by: normalized partition column/func/anonymous expressions + - partitions: partition definitions as List[str] (passed via kwargs) + + Supports both RANGE and LIST partition syntaxes, and expression partition syntax. 
def _build_distributed_by_property(
    self,
    table_properties: t.Dict[str, t.Any],
    key_columns: t.Optional[t.Tuple[str, ...]],
) -> t.Optional[exp.DistributedByProperty]:
    """
    Build DISTRIBUTED BY property from table_properties.

    Supports:
    1. Structured tuple: (kind='HASH', columns=(id, dt), buckets=10)
    2. String format: "HASH(id)", "RANDOM", "HASH(id) BUCKETS 10"
    3. None: Returns None (no default distribution)

    Strings that carry a BUCKETS clause are handled first by
    _parse_distribution_with_buckets; every other input goes through the
    "distributed_by" spec of PropertyValidator and is converted to the same
    unified dict (kind / columns / buckets).

    Args:
        table_properties: Dictionary containing distributed_by (will be modified:
            the 'distributed_by' entry is popped)
        key_columns: Table key columns. Currently unused: no default
            distribution is derived from them.

    Returns:
        DistributedByProperty, or None when 'distributed_by' is not set
    """
    distributed_by = table_properties.pop("distributed_by", None)

    # No default - if not set, return None
    if distributed_by is None:
        logger.debug(
            "_build_distributed_by_property: no 'distributed_by' defined, skipped"
        )
        return None

    logger.debug(
        "_build_distributed_by_property: using distributed_by from table_properties=%s",
        distributed_by,
    )

    # Try to parse complex string with BUCKETS first
    unified = self._parse_distribution_with_buckets(distributed_by)
    logger.debug(
        "_build_distributed_by_property: parsed distribution with buckets: %s",
        unified,
    )
    if unified is None:
        # Fall back to SPEC-based parsing
        normalized = PropertyValidator.validate_and_normalize_property(
            "distributed_by", distributed_by
        )
        unified = DistributionTupleOutputType.to_unified_dict(normalized)

    logger.debug(
        "_build_distributed_by_property: normalized to kind=%s, columns=%s, buckets=%s",
        unified.get("kind"),
        unified.get("columns"),
        unified.get("buckets"),
    )

    kind_expr = exp.Var(this=unified["kind"])

    # `or []` (rather than a .get default) also covers an explicit None value,
    # which a column-less distribution such as RANDOM may carry.
    columns = unified.get("columns") or []
    expressions_list: t.List[exp.Expression] = [
        col if isinstance(col, exp.Expression) else exp.to_column(str(col))
        for col in columns
    ]

    buckets: t.Optional[t.Any] = unified.get("buckets")
    buckets_expr: t.Optional[exp.Expression]
    if buckets is None:
        buckets_expr = None
    elif isinstance(buckets, exp.Literal):
        buckets_expr = buckets
    else:
        buckets_expr = exp.Literal.number(int(buckets))

    result = exp.DistributedByProperty(
        kind=kind_expr,
        expressions=expressions_list,
        buckets=buckets_expr,
        order=None,
    )
    logger.debug(
        "_build_distributed_by_property: generated DistributedByProperty: %s",
        result,
    )
    return result
def _parse_refresh_scheme(
    self, refresh_scheme: str
) -> t.Tuple[
    t.Optional[exp.Expression],
    t.Optional[exp.Expression],
    t.Optional[exp.Expression],
    t.Optional[exp.Expression],
]:
    """
    Split a StarRocks refresh_scheme string into (kind, starts, every, unit).

    Parsing is intentionally kept simple. Only these pieces are extracted:
    - kind: ASYNC | MANUAL; must be the first token of the text
    - starts: START (<start_time>), where <start_time> (optionally wrapped in
      single or double quotes) is kept as a raw string literal
    - every/unit: EVERY (INTERVAL <n> <unit>), with <n> an integer and <unit>
      upper-cased

    Matching is case-insensitive. Anything else in the text is ignored.

    Args:
        refresh_scheme: Raw refresh scheme text

    Returns:
        Tuple (kind, starts, every, unit). All four are None for empty or
        whitespace-only input; starts / every / unit are None when the
        corresponding clause is absent.

    Raises:
        SQLMeshError: If non-empty text does not start with MANUAL or ASYNC
    """
    scheme = (refresh_scheme or "").strip()
    if not scheme:
        return None, None, None, None

    kind_match = re.match(r"^(MANUAL|ASYNC)\b", scheme, flags=re.IGNORECASE)
    if kind_match is None:
        raise SQLMeshError(
            f"[StarRocks] Invalid refresh_scheme {refresh_scheme!r}. Expected to start with MANUAL or ASYNC."
        )
    kind_expr: t.Optional[exp.Expression] = exp.Var(this=kind_match.group(1).upper())

    starts_expr: t.Optional[exp.Expression] = None
    start_match = re.search(
        r"\bSTART\s*\(\s*(?:'([^']*)'|\"([^\"]*)\"|([^)]*))\s*\)",
        scheme,
        flags=re.IGNORECASE,
    )
    if start_match is not None:
        # Exactly one alternative (single-quoted, double-quoted, bare) can
        # match; take the first non-empty capture, defaulting to "".
        raw_start = next((grp for grp in start_match.groups() if grp), "")
        starts_expr = exp.Literal.string(raw_start.strip())

    every_expr: t.Optional[exp.Expression] = None
    unit_expr: t.Optional[exp.Expression] = None
    every_match = re.search(
        r"\bEVERY\s*\(\s*INTERVAL\s+(\d+)\s+(\w+)\s*\)", scheme, flags=re.IGNORECASE
    )
    if every_match is not None:
        amount, unit = every_match.groups()
        every_expr = exp.Literal.number(int(amount))
        unit_expr = exp.Var(this=unit.upper())

    return kind_expr, starts_expr, every_expr, unit_expr
def _build_order_by_property(
    self,
    table_properties: t.Dict[str, t.Any],
    clustered_by: t.Optional[t.List[exp.Expression]],
) -> t.Optional[exp.Cluster]:
    """
    Build ORDER BY (clustering) property.

    Supports both:
    - clustered_by parameter (from create_table call)
    - the clustering entry in table_properties, under whichever property name
      PropertyValidator.check_at_most_one reports

    Priority: non-empty clustered_by parameter > entry in table_properties.
    An empty clustered_by list is treated the same as None, so a clustering
    definition given only through table_properties is still honored.

    Args:
        table_properties: Dictionary containing the optional clustering entry
            (will be modified: the entry is popped when it is used)
        clustered_by: Clustering columns from parameter

    Returns:
        Cluster expression (generates ORDER BY) or None
    """
    # Mutual-exclusion check between the parameter and the property; the
    # return value is the name of the property found in table_properties
    # (falsy when there is none).
    order_by_param_name = PropertyValidator.check_at_most_one(
        property_name="clustered_by",
        property_description="clustering definition",
        table_properties=table_properties,
        parameter_value=clustered_by,
    )

    if clustered_by:
        # A non-empty parameter takes priority
        logger.debug(
            "_build_order_by_property: using clustered_by from model param=%s",
            clustered_by,
        )
    elif order_by_param_name:
        # Parameter absent or empty: fall back to table_properties.
        # (Previously `clustered_by is None` was required here, which silently
        # dropped the property whenever the caller passed an empty list.)
        order_by = table_properties.pop(order_by_param_name, None)
        if order_by is not None:
            normalized = PropertyValidator.validate_and_normalize_property(
                "clustered_by", order_by, preprocess_parentheses=True
            )
            clustered_by = list(normalized)
            logger.debug(
                "_build_order_by_property: using clustered_by from table_properties[%s]=%s",
                order_by_param_name,
                clustered_by,
            )

    if not clustered_by:
        logger.debug("_build_order_by_property: no 'clustered_by' defined, skipped")
        return None

    result = exp.Cluster(expressions=clustered_by)
    logger.debug("_build_order_by_property: generated Cluster")
    return result
def _extract_and_validate_key_columns(
    self,
    table_properties: t.Dict[str, t.Any],
    primary_key: t.Optional[t.Tuple[str, ...]] = None,
) -> t.Tuple[t.Optional[str], t.Optional[t.Tuple[str, ...]]]:
    """
    Extract and validate key columns from table_properties.

    All key types require:
    - Key columns must be the first N columns in CREATE TABLE
    - Column order must match the KEY clause order

    Priority:
    - Parameter primary_key > table_properties primary_key
    - Only one key type allowed per table

    Args:
        table_properties: Table properties dictionary (lowercase keys expected).
            The key entry is only read here, not removed.
        primary_key: Primary key from method parameter (highest priority)

    Returns:
        Tuple of (key_type, key_columns)
        - key_type: One of 'primary_key', 'unique_key', 'duplicate_key', 'aggregate_key', None
        - key_columns: Tuple of column names, or None

    Raises:
        SQLMeshError: If multiple key types are defined or column extraction fails
            (presumably raised by the PropertyValidator helpers called here)
    """
    # Use PropertyValidator to check mutual exclusion
    active_key_type = PropertyValidator.check_at_most_one(
        property_name="key_type",  # dummy
        property_description="table key type",
        table_properties=table_properties,
        parameter_value=primary_key,
    )
    # Log the plain value; wrapping it in braces built a one-element set and
    # printed e.g. "{'primary_key'}".
    logger.debug("get table key: %s", active_key_type)

    # If parameter primary_key was provided, return it
    if primary_key:
        return ("primary_key", primary_key)

    # Nothing in table_properties either
    if not active_key_type:
        return (None, None)

    # Read without popping, then normalize via SPEC and keep only column names
    key_expr = table_properties[active_key_type]
    normalized = PropertyValidator.validate_and_normalize_property(
        active_key_type, key_expr, preprocess_parentheses=True
    )
    key_columns = tuple(col.name for col in normalized)

    logger.debug(
        "Extracted '%s' from table_properties, value=%s",
        active_key_type,
        key_columns,
    )

    return (active_key_type, key_columns)
def _build_create_comment_table_exp(
    self, table: exp.Table, table_comment: str, table_kind: str = "TABLE"
) -> str:
    """
    Render the statement that sets a table-level comment.

    The generated SQL has the form:
        ALTER TABLE {table} COMMENT = '{comment}'

    The table name is rendered with identifier quoting in the adapter's
    dialect; the comment is truncated via _truncate_table_comment and emitted
    as a string literal.

    Note: per the adapter's comment-creation settings, comments are normally
    written as part of CREATE TABLE, so this path is mainly for altering the
    comment of an existing table.

    Args:
        table: Table expression
        table_comment: The comment to add
        table_kind: Type of object (TABLE, VIEW, etc.); not used when building
            the statement

    Returns:
        SQL string for ALTER TABLE COMMENT
    """
    quoted_table = table.sql(dialect=self.dialect, identify=True)
    truncated_comment = self._truncate_table_comment(table_comment)
    comment_literal = exp.Literal.string(truncated_comment).sql(dialect=self.dialect)
    return " ".join(("ALTER TABLE", quoted_table, "COMMENT =", comment_literal))
def _ensure_primary_key_for_starrocks_when_incremental_by_unique_key(
    model: Model, physical_properties: t.Optional[t.Dict[str, t.Any]]
) -> t.Optional[t.Dict[str, t.Any]]:
    """
    Promote StarRocks incremental-by-unique-key models to PRIMARY KEY tables so that
    complex DELETE/MERGE statements remain supported.

    The input is returned unchanged (same object, possibly None) when the model
    is not a StarRocks incremental-by-unique-key model, or when the properties
    already define 'primary_key'. Otherwise a copy of the properties is
    returned; the caller's dictionary is never mutated.

    Args:
        model: Model whose dialect, kind, unique_key and name are inspected
        physical_properties: Physical properties for the table; may be None

    Returns:
        The original properties, or a copy with 'primary_key' set from the
        model's unique_key (a single expression, or exp.Tuple for several).
        When the model has no unique_key, a warning is logged and the copy is
        returned without 'primary_key'.
    """
    if model.dialect != "starrocks" or not model.kind.is_incremental_by_unique_key:
        return physical_properties

    # Guard the membership test: physical_properties may legitimately be None,
    # and `"primary_key" in None` raises TypeError.
    if physical_properties and "primary_key" in physical_properties:
        return physical_properties

    properties = dict(physical_properties or {})
    unique_key: t.Optional[t.List[exp.Expression]] = model.unique_key
    if unique_key:
        properties["primary_key"] = (
            unique_key[0] if len(unique_key) == 1 else exp.Tuple(expressions=unique_key)
        )
        logger.info(
            "Model '%s' promoted to PRIMARY KEY table on StarRocks to support rich DELETE operations.",
            model.name,
        )
    else:
        logger.warning(
            "StarRocks incremental-by-unique-key model '%s' requires a PRIMARY KEY table. "
            "Specify `physical_properties['primary_key']` or set `unique_key` on the model.",
            model.name,
        )

    return properties
target_columns_to_types=columns_to_types, @@ -2313,6 +2352,9 @@ def insert( table_name, render_kwargs=render_kwargs, ) + physical_properties = kwargs.get("physical_properties", model.physical_properties) + physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( + model, physical_properties) self.adapter.merge( table_name, query_or_df, @@ -2324,7 +2366,7 @@ def insert( end=kwargs.get("end"), execution_time=kwargs.get("execution_time"), ), - physical_properties=kwargs.get("physical_properties", model.physical_properties), + physical_properties=physical_properties, source_columns=source_columns, ) @@ -2339,6 +2381,9 @@ def append( columns_to_types, source_columns = self._get_target_and_source_columns( model, table_name, render_kwargs=render_kwargs ) + physical_properties = kwargs.get("physical_properties", model.physical_properties) + physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( + model, physical_properties) self.adapter.merge( table_name, query_or_df, @@ -2350,7 +2395,7 @@ def append( end=kwargs.get("end"), execution_time=kwargs.get("execution_time"), ), - physical_properties=kwargs.get("physical_properties", model.physical_properties), + physical_properties=physical_properties, source_columns=source_columns, ) @@ -2693,12 +2738,20 @@ def insert( return logger.info("Replacing view '%s'", table_name) + materialized_properties = None + if is_materialized_view: + materialized_properties = { + "partitioned_by": model.partitioned_by, + "clustered_by": model.clustered_by, + "partition_interval_unit": model.partition_interval_unit, + } self.adapter.create_view( table_name, query_or_df, model.columns_to_types, replace=must_recreate_view, materialized=is_materialized_view, + materialized_properties=materialized_properties, view_properties=kwargs.get("physical_properties", model.physical_properties), table_description=model.description, column_descriptions=model.column_descriptions, @@ -3120,13 +3173,16 
@@ def create( if is_table_deployable and is_snapshot_deployable: # We could deploy this to prod; create a proper managed table logger.info("Creating managed table: %s", table_name) + physical_properties = kwargs.get("physical_properties", model.physical_properties) + physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( + model, physical_properties) self.adapter.create_managed_table( table_name=table_name, query=model.render_query_or_raise(**render_kwargs), target_columns_to_types=model.columns_to_types, partitioned_by=model.partitioned_by, clustered_by=model.clustered_by, # type: ignore[arg-type] - table_properties=kwargs.get("physical_properties", model.physical_properties), + table_properties=physical_properties, table_description=model.description, column_descriptions=model.column_descriptions, table_format=model.table_format, diff --git a/tests/core/engine_adapter/integration/__init__.py b/tests/core/engine_adapter/integration/__init__.py index 47ccdc876a..ee19a92a56 100644 --- a/tests/core/engine_adapter/integration/__init__.py +++ b/tests/core/engine_adapter/integration/__init__.py @@ -77,6 +77,7 @@ def pytest_marks(self) -> t.List[MarkDecorator]: IntegrationTestEngine("spark", native_dataframe_type="pyspark"), IntegrationTestEngine("clickhouse", catalog_types=["standalone", "cluster"]), IntegrationTestEngine("risingwave"), + IntegrationTestEngine("starrocks"), # Cloud engines that need paid accounts / special credentials IntegrationTestEngine("clickhouse_cloud", cloud=True), IntegrationTestEngine("redshift", cloud=True), @@ -265,6 +266,7 @@ def timestamp_columns(self) -> t.List[str]: for k, v in self.columns_to_types.items() if v.sql().lower().startswith("timestamp") or (v.sql().lower() == "datetime" and self.dialect == "bigquery") + or (v.sql().lower() == "datetime" and self.dialect == "starrocks") ] @property @@ -307,6 +309,9 @@ def supports_merge(self) -> bool: if self.dialect == "risingwave": return False + if 
self.dialect == "starrocks": + return False + return True @property @@ -448,7 +453,7 @@ def get_table_comment( AND pgc.relkind = '{"v" if table_kind == "VIEW" else "r"}' ; """ - elif self.dialect in ["mysql", "snowflake"]: + elif self.dialect in ["mysql", "snowflake", "starrocks"]: # Snowflake treats all identifiers as uppercase unless they are lowercase and quoted. # They are lowercase and quoted in sushi but not in the inline tests. if self.dialect == "snowflake" and snowflake_capitalize_ids: @@ -458,6 +463,7 @@ def get_table_comment( comment_field_name = { "mysql": "table_comment", "snowflake": "comment", + "starrocks": "table_comment", } query = f""" @@ -563,7 +569,7 @@ def get_column_comments( AND pgc.relkind = '{"v" if table_kind == "VIEW" else "r"}' ; """ - elif self.dialect in ["mysql", "snowflake", "trino"]: + elif self.dialect in ["mysql", "snowflake", "trino", "starrocks"]: # Snowflake treats all identifiers as uppercase unless they are lowercase and quoted. # They are lowercase and quoted in sushi but not in the inline tests. 
if self.dialect == "snowflake" and snowflake_capitalize_ids: @@ -574,6 +580,7 @@ def get_column_comments( "mysql": "column_comment", "snowflake": "comment", "trino": "comment", + "starrocks": "column_comment", } query = f""" diff --git a/tests/core/engine_adapter/integration/config.yaml b/tests/core/engine_adapter/integration/config.yaml index 5635f4e1ba..c9b4a9b6cf 100644 --- a/tests/core/engine_adapter/integration/config.yaml +++ b/tests/core/engine_adapter/integration/config.yaml @@ -118,6 +118,16 @@ gateways: host: {{ env_var('DOCKER_HOSTNAME', 'localhost') }} port: 4566 check_import: false + inttest_starrocks: + connection: + type: starrocks + host: {{ env_var('DOCKER_HOSTNAME', 'localhost') }} + port: 9030 + user: root + password: "" + check_import: false + state_connection: + type: duckdb # Cloud databases diff --git a/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml b/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml new file mode 100644 index 0000000000..3a19fa6a3f --- /dev/null +++ b/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml @@ -0,0 +1,27 @@ +services: + starrocks-fe: + image: starrocks/fe-ubuntu:3.5-latest + container_name: starrocks-fe + hostname: starrocks-fe + environment: + - FE_SERVERS=fe1:starrocks-fe:9030 + ports: + - "9030:9030" # MySQL protocol port for tests + - "8030:8030" # HTTP port + networks: + - starrocks_net + + starrocks-be: + image: starrocks/be-ubuntu:3.5-latest + container_name: starrocks-be + hostname: starrocks-be + depends_on: + - starrocks-fe + environment: + - FE_SERVERS=starrocks-fe:9030 + networks: + - starrocks_net + +networks: + starrocks_net: + driver: bridge diff --git a/tests/core/engine_adapter/integration/test_integration.py b/tests/core/engine_adapter/integration/test_integration.py index 1fba346db3..86b54d7399 100644 --- a/tests/core/engine_adapter/integration/test_integration.py +++ b/tests/core/engine_adapter/integration/test_integration.py @@ -777,6 
+777,8 @@ def test_insert_overwrite_by_time_partition(ctx_query_and_df: TestContext): ds_type = "datetime" if ctx.dialect == "tsql": ds_type = "varchar(max)" + if ctx.dialect == "starrocks": + ds_type = "datetime" ctx.columns_to_types = {"id": "int", "ds": ds_type} table = ctx.table("test_table") @@ -865,6 +867,8 @@ def test_insert_overwrite_by_time_partition_source_columns(ctx_query_and_df: Tes ds_type = "datetime" if ctx.dialect == "tsql": ds_type = "varchar(max)" + if ctx.dialect == "starrocks": + ds_type = "datetime" ctx.columns_to_types = {"id": "int", "ds": ds_type} columns_to_types = { @@ -2579,6 +2583,7 @@ def test_dialects(ctx: TestContext): "mysql": pd.Timestamp("2020-01-01 00:00:00"), "spark": pd.Timestamp("2020-01-01 00:00:00"), "databricks": pd.Timestamp("2020-01-01 00:00:00"), + "starrocks": pd.Timestamp("2020-01-01 00:00:00"), }, ), ( diff --git a/tests/core/engine_adapter/integration/test_integration_starrocks.py b/tests/core/engine_adapter/integration/test_integration_starrocks.py new file mode 100644 index 0000000000..135c1443af --- /dev/null +++ b/tests/core/engine_adapter/integration/test_integration_starrocks.py @@ -0,0 +1,2400 @@ +""" +Integration tests for StarRocks Engine Adapter + +These tests require a running StarRocks instance. +They verify that the generated SQL actually works on real StarRocks database. 
+ +Strategy: +- Basic test: Verify fundamental functionality works +- Complex test: Verify comprehensive SQL with all features works + +Run with: + pytest -m "starrocks and docker" tests/core/engine_adapter/integration/test_integration_starrocks.py + +Or against local StarRocks: + export STARROCKS_HOST=localhost + export STARROCKS_PORT=9030 + export STARROCKS_USER=root + export STARROCKS_PASSWORD="" + pytest tests/core/engine_adapter/integration/test_integration_starrocks.py +""" + +import logging +import os +import re +import typing as t +from functools import partial + +import pytest +from sqlglot import exp + +from sqlmesh.core.engine_adapter.starrocks import StarRocksEngineAdapter +from sqlmesh.core.model.definition import load_sql_based_model, SqlModel +import sqlmesh.core.dialect as d + +from tests.core.engine_adapter.integration import TestContext + +# Mark as docker test (can also run against local StarRocks) +# Remove 'docker' marker if you want to run against local instance only +pytestmark = [pytest.mark.starrocks, pytest.mark.docker, pytest.mark.engine] + + +logger = logging.getLogger(__name__) + + +def _load_sql_model(model_sql: str) -> SqlModel: + expressions = d.parse(model_sql, default_dialect="starrocks") + return t.cast(SqlModel, load_sql_based_model(expressions)) + + +def _materialized_properties_from_model(model: SqlModel) -> t.Optional[t.Dict[str, t.Any]]: + props: t.Dict[str, t.Any] = {} + if model.partitioned_by: + props["partitioned_by"] = model.partitioned_by + if model.clustered_by: + props["clustered_by"] = model.clustered_by + return props or None + + +def _model_name_from_table(table: exp.Table) -> str: + if table.db: + return f"{table.db}.{table.name}" + return table.name + + +def normalize_sql(sql: str) -> str: + """Normalizes a SQL string for comparison.""" + # Remove comments + sql = re.sub(r'--.*\n', '', sql) + # Replace newlines and tabs with spaces + sql = sql.replace('\n', ' ').replace('\t', '') + # Collapse multiple spaces into 
one + sql = re.sub(r'\s+', ' ', sql) + # Remove spaces around parentheses, commas, and equals for consistency + sql = re.sub(r'\s*\(\s*', '(', sql) + sql = re.sub(r'\s*\)\s*', ')', sql) + sql = re.sub(r'\s*,\s*', ',', sql) + sql = re.sub(r'\s*=\s*', '=', sql) + # Remove all paired backticks around identifiers + sql = re.sub(r'`([^`]+)`', r'\1', sql) + sql = re.sub(r'\'', '"', sql) + + return sql.strip() + + +# ============================================================================= +# TestContext-based Integration Tests +# ============================================================================= +# +# These tests demonstrate how to use SQLMesh's TestContext helpers in a StarRocks-specific +# integration file: +# - Automatic schema isolation via ctx.test_id +# - Automatic cleanup of created schemas +# +# Unlike the shared integration harness (which loads the full gateway config), this local +# fixture keeps StarRocks tests self-contained and runnable with only StarRocks deps installed. + + +@pytest.fixture(scope="module") +def starrocks_connection_config() -> t.Dict[str, t.Any]: + """StarRocks connection configuration from environment variables.""" + return { + "host": os.getenv("STARROCKS_HOST", "localhost"), + "port": int(os.getenv("STARROCKS_PORT", "9030")), + "user": os.getenv("STARROCKS_USER", "myname"), + "password": os.getenv("STARROCKS_PASSWORD", "pswd1234"), + } + + +@pytest.fixture +def ctx(tmp_path, starrocks_connection_config) -> t.Iterable[TestContext]: + """ + A lightweight TestContext fixture which avoids loading the full integration gateway config. + + This keeps the StarRocks integration tests self-contained (similar to `starrocks_adapter`) + while still providing TestContext niceties like: + - ctx.table(...) 
naming + schema isolation + - automatic cleanup + """ + from pymysql import connect + + adapter = StarRocksEngineAdapter(partial(connect, **starrocks_connection_config)) + ctx = TestContext( + "query", + adapter, + mark="starrocks", + gateway="manual_starrocks", + tmp_path=tmp_path, + is_remote=False, + ) + + ctx.init() + try: + with ctx.engine_adapter.session({}): + yield ctx + finally: + ctx.cleanup() + + +@pytest.fixture +def engine_adapter(ctx: TestContext) -> StarRocksEngineAdapter: + assert isinstance(ctx.engine_adapter, StarRocksEngineAdapter) + return ctx.engine_adapter + + +@pytest.fixture(scope="module") +def starrocks_adapter(starrocks_connection_config) -> StarRocksEngineAdapter: + """Create a real StarRocks adapter connected to database. + It's still used in a lot of tests, so it can't be removed yet. + """ + from pymysql import connect + + connection_factory = partial(connect, **starrocks_connection_config) + adapter = StarRocksEngineAdapter(connection_factory) + + yield adapter + + # Cleanup: adapter will auto-close connection + + +class TestBasicOperations: + """ + Basic Operations + + Each test method verifies one fundamental SQL operation. + This allows running individual tests and clear failure reporting. 
+ """ + + def test_create_drop_schema(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test CREATE DATABASE and DROP DATABASE (TestContext version).""" + db_name = ctx.schema("sr_test_create_drop_db") + + # CREATE DATABASE + engine_adapter.create_schema(db_name, ignore_if_exists=True) + result = engine_adapter.fetchone( + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{db_name}'" + ) + assert result is not None, "CREATE DATABASE failed" + assert result[0] == db_name + + # DROP DATABASE + engine_adapter.drop_schema(db_name) + result = engine_adapter.fetchone( + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{db_name}'" + ) + assert result is None, "DROP DATABASE failed" + + def test_create_drop_table(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test CREATE TABLE and DROP TABLE (TestContext version). + """ + table = ctx.table("sr_test_table") + + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + + db_name = table.db + table_name = table.name + exists = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}'" + ) + assert exists is not None, "CREATE TABLE failed" + + engine_adapter.drop_table(table) + exists = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}'" + ) + assert exists is None, "DROP TABLE failed" + + def test_create_table_like_preserves_metadata_and_copies_no_data( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ) -> None: + """ + Verify StarRocks native CREATE TABLE LIKE semantics: + - Copies schema (columns) + - Does NOT copy data + - Preserves key table metadata (at least PRIMARY KEY / DISTRIBUTED BY) + """ + source = ctx.table("src_like") 
+ target = ctx.table("tgt_like") + + engine_adapter.create_table( + source, + target_columns_to_types={ + "id": exp.DataType.build("BIGINT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + primary_key=("id",), + table_properties={ + # Make metadata visible in SHOW CREATE TABLE so LIKE preservation is testable. + "distributed_by": "HASH(id) BUCKETS 10", + "replication_num": "1", + }, + ) + + engine_adapter.execute( + f"INSERT INTO {source.sql(dialect=ctx.dialect, identify=True)} (id, name) " + "VALUES (1, 'a'), (2, 'b')" + ) + + engine_adapter.create_table_like(target, source, exists=True) + + # Like should not copy data. + src_count = engine_adapter.fetchone( + f"SELECT COUNT(*) FROM {source.sql(dialect=ctx.dialect, identify=True)}" + )[0] + tgt_count = engine_adapter.fetchone( + f"SELECT COUNT(*) FROM {target.sql(dialect=ctx.dialect, identify=True)}" + )[0] + assert src_count == 2 + assert tgt_count == 0 + + # Like should preserve key metadata (engine-defined behavior). + ddl = engine_adapter.fetchone( + f"SHOW CREATE TABLE {target.sql(dialect=ctx.dialect, identify=True)}" + )[1] + ddl_upper = ddl.upper() + assert "PRIMARY KEY" in ddl_upper + assert "DISTRIBUTED BY" in ddl_upper + + def test_create_table_like_exists_false_raises( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ) -> None: + """If exists=False and target already exists, StarRocks should error.""" + source = ctx.table("src_like_exists") + target = ctx.table("tgt_like_exists") + + engine_adapter.create_table( + source, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + }, + primary_key=("id",), + table_properties={"replication_num": "1"}, + ) + engine_adapter.create_table_like(target, source, exists=True) + + with pytest.raises(Exception): + engine_adapter.create_table_like(target, source, exists=False) + + + def test_delete(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test DELETE operation (TestContext version).""" + table = 
ctx.table("sr_test_table") + table_sql = table.sql(dialect=ctx.dialect, identify=True) + + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + engine_adapter.execute( + f"INSERT INTO {table_sql} (id, name) VALUES (1, 'Alice'), (2, 'Bob')" + ) + + engine_adapter.delete_from(table, "id = 2") + count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM {table_sql}") + assert count[0] == 1, "DELETE failed" + + def test_rename_table(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test RENAME TABLE operation (TestContext version).""" + old_table = ctx.table("old_table") + new_table = ctx.table("new_table") + + old_table_sql = old_table.sql(dialect=ctx.dialect, identify=True) + new_table_sql = new_table.sql(dialect=ctx.dialect, identify=True) + + engine_adapter.create_table( + old_table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + + engine_adapter.execute(f"INSERT INTO {old_table_sql} (id, name) VALUES (1, 'Test')") + engine_adapter.rename_table(old_table, new_table) + + db_name = old_table.db + old_table_name = old_table.name + new_table_name = new_table.name + + old_exists = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{old_table_name}'" + ) + assert old_exists is None, "Old table should not exist after rename" + + new_exists = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{new_table_name}'" + ) + assert new_exists is not None, "New table should exist after rename" + + count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM {new_table_sql}") + assert count[0] == 1, "Data should be preserved after rename" + + def test_create_index(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + 
"""Test CREATE INDEX operation (skipped for StarRocks) (TestContext version).""" + table = ctx.table("sr_test_table") + table_sql = table.sql(dialect=ctx.dialect, identify=True) + + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + + # CREATE INDEX (should be skipped silently) + engine_adapter.create_index(table, "idx_name", ("name",)) + + count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM {table_sql}") + assert count is not None, "Table should still be functional after skipped index creation" + + def test_create_drop_view(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test CREATE VIEW and DROP VIEW (TestContext version).""" + table = ctx.table("sr_test_table") + view = ctx.table("sr_test_view") + + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + + query = exp.select(exp.column("id"), exp.column("name")).from_(table) + engine_adapter.create_view(view, query) + + db_name = view.db + view_name = view.name + result = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.VIEWS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{view_name}'" + ) + assert result is not None, "CREATE VIEW failed" + + engine_adapter.drop_view(view) + result = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.VIEWS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{view_name}'" + ) + assert result is None, "DROP VIEW failed" + + +class TestViewAndMaterializedViewFeatures: + """Integration tests for StarRocks view SECURITY and MV property combos.""" + + def test_create_view_with_security( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + source = ctx.table("sr_sec_src") + view = ctx.table("sr_sec_view") + source_sql_ident = source.sql(dialect=ctx.dialect, identify=True) + 
view_sql_ident = view.sql(dialect=ctx.dialect, identify=True) + view_model_name = _model_name_from_table(view) + + engine_adapter.create_table( + source, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + engine_adapter.execute( + f"INSERT INTO {source_sql_ident} (id, name) VALUES (1, 'Alice'), (2, 'Bob')" + ) + + model_sql = f""" + MODEL ( + name {view_model_name}, + kind VIEW, + dialect starrocks, + columns ( + id INT, + name VARCHAR(100) + ), + virtual_properties ( + security = invoker + ) + ); + SELECT id, name FROM {source_sql_ident}; + """ + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + engine_adapter.create_view( + view, + query, + replace=True, + target_columns_to_types=model.columns_to_types, + view_properties=model.virtual_properties, + ) + + ddl = engine_adapter.fetchone(f"SHOW CREATE VIEW {view_sql_ident}")[1] + assert "SECURITY INVOKER" in ddl.upper() + + def test_create_view_replace_flag( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + source = ctx.table("sr_replace_src") + view = ctx.table("sr_replace_view") + source_sql_ident = source.sql(dialect=ctx.dialect, identify=True) + view_model_name = _model_name_from_table(view) + + engine_adapter.create_table( + source, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + engine_adapter.execute( + f"INSERT INTO {source_sql_ident} (id, name) VALUES (1, 'A')" + ) + + model_sql = f""" + MODEL ( + name {view_model_name}, + kind VIEW, + dialect starrocks, + columns (id INT, name VARCHAR(100)) + ); + SELECT id, name FROM {source_sql_ident}; + """ + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + + # Success with replace=True to replace the old one + engine_adapter.create_view( + view, + query, + replace=True, + target_columns_to_types=model.columns_to_types, + 
view_properties=model.virtual_properties, + ) + + # Failed to create a view when it's existing + with pytest.raises(Exception): + engine_adapter.create_view( + view, + query, + replace=False, + target_columns_to_types=model.columns_to_types, + view_properties=model.virtual_properties, + ) + + def _create_sales_source_table( + self, + ctx: TestContext, + engine_adapter: StarRocksEngineAdapter, + table: exp.Table, + ) -> str: + table_sql = table.sql(dialect=ctx.dialect, identify=True) + engine_adapter.create_table( + table, + target_columns_to_types={ + "order_id": exp.DataType.build("BIGINT"), + "customer_id": exp.DataType.build("INT"), + "event_date": exp.DataType.build("DATE"), + "amount": exp.DataType.build("DECIMAL(18,2)"), + "region": exp.DataType.build("VARCHAR(50)"), + }, + primary_key=("order_id", "event_date"), + partitioned_by="event_date", + ) + engine_adapter.execute( + f""" + INSERT INTO {table_sql} (order_id, customer_id, event_date, amount, region) + VALUES + (1, 1001, '2024-01-01', 10.50, 'us'), + (2, 1002, '2024-01-02', 20.75, 'eu') + """ + ) + return table_sql + + def test_materialized_view_combo_with_materialized_properties( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + source = ctx.table("sr_mv_combo_a_src") + mv = ctx.table("sr_mv_combo_a") + mv_sql = mv.sql(dialect=ctx.dialect, identify=True) + source_sql = source.sql(dialect=ctx.dialect, identify=True) + mv_model_name = _model_name_from_table(mv) + + self._create_sales_source_table(ctx, engine_adapter, source) + + model_sql = f""" + MODEL ( + name {mv_model_name}, + kind VIEW ( + materialized true + ), + dialect starrocks, + description 'MV combo A description', + columns ( + order_id BIGINT, + customer_id INT, + event_date DATE, + amount DECIMAL(18,2), + region VARCHAR(50) + ), + column_descriptions ( + order_id = 'Order identifier', + customer_id = 'Customer identifier' + ), + partitioned_by (event_date), + clustered_by (customer_id, region), + virtual_properties ( 
+ distributed_by = 'HASH(order_id) BUCKETS 8', + refresh_moment = DEFERRED, + refresh_scheme = 'ASYNC START (''2025-01-01 00:00:00'') EVERY (INTERVAL 5 MINUTE)', + replication_num = '1' + ) + ); + SELECT order_id, customer_id, event_date, amount, region + FROM {source_sql}; + """ + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + materialized_properties = _materialized_properties_from_model(model) + + engine_adapter.create_view( + mv, + query, + replace=True, + materialized=True, + target_columns_to_types=model.columns_to_types, + materialized_properties=materialized_properties, + view_properties=model.virtual_properties, + table_description=model.description, + column_descriptions=model.column_descriptions, + ) + + ddl = engine_adapter.fetchone(f"SHOW CREATE MATERIALIZED VIEW {mv_sql}")[1] + logger.debug(f"mv ddl: {ddl}") + ddl_upper = normalize_sql(ddl).upper() + assert "REFRESH DEFERRED ASYNC" in ddl_upper + assert "START('2025-01-01 00:00:00')EVERY(INTERVAL 5 MINUTE)" in ddl_upper \ + or 'START("2025-01-01 00:00:00")EVERY(INTERVAL 5 MINUTE)' in ddl_upper + assert "PARTITION BY(EVENT_DATE)" in ddl_upper + assert "ORDER BY(CUSTOMER_ID,REGION)" in ddl_upper + assert "DISTRIBUTED BY HASH(ORDER_ID)BUCKETS 8" in ddl_upper + assert "COMMENT 'MV COMBO A DESCRIPTION'" in ddl_upper \ + or 'COMMENT "MV COMBO A DESCRIPTION"' in ddl_upper + + def test_materialized_view_combo_all_properties_block( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + source = ctx.table("sr_mv_combo_b_src") + mv = ctx.table("sr_mv_combo_b") + mv_sql = mv.sql(dialect=ctx.dialect, identify=True) + source_sql = source.sql(dialect=ctx.dialect, identify=True) + mv_model_name = _model_name_from_table(mv) + + self._create_sales_source_table(ctx, engine_adapter, source) + + model_sql = f""" + MODEL ( + name {mv_model_name}, + kind VIEW ( + materialized true + ), + dialect starrocks, + description 'Analytics MV combo B', + columns ( + 
order_id BIGINT, + customer_id INT, + event_date DATE, + amount DECIMAL(18,2) + ), + column_descriptions ( + amount = 'Order amount' + ), + virtual_properties ( + partition_by = event_date, + -- ignored when MV + partitions = ( + 'PARTITION p202401 VALUES LESS THAN ("2024-02-01")', + 'PARTITION p202402 VALUES LESS THAN ("2024-03-01")' + ), + distributed_by = (kind=HASH, expressions=(order_id, customer_id), buckets=4), + order_by = (order_id, event_date), + refresh_scheme = MANUAL, + replication_num = '1' + ) + ); + SELECT order_id, customer_id, event_date, amount + FROM {source_sql}; + """ + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + materialized_properties = _materialized_properties_from_model(model) + + engine_adapter.create_view( + mv, + query, + replace=True, + materialized=True, + target_columns_to_types=model.columns_to_types, + materialized_properties=materialized_properties, + view_properties=model.virtual_properties, + table_description=model.description, + column_descriptions=model.column_descriptions, + ) + + ddl = engine_adapter.fetchone(f"SHOW CREATE MATERIALIZED VIEW {mv_sql}")[1] + ddl_upper = normalize_sql(ddl).upper() + assert "REFRESH MANUAL" in ddl_upper + assert "PARTITION P202401" not in ddl_upper # ignored when MV + assert "PARTITION P202402" not in ddl_upper # ignored when MV + assert "PARTITION BY(EVENT_DATE)" in ddl_upper + assert "ORDER BY(ORDER_ID,EVENT_DATE)" in ddl_upper + assert "DISTRIBUTED BY HASH(ORDER_ID,CUSTOMER_ID)BUCKETS 4" in ddl_upper + assert "COMMENT 'ANALYTICS MV COMBO B'" in ddl_upper \ + or 'COMMENT "ANALYTICS MV COMBO B"' in ddl_upper + + +class TestTableFeatures: + """ + Table Features + + Each test method verifies one CREATE TABLE feature that is NOT covered by E2E tests. + Focus on independent functionality like comments and data type compatibility. 
+ """ + + def test_table_and_column_comments(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test table and column comments.""" + table = ctx.table("sr_comment_table") + db_name = table.db + table_name = table.name + + # CREATE TABLE with comments + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + table_description="Test table comment", + column_descriptions={ + "id": "User ID", + "name": "User name", + }, + ) + + # Verify table comment + result = engine_adapter.fetchone( + f"SELECT TABLE_COMMENT FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}'" + ) + assert result[0] == "Test table comment", "Table comment not set" + + # Verify column comments + columns = engine_adapter.fetchall( + f"SELECT COLUMN_NAME, COLUMN_COMMENT FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}' " + f"ORDER BY ORDINAL_POSITION" + ) + column_comments = {row[0]: row[1] for row in columns} + assert column_comments["id"] == "User ID" + assert column_comments["name"] == "User name" + + def test_multiple_data_types(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """ + Test basic data types support. + + Covers: numeric, string, datetime, boolean, and JSON types with precision. 
+ Reference: https://docs.starrocks.io/docs/sql-reference/data-types/ + """ + table = ctx.table("sr_types_table") + db_name = table.db + table_name = table.name + table_sql = table.sql(dialect=ctx.dialect, identify=True) + + # CREATE TABLE with multiple data types + engine_adapter.create_table( + table, + target_columns_to_types={ + # Numeric types + "col_tinyint": exp.DataType.build("TINYINT"), + "col_smallint": exp.DataType.build("SMALLINT"), + "col_int": exp.DataType.build("INT"), + "col_bigint": exp.DataType.build("BIGINT"), + "col_float": exp.DataType.build("FLOAT"), + "col_double": exp.DataType.build("DOUBLE"), + "col_decimal": exp.DataType.build("DECIMAL(18,2)"), + # String types with precision + "col_char": exp.DataType.build("CHAR(10)"), + "col_varchar": exp.DataType.build("VARCHAR(200)"), + "col_string": exp.DataType.build("STRING"), + # Date/Time types + "col_date": exp.DataType.build("DATE"), + "col_datetime": exp.DataType.build("DATETIME"), + # Boolean and JSON + "col_boolean": exp.DataType.build("BOOLEAN"), + "col_json": exp.DataType.build("JSON"), + }, + ) + + # Verify all columns created with correct types + columns = engine_adapter.fetchall( + f"SELECT COLUMN_NAME, DATA_TYPE FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}' " + f"ORDER BY ORDINAL_POSITION" + ) + assert len(columns) == 14, f"Expected 14 columns, got {len(columns)}" + + # Test data insertion with various types + engine_adapter.execute( + f""" + INSERT INTO {table_sql} + (col_tinyint, col_smallint, col_int, col_bigint, col_float, col_double, col_decimal, + col_char, col_varchar, col_string, col_date, col_datetime, col_boolean, col_json) + VALUES + (127, 32767, 2147483647, 9223372036854775807, 3.14, 3.141592653589793, 12345.67, + 'test', 'test varchar', 'test string', '2024-01-01', '2024-01-01 12:00:00', + true, '{{"key": "value"}}') + """ + ) + + # Verify insertion + count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM 
{table_sql}") + assert count[0] == 1, "Data insertion with basic types failed" + + # Verify data retrieval + result = engine_adapter.fetchone(f"SELECT col_int, col_varchar, col_date FROM {table_sql}") + assert result[0] == 2147483647 + assert result[1] == "test varchar" + + # @pytest.mark.skip(reason="Complex types (ARRAY/MAP/STRUCT) may not be fully supported yet") + def test_complex_data_types(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """ + Test complex and nested data types support (ARRAY, MAP, STRUCT). + + Covers: + - Simple complex types: ARRAY, MAP, STRUCT + - Nested ARRAY: ARRAY> + - Nested MAP: MAP> + - Nested STRUCT: STRUCT, metadata MAP> + - Mixed nesting: ARRAY> + - Deep nesting: MAP>> + + Note: These types are available in StarRocks 2.5+ but may require additional + configuration or may not be fully supported in the current adapter. + Reference: https://docs.starrocks.io/docs/sql-reference/data-types/ + """ + table = ctx.table("sr_complex_types_table") + db_name = table.db + table_name = table.name + table_sql = table.sql(dialect=ctx.dialect, identify=True) + + # CREATE TABLE with complex and nested data types + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("BIGINT"), + # Simple complex types + "col_array_simple": exp.DataType.build("ARRAY"), + "col_map_simple": exp.DataType.build("MAP"), + "col_struct_simple": exp.DataType.build("STRUCT"), + # Nested ARRAY + "col_array_nested": exp.DataType.build("ARRAY>"), + # Nested MAP (value is ARRAY) + "col_map_nested": exp.DataType.build("MAP>"), + # Nested STRUCT (contains ARRAY and MAP) + "col_struct_nested": exp.DataType.build( + "STRUCT, metadata MAP>" + ), + # ARRAY of STRUCT + "col_array_of_struct": exp.DataType.build("ARRAY>"), + # Deep nesting: MAP with ARRAY of STRUCT + "col_deep_nested": exp.DataType.build( + "MAP>>" + ), + }, + ) + + # Verify all columns created + columns = engine_adapter.fetchall( + f"SELECT COLUMN_NAME, DATA_TYPE 
FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}' " + f"ORDER BY ORDINAL_POSITION" + ) + assert len(columns) == 9, f"Expected 9 columns, got {len(columns)}" + + # Test data insertion with nested types + engine_adapter.execute( + f""" + INSERT INTO {table_sql} + (id, col_array_simple, col_map_simple, col_struct_simple, + col_array_nested, col_map_nested, col_struct_nested, + col_array_of_struct, col_deep_nested) + VALUES ( + 1, + [1,2,3], + map{{'key1':10,'key2':20}}, + row(100,'simple'), + [[1,2],[3,4]], + map{{'arr1':[1,2],'arr2':[3,4]}}, + row(1001, ['tag1','tag2'], map{{'meta1':1,'meta2':2}}), + [row(1,'Alice'), row(2,'Bob')], + map{{'group1':[row(10,'field_a'), row(20,'field_b')]}} + ) + """ + ) + + # Verify insertion + count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM {table_sql}") + assert count[0] == 1, "Data insertion with complex nested types failed" + + # Verify data retrieval for simple types + result = engine_adapter.fetchone( + f"SELECT col_array_simple, col_struct_simple FROM {table_sql}" + ) + assert result is not None, "Failed to retrieve complex type data" + + +class TestEndToEndModelParsing: + """ + End-to-End Model Parsing Integration Tests + + These tests verify the BASIC and COMPLETE pipeline from MODEL definition to SQL execution. + And will cover some important edge cases, to know whether the whole process can work: + + MODEL Definition (String) + ↓ + d.parse() + load_sql_based_model() + ↓ + Model Object (with physical_properties, partitioned_by_, clustered_by, etc.) 
+ ↓ + adapter.create_table( + partitioned_by=model.partitioned_by_, # MODEL-level parameter + clustered_by=model.clustered_by, # MODEL-level parameter + table_properties=model.physical_properties # From physical_properties block + ) + ↓ + SQL Generation + ↓ + Execute on Real StarRocks + ↓ + Verify via SHOW CREATE TABLE (with ACTUAL column names) + + This ensures that the parameter forms passed to create_table() match + what SQLMesh actually produces when parsing a .sql model file. + + Test Categories: + ================ + + 1. Physical Properties Tests (properties inside physical_properties block): + 2. Model-Level Parameter Tests (parameters at MODEL level, not in physical_properties): + + Property Test Matrix (End-to-End): + +------------------+----------------------------------------+----------------------------------------+ + | Property | MODEL Syntax | Expected DDL | + +------------------+----------------------------------------+----------------------------------------+ + | primary_key | primary_key = (order_id, event_date) | PRIMARY KEY (order_id, event_date) | + | duplicate_key | duplicate_key = (id, name) | DUPLICATE KEY (id, name) | + | partitioned_by | partitioned_by (event_date) | PARTITION BY RANGE (event_date) | + | distributed_by | distributed_by = (kind='HASH', ...) | DISTRIBUTED BY HASH (id) BUCKETS N | + | clustered_by | clustered_by (order_id, region) | ORDER BY (order_id, region) | + | order_by | order_by = (dt, region) | ORDER BY (dt, region) | + | replication_num | replication_num = '1' | PROPERTIES ('replication_num'='1') | + +------------------+----------------------------------------+----------------------------------------+ + """ + + def _parse_model_and_get_all_params(self, model_sql: str) -> t.Dict[str, t.Any]: + """ + Helper: Parse MODEL definition and extract ALL parameters. 
+ + This method returns a dictionary containing ALL parameters that would be passed + to adapter.create_table(), matching what SQLMesh actually does when processing + a model file. This ensures tests verify the real parameter forms, not hand-crafted ones. + + Returns: + Dict containing: + - physical_properties: Dict[str, exp.Expression] from MODEL's physical_properties + - partitioned_by: List[exp.Expression] from MODEL's partitioned_by parameter + - clustered_by: List[exp.Expression] from MODEL's clustered_by parameter + - target_columns_to_types: Dict[str, exp.DataType] from MODEL's columns or query + - table_description: Optional[str] from MODEL's description + - storage_format: Optional[str] from MODEL's storage_format + """ + expressions = d.parse(model_sql, default_dialect="starrocks") + model = load_sql_based_model(expressions, dialect="starrocks") + logger.debug(f"model params: {model}") + + return { + "partitioned_by": model.partitioned_by_, + "clustered_by": model.clustered_by, + "target_columns_to_types": model.columns_to_types or {}, + "table_description": model.description, + "storage_format": model.storage_format, + "table_properties": model.physical_properties, + } + + # ======================================== + # Case 1: Model Parameters (test_design.md Case 1) + # Covers: partitioned_by (multi-expr with function), clustered_by (multi-column) + # ======================================== + + def test_e2e_model_parameters(self, starrocks_adapter: StarRocksEngineAdapter): + """ + Test Case 1: Model-level parameters (partitioned_by + clustered_by). 
+ + Covers: partitioned_by (multi-expr with function), clustered_by (multi-column) + """ + db_name = "sr_e2e_model_params_db" + table_name = f"{db_name}.sr_model_params_table" + + model_sql = """ + MODEL ( + name test.model_parameters, + kind FULL, + columns ( + ts BIGINT, + region VARCHAR(50), + order_id BIGINT, + customer_id INT + ), + partitioned_by (from_unixtime(ts), region), -- Multi-expr with function + clustered_by (order_id, customer_id) -- Multi-column + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 1 DDL:\n{ddl}") + + # Precise assertions: verify PARTITION BY RANGE with actual columns + import re + + assert "PARTITION BY " in ddl + # Note: PARTITION BY may contain function expressions like from_unixtime(ts) + # We verify the clause exists and contains expected patterns + part_match = re.search(r"PARTITION BY \s*\(([^)]+)\)", ddl) + assert part_match, "PARTITION BY clause not found" + part_cols = part_match.group(1) + # Verify function expression and column references + assert ( + # "from_unixtime" in part_cols or "ts" in part_cols + "__generated_partition_column_" in part_cols + and "region" in part_cols + ), f"Expected partition expression with generated column/region, got {part_cols}" + + # Verify ORDER BY from clustered_by + order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) + assert order_match, "ORDER BY clause not found" + order_cols = order_match.group(1) + assert ( + "order_id" in order_cols and "customer_id" in order_cols + ), f"Expected ORDER BY (order_id, customer_id), got {order_cols}" + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 2: Physical Properties Core 
(test_design.md Case 2) + # Covers: primary_key (tuple), distributed_by (string multi-col), order_by (tuple), generic props + # ======================================== + + def test_e2e_physical_properties_core( + self, starrocks_adapter: StarRocksEngineAdapter + ): + """ + Test Case 2: Core physical_properties. + + Covers: primary_key (tuple), distributed_by (string multi-col), order_by (tuple), generic props + """ + db_name = "sr_e2e_core_props_db" + table_name = f"{db_name}.sr_core_props_table" + + model_sql = """ + MODEL ( + name test.physical_props_core, + kind FULL, + dialect starrocks, + columns ( + order_id BIGINT, + event_date DATE, + customer_id INT, + region VARCHAR(50), + amount DECIMAL(18,2) + ), + physical_properties ( + primary_key = (order_id, event_date, customer_id, region), + distributed_by = "HASH(customer_id, region) BUCKETS 16", + order_by = (order_id, region), + -- clustered_by = (order_id, region), -- also OK + -- replication_num = '1', + bucket_size = '12345678', + enable_persistent_index = 'true' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 2 DDL:\n{ddl}") + + # Precise assertions + import re + + # Verify PRIMARY KEY with exact columns + pk_match = re.search(r"PRIMARY KEY\s*\(([^)]+)\)", ddl) + assert pk_match, "PRIMARY KEY clause not found" + assert "order_id" in pk_match.group(1) and "event_date" in pk_match.group(1) + + # Verify DISTRIBUTED BY HASH with exact columns + dist_match = re.search(r"DISTRIBUTED BY HASH\s*\(([^)]+)\)", ddl) + assert dist_match, "DISTRIBUTED BY HASH clause not found" + dist_cols = dist_match.group(1) + assert ( + "customer_id" in dist_cols and "region" in dist_cols + ), f"Expected HASH(customer_id, region), got 
HASH({dist_cols})" + assert "BUCKETS 16" in ddl + + # Verify ORDER BY + order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) + assert order_match, "ORDER BY clause not found" + assert "order_id" in order_match.group(1) and "region" in order_match.group( + 1 + ) + + # assert "replication_num" not in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 3: String No-Paren Auto-Wrap (test_design.md Case 3) + # Covers: primary_key = "id, dt" auto-conversion + # ======================================== + + def test_e2e_string_no_paren_auto_wrap( + self, starrocks_adapter: StarRocksEngineAdapter + ): + """ + Test Case 3: String form without parentheses auto-wrap. + + Covers: primary_key = "id, dt" auto-conversion (multi-column string) + """ + db_name = "sr_e2e_auto_wrap_db" + table_name = f"{db_name}.sr_auto_wrap_table" + + model_sql = """ + MODEL ( + name test.string_no_paren, + kind FULL, + dialect starrocks, + columns ( + order_id BIGINT, + event_date DATE + ), + physical_properties ( + primary_key = "order_id, event_date", -- No parentheses, auto-wrapped + distributed_by = 'HASH(order_id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 3 DDL:\n{ddl}") + + # Precise assertion: verify exact PRIMARY KEY columns + import re + + pk_match = re.search(r"PRIMARY KEY\s*\(([^)]+)\)", ddl) + assert pk_match, "PRIMARY KEY clause not found" + pk_clause = pk_match.group(1) + assert ( + "order_id" in pk_clause and "event_date" in pk_clause + ), f"Expected both order_id and event_date in PRIMARY KEY, got {pk_clause}" + + # Verify distributed_by with exact columns + 
dist_match = re.search(r"DISTRIBUTED BY HASH\s*\(([^)]+)\)", ddl) + assert dist_match, "DISTRIBUTED BY HASH clause not found" + assert "order_id" in dist_match.group( + 1 + ), f"Expected HASH(order_id), got HASH({dist_match.group(1)})" + assert "BUCKETS 10" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 4: Structured Distribution (test_design.md Case 4) + # Covers: kind=HASH (unquoted), kind=RANDOM + # ======================================== + + def test_e2e_distribution_structured_hash( + self, starrocks_adapter: StarRocksEngineAdapter + ): + """Test Case 4A: Structured HASH distribution with unquoted kind.""" + db_name = "sr_e2e_dist_hash_db" + table_name = f"{db_name}.sr_dist_hash_table" + + model_sql = """ + MODEL ( + name test.dist_hash_structured, + kind FULL, + dialect starrocks, + columns ( + customer_id INT, + region VARCHAR(50) + ), + physical_properties ( + distributed_by = (kind=HASH, expressions=(customer_id, region), buckets=16), + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 4A DDL:\n{ddl}") + + # Precise assertions + import re + + assert "DISTRIBUTED BY HASH" in ddl + dist_match = re.search(r"DISTRIBUTED BY HASH\s*\(([^)]+)\)", ddl) + assert dist_match, "DISTRIBUTED BY HASH clause not found" + assert "customer_id" in dist_match.group( + 1 + ) and "region" in dist_match.group(1) + assert "BUCKETS 16" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_e2e_distribution_structured_random( + self, starrocks_adapter: StarRocksEngineAdapter + ): + """Test Case 4B: Structured RANDOM distribution.""" 
+ db_name = "sr_e2e_dist_random_db" + table_name = f"{db_name}.sr_dist_random_table" + + model_sql = """ + MODEL ( + name test.dist_random_structured, + kind FULL, + dialect starrocks, + columns ( + log_id BIGINT, + event_time DATETIME, + message VARCHAR(500) + ), + physical_properties ( + distributed_by = (kind=RANDOM, buckets=10), + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 4B DDL:\n{ddl}") + + assert "DISTRIBUTED BY RANDOM" in ddl + assert "BUCKETS 10" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 5: Partition with RANGE (test_design.md Case 5) + # Covers: partitioned_by RANGE, partitions tuple + # ======================================== + + def test_e2e_partition_range(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 5: RANGE partition with multiple partition definitions.""" + db_name = "sr_e2e_part_range_db" + table_name = f"{db_name}.sr_part_range_table" + + model_sql = """ + MODEL ( + name test.partition_range, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + year smallint, + month smallint + ), + physical_properties ( + primary_key = (id, year, month), + partition_by = RANGE(year, month), + partitions = ( + 'PARTITION p202401 VALUES LESS THAN ("2024", "02")', + 'PARTITION p202402 VALUES LESS THAN ("2024", "03")', + 'PARTITION p202403 VALUES LESS THAN ("2024", "04")' + ), + distributed_by = 'HASH(id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + 
starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 5 DDL:\n{ddl}") + + # Precise assertions + import re + + assert "PARTITION BY RANGE" in ddl + # Verify partition columns + part_match = re.search(r"PARTITION BY RANGE\s*\(([^)]+)\)", ddl) + assert part_match, "PARTITION BY RANGE clause not found" + assert "year" in part_match.group(1) and "month" in part_match.group(1) + # Verify partition definitions + assert "p202401" in ddl and "p202402" in ddl and "p202403" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 6: Partition with LIST (test_design.md Case 6) + # Covers: LIST partition with partitions values + # ======================================== + + def test_e2e_partition_list(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 6: LIST partition.""" + db_name = "sr_e2e_part_list_db" + table_name = f"{db_name}.sr_part_list_table" + + model_sql = """ + MODEL ( + name test.partition_list, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + region VARCHAR(20) + ), + physical_properties ( + primary_key = (id, region), + partition_by = LIST(region), -- can't use partitioned_by + partitions = ( + 'PARTITION p_cn VALUES IN ("cn", "tw", "hk")', + 'PARTITION p_us VALUES IN ("us", "ca")' + ), + distributed_by = 'HASH(id) BUCKETS 8', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 6 DDL:\n{ddl}") + + # Precise assertions + import re + + assert "PARTITION BY LIST" in ddl + # Verify partition column + part_match = 
re.search(r"PARTITION BY LIST\s*\(([^)]+)\)", ddl) + assert part_match, "PARTITION BY LIST clause not found" + assert "region" in part_match.group(1) + # Verify partition definitions + assert "p_cn" in ddl and "p_us" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 7: Other Key Types (test_design.md Case 7) + # Covers: duplicate_key, unique_key, aggregate_key + # ======================================== + + def test_e2e_key_type_duplicate(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 7A: DUPLICATE KEY.""" + db_name = "sr_e2e_dup_key_db" + table_name = f"{db_name}.sr_dup_key_table" + + model_sql = """ + MODEL ( + name test.duplicate_key_model, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + dt DATE + ), + physical_properties ( + duplicate_key = (id, dt), + distributed_by = 'HASH(id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 7A DDL:\n{ddl}") + + # Verify DUPLICATE KEY with exact columns + import re + + dup_match = re.search(r"DUPLICATE KEY\s*\(([^)]+)\)", ddl) + assert dup_match, "DUPLICATE KEY clause not found" + assert "id" in dup_match.group(1) and "dt" in dup_match.group( + 1 + ), f"Expected DUPLICATE KEY(id, dt), got DUPLICATE KEY({dup_match.group(1)})" + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_e2e_key_type_unique(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 7B: UNIQUE KEY.""" + db_name = "sr_e2e_uniq_key_db" + table_name = f"{db_name}.sr_uniq_key_table" + + model_sql = """ + MODEL ( + name test.unique_key_model, + kind FULL, + dialect 
starrocks, + columns ( + id BIGINT, + dt DATE + ), + physical_properties ( + unique_key = (id, dt), + distributed_by = 'HASH(id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 7B DDL:\n{ddl}") + + assert "UNIQUE KEY" in ddl, "UNIQUE KEY missing" + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_e2e_key_type_aggregate(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 7C: AGGREGATE KEY - should raise exception (unsupported).""" + db_name = "sr_e2e_agg_key_db" + table_name = f"{db_name}.sr_agg_key_table" + + model_sql = """ + MODEL ( + name test.aggregate_key_model, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + dt DATE + ), + physical_properties ( + aggregate_key = (id, dt), + distributed_by = 'HASH(id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + from sqlmesh.utils.errors import SQLMeshError + import pytest + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + + # Expect SQLMeshError to be raised for unsupported AGGREGATE KEY + with pytest.raises(SQLMeshError, match="AGGREGATE KEY.*not supported"): + starrocks_adapter.create_table(table_name, **params) + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Final: Comprehensive Test (all properties combined) + # ======================================== + + def test_e2e_comprehensive(self, starrocks_adapter: StarRocksEngineAdapter): + """Final: Comprehensive test with ALL property types combined.""" + db_name = "sr_e2e_comprehensive_db" + 
table_name = f"{db_name}.sr_comprehensive_table" + + model_sql = """ + MODEL ( + name test.comprehensive_model, + kind FULL, + dialect starrocks, + description 'Comprehensive test table with all properties', + columns ( + order_id BIGINT, + event_date DATE, + customer_id INT, + amount DECIMAL(18,2), + status VARCHAR(20) + ), + partitioned_by (event_date), + clustered_by (order_id, event_date), + physical_properties ( + primary_key = (order_id, event_date), + distributed_by = (kind=HASH, expressions=order_id, buckets=8), + replication_num = '1', + storage_medium = 'HDD' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Comprehensive DDL:\n{ddl}") + + # Precise assertions for all major clauses + import re + + # Verify PRIMARY KEY + pk_match = re.search(r"PRIMARY KEY\s*\(([^)]+)\)", ddl) + assert pk_match, "PRIMARY KEY clause not found" + assert "order_id" in pk_match.group(1) and "event_date" in pk_match.group(1) + + # Verify PARTITION BY + assert "PARTITION BY" in ddl + # Verify exact partition column + part_match = re.search(r"PARTITION BY[^(]*\(([^)]+)\)", ddl) + assert part_match, "PARTITION BY clause not found" + part_cols = part_match.group(1) + assert ( + "event_date" in part_cols + ), f"Expected event_date in PARTITION BY, got {part_cols}" + + # Verify DISTRIBUTED BY + assert "DISTRIBUTED BY HASH" in ddl + assert "BUCKETS 8" in ddl + + # Verify ORDER BY + order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) + assert order_match, "ORDER BY clause not found" + assert "order_id" in order_match.group( + 1 + ) and "event_date" in order_match.group(1) + + # Verify PROPERTIES + assert "replication_num" in ddl + + # Functional test + starrocks_adapter.execute( + f"INSERT INTO 
{table_name} " + f"(order_id, event_date, customer_id, amount, status) " + f"VALUES (1001, '2024-01-15', 100, 1234.56, 'completed')" + ) + + result = starrocks_adapter.fetchone( + f"SELECT order_id, customer_id FROM {table_name} WHERE order_id = 1001" + ) + assert result is not None, "INSERT/SELECT failed" + assert result[0] == 1001, "order_id mismatch" + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Quote Character Handling Test + # Tests single quotes vs double quotes in MODEL parsing + # ======================================== + + def test_e2e_quote_character_handling( + self, starrocks_adapter: StarRocksEngineAdapter + ): + """ + Test Case: Quote Character Handling (Single vs Double Quotes). + + This test verifies that MODEL parsing correctly handles different quote types: + - Single quotes 'value' → Literal(is_string=True) ✓ + - Double quotes "value" → Column(quoted=True) (parser quirk, but we handle it) ✓ + - Bare identifiers → proper parsing + + We test this by using different quote forms in MODEL physical_properties + and verifying that the final DDL is correct. + + Quote Behavior: + =============== + In MySQL/StarRocks: + - Backtick ` : identifier quote + - Single quote ': string literal + - Double quote ": string literal (default) OR identifier (ANSI_QUOTES mode) + + In SQLMesh MODEL parsing: + - Single quotes 'value' → exp.Literal (correct) + - Double quotes "value" → exp.Column(quoted=True) (inconsistent with SQL, but handled) + + This test ensures our workaround in ensure_parenthesized() works correctly. 
+ """ + db_name = "sr_e2e_quote_handling_db" + table_name = f"{db_name}.sr_quote_test_table" + + # Test with different quote forms in MODEL + model_sql = """ + MODEL ( + name test.quote_handling_model, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + dt DATE, + region VARCHAR(50), + customer_id INT + ), + physical_properties ( + -- Single quotes (correct way) - parses to Literal + primary_key = 'id, dt, region', + + partition_by = "date_trunc('day', dt), region", + + -- Double quotes (parser quirk) - parses to Column(quoted=True) + -- But our ensure_parenthesized handles this + order_by = "id, region", + + -- Structured form with single-quoted string + distributed_by = 'HASH(id) BUCKETS 8', + + -- Generic properties with single quotes + replication_num = '1', + -- storage_medium = "HDD" -- not valid in shared-data cluster + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + # Parse MODEL and extract parameters (this is where quote handling happens) + params = self._parse_model_and_get_all_params(model_sql) + + # Log parsed parameters for debugging + logger.info(f"Parsed physical_properties: {params['table_properties']}") + for key, value in params["table_properties"].items(): + logger.info(f" {key}: {type(value).__name__} = {value}") + + # Create table with parsed parameters + starrocks_adapter.create_table(table_name, **params) + + # Verify via SHOW CREATE TABLE + show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Quote Handling Test DDL:\n{ddl}") + + # Precise assertions + import re + + # 1. Verify PRIMARY KEY (from single-quoted string 'id, dt') + pk_match = re.search(r"PRIMARY KEY\s*\(([^)]+)\)", ddl) + assert pk_match, "PRIMARY KEY clause not found" + pk_cols = pk_match.group(1) + assert "id" in pk_cols and "dt" in pk_cols, ( + f"Expected PRIMARY KEY (id, dt), got {pk_cols}. 
" + f"Single-quoted string 'id, dt' was not correctly parsed!" + ) + + # 2. Verify ORDER BY (from double-quoted string \"id, region\") + # This tests our Column(quoted=True) workaround + order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) + assert order_match, "ORDER BY clause not found" + order_cols = order_match.group(1) + assert "id" in order_cols and "region" in order_cols, ( + f"Expected ORDER BY (id, region), got {order_cols}. " + f'Double-quoted string "id, region" was not correctly handled!' + ) + + # 3. Verify DISTRIBUTED BY (from single-quoted string) + assert "DISTRIBUTED BY HASH" in ddl, "DISTRIBUTED BY clause not found" + assert "customer_id" in ddl, "customer_id not found in DISTRIBUTED BY" + assert "BUCKETS 8" in ddl, "BUCKETS not found in DISTRIBUTED BY" + + # 4. Verify PROPERTIES (generic properties with single quotes) + assert "replication_num" in ddl, "replication_num not found in PROPERTIES" + # assert "storage_medium" in ddl or "HDD" in ddl, "storage_medium not found in PROPERTIES" + + # Functional test: Verify table actually works + starrocks_adapter.execute( + f"INSERT INTO {table_name} " + f"(id, dt, region, customer_id) " + f"VALUES (100, '2024-01-01', 'US', 1001)" + ) + + result = starrocks_adapter.fetchone( + f"SELECT id, region, customer_id FROM {table_name} WHERE id = 100" + ) + assert result is not None, "INSERT/SELECT failed" + assert result == (100, "US", 1001), f"Data mismatch: {result}" + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + +# ==================== StarRocks Native SQL Capabilities ==================== + + +class TestStarRocksAbility: + """ + Test StarRocks native SQL capabilities and limitations. + + This test class validates StarRocks database features by executing + raw SQL statements directly, without going through SQLMesh abstraction layers. 
+ + Purpose: + - Document which SQL features are supported + - Verify expected failures for unsupported operations + - Guide adapter implementation decisions + + Note: Tests marked with @pytest.mark.xfail are EXPECTED to fail. + """ + + @pytest.fixture(scope="class") + def test_tables( + self, starrocks_adapter: StarRocksEngineAdapter + ) -> t.Dict[str, str]: + """ + Pre-create tables of different types for testing. + + Returns: + Dict mapping table type to fully qualified table name + """ + db_name = "sr_ability_test" + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + tables = {} + + # 1. PRIMARY KEY table + # Note: StarRocks PRIMARY KEY tables support complex DELETE operations (BETWEEN, subqueries, etc.) + pk_table = f"{db_name}.pk_table" + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {pk_table} ( + id INT, + dt DATE, + name STRING, + status STRING + ) PRIMARY KEY (id, dt) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + # Verify table creation + result = starrocks_adapter.fetchone( + f"SELECT COUNT(*) FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'pk_table'" + ) + assert result[0] == 1, f"PRIMARY KEY table {pk_table} creation failed" + tables["primary_key"] = pk_table + + # 2. DUPLICATE KEY table + dup_table = f"{db_name}.dup_table" + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {dup_table} ( + id INT, + dt DATE, + name STRING, + status STRING + ) DUPLICATE KEY (id, dt) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + # Verify table creation + result = starrocks_adapter.fetchone( + f"SELECT COUNT(*) FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'dup_table'" + ) + assert result[0] == 1, f"DUPLICATE KEY table {dup_table} creation failed" + tables["duplicate_key"] = dup_table + + # 3. 
UNIQUE KEY table + unique_table = f"{db_name}.unique_table" + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {unique_table} ( + id INT, + dt DATE, + name STRING, + status STRING + ) UNIQUE KEY (id, dt) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + # Verify table creation + result = starrocks_adapter.fetchone( + f"SELECT COUNT(*) FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'unique_table'" + ) + assert result[0] == 1, f"UNIQUE KEY table {unique_table} creation failed" + tables["unique_key"] = unique_table + + yield tables + + # Cleanup + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ==================== Schema Operations ==================== + + @pytest.mark.parametrize("sql_keyword", ["SCHEMA", "DATABASE"]) + def test_create_drop_keyword_support( + self, starrocks_adapter: StarRocksEngineAdapter, sql_keyword: str + ): + """ + Test both CREATE SCHEMA and CREATE DATABASE syntax. + + Expected: Both keywords should work (they are synonyms in StarRocks) + """ + test_name = f"sr_ability_{sql_keyword.lower()}" + + try: + # CREATE + starrocks_adapter.execute(f"CREATE {sql_keyword} IF NOT EXISTS {test_name}") + result = starrocks_adapter.fetchone( + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{test_name}'" + ) + assert result is not None, f"CREATE {sql_keyword} failed" + + # DROP + starrocks_adapter.execute(f"DROP {sql_keyword} IF EXISTS {test_name}") + result = starrocks_adapter.fetchone( + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{test_name}'" + ) + assert result is None, f"DROP {sql_keyword} failed" + + finally: + starrocks_adapter.execute(f"DROP {sql_keyword} IF EXISTS {test_name}") + + # ==================== DML Capabilities ==================== + + def test_insert_select_supported(self, starrocks_adapter: StarRocksEngineAdapter): + """Basic INSERT/SELECT support (raw SQL capability check).""" + db_name = 
"sr_ability_insert_select" + table_name = f"{db_name}.t" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {table_name} ( + id INT, + name VARCHAR(100) + ) PRIMARY KEY (id) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + starrocks_adapter.execute( + f"INSERT INTO {table_name} (id, name) VALUES (1, 'Alice'), (2, 'Bob')" + ) + rows = starrocks_adapter.fetchall(f"SELECT id, name FROM {table_name} ORDER BY id") + assert list(rows) == [(1, "Alice"), (2, "Bob")], f"Data mismatch: {rows}" + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_update_supported(self, starrocks_adapter: StarRocksEngineAdapter): + """Basic UPDATE support (raw SQL capability check).""" + db_name = "sr_ability_update" + table_name = f"{db_name}.t" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {table_name} ( + id INT, + name VARCHAR(100) + ) PRIMARY KEY (id) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + starrocks_adapter.execute(f"INSERT INTO {table_name} (id, name) VALUES (1, 'Alice')") + starrocks_adapter.execute( + f"UPDATE {table_name} SET name = 'Alice Updated' WHERE id = 1" + ) + result = starrocks_adapter.fetchone(f"SELECT name FROM {table_name} WHERE id = 1") + assert result == ("Alice Updated",), f"UPDATE failed: {result}" + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ==================== DELETE Operations - Success Cases ==================== + + @pytest.mark.parametrize( + "table_type,delete_clause,expected_remaining", + [ + # PRIMARY KEY table - full support + ("primary_key", "WHERE id = 1", 2), + ("primary_key", "WHERE dt BETWEEN '2024-01-01' AND '2024-06-30'", 1), + ( + "primary_key", + "WHERE id IN (SELECT id FROM {table} WHERE status = 'deleted')", + 2, + ), + ("primary_key", "WHERE TRUE", 0), + # PRIMARY KEY with USING 
(JOIN delete) + ( + "primary_key", + "USING {table} t2 WHERE {table}.id = t2.id AND t2.status = 'deleted'", + 2, + ), + # DUPLICATE/UNIQUE KEY - only simple WHERE + ("duplicate_key", "WHERE id = 1", 2), + ("unique_key", "WHERE id = 1", 2), + ], + ids=[ + "pk_simple_where", + "pk_between", + "pk_subquery", + "pk_where_true", + "pk_using_join", + "dup_simple_where", + "unique_simple_where", + ], + ) + def test_delete_supported_syntax( + self, + starrocks_adapter: StarRocksEngineAdapter, + test_tables: t.Dict[str, str], + table_type: str, + delete_clause: str, + expected_remaining: int, + ): + """ + Test DELETE operations that should succeed. + + Expected: DELETE succeeds and leaves expected number of rows + """ + table_name = test_tables[table_type] + + # Prepare test data for this specific test (better isolation) + # All tables have the same column structure: (id, dt, name, status) + test_data = """ + (1, '2024-01-15', 'Alice', 'active'), + (2, '2024-06-10', 'Bob', 'deleted'), + (3, '2024-12-05', 'Charlie', 'active') + """ + starrocks_adapter.execute(f"TRUNCATE TABLE {table_name}") + starrocks_adapter.execute(f"INSERT INTO {table_name} VALUES {test_data}") + + # Format delete clause (for subquery/using with table reference) + delete_sql = ( + f"DELETE FROM {table_name} {delete_clause.format(table=table_name)}" + ) + + # Debug: Log the SQL before execution + logger.info(f"Executing DELETE SQL: {delete_sql}") + + # Execute delete + starrocks_adapter.execute(delete_sql) + + # Verify result + count = starrocks_adapter.fetchone(f"SELECT COUNT(*) FROM {table_name}")[0] + logger.info( + f"After DELETE: {count} rows remaining (expected {expected_remaining})" + ) + assert ( + count == expected_remaining + ), f"Expected {expected_remaining} rows, got {count} for {table_type} with {delete_clause}" + + # ==================== DELETE Operations - Failure Cases ==================== + + syntax_error = "not supported|syntax error|getting analyzing error" + + 
@pytest.mark.parametrize( + "table_type,delete_clause,error_pattern", + [ + # DUPLICATE KEY - unsupported syntax + ( + "duplicate_key", + "WHERE dt BETWEEN '2024-01-01' AND '2024-12-31'", + syntax_error, + ), + ( + "duplicate_key", + "WHERE id IN (SELECT id FROM {table} WHERE status = 'deleted')", + syntax_error, + ), + ("duplicate_key", "WHERE TRUE", syntax_error), + # UNIQUE KEY - unsupported syntax + ( + "unique_key", + "WHERE dt BETWEEN '2024-01-01' AND '2024-12-31'", + syntax_error, + ), + ("unique_key", "WHERE id IN (SELECT id FROM {table})", syntax_error), + ], + ids=[ + "dup_between_unsupported", + "dup_subquery_unsupported", + "dup_where_true_unsupported", + "unique_between_unsupported", + "unique_subquery_unsupported", + ], + ) + def test_delete_unsupported_syntax( + self, + starrocks_adapter: StarRocksEngineAdapter, + test_tables: t.Dict[str, str], + table_type: str, + delete_clause: str, + error_pattern: str, + ): + """ + Test DELETE operations that should fail on non-PRIMARY KEY tables. + + Expected: DELETE fails with specific error message. 
+ """ + table_name = test_tables[table_type] + delete_sql = ( + f"DELETE FROM {table_name} {delete_clause.format(table=table_name)}" + ) + + # This should raise an exception + with pytest.raises(Exception) as exc_info: + starrocks_adapter.execute(delete_sql) + + # Verify error message matches expected pattern + import re + + error_msg = str(exc_info.value).lower() + assert re.search( + error_pattern, error_msg + ), f"Expected error pattern '{error_pattern}', got: {exc_info.value}" + + # ==================== COMMENT Syntax Tests ==================== + + @pytest.mark.parametrize( + "comment_type,sql_template", + [ + # Table comment variants + ("table_standard", "ALTER TABLE {table} COMMENT = '{comment}'"), + # Failed without `=` + # ("table_standard", "ALTER TABLE {table} COMMENT '{comment}'"), + # No MODIFY keyword + # ("table_modify", 'ALTER TABLE {table} MODIFY COMMENT "{comment}"'), + # Column comment variants + ( + "column_no_type", + "ALTER TABLE {table} MODIFY COLUMN {column} COMMENT '{comment}'", + ), + # it will take some time to change the column type + # ("column_with_type", "ALTER TABLE {table} MODIFY COLUMN {column} BIGINT COMMENT '{comment}'"), + ], + ids=[ + "table_comment_standard", + # "table_comment_standard_without_equal", # FAIL + # "table_comment_modify", # FAIL + "column_comment_no_type", + # "column_comment_with_type", + ], + ) + def test_comment_syntax_variants( + self, + starrocks_adapter: StarRocksEngineAdapter, + comment_type: str, + sql_template: str, + ): + """ + Test different COMMENT syntax variations to determine StarRocks support. + + Purpose: Guide whether we need to override comment methods in adapter + """ + db_name = "sr_ability_comment" + table_name = f"{db_name}.test_comment" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} ( + id INT, + col1 INT + ) + DUPLICATE KEY (id) -- key columns can't be changed. 
+ DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Generate SQL based on template + if "table" in comment_type: + sql = sql_template.format( + table=table_name, comment=f"test {comment_type}" + ) + else: # column + sql = sql_template.format( + table=table_name, column="col1", comment=f"test {comment_type}" + ) + + # Try to execute + try: + starrocks_adapter.execute(sql) + + # Verify comment was set + if "table" in comment_type: + result = starrocks_adapter.fetchone( + f"SELECT TABLE_COMMENT FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_comment'" + )[0] + assert ( + f"test {comment_type}" in result + ), f"Comment not set correctly for {comment_type}" + else: # column + result_row = starrocks_adapter.fetchone( + f"SELECT COLUMN_NAME, COLUMN_COMMENT FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_comment' " + f"AND COLUMN_NAME = 'col1'" + ) + logger.info(f"Column comment: {result_row}") + result = result_row[1] + assert ( + f"test {comment_type}" in result + ), f"Comment not set correctly for {comment_type}" + + logger.info(f"✅ {comment_type}: SUPPORTED") + + except Exception as e: + logger.warning(f"❌ {comment_type}: NOT SUPPORTED - {e}") + # Re-raise for test failure + raise + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ==================== Quote Type Tests ==================== + + @pytest.mark.parametrize( + "quote_type,comment_value", + [ + ("single", "single quotes"), + ("double", "double quotes"), + ("escaped_single", "It\\'s a test"), + ("escaped_double", 'Say \\"hello\\"'), + ], + ids=["single_quotes", "double_quotes", "escaped_single", "escaped_double"], + ) + def test_comment_quote_types( + self, + starrocks_adapter: StarRocksEngineAdapter, + quote_type: str, + comment_value: str, + ): + """ + Test different quote types in COMMENT clauses. 
+ + Purpose: Determine which quote types StarRocks accepts + """ + db_name = "sr_ability_quotes" + table_name = f"{db_name}.test_quotes" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} (id INT) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Build SQL with appropriate quotes + if "single" in quote_type: + sql = f"ALTER TABLE {table_name} COMMENT = '{comment_value}'" + else: # double + sql = f'ALTER TABLE {table_name} COMMENT = "{comment_value}"' + + starrocks_adapter.execute(sql) + logger.info(f"✅ {quote_type}: SUPPORTED") + + except Exception as e: + logger.warning(f"❌ {quote_type}: NOT SUPPORTED - {e}") + raise + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_comment_in_create_table(self, starrocks_adapter: StarRocksEngineAdapter): + """ + Test COMMENT clauses in CREATE TABLE statement. + + Expected: Verify comments are registered during table creation + """ + db_name = "sr_ability_create_comment" + table_name = f"{db_name}.test_create_comment" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + # Create table with comments + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} ( + id INT COMMENT 'id column', + name VARCHAR(100) COMMENT 'name column' + ) + PRIMARY KEY (id) + COMMENT 'test table' + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Verify table comment + table_comment = starrocks_adapter.fetchone( + f"SELECT TABLE_COMMENT FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_create_comment'" + )[0] + assert ( + table_comment == "test table" + ), f"Table comment mismatch: {table_comment}" + + # Verify column comments + column_comments = {} + results = starrocks_adapter.fetchall( + f"SELECT COLUMN_NAME, COLUMN_COMMENT FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_create_comment'" + ) + for 
col_name, col_comment in results: + if col_comment: # Skip empty comments + column_comments[col_name] = col_comment + + assert ( + column_comments.get("id") == "id column" + ), f"Column comment mismatch: {column_comments}" + assert ( + column_comments.get("name") == "name column" + ), f"Column comment mismatch: {column_comments}" + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + +class TestCommentMethods: + """ + Test _build_create_comment_table_exp and _build_create_comment_column_exp methods. + + These methods are used to generate ALTER TABLE SQL for modifying comments. + Although StarRocks uses COMMENT_CREATION_TABLE = IN_SCHEMA_DEF_CTAS (comments + are included in CREATE TABLE), these methods may be used for: + - Modifying existing table comments + - View comments (depending on COMMENT_CREATION_VIEW) + - Future ALTER TABLE support + """ + + def test_build_create_comment_table_exp( + self, starrocks_adapter: StarRocksEngineAdapter + ): + """ + Test _build_create_comment_table_exp generates correct ALTER TABLE COMMENT SQL. + + Verifies: + 1. Method generates correct SQL syntax + 2. SQL can be executed successfully + 3. 
Comment is actually updated in database + """ + db_name = "sr_test_comment_table" + table_name = f"{db_name}.test_table" + + try: + # Setup: Create schema and table + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} ( + id INT, + name VARCHAR(100) + ) + PRIMARY KEY (id) + COMMENT 'initial comment' + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Test: Use _build_create_comment_table_exp to generate SQL + table_expr = exp.to_table(table_name) + new_comment = "Updated table comment via method" + comment_sql = starrocks_adapter._build_create_comment_table_exp( + table=table_expr, table_comment=new_comment, table_kind="TABLE" + ) + + # Verify: SQL format is correct + assert "ALTER TABLE" in comment_sql, f"Invalid SQL format: {comment_sql}" + assert ( + "COMMENT =" in comment_sql + ), f"Missing COMMENT = in SQL: {comment_sql}" + assert new_comment in comment_sql, f"Comment not in SQL: {comment_sql}" + + # Execute the generated SQL + starrocks_adapter.execute(comment_sql) + + # Verify: Comment was actually updated + result = starrocks_adapter.fetchone( + f"SELECT TABLE_COMMENT FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_table'" + ) + assert result is not None, "Table not found after comment update" + assert ( + result[0] == new_comment + ), f"Comment not updated. Expected: {new_comment}, Got: {result[0]}" + + logger.info("✅ _build_create_comment_table_exp generates valid SQL") + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_build_create_comment_column_exp( + self, starrocks_adapter: StarRocksEngineAdapter + ): + """ + Test _build_create_comment_column_exp generates correct ALTER TABLE MODIFY COLUMN SQL. + + Verifies: + 1. Method generates correct SQL with column type + 2. SQL can be executed successfully + 3. Column comment is actually updated in database + 4. 
Column type is preserved (not changed) + """ + db_name = "sr_test_comment_column" + table_name = f"{db_name}.test_table" + + try: + # Setup: Create schema and table + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} ( + id INT COMMENT 'initial id comment', + name VARCHAR(100) COMMENT 'initial name comment', + amount DECIMAL(10, 2) + ) + PRIMARY KEY (id) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Test: Use _build_create_comment_column_exp to generate SQL + table_expr = exp.to_table(table_name) + new_comment = "Updated column comment via method" + comment_sql = starrocks_adapter._build_create_comment_column_exp( + table=table_expr, + column_name="name", + column_comment=new_comment, + table_kind="TABLE", + ) + + # Verify: SQL format is correct + assert "ALTER TABLE" in comment_sql, f"Invalid SQL format: {comment_sql}" + assert ( + "MODIFY COLUMN" in comment_sql + ), f"Missing MODIFY COLUMN in SQL: {comment_sql}" + assert "COMMENT" in comment_sql, f"Missing COMMENT in SQL: {comment_sql}" + assert new_comment in comment_sql, f"Comment not in SQL: {comment_sql}" + + # Execute the generated SQL + starrocks_adapter.execute(comment_sql) + + # Verify: Column comment was actually updated + result = starrocks_adapter.fetchone( + f"SELECT COLUMN_TYPE, COLUMN_COMMENT FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_table' AND COLUMN_NAME = 'name'" + ) + assert result is not None, "Column not found after comment update" + column_type, column_comment = result + assert ( + column_comment == new_comment + ), f"Comment not updated. 
Expected: {new_comment}, Got: {column_comment}" + assert ( + "varchar(100)" in column_type.lower() + ), f"Column type changed unexpectedly: {column_type}" + + logger.info( + "✅ _build_create_comment_column_exp generates valid SQL with correct type" + ) + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py new file mode 100644 index 0000000000..5a81795ef4 --- /dev/null +++ b/tests/core/engine_adapter/test_starrocks.py @@ -0,0 +1,1823 @@ +"""Tests for StarRocks Engine Adapter + +This test suite covers the StarRocks-specific functionality of the engine adapter, +including schema operations, table operations, and StarRocks-specific table properties. + +Test classes are organized by functionality (following the standard order): +- TestSchemaOperations: Schema/Database operations +- TestTableOperations: Basic table operations +- TestKeyPropertyBuilding: Table key types (primary_key, duplicate_key, unique_key, aggregate_key) +- TestPartitionPropertyBuilding: Partition (partitioned_by, partitions) +- TestDistributionPropertyBuilding: Distribution (distributed_by) +- TestOrderByPropertyBuilding: Order By (order_by, clustered_by) +- TestCommentPropertyBuilding: Comments (table and column) +- TestGenericPropertyBuilding: Generic properties (replication_num, etc.) +- TestComprehensive: Comprehensive tests with all features combined + +Unit tests use @pytest.mark.parametrize to systematically cover all value forms. 
+""" + +import typing as t + +import pytest +from sqlglot import expressions as exp +from sqlglot import parse, parse_one +from pytest_mock.plugin import MockerFixture +from sqlmesh.core.engine_adapter.shared import DataObjectType +from sqlmesh.utils.errors import SQLMeshError + +from tests.core.engine_adapter import to_sql_calls +from sqlmesh.core.engine_adapter.starrocks import StarRocksEngineAdapter +from sqlmesh.core.dialect import parse +from sqlmesh.core.model import load_sql_based_model, SqlModel + +pytestmark = [pytest.mark.starrocks, pytest.mark.engine] + + +def _load_sql_model(model_sql: str) -> SqlModel: + """Parse StarRocks MODEL SQL into a SqlModel instance.""" + expressions = parse(model_sql, default_dialect="starrocks") + return t.cast(SqlModel, load_sql_based_model(expressions)) + + +# ============================================================================= +# Schema Operations +# ============================================================================= +class TestSchemaOperations: + """Tests for schema (database) operations.""" + + def test_create_schema( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE DATABASE statement generation. + + A SQLMesh schema maps to a StarRocks database (MySQL-style); the emitted DDL keeps the SCHEMA keyword. 
+ """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_schema("test_schema") + + assert to_sql_calls(adapter) == [ + "CREATE SCHEMA IF NOT EXISTS `test_schema`", + ] + + def test_create_schema_without_if_exists( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE DATABASE without IF NOT EXISTS clause.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_schema("test_schema", ignore_if_exists=False) + + assert to_sql_calls(adapter) == [ + "CREATE SCHEMA `test_schema`", + ] + + def test_drop_schema( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test DROP DATABASE statement generation.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.drop_schema("test_schema") + adapter.drop_schema("test_schema", ignore_if_not_exists=False) + + assert to_sql_calls(adapter) == [ + "DROP SCHEMA IF EXISTS `test_schema`", + "DROP SCHEMA `test_schema`", + ] + + +# ============================================================================= +# Data Object Query (MV vs VIEW) +# ============================================================================= +class TestDataObjectQuery: + def test_get_data_object_materialized_view_is_distinguished_from_view( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + mocker: MockerFixture, + ) -> None: + """ + StarRocks may report materialized views as TABLE_TYPE='VIEW' in information_schema.tables. + Ensure StarRocksEngineAdapter upgrades MV objects using information_schema.materialized_views. 
+ """ + import pandas as pd + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter, patch_get_data_objects=False) + + # information_schema.tables output (MV appears as 'view') + # fetchdf is called twice: + # 1) information_schema.tables + # 2) information_schema.materialized_views + tables_df = pd.DataFrame( + [ + {"schema_name": "test_db", "name": "mv1", "type": "view"}, + {"schema_name": "test_db", "name": "mv2", "type": "view"}, + {"schema_name": "test_db", "name": "v1", "type": "view"}, + {"schema_name": "test_db", "name": "t1", "type": "table"}, + ] + ) + mv_df = pd.DataFrame( + [ + {"schema_name": "test_db", "name": "mv1"}, + {"schema_name": "test_db", "name": "mv2"}, + ] + ) + + known_names = ["mv1", "mv2", "v1", "t1"] + + def fetchdf_side_effect(query: exp.Expression, *_: t.Any, **__: t.Any): + query_sql = query.sql(dialect="starrocks").lower() + requested = [ + name for name in known_names if f"'{name}'" in query_sql or f"`{name}`" in query_sql + ] + if "information_schema.materialized_views" in query_sql: + df = mv_df + else: + df = tables_df + if requested: + mask = df["name"].str.lower().isin(requested) + return df[mask].reset_index(drop=True) + return df.reset_index(drop=True) + + adapter.fetchdf = mocker.Mock(side_effect=fetchdf_side_effect) + + mv1 = adapter.get_data_object("test_db.mv1") + assert mv1 is not None + assert mv1.type == DataObjectType.MATERIALIZED_VIEW + + v1 = adapter.get_data_object("test_db.v1") + assert v1 is not None + assert v1.type == DataObjectType.VIEW + + mv2_objects = adapter.get_data_objects(schema_name="test_db", object_names={"mv2"}) + assert len(mv2_objects) == 1 + assert mv2_objects[0].name.lower() == "mv2" + assert mv2_objects[0].type == DataObjectType.MATERIALIZED_VIEW + + +# ============================================================================= +# Basic Table Operations +# ============================================================================= +class TestTableOperations: + """Tests for basic 
table operations.""" + + def test_create_table( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test basic CREATE TABLE statement generation.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + "test_table", + target_columns_to_types={ + "a": exp.DataType.build("INT"), + "b": exp.DataType.build("VARCHAR(100)"), + }, + ) + + sql = to_sql_calls(adapter)[0] + assert "CREATE TABLE IF NOT EXISTS `test_table`" in sql + assert "`a` INT" in sql + assert "`b` VARCHAR(100)" in sql + + def test_create_table_like( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE TABLE LIKE statement.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table_like("target_table", "source_table") + assert to_sql_calls(adapter) == [ + "CREATE TABLE IF NOT EXISTS `target_table` LIKE `source_table`", + ] + + def test_create_table_like_exists_false( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE TABLE LIKE with exists=False (no IF NOT EXISTS).""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table_like("target_table", "source_table", exists=False) + assert to_sql_calls(adapter) == [ + "CREATE TABLE `target_table` LIKE `source_table`", + ] + + def test_create_table_like_qualified_names( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE TABLE LIKE with database-qualified names.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table_like("db.target_table", "db.source_table") + assert to_sql_calls(adapter) == [ + "CREATE TABLE IF NOT EXISTS `db`.`target_table` LIKE `db`.`source_table`", + ] + + def test_create_table_like_does_not_call_columns( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + mocker: MockerFixture, + ): + """ + StarRocks overrides 
_create_table_like to use native CREATE TABLE LIKE and should + not fall back to the base implementation (which calls columns(source)). + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + columns_mock = mocker.patch.object( + adapter, "columns", side_effect=AssertionError("columns() should not be called") + ) + + adapter.create_table_like("target_table", "source_table") + assert columns_mock.call_count == 0 + + def test_create_table_like_clears_cache( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + mocker: MockerFixture, + ): + """create_table_like should clear the data object cache for the target table.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + clear_cache = mocker.patch.object(adapter, "_clear_data_object_cache") + + adapter.create_table_like("target_table", "source_table") + clear_cache.assert_called_once_with("target_table") + + def test_rename_table( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test RENAME TABLE statement.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + # Test 1: Simple table names (no database qualifier) + adapter.rename_table("old_table", "new_table") + adapter.cursor.execute.assert_called_with( + "ALTER TABLE `old_table` RENAME `new_table`" + ) + + # Test 2: Database-qualified names - RENAME only uses table name + adapter.cursor.execute.reset_mock() + adapter.rename_table("db.old_table", "db.new_table") + # StarRocks RENAME clause requires unqualified table name + adapter.cursor.execute.assert_called_with( + "ALTER TABLE `db`.`old_table` RENAME `new_table`" + ) + + def test_delete_from( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test DELETE statement generation.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.delete_from(exp.to_table("test_table"), "id = 1") + + assert to_sql_calls(adapter) == [ + "DELETE FROM `test_table` WHERE 
`id` = 1", + ] + + def test_create_index( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE INDEX statement - StarRocks doesn't support standalone indexes.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_index("test_table", "idx_name", ("cola",)) + + # StarRocks skips index creation - verify no execute call was made + adapter.cursor.execute.assert_not_called() + + def test_create_view( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE VIEW statement generation.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view("test_view", parse_one("SELECT a FROM tbl")) + adapter.create_view("test_view", parse_one("SELECT a FROM tbl"), replace=False) + + assert to_sql_calls(adapter) == [ + "CREATE OR REPLACE VIEW `test_view` AS SELECT `a` FROM `tbl`", + "CREATE VIEW `test_view` AS SELECT `a` FROM `tbl`", + ] + + def test_create_view_with_security( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE VIEW with StarRocks SECURITY property.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view( + "test_view", + parse_one("SELECT a FROM tbl"), + replace=False, + view_properties={"security": exp.Var(this="INVOKER")}, + ) + + sql = to_sql_calls(adapter)[0] + assert "SECURITY INVOKER" in sql + + def test_create_materialized_view_replace_with_refresh_and_comments( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE MATERIALIZED VIEW generation (drop+create, refresh, comments, schema).""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view( + "test_mv", + parse_one("SELECT a FROM tbl"), + materialized=True, + target_columns_to_types={"a": exp.DataType.build("INT")}, + table_description="Test MV description", + column_descriptions={"a": "Column A description"}, + 
view_properties={ + "refresh_moment": exp.Var(this="IMMEDIATE"), + "refresh_scheme": exp.Literal.string( + "ASYNC START ('2025-01-01 00:00:00') EVERY (INTERVAL 5 MINUTE)" + ), + }, + ) + + calls = to_sql_calls(adapter) + assert calls[0] == "DROP MATERIALIZED VIEW IF EXISTS `test_mv`" + assert "CREATE MATERIALIZED VIEW" in calls[1] + assert "COMMENT 'Test MV description'" in calls[1] + assert "COMMENT 'Column A description'" in calls[1] + assert "REFRESH IMMEDIATE ASYNC" in calls[1] + assert "START ('2025-01-01 00:00:00')" in calls[1] + assert "EVERY (INTERVAL 5 MINUTE)" in calls[1] + + def test_delete_where_true_optimization( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test DELETE with WHERE TRUE optimization. + + WHERE TRUE is converted to TRUNCATE TABLE for better performance. + This works for all StarRocks table types and is semantically equivalent. + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + # Test WHERE TRUE + adapter.delete_from(exp.to_table("test_table"), exp.true()) + assert to_sql_calls(adapter) == [ + "TRUNCATE TABLE `test_table`", + ] + + adapter.cursor.reset_mock() + + # Test no WHERE clause (also uses TRUNCATE) + adapter.delete_from(exp.to_table("test_table"), None) + assert to_sql_calls(adapter) == [ + "TRUNCATE TABLE `test_table`", + ] + + +# ============================================================================= +# WHERE Clause Transformations +# ============================================================================= +class TestWhereClauseTransformations: + """ + Tests for WHERE clause transformations in DELETE statements. 
+ + StarRocks has limitations on DELETE WHERE clauses for non-PRIMARY KEY tables: + - BETWEEN is not supported → converted to >= AND <= + - Boolean literals (TRUE/FALSE) are not supported → removed or converted to 1=1/1=0 + + These transformations are applied conservatively to all DELETE statements since + table type cannot be easily determined at DELETE time. + """ + + def test_delete_with_between_simple( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test BETWEEN is converted to >= AND <= in DELETE WHERE. + + StarRocks Limitation: + BETWEEN is not supported in DELETE WHERE for DUPLICATE/UNIQUE/AGGREGATE KEY tables. + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("dt BETWEEN '2024-01-01' AND '2024-12-31'"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + assert "`dt` >= '2024-01-01'" in sql + assert "`dt` <= '2024-12-31'" in sql + assert "AND" in sql + + def test_delete_with_between_numeric( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test BETWEEN with numeric values.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("id BETWEEN 100 AND 200"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + assert "`id` >= 100" in sql + assert "`id` <= 200" in sql + + def test_delete_with_between_and_other_conditions( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test BETWEEN combined with other WHERE conditions.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + # Complex WHERE: id > 50 AND dt BETWEEN '2024-01-01' AND '2024-12-31' + adapter.delete_from( + exp.to_table("test_table"), + parse_one("id > 50 AND dt BETWEEN '2024-01-01' AND '2024-12-31'"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + assert 
"`id` > 50" in sql + assert "`dt` >= '2024-01-01'" in sql + assert "`dt` <= '2024-12-31'" in sql + + def test_delete_with_multiple_between( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test multiple BETWEEN expressions in one WHERE clause.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("dt BETWEEN '2024-01-01' AND '2024-12-31' AND id BETWEEN 1 AND 100"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + assert "`dt` >= '2024-01-01'" in sql + assert "`dt` <= '2024-12-31'" in sql + assert "`id` >= 1" in sql + assert "`id` <= 100" in sql + + def test_delete_with_and_true( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test AND TRUE is removed from WHERE clause. + + StarRocks Limitation: + Boolean literals are not supported in WHERE clauses. + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("id > 100 AND TRUE"), + ) + + sql = to_sql_calls(adapter)[0] + assert "TRUE" not in sql + assert "`id` > 100" in sql + # Should not have extra AND + assert sql.count("AND") == 0 + + def test_delete_with_true_and_condition( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test TRUE AND condition (reverse order).""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("TRUE AND id > 100"), + ) + + sql = to_sql_calls(adapter)[0] + assert "TRUE" not in sql + assert "`id` > 100" in sql + + def test_delete_with_or_false( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test OR FALSE is removed from WHERE clause.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("id > 100 OR FALSE"), + ) + 
+ sql = to_sql_calls(adapter)[0] + assert "FALSE" not in sql + assert "`id` > 100" in sql + assert sql.count("OR") == 0 + + def test_delete_with_false_or_condition( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test FALSE OR condition (reverse order).""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("FALSE OR id > 100"), + ) + + sql = to_sql_calls(adapter)[0] + assert "FALSE" not in sql + assert "`id` > 100" in sql + + def test_delete_with_standalone_false( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test standalone FALSE is converted to 1=0.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + exp.false(), + ) + + sql = to_sql_calls(adapter)[0] + assert "FALSE" not in sql + # Converted to 1=0 (always false condition) + assert "1 = 0" in sql or "1=0" in sql + + def test_delete_with_combined_transformations( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test BETWEEN + boolean literals together. + + Verifies that multiple transformations work correctly when combined. 
+ """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + # WHERE: dt BETWEEN '2024-01-01' AND '2024-12-31' AND TRUE + adapter.delete_from( + exp.to_table("test_table"), + parse_one("dt BETWEEN '2024-01-01' AND '2024-12-31' AND TRUE"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + assert "TRUE" not in sql + assert "`dt` >= '2024-01-01'" in sql + assert "`dt` <= '2024-12-31'" in sql + + def test_delete_with_nested_boolean_expressions( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test nested boolean expressions with multiple levels.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + # WHERE: (id > 100 AND TRUE) OR (name = 'test' AND FALSE) + # After transformation: id > 100 OR (name = 'test' AND FALSE) + # After transformation: id > 100 OR FALSE + # After transformation: id > 100 + adapter.delete_from( + exp.to_table("test_table"), + parse_one("(id > 100 AND TRUE) OR (name = 'test' AND FALSE)"), + ) + + sql = to_sql_calls(adapter)[0] + assert "TRUE" not in sql + # Note: The AND FALSE cannot be fully simplified without more complex logic + # Our transformation only handles direct AND TRUE / OR FALSE at the binary level + + def test_delete_with_between_in_complex_expression( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test BETWEEN within a complex nested expression.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("(dt BETWEEN '2024-01-01' AND '2024-06-30') OR (dt BETWEEN '2024-07-01' AND '2024-12-31')"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + # First BETWEEN converted + assert "`dt` >= '2024-01-01'" in sql + assert "`dt` <= '2024-06-30'" in sql + # Second BETWEEN converted + assert "`dt` >= '2024-07-01'" in sql + assert "`dt` <= '2024-12-31'" in sql + assert "OR" in sql + + +# 
# =============================================================================
# Key Property Building
# =============================================================================
class TestKeyPropertyBuilding:
    """
    Tests for table key clauses: primary_key, duplicate_key and unique_key.

    aggregate_key is listed only as a commented-out case (see the parametrize
    table below). Most tests parse real MODEL SQL so the property values reach
    the adapter in the same expression form they would have in a project.
    """

    @pytest.mark.parametrize(
        "key_type,key_value,expected_clause",
        [
            # primary_key - single column
            ("primary_key", "id", "PRIMARY KEY (`id`)"),
            # primary_key - tuple form (multi-column)
            ("primary_key", "(id, dt)", "PRIMARY KEY (`id`, `dt`)"),
            # duplicate_key - tuple form
            ("duplicate_key", "(id, name)", "DUPLICATE KEY (`id`, `name`)"),
            # unique_key - tuple form
            ("unique_key", "(id, dt)", "UNIQUE KEY (`id`, `dt`)"),
            # aggregate_key - multi-column. not supported (requires aggregation function specification)
            # ("aggregate_key", ("id", "dt"), "AGGREGATE KEY (`id`, `dt`)"),
        ],
    )
    def test_key_types_with_tuple_form(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        key_type: str,
        key_value: str,
        expected_clause: str,
    ):
        """Each key property given in physical_properties renders its own KEY clause.

        ``key_type`` and ``key_value`` are spliced into the MODEL text as-is;
        ``expected_clause`` is the fragment that must appear in the CREATE TABLE.
        """
        model_sql = f"""
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, dt DATE, name STRING, value DECIMAL(10,2)),
            physical_properties (
                {key_type} = {key_value}
            )
        );
        SELECT 1;
        """

        parsed = parse(model_sql, default_dialect="starrocks")
        model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed))

        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        adapter.create_table(
            model.name,
            model.columns_to_types,
            table_properties=model.physical_properties,
        )

        sql = to_sql_calls(adapter)[0]
        assert expected_clause in sql

    @pytest.mark.parametrize(
        "key_string,expected_clause",
        [
            # String with parentheses
            ('"(id, dt)"', "PRIMARY KEY (`id`, `dt`)"),
            # String without parentheses (auto-wrapped)
            ('"id, dt"', "PRIMARY KEY (`id`, `dt`)"),
            # Single column string
            ('"id"', "PRIMARY KEY (`id`)"),
        ],
    )
    def test_primary_key_string_forms(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        key_string: str,
        expected_clause: str,
    ):
        """primary_key given as a quoted string, with or without parentheses."""
        model_sql = f"""
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, dt DATE, value DECIMAL(10,2)),
            physical_properties (
                primary_key = {key_string}
            )
        );
        SELECT 1;
        """

        parsed = parse(model_sql, default_dialect="starrocks")
        model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed))

        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        adapter.create_table(
            model.name,
            model.columns_to_types,
            table_properties=model.physical_properties,
        )

        sql = to_sql_calls(adapter)[0]
        assert expected_clause in sql

    def test_primary_key_single_identifier(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Test primary_key = id (single identifier without quotes)."""
        model_sql = """
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, dt DATE),
            physical_properties (
                primary_key = id
            )
        );
        SELECT 1;
        """

        parsed = parse(model_sql, default_dialect="starrocks")
        model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed))

        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        adapter.create_table(
            model.name,
            model.columns_to_types,
            table_properties=model.physical_properties,
        )

        sql = to_sql_calls(adapter)[0]
        assert "PRIMARY KEY (`id`)" in sql

    def test_primary_key_via_table_properties_tuple(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Test primary_key passed via physical_properties with tuple form - duplicate of test_key_types_with_tuple_form."""
        model_sql = """
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, dt DATE, value DECIMAL(10,2)),
            physical_properties (
                primary_key = (id, dt)
            )
        );
        SELECT 1;
        """

        parsed = parse(model_sql, default_dialect="starrocks")
        model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed))

        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        adapter.create_table(
            model.name,
            model.columns_to_types,
            table_properties=model.physical_properties,
        )

        sql = to_sql_calls(adapter)[0]
        assert "PRIMARY KEY (`id`, `dt`)" in sql

    def test_column_reordering_for_key(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Test column reordering for key tables.

        StarRocks Requirement:
            Key columns MUST be the first N columns in CREATE TABLE statement.

        The input mapping deliberately lists the key columns (order_id,
        event_date) after two non-key columns, so the test only passes if the
        adapter moves them to the front of the column list.
        """
        import re

        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)

        columns_to_types = {
            "customer_id": exp.DataType.build("INT"),
            "region": exp.DataType.build("VARCHAR(50)"),
            "order_id": exp.DataType.build("BIGINT"),
            "event_date": exp.DataType.build("DATE"),
            "amount": exp.DataType.build("DECIMAL(18,2)"),
        }

        adapter.create_table(
            "test_table",
            target_columns_to_types=columns_to_types,
            primary_key=("order_id", "event_date"),
        )

        sql = to_sql_calls(adapter)[0]
        assert "PRIMARY KEY (`order_id`, `event_date`)" in sql

        # Capture everything between the opening parenthesis of the column list
        # and the closing parenthesis that precedes PRIMARY KEY.
        col_match = re.search(r"CREATE TABLE.*?\((.*)\)\s*PRIMARY KEY", sql, re.DOTALL)
        assert col_match, "Could not extract column definitions"
        col_defs = col_match.group(1)

        # str.index raises ValueError for a missing column. str.find would
        # return -1, which compares as "earlier than everything" and would let
        # the ordering assertions below pass without the column being present.
        order_id_pos = col_defs.index("`order_id`")
        event_date_pos = col_defs.index("`event_date`")
        customer_id_pos = col_defs.index("`customer_id`")

        assert order_id_pos < event_date_pos, "order_id must appear before event_date"
        assert (
            event_date_pos < customer_id_pos
        ), "event_date must appear before customer_id"


#
# =============================================================================
# Partition Property Building
# =============================================================================
class TestPartitionPropertyBuilding:
    """Checks the PARTITION BY clause built from partitioned_by / partition_by / partitions."""

    @pytest.mark.parametrize(
        "partition_expr,expected_clause",
        [
            # single column, given as a quoted string
            ("'dt'", "PARTITION BY dt"),
            # several columns, given as a tuple
            ("(year, month)", "PARTITION BY year, month"),
            # RANGE partitioning without explicit partitions
            ("RANGE (dt)", "PARTITION BY RANGE (`dt`) ()"),
            # LIST partitioning without explicit partitions
            ("LIST (region)", "PARTITION BY LIST (`region`) ()"),
        ],
    )
    def test_partitioned_by_forms(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        partition_expr: str,
        expected_clause: str,
    ):
        """Each spelling of ``partition_by`` in physical_properties yields its clause."""
        definition = f"""
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, dt DATE, year INT, month INT, region STRING),
            physical_properties (
                partition_by = {partition_expr}
            )
        );
        SELECT 1;
        """
        sql_model = t.cast(
            SqlModel,
            load_sql_based_model(parse(definition, default_dialect="starrocks")),
        )

        engine = make_mocked_engine_adapter(StarRocksEngineAdapter)
        engine.create_table(
            sql_model.name,
            sql_model.columns_to_types,
            partitioned_by=sql_model.partitioned_by,
            table_properties=sql_model.physical_properties,
        )

        assert expected_clause in to_sql_calls(engine)[0]

    def test_partition_by_alias(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """``partition_by`` inside physical_properties is honoured like partitioned_by."""
        definition = """
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, year INT, month INT),
            physical_properties (
                partition_by = (year, month)
            )
        );
        SELECT 1;
        """
        sql_model = t.cast(
            SqlModel,
            load_sql_based_model(parse(definition, default_dialect="starrocks")),
        )

        engine = make_mocked_engine_adapter(StarRocksEngineAdapter)
        engine.create_table(
            sql_model.name,
            sql_model.columns_to_types,
            partitioned_by=sql_model.partitioned_by,
            table_properties=sql_model.physical_properties,
        )

        assert "PARTITION BY year, month" in to_sql_calls(engine)[0]

    def test_partitioned_by_as_model_parameter(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Model-level ``partitioned_by`` (no physical_properties) also renders the clause."""
        definition = """
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, year INT, month INT, value DECIMAL(10,2)),
            partitioned_by (year, month)
        );
        SELECT 1;
        """
        sql_model = t.cast(
            SqlModel,
            load_sql_based_model(parse(definition, default_dialect="starrocks")),
        )

        engine = make_mocked_engine_adapter(StarRocksEngineAdapter)
        engine.create_table(
            sql_model.name,
            sql_model.columns_to_types,
            partitioned_by=sql_model.partitioned_by,
        )

        assert "PARTITION BY year, month" in to_sql_calls(engine)[0]

    def test_partitions_value_forms(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """``partitions`` accepts one partition string or a tuple of partition strings."""
        # One partition definition, given as a single string.
        single_definition = """
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, dt DATE),
            physical_properties (
                partition_by = RANGE(dt),
                partitions = 'PARTITION p1 VALUES LESS THAN ("2024-01-01")'
            )
        );
        SELECT 1;
        """
        # Two partition definitions, given as a tuple of strings.
        multiple_definition = """
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, dt DATE),
            physical_properties (
                partition_by = RANGE(dt),
                partitions = (
                    'PARTITION p1 VALUES LESS THAN ("2024-01-01")',
                    'PARTITION p2 VALUES LESS THAN ("2024-02-01")'
                )
            )
        );
        SELECT 1;
        """
        scenarios = (
            (single_definition, ("PARTITION p1", "VALUES LESS THAN")),
            (multiple_definition, ("PARTITION p1", "PARTITION p2")),
        )

        for definition, required_fragments in scenarios:
            sql_model = t.cast(
                SqlModel,
                load_sql_based_model(parse(definition, default_dialect="starrocks")),
            )

            # A fresh mocked adapter per scenario, so index 0 is always this
            # scenario's CREATE TABLE.
            engine = make_mocked_engine_adapter(StarRocksEngineAdapter)
            engine.create_table(
                sql_model.name,
                sql_model.columns_to_types,
                partitioned_by=sql_model.partitioned_by,
                table_properties=sql_model.physical_properties,
            )

            ddl = to_sql_calls(engine)[0]
            for fragment in required_fragments:
                assert fragment in ddl


# =============================================================================
# Distribution Property Building
# =============================================================================
class TestDistributionPropertyBuilding:
    """Checks the DISTRIBUTED BY clause built from the distributed_by property."""

    @pytest.mark.parametrize(
        "dist_input,expected_clause",
        [
            # HASH on one column
            ('"HASH(id) BUCKETS 10"', "DISTRIBUTED BY HASH (`id`) BUCKETS 10"),
            # HASH on several columns
            (
                '"HASH(id, region) BUCKETS 16"',
                "DISTRIBUTED BY HASH (`id`, `region`) BUCKETS 16",
            ),
            # RANDOM without a bucket count
            ('"RANDOM"', "DISTRIBUTED BY RANDOM"),
            # RANDOM with a bucket count
            ('"RANDOM BUCKETS 10"', "DISTRIBUTED BY RANDOM BUCKETS 10"),
        ],
    )
    def test_distributed_by_string_forms(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        dist_input: str,
        expected_clause: str,
    ):
        """``distributed_by`` written as one quoted string in physical_properties."""
        definition = f"""
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, region STRING),
            physical_properties (
                distributed_by = {dist_input}
            )
        );
        SELECT 1;
        """
        sql_model = t.cast(
            SqlModel,
            load_sql_based_model(parse(definition, default_dialect="starrocks")),
        )

        engine = make_mocked_engine_adapter(StarRocksEngineAdapter)
        engine.create_table(
            sql_model.name,
            sql_model.columns_to_types,
            table_properties=sql_model.physical_properties,
        )

        assert expected_clause in to_sql_calls(engine)[0]

    @pytest.mark.parametrize(
        "dist_struct,expected_clause",
        [
            # HASH, kind given as a quoted string
            ("(kind='HASH', expressions=id, buckets=32)", "DISTRIBUTED BY HASH (`id`) BUCKETS 32"),
            # HASH, kind given as a bare identifier
            ("(kind=HASH, expressions=id, buckets=10)", "DISTRIBUTED BY HASH (`id`) BUCKETS 10"),
            # HASH on several columns
            (
                "(kind='HASH', expressions=(a, b), buckets=16)",
                "DISTRIBUTED BY HASH (`a`, `b`) BUCKETS 16",
            ),
            # RANDOM without a bucket count
            ("(kind='RANDOM')", "DISTRIBUTED BY RANDOM"),
            # RANDOM with a bucket count
            ("(kind=RANDOM, buckets=10)", "DISTRIBUTED BY RANDOM BUCKETS 10"),
        ],
    )
    def test_distributed_by_structured_forms(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        dist_struct: str,
        expected_clause: str,
    ):
        """``distributed_by`` written as a ``(kind=..., expressions=..., buckets=...)`` tuple."""
        definition = f"""
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, a INT, b STRING, region STRING),
            physical_properties (
                distributed_by = {dist_struct}
            )
        );
        SELECT 1;
        """
        sql_model = t.cast(
            SqlModel,
            load_sql_based_model(parse(definition, default_dialect="starrocks")),
        )

        engine = make_mocked_engine_adapter(StarRocksEngineAdapter)
        engine.create_table(
            sql_model.name,
            sql_model.columns_to_types,
            table_properties=sql_model.physical_properties,
        )

        assert expected_clause in to_sql_calls(engine)[0]


#
# =============================================================================
# Order By Property Building
# =============================================================================
class TestOrderByPropertyBuilding:
    """Checks the ORDER BY clause built from order_by and clustered_by."""

    @pytest.mark.parametrize(
        "order_value,expected_clause,description",
        [
            # double-quoted string values
            ('"id"', "ORDER BY (`id`)", "Bare string: single column"),
            (
                '"id, timestamp"',
                "ORDER BY (`id`, `timestamp`)",
                "Bare string: multi-column without parens",
            ),
            ('"(id, timestamp)"', "ORDER BY (`id`, `timestamp`)", "String with parens"),
            # single-quoted string values
            ("'id'", "ORDER BY (`id`)", "Bare string: single column"),
            (
                "'id, timestamp'",
                "ORDER BY (`id`, `timestamp`)",
                "Bare string: multi-column without parens",
            ),
            ("'(id, timestamp)'", "ORDER BY (`id`, `timestamp`)", "String with parens"),
            # tuple written directly in the MODEL block
            ("(id, timestamp)", "ORDER BY (`id`, `timestamp`)", "Tuple: multi-column"),
            # one unquoted identifier
            ("id", "ORDER BY (`id`)", "Identifier: single column"),
        ],
    )
    def test_order_by_value_forms(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        order_value: str,
        expected_clause: str,
        description: str,
    ):
        """Every accepted spelling of ``order_by`` renders the expected ORDER BY clause.

        ``description`` is used only in the failure message.
        """
        definition = f"""
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, timestamp DATETIME, value DECIMAL(10,2)),
            physical_properties (
                order_by = {order_value}
            )
        );
        SELECT 1;
        """
        sql_model = t.cast(
            SqlModel,
            load_sql_based_model(parse(definition, default_dialect="starrocks")),
        )

        engine = make_mocked_engine_adapter(StarRocksEngineAdapter)
        engine.create_table(
            sql_model.name,
            sql_model.columns_to_types,
            table_properties=sql_model.physical_properties,
        )

        ddl = to_sql_calls(engine)[0]
        assert expected_clause in ddl, (
            f"\nTest case: {description}\n"
            f"Input: {order_value}\n"
            f"Expected: {expected_clause}\n"
            f"Actual SQL: {ddl}"
        )

    def test_clustered_by_generates_order_by(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """``clustered_by`` in physical_properties is rendered as ORDER BY, not CLUSTER BY."""
        definition = """
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, timestamp DATETIME, value DECIMAL(10,2)),
            physical_properties (
                clustered_by = (id, timestamp)
            )
        );
        SELECT 1;
        """
        sql_model = t.cast(
            SqlModel,
            load_sql_based_model(parse(definition, default_dialect="starrocks")),
        )

        engine = make_mocked_engine_adapter(StarRocksEngineAdapter)
        engine.create_table(
            sql_model.name,
            sql_model.columns_to_types,
            table_properties=sql_model.physical_properties,
        )

        ddl = to_sql_calls(engine)[0]
        assert "ORDER BY (`id`, `timestamp`)" in ddl
        assert "CLUSTER BY" not in ddl

    def test_clustered_by_as_model_parameter(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Model-level ``clustered_by`` (no physical_properties) is also rendered as ORDER BY."""
        definition = """
        MODEL (
            name t,
            kind FULL,
            dialect starrocks,
            columns (id INT, timestamp DATETIME, value DECIMAL(10,2)),
            clustered_by id
        );
        SELECT 1;
        """
        sql_model = t.cast(
            SqlModel,
            load_sql_based_model(parse(definition, default_dialect="starrocks")),
        )

        engine = make_mocked_engine_adapter(StarRocksEngineAdapter)
        engine.create_table(
            sql_model.name,
            sql_model.columns_to_types,
            clustered_by=sql_model.clustered_by,
        )

        ddl = to_sql_calls(engine)[0]
        assert "ORDER BY (`id`)" in ddl
        # The StarRocks adapter must emit ORDER BY here, never CLUSTER BY.
        assert "CLUSTER BY" not in ddl


# =============================================================================
# Generic Property Building
#
# =============================================================================
class TestGenericPropertyBuilding:
    """Tests for generic table properties (replication_num, etc.)."""

    @pytest.mark.parametrize(
        "prop_name,prop_value,expected_in_sql",
        [
            # Every value below is passed to the adapter as a Python string.
            # Integer-looking value
            ("replication_num", "1", "'replication_num'='1'"),
            ("replication_num", "3", "'replication_num'='3'"),
            # Boolean-looking TRUE
            ("enable_persistent_index", "TRUE", "'enable_persistent_index'='TRUE'"),
            # Boolean-looking FALSE
            ("in_memory", "FALSE", "'in_memory'='FALSE'"),
            # Plain string value
            ("compression", "LZ4", "'compression'='LZ4'"),
        ],
    )
    def test_generic_property_value_forms(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        prop_name: str,
        prop_value: str,
        expected_in_sql: str,
    ):
        """A property that is not handled specially is rendered as a quoted key/value pair."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)

        adapter.create_table(
            "test_table",
            target_columns_to_types={
                "id": exp.DataType.build("INT"),
                "name": exp.DataType.build("VARCHAR(100)"),
            },
            primary_key=("id",),
            table_properties={
                prop_name: prop_value,
            },
        )

        sql = to_sql_calls(adapter)[0]
        assert expected_in_sql in sql


# =============================================================================
# View Property Building
# =============================================================================
class TestViewPropertyBuilding:
    """Tests for StarRocks-specific view properties (SECURITY)."""

    @pytest.mark.parametrize(
        "property_sql,expected_fragment",
        [
            ("INVOKER", "SECURITY INVOKER"),
            ("'INVOKER'", "SECURITY INVOKER"),
            ("invoker", "SECURITY INVOKER"),
            ("NONE", "SECURITY NONE"),
        ],
    )
    def test_security_value_forms(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        property_sql: str,
        expected_fragment: str,
    ):
        """Ensure different input forms render SECURITY <value>.

        Covers an unquoted identifier, a quoted string and a lower-case
        identifier; all must come out as the upper-case keyword.
        """
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)

        model_sql = f"""
        MODEL (
            name test_schema.test_view_security,
            kind VIEW,
            dialect starrocks,
            columns (c INT),
            virtual_properties (
                security = {property_sql}
            )
        );
        SELECT 1 AS c;
        """
        model = _load_sql_model(model_sql)

        query = model.render_query()
        adapter.create_view(
            model.name,
            query,
            replace=False,
            target_columns_to_types=model.columns_to_types,
            view_properties=model.virtual_properties,
        )

        sql = to_sql_calls(adapter)[0]
        assert expected_fragment in sql

    def test_security_invalid_value(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Invalid SECURITY enum should raise SQLMeshError."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        model_sql = """
        MODEL (
            name test_schema.test_view_security_invalid,
            kind VIEW,
            dialect starrocks,
            columns (c INT),
            virtual_properties (
                security = foo
            )
        );
        SELECT 1 AS c;
        """
        model = _load_sql_model(model_sql)

        query = model.render_query()
        with pytest.raises(SQLMeshError, match="security"):
            adapter.create_view(
                model.name,
                query,
                replace=False,
                target_columns_to_types=model.columns_to_types,
                view_properties=model.virtual_properties,
            )


# =============================================================================
# Materialized View Refresh Property Building
# =============================================================================
class TestMVRefreshPropertyBuilding:
    """Tests for refresh_moment / refresh_scheme parsing and rendering."""

    def _build_mv_model(self, property_sql: str) -> SqlModel:
        """Load a VIEW model whose virtual_properties body is ``property_sql``."""
        model_sql = f"""
        MODEL (
            name test_schema.test_mv_refresh_model,
            kind VIEW,
            dialect starrocks,
            columns (a INT),
            virtual_properties (
                {property_sql}
            )
        );
        SELECT 1 AS a;
        """
        return _load_sql_model(model_sql)

    def _create_simple_mv(
        self,
        adapter: StarRocksEngineAdapter,
        model: SqlModel,
    ) -> str:
        """Create a materialized view from ``model`` and return the last SQL statement emitted."""
        query = model.render_query()
        adapter.create_view(
            "test_mv_refresh",
            query,
            replace=False,
            materialized=True,
            target_columns_to_types=model.columns_to_types,
            view_properties=model.virtual_properties,
        )
        # replace=False → only CREATE statement is emitted
        return to_sql_calls(adapter)[-1]

    @pytest.mark.parametrize(
        "property_sql,expected_fragment",
        [
            ("refresh_moment = IMMEDIATE", "REFRESH IMMEDIATE"),
            ("refresh_moment = deferred", "REFRESH DEFERRED"),
        ],
    )
    def test_refresh_moment_value_forms(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        property_sql: str,
        expected_fragment: str,
    ):
        """refresh_moment is rendered as an upper-case REFRESH keyword regardless of input case."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        model = self._build_mv_model(property_sql)
        sql = self._create_simple_mv(adapter, model)
        assert expected_fragment in sql

    @pytest.mark.parametrize(
        "property_sql,expected_fragments",
        [
            ("refresh_scheme = ASYNC", ["REFRESH", "ASYNC"]),
            # single quote value with single quote start
            (
                "refresh_scheme = 'ASYNC START (''2025-01-01 00:00:00'') EVERY (INTERVAL 5 MINUTE)'",
                [
                    "REFRESH",
                    "ASYNC",
                    "START ('2025-01-01 00:00:00')",
                    "EVERY (INTERVAL 5 MINUTE)",
                ],
            ),
            # single quote value with double quote start
            (
                "refresh_scheme = 'ASYNC START (\"2025-02-01 00:00:00\") EVERY (INTERVAL 5 MINUTE)'",
                [
                    "REFRESH",
                    "ASYNC",
                    "START ('2025-02-01 00:00:00')",
                    "EVERY (INTERVAL 5 MINUTE)",
                ],
            ),
            # double quote value with single quote start
            (
                "refresh_scheme = \"async start ('2025-03-01') every (interval 10 minute)\"",
                [
                    "REFRESH",
                    "ASYNC",
                    "START ('2025-03-01')",
                    "EVERY (INTERVAL 10 MINUTE)",
                ],
            ),
            ("refresh_scheme = MANUAL", ["REFRESH", "MANUAL"]),
        ],
    )
    def test_refresh_scheme_value_forms(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        property_sql: str,
        expected_fragments: t.List[str],
    ):
        """refresh_scheme accepts identifiers and quoted strings; every fragment must be rendered."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        model = self._build_mv_model(property_sql)
        sql = self._create_simple_mv(adapter, model)
        for fragment in expected_fragments:
            assert fragment in sql

    def test_refresh_moment_invalid_value(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """An unsupported refresh_moment value (AUTO) must raise SQLMeshError."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        model = self._build_mv_model("refresh_moment = AUTO")
        with pytest.raises(SQLMeshError):
            self._create_simple_mv(adapter, model)

    def test_refresh_scheme_invalid_prefix(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """A refresh_scheme that starts with an unknown keyword must raise SQLMeshError."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        model = self._build_mv_model(
            "refresh_scheme = 'SCHEDULE EVERY (INTERVAL 5 MINUTE)'"
        )
        with pytest.raises(SQLMeshError, match="refresh_scheme"):
            self._create_simple_mv(adapter, model)


# =============================================================================
# Comment Property Building
# =============================================================================
class TestCommentPropertyBuilding:
    """Tests for table and column comments."""

    def test_table_and_column_comments(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Test CREATE TABLE with table and column comments."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        adapter.create_table(
            "test_table",
            target_columns_to_types={
                "a": exp.DataType.build("INT"),
                "b": exp.DataType.build("VARCHAR(100)"),
            },
            table_description="Test table description",
            column_descriptions={
                "a": "Column A description",
                "b": "Column B description",
            },
        )

        sql = to_sql_calls(adapter)[0]
        assert "COMMENT 'Test table description'" in sql
        assert "COMMENT 'Column A description'" in sql
        assert "COMMENT 'Column B description'" in sql

    def test_view_with_comments(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Test CREATE VIEW with comments."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        adapter.create_view(
            "test_view",
            parse_one("SELECT a FROM tbl"),
            replace=False,
            target_columns_to_types={"a": exp.DataType.build("INT")},
            table_description="Test view description",
            column_descriptions={"a": "Column A description"},
        )

        sql = to_sql_calls(adapter)[0]
        assert "COMMENT 'Test view description'" in sql
        assert "COMMENT 'Column A description'" in sql

    @pytest.mark.parametrize(
        "table_name,comment,expected_sql",
        [
            (
                "test_table",
                "Test table comment",
                "ALTER TABLE `test_table` COMMENT = 'Test table comment'",
            ),
            (
                "db.test_table",
                "Database qualified table comment",
                "ALTER TABLE `db`.`test_table` COMMENT = 'Database qualified table comment'",
            ),
            (
                "test_table",
                "It's a test",
                None,  # Will check for escaped quote
            ),
        ],
        ids=["simple_table", "qualified_table", "special_chars"],
    )
    def test_build_create_comment_table_exp(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        table_name: str,
        comment: str,
        expected_sql: t.Optional[str],
    ):
        """
        Test _build_create_comment_table_exp generates correct ALTER TABLE COMMENT SQL.

        Verifies:
        1. SQL format: ALTER TABLE {table} COMMENT = '{comment}'
        2. No MODIFY keyword (StarRocks uses direct COMMENT =)
        3. Comment is properly quoted
        4. Table name is properly quoted
        5. Special characters are escaped

        ``expected_sql`` is None for the special-character case, where only the
        escaping of the embedded quote is checked.
        """
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)

        table = exp.to_table(table_name)
        sql = adapter._build_create_comment_table_exp(table, comment, "TABLE")

        if expected_sql:
            assert sql == expected_sql
        else:
            # Special chars case - the quote must be escaped, either with a
            # backslash or by doubling it. The backslash has to be written as
            # "\\": the literal "It\'s a test" is just the unescaped text
            # It's a test and would accept SQL with no escaping at all.
            assert "It\\'s a test" in sql or "It''s a test" in sql

        # Common assertions for all cases
        assert "ALTER TABLE" in sql
        assert "COMMENT =" in sql
        assert "MODIFY" not in sql  # StarRocks doesn't use MODIFY for table comments

    def test_build_create_comment_table_exp_truncation(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """
        Test _build_create_comment_table_exp truncates long comments.

        Verifies comments longer than MAX_TABLE_COMMENT_LENGTH (2048) are truncated.
        """
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)

        table = exp.to_table("test_table")
        long_comment = "x" * 3000  # Longer than MAX_TABLE_COMMENT_LENGTH (2048)
        sql = adapter._build_create_comment_table_exp(table, long_comment, "TABLE")

        # The comment should be truncated to 2048 characters
        expected_truncated = "x" * 2048
        assert expected_truncated in sql
        # One character over the limit must not appear. Checking only for the
        # full 3000-character input would also accept a comment cut at, say,
        # 2500 characters.
        assert "x" * 2049 not in sql

    @pytest.mark.parametrize(
        "table_name,column_name,comment,expected_sql",
        [
            (
                "test_table",
                "test_column",
                "Test column comment",
                "ALTER TABLE `test_table` MODIFY COLUMN `test_column` COMMENT 'Test column comment'",
            ),
            (
                "db.test_table",
                "id",
                "ID column",
                "ALTER TABLE `db`.`test_table` MODIFY COLUMN `id` COMMENT 'ID column'",
            ),
        ],
        ids=["simple_table", "qualified_table"],
    )
    def test_build_create_comment_column_exp(
        self,
        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
        table_name: str,
        column_name: str,
        comment: str,
        expected_sql: str,
    ):
        """
        Test _build_create_comment_column_exp generates correct ALTER TABLE MODIFY COLUMN SQL.

        Verifies:
        1. SQL format: ALTER TABLE {table} MODIFY COLUMN {column} COMMENT '{comment}'
        2. No column type required (StarRocks supports this)
        3. Comment is properly quoted
        """
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)

        table = exp.to_table(table_name)
        sql = adapter._build_create_comment_column_exp(table, column_name, comment, "TABLE")

        assert sql == expected_sql
        # Should NOT contain column type
        assert "VARCHAR" not in sql
        assert "INT" not in sql
        assert "BIGINT" not in sql

    def test_build_create_comment_column_exp_truncation(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """
        Test _build_create_comment_column_exp truncates long comments.

        Verifies comments longer than MAX_COLUMN_COMMENT_LENGTH (255) are truncated.
        """
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)

        table = exp.to_table("test_table")
        long_comment = "y" * 500  # Longer than MAX_COLUMN_COMMENT_LENGTH (255)
        sql = adapter._build_create_comment_column_exp(table, "test_col", long_comment, "TABLE")

        # The comment should be truncated to 255 characters
        expected_truncated = "y" * 255
        assert expected_truncated in sql
        # One character over the limit must not appear. A 600-character probe
        # is longer than the 500-character input, so it could never match and
        # would pass even with no truncation at all.
        assert "y" * 256 not in sql


# =============================================================================
# Invalid Property Scenarios
# =============================================================================
class TestInvalidPropertyScenarios:
    """Unit tests for property validation errors (mutual exclusivity, aliases, names)."""

    def test_key_type_mutually_exclusive(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Declaring both primary_key and unique_key must raise SQLMeshError."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        model_sql = """
        MODEL (
            name test_schema.test_conflicting_keys,
            kind FULL,
            dialect starrocks,
            columns (
                id INT,
                dt DATE,
                value INT
            ),
            physical_properties (
                primary_key = (id),
                unique_key = (id)
            )
        );
        SELECT id, dt, value FROM source_table;
        """
        model = _load_sql_model(model_sql)
        columns = model.columns_to_types

        with pytest.raises(SQLMeshError, match="Multiple table key type"):
            adapter.create_table(
                "test_conflicting_keys",
                target_columns_to_types=columns,
                table_properties=model.physical_properties,
            )

    def test_partition_alias_conflict_with_parameter(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """Giving partitioned_by and the partition_by property together must raise SQLMeshError."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        model_sql = """
        MODEL (
            name test_schema.test_partition_conflict,
            kind FULL,
            dialect starrocks,
            columns (
                id INT,
                dt DATE,
                value INT
            ),
            partitioned_by (dt),
            physical_properties (
                partition_by = (dt)
            )
        );
        SELECT id, dt, value FROM source_table;
        """
        model = _load_sql_model(model_sql)

        with pytest.raises(SQLMeshError, match="partition definition"):
            adapter.create_table(
                model.name,
                target_columns_to_types=model.columns_to_types,
                partitioned_by=model.partitioned_by,
                table_properties=model.physical_properties,
            )

    def test_invalid_property_name_detection(
        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
    ):
        """The property name ``partition`` must be rejected as invalid."""
        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
        model_sql = """
        MODEL (
            name test_schema.test_invalid_property,
            kind FULL,
            dialect starrocks,
            columns (
                id INT,
                dt DATE,
                value INT
            ),
            physical_properties (
                partition = dt
            )
        );
        SELECT id, dt, value FROM source_table;
        """
        model = _load_sql_model(model_sql)

        with pytest.raises(SQLMeshError, match="Invalid property 'partition'"):
            adapter.create_table(
                model.name,
                target_columns_to_types=model.columns_to_types,
                table_properties=model.physical_properties,
            )


# =============================================================================
# Comprehensive Tests
# =============================================================================
+class TestComprehensive: + """Comprehensive tests combining multiple features.""" + + def test_create_table_comprehensive( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test CREATE TABLE with all features combined: + - PRIMARY KEY + - Table and column comments + - DISTRIBUTED BY + - ORDER BY + - Custom properties + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + "test_table", + target_columns_to_types={ + "customer_id": exp.DataType.build("INT"), + "order_id": exp.DataType.build("BIGINT"), + "event_date": exp.DataType.build("DATE"), + "amount": exp.DataType.build("DECIMAL(10,2)"), + }, + primary_key=("order_id", "event_date"), + table_description="Sales transaction table", + column_descriptions={ + "customer_id": "Customer identifier", + "order_id": "Order identifier", + }, + table_properties={ + "distributed_by": exp.Tuple( + expressions=[ + exp.EQ( + this=exp.Column(this="kind"), + expression=exp.Literal.string("HASH"), + ), + exp.EQ( + this=exp.Column(this="expressions"), + expression=exp.Tuple( + expressions=[exp.to_column("customer_id")] + ), + ), + exp.EQ( + this=exp.Column(this="buckets"), + expression=exp.Literal.number(10), + ), + ] + ), + "replication_num": "3", + }, + clustered_by=[exp.to_column("customer_id"), exp.to_column("order_id")], + ) + + sql = to_sql_calls(adapter)[0] + assert "CREATE TABLE IF NOT EXISTS `test_table`" in sql + assert "PRIMARY KEY (`order_id`, `event_date`)" in sql + assert "COMMENT 'Sales transaction table'" in sql + assert "COMMENT 'Customer identifier'" in sql + assert "COMMENT 'Order identifier'" in sql + assert "DISTRIBUTED BY HASH (`customer_id`) BUCKETS 10" in sql + assert "ORDER BY (`customer_id`, `order_id`)" in sql + assert "PROPERTIES ('replication_num'='3')" in sql diff --git a/tests/core/test_connection_config.py b/tests/core/test_connection_config.py index 7d8ccef4b0..3e3065b600 100644 --- a/tests/core/test_connection_config.py 
+++ b/tests/core/test_connection_config.py @@ -20,6 +20,7 @@ MySQLConnectionConfig, PostgresConnectionConfig, SnowflakeConnectionConfig, + StarRocksConnectionConfig, TrinoAuthenticationMethod, AthenaConnectionConfig, MSSQLConnectionConfig, @@ -1998,3 +1999,59 @@ def test_schema_differ_overrides(make_config) -> None: adapter = config.create_engine_adapter() assert adapter._schema_differ_overrides == override assert adapter.schema_differ.parameterized_type_defaults == {} + + +def test_starrocks(make_config): + """Test StarRocksConnectionConfig basic functionality""" + # Basic configuration + config = make_config( + type="starrocks", + host="localhost", + user="root", + password="password", + port=9030, + database="testdb", + check_import=False, + ) + assert isinstance(config, StarRocksConnectionConfig) + assert config.type_ == "starrocks" + assert config.host == "localhost" + assert config.user == "root" + assert config.password == "password" + assert config.port == 9030 + assert config.database == "testdb" + assert config.DIALECT == "starrocks" + assert config.DISPLAY_NAME == "StarRocks" + assert config.DISPLAY_ORDER == 19 + assert config.is_recommended_for_state_sync is False + + # Test with minimal configuration (using default port) + minimal_config = make_config( + type="starrocks", + host="starrocks-fe", + user="starrocks_user", + password="starrocks_pswd", + check_import=False, + ) + assert isinstance(minimal_config, StarRocksConnectionConfig) + assert minimal_config.port == 9030 # Default StarRocks FE port + assert minimal_config.host == "starrocks-fe" + assert minimal_config.user == "starrocks_user" + + # Test with additional MySQL-compatible options + advanced_config = make_config( + type="starrocks", + host="starrocks-fe", + user="admin", + password="admin123", + port=9030, + database="testdb", + charset="utf8mb4", + ssl_disabled=True, + concurrent_tasks=10, + check_import=False, + ) + assert isinstance(advanced_config, StarRocksConnectionConfig) + assert 
advanced_config.charset == "utf8mb4" + assert advanced_config.ssl_disabled is True + assert advanced_config.concurrent_tasks == 10 From 17a8bcf6d08052e65efe9a0e97cb55ec507820c5 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Tue, 13 Jan 2026 15:58:29 +0800 Subject: [PATCH 02/20] refine code style And optimize some test cases. Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- sqlmesh/core/engine_adapter/starrocks.py | 368 ++++++--------- sqlmesh/core/snapshot/evaluator.py | 23 +- .../integration/test_integration_starrocks.py | 442 ++++++++++-------- tests/core/engine_adapter/test_starrocks.py | 95 ++-- 4 files changed, 456 insertions(+), 472 deletions(-) diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py index b8ff61b158..212c40143b 100644 --- a/sqlmesh/core/engine_adapter/starrocks.py +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -113,9 +113,7 @@ def validate(self, value: t.Any) -> t.Optional[Validated]: """Check if value conforms to this type. Return validated value or None. 
String that can be parsed as literal """ - raise NotImplementedError( - f"{self.__class__.__name__}.validate() must be implemented" - ) + raise NotImplementedError(f"{self.__class__.__name__}.validate() must be implemented") def normalize(self, validated: Validated) -> Normalized: """Convert validated intermediate value to final output format.""" @@ -126,9 +124,7 @@ def __call__(self, value: t.Any) -> Normalized: """Validate and normalize in one step.""" validated = self.validate(value) if validated is None: - raise ValueError( - f"Value {value!r} does not conform to type {self.__class__.__name__}" - ) + raise ValueError(f"Value {value!r} does not conform to type {self.__class__.__name__}") return self.normalize(validated) @@ -346,7 +342,7 @@ def validate(self, value: t.Any) -> t.Optional[t.Tuple[str, t.Any]]: key_name = None if isinstance(left, exp.Column): - key_name = left.this.name if hasattr(left.this, 'name') else str(left.this) + key_name = left.this.name if hasattr(left.this, "name") else str(left.this) elif isinstance(left, exp.Identifier): key_name = left.this elif isinstance(left, str): @@ -403,18 +399,13 @@ def __init__( self.case_sensitive = bool(case_sensitive) self.normalized_type = normalized_type - if ( - self.normalized_type is not None - and self.normalized_type not in PROPERTY_OUTPUT_TYPES - ): + if self.normalized_type is not None and self.normalized_type not in PROPERTY_OUTPUT_TYPES: raise ValueError( f"normalized_type must be one of {PROPERTY_OUTPUT_TYPES}, got {self.normalized_type!r}" ) # Pre-compute normalized values for efficient lookup - self._values_normalized = [ - v if case_sensitive else v.upper() for v in self.valid_values - ] + self._values_normalized = [v if case_sensitive else v.upper() for v in self.valid_values] def _extract_text(self, value: t.Any) -> t.Optional[str]: """Extract text from various value types.""" @@ -543,9 +534,7 @@ def __init__(self, *types: DeclarativeType): # Validate all types are DeclarativeType instances 
for type_ in types: if not isinstance(type_, DeclarativeType): - raise TypeError( - f"AnyOf expects DeclarativeType instances, got {type_!r}" - ) + raise TypeError(f"AnyOf expects DeclarativeType instances, got {type_!r}") self.types: t.List[DeclarativeType] = list(types) @@ -622,9 +611,7 @@ def __init__( self.allow_single = allow_single self.output_as = output_as - def validate( - self, value: t.Any - ) -> t.Optional[t.List[t.Tuple[DeclarativeType, Validated]]]: + def validate(self, value: t.Any) -> t.Optional[t.List[t.Tuple[DeclarativeType, Validated]]]: """Validate each element in the sequence. Returns list of (matched_type, validated_value) tuples or None.""" # Extract elements from various container types elems = self._extract_elements(value) @@ -653,9 +640,7 @@ def normalize( self, validated: t.List[t.Tuple[DeclarativeType, Validated]] ) -> t.Union[t.List[Normalized], t.Tuple[Normalized, ...]]: """Normalize each validated element using its matched type's normalize method.""" - normalized_items = [ - elem_type.normalize(value) for elem_type, value in validated - ] + normalized_items = [elem_type.normalize(value) for elem_type, value in validated] # Convert to desired output format if self.output_as == "tuple": @@ -677,9 +662,7 @@ def _extract_elements(self, value: t.Any) -> t.Optional[t.List[t.Any]]: value = parse_fragment(value) except Exception: # If parsing fails and we accept single strings, promote to list - if self.allow_single and any( - isinstance(t, StringType) for t in self.elem_types - ): + if self.allow_single and any(isinstance(t, StringType) for t in self.elem_types): return [value] return None @@ -785,9 +768,7 @@ class DistributionTupleInputType(StructuredTupleType): FIELDS: t.Dict[str, Field] = {} # Subclasses override this - def __init__( - self, error_on_unknown_field: bool = True, error_on_invalid_field: bool = True - ): + def __init__(self, error_on_unknown_field: bool = True, error_on_invalid_field: bool = True): 
self.error_on_unknown_field = error_on_unknown_field self.error_on_invalid_field = error_on_invalid_field @@ -899,7 +880,9 @@ def _extract_pairs(self, value: t.Any) -> t.Optional[t.List[t.Any]]: Each expression should be an exp.EQ (key=value). """ # exp.Tuple: (a=1, b=2) - if isinstance(value, (exp.Tuple, list)): + if isinstance(value, list): + return value + if isinstance(value, exp.Tuple): return list(value.expressions) # exp.Paren: (a=1) or ((a=1, b=2)) @@ -1043,9 +1026,7 @@ def validate(self, value: t.Any) -> t.Optional[t.Dict[str, t.Any]]: # ============================================================ @staticmethod - def from_enum( - enum_value: str, buckets: t.Optional[int] = None - ) -> t.Dict[str, t.Any]: + def from_enum(enum_value: str, buckets: t.Optional[int] = None) -> t.Dict[str, t.Any]: """ Create distribution dict from EnumType normalized value. @@ -1060,11 +1041,7 @@ def from_enum( >>> DistributionTupleInputType.from_enum("RANDOM") {"kind": "RANDOM", "columns": [], "buckets": None} """ - return { - "kind": enum_value, - "columns": [], - "buckets": buckets - } + return {"kind": enum_value, "columns": [], "buckets": buckets} @staticmethod def from_func( @@ -1085,27 +1062,15 @@ def from_func( >>> DistributionTupleInputType.from_func(func) {"kind": "HASH", "columns": [exp.Column("id"), exp.Column("dt")], "buckets": None} """ - func_name = ( - func.name.upper() if hasattr(func, "name") else str(func.this).upper() - ) + func_name = func.name.upper() if hasattr(func, "name") else str(func.this).upper() if func_name == "HASH": # Extract columns from HASH(col1, col2, ...) 
- columns: list[exp.Column] = ( - [func.this] if isinstance(func.this, exp.Column) else [] - ) + columns: list[exp.Column] = [func.this] if isinstance(func.this, exp.Column) else [] columns.extend(func.expressions) - return { - "kind": "HASH", - "columns": columns, - "buckets": buckets - } + return {"kind": "HASH", "columns": columns, "buckets": buckets} elif func_name == "RANDOM": # noqa: RET505 - return { - "kind": "RANDOM", - "columns": [], - "buckets": buckets - } + return {"kind": "RANDOM", "columns": [], "buckets": buckets} else: raise ValueError(f"Unknown distribution function: {func_name}") @@ -1162,7 +1127,6 @@ def to_unified_dict( # Type Specifications for StarRocks Properties (INPUT and OUTPUT) # ============================================================ class PropertySpecs: - # Accepts: # - Single column: id # - Multiple columns: (id, dt) @@ -1228,10 +1192,10 @@ class PropertySpecs: # For properties like refresh_scheme, it can be a string, identifier, or column RefreshSchemeInputSpec = AnyOf( EnumType(["ASYNC", "MANUAL"], normalized_type="var"), - ColumnType(normalized_type="str"), # Columns → will be converted to string - IdentifierType(normalized_type="str"), # Identifiers → will be converted to string - LiteralType(normalized_type="str"), # Numbers and string literals → will be converted to string - StringType(), # Plain strings + ColumnType(normalized_type="str"), # Columns → will be converted to string + IdentifierType(normalized_type="str"), # Identifiers → will be converted to string + LiteralType(normalized_type="str"), # Numbers and string → to string + StringType(), # Plain strings ) # Generic property value: Accepts various types, normalizes to string @@ -1239,10 +1203,10 @@ class PropertySpecs: # StarRocks PROPERTIES syntax requires all values to be strings: "value" # So we normalize everything to string for consistent SQL generation GenericPropertyInputSpec = AnyOf( - StringType(), # Plain strings - LiteralType(normalized_type="str"), # 
Numbers and string literals → will be converted to string - IdentifierType(normalized_type="str"), # Identifiers → will be converted to string - ColumnType(normalized_type="str"), # Columns → will be converted to string + StringType(), # Plain strings + LiteralType(normalized_type="str"), # Numbers and string → will be converted to string + IdentifierType(normalized_type="str"), # Identifiers → will be converted to string + ColumnType(normalized_type="str"), # Columns → will be converted to string ) """ @@ -1293,33 +1257,26 @@ class PropertySpecs: "duplicate_key": TableKeyInputSpec, "unique_key": TableKeyInputSpec, "aggregate_key": TableKeyInputSpec, - # Partition-related properties "partitioned_by": PartitionedByInputSpec, "partitions": PartitionsInputSpec, - # Distribution property "distributed_by": DistributedByInputSpec, - # Ordering property "clustered_by": OrderByInputSpec, - # View properties # StarRocks syntax: SECURITY {NONE | INVOKER | DEFINER} "security": EnumType(["NONE", "INVOKER", "DEFINER"], normalized_type="str"), - # Materialized view refresh properties (StarRocks uses REFRESH ...) # - refresh_moment: IMMEDIATE | DEFERRED "refresh_moment": EnumType(["IMMEDIATE", "DEFERRED"], normalized_type="str"), # - refresh_scheme: ASYNC | ASYNC [START (...) 
EVERY (INTERVAL ...)] | MANUAL # it should be a string/literal if START/EVERY is present, other than ASYNC "refresh_scheme": RefreshSchemeInputSpec, - # Note: All other properties not listed here will be handled, an example here "replication_num": GenericPropertyInputSpec, } - # Default output spec for properties not in PROPERTY_OUTPUT_SPECS GenericPropertyOutputSpec = StringType() @@ -1341,10 +1298,10 @@ class PropertySpecs: - order_by: List[exp.Expression] - columns - generic properties: str - normalized string values """ + GeneralColumnListOutputSpec: DeclarativeType = SequenceOf(ColumnType(), allow_single=False) + PROPERTY_OUTPUT_SPECS: t.Dict[str, DeclarativeType] = { - "primary_key": ( - GeneralColumnListOutputSpec := SequenceOf(ColumnType(), allow_single=False) - ), + "primary_key": GeneralColumnListOutputSpec, "duplicate_key": GeneralColumnListOutputSpec, "unique_key": GeneralColumnListOutputSpec, "aggregate_key": GeneralColumnListOutputSpec, @@ -1424,12 +1381,12 @@ class PropertyValidator: # Centralized property alias configuration # Maps canonical name -> list of valid aliases - PROPERTY_ALIASES: t.Dict[str, t.List[str]] = { + PROPERTY_ALIASES: t.Dict[str, t.Set[str]] = { "partitioned_by": {"partition_by"}, "clustered_by": {"order_by"}, } - EXCLUSIVE_PROPERTY_NAME_MAP: t.Dict[str, t.List[str]] = { + EXCLUSIVE_PROPERTY_NAME_MAP: t.Dict[str, t.Set[str]] = { "key_type": set(TABLE_KEY_TYPES), **PROPERTY_ALIASES, } @@ -1473,11 +1430,7 @@ def ensure_parenthesized(value: t.Any) -> t.Any: if isinstance(value, exp.Literal) and value.is_string: value = value.this # Extract string content from Column (quoted) - elif ( - isinstance(value, exp.Column) - and hasattr(value.this, "quoted") - and value.this.quoted - ): + elif isinstance(value, exp.Column) and hasattr(value.this, "quoted") and value.this.quoted: value = value.name # Column.name returns the string elif not isinstance(value, str): return value @@ -1523,9 +1476,7 @@ def validate_and_normalize_property( >>> 
validated = PropertyValidator.validate_and_normalize_property("distributed_by", "RANDOM") >>> # Result: "RANDOM" (string from EnumType) """ - logger.debug( - "validate_and_normalize_property. value: %s, type: %s", value, type(value) - ) + logger.debug("validate_and_normalize_property. value: %s, type: %s", value, type(value)) # Step 1: Optionally preprocess string with parentheses if preprocess_parentheses: @@ -1539,9 +1490,7 @@ def validate_and_normalize_property( # Step 3: Validate validated = input_spec.validate(value) if validated is None: - raise SQLMeshError( - f"Invalid value type for property '{property_name}': {value!r}." - ) + raise SQLMeshError(f"Invalid value type for property '{property_name}': {value!r}.") # Step 4: Normalize normalized = input_spec.normalize(validated) @@ -1608,7 +1557,7 @@ def check_at_most_one( property_name: str, property_description: str, table_properties: t.Dict[str, t.Any], - exclusive_property_names: t.Set[str] = None, + exclusive_property_names: t.Optional[t.Set[str]] = None, parameter_value: t.Optional[t.Any] = None, ) -> t.Optional[str]: """ @@ -1640,17 +1589,14 @@ def check_at_most_one( Only one is allowed. 
""" if not exclusive_property_names: - exclusive_property_names = ( - PropertyValidator.EXCLUSIVE_PROPERTY_NAME_MAP.get(property_name, set()) - | {property_name} - ) + exclusive_property_names = PropertyValidator.EXCLUSIVE_PROPERTY_NAME_MAP.get( + property_name, set() + ) | {property_name} # logger.debug("Checking at most one property for '%s': %s", property_name, exclusive_property_names) # Check parameter first (highest priority) if parameter_value is not None: # Check if any conflicting properties exist in table_properties - conflicts = [ - name for name in exclusive_property_names if name in table_properties - ] + conflicts = [name for name in exclusive_property_names if name in table_properties] if conflicts: param_display = f"{property_name} (parameter)" raise SQLMeshError( @@ -1661,9 +1607,7 @@ def check_at_most_one( return None # Check table_properties for multiple definitions - present = [ - name for name in exclusive_property_names if name in table_properties - ] + present = [name for name in exclusive_property_names if name in table_properties] # logger.debug("Get table key names for %s from table_properties: %s", property_name, present) if len(present) > 1: @@ -1835,16 +1779,14 @@ def _get_data_objects( # StarRocks may treat information_schema table_name comparisons as case-sensitive. # Use LOWER(table_name) to match case-insensitively. 
lowered_names = [name.lower() for name in object_names] - query = query.where( - exp.func("LOWER", exp.column("table_name")).isin(*lowered_names) - ) + query = query.where(exp.func("LOWER", exp.column("table_name")).isin(*lowered_names)) df = self.fetchdf(query) objects = [ DataObject( schema=row.schema_name, name=row.name, - type=DataObjectType.from_str(row.type), + type=DataObjectType.from_str(str(row.type)), ) for row in df.itertuples() ] @@ -1867,14 +1809,18 @@ def _get_data_objects( ) mv_df = self.fetchdf(mv_query) - mv_names: t.Set[str] = {t.cast(str, r.name).lower() for r in mv_df.itertuples() if r.name} + mv_names: t.Set[str] = { + t.cast(str, r.name).lower() for r in mv_df.itertuples() if r.name + } if mv_names: for obj in objects: if obj.name.lower() in mv_names: obj.type = DataObjectType.MATERIALIZED_VIEW except Exception: - logger.warning(f"[StarRocks] Failed to get materialized views from information_schema.materialized_views") + logger.warning( + f"[StarRocks] Failed to get materialized views from information_schema.materialized_views" + ) return objects @@ -1961,49 +1907,46 @@ def delete_from( where: The where clause to filter rows to delete """ # Parse where clause if it's a string + where_expr: t.Optional[exp.Expression] if isinstance(where, str): from sqlglot import parse_one - where: exp.Expression = parse_one(where, dialect=self.dialect) + where_expr = parse_one(where, dialect=self.dialect) + else: + where_expr = where # If no where clause or WHERE TRUE, use TRUNCATE TABLE (for all table types) - if not where or where == exp.true(): - table_expr = ( - exp.to_table(table_name) if isinstance(table_name, str) else table_name - ) + if not where_expr or where_expr == exp.true(): + table_expr = exp.to_table(table_name) if isinstance(table_name, str) else table_name logger.info( f"Converting DELETE FROM {table_name} WHERE TRUE to TRUNCATE TABLE " "(StarRocks does not support WHERE TRUE in DELETE)" ) - self.execute( - f"TRUNCATE TABLE 
{table_expr.sql(dialect=self.dialect, identify=True)}" - ) + self.execute(f"TRUNCATE TABLE {table_expr.sql(dialect=self.dialect, identify=True)}") return # For non-PRIMARY KEY tables, apply WHERE clause restrictions # Note: We conservatively apply restrictions to all tables since we can't easily # determine table type at DELETE time. PRIMARY KEY tables will still work with # simplified conditions, while non-PRIMARY KEY tables require them. - if isinstance(where, exp.Expression): - original_where = where + if isinstance(where_expr, exp.Expression): + original_where = where_expr # Remove boolean literals (not supported in any table type) - where = self._where_clause_remove_boolean_literals(where) + where_expr = self._where_clause_remove_boolean_literals(where_expr) # Convert BETWEEN to >= AND <= (required for DUPLICATE/UNIQUE/AGGREGATE KEY tables) - where = self._where_clause_convert_between_to_comparison(where) + where_expr = self._where_clause_convert_between_to_comparison(where_expr) - if where != original_where: + if where_expr != original_where: logger.debug( f"Converted WHERE clause for StarRocks compatibility, table: {table_name}.\n" f" Original: {original_where.sql(dialect=self.dialect)}\n" - f" Converted: {where.sql(dialect=self.dialect)}" + f" Converted: {where_expr.sql(dialect=self.dialect)}" ) # Use parent implementation - super().delete_from(table_name, where) + super().delete_from(table_name, where_expr) - def _where_clause_remove_boolean_literals( - self, expression: exp.Expression - ) -> exp.Expression: + def _where_clause_remove_boolean_literals(self, expression: exp.Expression) -> exp.Expression: """ Remove TRUE/FALSE boolean literals from WHERE expressions. 
@@ -2027,14 +1970,10 @@ def transform(node: exp.Expression) -> exp.Expression: # Handle standalone TRUE/FALSE at the top level if node == exp.true(): # Convert TRUE to 1=1 - return exp.EQ( - this=exp.Literal.number(1), expression=exp.Literal.number(1) - ) + return exp.EQ(this=exp.Literal.number(1), expression=exp.Literal.number(1)) elif node == exp.false(): # noqa: RET505 # Convert FALSE to 1=0 - return exp.EQ( - this=exp.Literal.number(1), expression=exp.Literal.number(0) - ) + return exp.EQ(this=exp.Literal.number(1), expression=exp.Literal.number(0)) # Handle AND expressions elif isinstance(node, exp.And): @@ -2126,21 +2065,38 @@ def execute( """ from sqlglot.helper import ensure_list + if isinstance(expressions, str): + super().execute( + expressions, + ignore_unsupported_errors=ignore_unsupported_errors, + quote_identifiers=quote_identifiers, + track_rows_processed=track_rows_processed, + **kwargs, + ) + return + # Process expressions to remove FOR UPDATE - processed_expressions = [] + processed_expressions: t.List[exp.Expression] = [] for e in ensure_list(expressions): - if isinstance(e, exp.Expression): - # Remove lock (FOR UPDATE) from SELECT statements - if isinstance(e, exp.Select) and e.args.get("locks"): - e = e.copy() - e.set("locks", None) - logger.warning(f"[StarRocks] Removed FOR UPDATE from SELECT statement: " - f"{e.sql(dialect=self.dialect, identify=quote_identifiers)}") - processed_expressions.append(e) - else: - # For string SQL, we can't easily remove FOR UPDATE without parsing - # Just pass through and let StarRocks reject it if present - processed_expressions.append(e) + if not isinstance(e, exp.Expression): + super().execute( + expressions, + ignore_unsupported_errors=ignore_unsupported_errors, + quote_identifiers=quote_identifiers, + track_rows_processed=track_rows_processed, + **kwargs, + ) + return + + # Remove lock (FOR UPDATE) from SELECT statements + if isinstance(e, exp.Select) and e.args.get("locks"): + e = e.copy() + e.set("locks", 
None) + logger.warning( + f"[StarRocks] Removed FOR UPDATE from SELECT statement: " + f"{e.sql(dialect=self.dialect, identify=quote_identifiers)}" + ) + processed_expressions.append(e) # Call parent execute with processed expressions super().execute( @@ -2238,7 +2194,8 @@ def _create_table_from_columns( ) logger.debug( "_create_table_from_columns: extracted key_type=%s, key_columns=%s", - key_type, key_columns + key_type, + key_columns, ) # IMPORTANT: Normalize parameter primary_key into table_properties for unified handling @@ -2248,7 +2205,9 @@ def _create_table_from_columns( table_properties["primary_key"] = primary_key logger.debug("_create_table_from_columns: unified primary_key into table_properties") elif key_type: - logger.debug("table key type '%s' may be handled in _build_table_key_property", key_type) + logger.debug( + "table key type '%s' may be handled in _build_table_key_property", key_type + ) # StarRocks key column ordering constraint: All key types need reordering if key_columns: @@ -2400,10 +2359,12 @@ def _create_materialized_view( partitioned_by = materialized_properties.get("partitioned_by") clustered_by = materialized_properties.get("clustered_by") partition_interval_unit = materialized_properties.get("partition_interval_unit") - logger.debug(f"Get info from materialized_properties: {materialized_properties}, " - f"partitioned_by: {partitioned_by}, " - f"clustered_by: {clustered_by}, " - f"partition_interval_unit: {partition_interval_unit}") + logger.debug( + f"Get info from materialized_properties: {materialized_properties}, " + f"partitioned_by: {partitioned_by}, " + f"clustered_by: {clustered_by}, " + f"partition_interval_unit: {partition_interval_unit}" + ) properties_exp = self._build_table_properties_exp( catalog_name=target_table.catalog, @@ -2547,9 +2508,7 @@ def _build_table_properties_exp( property_description="key type", table_properties=table_properties_copy, ) - logger.debug( - "_build_table_properties_exp: active_key_type='%s'", 
active_key_type - ) + logger.debug("_build_table_properties_exp: active_key_type='%s'", active_key_type) if is_mv and active_key_type: raise SQLMeshError( f"You can't specify the table type when the table is a materialized view. " @@ -2573,9 +2532,7 @@ def _build_table_properties_exp( ) # 1. Handle key constraints (ALL types including PRIMARY KEY) - key_prop = self._build_table_key_property( - table_properties_copy, active_key_type - ) + key_prop = self._build_table_key_property(table_properties_copy, active_key_type) if key_prop: properties.append(key_prop) logger.debug( @@ -2589,9 +2546,7 @@ def _build_table_properties_exp( if table_description: properties.append( exp.SchemaCommentProperty( - this=exp.Literal.string( - self._truncate_table_comment(table_description) - ) + this=exp.Literal.string(self._truncate_table_comment(table_description)) ) ) @@ -2612,14 +2567,10 @@ def _build_table_properties_exp( type(partition_prop).__name__, ) else: - logger.debug( - "_build_table_properties_exp: partition_prop skipped (not defined)" - ) + logger.debug("_build_table_properties_exp: partition_prop skipped (not defined)") # 4. Handle distributed_by (DISTRIBUTED BY HASH/RANDOM) - distributed_prop = self._build_distributed_by_property( - table_properties_copy, key_columns - ) + distributed_prop = self._build_distributed_by_property(table_properties_copy, key_columns) if distributed_prop: properties.append(distributed_prop) logger.debug( @@ -2627,23 +2578,22 @@ def _build_table_properties_exp( type(distributed_prop).__name__, ) else: - logger.debug( - "_build_table_properties_exp: distributed_prop skipped (not defined)" - ) + logger.debug("_build_table_properties_exp: distributed_prop skipped (not defined)") # 5. Handle refresh_property (REFRESH ...) 
if is_mv: refresh_prop = self._build_refresh_property(table_properties_copy) if refresh_prop: properties.append(refresh_prop) - logger.debug("_build_table_properties_exp: generated refresh_prop=%s", type(refresh_prop).__name__) + logger.debug( + "_build_table_properties_exp: generated refresh_prop=%s", + type(refresh_prop).__name__, + ) else: logger.debug("_build_table_properties_exp: refresh_prop skipped (not defined)") # 6. Handle order_by/clustered_by (ORDER BY ...) - order_prop = self._build_order_by_property( - table_properties_copy, clustered_by or None - ) + order_prop = self._build_order_by_property(table_properties_copy, clustered_by or None) if order_prop: properties.append(order_prop) logger.debug( @@ -2651,9 +2601,7 @@ def _build_table_properties_exp( type(order_prop).__name__, ) else: - logger.debug( - "_build_table_properties_exp: order_prop skipped (not defined)" - ) + logger.debug("_build_table_properties_exp: order_prop skipped (not defined)") # 5. Handle other properties (replication_num, storage_medium, etc.) 
other_props = self._build_other_properties(table_properties_copy) @@ -2686,13 +2634,13 @@ def _build_view_properties_exp( view_properties_copy = dict(view_properties) security = view_properties_copy.pop("security", None) if security is not None: - security_text = PropertyValidator.validate_and_normalize_property("security", security) + security_text = PropertyValidator.validate_and_normalize_property( + "security", security + ) # exp.SecurityProperty renders as `SECURITY ` (no '=') properties.append(exp.SecurityProperty(this=exp.Var(this=security_text))) - properties.extend( - self._table_or_view_properties_to_expressions(view_properties_copy) - ) + properties.extend(self._table_or_view_properties_to_expressions(view_properties_copy)) if properties: return exp.Properties(expressions=properties) @@ -2828,15 +2776,11 @@ def _build_partition_property( ) if not partitioned_by: - logger.debug( - "_build_partition_property: no 'partitioned_by' defined, skipped" - ) + logger.debug("_build_partition_property: no 'partitioned_by' defined, skipped") return None # Parse partition expressions to extract columns and kind (RANGE/LIST) - partition_kind, partition_cols = self._parse_partition_expressions( - partitioned_by - ) + partition_kind, partition_cols = self._parse_partition_expressions(partitioned_by) logger.debug( "_build_partition_property: partition_kind=%s, partition_cols=%s", partition_kind, @@ -2853,9 +2797,7 @@ def extract_column_name(expr: exp.Expression) -> t.Optional[str]: # Validate partition columns are in key columns (StarRocks requirement) if key_columns: - partition_col_names = set( - extract_column_name(expr) for expr in partition_cols - ) - {None} + partition_col_names = set(extract_column_name(expr) for expr in partition_cols) - {None} key_cols_set = set(key_columns) not_in_key = partition_col_names - key_cols_set if not_in_key: @@ -3015,6 +2957,8 @@ def _build_partitioned_by_exp( elif partition_kind is None: return 
exp.PartitionedByProperty(this=exp.tuple_(*partitioned_by)) + return None + def _build_distributed_by_property( self, table_properties: t.Dict[str, t.Any], @@ -3045,9 +2989,7 @@ def _build_distributed_by_property( # No default - if not set, return None if distributed_by is None: - logger.debug( - "_build_distributed_by_property: no 'distributed_by' defined, skipped" - ) + logger.debug("_build_distributed_by_property: no 'distributed_by' defined, skipped") return None logger.debug( @@ -3150,7 +3092,9 @@ def _build_refresh_property( if isinstance(scheme_text, exp.Var): kind_expr = scheme_text else: - kind_expr, starts_expr, every_expr, unit_expr = self._parse_refresh_scheme(scheme_text) + kind_expr, starts_expr, every_expr, unit_expr = self._parse_refresh_scheme( + scheme_text + ) return exp.RefreshTriggerProperty( method=method_expr, @@ -3191,11 +3135,15 @@ def _parse_refresh_scheme( starts_expr: t.Optional[exp.Expression] = None every_expr: t.Optional[exp.Expression] = None unit_expr: t.Optional[exp.Expression] = None - m_start = re.search(r"\bSTART\s*\(\s*(?:'([^']*)'|\"([^\"]*)\"|([^)]*))\s*\)", text, flags=re.IGNORECASE) + m_start = re.search( + r"\bSTART\s*\(\s*(?:'([^']*)'|\"([^\"]*)\"|([^)]*))\s*\)", text, flags=re.IGNORECASE + ) if m_start: start_inner = (m_start.group(1) or m_start.group(2) or m_start.group(3) or "").strip() starts_expr = exp.Literal.string(start_inner) - m_every = re.search(r"\bEVERY\s*\(\s*INTERVAL\s+(\d+)\s+(\w+)\s*\)", text, flags=re.IGNORECASE) + m_every = re.search( + r"\bEVERY\s*\(\s*INTERVAL\s+(\d+)\s+(\w+)\s*\)", text, flags=re.IGNORECASE + ) if m_every: every_expr = exp.Literal.number(int(m_every.group(1))) unit_expr = exp.Var(this=m_every.group(2).upper()) @@ -3240,9 +3188,7 @@ def _parse_distribution_with_buckets( return None # Split on BUCKETS (case-insensitive) - match = re.match( - r"^(.+?)\s+BUCKETS\s+(\d+)\s*$", text.strip(), flags=re.IGNORECASE - ) + match = re.match(r"^(.+?)\s+BUCKETS\s+(\d+)\s*$", text.strip(), 
flags=re.IGNORECASE) if not match: return None @@ -3250,9 +3196,7 @@ def _parse_distribution_with_buckets( buckets_str = match.group(2) # Parse the HASH/RANDOM part via SPEC - normalized = PropertyValidator.validate_and_normalize_property( - "distributed_by", hash_part - ) + normalized = PropertyValidator.validate_and_normalize_property("distributed_by", hash_part) logger.debug( "_parse_distribution_with_buckets: parsed hash part: %s, type: %s", normalized, @@ -3319,9 +3263,7 @@ def _build_order_by_property( logger.debug("_build_order_by_property: no 'clustered_by' defined, skipped") return None - def _build_other_properties( - self, table_properties: t.Dict[str, t.Any] - ) -> t.List[exp.Property]: + def _build_other_properties(self, table_properties: t.Dict[str, t.Any]) -> t.List[exp.Property]: """ Build other literal properties (replication_num, storage_medium, etc.). @@ -3339,9 +3281,7 @@ def _build_other_properties( for key, value in list(table_properties.items()): # Skip special keys handled elsewhere if key in PropertyValidator.IMPORTANT_PROPERTY_NAMES: - logger.warning( - f"[StarRocks] {key!r} should have been processed already, skipping" - ) + logger.warning(f"[StarRocks] {key!r} should have been processed already, skipping") continue # Remove from properties @@ -3350,9 +3290,7 @@ def _build_other_properties( # Validate and normalize to string # All other properties are treated as generic string properties try: - normalized = PropertyValidator.validate_and_normalize_property( - key, value - ) + normalized = PropertyValidator.validate_and_normalize_property(key, value) other_props.append( exp.Property( this=exp.to_identifier(key), @@ -3360,9 +3298,7 @@ def _build_other_properties( ) ) except SQLMeshError as e: - logger.warning( - "[StarRocks] skipping property %s due to error: %s", key, e - ) + logger.warning("[StarRocks] skipping property %s due to error: %s", key, e) return other_props @@ -3514,9 +3450,9 @@ def _build_create_comment_table_exp( SQL string 
for ALTER TABLE COMMENT """ table_sql = table.sql(dialect=self.dialect, identify=True) - comment_sql = exp.Literal.string( - self._truncate_table_comment(table_comment) - ).sql(dialect=self.dialect) + comment_sql = exp.Literal.string(self._truncate_table_comment(table_comment)).sql( + dialect=self.dialect + ) return f"ALTER TABLE {table_sql} COMMENT = {comment_sql}" def _build_create_comment_column_exp( @@ -3549,13 +3485,11 @@ def _build_create_comment_column_exp( SQL string for ALTER TABLE MODIFY COLUMN with COMMENT """ table_sql = table.sql(dialect=self.dialect, identify=True) - column_sql = exp.to_identifier(column_name).sql( - dialect=self.dialect, identify=True - ) + column_sql = exp.to_identifier(column_name).sql(dialect=self.dialect, identify=True) - comment_sql = exp.Literal.string( - self._truncate_column_comment(column_comment) - ).sql(dialect=self.dialect) + comment_sql = exp.Literal.string(self._truncate_column_comment(column_comment)).sql( + dialect=self.dialect + ) return f"ALTER TABLE {table_sql} MODIFY COLUMN {column_sql} COMMENT {comment_sql}" diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py index 4e3aaa13a8..55046427bc 100644 --- a/sqlmesh/core/snapshot/evaluator.py +++ b/sqlmesh/core/snapshot/evaluator.py @@ -2053,14 +2053,14 @@ def _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( complex DELETE/MERGE statements remain supported. 
""" + properties = dict(physical_properties or {}) + if ( model.dialect != "starrocks" or not model.kind.is_incremental_by_unique_key - or "primary_key" in physical_properties + or "primary_key" in properties ): - return physical_properties - - properties = dict(physical_properties or {}) + return properties unique_key: t.Optional[t.List[exp.Expression]] = model.unique_key if unique_key: properties["primary_key"] = ( @@ -2092,7 +2092,8 @@ def create( ctas_query = model.ctas_query(**render_kwargs) physical_properties = kwargs.get("physical_properties", model.physical_properties) physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties) + model, physical_properties + ) logger.info("Creating table '%s'", table_name) if model.annotated: @@ -2209,7 +2210,8 @@ def _replace_query_for_model( physical_properties = kwargs.get("physical_properties", model.physical_properties) physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties) + model, physical_properties + ) self.adapter.replace_query( name, query_or_df, @@ -2354,7 +2356,8 @@ def insert( ) physical_properties = kwargs.get("physical_properties", model.physical_properties) physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties) + model, physical_properties + ) self.adapter.merge( table_name, query_or_df, @@ -2383,7 +2386,8 @@ def append( ) physical_properties = kwargs.get("physical_properties", model.physical_properties) physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties) + model, physical_properties + ) self.adapter.merge( table_name, query_or_df, @@ -3175,7 +3179,8 @@ def create( logger.info("Creating managed table: %s", table_name) physical_properties = kwargs.get("physical_properties", model.physical_properties) physical_properties = 
_ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties) + model, physical_properties + ) self.adapter.create_managed_table( table_name=table_name, query=model.render_query_or_raise(**render_kwargs), diff --git a/tests/core/engine_adapter/integration/test_integration_starrocks.py b/tests/core/engine_adapter/integration/test_integration_starrocks.py index 135c1443af..eee2b2054b 100644 --- a/tests/core/engine_adapter/integration/test_integration_starrocks.py +++ b/tests/core/engine_adapter/integration/test_integration_starrocks.py @@ -65,23 +65,35 @@ def _model_name_from_table(table: exp.Table) -> str: def normalize_sql(sql: str) -> str: """Normalizes a SQL string for comparison.""" # Remove comments - sql = re.sub(r'--.*\n', '', sql) + sql = re.sub(r"--.*\n", "", sql) # Replace newlines and tabs with spaces - sql = sql.replace('\n', ' ').replace('\t', '') + sql = sql.replace("\n", " ").replace("\t", "") # Collapse multiple spaces into one - sql = re.sub(r'\s+', ' ', sql) + sql = re.sub(r"\s+", " ", sql) # Remove spaces around parentheses, commas, and equals for consistency - sql = re.sub(r'\s*\(\s*', '(', sql) - sql = re.sub(r'\s*\)\s*', ')', sql) - sql = re.sub(r'\s*,\s*', ',', sql) - sql = re.sub(r'\s*=\s*', '=', sql) + sql = re.sub(r"\s*\(\s*", "(", sql) + sql = re.sub(r"\s*\)\s*", ")", sql) + sql = re.sub(r"\s*,\s*", ",", sql) + sql = re.sub(r"\s*=\s*", "=", sql) # Remove all paired backticks around identifiers - sql = re.sub(r'`([^`]+)`', r'\1', sql) - sql = re.sub(r'\'', '"', sql) + sql = re.sub(r"`([^`]+)`", r"\1", sql) + sql = re.sub(r"\'", '"', sql) return sql.strip() +Row = t.Tuple[t.Any, ...] 
+ + +def expect_row(row: t.Optional[Row]) -> Row: + assert row is not None + return row + + +def fetchone_or_fail(adapter: StarRocksEngineAdapter, query: t.Any) -> Row: + return expect_row(adapter.fetchone(query)) + + # ============================================================================= # TestContext-based Integration Tests # ============================================================================= @@ -101,13 +113,13 @@ def starrocks_connection_config() -> t.Dict[str, t.Any]: return { "host": os.getenv("STARROCKS_HOST", "localhost"), "port": int(os.getenv("STARROCKS_PORT", "9030")), - "user": os.getenv("STARROCKS_USER", "myname"), - "password": os.getenv("STARROCKS_PASSWORD", "pswd1234"), + "user": os.getenv("STARROCKS_USER", "root"), + "password": os.getenv("STARROCKS_PASSWORD", ""), } @pytest.fixture -def ctx(tmp_path, starrocks_connection_config) -> t.Iterable[TestContext]: +def ctx(tmp_path, starrocks_connection_config) -> t.Generator[TestContext, None, None]: """ A lightweight TestContext fixture which avoids loading the full integration gateway config. @@ -143,7 +155,9 @@ def engine_adapter(ctx: TestContext) -> StarRocksEngineAdapter: @pytest.fixture(scope="module") -def starrocks_adapter(starrocks_connection_config) -> StarRocksEngineAdapter: +def starrocks_adapter( + starrocks_connection_config, +) -> t.Generator[StarRocksEngineAdapter, None, None]: """Create a real StarRocks adapter connected to database. It's still used in a lot of tests, so it can't be removed yet. """ @@ -157,6 +171,62 @@ def starrocks_adapter(starrocks_connection_config) -> StarRocksEngineAdapter: # Cleanup: adapter will auto-close connection +@pytest.fixture(scope="module", autouse=True) +def init_test_integration_env(starrocks_adapter: StarRocksEngineAdapter) -> None: + """ + Auto-adjust default_replication_num for small shared-nothing clusters. 
+ + If run_mode is shared_nothing and available backends < 3, set default_replication_num = 1 + to prevent replica-creation failures in tests. + """ + + def _get_config_value(name: str) -> t.Optional[str]: + try: + row = starrocks_adapter.fetchone(f"ADMIN SHOW FRONTEND CONFIG LIKE '{name}'") + except Exception as e: # pragma: no cover - defensive for older SR versions + logger.warning("Skipping config lookup %s: %s", name, e) + return None + if not row or len(row) < 3: + logger.warning("Unexpected result for %s: %s", name, row) + return None + return str(row[2]).strip() + + run_mode = _get_config_value("run_mode") + if not run_mode or run_mode.lower() != "shared_nothing": + return + + try: + backends = starrocks_adapter.fetchall("SHOW BACKENDS") + except Exception as e: # pragma: no cover - defensive for older SR versions + logger.warning("Skipping backend count check: %s", e) + return + + be_count = len(backends) + if be_count >= 3: + return + + current_replication = _get_config_value("default_replication_num") + try: + current_replication_int = int(current_replication) if current_replication is not None else None + except Exception: + current_replication_int = None + + if current_replication_int is not None and current_replication_int <= be_count: + return + + try: + starrocks_adapter.execute('ADMIN SET FRONTEND CONFIG ("default_replication_num" = "1")') + logger.info( + "Set default_replication_num=1 for shared_nothing cluster with %s backends (was %s)", + be_count, + current_replication, + ) + except Exception as e: # pragma: no cover - do not break tests if lacking privilege + logger.warning( + "Failed to set default_replication_num for shared_nothing cluster: %s", e + ) + + class TestBasicOperations: """ Basic Operations @@ -171,22 +241,21 @@ def test_create_drop_schema(self, ctx: TestContext, engine_adapter: StarRocksEng # CREATE DATABASE engine_adapter.create_schema(db_name, ignore_if_exists=True) - result = engine_adapter.fetchone( - f"SELECT SCHEMA_NAME 
FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{db_name}'" + result = fetchone_or_fail( + engine_adapter, + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{db_name}'", ) - assert result is not None, "CREATE DATABASE failed" assert result[0] == db_name # DROP DATABASE engine_adapter.drop_schema(db_name) - result = engine_adapter.fetchone( + result: t.Optional[Row] = engine_adapter.fetchone( f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{db_name}'" ) assert result is None, "DROP DATABASE failed" def test_create_drop_table(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): - """Test CREATE TABLE and DROP TABLE (TestContext version). - """ + """Test CREATE TABLE and DROP TABLE (TestContext version).""" table = ctx.table("sr_test_table") engine_adapter.create_table( @@ -246,18 +315,18 @@ def test_create_table_like_preserves_metadata_and_copies_no_data( engine_adapter.create_table_like(target, source, exists=True) # Like should not copy data. - src_count = engine_adapter.fetchone( - f"SELECT COUNT(*) FROM {source.sql(dialect=ctx.dialect, identify=True)}" + src_count = fetchone_or_fail( + engine_adapter, f"SELECT COUNT(*) FROM {source.sql(dialect=ctx.dialect, identify=True)}" )[0] - tgt_count = engine_adapter.fetchone( - f"SELECT COUNT(*) FROM {target.sql(dialect=ctx.dialect, identify=True)}" + tgt_count = fetchone_or_fail( + engine_adapter, f"SELECT COUNT(*) FROM {target.sql(dialect=ctx.dialect, identify=True)}" )[0] assert src_count == 2 assert tgt_count == 0 # Like should preserve key metadata (engine-defined behavior). 
- ddl = engine_adapter.fetchone( - f"SHOW CREATE TABLE {target.sql(dialect=ctx.dialect, identify=True)}" + ddl = fetchone_or_fail( + engine_adapter, f"SHOW CREATE TABLE {target.sql(dialect=ctx.dialect, identify=True)}" )[1] ddl_upper = ddl.upper() assert "PRIMARY KEY" in ddl_upper @@ -283,7 +352,6 @@ def test_create_table_like_exists_false_raises( with pytest.raises(Exception): engine_adapter.create_table_like(target, source, exists=False) - def test_delete(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): """Test DELETE operation (TestContext version).""" table = ctx.table("sr_test_table") @@ -301,7 +369,7 @@ def test_delete(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): ) engine_adapter.delete_from(table, "id = 2") - count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM {table_sql}") + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {table_sql}") assert count[0] == 1, "DELETE failed" def test_rename_table(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): @@ -339,7 +407,7 @@ def test_rename_table(self, ctx: TestContext, engine_adapter: StarRocksEngineAda ) assert new_exists is not None, "New table should exist after rename" - count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM {new_table_sql}") + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {new_table_sql}") assert count[0] == 1, "Data should be preserved after rename" def test_create_index(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): @@ -358,8 +426,8 @@ def test_create_index(self, ctx: TestContext, engine_adapter: StarRocksEngineAda # CREATE INDEX (should be skipped silently) engine_adapter.create_index(table, "idx_name", ("name",)) - count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM {table_sql}") - assert count is not None, "Table should still be functional after skipped index creation" + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {table_sql}") + assert count[0] >= 0, "Table should 
still be functional after skipped index creation" def test_create_drop_view(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): """Test CREATE VIEW and DROP VIEW (TestContext version).""" @@ -379,18 +447,19 @@ def test_create_drop_view(self, ctx: TestContext, engine_adapter: StarRocksEngin db_name = view.db view_name = view.name - result = engine_adapter.fetchone( + result = fetchone_or_fail( + engine_adapter, f"SELECT TABLE_NAME FROM information_schema.VIEWS " - f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{view_name}'" + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{view_name}'", ) - assert result is not None, "CREATE VIEW failed" + assert result, "CREATE VIEW failed" engine_adapter.drop_view(view) - result = engine_adapter.fetchone( + result_optional: t.Optional[Row] = engine_adapter.fetchone( f"SELECT TABLE_NAME FROM information_schema.VIEWS " f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{view_name}'" ) - assert result is None, "DROP VIEW failed" + assert result_optional is None, "DROP VIEW failed" class TestViewAndMaterializedViewFeatures: @@ -442,7 +511,7 @@ def test_create_view_with_security( view_properties=model.virtual_properties, ) - ddl = engine_adapter.fetchone(f"SHOW CREATE VIEW {view_sql_ident}")[1] + ddl = fetchone_or_fail(engine_adapter, f"SHOW CREATE VIEW {view_sql_ident}")[1] assert "SECURITY INVOKER" in ddl.upper() def test_create_view_replace_flag( @@ -460,9 +529,7 @@ def test_create_view_replace_flag( "name": exp.DataType.build("VARCHAR(100)"), }, ) - engine_adapter.execute( - f"INSERT INTO {source_sql_ident} (id, name) VALUES (1, 'A')" - ) + engine_adapter.execute(f"INSERT INTO {source_sql_ident} (id, name) VALUES (1, 'A')") model_sql = f""" MODEL ( @@ -584,17 +651,21 @@ def test_materialized_view_combo_with_materialized_properties( column_descriptions=model.column_descriptions, ) - ddl = engine_adapter.fetchone(f"SHOW CREATE MATERIALIZED VIEW {mv_sql}")[1] + ddl = fetchone_or_fail(engine_adapter, f"SHOW 
CREATE MATERIALIZED VIEW {mv_sql}")[1] logger.debug(f"mv ddl: {ddl}") ddl_upper = normalize_sql(ddl).upper() assert "REFRESH DEFERRED ASYNC" in ddl_upper - assert "START('2025-01-01 00:00:00')EVERY(INTERVAL 5 MINUTE)" in ddl_upper \ + assert ( + "START('2025-01-01 00:00:00')EVERY(INTERVAL 5 MINUTE)" in ddl_upper or 'START("2025-01-01 00:00:00")EVERY(INTERVAL 5 MINUTE)' in ddl_upper + ) assert "PARTITION BY(EVENT_DATE)" in ddl_upper assert "ORDER BY(CUSTOMER_ID,REGION)" in ddl_upper assert "DISTRIBUTED BY HASH(ORDER_ID)BUCKETS 8" in ddl_upper - assert "COMMENT 'MV COMBO A DESCRIPTION'" in ddl_upper \ + assert ( + "COMMENT 'MV COMBO A DESCRIPTION'" in ddl_upper or 'COMMENT "MV COMBO A DESCRIPTION"' in ddl_upper + ) def test_materialized_view_combo_all_properties_block( self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter @@ -657,7 +728,7 @@ def test_materialized_view_combo_all_properties_block( column_descriptions=model.column_descriptions, ) - ddl = engine_adapter.fetchone(f"SHOW CREATE MATERIALIZED VIEW {mv_sql}")[1] + ddl = fetchone_or_fail(engine_adapter, f"SHOW CREATE MATERIALIZED VIEW {mv_sql}")[1] ddl_upper = normalize_sql(ddl).upper() assert "REFRESH MANUAL" in ddl_upper assert "PARTITION P202401" not in ddl_upper # ignored when MV @@ -665,8 +736,10 @@ def test_materialized_view_combo_all_properties_block( assert "PARTITION BY(EVENT_DATE)" in ddl_upper assert "ORDER BY(ORDER_ID,EVENT_DATE)" in ddl_upper assert "DISTRIBUTED BY HASH(ORDER_ID,CUSTOMER_ID)BUCKETS 4" in ddl_upper - assert "COMMENT 'ANALYTICS MV COMBO B'" in ddl_upper \ + assert ( + "COMMENT 'ANALYTICS MV COMBO B'" in ddl_upper or 'COMMENT "ANALYTICS MV COMBO B"' in ddl_upper + ) class TestTableFeatures: @@ -677,7 +750,9 @@ class TestTableFeatures: Focus on independent functionality like comments and data type compatibility. 
""" - def test_table_and_column_comments(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + def test_table_and_column_comments( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): """Test table and column comments.""" table = ctx.table("sr_comment_table") db_name = table.db @@ -698,9 +773,10 @@ def test_table_and_column_comments(self, ctx: TestContext, engine_adapter: StarR ) # Verify table comment - result = engine_adapter.fetchone( + result = fetchone_or_fail( + engine_adapter, f"SELECT TABLE_COMMENT FROM information_schema.TABLES " - f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}'" + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}'", ) assert result[0] == "Test table comment", "Table comment not set" @@ -773,11 +849,13 @@ def test_multiple_data_types(self, ctx: TestContext, engine_adapter: StarRocksEn ) # Verify insertion - count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM {table_sql}") + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {table_sql}") assert count[0] == 1, "Data insertion with basic types failed" # Verify data retrieval - result = engine_adapter.fetchone(f"SELECT col_int, col_varchar, col_date FROM {table_sql}") + result = fetchone_or_fail( + engine_adapter, f"SELECT col_int, col_varchar, col_date FROM {table_sql}" + ) assert result[0] == 2147483647 assert result[1] == "test varchar" @@ -859,12 +937,12 @@ def test_complex_data_types(self, ctx: TestContext, engine_adapter: StarRocksEng ) # Verify insertion - count = engine_adapter.fetchone(f"SELECT COUNT(*) FROM {table_sql}") + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {table_sql}") assert count[0] == 1, "Data insertion with complex nested types failed" # Verify data retrieval for simple types - result = engine_adapter.fetchone( - f"SELECT col_array_simple, col_struct_simple FROM {table_sql}" + result = fetchone_or_fail( + engine_adapter, f"SELECT col_array_simple, col_struct_simple 
FROM {table_sql}" ) assert result is not None, "Failed to retrieve complex type data" @@ -983,7 +1061,7 @@ def test_e2e_model_parameters(self, starrocks_adapter: StarRocksEngineAdapter): params = self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Case 1 DDL:\n{ddl}") @@ -999,17 +1077,16 @@ def test_e2e_model_parameters(self, starrocks_adapter: StarRocksEngineAdapter): # Verify function expression and column references assert ( # "from_unixtime" in part_cols or "ts" in part_cols - "__generated_partition_column_" in part_cols - and "region" in part_cols + "__generated_partition_column_" in part_cols and "region" in part_cols ), f"Expected partition expression with generated column/region, got {part_cols}" # Verify ORDER BY from clustered_by order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) assert order_match, "ORDER BY clause not found" order_cols = order_match.group(1) - assert ( - "order_id" in order_cols and "customer_id" in order_cols - ), f"Expected ORDER BY (order_id, customer_id), got {order_cols}" + assert "order_id" in order_cols and "customer_id" in order_cols, ( + f"Expected ORDER BY (order_id, customer_id), got {order_cols}" + ) finally: starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) @@ -1019,9 +1096,7 @@ def test_e2e_model_parameters(self, starrocks_adapter: StarRocksEngineAdapter): # Covers: primary_key (tuple), distributed_by (string multi-col), order_by (tuple), generic props # ======================================== - def test_e2e_physical_properties_core( - self, starrocks_adapter: StarRocksEngineAdapter - ): + def test_e2e_physical_properties_core(self, starrocks_adapter: StarRocksEngineAdapter): """ Test Case 2: Core physical_properties. 
@@ -1061,7 +1136,7 @@ def test_e2e_physical_properties_core( params = self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Case 2 DDL:\n{ddl}") @@ -1077,17 +1152,15 @@ def test_e2e_physical_properties_core( dist_match = re.search(r"DISTRIBUTED BY HASH\s*\(([^)]+)\)", ddl) assert dist_match, "DISTRIBUTED BY HASH clause not found" dist_cols = dist_match.group(1) - assert ( - "customer_id" in dist_cols and "region" in dist_cols - ), f"Expected HASH(customer_id, region), got HASH({dist_cols})" + assert "customer_id" in dist_cols and "region" in dist_cols, ( + f"Expected HASH(customer_id, region), got HASH({dist_cols})" + ) assert "BUCKETS 16" in ddl # Verify ORDER BY order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) assert order_match, "ORDER BY clause not found" - assert "order_id" in order_match.group(1) and "region" in order_match.group( - 1 - ) + assert "order_id" in order_match.group(1) and "region" in order_match.group(1) # assert "replication_num" not in ddl @@ -1099,9 +1172,7 @@ def test_e2e_physical_properties_core( # Covers: primary_key = "id, dt" auto-conversion # ======================================== - def test_e2e_string_no_paren_auto_wrap( - self, starrocks_adapter: StarRocksEngineAdapter - ): + def test_e2e_string_no_paren_auto_wrap(self, starrocks_adapter: StarRocksEngineAdapter): """ Test Case 3: String form without parentheses auto-wrap. 
@@ -1134,7 +1205,7 @@ def test_e2e_string_no_paren_auto_wrap( params = self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Case 3 DDL:\n{ddl}") @@ -1144,16 +1215,16 @@ def test_e2e_string_no_paren_auto_wrap( pk_match = re.search(r"PRIMARY KEY\s*\(([^)]+)\)", ddl) assert pk_match, "PRIMARY KEY clause not found" pk_clause = pk_match.group(1) - assert ( - "order_id" in pk_clause and "event_date" in pk_clause - ), f"Expected both order_id and event_date in PRIMARY KEY, got {pk_clause}" + assert "order_id" in pk_clause and "event_date" in pk_clause, ( + f"Expected both order_id and event_date in PRIMARY KEY, got {pk_clause}" + ) # Verify distributed_by with exact columns dist_match = re.search(r"DISTRIBUTED BY HASH\s*\(([^)]+)\)", ddl) assert dist_match, "DISTRIBUTED BY HASH clause not found" - assert "order_id" in dist_match.group( - 1 - ), f"Expected HASH(order_id), got HASH({dist_match.group(1)})" + assert "order_id" in dist_match.group(1), ( + f"Expected HASH(order_id), got HASH({dist_match.group(1)})" + ) assert "BUCKETS 10" in ddl finally: @@ -1164,9 +1235,7 @@ def test_e2e_string_no_paren_auto_wrap( # Covers: kind=HASH (unquoted), kind=RANDOM # ======================================== - def test_e2e_distribution_structured_hash( - self, starrocks_adapter: StarRocksEngineAdapter - ): + def test_e2e_distribution_structured_hash(self, starrocks_adapter: StarRocksEngineAdapter): """Test Case 4A: Structured HASH distribution with unquoted kind.""" db_name = "sr_e2e_dist_hash_db" table_name = f"{db_name}.sr_dist_hash_table" @@ -1194,7 +1263,7 @@ def test_e2e_distribution_structured_hash( params = self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - show_create = 
starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Case 4A DDL:\n{ddl}") @@ -1204,17 +1273,13 @@ def test_e2e_distribution_structured_hash( assert "DISTRIBUTED BY HASH" in ddl dist_match = re.search(r"DISTRIBUTED BY HASH\s*\(([^)]+)\)", ddl) assert dist_match, "DISTRIBUTED BY HASH clause not found" - assert "customer_id" in dist_match.group( - 1 - ) and "region" in dist_match.group(1) + assert "customer_id" in dist_match.group(1) and "region" in dist_match.group(1) assert "BUCKETS 16" in ddl finally: starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) - def test_e2e_distribution_structured_random( - self, starrocks_adapter: StarRocksEngineAdapter - ): + def test_e2e_distribution_structured_random(self, starrocks_adapter: StarRocksEngineAdapter): """Test Case 4B: Structured RANDOM distribution.""" db_name = "sr_e2e_dist_random_db" table_name = f"{db_name}.sr_dist_random_table" @@ -1243,7 +1308,7 @@ def test_e2e_distribution_structured_random( params = self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Case 4B DDL:\n{ddl}") @@ -1294,7 +1359,7 @@ def test_e2e_partition_range(self, starrocks_adapter: StarRocksEngineAdapter): params = self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Case 5 DDL:\n{ddl}") @@ -1351,7 +1416,7 @@ def test_e2e_partition_list(self, starrocks_adapter: StarRocksEngineAdapter): params = 
self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Case 6 DDL:\n{ddl}") @@ -1403,7 +1468,7 @@ def test_e2e_key_type_duplicate(self, starrocks_adapter: StarRocksEngineAdapter) params = self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Case 7A DDL:\n{ddl}") @@ -1412,9 +1477,9 @@ def test_e2e_key_type_duplicate(self, starrocks_adapter: StarRocksEngineAdapter) dup_match = re.search(r"DUPLICATE KEY\s*\(([^)]+)\)", ddl) assert dup_match, "DUPLICATE KEY clause not found" - assert "id" in dup_match.group(1) and "dt" in dup_match.group( - 1 - ), f"Expected DUPLICATE KEY(id, dt), got DUPLICATE KEY({dup_match.group(1)})" + assert "id" in dup_match.group(1) and "dt" in dup_match.group(1), ( + f"Expected DUPLICATE KEY(id, dt), got DUPLICATE KEY({dup_match.group(1)})" + ) finally: starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) @@ -1448,7 +1513,7 @@ def test_e2e_key_type_unique(self, starrocks_adapter: StarRocksEngineAdapter): params = self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Case 7B DDL:\n{ddl}") @@ -1535,7 +1600,7 @@ def test_e2e_comprehensive(self, starrocks_adapter: StarRocksEngineAdapter): params = self._parse_model_and_get_all_params(model_sql) starrocks_adapter.create_table(table_name, **params) - 
show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Comprehensive DDL:\n{ddl}") @@ -1553,9 +1618,9 @@ def test_e2e_comprehensive(self, starrocks_adapter: StarRocksEngineAdapter): part_match = re.search(r"PARTITION BY[^(]*\(([^)]+)\)", ddl) assert part_match, "PARTITION BY clause not found" part_cols = part_match.group(1) - assert ( - "event_date" in part_cols - ), f"Expected event_date in PARTITION BY, got {part_cols}" + assert "event_date" in part_cols, ( + f"Expected event_date in PARTITION BY, got {part_cols}" + ) # Verify DISTRIBUTED BY assert "DISTRIBUTED BY HASH" in ddl @@ -1564,9 +1629,7 @@ def test_e2e_comprehensive(self, starrocks_adapter: StarRocksEngineAdapter): # Verify ORDER BY order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) assert order_match, "ORDER BY clause not found" - assert "order_id" in order_match.group( - 1 - ) and "event_date" in order_match.group(1) + assert "order_id" in order_match.group(1) and "event_date" in order_match.group(1) # Verify PROPERTIES assert "replication_num" in ddl @@ -1578,10 +1641,11 @@ def test_e2e_comprehensive(self, starrocks_adapter: StarRocksEngineAdapter): f"VALUES (1001, '2024-01-15', 100, 1234.56, 'completed')" ) - result = starrocks_adapter.fetchone( - f"SELECT order_id, customer_id FROM {table_name} WHERE order_id = 1001" + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT order_id, customer_id FROM {table_name} WHERE order_id = 1001", ) - assert result is not None, "INSERT/SELECT failed" + assert result, "INSERT/SELECT failed" assert result[0] == 1001, "order_id mismatch" finally: @@ -1592,9 +1656,7 @@ def test_e2e_comprehensive(self, starrocks_adapter: StarRocksEngineAdapter): # Tests single quotes vs double quotes in MODEL parsing # ======================================== - def test_e2e_quote_character_handling( - self, starrocks_adapter: 
StarRocksEngineAdapter - ): + def test_e2e_quote_character_handling(self, starrocks_adapter: StarRocksEngineAdapter): """ Test Case: Quote Character Handling (Single vs Double Quotes). @@ -1670,7 +1732,7 @@ def test_e2e_quote_character_handling( starrocks_adapter.create_table(table_name, **params) # Verify via SHOW CREATE TABLE - show_create = starrocks_adapter.fetchone(f"SHOW CREATE TABLE {table_name}") + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") ddl = show_create[1] logger.info(f"Quote Handling Test DDL:\n{ddl}") @@ -1712,10 +1774,11 @@ def test_e2e_quote_character_handling( f"VALUES (100, '2024-01-01', 'US', 1001)" ) - result = starrocks_adapter.fetchone( - f"SELECT id, region, customer_id FROM {table_name} WHERE id = 100" + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT id, region, customer_id FROM {table_name} WHERE id = 100", ) - assert result is not None, "INSERT/SELECT failed" + assert result, "INSERT/SELECT failed" assert result == (100, "US", 1001), f"Data mismatch: {result}" finally: starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) @@ -1742,7 +1805,7 @@ class TestStarRocksAbility: @pytest.fixture(scope="class") def test_tables( self, starrocks_adapter: StarRocksEngineAdapter - ) -> t.Dict[str, str]: + ) -> t.Generator[t.Dict[str, str], None, None]: """ Pre-create tables of different types for testing. 
@@ -1769,9 +1832,10 @@ def test_tables( """ ) # Verify table creation - result = starrocks_adapter.fetchone( + result = fetchone_or_fail( + starrocks_adapter, f"SELECT COUNT(*) FROM information_schema.TABLES " - f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'pk_table'" + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'pk_table'", ) assert result[0] == 1, f"PRIMARY KEY table {pk_table} creation failed" tables["primary_key"] = pk_table @@ -1790,9 +1854,10 @@ def test_tables( """ ) # Verify table creation - result = starrocks_adapter.fetchone( + result = fetchone_or_fail( + starrocks_adapter, f"SELECT COUNT(*) FROM information_schema.TABLES " - f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'dup_table'" + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'dup_table'", ) assert result[0] == 1, f"DUPLICATE KEY table {dup_table} creation failed" tables["duplicate_key"] = dup_table @@ -1811,9 +1876,10 @@ def test_tables( """ ) # Verify table creation - result = starrocks_adapter.fetchone( + result = fetchone_or_fail( + starrocks_adapter, f"SELECT COUNT(*) FROM information_schema.TABLES " - f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'unique_table'" + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'unique_table'", ) assert result[0] == 1, f"UNIQUE KEY table {unique_table} creation failed" tables["unique_key"] = unique_table @@ -1839,17 +1905,18 @@ def test_create_drop_keyword_support( try: # CREATE starrocks_adapter.execute(f"CREATE {sql_keyword} IF NOT EXISTS {test_name}") - result = starrocks_adapter.fetchone( - f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{test_name}'" + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{test_name}'", ) - assert result is not None, f"CREATE {sql_keyword} failed" + assert result, f"CREATE {sql_keyword} failed" # DROP starrocks_adapter.execute(f"DROP {sql_keyword} IF EXISTS {test_name}") - result = 
starrocks_adapter.fetchone( + result_optional: t.Optional[Row] = starrocks_adapter.fetchone( f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{test_name}'" ) - assert result is None, f"DROP {sql_keyword} failed" + assert result_optional is None, f"DROP {sql_keyword} failed" finally: starrocks_adapter.execute(f"DROP {sql_keyword} IF EXISTS {test_name}") @@ -1900,7 +1967,9 @@ def test_update_supported(self, starrocks_adapter: StarRocksEngineAdapter): starrocks_adapter.execute( f"UPDATE {table_name} SET name = 'Alice Updated' WHERE id = 1" ) - result = starrocks_adapter.fetchone(f"SELECT name FROM {table_name} WHERE id = 1") + result = fetchone_or_fail( + starrocks_adapter, f"SELECT name FROM {table_name} WHERE id = 1" + ) assert result == ("Alice Updated",), f"UPDATE failed: {result}" finally: starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) @@ -1965,9 +2034,7 @@ def test_delete_supported_syntax( starrocks_adapter.execute(f"INSERT INTO {table_name} VALUES {test_data}") # Format delete clause (for subquery/using with table reference) - delete_sql = ( - f"DELETE FROM {table_name} {delete_clause.format(table=table_name)}" - ) + delete_sql = f"DELETE FROM {table_name} {delete_clause.format(table=table_name)}" # Debug: Log the SQL before execution logger.info(f"Executing DELETE SQL: {delete_sql}") @@ -1976,13 +2043,11 @@ def test_delete_supported_syntax( starrocks_adapter.execute(delete_sql) # Verify result - count = starrocks_adapter.fetchone(f"SELECT COUNT(*) FROM {table_name}")[0] - logger.info( - f"After DELETE: {count} rows remaining (expected {expected_remaining})" + count = fetchone_or_fail(starrocks_adapter, f"SELECT COUNT(*) FROM {table_name}")[0] + logger.info(f"After DELETE: {count} rows remaining (expected {expected_remaining})") + assert count == expected_remaining, ( + f"Expected {expected_remaining} rows, got {count} for {table_type} with {delete_clause}" ) - assert ( - count == expected_remaining - ), f"Expected 
{expected_remaining} rows, got {count} for {table_type} with {delete_clause}" # ==================== DELETE Operations - Failure Cases ==================== @@ -2033,9 +2098,7 @@ def test_delete_unsupported_syntax( Expected: DELETE fails with specific error message. """ table_name = test_tables[table_type] - delete_sql = ( - f"DELETE FROM {table_name} {delete_clause.format(table=table_name)}" - ) + delete_sql = f"DELETE FROM {table_name} {delete_clause.format(table=table_name)}" # This should raise an exception with pytest.raises(Exception) as exc_info: @@ -2045,9 +2108,9 @@ def test_delete_unsupported_syntax( import re error_msg = str(exc_info.value).lower() - assert re.search( - error_pattern, error_msg - ), f"Expected error pattern '{error_pattern}', got: {exc_info.value}" + assert re.search(error_pattern, error_msg), ( + f"Expected error pattern '{error_pattern}', got: {exc_info.value}" + ) # ==================== COMMENT Syntax Tests ==================== @@ -2105,9 +2168,7 @@ def test_comment_syntax_variants( # Generate SQL based on template if "table" in comment_type: - sql = sql_template.format( - table=table_name, comment=f"test {comment_type}" - ) + sql = sql_template.format(table=table_name, comment=f"test {comment_type}") else: # column sql = sql_template.format( table=table_name, column="col1", comment=f"test {comment_type}" @@ -2119,24 +2180,26 @@ def test_comment_syntax_variants( # Verify comment was set if "table" in comment_type: - result = starrocks_adapter.fetchone( + result = fetchone_or_fail( + starrocks_adapter, f"SELECT TABLE_COMMENT FROM information_schema.TABLES " - f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_comment'" + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_comment'", )[0] - assert ( - f"test {comment_type}" in result - ), f"Comment not set correctly for {comment_type}" + assert f"test {comment_type}" in result, ( + f"Comment not set correctly for {comment_type}" + ) else: # column - result_row = 
starrocks_adapter.fetchone( + result_row = fetchone_or_fail( + starrocks_adapter, f"SELECT COLUMN_NAME, COLUMN_COMMENT FROM information_schema.COLUMNS " f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_comment' " - f"AND COLUMN_NAME = 'col1'" + f"AND COLUMN_NAME = 'col1'", ) logger.info(f"Column comment: {result_row}") result = result_row[1] - assert ( - f"test {comment_type}" in result - ), f"Comment not set correctly for {comment_type}" + assert f"test {comment_type}" in result, ( + f"Comment not set correctly for {comment_type}" + ) logger.info(f"✅ {comment_type}: SUPPORTED") @@ -2225,13 +2288,12 @@ def test_comment_in_create_table(self, starrocks_adapter: StarRocksEngineAdapter ) # Verify table comment - table_comment = starrocks_adapter.fetchone( + table_comment = fetchone_or_fail( + starrocks_adapter, f"SELECT TABLE_COMMENT FROM information_schema.TABLES " - f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_create_comment'" + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_create_comment'", )[0] - assert ( - table_comment == "test table" - ), f"Table comment mismatch: {table_comment}" + assert table_comment == "test table", f"Table comment mismatch: {table_comment}" # Verify column comments column_comments = {} @@ -2243,12 +2305,12 @@ def test_comment_in_create_table(self, starrocks_adapter: StarRocksEngineAdapter if col_comment: # Skip empty comments column_comments[col_name] = col_comment - assert ( - column_comments.get("id") == "id column" - ), f"Column comment mismatch: {column_comments}" - assert ( - column_comments.get("name") == "name column" - ), f"Column comment mismatch: {column_comments}" + assert column_comments.get("id") == "id column", ( + f"Column comment mismatch: {column_comments}" + ) + assert column_comments.get("name") == "name column", ( + f"Column comment mismatch: {column_comments}" + ) finally: starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) @@ -2266,9 +2328,7 @@ class TestCommentMethods: - 
Future ALTER TABLE support """ - def test_build_create_comment_table_exp( - self, starrocks_adapter: StarRocksEngineAdapter - ): + def test_build_create_comment_table_exp(self, starrocks_adapter: StarRocksEngineAdapter): """ Test _build_create_comment_table_exp generates correct ALTER TABLE COMMENT SQL. @@ -2304,32 +2364,29 @@ def test_build_create_comment_table_exp( # Verify: SQL format is correct assert "ALTER TABLE" in comment_sql, f"Invalid SQL format: {comment_sql}" - assert ( - "COMMENT =" in comment_sql - ), f"Missing COMMENT = in SQL: {comment_sql}" + assert "COMMENT =" in comment_sql, f"Missing COMMENT = in SQL: {comment_sql}" assert new_comment in comment_sql, f"Comment not in SQL: {comment_sql}" # Execute the generated SQL starrocks_adapter.execute(comment_sql) # Verify: Comment was actually updated - result = starrocks_adapter.fetchone( + result = fetchone_or_fail( + starrocks_adapter, f"SELECT TABLE_COMMENT FROM information_schema.TABLES " - f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_table'" + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_table'", + ) + assert result, "Table not found after comment update" + assert result[0] == new_comment, ( + f"Comment not updated. Expected: {new_comment}, Got: {result[0]}" ) - assert result is not None, "Table not found after comment update" - assert ( - result[0] == new_comment - ), f"Comment not updated. Expected: {new_comment}, Got: {result[0]}" logger.info("✅ _build_create_comment_table_exp generates valid SQL") finally: starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) - def test_build_create_comment_column_exp( - self, starrocks_adapter: StarRocksEngineAdapter - ): + def test_build_create_comment_column_exp(self, starrocks_adapter: StarRocksEngineAdapter): """ Test _build_create_comment_column_exp generates correct ALTER TABLE MODIFY COLUMN SQL. 
@@ -2369,9 +2426,7 @@ def test_build_create_comment_column_exp( # Verify: SQL format is correct assert "ALTER TABLE" in comment_sql, f"Invalid SQL format: {comment_sql}" - assert ( - "MODIFY COLUMN" in comment_sql - ), f"Missing MODIFY COLUMN in SQL: {comment_sql}" + assert "MODIFY COLUMN" in comment_sql, f"Missing MODIFY COLUMN in SQL: {comment_sql}" assert "COMMENT" in comment_sql, f"Missing COMMENT in SQL: {comment_sql}" assert new_comment in comment_sql, f"Comment not in SQL: {comment_sql}" @@ -2379,22 +2434,21 @@ def test_build_create_comment_column_exp( starrocks_adapter.execute(comment_sql) # Verify: Column comment was actually updated - result = starrocks_adapter.fetchone( + result = fetchone_or_fail( + starrocks_adapter, f"SELECT COLUMN_TYPE, COLUMN_COMMENT FROM information_schema.COLUMNS " - f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_table' AND COLUMN_NAME = 'name'" + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_table' AND COLUMN_NAME = 'name'", ) assert result is not None, "Column not found after comment update" column_type, column_comment = result - assert ( - column_comment == new_comment - ), f"Comment not updated. Expected: {new_comment}, Got: {column_comment}" - assert ( - "varchar(100)" in column_type.lower() - ), f"Column type changed unexpectedly: {column_type}" - - logger.info( - "✅ _build_create_comment_column_exp generates valid SQL with correct type" + assert column_comment == new_comment, ( + f"Comment not updated. 
Expected: {new_comment}, Got: {column_comment}" ) + assert "varchar(100)" in column_type.lower(), ( + f"Column type changed unexpectedly: {column_type}" + ) + + logger.info("✅ _build_create_comment_column_exp generates valid SQL with correct type") finally: starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py index 5a81795ef4..46f9b5be08 100644 --- a/tests/core/engine_adapter/test_starrocks.py +++ b/tests/core/engine_adapter/test_starrocks.py @@ -21,7 +21,7 @@ import pytest from sqlglot import expressions as exp -from sqlglot import parse, parse_one +from sqlglot import parse_one from pytest_mock.plugin import MockerFixture from sqlmesh.core.engine_adapter.shared import DataObjectType from sqlmesh.utils.errors import SQLMeshError @@ -40,6 +40,11 @@ def _load_sql_model(model_sql: str) -> SqlModel: return t.cast(SqlModel, load_sql_based_model(expressions)) +def _columns(model: SqlModel) -> t.Dict[str, exp.DataType]: + assert model.columns_to_types is not None + return model.columns_to_types + + # ============================================================================= # Schema Operations # ============================================================================= @@ -71,9 +76,7 @@ def test_create_schema_without_if_exists( "CREATE SCHEMA `test_schema`", ] - def test_drop_schema( - self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] - ): + def test_drop_schema(self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]): """Test DROP DATABASE statement generation.""" adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.drop_schema("test_schema") @@ -137,7 +140,7 @@ def fetchdf_side_effect(query: exp.Expression, *_: t.Any, **__: t.Any): return df[mask].reset_index(drop=True) return df.reset_index(drop=True) - adapter.fetchdf = mocker.Mock(side_effect=fetchdf_side_effect) + adapter.fetchdf = 
mocker.Mock(side_effect=fetchdf_side_effect) # type: ignore[assignment] mv1 = adapter.get_data_object("test_db.mv1") assert mv1 is not None @@ -244,21 +247,15 @@ def test_rename_table( # Test 1: Simple table names (no database qualifier) adapter.rename_table("old_table", "new_table") - adapter.cursor.execute.assert_called_with( - "ALTER TABLE `old_table` RENAME `new_table`" - ) + adapter.cursor.execute.assert_called_with("ALTER TABLE `old_table` RENAME `new_table`") # Test 2: Database-qualified names - RENAME only uses table name adapter.cursor.execute.reset_mock() adapter.rename_table("db.old_table", "db.new_table") # StarRocks RENAME clause requires unqualified table name - adapter.cursor.execute.assert_called_with( - "ALTER TABLE `db`.`old_table` RENAME `new_table`" - ) + adapter.cursor.execute.assert_called_with("ALTER TABLE `db`.`old_table` RENAME `new_table`") - def test_delete_from( - self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] - ): + def test_delete_from(self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]): """Test DELETE statement generation.""" adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.delete_from(exp.to_table("test_table"), "id = 1") @@ -277,9 +274,7 @@ def test_create_index( # StarRocks skips index creation - verify no execute call was made adapter.cursor.execute.assert_not_called() - def test_create_view( - self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] - ): + def test_create_view(self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]): """Test CREATE VIEW statement generation.""" adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_view("test_view", parse_one("SELECT a FROM tbl")) @@ -583,7 +578,9 @@ def test_delete_with_between_in_complex_expression( adapter.delete_from( exp.to_table("test_table"), - parse_one("(dt BETWEEN '2024-01-01' AND '2024-06-30') OR (dt BETWEEN '2024-07-01' AND '2024-12-31')"), + 
parse_one( + "(dt BETWEEN '2024-01-01' AND '2024-06-30') OR (dt BETWEEN '2024-07-01' AND '2024-12-31')" + ), ) sql = to_sql_calls(adapter)[0] @@ -650,7 +647,7 @@ def test_key_types_with_tuple_form( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), table_properties=model.physical_properties, ) @@ -694,7 +691,7 @@ def test_primary_key_string_forms( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), table_properties=model.physical_properties, ) @@ -724,7 +721,7 @@ def test_primary_key_single_identifier( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), table_properties=model.physical_properties, ) @@ -754,7 +751,7 @@ def test_primary_key_via_table_properties_tuple( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), table_properties=model.physical_properties, ) @@ -799,9 +796,7 @@ def test_column_reordering_for_key( customer_id_pos = col_defs.find("`customer_id`") assert order_id_pos < event_date_pos, "order_id must appear before event_date" - assert ( - event_date_pos < customer_id_pos - ), "event_date must appear before customer_id" + assert event_date_pos < customer_id_pos, "event_date must appear before customer_id" # ============================================================================= @@ -814,9 +809,9 @@ class TestPartitionPropertyBuilding: "partition_expr,expected_clause", [ # Expression partitioning - single column - ("'dt'", "PARTITION BY dt"), + ("'dt'", "PARTITION BY (dt)"), # Expression partitioning - multi-column - ("(year, month)", "PARTITION BY year, month"), + ("(year, month)", "PARTITION BY (year, month)"), # RANGE partitioning ("RANGE (dt)", "PARTITION BY RANGE (`dt`) ()"), # LIST 
partitioning @@ -849,7 +844,7 @@ def test_partitioned_by_forms( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), partitioned_by=model.partitioned_by, table_properties=model.physical_properties, ) @@ -880,13 +875,13 @@ def test_partition_by_alias( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), partitioned_by=model.partitioned_by, table_properties=model.physical_properties, ) sql = to_sql_calls(adapter)[0] - assert "PARTITION BY year, month" in sql + assert "PARTITION BY (year, month)" in sql def test_partitioned_by_as_model_parameter( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] @@ -909,12 +904,12 @@ def test_partitioned_by_as_model_parameter( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), partitioned_by=model.partitioned_by, ) sql = to_sql_calls(adapter)[0] - assert "PARTITION BY year, month" in sql + assert "PARTITION BY (year, month)" in sql def test_partitions_value_forms( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] @@ -941,7 +936,7 @@ def test_partitions_value_forms( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), partitioned_by=model.partitioned_by, table_properties=model.physical_properties, ) @@ -974,7 +969,7 @@ def test_partitions_value_forms( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), partitioned_by=model.partitioned_by, table_properties=model.physical_properties, ) @@ -1032,7 +1027,7 @@ def test_distributed_by_string_forms( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - 
model.columns_to_types, + _columns(model), table_properties=model.physical_properties, ) @@ -1083,7 +1078,7 @@ def test_distributed_by_structured_forms( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), table_properties=model.physical_properties, ) @@ -1149,7 +1144,7 @@ def test_order_by_value_forms( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), table_properties=model.physical_properties, ) @@ -1184,7 +1179,7 @@ def test_clustered_by_generates_order_by( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), table_properties=model.physical_properties, ) @@ -1213,7 +1208,7 @@ def test_clustered_by_as_model_parameter( adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) adapter.create_table( model.name, - model.columns_to_types, + _columns(model), clustered_by=model.clustered_by, ) @@ -1312,7 +1307,7 @@ def test_security_value_forms( model.name, query, replace=False, - target_columns_to_types=model.columns_to_types, + target_columns_to_types=_columns(model), view_properties=model.virtual_properties, ) @@ -1344,7 +1339,7 @@ def test_security_invalid_value( model.name, query, replace=False, - target_columns_to_types=model.columns_to_types, + target_columns_to_types=_columns(model), view_properties=model.virtual_properties, ) @@ -1381,7 +1376,7 @@ def _create_simple_mv( query, replace=False, materialized=True, - target_columns_to_types=model.columns_to_types, + target_columns_to_types=_columns(model), view_properties=model.virtual_properties, ) # replace=False → only CREATE statement is emitted @@ -1466,9 +1461,7 @@ def test_refresh_scheme_invalid_prefix( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] ): adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) - model 
= self._build_mv_model( - "refresh_scheme = 'SCHEDULE EVERY (INTERVAL 5 MINUTE)'" - ) + model = self._build_mv_model("refresh_scheme = 'SCHEDULE EVERY (INTERVAL 5 MINUTE)'") with pytest.raises(SQLMeshError, match="refresh_scheme"): self._create_simple_mv(adapter, model) @@ -1567,7 +1560,7 @@ def test_build_create_comment_table_exp( assert sql == expected_sql else: # Special chars case - check for escaped quote - assert "It\'s a test" in sql or "It''s a test" in sql + assert "It's a test" in sql or "It''s a test" in sql # Common assertions for all cases assert "ALTER TABLE" in sql @@ -1686,7 +1679,7 @@ def test_key_type_mutually_exclusive( SELECT id, dt, value FROM source_table; """ model = _load_sql_model(model_sql) - columns = model.columns_to_types + columns = _columns(model) with pytest.raises(SQLMeshError, match="Multiple table key type"): adapter.create_table( @@ -1721,7 +1714,7 @@ def test_partition_alias_conflict_with_parameter( with pytest.raises(SQLMeshError, match="partition definition"): adapter.create_table( model.name, - target_columns_to_types=model.columns_to_types, + target_columns_to_types=_columns(model), partitioned_by=model.partitioned_by, table_properties=model.physical_properties, ) @@ -1751,7 +1744,7 @@ def test_invalid_property_name_detection( with pytest.raises(SQLMeshError, match="Invalid property 'partition'"): adapter.create_table( model.name, - target_columns_to_types=model.columns_to_types, + target_columns_to_types=_columns(model), table_properties=model.physical_properties, ) @@ -1797,9 +1790,7 @@ def test_create_table_comprehensive( ), exp.EQ( this=exp.Column(this="expressions"), - expression=exp.Tuple( - expressions=[exp.to_column("customer_id")] - ), + expression=exp.Tuple(expressions=[exp.to_column("customer_id")]), ), exp.EQ( this=exp.Column(this="buckets"), From 2e22e87ca37e14f529d6280e857712b5a1940fe0 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Tue, 13 Jan 2026 18:27:50 +0800 Subject: [PATCH 03/20] optimize doc-test 
Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- sqlmesh/core/engine_adapter/starrocks.py | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py index 212c40143b..cb7c80aacd 100644 --- a/sqlmesh/core/engine_adapter/starrocks.py +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -1038,8 +1038,8 @@ def from_enum(enum_value: str, buckets: t.Optional[int] = None) -> t.Dict[str, t Dict with kind/columns/buckets fields Example: - >>> DistributionTupleInputType.from_enum("RANDOM") - {"kind": "RANDOM", "columns": [], "buckets": None} + >>> DistributionTupleOutputType.from_enum("RANDOM") + {'kind': 'RANDOM', 'columns': [], 'buckets': None} """ return {"kind": enum_value, "columns": [], "buckets": buckets} @@ -1058,8 +1058,8 @@ def from_func( Dict with kind/columns/buckets fields Example: - >>> func = parse_one("HASH(id, dt)") - >>> DistributionTupleInputType.from_func(func) + >> func = sqlglot.parse_one("HASH(id, dt)") + >> DistributionTupleOutputType.from_func(func) {"kind": "HASH", "columns": [exp.Column("id"), exp.Column("dt")], "buckets": None} """ func_name = func.name.upper() if hasattr(func, "name") else str(func.this).upper() @@ -1095,17 +1095,17 @@ def to_unified_dict( TypeError: If value type is not supported Example: - >>> # From DistributionTupleInputType - >>> DistributionTupleInputType.to_unified_dict({"kind": "HASH", "columns": [...]}) - {"kind": "HASH", "columns": [...], "buckets": None} + >>> # From DistributionTupleOutputType + >>> DistributionTupleOutputType.to_unified_dict({"kind": "HASH", "columns": [...]}) + {'kind': 'HASH', 'columns': [Ellipsis]} >>> # From EnumType - >>> DistributionTupleInputType.to_unified_dict("RANDOM") - {"kind": "RANDOM", "columns": [], "buckets": None} + >>> DistributionTupleOutputType.to_unified_dict("RANDOM") + {'kind': 'RANDOM', 'columns': [], 'buckets': None} - >>> # From FuncType - >>> 
DistributionTupleInputType.to_unified_dict(parse_one("HASH(id)")) - {"kind": "HASH", "columns": [exp.Column("id")], "buckets": None} + >> # From FuncType + >> DistributionTupleOutputType.to_unified_dict(sqlglot.parse_one("HASH(id)")) + {'kind': 'HASH', 'columns': [exp.Column('id')], 'buckets': None} """ if isinstance(normalized_value, dict): # Already in DistributionTupleInputType format @@ -1422,7 +1422,7 @@ def ensure_parenthesized(value: t.Any) -> t.Any: >>> PropertyValidator.ensure_parenthesized(exp.Literal.string('id1, id2')) '(id1, id2)' >>> PropertyValidator.ensure_parenthesized(exp.Column(quoted=True, name='id1, id2')) - '(id1, id2)' + Column(quoted=True, name=id1, id2) """ # logger.debug("ensure_parenthesized. value: %s, type: %s", value, type(value)) @@ -1526,7 +1526,7 @@ def check_invalid_names( SQLMeshError: If any invalid name is found Example: - >>> PropertyValidator.check_invalid_names( + >> PropertyValidator.check_invalid_names( ... valid_name="partitioned_by", ... invalid_names=["partition_by", "partition"], ... table_properties={"partition_by": "dt"} @@ -1579,7 +1579,7 @@ def check_at_most_one( SQLMeshError: If multiple properties from the group are defined Example: - >>> PropertyValidator.check_at_most_one( + >> PropertyValidator.check_at_most_one( ... property_name="primary_key", ... property_description="key type", ... 
exclusive_property_names=["primary_key", "duplicate_key", "unique_key", "aggregate_key"], From 5aa926a01004b342e7c73b6a8d4519485eeda666 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Tue, 13 Jan 2026 19:46:46 +0800 Subject: [PATCH 04/20] remove some debug log Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- sqlmesh/core/engine_adapter/starrocks.py | 176 ++++------------------- 1 file changed, 30 insertions(+), 146 deletions(-) diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py index cb7c80aacd..18a0d8a4e5 100644 --- a/sqlmesh/core/engine_adapter/starrocks.py +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -1476,7 +1476,7 @@ def validate_and_normalize_property( >>> validated = PropertyValidator.validate_and_normalize_property("distributed_by", "RANDOM") >>> # Result: "RANDOM" (string from EnumType) """ - logger.debug("validate_and_normalize_property. value: %s, type: %s", value, type(value)) + # logger.debug("validate_and_normalize_property. 
value: %s, type: %s", value, type(value)) # Step 1: Optionally preprocess string with parentheses if preprocess_parentheses: @@ -2178,25 +2178,16 @@ def _create_table_from_columns( # Use setdefault to simplify table_properties access table_properties = kwargs.setdefault("table_properties", {}) - # Log entry point - logger.debug( - "_create_table_from_columns: table=%s, primary_key=%s (from model param), " - "table_properties.keys=%s", - table_name, - primary_key, - list(table_properties.keys()), - ) - # Extract and validate key columns from table_properties # Priority: parameter primary_key > table_properties (already handled above) key_type, key_columns = self._extract_and_validate_key_columns( table_properties, primary_key ) - logger.debug( - "_create_table_from_columns: extracted key_type=%s, key_columns=%s", - key_type, - key_columns, - ) + # logger.debug( + # "_create_table_from_columns: extracted key_type=%s, key_columns=%s", + # key_type, + # key_columns, + # ) # IMPORTANT: Normalize parameter primary_key into table_properties for unified handling # This ensures _build_table_properties_exp() can access primary_key even when @@ -2205,24 +2196,24 @@ def _create_table_from_columns( table_properties["primary_key"] = primary_key logger.debug("_create_table_from_columns: unified primary_key into table_properties") elif key_type: - logger.debug( - "table key type '%s' may be handled in _build_table_key_property", key_type - ) + # logger.debug( + # "table key type '%s' may be handled in _build_table_key_property", key_type + # ) + pass # StarRocks key column ordering constraint: All key types need reordering if key_columns: target_columns_to_types = self._reorder_columns_for_key( target_columns_to_types, key_columns, key_type or "key" ) - logger.debug("_create_table_from_columns: reordered columns for %s", key_type) # IMPORTANT: Do NOT pass primary_key to base class! 
# Unlike other databases, StarRocks requires PRIMARY KEY to be in POST_SCHEMA location # (in properties section after columns), not inside schema (inside column definitions). # We handle ALL key types (including PRIMARY KEY) in _build_table_key_property. - logger.debug( - "_create_table_from_columns: NOT passing primary_key to base class (handled in _build_table_key_property)" - ) + # logger.debug( + # "_create_table_from_columns: NOT passing primary_key to base class (handled in _build_table_key_property)" + # ) super()._create_table_from_columns( table_name=table_name, target_columns_to_types=target_columns_to_types, @@ -2275,11 +2266,11 @@ def create_view( self.get_data_object(view_name), DataObjectType.MATERIALIZED_VIEW ) self.drop_view(view_name, ignore_if_not_exists=True, materialized=True) - logger.debug( - f"Creating materialized view: {view_name}, materialized: {materialized}, " - f"materialized_properties: {materialized_properties}, " - f"view_properties: {view_properties}, create_kwargs: {create_kwargs}, " - ) + # logger.debug( + # f"Creating materialized view: {view_name}, materialized: {materialized}, " + # f"materialized_properties: {materialized_properties}, " + # f"view_properties: {view_properties}, create_kwargs: {create_kwargs}, " + # ) return self._create_materialized_view( view_name=view_name, @@ -2359,12 +2350,12 @@ def _create_materialized_view( partitioned_by = materialized_properties.get("partitioned_by") clustered_by = materialized_properties.get("clustered_by") partition_interval_unit = materialized_properties.get("partition_interval_unit") - logger.debug( - f"Get info from materialized_properties: {materialized_properties}, " - f"partitioned_by: {partitioned_by}, " - f"clustered_by: {clustered_by}, " - f"partition_interval_unit: {partition_interval_unit}" - ) + # logger.debug( + # f"Get info from materialized_properties: {materialized_properties}, " + # f"partitioned_by: {partitioned_by}, " + # f"clustered_by: {clustered_by}, " + # 
f"partition_interval_unit: {partition_interval_unit}" + # ) properties_exp = self._build_table_properties_exp( catalog_name=target_table.catalog, @@ -2488,10 +2479,10 @@ def _build_table_properties_exp( """ properties: t.List[exp.Expression] = [] table_properties_copy = dict(table_properties) if table_properties else {} - logger.debug( - "_build_table_properties_exp: table_properties=%s", - table_properties.keys() if table_properties else [], - ) + # logger.debug( + # "_build_table_properties_exp: table_properties=%s", + # table_properties.keys() if table_properties else [], + # ) is_mv = table_kind == "MATERIALIZED_VIEW" if is_mv: @@ -2508,7 +2499,6 @@ def _build_table_properties_exp( property_description="key type", table_properties=table_properties_copy, ) - logger.debug("_build_table_properties_exp: active_key_type='%s'", active_key_type) if is_mv and active_key_type: raise SQLMeshError( f"You can't specify the table type when the table is a materialized view. " @@ -2525,22 +2515,11 @@ def _build_table_properties_exp( key_type, key_expr, preprocess_parentheses=True ) key_columns = tuple(col.name for col in normalized) - logger.debug( - "_build_table_properties_exp: key_type=%s, key_columns=%s", - key_type, - key_columns, - ) # 1. Handle key constraints (ALL types including PRIMARY KEY) key_prop = self._build_table_key_property(table_properties_copy, active_key_type) if key_prop: properties.append(key_prop) - logger.debug( - "_build_table_properties_exp: generated key_prop=%s", - type(key_prop).__name__, - ) - else: - logger.debug("_build_table_properties_exp: key_prop skipped (not defined)") # 2. 
Add table comment (it must be ahead of other properties except the talbe key/type) if table_description: @@ -2562,46 +2541,22 @@ def _build_table_properties_exp( ) if partition_prop: properties.append(partition_prop) - logger.debug( - "_build_table_properties_exp: generated partition_prop=%s", - type(partition_prop).__name__, - ) - else: - logger.debug("_build_table_properties_exp: partition_prop skipped (not defined)") # 4. Handle distributed_by (DISTRIBUTED BY HASH/RANDOM) distributed_prop = self._build_distributed_by_property(table_properties_copy, key_columns) if distributed_prop: properties.append(distributed_prop) - logger.debug( - "_build_table_properties_exp: generated distributed_prop=%s", - type(distributed_prop).__name__, - ) - else: - logger.debug("_build_table_properties_exp: distributed_prop skipped (not defined)") # 5. Handle refresh_property (REFRESH ...) if is_mv: refresh_prop = self._build_refresh_property(table_properties_copy) if refresh_prop: properties.append(refresh_prop) - logger.debug( - "_build_table_properties_exp: generated refresh_prop=%s", - type(refresh_prop).__name__, - ) - else: - logger.debug("_build_table_properties_exp: refresh_prop skipped (not defined)") # 6. Handle order_by/clustered_by (ORDER BY ...) order_prop = self._build_order_by_property(table_properties_copy, clustered_by or None) if order_prop: properties.append(order_prop) - logger.debug( - "_build_table_properties_exp: generated order_prop=%s", - type(order_prop).__name__, - ) - else: - logger.debug("_build_table_properties_exp: order_prop skipped (not defined)") # 5. Handle other properties (replication_num, storage_medium, etc.) 
other_props = self._build_other_properties(table_properties_copy) @@ -2670,11 +2625,8 @@ def _build_table_key_property( Key property expression for the active key type, or None """ if not active_key_type: - logger.debug("_build_table_key_property: no active_key_type, skipped") return None - logger.debug("_build_table_key_property: processing %s", active_key_type) - # Configuration: key_name -> Property class (excluding primary_key) KEY_PROPERTY_CLASSES: t.Dict[str, t.Type[exp.Expression]] = { "primary_key": exp.PrimaryKey, @@ -2714,11 +2666,6 @@ def _build_table_key_property( ) # normalized is List[exp.Column] as defined in TableKeyInputSpec result = property_class(expressions=list(normalized)) - logger.debug( - "_build_table_key_property: generated %s with columns=%s", - type(result).__name__, - [col.name for col in normalized], - ) return result def _build_partition_property( @@ -2761,22 +2708,10 @@ def _build_partition_property( ) # If parameter was provided, it takes priority - if partitioned_by: - logger.debug( - "_build_partition_property: using partitioned_by from model param=%s", - partitioned_by, - ) - elif not partitioned_by and partition_param_name: + if not partitioned_by and partition_param_name: # Get from table_properties partitioned_by = table_properties.pop(partition_param_name, None) - logger.debug( - "_build_partition_property: using partitioned_by from table_properties[%s]=%s", - partition_param_name, - partitioned_by, - ) - if not partitioned_by: - logger.debug("_build_partition_property: no 'partitioned_by' defined, skipped") return None # Parse partition expressions to extract columns and kind (RANGE/LIST) @@ -2809,7 +2744,6 @@ def extract_column_name(expr: exp.Expression) -> t.Optional[str]: # Get partition definitions (RANGE/LIST partitions) # Note: Expression-based partitioning (partition_kind=None) does not support pre-created partitions if partitions := table_properties.pop("partitions", None): - logger.debug("Pre-created partitions: 
%s", partitions) if partition_kind is None: logger.warning( "[StarRocks] 'partitions' parameter is ignored for expression-based partitioning. " @@ -2831,7 +2765,6 @@ def extract_column_name(expr: exp.Expression) -> t.Optional[str]: partitions=partitions, partition_kind=partition_kind, ) - logger.debug("_build_partition_property: generated %s", result) return result def _parse_partition_expressions( @@ -2930,12 +2863,6 @@ def _build_partitioned_by_exp( """ partition_kind = kwargs.get("partition_kind") partitions: t.Optional[t.List[str]] = kwargs.get("partitions") - logger.debug( - "_build_partitioned_by_exp: partition_kind=%s, partitioned_by=%s, partitions=%s", - partition_kind, - partitioned_by, - partitions, - ) # Process partitions to create_expressions # partitions is already List[str] after SPEC normalization @@ -2989,20 +2916,10 @@ def _build_distributed_by_property( # No default - if not set, return None if distributed_by is None: - logger.debug("_build_distributed_by_property: no 'distributed_by' defined, skipped") return None - logger.debug( - "_build_distributed_by_property: using distributed_by from table_properties=%s", - distributed_by, - ) - # Try to parse complex string with BUCKETS first unified = self._parse_distribution_with_buckets(distributed_by) - logger.debug( - "_build_distributed_by_property: parsed distribution with buckets: %s", - unified, - ) if unified is None: # Fall back to SPEC-based parsing normalized = PropertyValidator.validate_and_normalize_property( @@ -3044,10 +2961,6 @@ def _build_distributed_by_property( buckets=buckets_expr, order=None, ) - logger.debug( - "_build_distributed_by_property: generated DistributedByProperty: %s", - result, - ) return result def _build_refresh_property( @@ -3171,11 +3084,6 @@ def _parse_distribution_with_buckets( (The output function will still handle "HASH(id)" without BUCKETS) """ # Only handle string or Literal string values - logger.debug( - "_parse_distribution_with_buckets: distributed_by: 
%s, type: %s", - distributed_by, - type(distributed_by), - ) if isinstance(distributed_by, str): text = distributed_by elif isinstance(distributed_by, exp.Literal) and distributed_by.is_string: @@ -3197,11 +3105,6 @@ def _parse_distribution_with_buckets( # Parse the HASH/RANDOM part via SPEC normalized = PropertyValidator.validate_and_normalize_property("distributed_by", hash_part) - logger.debug( - "_parse_distribution_with_buckets: parsed hash part: %s, type: %s", - normalized, - type(normalized), - ) return DistributionTupleOutputType.to_unified_dict(normalized, int(buckets_str)) @@ -3236,12 +3139,7 @@ def _build_order_by_property( ) # If parameter was provided, it takes priority - if clustered_by: - logger.debug( - "_build_order_by_property: using clustered_by from model param=%s", - clustered_by, - ) - elif clustered_by is None and order_by_param_name: + if clustered_by is None and order_by_param_name: # Get order_by from table_properties (already validated by check_at_most_one) order_by = table_properties.pop(order_by_param_name, None) if order_by is not None: @@ -3249,18 +3147,11 @@ def _build_order_by_property( "clustered_by", order_by, preprocess_parentheses=True ) clustered_by = list(normalized) - logger.debug( - "_build_order_by_property: using clustered_by from table_properties[%s]=%s", - order_by_param_name, - clustered_by, - ) if clustered_by: result = exp.Cluster(expressions=clustered_by) - logger.debug("_build_order_by_property: generated Cluster") return result else: # noqa: RET505 - logger.debug("_build_order_by_property: no 'clustered_by' defined, skipped") return None def _build_other_properties(self, table_properties: t.Dict[str, t.Any]) -> t.List[exp.Property]: @@ -3337,7 +3228,6 @@ def _extract_and_validate_key_columns( table_properties=table_properties, parameter_value=primary_key, ) - logger.debug("get table key: %s", {active_key_type}) # If parameter primary_key was provided, return it if primary_key: @@ -3355,12 +3245,6 @@ def 
_extract_and_validate_key_columns( ) key_columns = tuple(col.name for col in normalized) - logger.debug( - "Extracted '%s' from table_properties, value=%s", - active_key_type, - key_columns, - ) - return (active_key_type, key_columns) def _reorder_columns_for_key( From 3c85ada94b25758d90e9bf52ca59e15dc537e8d6 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Thu, 15 Jan 2026 18:32:17 +0800 Subject: [PATCH 05/20] optimize doc Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- docs/integrations/engines/starrocks.md | 57 +++++++++++++++++++++----- sqlmesh/core/config/connection.py | 2 +- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/docs/integrations/engines/starrocks.md b/docs/integrations/engines/starrocks.md index 2749fb9d9a..cc7ab8a2e9 100644 --- a/docs/integrations/engines/starrocks.md +++ b/docs/integrations/engines/starrocks.md @@ -6,18 +6,55 @@ SQLMesh supports StarRocks through its MySQL-compatible protocol, providing StarRocks-specific optimizations for table models, indexing, partitioning, and more. The adapter leverages StarRocks's strengths for analytical workloads with sensible defaults and advanced configuration support. +## Prerequisites + +* Install SQLMesh with the StarRocks extra: + +```bash +pip install "sqlmesh[starrocks]" +``` + +* Initialize a SQLMesh project (if you haven't already): + +```bash +sqlmesh init +``` + +* Configure a separate state backend: + * StarRocks is currently **not supported** as a SQLMesh `state_connection`. + * Use DuckDB (recommended) or another engine for SQLMesh state. 
+ ## Connection Configuration Example -```yaml -starrocks: - connection: - type: starrocks - host: starrocks-fe # Frontend (FE) node address - port: 9030 # Query port (default: 9030) - user: starrocks_user - password: your_password - database: your_database - # Optional MySQL-compatible settings +```yaml linenums="1" hl_lines="2 4-8 13-15" +gateways: + starrocks: + connection: + type: starrocks + host: starrocks-fe # Frontend (FE) node address + port: 9030 # Query port (default: 9030) + user: starrocks_user + password: your_password + database: your_database + # Optional MySQL-compatible settings + # charset: utf8mb4 + # connect_timeout: 60 + state_connection: + type: duckdb + database: ./state/sqlmesh_state.db + +default_gateway: starrocks + +model_defaults: + dialect: starrocks +``` + +### StarRocks setup note (optional) + +If you're running a shared-nothing cluster with a single backend, you may need to adjust the default replication number: + +```sql +ADMIN SET frontend config ("default_replication_num" = "1"); ``` ## Quickstart diff --git a/sqlmesh/core/config/connection.py b/sqlmesh/core/config/connection.py index 7d79df03a9..7ed6b315c9 100644 --- a/sqlmesh/core/config/connection.py +++ b/sqlmesh/core/config/connection.py @@ -2388,7 +2388,7 @@ class StarRocksConnectionConfig(ConnectionConfig): type_: t.Literal["starrocks"] = Field(alias="type", default="starrocks") DIALECT: t.ClassVar[t.Literal["starrocks"]] = "starrocks" DISPLAY_NAME: t.ClassVar[t.Literal["StarRocks"]] = "StarRocks" - DISPLAY_ORDER: t.ClassVar[t.Literal[19]] = 19 + DISPLAY_ORDER: t.ClassVar[t.Literal[18]] = 18 _engine_import_validator = _get_engine_import_validator("pymysql", "starrocks") From f3758f431d98646d4ecf342b52172acc2b234190 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Fri, 16 Jan 2026 15:25:20 +0800 Subject: [PATCH 06/20] starrocks: add some test cases for partition Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- .../integration/test_integration_starrocks.py | 157 
++++++++++++++++++ tests/core/engine_adapter/test_starrocks.py | 71 +++++++- tests/core/test_connection_config.py | 2 +- 3 files changed, 222 insertions(+), 8 deletions(-) diff --git a/tests/core/engine_adapter/integration/test_integration_starrocks.py b/tests/core/engine_adapter/integration/test_integration_starrocks.py index eee2b2054b..6d46d91b42 100644 --- a/tests/core/engine_adapter/integration/test_integration_starrocks.py +++ b/tests/core/engine_adapter/integration/test_integration_starrocks.py @@ -1434,6 +1434,163 @@ def test_e2e_partition_list(self, starrocks_adapter: StarRocksEngineAdapter): finally: starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + # ======================================== + # Case 6B: Expression Partitioning (table + MV) + # Covers: expression partitioning with/without functions, table vs MV + # ======================================== + + @pytest.mark.parametrize( + "partition_expr", + [ + "(event_date, region)", # plain columns + "date_trunc('day', event_date)", # one function expression + "(from_unixtime(ts), region)", # multiple expressions + ], + ) + def test_e2e_partition_expression_for_table( + self, + starrocks_adapter: StarRocksEngineAdapter, + partition_expr: str, + ): + """Expression partitioning for regular tables (outer paren only when no functions).""" + db_name = "sr_e2e_part_expr_tbl_db" + table_name = f"{db_name}.sr_part_expr_table" + + model_sql = f""" + MODEL ( + name test.partition_expr_table, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + ts BIGINT, + event_date DATE, + region VARCHAR(50) + ), + partitioned_by {partition_expr}, + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + ddl = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}")[1] + logger.info(f"Case 6B DDL:\n{ddl}") + ddl_upper = ddl.upper() + 
assert "PARTITION BY" in ddl_upper + + before, after = ddl_upper.split("PARTITION BY", 1) + after = after.lstrip() + + # Column/function presence + if "DATE" in partition_expr.upper(): + assert "EVENT_DATE" in after + else: + assert "REGION" in after + if "FROM_UNIXTIME" in partition_expr.upper(): + assert "FROM_UNIXTIME" in after or \ + ("FROM_UNIXTIME" in before and "__GENERATED_PARTITION_COLUMN" in after) + if "DATE_TRUNC" in partition_expr.upper(): + assert "DATE_TRUNC" in after + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + @pytest.mark.parametrize( + "partition_clause,has_func", + [ + ("(event_date, region)", False), + ("(date_trunc('day', event_date), region)", True), + ], + ) + def test_e2e_partition_expression_for_mv( + self, + starrocks_adapter: StarRocksEngineAdapter, + partition_clause: str, + has_func: bool, + ): + """Expression partitioning for MVs should always keep outer parentheses.""" + db_name = "sr_e2e_part_expr_mv_db" + src_table = f"{db_name}.sr_part_expr_src" + mv_table = f"{db_name}.sr_part_expr_mv" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + # Source table + data + starrocks_adapter.create_table( + src_table, + target_columns_to_types={ + "id": exp.DataType.build("BIGINT"), + "ts": exp.DataType.build("BIGINT"), + "event_date": exp.DataType.build("DATE"), + "region": exp.DataType.build("VARCHAR(50)"), + }, + primary_key=("id", "event_date", "region"), + table_properties={ + "partitioned_by": partition_clause, + }, + ) + starrocks_adapter.execute( + f""" + INSERT INTO {src_table} (id, ts, event_date, region) + VALUES (1, 1700000000, '2024-01-01', 'us') + """ + ) + + model_sql = f""" + MODEL ( + name test.partition_expr_mv, + kind VIEW ( + materialized true + ), + dialect starrocks, + columns ( + id BIGINT, + ts BIGINT, + event_date DATE, + region VARCHAR(50) + ), + partitioned_by {partition_clause}, + physical_properties ( + distributed_by = 'HASH(id) BUCKETS 2', + 
refresh_moment = 'IMMEDIATE', + refresh_scheme = 'ASYNC' + ) + ); + SELECT id, ts, event_date, region FROM {src_table}; + """ + + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + materialized_properties = _materialized_properties_from_model(model) + + starrocks_adapter.create_view( + mv_table, + query, + replace=True, + materialized=True, + target_columns_to_types=model.columns_to_types, + materialized_properties=materialized_properties, + view_properties=model.physical_properties, + ) + + ddl = fetchone_or_fail( + starrocks_adapter, f"SHOW CREATE MATERIALIZED VIEW {mv_table}" + )[1] + logger.info(f"Case 6B DDL:\n{ddl}") + ddl_upper = ddl.upper() + assert "PARTITION BY" in ddl_upper + after = ddl_upper.split("PARTITION BY", 1)[1].lstrip() + assert after.startswith("("), f"MV partition should keep parentheses, got: {after[:50]}" + assert "REGION" in after + assert ("DATE_TRUNC" in after) == has_func + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + # ======================================== # Case 7: Other Key Types (test_design.md Case 7) # Covers: duplicate_key, unique_key, aggregate_key diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py index 46f9b5be08..f0610fce72 100644 --- a/tests/core/engine_adapter/test_starrocks.py +++ b/tests/core/engine_adapter/test_starrocks.py @@ -809,9 +809,11 @@ class TestPartitionPropertyBuilding: "partition_expr,expected_clause", [ # Expression partitioning - single column - ("'dt'", "PARTITION BY (dt)"), + ("'dt'", "PARTITION BY (`dt`)"), # Expression partitioning - multi-column - ("(year, month)", "PARTITION BY (year, month)"), + ("(year, month)", "PARTITION BY (`year`, `month`)"), + # Expression partitioning - multi-column with func + ("(date_trunc('day', dt), region)", "PARTITION BY DATE_TRUNC('DAY', `dt`), `region`"), # RANGE partitioning ("RANGE (dt)", "PARTITION BY RANGE (`dt`) ()"), # LIST 
partitioning @@ -852,6 +854,61 @@ def test_partitioned_by_forms( sql = to_sql_calls(adapter)[0] assert expected_clause in sql + @pytest.mark.parametrize( + "partition_expr,expected_clause", + [ + ("(year, month)", "PARTITION BY (`year`, `month`)"), + ( + "(date_trunc('day', dt), region)", + "PARTITION BY (DATE_TRUNC('DAY', `dt`), `region`)", + ), + ( + "(from_unixtime(dt))", + "PARTITION BY (FROM_UNIXTIME(`dt`))", + ), + ], + ) + def test_partitioned_by_forms_for_mv( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + partition_expr: str, + expected_clause: str, + ): + """MV partition_by should keep outer parentheses when rendering partition tuples.""" + model_sql = f""" + MODEL ( + name test_schema.test_mv_partition, + kind VIEW ( + materialized true, + ), + dialect starrocks, + columns (dt DATE, region STRING, year INT, month INT), + physical_properties ( + partition_by = {partition_expr} + ) + ); + SELECT dt, region, year, month FROM src; + """ + + model = _load_sql_model(model_sql) + materialized_properties = ( + {"partitioned_by": model.partitioned_by} if model.partitioned_by else None + ) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view( + model.name, + model.render_query(), + materialized=True, + replace=False, + target_columns_to_types=_columns(model), + materialized_properties=materialized_properties, + view_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert expected_clause in sql + def test_partition_by_alias( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] ): @@ -881,7 +938,7 @@ def test_partition_by_alias( ) sql = to_sql_calls(adapter)[0] - assert "PARTITION BY (year, month)" in sql + assert "PARTITION BY (year, month)" in sql or "PARTITION BY (`year`, `month`)" in sql def test_partitioned_by_as_model_parameter( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] @@ -909,7 +966,7 @@ def 
test_partitioned_by_as_model_parameter( ) sql = to_sql_calls(adapter)[0] - assert "PARTITION BY (year, month)" in sql + assert "PARTITION BY (year, month)" in sql or "PARTITION BY (`year`, `month`)" in sql def test_partitions_value_forms( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] @@ -1354,10 +1411,10 @@ def _build_mv_model(self, property_sql: str) -> SqlModel: model_sql = f""" MODEL ( name test_schema.test_mv_refresh_model, - kind VIEW, + kind VIEW (materialized true), dialect starrocks, columns (a INT), - virtual_properties ( + physical_properties ( {property_sql} ) ); @@ -1377,7 +1434,7 @@ def _create_simple_mv( replace=False, materialized=True, target_columns_to_types=_columns(model), - view_properties=model.virtual_properties, + view_properties=model.physical_properties, ) # replace=False → only CREATE statement is emitted return to_sql_calls(adapter)[-1] diff --git a/tests/core/test_connection_config.py b/tests/core/test_connection_config.py index 3e3065b600..b0ea640819 100644 --- a/tests/core/test_connection_config.py +++ b/tests/core/test_connection_config.py @@ -2022,7 +2022,7 @@ def test_starrocks(make_config): assert config.database == "testdb" assert config.DIALECT == "starrocks" assert config.DISPLAY_NAME == "StarRocks" - assert config.DISPLAY_ORDER == 19 + assert config.DISPLAY_ORDER == 18 assert config.is_recommended_for_state_sync is False # Test with minimal configuration (using default port) From bf8af5b487098a9bb27e13d7496c5d1585efa589 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Tue, 20 Jan 2026 09:59:55 +0800 Subject: [PATCH 07/20] optimize docs for View and MV Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- docs/integrations/engines/starrocks.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/integrations/engines/starrocks.md b/docs/integrations/engines/starrocks.md index cc7ab8a2e9..c7855dee70 100644 --- a/docs/integrations/engines/starrocks.md +++ 
b/docs/integrations/engines/starrocks.md @@ -424,7 +424,7 @@ MODEL ( ### Views -StarRocks supports view `SECURITY` via `physical_properties.security`. +StarRocks supports view `SECURITY` via **`virtual_properties`**.`security`. **Syntax:** @@ -434,7 +434,7 @@ StarRocks supports view `SECURITY` via `physical_properties.security`. MODEL ( name user_summary_view, kind VIEW, - physical_properties ( + virtual_properties ( security = INVOKER ) ); @@ -451,12 +451,15 @@ GROUP BY user_id; SQLMesh uses `kind VIEW (materialized true)` to create materialized views. -You can specify StarRocks MV refresh settings using the same `physical_properties` block. +For ASYNC MVs, StarRocks requires a `REFRESH` clause, so you must specify **at least one** of `refresh_moment` or `refresh_scheme`. + +MV properties (including `refresh_moment` / `refresh_scheme` and other table-like properties such as partitioning, distribution, ordering, and generic properties) must be specified in **`physical_properties`**. **Refresh properties:** * `refresh_moment`: `IMMEDIATE` or `DEFERRED` (optional) * `refresh_scheme`: `MANUAL` or `ASYNC ...` (optional) + * If you specify it with the `START/EVERY`, you must specify it as a whole string, quoted by a pair of quotes. * Examples: `ASYNC`, `MANUAL`, `ASYNC START ("2024-01-01 00:00:00") EVERY (INTERVAL 5 MINUTE)` * The syntax of `ASYNC ...` clause is the same as the clause in StarRocks. @@ -502,7 +505,7 @@ target_columns_to_types = { ## Limitations -* **No sync MV support (currently)**: synchronous materialized views are not supported yet. +* **No SYNC MV support**: synchronous materialized views are not supported yet. * **No tuple IN**: StarRocks does not support `(c1, c2) IN ((v1, v2), ...)`. * **No `SELECT ... FOR UPDATE`**: StarRocks is an OLAP database and does not support row locks; SQLMesh removes `FOR UPDATE` when executing SQLGlot expressions. 
* **RENAME caveat**: `ALTER TABLE db.old RENAME db.new` is not supported; the `RENAME` target cannot be qualified with a database name. From 8fb92463d05259555bbdcd4261ba022600d2e6de Mon Sep 17 00:00:00 2001 From: jaogoy Date: Tue, 20 Jan 2026 18:37:47 +0800 Subject: [PATCH 08/20] optimize code style Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- .../integration/test_integration_starrocks.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/core/engine_adapter/integration/test_integration_starrocks.py b/tests/core/engine_adapter/integration/test_integration_starrocks.py index 6d46d91b42..1c6d8e0151 100644 --- a/tests/core/engine_adapter/integration/test_integration_starrocks.py +++ b/tests/core/engine_adapter/integration/test_integration_starrocks.py @@ -207,7 +207,9 @@ def _get_config_value(name: str) -> t.Optional[str]: current_replication = _get_config_value("default_replication_num") try: - current_replication_int = int(current_replication) if current_replication is not None else None + current_replication_int = ( + int(current_replication) if current_replication is not None else None + ) except Exception: current_replication_int = None @@ -222,9 +224,7 @@ def _get_config_value(name: str) -> t.Optional[str]: current_replication, ) except Exception as e: # pragma: no cover - do not break tests if lacking privilege - logger.warning( - "Failed to set default_replication_num for shared_nothing cluster: %s", e - ) + logger.warning("Failed to set default_replication_num for shared_nothing cluster: %s", e) class TestBasicOperations: @@ -249,10 +249,10 @@ def test_create_drop_schema(self, ctx: TestContext, engine_adapter: StarRocksEng # DROP DATABASE engine_adapter.drop_schema(db_name) - result: t.Optional[Row] = engine_adapter.fetchone( + dropped_result: t.Optional[Row] = engine_adapter.fetchone( f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{db_name}'" ) - assert result is None, "DROP 
DATABASE failed" + assert dropped_result is None, "DROP DATABASE failed" def test_create_drop_table(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): """Test CREATE TABLE and DROP TABLE (TestContext version).""" @@ -1491,8 +1491,9 @@ def test_e2e_partition_expression_for_table( else: assert "REGION" in after if "FROM_UNIXTIME" in partition_expr.upper(): - assert "FROM_UNIXTIME" in after or \ - ("FROM_UNIXTIME" in before and "__GENERATED_PARTITION_COLUMN" in after) + assert "FROM_UNIXTIME" in after or ( + "FROM_UNIXTIME" in before and "__GENERATED_PARTITION_COLUMN" in after + ) if "DATE_TRUNC" in partition_expr.upper(): assert "DATE_TRUNC" in after finally: @@ -1578,9 +1579,9 @@ def test_e2e_partition_expression_for_mv( view_properties=model.physical_properties, ) - ddl = fetchone_or_fail( - starrocks_adapter, f"SHOW CREATE MATERIALIZED VIEW {mv_table}" - )[1] + ddl = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE MATERIALIZED VIEW {mv_table}")[ + 1 + ] logger.info(f"Case 6B DDL:\n{ddl}") ddl_upper = ddl.upper() assert "PARTITION BY" in ddl_upper From 41474ca3ce9321f94634801851d461aae4c4d119 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Wed, 21 Jan 2026 19:34:37 +0800 Subject: [PATCH 09/20] fix docker compose usage Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- .../integration/docker/compose.starrocks.yaml | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml b/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml index 3a19fa6a3f..bdd87adbab 100644 --- a/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml +++ b/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml @@ -3,25 +3,48 @@ services: image: starrocks/fe-ubuntu:3.5-latest container_name: starrocks-fe hostname: starrocks-fe - environment: - - FE_SERVERS=fe1:starrocks-fe:9030 + command: | + bash /opt/starrocks/fe/bin/start_fe.sh --host_type 
FQDN ports: - - "9030:9030" # MySQL protocol port for tests - - "8030:8030" # HTTP port + - 9030:9030 # MySQL protocol port for tests + - 9020:9020 + - 8030:8030 # HTTP port networks: - starrocks_net + healthcheck: + test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW FRONTENDS\G" |grep "Alive: true"' + interval: 10s + timeout: 5s + retries: 3 starrocks-be: image: starrocks/be-ubuntu:3.5-latest container_name: starrocks-be hostname: starrocks-be depends_on: - - starrocks-fe + starrocks-fe: + condition: service_healthy + command: + - /bin/bash + - -c + - | + ulimit -n 65535; + sleep 15s + mysql --connect-timeout 2 -h starrocks-fe -P 9030 -u root -e "ALTER SYSTEM ADD BACKEND \"starrocks-be:9050\";" + bash /opt/starrocks/be/bin/start_be.sh environment: - - FE_SERVERS=starrocks-fe:9030 + - HOST_TYPE=FQDN + ports: + - 8040:8040 networks: - starrocks_net + healthcheck: + test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW BACKENDS\G" |grep "Alive: true"' + interval: 10s + timeout: 5s + retries: 3 networks: starrocks_net: driver: bridge + From 79924fa3d3c631cda58cca937473e9c1e6d10896 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Wed, 28 Jan 2026 15:35:37 +0800 Subject: [PATCH 10/20] modity test cases according to the sqlglot change Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- tests/core/engine_adapter/test_starrocks.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py index f0610fce72..aebb415932 100644 --- a/tests/core/engine_adapter/test_starrocks.py +++ b/tests/core/engine_adapter/test_starrocks.py @@ -806,18 +806,18 @@ class TestPartitionPropertyBuilding: """Tests for partitioned_by/partition_by and partitions properties.""" @pytest.mark.parametrize( - "partition_expr,expected_clause", + "partition_expr,expected_clause,expected_clause2", [ # Expression partitioning - single column - ("'dt'", "PARTITION BY (`dt`)"), + ("'dt'", 
"PARTITION BY `dt`", "PARTITION BY (`dt`)"), # Expression partitioning - multi-column - ("(year, month)", "PARTITION BY (`year`, `month`)"), + ("(year, month)", "PARTITION BY `year`, `month`", "PARTITION BY (`year`, `month`)"), # Expression partitioning - multi-column with func - ("(date_trunc('day', dt), region)", "PARTITION BY DATE_TRUNC('DAY', `dt`), `region`"), + ("(date_trunc('day', dt), region)", "PARTITION BY DATE_TRUNC('DAY', `dt`), `region`", None), # RANGE partitioning - ("RANGE (dt)", "PARTITION BY RANGE (`dt`) ()"), + ("RANGE (dt)", "PARTITION BY RANGE (`dt`) ()", None), # LIST partitioning - ("LIST (region)", "PARTITION BY LIST (`region`) ()"), + ("LIST (region)", "PARTITION BY LIST (`region`) ()", None), ], ) def test_partitioned_by_forms( @@ -825,6 +825,7 @@ def test_partitioned_by_forms( make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], partition_expr: str, expected_clause: str, + expected_clause2: t.Optional[str], ): """Test partition_by with various forms parsed from physical_properties.""" model_sql = f""" @@ -852,7 +853,7 @@ def test_partitioned_by_forms( ) sql = to_sql_calls(adapter)[0] - assert expected_clause in sql + assert expected_clause in sql or expected_clause2 and expected_clause2 in sql @pytest.mark.parametrize( "partition_expr,expected_clause", @@ -938,7 +939,7 @@ def test_partition_by_alias( ) sql = to_sql_calls(adapter)[0] - assert "PARTITION BY (year, month)" in sql or "PARTITION BY (`year`, `month`)" in sql + assert "PARTITION BY (`year`, `month`)" in sql or "PARTITION BY `year`, `month`" in sql def test_partitioned_by_as_model_parameter( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] @@ -966,7 +967,7 @@ def test_partitioned_by_as_model_parameter( ) sql = to_sql_calls(adapter)[0] - assert "PARTITION BY (year, month)" in sql or "PARTITION BY (`year`, `month`)" in sql + assert "PARTITION BY (year, month)" in sql or "PARTITION BY `year`, `month`" in sql or "PARTITION BY (`year`, 
`month`)" in sql def test_partitions_value_forms( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] From 63135e79f6ac12f77b5547db174a56e11a74c6fc Mon Sep 17 00:00:00 2001 From: jaogoy Date: Thu, 5 Feb 2026 15:59:26 +0800 Subject: [PATCH 11/20] modify code according some small change of sqlglot Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- sqlmesh/core/engine_adapter/starrocks.py | 7 +++---- tests/core/engine_adapter/test_starrocks.py | 12 ++++++++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py index 18a0d8a4e5..4b6df20a20 100644 --- a/sqlmesh/core/engine_adapter/starrocks.py +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -2882,7 +2882,7 @@ def _build_partitioned_by_exp( create_expressions=create_expressions, ) elif partition_kind is None: - return exp.PartitionedByProperty(this=exp.tuple_(*partitioned_by)) + return exp.PartitionedByProperty(this=exp.Schema(expressions=partitioned_by)) return None @@ -2947,13 +2947,12 @@ def _build_distributed_by_property( expressions_list.append(exp.to_column(str(col))) # Build buckets expression buckets: t.Optional[t.Any] = unified.get("buckets") + buckets_expr: t.Optional[exp.Expression] = None if buckets is not None: if isinstance(buckets, exp.Literal): buckets_expr = buckets else: buckets_expr = exp.Literal.number(int(buckets)) - else: - buckets_expr = None result = exp.DistributedByProperty( kind=kind_expr, @@ -2986,7 +2985,7 @@ def _build_refresh_property( # method is required by exp.RefreshTriggerProperty, but StarRocks syntax does NOT support AUTO. # We use a sentinel value that the StarRocks generator will not render (it only renders # IMMEDIATE/DEFERRED). 
- method_expr = exp.Var(this="UNSPECIFIED") + method_expr = None if refresh_moment is not None: refresh_moment_text = PropertyValidator.validate_and_normalize_property( "refresh_moment", refresh_moment diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py index aebb415932..524870e78a 100644 --- a/tests/core/engine_adapter/test_starrocks.py +++ b/tests/core/engine_adapter/test_starrocks.py @@ -813,7 +813,11 @@ class TestPartitionPropertyBuilding: # Expression partitioning - multi-column ("(year, month)", "PARTITION BY `year`, `month`", "PARTITION BY (`year`, `month`)"), # Expression partitioning - multi-column with func - ("(date_trunc('day', dt), region)", "PARTITION BY DATE_TRUNC('DAY', `dt`), `region`", None), + ( + "(date_trunc('day', dt), region)", + "PARTITION BY DATE_TRUNC('DAY', `dt`), `region`", + None, + ), # RANGE partitioning ("RANGE (dt)", "PARTITION BY RANGE (`dt`) ()", None), # LIST partitioning @@ -967,7 +971,11 @@ def test_partitioned_by_as_model_parameter( ) sql = to_sql_calls(adapter)[0] - assert "PARTITION BY (year, month)" in sql or "PARTITION BY `year`, `month`" in sql or "PARTITION BY (`year`, `month`)" in sql + assert ( + "PARTITION BY (year, month)" in sql + or "PARTITION BY `year`, `month`" in sql + or "PARTITION BY (`year`, `month`)" in sql + ) def test_partitions_value_forms( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] From a57f72130fc0f742f5be1730da5b30338539dff7 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Sun, 8 Feb 2026 18:24:51 +0800 Subject: [PATCH 12/20] modify test about materialized_properties Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- sqlmesh/core/snapshot/evaluator.py | 6 ++++-- tests/core/test_snapshot_evaluator.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py index 55046427bc..0b3003c665 100644 --- a/sqlmesh/core/snapshot/evaluator.py +++ 
b/sqlmesh/core/snapshot/evaluator.py @@ -2743,11 +2743,13 @@ def insert( logger.info("Replacing view '%s'", table_name) materialized_properties = None - if is_materialized_view: + if is_materialized_view and ( + model.partitioned_by or model.partition_interval_unit or model.clustered_by + ): materialized_properties = { "partitioned_by": model.partitioned_by, - "clustered_by": model.clustered_by, "partition_interval_unit": model.partition_interval_unit, + "clustered_by": model.clustered_by, } self.adapter.create_view( table_name, diff --git a/tests/core/test_snapshot_evaluator.py b/tests/core/test_snapshot_evaluator.py index 2b07e94fd6..51ea5431f7 100644 --- a/tests/core/test_snapshot_evaluator.py +++ b/tests/core/test_snapshot_evaluator.py @@ -782,6 +782,7 @@ def test_evaluate_materialized_view_with_execution_time_macro( view_properties={}, table_description=None, column_descriptions={}, + materialized_properties=None, ) From 42492d1f6b2eec67095eae04b241ab6041a7d006 Mon Sep 17 00:00:00 2001 From: jaogoy Date: Mon, 9 Feb 2026 14:56:48 +0800 Subject: [PATCH 13/20] set default replication num to 1 Signed-off-by: jaogoy Signed-off-by: Mateusz Jukiewicz --- .github/scripts/wait-for-db.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/scripts/wait-for-db.sh b/.github/scripts/wait-for-db.sh index 85f30dc7d7..e69504b6da 100755 --- a/.github/scripts/wait-for-db.sh +++ b/.github/scripts/wait-for-db.sh @@ -76,6 +76,9 @@ starrocks_ready() { echo "Waiting for more backends to become alive..." 
sleep 5 done + + # set default replication num to 1 (there is only one be in the docker compose file) + docker exec -i starrocks-fe mysql -h127.0.0.1 -P9030 -uroot -e "ADMIN SET frontend config ('default_replication_num' = '1');" } trino_ready() { From 55b3bb54e58248216ec00fac1f7b57dce5d8e595 Mon Sep 17 00:00:00 2001 From: Mateusz Jukiewicz Date: Tue, 2 Jun 2026 15:09:40 +0200 Subject: [PATCH 14/20] add starrocks to the CI matrix in the GitHub Actions workflow Signed-off-by: Mateusz Jukiewicz --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8610f6ff39..8759bd484c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -252,7 +252,7 @@ jobs: fail-fast: false matrix: engine: - [duckdb, postgres, mysql, mssql, trino, spark, clickhouse, risingwave] + [duckdb, postgres, mysql, mssql, trino, spark, clickhouse, risingwave, starrocks] env: PYTEST_XDIST_AUTO_NUM_WORKERS: 2 SQLMESH__DISABLE_ANONYMIZED_ANALYTICS: '1' From a5c258de3118c7e9ce749b07967ded867e09383b Mon Sep 17 00:00:00 2001 From: Mateusz Jukiewicz Date: Tue, 2 Jun 2026 15:45:18 +0200 Subject: [PATCH 15/20] Fix `make style` errors (mypy, ruff) Signed-off-by: Mateusz Jukiewicz --- sqlmesh/core/engine_adapter/starrocks.py | 114 +++++++++++------------ sqlmesh/core/snapshot/evaluator.py | 2 +- 2 files changed, 57 insertions(+), 59 deletions(-) diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py index 4b6df20a20..227b061c9f 100644 --- a/sqlmesh/core/engine_adapter/starrocks.py +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -52,24 +52,24 @@ "identifier", # exp.Identifier "literal", # exp.Literal.string("HASH") "column", # exp.Column(this="HASH") - "ast_expr", # generic exp.Expression + "ast_expr", # generic exp.Expr } # ============================================================ # Fragment parser (robust-ish) # 
============================================================ -def parse_fragment(text: str) -> t.Union[exp.Expression, t.List[exp.Expression]]: +def parse_fragment(text: str) -> t.Union[exp.Expr, t.List[exp.Expr]]: """ Try to parse a DSL fragment into SQLGlot AST(s). Behavior: - 1. If parse_one succeeds, return the exp.Expression. + 1. If parse_one succeeds, return the exp.Expr. 2. If fails but text contains comma, split by commas and parse each part. 3. If it's parenthesized like "(a, b)", parse and return exp.Tuple or list. 4. If it's a simple token like "IDENT", return exp.Identifier. """ - if isinstance(text, exp.Expression): + if isinstance(text, exp.Expr): return text if not isinstance(text, str): @@ -381,7 +381,7 @@ class EnumType(DeclarativeType): - "identifier": exp.Identifier - "literal": exp.Literal.string() - "column": exp.Column - - "ast_expr": generic exp.Expression (defaults to Identifier) + - "ast_expr": generic exp.Expr (defaults to Identifier) case_sensitive : bool Whether to perform case-sensitive matching (default: False) @@ -682,7 +682,7 @@ def _extract_elements(self, value: t.Any) -> t.Optional[t.List[t.Any]]: return [inner] # Single AST element: promote to list (if allow_single) - if self.allow_single and isinstance(value, exp.Expression): + if self.allow_single and isinstance(value, exp.Expr): return [value] return None @@ -1291,11 +1291,11 @@ class PropertySpecs: in the usage layer via factory methods like DistributionTupleInputType.to_unified_dict(). 
Expected Output Types (after normalization): - - table keys: List[exp.Expression] - columns - - partitioned_by: List[exp.Expression] - columns, functions + - table keys: List[exp.Expr] - columns + - partitioned_by: List[exp.Expr] - columns, functions - partitions: List[str] - partition definition strings - distributed_by: Dict | str | exp.Func - DistributionTupleInputType, EnumType, or FuncType output - - order_by: List[exp.Expression] - columns + - order_by: List[exp.Expr] - columns - generic properties: str - normalized string values """ GeneralColumnListOutputSpec: DeclarativeType = SequenceOf(ColumnType(), allow_single=False) @@ -1880,7 +1880,7 @@ def _create_table_like( def delete_from( self, table_name: TableName, - where: t.Optional[t.Union[str, exp.Expression]] = None, + where: t.Optional[t.Union[str, exp.Expr]] = None, ) -> None: """ Delete from a table. @@ -1907,7 +1907,7 @@ def delete_from( where: The where clause to filter rows to delete """ # Parse where clause if it's a string - where_expr: t.Optional[exp.Expression] + where_expr: t.Optional[exp.Expr] if isinstance(where, str): from sqlglot import parse_one @@ -1929,7 +1929,7 @@ def delete_from( # Note: We conservatively apply restrictions to all tables since we can't easily # determine table type at DELETE time. PRIMARY KEY tables will still work with # simplified conditions, while non-PRIMARY KEY tables require them. 
- if isinstance(where_expr, exp.Expression): + if isinstance(where_expr, exp.Expr): original_where = where_expr # Remove boolean literals (not supported in any table type) where_expr = self._where_clause_remove_boolean_literals(where_expr) @@ -1946,7 +1946,7 @@ def delete_from( # Use parent implementation super().delete_from(table_name, where_expr) - def _where_clause_remove_boolean_literals(self, expression: exp.Expression) -> exp.Expression: + def _where_clause_remove_boolean_literals(self, expression: exp.Expr) -> exp.Expr: """ Remove TRUE/FALSE boolean literals from WHERE expressions. @@ -1966,7 +1966,7 @@ def _where_clause_remove_boolean_literals(self, expression: exp.Expression) -> e Cleaned expression without boolean literals """ - def transform(node: exp.Expression) -> exp.Expression: + def transform(node: exp.Expr) -> exp.Expr: # Handle standalone TRUE/FALSE at the top level if node == exp.true(): # Convert TRUE to 1=1 @@ -2002,9 +2002,7 @@ def transform(node: exp.Expression) -> exp.Expression: # Transform the expression tree return expression.transform(transform, copy=True) - def _where_clause_convert_between_to_comparison( - self, expression: exp.Expression - ) -> exp.Expression: + def _where_clause_convert_between_to_comparison(self, expression: exp.Expr) -> exp.Expr: """ Convert BETWEEN expressions to >= AND <= comparisons. 
@@ -2024,7 +2022,7 @@ def _where_clause_convert_between_to_comparison( Expression with BETWEEN converted to comparisons """ - def transform(node: exp.Expression) -> exp.Expression: + def transform(node: exp.Expr) -> exp.Expr: if isinstance(node, exp.Between): # Extract components: col BETWEEN low AND high column = node.this # The column being tested @@ -2044,7 +2042,7 @@ def transform(node: exp.Expression) -> exp.Expression: def execute( self, - expressions: t.Union[str, exp.Expression, t.Sequence[exp.Expression]], + expressions: t.Union[str, exp.Expr, t.Sequence[exp.Expr]], ignore_unsupported_errors: bool = False, quote_identifiers: bool = True, track_rows_processed: bool = False, @@ -2076,9 +2074,9 @@ def execute( return # Process expressions to remove FOR UPDATE - processed_expressions: t.List[exp.Expression] = [] + processed_expressions: t.List[exp.Expr] = [] for e in ensure_list(expressions): - if not isinstance(e, exp.Expression): + if not isinstance(e, exp.Expr): super().execute( expressions, ignore_unsupported_errors=ignore_unsupported_errors, @@ -2235,7 +2233,7 @@ def create_view( materialized_properties: t.Optional[t.Dict[str, t.Any]] = None, table_description: t.Optional[str] = None, column_descriptions: t.Optional[t.Dict[str, str]] = None, - view_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + view_properties: t.Optional[t.Dict[str, exp.Expr]] = None, source_columns: t.Optional[t.List[str]] = None, **create_kwargs: t.Any, ) -> None: @@ -2292,7 +2290,7 @@ def _create_materialized_view( materialized_properties: t.Optional[t.Dict[str, t.Any]] = None, table_description: t.Optional[str] = None, column_descriptions: t.Optional[t.Dict[str, str]] = None, - view_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + view_properties: t.Optional[t.Dict[str, exp.Expr]] = None, source_columns: t.Optional[t.List[str]] = None, **create_kwargs: t.Any, ) -> None: @@ -2403,7 +2401,7 @@ def _build_materialized_view_schema_exp( return table 
column_descriptions = column_descriptions or {} - expressions: t.List[exp.Expression] = [] + expressions: t.List[exp.Expr] = [] for col in columns: constraints: t.List[exp.ColumnConstraint] = [] comment = column_descriptions.get(col) @@ -2430,10 +2428,10 @@ def _build_table_properties_exp( catalog_name: t.Optional[str] = None, table_format: t.Optional[str] = None, storage_format: t.Optional[str] = None, - partitioned_by: t.Optional[t.List[exp.Expression]] = None, + partitioned_by: t.Optional[t.List[exp.Expr]] = None, partition_interval_unit: t.Optional[IntervalUnit] = None, - clustered_by: t.Optional[t.List[exp.Expression]] = None, - table_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + clustered_by: t.Optional[t.List[exp.Expr]] = None, + table_properties: t.Optional[t.Dict[str, exp.Expr]] = None, target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, table_description: t.Optional[str] = None, table_kind: t.Optional[str] = None, @@ -2477,7 +2475,7 @@ def _build_table_properties_exp( - replication_num, storage_medium, etc.: Literal values table_description: Table comment """ - properties: t.List[exp.Expression] = [] + properties: t.List[exp.Expr] = [] table_properties_copy = dict(table_properties) if table_properties else {} # logger.debug( # "_build_table_properties_exp: table_properties=%s", @@ -2566,7 +2564,7 @@ def _build_table_properties_exp( def _build_view_properties_exp( self, - view_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + view_properties: t.Optional[t.Dict[str, exp.Expr]] = None, table_description: t.Optional[str] = None, **kwargs: t.Any, ) -> t.Optional[exp.Properties]: @@ -2574,9 +2572,9 @@ def _build_view_properties_exp( Build CREATE VIEW properties for StarRocks. Supports StarRocks view SECURITY syntax: SECURITY {NONE | INVOKER} - via exp.SecurityProperty (renders as `SECURITY `). + via exp.SqlSecurityProperty (renders as `SECURITY `). 
""" - properties: t.List[exp.Expression] = [] + properties: t.List[exp.Expr] = [] if table_description: properties.append( @@ -2592,8 +2590,8 @@ def _build_view_properties_exp( security_text = PropertyValidator.validate_and_normalize_property( "security", security ) - # exp.SecurityProperty renders as `SECURITY ` (no '=') - properties.append(exp.SecurityProperty(this=exp.Var(this=security_text))) + # exp.SqlSecurityProperty renders as `SECURITY ` (no '=') + properties.append(exp.SqlSecurityProperty(this=exp.Var(this=security_text))) properties.extend(self._table_or_view_properties_to_expressions(view_properties_copy)) @@ -2603,7 +2601,7 @@ def _build_view_properties_exp( def _build_table_key_property( self, table_properties: t.Dict[str, t.Any], active_key_type: t.Optional[str] - ) -> t.Optional[exp.Expression]: + ) -> t.Optional[exp.Expr]: """ Build key constraint property for ALL key types including PRIMARY KEY. @@ -2628,7 +2626,7 @@ def _build_table_key_property( return None # Configuration: key_name -> Property class (excluding primary_key) - KEY_PROPERTY_CLASSES: t.Dict[str, t.Type[exp.Expression]] = { + KEY_PROPERTY_CLASSES: t.Dict[str, t.Type[exp.Expr]] = { "primary_key": exp.PrimaryKey, "duplicate_key": exp.DuplicateKeyProperty, "unique_key": exp.UniqueKeyProperty, @@ -2670,14 +2668,14 @@ def _build_table_key_property( def _build_partition_property( self, - partitioned_by: t.Optional[t.List[exp.Expression]], + partitioned_by: t.Optional[t.List[exp.Expr]], partition_interval_unit: t.Optional["IntervalUnit"], target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]], catalog_name: t.Optional[str], table_properties: t.Dict[str, t.Any], key_type: t.Optional[str], key_columns: t.Optional[t.Tuple[str, ...]], - ) -> t.Optional[exp.Expression]: + ) -> t.Optional[exp.Expr]: """ Build partition property expression. 
@@ -2722,7 +2720,7 @@ def _build_partition_property( partition_cols, ) - def extract_column_name(expr: exp.Expression) -> t.Optional[str]: + def extract_column_name(expr: exp.Expr) -> t.Optional[str]: if isinstance(expr, exp.Column): return str(expr.name) elif isinstance(expr, (exp.Anonymous, exp.Func)): # noqa: RET505 @@ -2768,8 +2766,8 @@ def extract_column_name(expr: exp.Expression) -> t.Optional[str]: return result def _parse_partition_expressions( - self, partitioned_by: t.List[exp.Expression] - ) -> t.Tuple[t.Optional[str], t.List[exp.Expression]]: + self, partitioned_by: t.List[exp.Expr] + ) -> t.Tuple[t.Optional[str], t.List[exp.Expr]]: """ Parse partition expressions and extract partition kind (RANGE/LIST). @@ -2789,7 +2787,7 @@ def _parse_partition_expressions( - partition_kind: "RANGE", "LIST", or None - normalized_columns: List of Column expressions, or function expressions """ - parsed_cols: t.List[exp.Expression] = [] + parsed_cols: t.List[exp.Expr] = [] partition_kind: t.Optional[str] = None normalized = PropertyValidator.validate_and_normalize_property( @@ -2828,7 +2826,7 @@ def _parse_partition_expressions( def _build_partitioned_by_exp( self, - partitioned_by: t.List[exp.Expression], + partitioned_by: t.List[exp.Expr], *, partition_interval_unit: t.Optional["IntervalUnit"] = None, target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, @@ -2939,15 +2937,15 @@ def _build_distributed_by_property( kind_expr = exp.Var(this=unified["kind"]) # Convert columns to expressions columns: t.List[exp.Column] = unified.get("columns", []) - expressions_list: t.List[exp.Expression] = [] + expressions_list: t.List[exp.Expr] = [] for col in columns: - if isinstance(col, exp.Expression): + if isinstance(col, exp.Expr): expressions_list.append(col) else: expressions_list.append(exp.to_column(str(col))) # Build buckets expression buckets: t.Optional[t.Any] = unified.get("buckets") - buckets_expr: t.Optional[exp.Expression] = None + buckets_expr: 
t.Optional[exp.Expr] = None if buckets is not None: if isinstance(buckets, exp.Literal): buckets_expr = buckets @@ -2992,10 +2990,10 @@ def _build_refresh_property( ) method_expr = exp.Var(this=refresh_moment_text) - kind_expr: t.Optional[exp.Expression] = None - starts_expr: t.Optional[exp.Expression] = None - every_expr: t.Optional[exp.Expression] = None - unit_expr: t.Optional[exp.Expression] = None + kind_expr: t.Optional[exp.Expr] = None + starts_expr: t.Optional[exp.Expr] = None + every_expr: t.Optional[exp.Expr] = None + unit_expr: t.Optional[exp.Expr] = None if refresh_scheme is not None: scheme_text = PropertyValidator.validate_and_normalize_property( @@ -3019,10 +3017,10 @@ def _build_refresh_property( def _parse_refresh_scheme( self, refresh_scheme: str ) -> t.Tuple[ - t.Optional[exp.Expression], - t.Optional[exp.Expression], - t.Optional[exp.Expression], - t.Optional[exp.Expression], + t.Optional[exp.Expr], + t.Optional[exp.Expr], + t.Optional[exp.Expr], + t.Optional[exp.Expr], ]: """ Parse StarRocks refresh_scheme text into (kind, starts, every, unit). @@ -3042,11 +3040,11 @@ def _parse_refresh_scheme( f"[StarRocks] Invalid refresh_scheme {refresh_scheme!r}. Expected to start with MANUAL or ASYNC." 
) kind = m_kind.group(1).upper() - kind_expr: t.Optional[exp.Expression] = exp.Var(this=kind) + kind_expr: t.Optional[exp.Expr] = exp.Var(this=kind) - starts_expr: t.Optional[exp.Expression] = None - every_expr: t.Optional[exp.Expression] = None - unit_expr: t.Optional[exp.Expression] = None + starts_expr: t.Optional[exp.Expr] = None + every_expr: t.Optional[exp.Expr] = None + unit_expr: t.Optional[exp.Expr] = None m_start = re.search( r"\bSTART\s*\(\s*(?:'([^']*)'|\"([^\"]*)\"|([^)]*))\s*\)", text, flags=re.IGNORECASE ) @@ -3110,7 +3108,7 @@ def _parse_distribution_with_buckets( def _build_order_by_property( self, table_properties: t.Dict[str, t.Any], - clustered_by: t.Optional[t.List[exp.Expression]], + clustered_by: t.Optional[t.List[exp.Expr]], ) -> t.Optional[exp.Cluster]: """ Build ORDER BY (clustering) property. diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py index 0b3003c665..07eb4e42ce 100644 --- a/sqlmesh/core/snapshot/evaluator.py +++ b/sqlmesh/core/snapshot/evaluator.py @@ -2061,7 +2061,7 @@ def _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( or "primary_key" in properties ): return properties - unique_key: t.Optional[t.List[exp.Expression]] = model.unique_key + unique_key: t.Optional[t.List[exp.Expr]] = model.unique_key if unique_key: properties["primary_key"] = ( unique_key[0] if len(unique_key) == 1 else exp.Tuple(expressions=unique_key) From 9dec6d9f8899375c6012747a16c8e3b823b301f5 Mon Sep 17 00:00:00 2001 From: Mateusz Jukiewicz Date: Wed, 3 Jun 2026 01:52:21 +0200 Subject: [PATCH 16/20] Update StarRocks Docker images to version 4.1 in the integration compose file Signed-off-by: Mateusz Jukiewicz --- .../engine_adapter/integration/docker/compose.starrocks.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml b/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml index bdd87adbab..23ff1ad7cc 
100644 --- a/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml +++ b/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml @@ -1,6 +1,6 @@ services: starrocks-fe: - image: starrocks/fe-ubuntu:3.5-latest + image: starrocks/fe-ubuntu:4.1-latest container_name: starrocks-fe hostname: starrocks-fe command: | @@ -18,7 +18,7 @@ services: retries: 3 starrocks-be: - image: starrocks/be-ubuntu:3.5-latest + image: starrocks/be-ubuntu:4.1-latest container_name: starrocks-be hostname: starrocks-be depends_on: From f258b5e7260ae9ed2ac8904d8d4395691dd3a16c Mon Sep 17 00:00:00 2001 From: Mateusz Jukiewicz Date: Thu, 4 Jun 2026 13:44:08 +0200 Subject: [PATCH 17/20] * Refactoring * Making optional but failing requirements mandatory * Fixing unit tests * Documentation updates Signed-off-by: Mateusz Jukiewicz --- docs/integrations/engines/starrocks.md | 10 +- sqlmesh/core/engine_adapter/base.py | 29 ++++ sqlmesh/core/engine_adapter/starrocks.py | 81 +++++++++-- sqlmesh/core/snapshot/evaluator.py | 86 ++++++----- .../integration/test_integration.py | 31 ++++ .../integration/test_integration_starrocks.py | 92 ++++++++++-- tests/core/engine_adapter/test_starrocks.py | 135 +++++++++++++++++- tests/core/test_snapshot_evaluator.py | 9 ++ 8 files changed, 393 insertions(+), 80 deletions(-) diff --git a/docs/integrations/engines/starrocks.md b/docs/integrations/engines/starrocks.md index c7855dee70..b283c4f686 100644 --- a/docs/integrations/engines/starrocks.md +++ b/docs/integrations/engines/starrocks.md @@ -79,7 +79,7 @@ FROM source.user_events; A `DUPLICATE KEY` table can usually be used as a `FULL` kind model. -### 2) An incremental table (PRIMARY KEY recommended) +### 2) An incremental table (PRIMARY KEY required) ```sql MODEL ( @@ -129,9 +129,11 @@ MODEL ( ### PRIMARY KEY Type -For incremental models, **PRIMARY KEY tables are needed** (and effectively required for robust deletes) because StarRocks supports *weaker* `DELETE ... 
WHERE ...` on non-primary-key table types. +For incremental models, a **PRIMARY KEY table is mandatory**. StarRocks only supports the full `DELETE ... WHERE ...` and `MERGE` semantics that incremental kinds rely on (such as `INCREMENTAL_BY_TIME_RANGE`, `INCREMENTAL_BY_UNIQUE_KEY`, `INCREMENTAL_BY_PARTITION`, and `SCD_TYPE_2`) on PRIMARY KEY tables. On DUPLICATE KEY, UNIQUE KEY, and AGGREGATE KEY tables these operations are not supported well enough. -SQLMesh will apply conservative `WHERE` transformations for compatibility (for example, converting `BETWEEN` to `>= AND <=`, removing boolean literals, and converting `DELETE ... WHERE TRUE` to `TRUNCATE TABLE`). To avoid limitations and keep incremental maintenance reliable, use a `PRIMARY KEY` table by setting `physical_properties.primary_key`. +SQLMesh enforces this: an incremental model on StarRocks without a primary key fails fast with a clear error. Set `physical_properties.primary_key`, for example `physical_properties (primary_key = (user_id, event_date))`. As a convenience, an `INCREMENTAL_BY_UNIQUE_KEY` model's `unique_key` is automatically promoted to a PRIMARY KEY table. + +SQLMesh engine also applies conservative `WHERE` transformations for compatibility (for example, converting `BETWEEN` to `>= AND <=`, removing boolean literals, and converting `DELETE ... WHERE TRUE` to `TRUNCATE TABLE`). > SQLMesh currently does not support specifying `primary_key` as a model parameter. @@ -160,7 +162,7 @@ GROUP BY user_id, event_date; ### UNIQUE KEY Type -You can create a UNIQUE KEY table by setting `physical_properties.unique_key`. In most incremental use cases, a PRIMARY KEY table is recommended instead. +You can create a UNIQUE KEY table by setting `physical_properties.unique_key`. Note that a UNIQUE KEY table is **not** sufficient for incremental models — incremental kinds require a PRIMARY KEY table (see [PRIMARY KEY Type](#primary-key-type)). 
**Example:** diff --git a/sqlmesh/core/engine_adapter/base.py b/sqlmesh/core/engine_adapter/base.py index 5465ea1197..a4e450dd1c 100644 --- a/sqlmesh/core/engine_adapter/base.py +++ b/sqlmesh/core/engine_adapter/base.py @@ -2728,6 +2728,35 @@ def _build_clustered_by_exp( ) -> t.Optional[exp.Cluster]: return None + def adjust_physical_properties_for_incremental( + self, + physical_properties: t.Dict[str, t.Any], + *, + requires_delete_capable_table: bool, + unique_key: t.Optional[t.List[exp.Expr]], + model_name: str, + ) -> t.Dict[str, t.Any]: + """Adjusts physical properties for an incremental model before the table is created. + + Some engines require a specific physical table layout before they can run the DELETE/MERGE + statements that incremental model kinds rely on (e.g. StarRocks only supports those on + PRIMARY KEY tables). This hook lets each engine derive or validate the required properties + while keeping the generic evaluator free of engine-specific branching. + + Args: + physical_properties: The model's physical properties. + requires_delete_capable_table: Whether the model kind issues DELETE/MERGE statements + (as opposed to append-only INSERTs), as determined by the generic evaluator. + unique_key: The model's unique key, populated only when the kind allows promoting it to + an engine-specific key (i.e. INCREMENTAL_BY_UNIQUE_KEY); otherwise None. + model_name: The model name, for use in diagnostics. + + Returns: + The (possibly adjusted) physical properties. Implementations own the given mapping and + may mutate it in place; the base implementation returns it unchanged. 
+ """ + return physical_properties + def _build_table_properties_exp( self, catalog_name: t.Optional[str] = None, diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py index 227b061c9f..8ce78ca059 100644 --- a/sqlmesh/core/engine_adapter/starrocks.py +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -1672,8 +1672,11 @@ class StarRocksEngineAdapter( TODO: later, we can add support for INSERT OVERWRITE, even use Primary Key for beter performance """ - COMMENT_CREATION_TABLE = CommentCreationTable.IN_SCHEMA_DEF_CTAS - """Table comments are added in both CREATE TABLE statement and CTAS""" + COMMENT_CREATION_TABLE = CommentCreationTable.IN_SCHEMA_DEF_NO_CTAS + """Column comments are added inline in a plain CREATE TABLE, but StarRocks CTAS only accepts a + bare column-name list (no types or per-column COMMENT) before AS SELECT. So for CTAS we emit + `CREATE TABLE t COMMENT '...' AS SELECT ...` (table comment only) and register column comments + afterward via ALTER TABLE ... MODIFY COLUMN ... COMMENT (see _build_create_comment_column_exp).""" COMMENT_CREATION_VIEW = CommentCreationView.IN_SCHEMA_DEF_NO_COMMANDS """View comments are added in CREATE VIEW statement""" @@ -2105,6 +2108,48 @@ def execute( **kwargs, ) + def adjust_physical_properties_for_incremental( + self, + physical_properties: t.Dict[str, t.Any], + *, + requires_delete_capable_table: bool, + unique_key: t.Optional[t.List[exp.Expr]], + model_name: str, + ) -> t.Dict[str, t.Any]: + """Enforce that StarRocks incremental models use a PRIMARY KEY table. + + Incremental kinds rely on DELETE/MERGE statements that StarRocks only supports on PRIMARY + KEY tables; DUPLICATE/UNIQUE/AGGREGATE KEY tables reject the predicates SQLMesh generates + (e.g. a time-range DELETE with a CAST bound, or any non-key-column predicate). 
When a + unique_key is available (INCREMENTAL_BY_UNIQUE_KEY) we promote it to a PRIMARY KEY; + otherwise a PRIMARY KEY must be specified explicitly via physical_properties, and we raise + so the failure is clear at creation time rather than producing a broken table. + + The caller owns ``physical_properties`` (it is already a defensive copy), so we mutate and + return it in place. + """ + if not requires_delete_capable_table or "primary_key" in physical_properties: + return physical_properties + + # Promote the model's unique_key to a PRIMARY KEY table so that complex DELETE/MERGE + # statements remain supported. + if unique_key: + physical_properties["primary_key"] = ( + unique_key[0] if len(unique_key) == 1 else exp.Tuple(expressions=unique_key) + ) + logger.info( + "Model '%s' promoted to PRIMARY KEY table on StarRocks to support rich DELETE operations.", + model_name, + ) + return physical_properties + + raise SQLMeshError( + f"StarRocks incremental model '{model_name}' requires a PRIMARY KEY table. " + "Incremental kinds use DELETE/MERGE operations that StarRocks only supports on PRIMARY KEY " + "tables; DUPLICATE/UNIQUE/AGGREGATE KEY tables are not sufficient. " + "Specify `physical_properties (primary_key = (...))`, or set `unique_key` on the model." + ) + # ==================== Table Creation (CORE IMPLEMENTATION) ==================== def _create_table_from_columns( self, @@ -2546,10 +2591,18 @@ def _build_table_properties_exp( properties.append(distributed_prop) # 5. Handle refresh_property (REFRESH ...) + # StarRocks only supports ASYNC materialized views, which require a REFRESH clause. + # Synchronous MVs are not supported, so a missing refresh is a hard error rather than + # a silent fallback (which would create an undetectable sync MV). 
if is_mv: refresh_prop = self._build_refresh_property(table_properties_copy) - if refresh_prop: - properties.append(refresh_prop) + if refresh_prop is None: + raise SQLMeshError( + "StarRocks materialized views require a REFRESH clause. " + "Specify at least one of 'refresh_moment' or 'refresh_scheme' in the model's " + "physical_properties (e.g. refresh_scheme = 'ASYNC')." + ) + properties.append(refresh_prop) # 6. Handle order_by/clustered_by (ORDER BY ...) order_prop = self._build_order_by_property(table_properties_copy, clustered_by or None) @@ -3314,9 +3367,9 @@ def _build_create_comment_table_exp( StarRocks uses non-standard syntax for table comments: ALTER TABLE {table} COMMENT = '{comment}' - Note: This method is typically NOT called for StarRocks because: - - COMMENT_CREATION_TABLE = IN_SCHEMA_DEF_CTAS - - Comments are included directly in CREATE TABLE via SchemaCommentProperty + Note: This method is typically NOT called for StarRocks because the table comment is + included directly in CREATE TABLE (and CTAS) via SchemaCommentProperty, which StarRocks + accepts even for `CREATE TABLE ... COMMENT '...' AS SELECT`. However, this override is provided for potential future use cases: - Modifying comments on existing tables via ALTER TABLE @@ -3346,15 +3399,13 @@ def _build_create_comment_column_exp( """ Build ALTER TABLE MODIFY COLUMN SQL for column comment modification. 
- StarRocks requires column type in MODIFY COLUMN statement: - ALTER TABLE {table} MODIFY COLUMN {column} {type} COMMENT '{comment}' - - Note: This method is typically NOT called for StarRocks because: - - COMMENT_CREATION_TABLE = IN_SCHEMA_DEF_CTAS - - Column comments are included directly in CREATE TABLE DDL + StarRocks accepts the comment without re-stating the column type: + ALTER TABLE {table} MODIFY COLUMN {column} COMMENT '{comment}' - However, this override is provided for potential future use cases: - - Modifying column comments on existing tables via ALTER TABLE + Because COMMENT_CREATION_TABLE = IN_SCHEMA_DEF_NO_CTAS, column comments are inlined for a + plain CREATE TABLE but NOT for CTAS (StarRocks rejects types/comments in a CTAS column + list). This method is therefore the fallback used to register column comments after a CTAS, + and to modify column comments on existing tables. Args: table: Table expression diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py index 07eb4e42ce..cb7ef17580 100644 --- a/sqlmesh/core/snapshot/evaluator.py +++ b/sqlmesh/core/snapshot/evaluator.py @@ -2045,38 +2045,36 @@ def run_post_statements(self, snapshot: Snapshot, render_kwargs: t.Any) -> None: self.adapter.execute(snapshot.model.render_post_statements(**render_kwargs)) -def _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model: Model, physical_properties: t.Optional[t.Dict[str, t.Any]] +def _adjust_physical_properties_for_engine( + adapter: EngineAdapter, + model: Model, + physical_properties: t.Optional[t.Dict[str, t.Any]], ) -> t.Dict[str, t.Any]: - """ - Promote StarRocks incremental-by-unique-key models to PRIMARY KEY tables so that - complex DELETE/MERGE statements remain supported. - """ + """Let the target engine adjust/validate physical properties for an incremental model. 
- properties = dict(physical_properties or {}) - - if ( - model.dialect != "starrocks" - or not model.kind.is_incremental_by_unique_key - or "primary_key" in properties - ): - return properties - unique_key: t.Optional[t.List[exp.Expr]] = model.unique_key - if unique_key: - properties["primary_key"] = ( - unique_key[0] if len(unique_key) == 1 else exp.Tuple(expressions=unique_key) - ) - logger.info( - "Model '%s' promoted to PRIMARY KEY table on StarRocks to support rich DELETE operations.", - model.name, - ) - else: - logger.warning( - f"StarRocks incremental-by-unique-key model '{model.name}' requires a PRIMARY KEY table. " - f"Specify `physical_properties['primary_key']` or set `unique_key` on the model.", - ) + The generic responsibility here is to determine, from the model kind, whether the table will + be the target of DELETE/MERGE statements (vs. append-only INSERTs) and whether its unique_key + may be promoted to an engine-specific key. The engine adapter decides what, if anything, to do + with that information (see ``EngineAdapter.adjust_physical_properties_for_incremental``). + """ + kind = model.kind + + # Only incremental kinds that issue DELETE/MERGE need a delete-capable table. Append-only + # INCREMENTAL_UNMANAGED (insert_overwrite=False) only does INSERT, so it does not. 
+ requires_delete_capable_table = ( + kind.is_incremental_by_time_range + or kind.is_incremental_by_unique_key + or kind.is_incremental_by_partition + or kind.is_scd_type_2 + or (isinstance(kind, IncrementalUnmanagedKind) and kind.insert_overwrite) + ) - return properties + return adapter.adjust_physical_properties_for_incremental( + dict(physical_properties or {}), + requires_delete_capable_table=requires_delete_capable_table, + unique_key=model.unique_key if kind.is_incremental_by_unique_key else None, + model_name=model.name, + ) class MaterializableStrategy(PromotableStrategy, abc.ABC): @@ -2090,9 +2088,8 @@ def create( **kwargs: t.Any, ) -> None: ctas_query = model.ctas_query(**render_kwargs) - physical_properties = kwargs.get("physical_properties", model.physical_properties) - physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties + physical_properties = _adjust_physical_properties_for_engine( + self.adapter, model, kwargs.get("physical_properties", model.physical_properties) ) logger.info("Creating table '%s'", table_name) @@ -2208,9 +2205,8 @@ def _replace_query_for_model( except Exception: columns_to_types, source_columns = None, None - physical_properties = kwargs.get("physical_properties", model.physical_properties) - physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties + physical_properties = _adjust_physical_properties_for_engine( + self.adapter, model, kwargs.get("physical_properties", model.physical_properties) ) self.adapter.replace_query( name, @@ -2354,9 +2350,8 @@ def insert( table_name, render_kwargs=render_kwargs, ) - physical_properties = kwargs.get("physical_properties", model.physical_properties) - physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties + physical_properties = _adjust_physical_properties_for_engine( + self.adapter, model, 
kwargs.get("physical_properties", model.physical_properties) ) self.adapter.merge( table_name, @@ -2384,9 +2379,8 @@ def append( columns_to_types, source_columns = self._get_target_and_source_columns( model, table_name, render_kwargs=render_kwargs ) - physical_properties = kwargs.get("physical_properties", model.physical_properties) - physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties + physical_properties = _adjust_physical_properties_for_engine( + self.adapter, model, kwargs.get("physical_properties", model.physical_properties) ) self.adapter.merge( table_name, @@ -2594,6 +2588,9 @@ def create( columns_to_types = model.columns_to_types_or_raise if isinstance(model.kind, SCDType2ByTimeKind): columns_to_types[model.kind.updated_at_name.name] = model.kind.time_data_type + physical_properties = _adjust_physical_properties_for_engine( + self.adapter, model, kwargs.get("physical_properties", model.physical_properties) + ) self.adapter.create_table( table_name, target_columns_to_types=columns_to_types, @@ -2602,7 +2599,7 @@ def create( partitioned_by=model.partitioned_by, partition_interval_unit=model.partition_interval_unit, clustered_by=model.clustered_by, - table_properties=kwargs.get("physical_properties", model.physical_properties), + table_properties=physical_properties, table_description=model.description if is_table_deployable else None, column_descriptions=model.column_descriptions if is_table_deployable else None, ) @@ -3179,9 +3176,8 @@ def create( if is_table_deployable and is_snapshot_deployable: # We could deploy this to prod; create a proper managed table logger.info("Creating managed table: %s", table_name) - physical_properties = kwargs.get("physical_properties", model.physical_properties) - physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( - model, physical_properties + physical_properties = _adjust_physical_properties_for_engine( + self.adapter, 
model, kwargs.get("physical_properties", model.physical_properties) ) self.adapter.create_managed_table( table_name=table_name, diff --git a/tests/core/engine_adapter/integration/test_integration.py b/tests/core/engine_adapter/integration/test_integration.py index 86b54d7399..44f680dafb 100644 --- a/tests/core/engine_adapter/integration/test_integration.py +++ b/tests/core/engine_adapter/integration/test_integration.py @@ -413,6 +413,12 @@ def test_materialized_view(ctx_query_and_df: TestContext): ) if ctx.engine_adapter.dialect == "snowflake": pytest.skip("Snowflake requires enterprise edition which we do not have setup") + if ctx.engine_adapter.dialect == "starrocks": + pytest.skip( + "StarRocks materialized views require a REFRESH clause (refresh_moment/refresh_scheme), " + "which this generic test does not provide; StarRocks MVs are covered by " + "test_integration_starrocks.py" + ) input_data = pd.DataFrame( [ {"id": 1, "ds": "2022-01-01"}, @@ -1956,6 +1962,12 @@ def test_sushi( pytest.skip( "Sushi end-to-end tests only need to run once for Athena because sushi needs a hybrid of both Hive and Iceberg" ) + if ctx.dialect == "starrocks": + pytest.skip( + "StarRocks requires incremental models to use a PRIMARY KEY table; the shared sushi " + "example uses cross-engine incremental/SCD models without a StarRocks primary_key, so " + "this end-to-end test does not apply to StarRocks" + ) sushi_test_schema = ctx.add_test_suffix("sushi") sushi_state_schema = ctx.add_test_suffix("sushi_state") @@ -2355,6 +2367,13 @@ def validate_no_comments( def test_init_project(ctx: TestContext, tmp_path: pathlib.Path): + if ctx.dialect == "starrocks": + pytest.skip( + "StarRocks requires incremental models to use a PRIMARY KEY table; the default example " + "project's incremental_model has no StarRocks primary_key, so this cross-engine test " + "does not apply to StarRocks" + ) + schema_name = ctx.add_test_suffix(TEST_SCHEMA) state_schema = ctx.add_test_suffix("sqlmesh_state") @@ 
-3652,6 +3671,12 @@ def test_janitor( and not ctx.engine_adapter.SUPPORTS_CREATE_DROP_CATALOG ): pytest.skip("Engine does not support catalog-based virtual environments") + if ctx.dialect == "starrocks": + pytest.skip( + "StarRocks requires incremental models to use a PRIMARY KEY table; the example project " + "used here has an incremental_model without a StarRocks primary_key, so this " + "cross-engine test does not apply to StarRocks" + ) schema = ctx.schema() # catalog.schema parsed_schema = d.to_schema(schema) @@ -3790,6 +3815,12 @@ def test_materialized_view_evaluation(ctx: TestContext): pytest.skip(f"Skipping engine {dialect} as it does not support materialized views") elif dialect in ("snowflake", "databricks"): pytest.skip(f"Skipping {dialect} as they're not enabled on standard accounts") + elif dialect == "starrocks": + pytest.skip( + "StarRocks materialized views require a REFRESH clause (refresh_moment/refresh_scheme), " + "which this generic test does not provide; StarRocks MVs are covered by " + "test_integration_starrocks.py" + ) model_name = ctx.table("test_tbl") mview_name = ctx.table("test_mview") diff --git a/tests/core/engine_adapter/integration/test_integration_starrocks.py b/tests/core/engine_adapter/integration/test_integration_starrocks.py index 1c6d8e0151..64f0776c9d 100644 --- a/tests/core/engine_adapter/integration/test_integration_starrocks.py +++ b/tests/core/engine_adapter/integration/test_integration_starrocks.py @@ -30,6 +30,7 @@ from sqlmesh.core.engine_adapter.starrocks import StarRocksEngineAdapter from sqlmesh.core.model.definition import load_sql_based_model, SqlModel +from sqlmesh.utils.errors import SQLMeshError import sqlmesh.core.dialect as d from tests.core.engine_adapter.integration import TestContext @@ -654,7 +655,10 @@ def test_materialized_view_combo_with_materialized_properties( ddl = fetchone_or_fail(engine_adapter, f"SHOW CREATE MATERIALIZED VIEW {mv_sql}")[1] logger.debug(f"mv ddl: {ddl}") ddl_upper = 
normalize_sql(ddl).upper() - assert "REFRESH DEFERRED ASYNC" in ddl_upper + # StarRocks renders a scheduled async refresh (ASYNC START ... EVERY ...) as + # "REFRESH DEFERRED SCHEDULE START ... EVERY ..." in SHOW CREATE (newer versions); + # older versions render it as "REFRESH DEFERRED ASYNC START ...". Accept either. + assert "REFRESH DEFERRED SCHEDULE" in ddl_upper or "REFRESH DEFERRED ASYNC" in ddl_upper assert ( "START('2025-01-01 00:00:00')EVERY(INTERVAL 5 MINUTE)" in ddl_upper or 'START("2025-01-01 00:00:00")EVERY(INTERVAL 5 MINUTE)' in ddl_upper @@ -741,6 +745,59 @@ def test_materialized_view_combo_all_properties_block( or 'COMMENT "ANALYTICS MV COMBO B"' in ddl_upper ) + def test_materialized_view_requires_refresh( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + """StarRocks only supports ASYNC materialized views, which require a REFRESH clause. + + Creating an MV without refresh_moment/refresh_scheme must raise a clear error rather than + silently creating an undetectable synchronous MV. 
+ """ + source = ctx.table("sr_mv_no_refresh_src") + mv = ctx.table("sr_mv_no_refresh") + source_sql = source.sql(dialect=ctx.dialect, identify=True) + mv_model_name = _model_name_from_table(mv) + + self._create_sales_source_table(ctx, engine_adapter, source) + + model_sql = f""" + MODEL ( + name {mv_model_name}, + kind VIEW ( + materialized true + ), + dialect starrocks, + columns ( + order_id BIGINT, + customer_id INT, + event_date DATE, + amount DECIMAL(18,2), + region VARCHAR(50) + ), + virtual_properties ( + distributed_by = 'HASH(order_id) BUCKETS 4', + replication_num = '1' + ) + ); + SELECT order_id, customer_id, event_date, amount, region + FROM {source_sql}; + """ + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + materialized_properties = _materialized_properties_from_model(model) + + with pytest.raises(SQLMeshError, match="require a REFRESH clause"): + engine_adapter.create_view( + mv, + query, + replace=True, + materialized=True, + target_columns_to_types=model.columns_to_types, + materialized_properties=materialized_properties, + view_properties=model.virtual_properties, + ) + class TestTableFeatures: """ @@ -1071,14 +1128,17 @@ def test_e2e_model_parameters(self, starrocks_adapter: StarRocksEngineAdapter): assert "PARTITION BY " in ddl # Note: PARTITION BY may contain function expressions like from_unixtime(ts) # We verify the clause exists and contains expected patterns - part_match = re.search(r"PARTITION BY \s*\(([^)]+)\)", ddl) + # Capture the full PARTITION BY (...) clause, allowing one level of nested parens so + # that function expressions like from_unixtime(ts) are included alongside `region`. 
+ part_match = re.search(r"PARTITION BY\s*(\((?:[^()]|\([^()]*\))*\))", ddl) assert part_match, "PARTITION BY clause not found" part_cols = part_match.group(1) - # Verify function expression and column references - assert ( - # "from_unixtime" in part_cols or "ts" in part_cols - "__generated_partition_column_" in part_cols and "region" in part_cols - ), f"Expected partition expression with generated column/region, got {part_cols}" + assert "from_unixtime" in part_cols and "ts" in part_cols, ( + f"Expected partition expression with from_unixtime(ts), got {part_cols}" + ) + assert "region" in part_cols, ( + f"Expected 'region' partition column in PARTITION BY, got {part_cols}" + ) # Verify ORDER BY from clustered_by order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) @@ -1123,7 +1183,8 @@ def test_e2e_physical_properties_core(self, starrocks_adapter: StarRocksEngineAd order_by = (order_id, region), -- clustered_by = (order_id, region), -- also OK -- replication_num = '1', - bucket_size = '12345678', + -- bucket_size only applies to RANDOM distribution with enable_automatic_bucket; + -- StarRocks rejects it for HASH-distributed tables. enable_persistent_index = 'true' ) ); @@ -1513,7 +1574,8 @@ def test_e2e_partition_expression_for_mv( has_func: bool, ): """Expression partitioning for MVs should always keep outer parentheses.""" - db_name = "sr_e2e_part_expr_mv_db" + suffix = "func" if has_func else "cols" + db_name = f"sr_e2e_part_expr_mv_db_{suffix}" src_table = f"{db_name}.sr_part_expr_src" mv_table = f"{db_name}.sr_part_expr_mv" @@ -1962,7 +2024,7 @@ class TestStarRocksAbility: @pytest.fixture(scope="class") def test_tables( - self, starrocks_adapter: StarRocksEngineAdapter + self, starrocks_adapter: StarRocksEngineAdapter, worker_id: str ) -> t.Generator[t.Dict[str, str], None, None]: """ Pre-create tables of different types for testing. 
@@ -1970,7 +2032,7 @@ def test_tables( Returns: Dict mapping table type to fully qualified table name """ - db_name = "sr_ability_test" + db_name = f"sr_ability_test_{worker_id}" starrocks_adapter.create_schema(db_name, ignore_if_exists=True) tables = {} @@ -2479,11 +2541,11 @@ class TestCommentMethods: Test _build_create_comment_table_exp and _build_create_comment_column_exp methods. These methods are used to generate ALTER TABLE SQL for modifying comments. - Although StarRocks uses COMMENT_CREATION_TABLE = IN_SCHEMA_DEF_CTAS (comments - are included in CREATE TABLE), these methods may be used for: - - Modifying existing table comments + StarRocks uses COMMENT_CREATION_TABLE = IN_SCHEMA_DEF_NO_CTAS: column comments are inlined for a + plain CREATE TABLE but registered via ALTER TABLE ... MODIFY COLUMN ... COMMENT after a CTAS + (StarRocks rejects types/comments in a CTAS column list). These methods are also used for: + - Modifying existing table/column comments - View comments (depending on COMMENT_CREATION_VIEW) - - Future ALTER TABLE support """ def test_build_create_comment_table_exp(self, starrocks_adapter: StarRocksEngineAdapter): diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py index 524870e78a..191dab0882 100644 --- a/tests/core/engine_adapter/test_starrocks.py +++ b/tests/core/engine_adapter/test_starrocks.py @@ -27,9 +27,12 @@ from sqlmesh.utils.errors import SQLMeshError from tests.core.engine_adapter import to_sql_calls +from sqlmesh.core.engine_adapter.base import EngineAdapter from sqlmesh.core.engine_adapter.starrocks import StarRocksEngineAdapter +from sqlmesh.core.engine_adapter.duckdb import DuckDBEngineAdapter from sqlmesh.core.dialect import parse from sqlmesh.core.model import load_sql_based_model, SqlModel +from sqlmesh.core.snapshot.evaluator import _adjust_physical_properties_for_engine pytestmark = [pytest.mark.starrocks, pytest.mark.engine] @@ -329,6 +332,24 @@ def 
test_create_materialized_view_replace_with_refresh_and_comments( assert "START ('2025-01-01 00:00:00')" in calls[1] assert "EVERY (INTERVAL 5 MINUTE)" in calls[1] + def test_create_materialized_view_without_refresh_raises( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """StarRocks only supports ASYNC MVs, which require a REFRESH clause. + + Creating an MV without refresh_moment/refresh_scheme must raise rather than silently + producing an undetectable synchronous MV. + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + with pytest.raises(SQLMeshError, match="require a REFRESH clause"): + adapter.create_view( + "test_mv", + parse_one("SELECT a FROM tbl"), + materialized=True, + target_columns_to_types={"a": exp.DataType.build("INT")}, + view_properties={"replication_num": exp.Literal.string("1")}, + ) + def test_delete_where_true_optimization( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] ): @@ -889,7 +910,8 @@ def test_partitioned_by_forms_for_mv( dialect starrocks, columns (dt DATE, region STRING, year INT, month INT), physical_properties ( - partition_by = {partition_expr} + partition_by = {partition_expr}, + refresh_scheme = ASYNC ) ); SELECT dt, region, year, month FROM src; @@ -1878,3 +1900,114 @@ def test_create_table_comprehensive( assert "DISTRIBUTED BY HASH (`customer_id`) BUCKETS 10" in sql assert "ORDER BY (`customer_id`, `order_id`)" in sql assert "PROPERTIES ('replication_num'='3')" in sql + + +# ============================================================================= +# Incremental models require a PRIMARY KEY table on StarRocks +# ============================================================================= +class TestIncrementalRequiresPrimaryKey: + """StarRocks incremental kinds rely on DELETE/MERGE, which only work on PRIMARY KEY tables. 
+ + Declaring such a model without a PRIMARY KEY must therefore fail at creation time, while + append-only kinds and non-StarRocks engines are unaffected. The rule is enforced by + ``StarRocksEngineAdapter.adjust_physical_properties_for_incremental`` (reached from the + evaluator's ``_adjust_physical_properties_for_engine``); these tests drive that path with + declared models and assert the observable outcome. + """ + + def _adjust(self, adapter: EngineAdapter, model: SqlModel) -> t.Dict[str, t.Any]: + return _adjust_physical_properties_for_engine(adapter, model, model.physical_properties) + + def test_incremental_model_without_primary_key_raises( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ) -> None: + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model = _load_sql_model( + """ + MODEL ( + name test_schema.inc_no_pk, + kind INCREMENTAL_BY_TIME_RANGE (time_column event_date), + dialect starrocks, + columns (id INT, event_date DATE) + ); + SELECT id, event_date FROM src WHERE event_date BETWEEN @start_ds AND @end_ds; + """ + ) + with pytest.raises(SQLMeshError, match="requires a PRIMARY KEY"): + self._adjust(adapter, model) + + def test_incremental_model_with_primary_key_is_allowed( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ) -> None: + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model = _load_sql_model( + """ + MODEL ( + name test_schema.inc_pk, + kind INCREMENTAL_BY_TIME_RANGE (time_column event_date), + dialect starrocks, + columns (id INT, event_date DATE), + physical_properties (primary_key = (id, event_date)) + ); + SELECT id, event_date FROM src WHERE event_date BETWEEN @start_ds AND @end_ds; + """ + ) + assert "primary_key" in self._adjust(adapter, model) + + def test_incremental_by_unique_key_is_promoted_to_primary_key( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ) -> None: + # INCREMENTAL_BY_UNIQUE_KEY auto-promotes 
the unique_key to a PRIMARY KEY (a multi-column + # key becomes a tuple) rather than requiring one to be declared explicitly. + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model = _load_sql_model( + """ + MODEL ( + name test_schema.inc_by_uk, + kind INCREMENTAL_BY_UNIQUE_KEY (unique_key (id, event_date)), + dialect starrocks, + columns (id INT, event_date DATE) + ); + SELECT id, event_date FROM src; + """ + ) + primary_key = self._adjust(adapter, model)["primary_key"] + assert isinstance(primary_key, exp.Tuple) + assert [c.name for c in primary_key.expressions] == ["id", "event_date"] + + def test_append_only_incremental_does_not_require_primary_key( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ) -> None: + # Append-only INCREMENTAL_UNMANAGED (insert_overwrite=False) only does INSERT, so it does + # not need a PRIMARY KEY table. + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model = _load_sql_model( + """ + MODEL ( + name test_schema.inc_append, + kind INCREMENTAL_UNMANAGED, + dialect starrocks, + columns (id INT, event_date DATE) + ); + SELECT id, event_date FROM src; + """ + ) + assert "primary_key" not in self._adjust(adapter, model) + + def test_non_starrocks_incremental_is_unaffected( + self, make_mocked_engine_adapter: t.Callable[..., DuckDBEngineAdapter] + ) -> None: + # Engines without the PRIMARY KEY requirement inherit the base no-op and never raise. 
+ adapter = make_mocked_engine_adapter(DuckDBEngineAdapter) + model = _load_sql_model( + """ + MODEL ( + name test_schema.inc_duckdb, + kind INCREMENTAL_BY_TIME_RANGE (time_column event_date), + dialect duckdb, + columns (id INT, event_date DATE) + ); + SELECT id, event_date FROM src WHERE event_date BETWEEN @start_ds AND @end_ds; + """ + ) + assert "primary_key" not in self._adjust(adapter, model) diff --git a/tests/core/test_snapshot_evaluator.py b/tests/core/test_snapshot_evaluator.py index 51ea5431f7..161e3f9f5e 100644 --- a/tests/core/test_snapshot_evaluator.py +++ b/tests/core/test_snapshot_evaluator.py @@ -132,6 +132,9 @@ def mock_exit(self, exc_type, exc_value, traceback): adapter_mock.wap_supported.return_value = False adapter_mock.get_data_objects.return_value = [] adapter_mock.with_settings.return_value = adapter_mock + adapter_mock.adjust_physical_properties_for_incremental.side_effect = ( + lambda physical_properties, **kwargs: physical_properties + ) return adapter_mock @@ -155,6 +158,9 @@ def adapters(mocker: MockerFixture): adapter_mock.wap_supported.return_value = False adapter_mock.get_data_objects.return_value = [] adapter_mock.with_settings.return_value = adapter_mock + adapter_mock.adjust_physical_properties_for_incremental.side_effect = ( + lambda physical_properties, **kwargs: physical_properties + ) adapters.append(adapter_mock) return adapters @@ -4044,6 +4050,9 @@ def test_migrate_snapshot(snapshot: Snapshot, mocker: MockerFixture, adapter_moc adapter_mock = mocker.patch("sqlmesh.core.engine_adapter.EngineAdapter") adapter_mock.dialect = "duckdb" adapter_mock.with_settings.return_value = adapter_mock + adapter_mock.adjust_physical_properties_for_incremental.side_effect = ( + lambda physical_properties, **kwargs: physical_properties + ) evaluator = SnapshotEvaluator(adapter_mock) evaluator.create([snapshot], {}) From 33ed712c792f17c12a40700f51687b02cd9cf90c Mon Sep 17 00:00:00 2001 From: Mateusz Jukiewicz Date: Sun, 7 Jun 2026 17:36:13 
+0200 Subject: [PATCH 18/20] Make StarRocks AMVs not recreate themselves during sqlmesh run Signed-off-by: Mateusz Jukiewicz --- docs/integrations/engines/starrocks.md | 6 ++- sqlmesh/core/engine_adapter/base.py | 1 + sqlmesh/core/engine_adapter/starrocks.py | 7 +++ sqlmesh/core/snapshot/evaluator.py | 13 +++++ tests/core/engine_adapter/test_starrocks.py | 9 ++++ tests/core/test_snapshot_evaluator.py | 55 +++++++++++++++++++++ 6 files changed, 90 insertions(+), 1 deletion(-) diff --git a/docs/integrations/engines/starrocks.md b/docs/integrations/engines/starrocks.md index b283c4f686..a4a56dad49 100644 --- a/docs/integrations/engines/starrocks.md +++ b/docs/integrations/engines/starrocks.md @@ -491,7 +491,11 @@ You can specify `partitioning`, `distribution`, `order by` and `properties` the **Notes:** -* If you create materialized views with `replace=true`, SQLMesh may drop and recreate the MV. When an MV is dropped, its data is removed and the MV must be refreshed/rebuilt again. +* SQLMesh does not recreate materialized views on every `sqlmesh run`. Once an MV exists, SQLMesh leaves it in place and lets StarRocks keep it current. This is intentional: + * StarRocks async MVs revalidate themselves automatically, even when the underlying data is dropped, so a periodic drop-and-recreate is unnecessary. + * StarRocks async MVs either refresh automatically (per their `refresh_scheme`) or can be refreshed explicitly with `REFRESH MATERIALIZED VIEW`, which also enables partition-level (incremental) refresh. A SQLMesh-driven recreate would instead force a full rebuild. + + The MV is (re)built only when it does not yet exist — for example when you first deploy it, or when a model change produces a new version. To change a materialized view's definition, update the model and run `sqlmesh plan`. * There are some restriction for `partitioning`, you need to refer StarRocks' doc for MV partitioning specification. 
* StarRocks MV schema supports a column list but does **not** support explicit data types in that list. Column data types come from the `AS SELECT ...` query. * If you create MVs from a dataframe via the Python API, provide `target_columns_to_types` (a `Dict[str, exp.DataType]`). If you don't care about exact types, you can set all columns to `VARCHAR` as a fallback: diff --git a/sqlmesh/core/engine_adapter/base.py b/sqlmesh/core/engine_adapter/base.py index a4e450dd1c..3f816a42ee 100644 --- a/sqlmesh/core/engine_adapter/base.py +++ b/sqlmesh/core/engine_adapter/base.py @@ -114,6 +114,7 @@ class EngineAdapter: SCHEMA_DIFFER_KWARGS: t.Dict[str, t.Any] = {} SUPPORTS_TUPLE_IN = True HAS_VIEW_BINDING = False + RECREATE_MATERIALIZED_VIEW_ON_EVALUATION = True SUPPORTS_REPLACE_TABLE = True SUPPORTS_GRANTS = False DEFAULT_CATALOG_TYPE = DIALECT diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py index 8ce78ca059..921f7e3845 100644 --- a/sqlmesh/core/engine_adapter/starrocks.py +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -1691,6 +1691,13 @@ class StarRocksEngineAdapter( implement custom MV schema rendering in create_view/_create_materialized_view. """ + RECREATE_MATERIALIZED_VIEW_ON_EVALUATION = False + """ + StarRocks async materialized views maintain themselves: they revalidate automatically even if the + underlying data is dropped, and the data is kept current either by StarRocks' automatic refresh or + by an explicit `REFRESH MATERIALIZED VIEW` (which also enables partition-level incremental refresh). + """ + SUPPORTS_REPLACE_TABLE = False """No REPLACE TABLE syntax; use DROP + CREATE instead""" diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py index cb7ef17580..74c83a1c86 100644 --- a/sqlmesh/core/snapshot/evaluator.py +++ b/sqlmesh/core/snapshot/evaluator.py @@ -2734,6 +2734,19 @@ def insert( is_materialized_view and is_first_insert ) + # Some engines (e.g. 
StarRocks) maintain materialized views automatically (via auto/scheduled + # REFRESH) and can only recreate them via a destructive DROP + CREATE, which deletes the + # materialized data and forces a full rebuild. For those, an existing MV must not be recreated + # on routine evaluation (e.g. every `sqlmesh run`); only build it on the first insert (a new + # version) or when a rebuild is explicitly forced (intervals cleared by `should_force_rebuild`, + # which sets `is_first_insert`). + if ( + is_materialized_view + and not is_first_insert + and not self.adapter.RECREATE_MATERIALIZED_VIEW_ON_EVALUATION + ): + must_recreate_view = False + if self.adapter.table_exists(table_name) and not must_recreate_view: logger.info("Skipping creation of the view '%s'", table_name) return diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py index 191dab0882..dc876947ad 100644 --- a/tests/core/engine_adapter/test_starrocks.py +++ b/tests/core/engine_adapter/test_starrocks.py @@ -350,6 +350,15 @@ def test_create_materialized_view_without_refresh_raises( view_properties={"replication_num": exp.Literal.string("1")}, ) + def test_does_not_recreate_materialized_view_on_evaluation(self): + """StarRocks async MVs maintain themselves, so SQLMesh must not recreate them on every run. + + The adapter opts out of per-evaluation recreation via + RECREATE_MATERIALIZED_VIEW_ON_EVALUATION = False, which the evaluator's ViewStrategy honors + for materialized views that already exist. 
+ """ + assert StarRocksEngineAdapter.RECREATE_MATERIALIZED_VIEW_ON_EVALUATION is False + def test_delete_where_true_optimization( self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] ): diff --git a/tests/core/test_snapshot_evaluator.py b/tests/core/test_snapshot_evaluator.py index 161e3f9f5e..7453cc5687 100644 --- a/tests/core/test_snapshot_evaluator.py +++ b/tests/core/test_snapshot_evaluator.py @@ -707,6 +707,61 @@ def test_evaluate_materialized_view( assert adapter_mock.create_view.call_count == 1 +@pytest.mark.parametrize( + "view_exists, has_intervals, expected_create_view_calls", + [ + # MV already exists and has intervals -> routine evaluation (e.g. `sqlmesh run`): do NOT recreate + (True, True, 0), + # MV does not exist yet -> first build: create it + (False, False, 1), + # MV missing but intervals present -> still a first insert (table gone): rebuild it + (False, True, 1), + ], +) +def test_evaluate_materialized_view_not_recreated_on_evaluation( + adapter_mock, + make_snapshot, + view_exists: bool, + has_intervals: bool, + expected_create_view_calls: int, +): + # Engines that maintain MVs themselves (e.g. StarRocks) opt out of recreating an existing MV on + # every evaluation by setting RECREATE_MATERIALIZED_VIEW_ON_EVALUATION = False. 
+ adapter_mock.RECREATE_MATERIALIZED_VIEW_ON_EVALUATION = False + adapter_mock.table_exists.return_value = view_exists + evaluator = SnapshotEvaluator(adapter_mock) + + model = load_sql_based_model( + parse( # type: ignore + """ + MODEL ( + name test_schema.test_model, + kind VIEW ( + materialized true + ) + ); + + SELECT a::int FROM tbl; + """ + ), + ) + + snapshot = make_snapshot(model) + snapshot.categorize_as(SnapshotChangeCategory.BREAKING) + if has_intervals: + snapshot.add_interval("2023-01-01", "2023-01-01") + + evaluator.evaluate( + snapshot, + start="2020-01-01", + end="2020-01-02", + execution_time="2020-01-02", + snapshots={}, + ) + + assert adapter_mock.create_view.call_count == expected_create_view_calls + + def test_evaluate_materialized_view_with_partitioned_by_cluster_by( mocker: MockerFixture, adapter_mock, make_snapshot ): From 10378c95ca8a9904b68fce61b27a066e69f16102 Mon Sep 17 00:00:00 2001 From: Mateusz Jukiewicz Date: Sun, 7 Jun 2026 23:50:28 +0200 Subject: [PATCH 19/20] Docs update + StarRocks AMV with audits gets refreshed immediately Signed-off-by: Mateusz Jukiewicz --- docs/integrations/engines/starrocks.md | 28 +++++ sqlmesh/core/engine_adapter/starrocks.py | 48 +++++++++ sqlmesh/core/snapshot/evaluator.py | 19 +++- tests/core/engine_adapter/test_starrocks.py | 112 ++++++++++++++++++++ tests/core/test_snapshot_evaluator.py | 48 ++++++++- 5 files changed, 250 insertions(+), 5 deletions(-) diff --git a/docs/integrations/engines/starrocks.md b/docs/integrations/engines/starrocks.md index a4a56dad49..8d5352cd36 100644 --- a/docs/integrations/engines/starrocks.md +++ b/docs/integrations/engines/starrocks.md @@ -485,6 +485,31 @@ FROM user_events GROUP BY user_id; ``` +**Audits on materialized views:** + +Audits require data to exist in the materialized view when they run. Because StarRocks refreshes async MVs as background jobs, the data is not guaranteed to be present immediately after the MV is created. 
To make audits deterministic, when a materialized view has audits SQLMesh issues a synchronous `REFRESH MATERIALIZED VIEW WITH SYNC MODE` right after creating the MV, which blocks until the data is materialized. + +For this to work safely, a materialized view with audits **must** set `refresh_moment = 'DEFERRED'`. This prevents StarRocks' automatic (IMMEDIATE) refresh from racing with the synchronous refresh that SQLMesh issues. If the MV has audits and `refresh_moment` is `IMMEDIATE` (or unset, which defaults to `IMMEDIATE` in StarRocks), SQLMesh raises an error before creating the MV. + +```sql +MODEL ( + name user_summary_mv, + kind VIEW ( + materialized true + ), + audits ( + not_null(columns := (user_id)) + ), + physical_properties ( + -- required when the MV has audits + refresh_moment = DEFERRED, + refresh_scheme = 'ASYNC' + ) +); + +SELECT user_id, COUNT(*) AS event_count FROM user_events GROUP BY user_id; +``` + **Other properties:** You can specify `partitioning`, `distribution`, `order by` and `properties` the same as normal table properties. But notice that only supported MV properties are useful, Refer to StarRocks' doc for MV creation. @@ -512,6 +537,9 @@ target_columns_to_types = { ## Limitations * **No SYNC MV support**: synchronous materialized views are not supported yet. +* **`FULL` models are not replaced atomically**: StarRocks does not support `CREATE OR REPLACE TABLE` and has no multi-statement transactions (in version 3.5 and lower), so SQLMesh refreshes a `FULL` model by emptying the existing table (a `TRUNCATE`, or a `DELETE` when a filter applies) and then inserting the new result set as separate, auto-committed statements. There is a brief window between the truncate/delete and the completion of the insert during which the table is empty or partially populated, so readers querying it during that window may see missing or incomplete data. Incremental kinds (e.g. 
`INCREMENTAL_BY_TIME_RANGE`, `INCREMENTAL_BY_PARTITION`) do not fully eliminate this — StarRocks applies them as the same non-atomic delete-then-insert — but they narrow the affected rows to the partition/time range being processed rather than emptying the whole table, so unaffected partitions remain readable throughout. SQLMesh has no way to make these replacements atomic on StarRocks 3.5 and lower. + + Future work: the StarRocks integration was originally developed against StarRocks 3.5, but StarRocks has since expanded its capabilities considerably (the integration now runs against 4.1). Later work should investigate using `INSERT OVERWRITE` together with the transactional/atomic-swap guarantees available in newer StarRocks versions to close this gap (see the `INSERT_OVERWRITE_STRATEGY` and `SUPPORTS_TRANSACTIONS` flags in the StarRocks engine adapter). * **No tuple IN**: StarRocks does not support `(c1, c2) IN ((v1, v2), ...)`. * **No `SELECT ... FOR UPDATE`**: StarRocks is an OLAP database and does not support row locks; SQLMesh removes `FOR UPDATE` when executing SQLGlot expressions. * **RENAME caveat**: `ALTER TABLE db.old RENAME db.new` is not supported; the `RENAME` target cannot be qualified with a database name. diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py index 921f7e3845..7b857645da 100644 --- a/sqlmesh/core/engine_adapter/starrocks.py +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -2310,6 +2310,13 @@ def create_view( ) # MATERIALIZED VIEW path + # MVs with audits get a synchronous refresh after creation (see _create_materialized_view), + # which requires REFRESH DEFERRED. Validate before the drop so we fail without destroying + # an existing MV. + has_audits = bool((materialized_properties or {}).get("has_audits")) + if has_audits: + self._validate_deferred_refresh_for_audits(view_name, view_properties) + if replace: # Avoid DROP MATERIALIZED VIEW failure when an object with the same name exists but is not an MV. 
self.drop_data_object_on_type_mismatch( @@ -2431,6 +2438,16 @@ def _create_materialized_view( quote_identifiers=self.QUOTE_IDENTIFIERS_IN_VIEWS, ) + # MVs with audits are created with REFRESH DEFERRED (enforced in create_view), so StarRocks + # does not populate them on creation. Audits need data, so block on a synchronous refresh. + if bool((materialized_properties or {}).get("has_audits")): + refresh_sql = ( + f"REFRESH MATERIALIZED VIEW " + f"{exp.to_table(view_name).sql(dialect=self.dialect, identify=True)} " + f"WITH SYNC MODE" + ) + self.execute(refresh_sql) + self._clear_data_object_cache(view_name) def _build_materialized_view_schema_exp( @@ -3020,6 +3037,37 @@ def _build_distributed_by_property( ) return result + def _validate_deferred_refresh_for_audits( + self, + view_name: TableName, + view_properties: t.Optional[t.Dict[str, exp.Expr]], + ) -> None: + """ + Ensure a materialized view with audits uses REFRESH DEFERRED. + + StarRocks audits require data to exist in the MV, so SQLMesh issues an explicit synchronous + `REFRESH MATERIALIZED VIEW ... WITH SYNC MODE` right after creating the MV. For that to be + deterministic, the MV must use `refresh_moment = 'DEFERRED'`; otherwise StarRocks' automatic + (IMMEDIATE) refresh would run concurrently and race with the explicit one. A missing + refresh_moment defaults to IMMEDIATE in StarRocks, so it is rejected as well. + """ + refresh_moment = (view_properties or {}).get("refresh_moment") + normalized = ( + PropertyValidator.validate_and_normalize_property("refresh_moment", refresh_moment) + if refresh_moment is not None + else None + ) + if normalized != "DEFERRED": + raise SQLMeshError( + f"[StarRocks] Materialized view '{exp.to_table(view_name).sql(dialect=self.dialect)}' " + "has audits, which require a synchronous refresh after creation. 
This is only " + "supported with deferred refresh, so the model must set " + "`refresh_moment = 'DEFERRED'` in its physical_properties " + f"(got {normalized or 'no refresh_moment; StarRocks defaults to IMMEDIATE'}). " + "DEFERRED prevents StarRocks' " + "automatic refresh from racing with the synchronous refresh SQLMesh issues." + ) + def _build_refresh_property( self, table_properties: t.Dict[str, t.Any] ) -> t.Optional[exp.RefreshTriggerProperty]: diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py index 74c83a1c86..11b3fd1f33 100644 --- a/sqlmesh/core/snapshot/evaluator.py +++ b/sqlmesh/core/snapshot/evaluator.py @@ -2753,13 +2753,12 @@ def insert( logger.info("Replacing view '%s'", table_name) materialized_properties = None - if is_materialized_view and ( - model.partitioned_by or model.partition_interval_unit or model.clustered_by - ): + if is_materialized_view: materialized_properties = { "partitioned_by": model.partitioned_by, "partition_interval_unit": model.partition_interval_unit, "clustered_by": model.clustered_by, + "has_audits": bool(model.audits_with_args), } self.adapter.create_view( table_name, @@ -2818,6 +2817,7 @@ def create( "partitioned_by": model.partitioned_by, "clustered_by": model.clustered_by, "partition_interval_unit": model.partition_interval_unit, + "has_audits": bool(model.audits_with_args), } self.adapter.create_view( table_name, @@ -2853,11 +2853,22 @@ def migrate( execution_time=now(), snapshots=kwargs["snapshots"], engine_adapter=self.adapter ) + is_materialized_view = self._is_materialized_view(model) + materialized_properties = None + if is_materialized_view: + materialized_properties = { + "partitioned_by": model.partitioned_by, + "clustered_by": model.clustered_by, + "partition_interval_unit": model.partition_interval_unit, + "has_audits": bool(model.audits_with_args), + } + self.adapter.create_view( target_table_name, model.render_query_or_raise(**render_kwargs), model.columns_to_types, - 
materialized=self._is_materialized_view(model), + materialized=is_materialized_view, + materialized_properties=materialized_properties, view_properties=model.render_physical_properties(**render_kwargs), table_description=model.description, column_descriptions=model.column_descriptions, diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py index dc876947ad..cfc045e6b5 100644 --- a/tests/core/engine_adapter/test_starrocks.py +++ b/tests/core/engine_adapter/test_starrocks.py @@ -350,6 +350,118 @@ def test_create_materialized_view_without_refresh_raises( view_properties={"replication_num": exp.Literal.string("1")}, ) + def test_create_materialized_view_with_audits_emits_sync_refresh( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """When an MV has audits, SQLMesh must synchronously refresh it right after creation. + + Audits require data to exist in the MV. With REFRESH DEFERRED, StarRocks does not populate + the MV on creation, so SQLMesh issues an explicit `REFRESH MATERIALIZED VIEW ... WITH SYNC + MODE` that blocks until the data is materialized. 
+ """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view( + "test_mv", + parse_one("SELECT a FROM tbl"), + materialized=True, + target_columns_to_types={"a": exp.DataType.build("INT")}, + materialized_properties={"has_audits": True}, + view_properties={ + "refresh_moment": exp.Var(this="DEFERRED"), + "refresh_scheme": exp.Var(this="ASYNC"), + }, + ) + + calls = to_sql_calls(adapter) + assert calls[0] == "DROP MATERIALIZED VIEW IF EXISTS `test_mv`" + assert "CREATE MATERIALIZED VIEW" in calls[1] + assert "REFRESH DEFERRED" in calls[1] + assert calls[2] == "REFRESH MATERIALIZED VIEW `test_mv` WITH SYNC MODE" + + def test_create_materialized_view_with_audits_emits_sync_refresh_on_first_create( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """The sync refresh must also fire on first-time creation (replace=False). + + ViewStrategy.create calls create_view with replace=False, so the DROP is skipped, but the + synchronous refresh still needs to populate the MV before audits run. 
+ """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view( + "test_mv", + parse_one("SELECT a FROM tbl"), + replace=False, + materialized=True, + target_columns_to_types={"a": exp.DataType.build("INT")}, + materialized_properties={"has_audits": True}, + view_properties={ + "refresh_moment": exp.Var(this="DEFERRED"), + "refresh_scheme": exp.Var(this="ASYNC"), + }, + ) + + calls = to_sql_calls(adapter) + assert all("DROP MATERIALIZED VIEW" not in sql for sql in calls) + assert "CREATE MATERIALIZED VIEW" in calls[0] + assert "REFRESH DEFERRED" in calls[0] + assert calls[1] == "REFRESH MATERIALIZED VIEW `test_mv` WITH SYNC MODE" + + def test_create_materialized_view_with_audits_immediate_refresh_raises( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """An MV with audits must use REFRESH DEFERRED; IMMEDIATE must raise and not create anything.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + with pytest.raises(SQLMeshError, match="DEFERRED"): + adapter.create_view( + "test_mv", + parse_one("SELECT a FROM tbl"), + materialized=True, + target_columns_to_types={"a": exp.DataType.build("INT")}, + materialized_properties={"has_audits": True}, + view_properties={ + "refresh_moment": exp.Var(this="IMMEDIATE"), + "refresh_scheme": exp.Var(this="ASYNC"), + }, + ) + + # Fail-fast: nothing should have been dropped or created. 
+ assert all("CREATE MATERIALIZED VIEW" not in sql for sql in to_sql_calls(adapter)) + assert all("DROP MATERIALIZED VIEW" not in sql for sql in to_sql_calls(adapter)) + + def test_create_materialized_view_with_audits_missing_refresh_moment_raises( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """A missing refresh_moment defaults to IMMEDIATE in StarRocks, so audits must raise.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + with pytest.raises(SQLMeshError, match="DEFERRED"): + adapter.create_view( + "test_mv", + parse_one("SELECT a FROM tbl"), + materialized=True, + target_columns_to_types={"a": exp.DataType.build("INT")}, + materialized_properties={"has_audits": True}, + view_properties={"refresh_scheme": exp.Var(this="ASYNC")}, + ) + + def test_create_materialized_view_without_audits_does_not_sync_refresh( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Without audits, no synchronous refresh is issued and IMMEDIATE refresh is allowed.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view( + "test_mv", + parse_one("SELECT a FROM tbl"), + materialized=True, + target_columns_to_types={"a": exp.DataType.build("INT")}, + materialized_properties={"has_audits": False}, + view_properties={ + "refresh_moment": exp.Var(this="IMMEDIATE"), + "refresh_scheme": exp.Var(this="ASYNC"), + }, + ) + + assert all("WITH SYNC MODE" not in sql for sql in to_sql_calls(adapter)) + def test_does_not_recreate_materialized_view_on_evaluation(self): """StarRocks async MVs maintain themselves, so SQLMesh must not recreate them on every run. 
diff --git a/tests/core/test_snapshot_evaluator.py b/tests/core/test_snapshot_evaluator.py index 7453cc5687..edb1d8ed7d 100644 --- a/tests/core/test_snapshot_evaluator.py +++ b/tests/core/test_snapshot_evaluator.py @@ -843,7 +843,12 @@ def test_evaluate_materialized_view_with_execution_time_macro( view_properties={}, table_description=None, column_descriptions={}, - materialized_properties=None, + materialized_properties={ + "partitioned_by": [], + "partition_interval_unit": None, + "clustered_by": [], + "has_audits": False, + }, ) @@ -1305,6 +1310,7 @@ def test_create_materialized_view(mocker: MockerFixture, adapter_mock, make_snap "clustered_by": [], "partition_interval_unit": None, "partitioned_by": [], + "has_audits": False, }, view_properties={}, table_description=None, @@ -1354,6 +1360,7 @@ def test_create_view_with_properties(mocker: MockerFixture, adapter_mock, make_s "clustered_by": [], "partition_interval_unit": None, "partitioned_by": [], + "has_audits": False, }, table_description=None, replace=False, @@ -1364,6 +1371,45 @@ def test_create_view_with_properties(mocker: MockerFixture, adapter_mock, make_s ) +def test_create_materialized_view_with_audits_sets_has_audits( + mocker: MockerFixture, adapter_mock, make_snapshot +): + """A materialized view model with audits must propagate has_audits=True to the adapter. + + Engines like StarRocks rely on this flag to synchronously refresh the MV before audits run. 
+ """ + adapter_mock.get_data_objects.return_value = [] + adapter_mock.table_exists.return_value = False + + evaluator = SnapshotEvaluator(adapter_mock) + + model = load_sql_based_model( + parse( # type: ignore + """ + MODEL ( + name test_schema.test_model, + kind VIEW ( + materialized true + ), + audits ( + not_null(columns := (a)) + ) + ); + + SELECT a::int FROM tbl; + """ + ), + ) + + snapshot = make_snapshot(model) + snapshot.categorize_as(SnapshotChangeCategory.BREAKING) + + evaluator.create([snapshot], {}) + + _, kwargs = adapter_mock.create_view.call_args + assert kwargs["materialized_properties"]["has_audits"] is True + + def test_promote_model_info(mocker: MockerFixture, make_snapshot): adapter_mock = mocker.patch("sqlmesh.core.engine_adapter.EngineAdapter") adapter_mock.dialect = "duckdb" From 091a21519c5d145cb65c97afa12c563d671eaa05 Mon Sep 17 00:00:00 2001 From: Mateusz Jukiewicz Date: Mon, 8 Jun 2026 11:55:44 +0200 Subject: [PATCH 20/20] Physical properties map to physical table names based on configuration Signed-off-by: Mateusz Jukiewicz --- docs/integrations/engines/starrocks.md | 30 ++ sqlmesh/core/engine_adapter/base.py | 4 + sqlmesh/core/engine_adapter/starrocks.py | 10 + sqlmesh/core/model/definition.py | 51 ++- tests/core/engine_adapter/test_starrocks.py | 354 ++++++++++++++++++++ tests/core/test_snapshot_evaluator.py | 7 + 6 files changed, 455 insertions(+), 1 deletion(-) diff --git a/docs/integrations/engines/starrocks.md b/docs/integrations/engines/starrocks.md index 8d5352cd36..a5314b87b6 100644 --- a/docs/integrations/engines/starrocks.md +++ b/docs/integrations/engines/starrocks.md @@ -510,6 +510,36 @@ MODEL ( SELECT user_id, COUNT(*) AS event_count FROM user_events GROUP BY user_id; ``` +**Excluding tables from refresh:** + +`excluded_trigger_tables` and `excluded_refresh_tables` let you control which base tables participate in an async MV's refresh cycle: + +* `excluded_trigger_tables`: base tables whose data changes should **not** 
automatically trigger a refresh of this MV. +* `excluded_refresh_tables`: base tables that should **not** be scanned when the MV refreshes. + +Both properties accept a single table reference or a comma-separated list of table references. + +StarRocks requires the **physical** base table name for these properties, not the logical view name that SQLMesh normally exposes. SQLMesh handles this automatically: when a reference matches a managed SQLMesh model, the logical name is resolved to its physical table name before the `CREATE MATERIALIZED VIEW` statement is issued. References that do not match any managed model are passed through unchanged. + +```sql +MODEL ( + name mydb.order_summary_mv, + kind VIEW ( + materialized true + ), + physical_properties ( + refresh_scheme = 'ASYNC', + -- SQLMesh resolves mydb.orders and mydb.order_items to their physical table names + excluded_trigger_tables = 'mydb.orders,mydb.order_items', + excluded_refresh_tables = mydb.orders + ) +); + +SELECT order_id, SUM(amount) AS total FROM mydb.orders GROUP BY order_id; +``` + +A single reference can be written as a bare identifier (`mydb.orders`) or as a quoted string. Multiple references must be provided as a quoted, comma-separated string (`'mydb.orders,mydb.order_items'`). + **Other properties:** You can specify `partitioning`, `distribution`, `order by` and `properties` the same as normal table properties. But notice that only supported MV properties are useful, Refer to StarRocks' doc for MV creation. 
diff --git a/sqlmesh/core/engine_adapter/base.py b/sqlmesh/core/engine_adapter/base.py index 3f816a42ee..9081227cb7 100644 --- a/sqlmesh/core/engine_adapter/base.py +++ b/sqlmesh/core/engine_adapter/base.py @@ -123,6 +123,10 @@ class EngineAdapter: ATTACH_CORRELATION_ID = True SUPPORTS_QUERY_EXECUTION_TRACKING = False SUPPORTS_METADATA_TABLE_LAST_MODIFIED_TS = False + RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES: t.FrozenSet[str] = frozenset() + """Physical property keys whose values may contain logical model references that + should be resolved to physical table names during property rendering. Engines that + need such resolution (e.g. StarRocks' excluded_trigger_tables) override this set.""" def __init__( self, diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py index 7b857645da..05120db0e3 100644 --- a/sqlmesh/core/engine_adapter/starrocks.py +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -1740,6 +1740,16 @@ class StarRocksEngineAdapter( MAX_IDENTIFIER_LENGTH = 64 """Maximum length for table/column names""" + RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES: t.FrozenSet[str] = frozenset( + {"excluded_trigger_tables", "excluded_refresh_tables"} + ) + """StarRocks async materialized views accept these properties to exclude certain tables from + triggering or participating in refreshes. When the value references a managed SQLMesh model, + StarRocks needs the physical table name (db.table), not the logical view name. Managed-model + physical names carry no catalog prefix (catalog support is UNSUPPORTED), so they are already in + the warehouse-local db.table form StarRocks expects; unmanaged references (e.g. an external + catalog's ext_catalog.db.table) pass through unchanged.""" + # ==================== Schema Operations ==================== # StarRocks supports CREATE/DROP SCHEMA the same as CREATE/DROP DATABSE. 
# So, no need to implement create_schema / drop_schema diff --git a/sqlmesh/core/model/definition.py b/sqlmesh/core/model/definition.py index 5b3a656a54..3533d1b669 100644 --- a/sqlmesh/core/model/definition.py +++ b/sqlmesh/core/model/definition.py @@ -718,7 +718,30 @@ def _render(expression: exp.Expr) -> exp.Expr | None: } def render_physical_properties(self, **render_kwargs: t.Any) -> t.Dict[str, t.Any]: - return self._render_properties(properties=self.physical_properties, **render_kwargs) + rendered = self._render_properties(properties=self.physical_properties, **render_kwargs) + + # Some engines (e.g. StarRocks) accept properties whose values reference other models and + # need the physical table name rather than the logical view SQLMesh exposes. Resolve those. + engine_adapter = render_kwargs.get("engine_adapter") + resolve_keys: t.FrozenSet[str] = getattr( + engine_adapter, "RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES", frozenset() + ) + keys_to_resolve = [key for key in resolve_keys if key in rendered] + if keys_to_resolve: + # Local import: sqlmesh.core.snapshot.definition imports _Model, so importing + # to_table_mapping at module scope would be circular. 
+ from sqlmesh.core.snapshot.definition import to_table_mapping + + table_mapping = to_table_mapping( + (render_kwargs.get("snapshots") or {}).values(), + render_kwargs.get("deployability_index"), + ) + for key in keys_to_resolve: + rendered[key] = _resolve_model_refs_to_physical_tables( + rendered[key], table_mapping, self.dialect + ) + + return rendered def render_virtual_properties(self, **render_kwargs: t.Any) -> t.Dict[str, t.Any]: return self._render_properties(properties=self.virtual_properties, **render_kwargs) @@ -2813,6 +2836,32 @@ def _split_sql_model_statements( return query, sql_statements[:pos], sql_statements[pos + 1 :], on_virtual_update, inline_audits +def _resolve_model_refs_to_physical_tables( + value: exp.Expr, table_mapping: t.Dict[str, str], dialect: DialectType +) -> exp.Literal: + """Resolve managed-model references in a property value to their physical table names. + + The value is a single table reference or a comma-separated list of them. Each reference that + matches a managed model (via ``table_mapping``) is swapped for its physical ``db.table`` name; + anything else (e.g. a raw source) is kept as written. Returns a single string literal so the + property renders just like a hand-written value. + """ + if isinstance(value, exp.Literal) and value.is_string: + refs = value.this.split(",") + else: + refs = [value.sql(dialect=dialect)] + + def resolve(ref: str) -> str: + table = exp.to_table(ref.strip(), dialect=dialect) + physical = table_mapping.get(exp.table_name(table, identify=True)) + # Managed model -> physical table; otherwise keep the reference (just unquoted/normalized). 
+ return exp.table_name( + exp.to_table(physical, dialect=dialect) if physical else table, identify=False + ) + + return exp.Literal.string(",".join(resolve(ref) for ref in refs if ref.strip())) + + def _resolve_properties( default: t.Optional[t.Dict[str, t.Any]], provided: t.Optional[exp.Expr | t.Dict[str, t.Any]], diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py index cfc045e6b5..db0b1cc4ae 100644 --- a/tests/core/engine_adapter/test_starrocks.py +++ b/tests/core/engine_adapter/test_starrocks.py @@ -32,6 +32,11 @@ from sqlmesh.core.engine_adapter.duckdb import DuckDBEngineAdapter from sqlmesh.core.dialect import parse from sqlmesh.core.model import load_sql_based_model, SqlModel +from sqlmesh.core.snapshot.definition import ( + DeployabilityIndex, + Snapshot, + SnapshotChangeCategory, +) from sqlmesh.core.snapshot.evaluator import _adjust_physical_properties_for_engine pytestmark = [pytest.mark.starrocks, pytest.mark.engine] @@ -2132,3 +2137,352 @@ def test_non_starrocks_incremental_is_unaffected( """ ) assert "primary_key" not in self._adjust(adapter, model) + + +# ============================================================================= +# excluded_trigger_tables / excluded_refresh_tables physical table ref resolution +# ============================================================================= +class TestExcludedTablesResolution: + """Tests for automatic resolution of logical model names to physical table names + in excluded_trigger_tables and excluded_refresh_tables physical_properties. + + StarRocks async materialized views accept these properties to skip certain tables + from triggering or participating in refreshes. When a value is a managed SQLMesh + model, StarRocks needs the physical name (db.table), not the logical view name. 
+ """ + + @staticmethod + def _make_snapshot(model: SqlModel) -> Snapshot: + snapshot = Snapshot.from_node(model, nodes={}, ttl="in 1 week") + snapshot.categorize_as(SnapshotChangeCategory.BREAKING) + return snapshot + + def _build_mv_with_excluded_tables( + self, + adapter: StarRocksEngineAdapter, + model: SqlModel, + snapshots: t.Dict[str, Snapshot], + ) -> str: + """Render physical properties with snapshot resolution and create the MV, returning DDL.""" + rendered_props = model.render_physical_properties( + snapshots=snapshots, + engine_adapter=adapter, + ) + query = model.render_query() + adapter.create_view( + model.name, + query, + replace=False, + materialized=True, + target_columns_to_types={"a": exp.DataType.build("INT")}, + view_properties=rendered_props, + ) + calls = to_sql_calls(adapter) + return calls[-1] + + def test_single_managed_model_ref_is_resolved_to_physical_name( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + ) -> None: + """excluded_trigger_tables referencing a managed model is resolved to physical db.table.""" + base_model = _load_sql_model( + """ + MODEL ( + name starrocks.test_1_model, + kind FULL, + dialect starrocks, + columns (a INT) + ); + SELECT 1 AS a; + """ + ) + base_snapshot = self._make_snapshot(base_model) + physical_name = exp.to_table(base_snapshot.table_name()) + expected_physical = f"{physical_name.db}.{physical_name.name}" + + mv_model = _load_sql_model( + """ + MODEL ( + name starrocks.test_mv, + kind VIEW (materialized true), + dialect starrocks, + columns (a INT), + physical_properties ( + refresh_scheme = ASYNC, + excluded_trigger_tables = starrocks.test_1_model + ) + ); + SELECT a FROM starrocks.test_1_model; + """ + ) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + snapshots = {base_snapshot.name: base_snapshot} + ddl = self._build_mv_with_excluded_tables(adapter, mv_model, snapshots) + + assert expected_physical in ddl + # Logical name must NOT appear as the 
property value + assert f"'excluded_trigger_tables'='starrocks.test_1_model'" not in ddl + + def test_single_managed_model_ref_in_excluded_refresh_tables( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + ) -> None: + """excluded_refresh_tables referencing a managed model is also resolved.""" + base_model = _load_sql_model( + """ + MODEL ( + name starrocks.source_model, + kind FULL, + dialect starrocks, + columns (a INT) + ); + SELECT 1 AS a; + """ + ) + base_snapshot = self._make_snapshot(base_model) + physical_name = exp.to_table(base_snapshot.table_name()) + expected_physical = f"{physical_name.db}.{physical_name.name}" + + mv_model = _load_sql_model( + """ + MODEL ( + name starrocks.test_mv2, + kind VIEW (materialized true), + dialect starrocks, + columns (a INT), + physical_properties ( + refresh_scheme = ASYNC, + excluded_refresh_tables = starrocks.source_model + ) + ); + SELECT a FROM starrocks.source_model; + """ + ) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + snapshots = {base_snapshot.name: base_snapshot} + ddl = self._build_mv_with_excluded_tables(adapter, mv_model, snapshots) + + assert expected_physical in ddl + + def test_unmanaged_source_is_left_as_is( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + ) -> None: + """A raw source that is not a managed snapshot passes through unchanged.""" + mv_model = _load_sql_model( + """ + MODEL ( + name starrocks.test_mv3, + kind VIEW (materialized true), + dialect starrocks, + columns (a INT), + physical_properties ( + refresh_scheme = ASYNC, + excluded_trigger_tables = "external_db.raw_table" + ) + ); + SELECT 1 AS a; + """ + ) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + ddl = self._build_mv_with_excluded_tables(adapter, mv_model, snapshots={}) + + assert "external_db.raw_table" in ddl + + def test_unmanaged_external_catalog_ref_keeps_catalog( + self, + make_mocked_engine_adapter: t.Callable[..., 
StarRocksEngineAdapter], + ) -> None: + """A three-part external-catalog reference is preserved in full (catalog not stripped).""" + mv_model = _load_sql_model( + """ + MODEL ( + name starrocks.test_mv_ext, + kind VIEW (materialized true), + dialect starrocks, + columns (a INT), + physical_properties ( + refresh_scheme = ASYNC, + excluded_trigger_tables = "ext_catalog.ext_db.raw_table" + ) + ); + SELECT 1 AS a; + """ + ) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + ddl = self._build_mv_with_excluded_tables(adapter, mv_model, snapshots={}) + + assert "ext_catalog.ext_db.raw_table" in ddl + + def test_mixed_list_managed_and_unmanaged( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + ) -> None: + """A comma-separated list: managed model resolved, unmanaged source left as-is.""" + base_model = _load_sql_model( + """ + MODEL ( + name starrocks.managed_model, + kind FULL, + dialect starrocks, + columns (a INT) + ); + SELECT 1 AS a; + """ + ) + base_snapshot = self._make_snapshot(base_model) + physical_name = exp.to_table(base_snapshot.table_name()) + expected_physical = f"{physical_name.db}.{physical_name.name}" + + mv_model = _load_sql_model( + """ + MODEL ( + name starrocks.test_mv4, + kind VIEW (materialized true), + dialect starrocks, + columns (a INT), + physical_properties ( + refresh_scheme = ASYNC, + excluded_trigger_tables = "starrocks.managed_model,external_db.raw_table" + ) + ); + SELECT 1 AS a; + """ + ) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + snapshots = {base_snapshot.name: base_snapshot} + ddl = self._build_mv_with_excluded_tables(adapter, mv_model, snapshots) + + assert expected_physical in ddl + assert "external_db.raw_table" in ddl + # The logical name must NOT appear as a property value; the physical name MUST be present. + # StarRocks DDL format: PROPERTIES ('key'='value'), both key and value in single quotes. 
+ import re + + match = re.search(r"'excluded_trigger_tables'='([^']*)'", ddl) + assert match is not None, "excluded_trigger_tables property not found in DDL" + prop_value = match.group(1) + assert "starrocks.managed_model" not in prop_value, ( + "Logical model name leaked into excluded_trigger_tables property value" + ) + assert expected_physical in prop_value, ( + "Physical table name not found in excluded_trigger_tables property value" + ) + + def test_no_snapshots_passes_value_through( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + ) -> None: + """When no snapshots are provided, the value passes through unchanged.""" + mv_model = _load_sql_model( + """ + MODEL ( + name starrocks.test_mv5, + kind VIEW (materialized true), + dialect starrocks, + columns (a INT), + physical_properties ( + refresh_scheme = ASYNC, + excluded_trigger_tables = starrocks.some_table + ) + ); + SELECT 1 AS a; + """ + ) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + ddl = self._build_mv_with_excluded_tables(adapter, mv_model, snapshots={}) + + assert "starrocks.some_table" in ddl + + def test_dev_plan_non_deployable_snapshot_resolves_to_dev_table( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + ) -> None: + """When a snapshot is non-deployable (dev plan), the dev physical table is used. + + A snapshot that is not representative in the deployability_index has its dev + (``__dev``-suffixed) table selected by ``to_table_mapping``. The property value + must reference this dev physical name, NOT the production physical name. 
+ """ + base_model = _load_sql_model( + """ + MODEL ( + name starrocks.upstream, + kind FULL, + dialect starrocks, + columns (a INT) + ); + SELECT 1 AS a; + """ + ) + base_snapshot = self._make_snapshot(base_model) + prod_physical_name = exp.to_table(base_snapshot.table_name(is_deployable=True)) + dev_physical_name = exp.to_table(base_snapshot.table_name(is_deployable=False)) + + prod_expected = f"{prod_physical_name.db}.{prod_physical_name.name}" + dev_expected = f"{dev_physical_name.db}.{dev_physical_name.name}" + + mv_model = _load_sql_model( + """ + MODEL ( + name starrocks.test_mv_dev, + kind VIEW (materialized true), + dialect starrocks, + columns (a INT), + physical_properties ( + refresh_scheme = ASYNC, + excluded_trigger_tables = starrocks.upstream + ) + ); + SELECT a FROM starrocks.upstream; + """ + ) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + snapshots = {base_snapshot.name: base_snapshot} + + # With all_deployable (default/prod plan): resolves to prod physical name + deployability_all = DeployabilityIndex.all_deployable() + rendered_prod = mv_model.render_physical_properties( + snapshots=snapshots, + engine_adapter=adapter, + deployability_index=deployability_all, + ) + prod_value = rendered_prod["excluded_trigger_tables"] + assert hasattr(prod_value, "this"), "expected exp.Literal" + # prod value must equal the prod table exactly (no __dev suffix) + assert prod_value.this == prod_expected + assert "__dev" not in prod_value.this + + # With none_deployable (dev plan): resolves to dev (__dev-suffixed) physical name + deployability_none = DeployabilityIndex.none_deployable() + rendered_dev = mv_model.render_physical_properties( + snapshots=snapshots, + engine_adapter=adapter, + deployability_index=deployability_none, + ) + dev_value = rendered_dev["excluded_trigger_tables"] + assert hasattr(dev_value, "this"), "expected exp.Literal" + # dev value must equal the dev table exactly (has __dev suffix) + assert dev_value.this == 
dev_expected + assert "__dev" in dev_value.this + + def test_non_starrocks_engine_is_not_affected(self) -> None: + """Base RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES is empty; StarRocks lists both keys.""" + from sqlmesh.core.engine_adapter.base import EngineAdapter + + assert len(EngineAdapter.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES) == 0 + assert ( + "excluded_trigger_tables" + in StarRocksEngineAdapter.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES + ) + assert ( + "excluded_refresh_tables" + in StarRocksEngineAdapter.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES + ) diff --git a/tests/core/test_snapshot_evaluator.py b/tests/core/test_snapshot_evaluator.py index edb1d8ed7d..27bcbe05ae 100644 --- a/tests/core/test_snapshot_evaluator.py +++ b/tests/core/test_snapshot_evaluator.py @@ -129,6 +129,7 @@ def mock_exit(self, exc_type, exc_value, traceback): adapter_mock.session.return_value = session_mock adapter_mock.dialect = "duckdb" adapter_mock.HAS_VIEW_BINDING = False + adapter_mock.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES = frozenset() adapter_mock.wap_supported.return_value = False adapter_mock.get_data_objects.return_value = [] adapter_mock.with_settings.return_value = adapter_mock @@ -155,6 +156,7 @@ def adapters(mocker: MockerFixture): adapter_mock.session.return_value = session_mock adapter_mock.dialect = "duckdb" adapter_mock.HAS_VIEW_BINDING = False + adapter_mock.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES = frozenset() adapter_mock.wap_supported.return_value = False adapter_mock.get_data_objects.return_value = [] adapter_mock.with_settings.return_value = adapter_mock @@ -1171,6 +1173,7 @@ def test_create_tables_exist( adapter_mock = mocker.patch("sqlmesh.core.engine_adapter.EngineAdapter") adapter_mock.dialect = "duckdb" adapter_mock.with_settings.return_value = adapter_mock + adapter_mock.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES = frozenset() evaluator = SnapshotEvaluator(adapter_mock) snapshot.categorize_as(category=snapshot_category, 
forward_only=forward_only) @@ -1414,6 +1417,7 @@ def test_promote_model_info(mocker: MockerFixture, make_snapshot): adapter_mock = mocker.patch("sqlmesh.core.engine_adapter.EngineAdapter") adapter_mock.dialect = "duckdb" adapter_mock.with_settings.return_value = adapter_mock + adapter_mock.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES = frozenset() evaluator = SnapshotEvaluator(adapter_mock) @@ -1443,6 +1447,7 @@ def test_promote_deployable(mocker: MockerFixture, make_snapshot): adapter_mock = mocker.patch("sqlmesh.core.engine_adapter.EngineAdapter") adapter_mock.dialect = "duckdb" adapter_mock.with_settings.return_value = adapter_mock + adapter_mock.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES = frozenset() evaluator = SnapshotEvaluator(adapter_mock) @@ -4151,6 +4156,7 @@ def test_migrate_snapshot(snapshot: Snapshot, mocker: MockerFixture, adapter_moc adapter_mock = mocker.patch("sqlmesh.core.engine_adapter.EngineAdapter") adapter_mock.dialect = "duckdb" adapter_mock.with_settings.return_value = adapter_mock + adapter_mock.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES = frozenset() adapter_mock.adjust_physical_properties_for_incremental.side_effect = ( lambda physical_properties, **kwargs: physical_properties ) @@ -5068,6 +5074,7 @@ def test_properties_are_preserved_in_both_create_statements( adapter_mock.session.return_value = session_mock adapter_mock.dialect = "trino" adapter_mock.HAS_VIEW_BINDING = False + adapter_mock.RESOLVE_TABLE_REFS_IN_PHYSICAL_PROPERTIES = frozenset() adapter_mock.wap_supported.return_value = False adapter_mock.get_data_objects.return_value = [] adapter_mock.with_settings.return_value = adapter_mock