diff --git a/packages/bigframes/bigframes/core/compile/compiled.py b/packages/bigframes/bigframes/core/compile/compiled.py index e334c687f5cc..fea94f6e6edc 100644 --- a/packages/bigframes/bigframes/core/compile/compiled.py +++ b/packages/bigframes/bigframes/core/compile/compiled.py @@ -381,6 +381,7 @@ def isin_join( new_column = ( (left_table[conditions[0]]) .isin((right_table[conditions[1]])) + .fillna(False) .name(indicator_col) ) diff --git a/packages/bigframes/bigframes/core/compile/sqlglot/sqlglot_ir.py b/packages/bigframes/bigframes/core/compile/sqlglot/sqlglot_ir.py index 1b7babf6ee6b..27b79f266bc1 100644 --- a/packages/bigframes/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/packages/bigframes/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -385,9 +385,13 @@ def isin_join( ) ) else: - new_column = sge.In( - this=conditions[0].expr, - expressions=[right._as_subquery()], + new_column = sge.func( + "COALESCE", + sge.In( + this=conditions[0].expr, + expressions=[right._as_subquery()], + ), + sql.literal(False, dtypes.BOOL_DTYPE), ) new_column = sge.Alias( diff --git a/packages/bigframes/bigframes/session/_io/bigquery/__init__.py b/packages/bigframes/bigframes/session/_io/bigquery/__init__.py index 88c70b6a186b..780ba55c50db 100644 --- a/packages/bigframes/bigframes/session/_io/bigquery/__init__.py +++ b/packages/bigframes/bigframes/session/_io/bigquery/__init__.py @@ -516,19 +516,21 @@ def to_query( ) -> str: """Compile query_or_table with conditions(filters, wildcards) to query.""" if is_query(query_or_table): - sub_query = f"({query_or_table})" + from_item = f"({query_or_table})" else: # Table ID can have 1, 2, 3, or 4 parts. Quoting all parts to be safe. # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers parts = query_or_table.split(".") - sub_query = ".".join(f"`{part}`" for part in parts) + from_item = ".".join(f"`{part}`" for part in parts) # TODO(b/338111344): Generate an index based on DefaultIndexKind if we # don't have index columns specified. if columns: # We only reduce the selection if columns is set, but we always # want to make sure index_cols is also included. - select_clause = "SELECT " + ", ".join(f"`{column}`" for column in columns) + select_clause = "SELECT " + ", ".join( + f"`_bf_source`.`{column}`" for column in columns + ) else: select_clause = "SELECT *" @@ -545,7 +547,7 @@ def to_query( return ( f"{select_clause} " - f"FROM {sub_query}" + f"FROM {from_item} AS _bf_source" f"{time_travel_clause}{where_clause}{limit_clause}" ) diff --git a/packages/bigframes/bigframes/session/polars_executor.py b/packages/bigframes/bigframes/session/polars_executor.py index 43e3609ac3c1..06c7fcb925c4 100644 --- a/packages/bigframes/bigframes/session/polars_executor.py +++ b/packages/bigframes/bigframes/session/polars_executor.py @@ -122,7 +122,7 @@ def _is_node_polars_executable(node: nodes.BigFrameNode): return False for expr in node._node_expressions: if isinstance(expr, agg_expressions.Aggregation): - if not type(expr.op) in _COMPATIBLE_AGG_OPS: + if type(expr.op) not in _COMPATIBLE_AGG_OPS: return False if isinstance(expr, expression.Expression): if not set(map(type, _get_expr_ops(expr))).issubset(_COMPATIBLE_SCALAR_OPS): diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_isin/test_compile_isin_not_nullable/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_isin/test_compile_isin_not_nullable/out.sql index cc1633d3a3a1..81c83dee6c9f 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_isin/test_compile_isin_not_nullable/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_isin/test_compile_isin_not_nullable/out.sql @@ -20,11 +20,11 @@ WITH `bfcte_0` AS ( ), `bfcte_4` AS ( SELECT *, - `bfcol_4` IN (( + COALESCE(`bfcol_4` IN (( SELECT * FROM `bfcte_3` - )) AS `bfcol_5` + )), FALSE) AS `bfcol_5` FROM `bfcte_1` ) SELECT diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/16/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/16/out.sql index c68bb37d7cfe..228d51a76c7c 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/16/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/16/out.sql @@ -51,11 +51,11 @@ WITH `bfcte_0` AS ( ), `bfcte_6` AS ( SELECT *, - `bfcol_58` IN (( + COALESCE(`bfcol_58` IN (( SELECT * FROM `bfcte_5` - )) AS `bfcol_59` + )), FALSE) AS `bfcol_59` FROM `bfcte_4` ), `bfcte_7` AS ( SELECT diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/18/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/18/out.sql index c1b3629ddf78..6fcdb343940d 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/18/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/18/out.sql @@ -44,11 +44,11 @@ WITH `bfcte_0` AS ( ), `bfcte_7` AS ( SELECT *, - `bfcol_4` IN (( + COALESCE(`bfcol_4` IN (( SELECT * FROM `bfcte_6` - )) AS `bfcol_14` + )), FALSE) AS `bfcol_14` FROM `bfcte_2` ), `bfcte_8` AS ( SELECT diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/20/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/20/out.sql index 5afd4ee08545..197588f5c845 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/20/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/20/out.sql @@ -85,11 +85,11 @@ WITH `bfcte_0` AS ( ), `bfcte_10` AS ( SELECT *, - `bfcol_2` IN (( + COALESCE(`bfcol_2` IN (( SELECT * FROM `bfcte_8` - )) AS `bfcol_37` + )), FALSE) AS `bfcol_37` FROM `bfcte_1` ), `bfcte_11` AS ( SELECT @@ -127,11 +127,11 @@ WITH `bfcte_0` AS ( ), `bfcte_15` AS ( SELECT *, - `bfcol_41` IN (( + COALESCE(`bfcol_41` IN (( SELECT * FROM `bfcte_14` - )) AS `bfcol_62` + )), FALSE) AS `bfcol_62` FROM `bfcte_7` ) SELECT diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/22/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/22/out.sql index 3ae51f1cdfff..5ab22d3cdaff 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/22/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/tpch/snapshots/test_tpch/test_tpch_query/22/out.sql @@ -92,11 +92,11 @@ WITH `bfcte_0` AS ( ), `bfcte_12` AS ( SELECT *, - `bfcol_61` IN (( + COALESCE(`bfcol_61` IN (( SELECT * FROM `bfcte_6` - )) AS `bfcol_64` + )), FALSE) AS `bfcol_64` FROM `bfcte_11` ), `bfcte_13` AS ( SELECT diff --git a/packages/bigframes/tests/unit/session/test_io_bigquery.py b/packages/bigframes/tests/unit/session/test_io_bigquery.py index 903433acbe98..79996e185ecf 100644 --- a/packages/bigframes/tests/unit/session/test_io_bigquery.py +++ b/packages/bigframes/tests/unit/session/test_io_bigquery.py @@ -344,7 +344,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) 2024, 5, 14, 12, 42, 36, 125125, tzinfo=datetime.timezone.utc ), ( - "SELECT `row_index`, `string_col` FROM `test_table` " + "SELECT `_bf_source`.`row_index`, `_bf_source`.`string_col` FROM `test_table` AS _bf_source " "FOR SYSTEM_TIME AS OF CAST('2024-05-14T12:42:36.125125+00:00' AS TIMESTAMP) " "WHERE `rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " "'こんにちは') LIMIT 123" @@ -369,11 +369,11 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) 2024, 5, 14, 12, 42, 36, 125125, tzinfo=datetime.timezone.utc ), ( - """SELECT `rowindex`, `string_col` FROM (SELECT + """SELECT `_bf_source`.`rowindex`, `_bf_source`.`string_col` FROM (SELECT rowindex, string_col, FROM `test_table` AS t - ) """ + ) AS _bf_source """ "FOR SYSTEM_TIME AS OF CAST('2024-05-14T12:42:36.125125+00:00' AS TIMESTAMP) " "WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!' " "LIMIT 123" @@ -386,7 +386,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], None, # max_results None, # time_travel_timestampe - "SELECT `col_a`, `col_b` FROM `test_table`", + "SELECT `_bf_source`.`col_a`, `_bf_source`.`col_b` FROM `test_table` AS _bf_source", id="table-columns", ), pytest.param( @@ -395,7 +395,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [("date_col", ">", "2022-10-20")], None, # max_results None, # time_travel_timestampe - "SELECT * FROM `test_table` WHERE `date_col` > '2022-10-20'", + "SELECT * FROM `test_table` AS _bf_source WHERE `date_col` > '2022-10-20'", id="table-filter", ), pytest.param( @@ -404,7 +404,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], None, # max_results None, # time_travel_timestampe - "SELECT * FROM `test_table*`", + "SELECT * FROM `test_table*` AS _bf_source", id="wildcard-no_params", ), pytest.param( @@ -413,7 +413,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [("_TABLE_SUFFIX", ">", "2022-10-20")], None, # max_results None, # time_travel_timestampe - "SELECT * FROM `test_table*` WHERE `_TABLE_SUFFIX` > '2022-10-20'", + "SELECT * FROM `test_table*` AS _bf_source WHERE `_TABLE_SUFFIX` > '2022-10-20'", id="wildcard-filter", ), ], diff --git a/packages/bigframes/third_party/bigframes_vendored/sqlglot/parser.py b/packages/bigframes/third_party/bigframes_vendored/sqlglot/parser.py index 8189dbf39926..706649f43fbe 100644 --- a/packages/bigframes/third_party/bigframes_vendored/sqlglot/parser.py +++ b/packages/bigframes/third_party/bigframes_vendored/sqlglot/parser.py @@ -290,11 +290,11 @@ class Parser(metaclass=_Parser): "RIGHTPAD": lambda args: build_pad(args, is_left=False), "RPAD": lambda args: build_pad(args, is_left=False), "RTRIM": lambda args: build_trim(args, is_left=False), - "SCOPE_RESOLUTION": lambda args: exp.ScopeResolution( - expression=seq_get(args, 0) - ) - if len(args) != 2 - else exp.ScopeResolution(this=seq_get(args, 0), expression=seq_get(args, 1)), + "SCOPE_RESOLUTION": lambda args: ( + exp.ScopeResolution(expression=seq_get(args, 0)) + if len(args) != 2 + else exp.ScopeResolution(this=seq_get(args, 0), expression=seq_get(args, 1)) + ), "STRPOS": exp.StrPosition.from_arg_list, "CHARINDEX": lambda args: build_locate_strposition(args), "INSTR": exp.StrPosition.from_arg_list, @@ -943,7 +943,9 @@ class Parser(metaclass=_Parser): } UNARY_PARSERS = { - TokenType.PLUS: lambda self: self._parse_unary(), # Unary + is handled as a no-op + TokenType.PLUS: lambda self: ( + self._parse_unary() + ), # Unary + is handled as a no-op TokenType.NOT: lambda self: self.expression( exp.Not, this=self._parse_equality() ), @@ -1246,12 +1248,14 @@ class Parser(metaclass=_Parser): exp.NotNullColumnConstraint, allow_null=True ), "ON": lambda self: ( - self._match(TokenType.UPDATE) - and self.expression( - exp.OnUpdateColumnConstraint, this=self._parse_function() + ( + self._match(TokenType.UPDATE) + and self.expression( + exp.OnUpdateColumnConstraint, this=self._parse_function() + ) ) - ) - or self.expression(exp.OnProperty, this=self._parse_id_var()), + or self.expression(exp.OnProperty, this=self._parse_id_var()) + ), "PATH": lambda self: self.expression( exp.PathColumnConstraint, this=self._parse_string() ), @@ -3885,8 +3889,9 @@ def _parse_hint_body(self) -> t.Optional[exp.Hint]: try: for hint in iter( lambda: self._parse_csv( - lambda: self._parse_hint_function_call() - or self._parse_var(upper=True), + lambda: ( + self._parse_hint_function_call() or self._parse_var(upper=True) + ), ), [], ): @@ -4305,8 +4310,9 @@ def _parse_table_hints(self) -> t.Optional[t.List[exp.Expression]]: self.expression( exp.WithTableHint, expressions=self._parse_csv( - lambda: self._parse_function() - or self._parse_var(any_token=True) + lambda: ( + self._parse_function() or self._parse_var(any_token=True) + ) ), ) ) @@ -4469,6 +4475,14 @@ def _parse_table( if schema: return self._parse_schema(this=this) + # see: https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#from_clause + # from_item, then alias, then time travel, then sample. + alias = self._parse_table_alias( + alias_tokens=alias_tokens or self.TABLE_ALIAS_TOKENS + ) + if alias: + this.set("alias", alias) + version = self._parse_version() if version: @@ -4477,12 +4491,6 @@ def _parse_table( if self.dialect.ALIAS_POST_TABLESAMPLE: this.set("sample", self._parse_table_sample()) - alias = self._parse_table_alias( - alias_tokens=alias_tokens or self.TABLE_ALIAS_TOKENS - ) - if alias: - this.set("alias", alias) - if self._match(TokenType.INDEXED_BY): this.set("indexed", self._parse_table_parts()) elif self._match_text_seq("NOT", "INDEXED"): @@ -4935,11 +4943,13 @@ def _parse_group(self, skip_group_by_token: bool = False) -> t.Optional[exp.Grou elements["expressions"].extend( self._parse_csv( - lambda: None - if self._match_set( - (TokenType.CUBE, TokenType.ROLLUP), advance=False + lambda: ( + None + if self._match_set( + (TokenType.CUBE, TokenType.ROLLUP), advance=False + ) + else self._parse_disjunction() ) - else self._parse_disjunction() ) ) @@ -6225,9 +6235,9 @@ def _parse_column_ops( # https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-reference#function_call_rules if isinstance(field, (exp.Func, exp.Window)) and this: this = this.transform( - lambda n: n.to_dot(include_dots=False) - if isinstance(n, exp.Column) - else n + lambda n: ( + n.to_dot(include_dots=False) if isinstance(n, exp.Column) else n + ) ) if op: