From 83ce4d16371c90c6452bd8de0e186cceecd8b5c8 Mon Sep 17 00:00:00 2001 From: Qi Zhu Date: Fri, 29 May 2026 15:42:46 +0800 Subject: [PATCH 1/2] perf(optimizer): EliminateCrossJoin fast-path for join-free plans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #22583. The rule's body only does real work when the root (or its Filter child) is an inner Join; in every other case it falls through to rewrite_children, which recurses into the plan, processes uncorrelated subqueries, and rewrites every direct child via map_children (clone-on-write), then calls recompute_schema on the way back. This is paid by every query in the logical optimizer pipeline — including simple point queries with no joins anywhere in the tree. Add a read-only apply scan at the top of rewrite that bails out with Transformed::no(plan) when no LogicalPlan::Join node exists in the tree. Mirrors the plan_has_subqueries fast-path landed in #22298 — same shape (cheap up-front scan, early stop on first match) and follows the precedent the reviewers approved there. Four unit tests cover: - plan_has_joins detects a root Join - plan_has_joins detects a nested Join - plan_has_joins returns false on join-free plans - rewrite short-circuits with Transformed::no on join-free plans --- .../optimizer/src/eliminate_cross_join.rs | 100 +++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs b/datafusion/optimizer/src/eliminate_cross_join.rs index 8306d4b54c256..d032810d7a507 100644 --- a/datafusion/optimizer/src/eliminate_cross_join.rs +++ b/datafusion/optimizer/src/eliminate_cross_join.rs @@ -20,7 +20,7 @@ use crate::{OptimizerConfig, OptimizerRule}; use std::sync::Arc; use crate::join_key_set::JoinKeySet; -use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; use datafusion_common::{NullEquality, Result}; use datafusion_expr::expr::{BinaryExpr, Expr}; use datafusion_expr::logical_plan::{ @@ -85,6 +85,17 @@ impl OptimizerRule for EliminateCrossJoin { plan: LogicalPlan, config: &dyn OptimizerConfig, ) -> Result> { + // Fast path: nothing to do if the plan contains no `Join` nodes. + // Without this guard the rule still falls through to + // `rewrite_children`, which walks the entire plan, processes + // uncorrelated subqueries, and rewrites every direct child via + // `map_children` (clone-on-write) — paid by every query in the + // logical optimizer pipeline. Same shape as the + // `plan_has_subqueries` fast-path landed in #22298. + if !plan_has_joins(&plan) { + return Ok(Transformed::no(plan)); + } + let plan_schema = Arc::clone(plan.schema()); let mut possible_join_keys = JoinKeySet::new(); let mut all_inputs: Vec = vec![]; @@ -207,6 +218,25 @@ impl OptimizerRule for EliminateCrossJoin { } } +/// Returns `true` if `plan` contains at least one [`LogicalPlan::Join`] +/// node anywhere in its tree. +/// +/// Used as a fast-path gate at the top of [`EliminateCrossJoin::rewrite`] +/// so that join-free plans skip the full recursive rewrite — read-only +/// `apply` walk with early stop on first match, no allocations. +fn plan_has_joins(plan: &LogicalPlan) -> bool { + let mut found = false; + let _ = plan.apply(|node| { + if matches!(node, LogicalPlan::Join(_)) { + found = true; + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + }); + found +} + fn rewrite_children( optimizer: &impl OptimizerRule, plan: LogicalPlan, @@ -1418,4 +1448,72 @@ mod tests { Ok(()) } + + // ---------------- fast-path tests ---------------- + + /// `plan_has_joins` detects a `Join` at the root of the plan. + #[test] + fn plan_has_joins_detects_root_join() -> Result<()> { + let plan = LogicalPlanBuilder::from(test_table_scan_with_name("t1")?) + .cross_join(test_table_scan_with_name("t2")?)? + .build()?; + assert!(plan_has_joins(&plan)); + Ok(()) + } + + /// `plan_has_joins` detects a `Join` nested under other operators. + #[test] + fn plan_has_joins_detects_nested_join() -> Result<()> { + let plan = LogicalPlanBuilder::from(test_table_scan_with_name("t1")?) + .cross_join(test_table_scan_with_name("t2")?)? + .filter(col("t1.a").eq(col("t2.a")))? + .project(vec![col("t1.a")])? + .build()?; + assert!(plan_has_joins(&plan)); + Ok(()) + } + + /// Join-free plans return `false` so the fast-path in `rewrite` can + /// bail out before doing any recursion. + #[test] + fn plan_has_joins_returns_false_for_join_free_plan() -> Result<()> { + let plan = LogicalPlanBuilder::from(test_table_scan_with_name("t1")?) + .filter(col("a").gt(lit(0_i32)))? + .project(vec![col("a"), col("b")])? + .build()?; + assert!(!plan_has_joins(&plan)); + Ok(()) + } + + /// `EliminateCrossJoin::rewrite` short-circuits on join-free plans: + /// no recursion into `rewrite_children`, no `Transformed::yes`, + /// the plan comes back identical. + #[test] + fn rewrite_short_circuits_when_plan_has_no_joins() -> Result<()> { + let plan = LogicalPlanBuilder::from(test_table_scan_with_name("t1")?) + .filter(col("a").gt(lit(0_i32)))? + .project(vec![col("a"), col("b")])? + .build()?; + + let starting_display = plan.display_indent_schema().to_string(); + let starting_schema = Arc::clone(plan.schema()); + + let rule = EliminateCrossJoin::new(); + let Transformed { + transformed, + data: optimized_plan, + .. + } = rule.rewrite(plan, &OptimizerContext::new())?; + + assert!( + !transformed, + "join-free plan should not be marked as transformed" + ); + assert_eq!(&starting_schema, optimized_plan.schema()); + assert_eq!( + starting_display, + optimized_plan.display_indent_schema().to_string() + ); + Ok(()) + } } From 0b4e30c8c20bae99c9f4204ee5c3ae049a9c21ed Mon Sep 17 00:00:00 2001 From: Qi Zhu Date: Fri, 29 May 2026 16:16:10 +0800 Subject: [PATCH 2/2] fix: also walk subquery plans in plan_has_joins Copilot review found a correctness gap: `LogicalPlan::apply` does not descend into the subquery plans referenced from `Expr::ScalarSubquery` / `Expr::InSubquery` / `Expr::Exists` / `Expr::SetComparison`. A query whose outer plan has no Join but contains `IN (SELECT ... FROM a CROSS JOIN b)`-style predicates would short-circuit and skip optimizing the subquery's join. `rewrite_children` reaches those subqueries via `map_uncorrelated_subqueries`, so `plan_has_joins` has to consider them too. Extend the helper to recurse into each subquery plan through `apply_expressions` + an `Expr::apply` walk. New regression test `plan_has_joins_detects_join_inside_subquery` exercises the `IN (SELECT ... CROSS JOIN ...)` shape. --- .../optimizer/src/eliminate_cross_join.rs | 73 ++++++++++++++++++- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs b/datafusion/optimizer/src/eliminate_cross_join.rs index d032810d7a507..790674d67f84e 100644 --- a/datafusion/optimizer/src/eliminate_cross_join.rs +++ b/datafusion/optimizer/src/eliminate_cross_join.rs @@ -219,16 +219,53 @@ impl OptimizerRule for EliminateCrossJoin { } /// Returns `true` if `plan` contains at least one [`LogicalPlan::Join`] -/// node anywhere in its tree. +/// node, either directly in its tree *or* inside an embedded subquery +/// plan reachable through `Expr::ScalarSubquery` / `Expr::InSubquery` +/// / `Expr::Exists` / `Expr::SetComparison`. /// /// Used as a fast-path gate at the top of [`EliminateCrossJoin::rewrite`] -/// so that join-free plans skip the full recursive rewrite — read-only -/// `apply` walk with early stop on first match, no allocations. +/// so that join-free plans skip the full recursive rewrite. The +/// expression-side recursion is required because `rewrite_children` +/// also dives into uncorrelated subqueries via +/// `map_uncorrelated_subqueries`; ignoring them here would skip +/// optimizing a `CROSS JOIN` that sits only inside an +/// `IN (SELECT ... FROM a, b)`-style predicate. +/// +/// Read-only `apply` walk with early stop on first match, no +/// allocations. fn plan_has_joins(plan: &LogicalPlan) -> bool { let mut found = false; let _ = plan.apply(|node| { if matches!(node, LogicalPlan::Join(_)) { found = true; + return Ok(TreeNodeRecursion::Stop); + } + // Recurse into subquery plans referenced from this node's + // expressions. `LogicalPlan::apply` does not descend into the + // subquery plan, so we have to do it explicitly. + let _ = node.apply_expressions(|expr| { + if found { + return Ok(TreeNodeRecursion::Stop); + } + let _ = expr.apply(|e| { + let subquery_plan = match e { + Expr::ScalarSubquery(sq) => Some(sq.subquery.as_ref()), + Expr::InSubquery(in_sq) => Some(in_sq.subquery.subquery.as_ref()), + Expr::Exists(exists) => Some(exists.subquery.subquery.as_ref()), + Expr::SetComparison(sc) => Some(sc.subquery.subquery.as_ref()), + _ => None, + }; + if let Some(sub) = subquery_plan + && plan_has_joins(sub) + { + found = true; + return Ok(TreeNodeRecursion::Stop); + } + Ok(TreeNodeRecursion::Continue) + }); + Ok(TreeNodeRecursion::Continue) + }); + if found { Ok(TreeNodeRecursion::Stop) } else { Ok(TreeNodeRecursion::Continue) @@ -1485,6 +1522,36 @@ mod tests { Ok(()) } + /// `plan_has_joins` walks into embedded subquery plans — e.g. an + /// outer `Filter` whose predicate is `IN (SELECT ... FROM a, b)` + /// where the inner plan contains a `CROSS JOIN`. Without this the + /// fast-path would silently skip optimizing joins-in-subqueries + /// because `LogicalPlan::apply` doesn't descend into subquery + /// plan trees. + #[test] + fn plan_has_joins_detects_join_inside_subquery() -> Result<()> { + use datafusion_expr::in_subquery; + + // Subquery plan that itself contains a join. + let subquery_plan = + LogicalPlanBuilder::from(test_table_scan_with_name("sub_t1")?) + .cross_join(test_table_scan_with_name("sub_t2")?)? + .project(vec![col("sub_t1.a")])? + .build()?; + + // Outer plan with NO direct Join — only the IN subquery reaches one. + let outer = LogicalPlanBuilder::from(test_table_scan_with_name("t1")?) + .filter(in_subquery(col("a"), Arc::new(subquery_plan)))? + .project(vec![col("a")])? + .build()?; + + assert!( + plan_has_joins(&outer), + "plan_has_joins must descend into subquery plans" + ); + Ok(()) + } + /// `EliminateCrossJoin::rewrite` short-circuits on join-free plans: /// no recursion into `rewrite_children`, no `Transformed::yes`, /// the plan comes back identical.