From 616af4bcbf166259f3bf786f06408ede9c6f6e95 Mon Sep 17 00:00:00 2001 From: Onur Satici Date: Tue, 2 Jun 2026 09:10:56 +0100 Subject: [PATCH 1/3] split registration to respect nested projection and filters --- vortex-array/src/dtype/field.rs | 47 ++++- vortex-array/src/expr/analysis/mod.rs | 2 + .../expr/analysis/referenced_field_paths.rs | 172 ++++++++++++++++++ vortex-layout/src/scan/scan_builder.rs | 112 ++++++++---- 4 files changed, 295 insertions(+), 38 deletions(-) create mode 100644 vortex-array/src/expr/analysis/referenced_field_paths.rs diff --git a/vortex-array/src/dtype/field.rs b/vortex-array/src/dtype/field.rs index 45ddb70c9a7..0f2ffbd1574 100644 --- a/vortex-array/src/dtype/field.rs +++ b/vortex-array/src/dtype/field.rs @@ -238,7 +238,7 @@ impl Display for FieldPath { } } -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, PartialEq, Eq)] /// Contains a set of field paths, and can answer an efficient field path contains queries. pub struct FieldPathSet { /// While this is currently a set wrapper it can be replaced with a trie. @@ -251,6 +251,30 @@ impl FieldPathSet { pub fn contains(&self, path: &FieldPath) -> bool { self.set.contains(path) } + + /// Inserts a field path prefix unless an existing prefix already covers it. + /// + /// Any existing field paths covered by the new prefix are removed. + pub fn insert_prefix(&mut self, path: FieldPath) { + if self + .set + .iter() + .any(|existing| path.parts().starts_with(existing.parts())) + { + return; + } + + self.set + .retain(|existing| !existing.parts().starts_with(path.parts())); + self.set.insert(path); + } + + /// Inserts field path prefixes, retaining only the minimal covering set. + pub fn extend_prefixes(&mut self, paths: impl IntoIterator) { + for path in paths { + self.insert_prefix(path); + } + } } impl FromIterator for FieldPathSet { @@ -260,6 +284,15 @@ impl FromIterator for FieldPathSet { } } +impl IntoIterator for FieldPathSet { + type Item = FieldPath; + type IntoIter = as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.set.into_iter() + } +} + #[cfg(test)] mod tests { use super::*; @@ -418,4 +451,16 @@ mod tests { assert!(!path1.overlap(&path3)); assert!(!path3.overlap(&path1)); } + + #[test] + fn insert_prefix_retains_minimal_covering_set() { + let mut paths = FieldPathSet::default(); + paths.extend_prefixes([field_path!(a.b), field_path!(x), field_path!(a)]); + paths.insert_prefix(field_path!(a.c)); + + assert_eq!( + paths, + FieldPathSet::from_iter([field_path!(a), field_path!(x)]) + ); + } } diff --git a/vortex-array/src/expr/analysis/mod.rs b/vortex-array/src/expr/analysis/mod.rs index a0b07eb96e6..f5208a31be8 100644 --- a/vortex-array/src/expr/analysis/mod.rs +++ b/vortex-array/src/expr/analysis/mod.rs @@ -6,6 +6,7 @@ mod fallible; pub mod immediate_access; mod labeling; mod null_sensitive; +mod referenced_field_paths; pub use annotation::*; pub use fallible::label_is_fallible; @@ -13,3 +14,4 @@ pub use immediate_access::*; pub use labeling::*; pub use null_sensitive::BooleanLabels; pub use null_sensitive::label_null_sensitive; +pub use referenced_field_paths::referenced_field_paths; diff --git a/vortex-array/src/expr/analysis/referenced_field_paths.rs b/vortex-array/src/expr/analysis/referenced_field_paths.rs new file mode 100644 index 00000000000..94444011c98 --- /dev/null +++ b/vortex-array/src/expr/analysis/referenced_field_paths.rs @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::iter::once; + +use vortex_error::VortexResult; +use vortex_error::vortex_err; + +use crate::dtype::DType; +use crate::dtype::Field; +use crate::dtype::FieldPath; +use crate::dtype::FieldPathSet; +use crate::expr::Expression; +use crate::scalar_fn::fns::get_item::GetItem; +use crate::scalar_fn::fns::root::Root; +use crate::scalar_fn::fns::select::Select; + +/// Returns a prefix-minimal set of rooted field paths referenced by an expression. +/// +/// A standalone root expression is represented by [`FieldPath::root`], which conservatively +/// selects all fields. When one referenced path is a prefix of another, only the prefix is returned. +/// Scalar functions other than `GetItem` and `Select` conservatively reference each complete child +/// output. +pub fn referenced_field_paths(expr: &Expression, scope: &DType) -> VortexResult { + // Validate the whole expression so plain GetItem paths and Select paths behave consistently. + expr.return_dtype(scope)?; + + let mut field_paths = FieldPathSet::default(); + collect_referenced_field_paths(expr, scope, &FieldPath::root(), &mut field_paths)?; + Ok(field_paths) +} + +fn collect_referenced_field_paths( + expr: &Expression, + scope: &DType, + requested_path: &FieldPath, + field_paths: &mut FieldPathSet, +) -> VortexResult<()> { + if expr.is::() { + field_paths.insert_prefix(requested_path.clone()); + return Ok(()); + } + + if let Some(field_name) = expr.as_opt::() { + let requested_path = FieldPath::from_iter( + once(Field::Name(field_name.clone())).chain(requested_path.parts().iter().cloned()), + ); + return collect_referenced_field_paths(expr.child(0), scope, &requested_path, field_paths); + } + + if let Some(selection) = expr.as_opt::() { - let child_dtype = expr.child(0).return_dtype(scope)?; - let child_fields = child_dtype - .as_struct_fields_opt() - .ok_or_else(|| vortex_err!("Select child is not a struct"))?; - let included_fields = selection.normalize_to_included_fields(child_fields.names())?; - - if requested_path.is_root() { - for field_name in included_fields { - collect_referenced_field_paths( - expr.child(0), - scope, - &FieldPath::from_name(field_name), - field_paths, - )?; - } - } else if let Some(Field::Name(field_name)) = requested_path.parts().first() - && included_fields + Ok(field_paths) +} + +/// Threads the set of currently-requested field paths down the expression tree, narrowing it at +/// each `GetItem`/`Select`, and records the rooted paths reached at each `Root` leaf. +/// +/// Narrowing the requested path is only sound through `GetItem` (a genuine field access) and +/// `Select` (a genuine column projection). Any other function is opaque—we cannot assume it +/// preserves a field's provenance—so its children conservatively re-request the whole scope. This +/// is what keeps an expression like `f($).x` reading every field of `$` rather than just `x`. +struct ReferencedFieldPaths<'a> { + scope: &'a DType, + field_paths: FieldPathSet, +} + +impl NodeFolderContext for ReferencedFieldPaths<'_> { + type NodeTy = Expression; + type Result = (); + type Context = Vec; + + fn visit_down( + &mut self, + requested: &Self::Context, + node: &Expression, + ) -> VortexResult> { + // A `Root` leaf resolves the requested paths against the actual scope. + if node.is::() { + self.field_paths.extend_prefixes(requested.iter().cloned()); + return Ok(FoldDownContext::Skip(())); + } + + // `GetItem` prepends its field to every requested path before descending into its child. + if let Some(field_name) = node.as_opt::() { + let prefixed = requested .iter() - .any(|included| included == field_name) - { - collect_referenced_field_paths(expr.child(0), scope, requested_path, field_paths)?; + .map(|path| { + FieldPath::from_iter( + once(Field::Name(field_name.clone())).chain(path.parts().iter().cloned()), + ) + }) + .collect(); + return Ok(FoldDownContext::Continue(prefixed)); } - return Ok(()); - } - for child in expr.children().iter() { - collect_referenced_field_paths(child, scope, &FieldPath::root(), field_paths)?; + // `Select` keeps requested paths whose head is included, expanding a whole-scope request + // into one path per included field. + if let Some(selection) = node.as_opt::() { let child_dtype = node.child(0).return_dtype(self.scope)?; let child_fields = child_dtype @@ -118,11 +119,11 @@ impl NodeFolderContext for ReferencedFieldPaths<'_> { .ok_or_else(|| vortex_err!("Select child is not a struct"))?; let included_fields = selection.normalize_to_included_fields(child_fields.names())?; - let mut narrowed = Vec::new(); + let mut narrowed = Vec::with_capacity(requested.len()); for path in requested { if path.is_root() { narrowed.extend(included_fields.iter().cloned().map(FieldPath::from_name)); - } else if let Some(Field::Name(field_name)) = path.parts().first() + } else if let Some(Field::Name(field_name)) = path.parts().last() && included_fields .iter() .any(|included| included == field_name) @@ -155,6 +156,8 @@ impl NodeFolderContext for ReferencedFieldPaths<'_> { #[cfg(test)] mod tests { + use vortex_utils::aliases::hash_set::HashSet; + use super::*; use crate::dtype::Nullability::NonNullable; use crate::dtype::PType::I32; @@ -178,13 +181,20 @@ mod tests { ) } + /// Collects the prefix-minimal field paths referenced by `expr` against [`scope`]. + fn referenced(expr: &Expression) -> VortexResult> { + Ok(referenced_field_paths(expr, &scope())? + .into_iter() + .collect()) + } + #[test] fn nested_select_preserves_field_path() -> VortexResult<()> { let expr = select(["x"], get_item("a", root())); assert_eq!( - referenced_field_paths(&expr, &scope())?, - FieldPathSet::from_iter([FieldPath::from_name("a").push("x")]) + referenced(&expr)?, + HashSet::from_iter([FieldPath::from_name("a").push("x")]) ); Ok(()) } @@ -194,8 +204,8 @@ mod tests { let expr = get_item("x", select(["x", "y"], get_item("a", root()))); assert_eq!( - referenced_field_paths(&expr, &scope())?, - FieldPathSet::from_iter([FieldPath::from_name("a").push("x")]) + referenced(&expr)?, + HashSet::from_iter([FieldPath::from_name("a").push("x")]) ); Ok(()) } @@ -205,8 +215,8 @@ mod tests { let expr = select_exclude(["y"], get_item("a", root())); assert_eq!( - referenced_field_paths(&expr, &scope())?, - FieldPathSet::from_iter([FieldPath::from_name("a").push("x")]) + referenced(&expr)?, + HashSet::from_iter([FieldPath::from_name("a").push("x")]) ); Ok(()) } @@ -222,8 +232,8 @@ mod tests { ); assert_eq!( - referenced_field_paths(&expr, &scope())?, - FieldPathSet::from_iter([FieldPath::from_name("a")]) + referenced(&expr)?, + HashSet::from_iter([FieldPath::from_name("a")]) ); Ok(()) } @@ -234,18 +244,15 @@ mod tests { // as a scope field access, so the wrapped `root()` conservatively references all fields. let expr = get_item("x", pack([("x", root())], NonNullable)); - assert_eq!( - referenced_field_paths(&expr, &scope())?, - FieldPathSet::from_iter([FieldPath::root()]) - ); + assert_eq!(referenced(&expr)?, HashSet::from_iter([FieldPath::root()])); Ok(()) } #[test] fn root_references_all_fields() -> VortexResult<()> { assert_eq!( - referenced_field_paths(&root(), &scope())?, - FieldPathSet::from_iter([FieldPath::root()]) + referenced(&root())?, + HashSet::from_iter([FieldPath::root()]) ); Ok(()) } diff --git a/vortex-layout/src/scan/scan_builder.rs b/vortex-layout/src/scan/scan_builder.rs index 550f67e5017..d916003fc78 100644 --- a/vortex-layout/src/scan/scan_builder.rs +++ b/vortex-layout/src/scan/scan_builder.rs @@ -419,7 +419,7 @@ pub fn referenced_field_masks( let mut field_paths = referenced_field_paths(projection, dtype)?; if let Some(filter) = filter { - field_paths.extend_prefixes(referenced_field_paths(filter, dtype)?); + field_paths.extend(referenced_field_paths(filter, dtype)?); } Ok(field_paths.into_iter().map(FieldMask::Prefix).collect_vec()) }