diff --git a/vortex-array/src/dtype/field.rs b/vortex-array/src/dtype/field.rs index 45ddb70c9a7..84f631b5074 100644 --- a/vortex-array/src/dtype/field.rs +++ b/vortex-array/src/dtype/field.rs @@ -239,18 +239,56 @@ impl Display for FieldPath { } #[derive(Default, Clone, Debug)] -/// Contains a set of field paths, and can answer an efficient field path contains queries. +/// A set of field paths supporting efficient `contains` queries. +/// +/// Paths are stored as inserted. Prefix-minimization—collapsing a path into an ancestor that +/// already covers it—is deferred until the set is iterated via [`IntoIterator`], so insertion stays +/// cheap. pub struct FieldPathSet { - /// While this is currently a set wrapper it can be replaced with a trie. + /// While this is currently a set wrapper it can be replaced with a trie, at which point the + /// deferred minimization in [`IntoIterator`] becomes cheap. // TODO(joe): this can be replaced with a `FieldPath` trie set: HashSet, } impl FieldPathSet { - /// Checks if a set contains a field path + /// Checks if the set contains exactly this field path. pub fn contains(&self, path: &FieldPath) -> bool { self.set.contains(path) } + + /// Iterates over the field paths in the set, as inserted (not prefix-minimized). + pub fn iter(&self) -> impl Iterator { + self.set.iter() + } + + /// Inserts a field path. Prefix-minimization is deferred until the set is iterated. + pub fn insert(&mut self, path: FieldPath) { + self.set.insert(path); + } +} + +/// Reduces field paths to their minimal covering set: any path that has another path in the set as +/// a prefix is redundant and dropped. +fn minimal_covering_set(paths: impl IntoIterator) -> Vec { + let mut covering: Vec = Vec::new(); + for path in paths { + if covering + .iter() + .any(|existing| path.parts().starts_with(existing.parts())) + { + continue; + } + covering.retain(|existing| !existing.parts().starts_with(path.parts())); + covering.push(path); + } + covering +} + +impl Extend for FieldPathSet { + fn extend>(&mut self, iter: T) { + self.set.extend(iter); + } } impl FromIterator for FieldPathSet { @@ -260,6 +298,16 @@ impl FromIterator for FieldPathSet { } } +impl IntoIterator for FieldPathSet { + type Item = FieldPath; + type IntoIter = std::vec::IntoIter; + + /// Iterates the prefix-minimal covering set: redundant descendants are dropped. + fn into_iter(self) -> Self::IntoIter { + minimal_covering_set(self.set).into_iter() + } +} + #[cfg(test)] mod tests { use super::*; @@ -418,4 +466,17 @@ mod tests { assert!(!path1.overlap(&path3)); assert!(!path3.overlap(&path1)); } + + #[test] + fn iteration_yields_minimal_covering_set() { + let mut paths = FieldPathSet::default(); + paths.extend([field_path!(a.b), field_path!(x), field_path!(a)]); + paths.insert(field_path!(a.c)); + + // Iteration collapses `a.b`/`a.c` into the covering `a`. + assert_eq!( + paths.into_iter().collect::>(), + HashSet::from_iter([field_path!(a), field_path!(x)]) + ); + } } diff --git a/vortex-array/src/expr/analysis/mod.rs b/vortex-array/src/expr/analysis/mod.rs index a0b07eb96e6..f5208a31be8 100644 --- a/vortex-array/src/expr/analysis/mod.rs +++ b/vortex-array/src/expr/analysis/mod.rs @@ -6,6 +6,7 @@ mod fallible; pub mod immediate_access; mod labeling; mod null_sensitive; +mod referenced_field_paths; pub use annotation::*; pub use fallible::label_is_fallible; @@ -13,3 +14,4 @@ pub use immediate_access::*; pub use labeling::*; pub use null_sensitive::BooleanLabels; pub use null_sensitive::label_null_sensitive; +pub use referenced_field_paths::referenced_field_paths; diff --git a/vortex-array/src/expr/analysis/referenced_field_paths.rs b/vortex-array/src/expr/analysis/referenced_field_paths.rs new file mode 100644 index 00000000000..5239e4a691c --- /dev/null +++ b/vortex-array/src/expr/analysis/referenced_field_paths.rs @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_error::VortexResult; +use vortex_error::vortex_err; + +use crate::dtype::DType; +use crate::dtype::Field; +use crate::dtype::FieldPath; +use crate::dtype::FieldPathSet; +use crate::expr::Expression; +use crate::expr::traversal::FoldDownContext; +use crate::expr::traversal::FoldUp; +use crate::expr::traversal::NodeExt; +use crate::expr::traversal::NodeFolderContext; +use crate::scalar_fn::fns::get_item::GetItem; +use crate::scalar_fn::fns::root::Root; +use crate::scalar_fn::fns::select::Select; + +/// Returns the rooted field paths referenced by an expression. +/// +/// Iterating the returned set (via [`IntoIterator`]) yields the prefix-minimal covering set: when +/// one referenced path is a prefix of another, only the prefix is kept. A standalone root +/// expression is represented by [`FieldPath::root`], which conservatively selects all fields. +/// Scalar functions other than `GetItem` and `Select` conservatively reference each complete child +/// output. +pub fn referenced_field_paths(expr: &Expression, scope: &DType) -> VortexResult { + // Validate the whole expression so plain GetItem paths and Select paths behave consistently. + expr.return_dtype(scope)?; + + let mut collector = ReferencedFieldPaths { + scope, + field_paths: FieldPathSet::default(), + }; + expr.clone() + .fold_context(&vec![FieldPath::root()], &mut collector)?; + let field_paths = collector.field_paths; + + // The top-level field of every referenced path must be one of the immediately accessed scope + // fields: this analysis only refines *which nested fields* are read, never which top-level + // fields. `FieldPath::root()` stands in for "all fields", so it expands to the whole scope. + #[cfg(debug_assertions)] + if let Some(scope_fields) = scope.as_struct_fields_opt() { + use vortex_utils::aliases::hash_set::HashSet; + + use crate::dtype::FieldName; + use crate::expr::analysis::immediate_access::immediate_scope_access; + + let referenced_heads: HashSet = if field_paths.iter().any(FieldPath::is_root) { + scope_fields.names().iter().cloned().collect() + } else { + field_paths + .iter() + .filter_map(|path| match path.parts().first() { + Some(Field::Name(name)) => Some(name.clone()), + _ => None, + }) + .collect() + }; + debug_assert_eq!( + referenced_heads, + immediate_scope_access(expr, scope_fields), + "referenced field path heads must match the immediately accessed scope fields" + ); + } + + Ok(field_paths) +} + +/// Threads the set of currently-requested field paths down the expression tree, narrowing it at +/// each `GetItem`/`Select`, and records the rooted paths reached at each `Root` leaf. +/// +/// Paths are carried reversed so a `GetItem` can `push` its field instead of prepending it; they +/// are reversed back to rooted order when recorded at a `Root`, and `Select` reads a path's head +/// from its last element. +/// +/// Narrowing is only sound through `GetItem` (a genuine field access) and `Select` (a genuine +/// column projection). Any other function is opaque—we cannot assume it preserves a field's +/// provenance—so its children conservatively re-request the whole scope, which is what keeps an +/// expression like `f($).x` reading every field of `$` rather than just `x`. +struct ReferencedFieldPaths<'a> { + scope: &'a DType, + field_paths: FieldPathSet, +} + +impl NodeFolderContext for ReferencedFieldPaths<'_> { + type NodeTy = Expression; + type Result = (); + type Context = Vec; + + fn visit_down( + &mut self, + requested: &Self::Context, + node: &Expression, + ) -> VortexResult> { + if node.is::() { + self.field_paths.extend( + requested + .iter() + .map(|path| FieldPath::from_iter(path.parts().iter().rev().cloned())), + ); + return Ok(FoldDownContext::Skip(())); + } + + if let Some(field_name) = node.as_opt::() { + let appended = requested + .iter() + .map(|path| path.clone().push(Field::Name(field_name.clone()))) + .collect(); + return Ok(FoldDownContext::Continue(appended)); + } + + // Keep requested paths whose head is included, expanding a whole-scope request into one + // path per included field. + if let Some(selection) = node.as_opt::