From a9fd8aa25c03473201c72f7f862225444a447cb1 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Thu, 11 Jun 2026 20:29:32 -0500 Subject: [PATCH] fix(rg): honor include filters for hidden files --- crates/bashkit-python/src/lib.rs | 124 +++++++++----- .../tests/_bashkit_categories.py | 21 +++ crates/bashkit-python/tests/test_vfs.py | 2 + crates/bashkit/src/builtins/rg/mod.rs | 159 +++++++++++++++++- crates/bashkit/src/interpreter/mod.rs | 42 ++++- 5 files changed, 294 insertions(+), 54 deletions(-) diff --git a/crates/bashkit-python/src/lib.rs b/crates/bashkit-python/src/lib.rs index f43384c42..72b63e9b8 100644 --- a/crates/bashkit-python/src/lib.rs +++ b/crates/bashkit-python/src/lib.rs @@ -55,7 +55,7 @@ use pyo3_async_runtimes::tokio::future_into_py; type CallerLoopLocals = TaskLocals; #[cfg(target_arch = "wasm32")] type CallerLoopLocals = std::convert::Infallible; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::future::Future; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -1164,6 +1164,9 @@ fn normalize_find_path(path: &str) -> String { } const GLOB_MAX_RESULTS: usize = 10_000; +// Direct VFS glob runs outside Bash::exec, so it needs its own traversal budget +// and timeout to bound no-match walks over large RealFs mounts. +const GLOB_MAX_VISITED_ENTRIES: usize = 10_000; fn glob_search_root(pattern: &str) -> String { let wildcard_idx = pattern.find(['*', '?', '[']); @@ -1181,58 +1184,87 @@ fn glob_search_root(pattern: &str) -> String { } } -fn glob_via_bash(rt: &Arc, inner: &Arc>, pattern: String) -> Vec { +fn glob_timeout(timeout_seconds: Option) -> Duration { + timeout_seconds + .map(Duration::from_secs_f64) + .unwrap_or_else(|| ExecutionLimits::default().timeout) +} + +fn glob_via_bash( + rt: &Arc, + inner: &Arc>, + pattern: String, + timeout: Duration, +) -> Vec { if !is_safe_glob_pattern(&pattern) { return Vec::new(); } - with_live_fs(rt, inner, move |fs| async move { - let root = normalize_find_path(&glob_search_root(&pattern)); - let root_path = Path::new(&root); - let mut matches = Vec::new(); - let mut stack = vec![root.clone()]; - - // If the root resolves to a single file (no wildcards in pattern), - // just match against it and return. Otherwise fall through to walk - // the directory tree rooted at `root`. - if let Ok(metadata) = fs.stat(root_path).await - && metadata.file_type == FsFileType::File - { - if glob_match_path(&root, &pattern) { - matches.push(root); - } - return Ok(matches); - } + let inner = inner.clone(); + rt.block_on(async move { + let fs = { + let bash = inner.lock().await; + bash.fs() + }; - while let Some(dir) = stack.pop() { - let entries = match fs.read_dir(Path::new(&dir)).await { - Ok(entries) => entries, - Err(_) => continue, - }; + tokio::time::timeout(timeout, async move { + let root = normalize_find_path(&glob_search_root(&pattern)); + let root_path = Path::new(&root); + let mut matches = Vec::new(); + let mut stack = vec![root.clone()]; + let mut visited_dirs = HashSet::from([root.clone()]); + let mut visited_entries = 0usize; + + // If the root resolves to a single file (no wildcards in pattern), + // just match against it and return. Otherwise fall through to walk + // the directory tree rooted at `root`. + if let Ok(metadata) = fs.stat(root_path).await + && metadata.file_type == FsFileType::File + { + if glob_match_path(&root, &pattern) { + matches.push(root); + } + return matches; + } - for entry in entries { - let child = if dir == "/" { - format!("/{}", entry.name) - } else { - format!("{}/{}", dir, entry.name) + while let Some(dir) = stack.pop() { + let entries = match fs.read_dir(Path::new(&dir)).await { + Ok(entries) => entries, + Err(_) => continue, }; - match entry.metadata.file_type { - FsFileType::Directory => stack.push(child), - FsFileType::File if glob_match_path(&child, &pattern) => { - matches.push(child); - if matches.len() >= GLOB_MAX_RESULTS { - return Ok(matches); + for entry in entries { + visited_entries += 1; + if visited_entries > GLOB_MAX_VISITED_ENTRIES { + return matches; + } + + let child = if dir == "/" { + format!("/{}", entry.name) + } else { + format!("{}/{}", dir, entry.name) + }; + + match entry.metadata.file_type { + FsFileType::Directory if visited_dirs.insert(child.clone()) => { + stack.push(child); } + FsFileType::File if glob_match_path(&child, &pattern) => { + matches.push(child); + if matches.len() >= GLOB_MAX_RESULTS { + return matches; + } + } + _ => {} } - _ => {} } } - } - Ok(matches) + matches + }) + .await + .unwrap_or_default() }) - .unwrap_or_default() } // Decision: snapshot factories build with caller kwargs first, then restore @@ -3775,7 +3807,12 @@ impl PyBash { fn glob(&self, py: Python<'_>, pattern: String) -> PyResult> { self.reject_external_handler_reentry()?; let matches = py.detach(|| -> PyResult> { - Ok(glob_via_bash(&self.rt, &self.inner, pattern)) + Ok(glob_via_bash( + &self.rt, + &self.inner, + pattern, + glob_timeout(self.timeout_seconds), + )) })?; Ok(PyList::new(py, matches)?.into_any().unbind()) } @@ -4423,7 +4460,12 @@ impl BashTool { fn glob(&self, py: Python<'_>, pattern: String) -> PyResult> { let matches = py.detach(|| -> PyResult> { - Ok(glob_via_bash(&self.rt, &self.inner, pattern)) + Ok(glob_via_bash( + &self.rt, + &self.inner, + pattern, + glob_timeout(self.timeout_seconds), + )) })?; Ok(PyList::new(py, matches)?.into_any().unbind()) } diff --git a/crates/bashkit-python/tests/_bashkit_categories.py b/crates/bashkit-python/tests/_bashkit_categories.py index 2370080e3..064f40a39 100644 --- a/crates/bashkit-python/tests/_bashkit_categories.py +++ b/crates/bashkit-python/tests/_bashkit_categories.py @@ -455,6 +455,27 @@ def test_bash_direct_vfs_glob_limits_result_count(): assert len(matches) == 9_000 +def test_bash_direct_vfs_glob_bounds_realfs_traversal(tmp_path): + # Security regression: direct glob over RealFs must stop by work budget + # instead of walking every host entry outside Bash execution limits. + for idx in range(10_050): + (tmp_path / f"dir_{idx:05}").mkdir() + (tmp_path / "zz_after_budget.txt").write_text("late") + + bash = Bash(mounts=[{"host_path": str(tmp_path), "vfs_path": "/host"}]) + + assert bash.glob("/host/z*_after_budget.txt") == [] + + +def test_bash_direct_vfs_glob_honors_timeout(tmp_path): + for idx in range(512): + (tmp_path / f"dir_{idx:05}").mkdir() + + bash = Bash(mounts=[{"host_path": str(tmp_path), "vfs_path": "/host"}], timeout_seconds=0) + + assert bash.glob("/host/*/missing.txt") == [] + + # -- Bash: FS / mount error cases ------------------------------------------ diff --git a/crates/bashkit-python/tests/test_vfs.py b/crates/bashkit-python/tests/test_vfs.py index c7518d375..a56d0c467 100644 --- a/crates/bashkit-python/tests/test_vfs.py +++ b/crates/bashkit-python/tests/test_vfs.py @@ -23,6 +23,8 @@ "test_bash_direct_vfs_methods_raise_on_missing_paths_and_non_utf8", "test_bash_direct_vfs_methods_track_shell_changes_and_reset", "test_bash_direct_vfs_glob_limits_result_count", + "test_bash_direct_vfs_glob_bounds_realfs_traversal", + "test_bash_direct_vfs_glob_honors_timeout", "test_bash_unmount_nonexistent_raises", "test_bash_mounts_missing_host_path_raises", "test_bash_mounts_invalid_entry_raises", diff --git a/crates/bashkit/src/builtins/rg/mod.rs b/crates/bashkit/src/builtins/rg/mod.rs index 69a55ea53..c8c5aa4f9 100644 --- a/crates/bashkit/src/builtins/rg/mod.rs +++ b/crates/bashkit/src/builtins/rg/mod.rs @@ -1481,6 +1481,22 @@ impl RgOptions { matched.unwrap_or(!has_include) } + fn has_matching_positive_glob(&self, path: &Path, cwd: &Path) -> bool { + self.glob_rules + .iter() + .any(|rule| rule.include && rule.matches(path, cwd)) + } + + fn has_matching_type_include(&self, path: &Path) -> bool { + self.type_includes + .iter() + .any(|file_type| file_type.matches(path)) + } + + fn explicitly_includes_hidden_file(&self, path: &Path, cwd: &Path) -> bool { + self.has_matching_positive_glob(path, cwd) || self.has_matching_type_include(path) + } + fn matches_type_filters(&self, path: &Path) -> bool { if !self.type_includes.is_empty() && !self @@ -3590,11 +3606,9 @@ async fn collect_rg_files_recursive( }; if let Ok(entries) = fs.read_dir(&item.actual).await { for entry in entries { - if !opts.hidden && is_hidden_name(&entry.name) { - continue; - } let path = item.logical.join(&entry.name); let actual_path = item.actual.join(&entry.name); + let hidden_entry = !opts.hidden && is_hidden_name(&entry.name); let entry_depth = item.depth + 1; let (entry_actual_path, entry_metadata) = if entry.metadata.file_type.is_symlink() && opts.follow_symlinks { @@ -3609,7 +3623,7 @@ async fn collect_rg_files_recursive( }; if entry_metadata.file_type.is_dir() { - if opts.is_ignored_by_rules(&path, true, &rules) { + if hidden_entry || opts.is_ignored_by_rules(&path, true, &rules) { continue; } if opts @@ -3637,6 +3651,7 @@ async fn collect_rg_files_recursive( && !opts.is_ignored_by_rules(&path, false, &rules) && opts.matches_globs(&path, cwd) && opts.matches_type_filters(&path) + && (!hidden_entry || opts.explicitly_includes_hidden_file(&path, cwd)) { result.push(RgFileCandidate { logical: path, @@ -3870,8 +3885,7 @@ async fn try_indexed_search( || !seen_paths.insert(candidate.clone()) || (!explicit_file_match && !opts.matches_globs(&candidate, cwd)) || (!explicit_file_match - && !opts.hidden - && path_has_hidden_component_relative_to(&candidate, &root)) + && !path_allowed_by_hidden_filter(&candidate, &root, opts, cwd)) { continue; } @@ -3887,11 +3901,30 @@ async fn try_indexed_search( Some(inputs) } -fn path_has_hidden_component_relative_to(path: &Path, root: &Path) -> bool { - path.strip_prefix(root) +fn path_allowed_by_hidden_filter(path: &Path, root: &Path, opts: &RgOptions, cwd: &Path) -> bool { + // Match ripgrep: positive globs unhide hidden files, not hidden directories. + if opts.hidden { + return true; + } + + let components: Vec<_> = path + .strip_prefix(root) .unwrap_or(path) .components() - .any(|component| component.as_os_str().to_str().is_some_and(is_hidden_name)) + .collect(); + for (idx, component) in components.iter().enumerate() { + let Some(name) = component.as_os_str().to_str() else { + continue; + }; + if !is_hidden_name(name) { + continue; + } + if idx + 1 == components.len() { + return opts.explicitly_includes_hidden_file(path, cwd); + } + return false; + } + true } struct RgPrefix<'a> { @@ -13584,6 +13617,54 @@ mod tests { assert!(!result.stdout.contains("vendor")); } + #[tokio::test] + async fn test_rg_positive_glob_includes_hidden_files_not_hidden_dirs() { + let result = run_rg_with_cwd( + &["-g", "*.rs", "needle", "."], + None, + &[ + ("/proj/.lib.rs", b"needle\n"), + ("/proj/lib.rs", b"needle\n"), + ("/proj/visible/.mod.rs", b"needle\n"), + ("/proj/.hidden_dir/lib.rs", b"needle\n"), + ("/proj/readme.md", b"needle\n"), + ], + "/proj", + ) + .await; + + assert_eq!(result.exit_code, 0); + assert!(result.stdout.contains("./.lib.rs:needle")); + assert!(result.stdout.contains("./lib.rs:needle")); + assert!(result.stdout.contains("./visible/.mod.rs:needle")); + assert!(!result.stdout.contains(".hidden_dir")); + assert!(!result.stdout.contains("readme.md")); + } + + #[tokio::test] + async fn test_rg_type_include_includes_hidden_files_not_hidden_dirs() { + let result = run_rg_with_cwd( + &["-t", "rust", "needle", "."], + None, + &[ + ("/proj/.lib.rs", b"needle\n"), + ("/proj/lib.rs", b"needle\n"), + ("/proj/visible/.mod.rs", b"needle\n"), + ("/proj/.hidden_dir/lib.rs", b"needle\n"), + ("/proj/readme.md", b"needle\n"), + ], + "/proj", + ) + .await; + + assert_eq!(result.exit_code, 0); + assert!(result.stdout.contains("./.lib.rs:needle")); + assert!(result.stdout.contains("./lib.rs:needle")); + assert!(result.stdout.contains("./visible/.mod.rs:needle")); + assert!(!result.stdout.contains(".hidden_dir")); + assert!(!result.stdout.contains("readme.md")); + } + #[tokio::test] async fn test_rg_replace_caps_amplified_output() { // A line with many matches × a giant replacement string would otherwise @@ -13871,6 +13952,66 @@ mod tests { assert_eq!(result.stdout, ""); } + #[tokio::test] + async fn test_rg_indexed_positive_glob_includes_hidden_files_not_hidden_dirs() { + let inner = InMemoryFs::new(); + inner.mkdir(Path::new("/safe/visible"), true).await.unwrap(); + inner + .mkdir(Path::new("/safe/.hidden_dir"), true) + .await + .unwrap(); + inner + .write_file(Path::new("/safe/.lib.rs"), b"needle\n") + .await + .unwrap(); + inner + .write_file(Path::new("/safe/lib.rs"), b"needle\n") + .await + .unwrap(); + inner + .write_file(Path::new("/safe/visible/.mod.rs"), b"needle\n") + .await + .unwrap(); + inner + .write_file(Path::new("/safe/.hidden_dir/lib.rs"), b"needle\n") + .await + .unwrap(); + + let fs = Arc::new(IndexedTestFs { + inner, + matches: vec![ + SearchMatch { + path: PathBuf::from("/safe/.lib.rs"), + line_number: 1, + line_content: "needle".to_string(), + }, + SearchMatch { + path: PathBuf::from("/safe/lib.rs"), + line_number: 1, + line_content: "needle".to_string(), + }, + SearchMatch { + path: PathBuf::from("/safe/visible/.mod.rs"), + line_number: 1, + line_content: "needle".to_string(), + }, + SearchMatch { + path: PathBuf::from("/safe/.hidden_dir/lib.rs"), + line_number: 1, + line_content: "needle".to_string(), + }, + ], + }); + + let result = + run_rg_with_fs(&["--no-ignore", "-g", "*.rs", "needle", "/safe"], None, fs).await; + assert_eq!(result.exit_code, 0); + assert!(result.stdout.contains("/safe/.lib.rs:needle")); + assert!(result.stdout.contains("/safe/lib.rs:needle")); + assert!(result.stdout.contains("/safe/visible/.mod.rs:needle")); + assert!(!result.stdout.contains(".hidden_dir")); + } + #[tokio::test] async fn test_rg_crlf_skips_indexed_prefilter_and_falls_back_to_linear_scan() { let inner = InMemoryFs::new(); diff --git a/crates/bashkit/src/interpreter/mod.rs b/crates/bashkit/src/interpreter/mod.rs index aa08e18cb..6a943969c 100644 --- a/crates/bashkit/src/interpreter/mod.rs +++ b/crates/bashkit/src/interpreter/mod.rs @@ -7317,6 +7317,7 @@ impl Interpreter { let baseline_call_stack_len = self.call_stack.len(); let baseline_bash_source_len = self.bash_source_stack.len(); + let baseline_function_depth = self.counters.function_depth; let baseline_pipeline_stdin = self.pipeline_stdin.clone(); let exec_future = self.execute_command(&inner_cmd); match timeout(duration, exec_future).await { @@ -7326,6 +7327,7 @@ impl Interpreter { self.reconcile_cancelled_execution_state( baseline_call_stack_len, baseline_bash_source_len, + baseline_function_depth, baseline_pipeline_stdin, ); // Timeout expired. @@ -7444,6 +7446,7 @@ impl Interpreter { &mut self, baseline_call_stack_len: usize, baseline_bash_source_len: usize, + baseline_function_depth: usize, baseline_pipeline_stdin: Option, ) { let leaked_call_frames = self @@ -7463,10 +7466,8 @@ impl Interpreter { self.update_bash_source(); } - for _ in 0..leaked_call_frames.max(leaked_bash_source_entries) { - self.counters.pop_function(); - } - + // Some cancellable paths push call frames or BASH_SOURCE without pushing function depth. + self.counters.function_depth = baseline_function_depth; self.pipeline_stdin = baseline_pipeline_stdin; if self.call_stack.is_empty() { @@ -11598,6 +11599,39 @@ mod tests { assert_eq!(result.stdout, ""); } + #[test] + fn test_cancelled_shell_frame_does_not_pop_function_depth() { + let fs: Arc = Arc::new(InMemoryFs::new()); + let mut interp = Interpreter::new(Arc::clone(&fs)); + interp.counters.function_depth = 1; + interp.call_stack.push(CallFrame { + name: "caller".to_string(), + locals: HashMap::new(), + positional: Vec::new(), + }); + let baseline_call_stack_len = interp.call_stack.len(); + let baseline_bash_source_len = interp.bash_source_stack.len(); + let baseline_function_depth = interp.counters.function_depth; + + interp.call_stack.push(CallFrame { + name: "bash".to_string(), + locals: HashMap::new(), + positional: Vec::new(), + }); + interp.bash_source_stack.push("script.sh".to_string()); + + interp.reconcile_cancelled_execution_state( + baseline_call_stack_len, + baseline_bash_source_len, + baseline_function_depth, + None, + ); + + assert_eq!(interp.call_stack.len(), baseline_call_stack_len); + assert_eq!(interp.bash_source_stack.len(), baseline_bash_source_len); + assert_eq!(interp.counters.function_depth, baseline_function_depth); + } + /// Test that parse_duration preserves subsecond precision #[test] fn test_parse_timeout_duration_subsecond() {