From 58ebf00702aeb4ae73c5764967961a22c800bfc5 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Thu, 11 Jun 2026 20:28:35 -0500 Subject: [PATCH] fix(interpreter): bound IFS word splitting --- crates/bashkit/src/interpreter/mod.rs | 226 ++++++++++++++++++-------- crates/bashkit/src/limits.rs | 40 ++++- 2 files changed, 192 insertions(+), 74 deletions(-) diff --git a/crates/bashkit/src/interpreter/mod.rs b/crates/bashkit/src/interpreter/mod.rs index 7c261bbb..58a6c054 100644 --- a/crates/bashkit/src/interpreter/mod.rs +++ b/crates/bashkit/src/interpreter/mod.rs @@ -33,6 +33,53 @@ static PROC_SUB_COUNTER: AtomicU64 = AtomicU64::new(0); const COMPAT_BASH_VERSION: &str = "5.2.15(1)-release"; const COMPAT_BASH_VERSINFO: [&str; 6] = ["5", "2", "15", "1", "release", "virtual"]; +// Important decision: IFS is script-controlled, so split membership must be O(1) +// and field materialization must be bounded before loops/commands consume args. +struct IfsClassifier { + ascii: [bool; 128], + non_ascii: HashSet, + all_whitespace_ifs: bool, +} + +impl IfsClassifier { + fn new(ifs: &str) -> Self { + let mut ascii = [false; 128]; + let mut non_ascii = HashSet::new(); + let mut all_whitespace_ifs = true; + for c in ifs.chars() { + if !matches!(c, ' ' | '\t' | '\n') { + all_whitespace_ifs = false; + } + if c.is_ascii() { + ascii[c as usize] = true; + } else { + non_ascii.insert(c); + } + } + Self { + ascii, + non_ascii, + all_whitespace_ifs, + } + } + + fn is_ifs(&self, c: char) -> bool { + if c.is_ascii() { + self.ascii[c as usize] + } else { + self.non_ascii.contains(&c) + } + } + + fn is_ifs_ws(&self, c: char) -> bool { + matches!(c, ' ' | '\t' | '\n') && self.is_ifs(c) + } + + fn is_ifs_nws(&self, c: char) -> bool { + self.is_ifs(c) && !matches!(c, ' ' | '\t' | '\n') + } +} + use futures_util::FutureExt; use crate::builtins::{self, Builtin}; @@ -4567,7 +4614,7 @@ impl Interpreter { break; } let expanded = self.expand_word(word).await?; - for field in self.ifs_split_limited(&expanded, remaining) { + for field in self.ifs_split_limited(&expanded, remaining)? { next_arr.insert(idx, field); idx += 1; } @@ -8476,7 +8523,7 @@ impl Interpreter { // $@ unquoted: each param is subject to further IFS splitting let mut fields = Vec::new(); for p in &positional { - fields.extend(self.ifs_split(p)); + fields.extend(self.ifs_split(p)?); } return Ok(fields); } @@ -8502,7 +8549,7 @@ impl Interpreter { // $* unquoted: each param is subject to IFS splitting let mut fields = Vec::new(); for p in &positional { - fields.extend(self.ifs_split(p)); + fields.extend(self.ifs_split(p)?); } return Ok(fields); } @@ -8579,7 +8626,7 @@ impl Interpreter { }); if has_expansion { - Ok(self.ifs_split(&expanded)) + self.ifs_split(&expanded) } else { Ok(vec![expanded]) } @@ -8705,14 +8752,14 @@ impl Interpreter { /// an empty field between them. /// - `` = single delimiter (ws absorbed into the nws delimiter). /// - Empty IFS → no splitting. Unset IFS → default " \t\n". - fn ifs_split(&self, s: &str) -> Vec { - self.ifs_split_limited(s, usize::MAX) + fn ifs_split(&self, s: &str) -> Result> { + self.ifs_split_limited(s, self.limits.max_word_split_fields) } - /// Split a string on IFS characters, returning at most `limit` fields. - fn ifs_split_limited(&self, s: &str, limit: usize) -> Vec { + /// Split a string on IFS characters, returning an error if resource caps are exceeded. + fn ifs_split_limited(&self, s: &str, limit: usize) -> Result> { if limit == 0 { - return Vec::new(); + return Ok(Vec::new()); } let ifs = self @@ -8722,93 +8769,100 @@ impl Interpreter { .unwrap_or_else(|| " \t\n".to_string()); if ifs.is_empty() { - return vec![s.to_string()]; + return self.push_ifs_field(Vec::new(), s.to_string(), limit, s.len()); } - let is_ifs = |c: char| ifs.contains(c); - let is_ifs_ws = |c: char| ifs.contains(c) && " \t\n".contains(c); - let is_ifs_nws = |c: char| ifs.contains(c) && !" \t\n".contains(c); - let all_whitespace_ifs = ifs.chars().all(|c| " \t\n".contains(c)); + let classifier = IfsClassifier::new(&ifs); - if all_whitespace_ifs { - // IFS is only whitespace: split on runs, elide empties - return s - .split(|c: char| is_ifs(c)) + if classifier.all_whitespace_ifs { + let mut fields = Vec::new(); + let mut bytes = 0usize; + for field in s + .split(|c: char| classifier.is_ifs(c)) .filter(|f| !f.is_empty()) - .take(limit) - .map(|f| f.to_string()) - .collect(); + { + bytes = bytes.saturating_add(field.len()); + fields = self.push_ifs_field(fields, field.to_string(), limit, bytes)?; + } + return Ok(fields); } - // Mixed or pure non-whitespace IFS. let mut fields: Vec = Vec::new(); let mut current = String::new(); - let chars: Vec = s.chars().collect(); - let mut i = 0; + let mut bytes = 0usize; + let mut chars = s.chars().peekable(); - // Skip leading IFS whitespace - while i < chars.len() && is_ifs_ws(chars[i]) { - i += 1; + while chars.peek().is_some_and(|c| classifier.is_ifs_ws(*c)) { + chars.next(); } - // Leading non-whitespace IFS produces an empty first field - if i < chars.len() && is_ifs_nws(chars[i]) { - fields.push(String::new()); - if fields.len() >= limit { - return fields; - } - i += 1; - while i < chars.len() && is_ifs_ws(chars[i]) { - i += 1; + if chars.peek().is_some_and(|c| classifier.is_ifs_nws(*c)) { + fields = self.push_ifs_field(fields, String::new(), limit, bytes)?; + chars.next(); + while chars.peek().is_some_and(|c| classifier.is_ifs_ws(*c)) { + chars.next(); } } - while i < chars.len() { - let c = chars[i]; - if is_ifs_nws(c) { - // Non-whitespace IFS delimiter: finalize current field - fields.push(std::mem::take(&mut current)); - if fields.len() >= limit { - return fields; - } - i += 1; - // Consume trailing IFS whitespace - while i < chars.len() && is_ifs_ws(chars[i]) { - i += 1; + while let Some(c) = chars.next() { + if classifier.is_ifs_nws(c) { + let field = std::mem::take(&mut current); + bytes = bytes.saturating_add(field.len()); + fields = self.push_ifs_field(fields, field, limit, bytes)?; + while chars.peek().is_some_and(|c| classifier.is_ifs_ws(*c)) { + chars.next(); } - } else if is_ifs_ws(c) { - // IFS whitespace: skip it, then check for non-ws delimiter - while i < chars.len() && is_ifs_ws(chars[i]) { - i += 1; + } else if classifier.is_ifs_ws(c) { + while chars.peek().is_some_and(|c| classifier.is_ifs_ws(*c)) { + chars.next(); } - if i < chars.len() && is_ifs_nws(chars[i]) { - // = single delimiter. Push current field. - fields.push(std::mem::take(&mut current)); - if fields.len() >= limit { - return fields; - } - i += 1; // consume the nws char - while i < chars.len() && is_ifs_ws(chars[i]) { - i += 1; - } - } else if i < chars.len() { - // ws alone as delimiter (no nws follows) - fields.push(std::mem::take(&mut current)); - if fields.len() >= limit { - return fields; + if chars.peek().is_some_and(|c| classifier.is_ifs_nws(*c)) { + let field = std::mem::take(&mut current); + bytes = bytes.saturating_add(field.len()); + fields = self.push_ifs_field(fields, field, limit, bytes)?; + chars.next(); + while chars.peek().is_some_and(|c| classifier.is_ifs_ws(*c)) { + chars.next(); } + } else if chars.peek().is_some() { + let field = std::mem::take(&mut current); + bytes = bytes.saturating_add(field.len()); + fields = self.push_ifs_field(fields, field, limit, bytes)?; } - // trailing ws at end → ignore (don't push empty field) } else { current.push(c); - i += 1; } } - if !current.is_empty() && fields.len() < limit { - fields.push(current); + if !current.is_empty() { + bytes = bytes.saturating_add(current.len()); + fields = self.push_ifs_field(fields, current, limit, bytes)?; } - fields + Ok(fields) + } + + fn push_ifs_field( + &self, + mut fields: Vec, + field: String, + limit: usize, + bytes: usize, + ) -> Result> { + if fields.len() >= limit { + return Err(crate::limits::LimitExceeded::Memory(format!( + "word split field limit ({limit}) exceeded" + )) + .into()); + } + if bytes > self.limits.max_word_split_bytes { + return Err(crate::limits::LimitExceeded::Memory(format!( + "word split byte limit ({}) exceeded", + self.limits.max_word_split_bytes + )) + .into()); + } + fields.push(field); + Ok(fields) } /// Expand an operand string from a parameter expansion (sync, lazy). @@ -11640,6 +11694,34 @@ mod tests { interp.execute(&ast).await.unwrap() } + #[tokio::test] + async fn test_ifs_split_field_limit_rejects_exploding_command_substitution() { + let fs: Arc = Arc::new(InMemoryFs::new()); + let mut interp = Interpreter::new(Arc::clone(&fs)); + interp.set_limits(ExecutionLimits::default().max_word_split_fields(3)); + let parser = Parser::new("IFS=,; for x in $(echo a,b,c,d); do :; done"); + let ast = parser.parse().unwrap(); + let err = interp.execute(&ast).await.unwrap_err(); + assert!( + err.to_string() + .contains("word split field limit (3) exceeded") + ); + } + + #[tokio::test] + async fn test_ifs_split_byte_limit_rejects_large_materialized_field() { + let fs: Arc = Arc::new(InMemoryFs::new()); + let mut interp = Interpreter::new(Arc::clone(&fs)); + interp.set_limits(ExecutionLimits::default().max_word_split_bytes(5)); + let parser = Parser::new("v=abcdef; echo $v"); + let ast = parser.parse().unwrap(); + let err = interp.execute(&ast).await.unwrap_err(); + assert!( + err.to_string() + .contains("word split byte limit (5) exceeded") + ); + } + #[tokio::test] async fn test_colon_null_utility() { // POSIX : (colon) - null utility, should return success diff --git a/crates/bashkit/src/limits.rs b/crates/bashkit/src/limits.rs index 07ce732a..a10257d1 100644 --- a/crates/bashkit/src/limits.rs +++ b/crates/bashkit/src/limits.rs @@ -98,6 +98,14 @@ pub struct ExecutionLimits { /// Default: 1024 pub max_file_descriptors: usize, + /// Maximum fields produced by one IFS word-splitting operation. + /// Default: 100,000 + pub max_word_split_fields: usize, + + /// Maximum total bytes copied into fields by one IFS word-splitting operation. + /// Default: 10MB (10,000,000 bytes) + pub max_word_split_bytes: usize, + /// Whether to capture the final environment state in ExecResult. /// Default: false (opt-in to avoid cloning cost when not needed) pub capture_final_env: bool, @@ -119,6 +127,8 @@ impl Default for ExecutionLimits { max_stderr_bytes: 1_048_576, // 1MB max_subst_depth: 32, max_file_descriptors: 1024, + max_word_split_fields: 100_000, + max_word_split_bytes: 10_000_000, capture_final_env: false, } } @@ -262,6 +272,24 @@ impl ExecutionLimits { self } + /// Set maximum fields produced by one IFS word-splitting operation. + /// Passing 0 is treated as "use default" (no-op) to prevent misconfiguration. + pub fn max_word_split_fields(mut self, count: usize) -> Self { + if count > 0 { + self.max_word_split_fields = count; + } + self + } + + /// Set maximum total bytes copied by one IFS word-splitting operation. + /// Passing 0 is treated as "use default" (no-op) to prevent misconfiguration. + pub fn max_word_split_bytes(mut self, bytes: usize) -> Self { + if bytes > 0 { + self.max_word_split_bytes = bytes; + } + self + } + /// Enable capturing final environment state in ExecResult pub fn capture_final_env(mut self, capture: bool) -> Self { self.capture_final_env = capture; @@ -1084,7 +1112,9 @@ mod tests { .max_stdout_bytes(0) .max_stderr_bytes(0) .max_subst_depth(0) - .max_file_descriptors(0); + .max_file_descriptors(0) + .max_word_split_fields(0) + .max_word_split_bytes(0); assert_eq!(limits.max_commands, defaults.max_commands); assert_eq!(limits.max_loop_iterations, defaults.max_loop_iterations); @@ -1100,6 +1130,8 @@ mod tests { assert_eq!(limits.max_stderr_bytes, defaults.max_stderr_bytes); assert_eq!(limits.max_subst_depth, defaults.max_subst_depth); assert_eq!(limits.max_file_descriptors, defaults.max_file_descriptors); + assert_eq!(limits.max_word_split_fields, defaults.max_word_split_fields); + assert_eq!(limits.max_word_split_bytes, defaults.max_word_split_bytes); } #[test] @@ -1115,7 +1147,9 @@ mod tests { .max_stdout_bytes(2048) .max_stderr_bytes(4096) .max_subst_depth(8) - .max_file_descriptors(16); + .max_file_descriptors(16) + .max_word_split_fields(17) + .max_word_split_bytes(18); assert_eq!(limits.max_commands, 5); assert_eq!(limits.max_loop_iterations, 7); @@ -1128,6 +1162,8 @@ mod tests { assert_eq!(limits.max_stderr_bytes, 4096); assert_eq!(limits.max_subst_depth, 8); assert_eq!(limits.max_file_descriptors, 16); + assert_eq!(limits.max_word_split_fields, 17); + assert_eq!(limits.max_word_split_bytes, 18); } #[test]