diff --git a/architecture/sandbox.md b/architecture/sandbox.md index c7e789cae..fcef69b4f 100644 --- a/architecture/sandbox.md +++ b/architecture/sandbox.md @@ -32,6 +32,7 @@ All paths are relative to `crates/openshell-sandbox/src/`. | `l7/tls.rs` | Ephemeral CA generation (`SandboxCa`), per-hostname leaf cert cache (`CertCache`), TLS termination/connection helpers, `looks_like_tls()` auto-detection | | `l7/relay.rs` | Protocol-aware bidirectional relay with per-request OPA evaluation, credential-injection-only passthrough relay | | `l7/rest.rs` | HTTP/1.1 request/response parsing, body framing (Content-Length, chunked), deny response generation | +| `l7/path.rs` | Request-target canonicalization: percent-decoding, dot-segment resolution, `;params` stripping, encoded-slash policy (opt-in per endpoint via `allow_encoded_slash: true` for upstreams like GitLab that embed `%2F` in paths). Single source of truth for the path both OPA evaluates and the upstream receives. | | `l7/provider.rs` | `L7Provider` trait and `L7Request`/`BodyLength` types | | `secrets.rs` | `SecretResolver` credential placeholder system — placeholder generation, multi-location rewriting (headers, query params, path segments, Basic auth), fail-closed scanning, secret validation, percent-encoding | diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index e3c26061a..2848ecc92 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -102,6 +102,12 @@ struct NetworkEndpointDef { allowed_ips: Vec, #[serde(default, skip_serializing_if = "Vec::is_empty")] deny_rules: Vec, + /// When true, percent-encoded `/` (`%2F`) is preserved in path segments + /// rather than rejected by the L7 path canonicalizer. Required for + /// upstreams like GitLab that embed `%2F` in namespaced resource paths. + /// Defaults to false (strict). + #[serde(default, skip_serializing_if = "std::ops::Not::not")] + allow_encoded_slash: bool, } fn is_zero(v: &u16) -> bool { @@ -254,6 +260,7 @@ fn to_proto(raw: PolicyFile) -> SandboxPolicy { .collect(), }) .collect(), + allow_encoded_slash: e.allow_encoded_slash, } }) .collect(), @@ -393,6 +400,7 @@ fn from_proto(policy: &SandboxPolicy) -> PolicyFile { .collect(), }) .collect(), + allow_encoded_slash: e.allow_encoded_slash, } }) .collect(), diff --git a/crates/openshell-sandbox/data/sandbox-policy.rego b/crates/openshell-sandbox/data/sandbox-policy.rego index 939c2e5cb..a1a980a9e 100644 --- a/crates/openshell-sandbox/data/sandbox-policy.rego +++ b/crates/openshell-sandbox/data/sandbox-policy.rego @@ -328,6 +328,13 @@ method_matches(actual, expected) if { } # Path matching: "**" matches everything; otherwise glob.match with "/" delimiter. +# +# INVARIANT: `input.request.path` is canonicalized by the sandbox before +# policy evaluation — percent-decoded, dot-segments resolved, doubled +# slashes collapsed, `;params` stripped, `%2F` rejected (unless an +# endpoint opts in). Patterns here must therefore match canonical paths; +# do not attempt defensive matching against `..` or `%2e%2e` — those +# inputs are rejected at the L7 parser boundary before this rule runs. path_matches(_, "**") if true path_matches(actual, pattern) if { @@ -394,8 +401,8 @@ command_matches(actual, expected) if { # --- Matched endpoint config (for L7 and allowed_ips extraction) --- # Returns the raw endpoint object for the matched policy + host:port. -# Used by Rust to extract L7 config (protocol, tls, enforcement) and/or -# allowed_ips for SSRF allowlist validation. +# Used by Rust to extract L7 config (protocol, tls, enforcement, +# allow_encoded_slash) and/or allowed_ips for SSRF allowlist validation. # Per-policy helper: returns matching endpoint configs for a single policy. _policy_endpoint_configs(policy) := [ep | diff --git a/crates/openshell-sandbox/src/l7/mod.rs b/crates/openshell-sandbox/src/l7/mod.rs index d354f562b..3b26f8122 100644 --- a/crates/openshell-sandbox/src/l7/mod.rs +++ b/crates/openshell-sandbox/src/l7/mod.rs @@ -9,6 +9,7 @@ //! evaluated against OPA policy, and either forwarded or denied. pub mod inference; +pub mod path; pub mod provider; pub mod relay; pub mod rest; @@ -59,6 +60,10 @@ pub struct L7EndpointConfig { pub protocol: L7Protocol, pub tls: TlsMode, pub enforcement: EnforcementMode, + /// When true, percent-encoded `/` (`%2F`) is preserved in path segments + /// rather than rejected at the parser. Needed by upstreams like GitLab + /// that embed `%2F` in namespaced project paths. Defaults to false. + pub allow_encoded_slash: bool, } /// Result of an L7 policy decision for a single request. @@ -122,10 +127,13 @@ pub fn parse_l7_config(val: ®orus::Value) -> Option { _ => EnforcementMode::Audit, }; + let allow_encoded_slash = get_object_bool(val, "allow_encoded_slash").unwrap_or(false); + Some(L7EndpointConfig { protocol, tls, enforcement, + allow_encoded_slash, }) } @@ -141,6 +149,19 @@ pub fn parse_tls_mode(val: ®orus::Value) -> TlsMode { } } +/// Extract a bool value from a regorus object. Returns `None` when the key +/// is absent or not a boolean. +fn get_object_bool(val: ®orus::Value, key: &str) -> Option { + let key_val = regorus::Value::String(key.into()); + match val { + regorus::Value::Object(map) => match map.get(&key_val) { + Some(regorus::Value::Bool(b)) => Some(*b), + _ => None, + }, + _ => None, + } +} + /// Extract a string value from a regorus object. fn get_object_str(val: ®orus::Value, key: &str) -> Option { let key_val = regorus::Value::String(key.into()); @@ -746,6 +767,26 @@ mod tests { assert!(parse_l7_config(&val).is_none()); } + #[test] + fn parse_l7_config_allow_encoded_slash_defaults_false() { + let val = regorus::Value::from_json_str( + r#"{"protocol": "rest", "host": "api.example.com", "port": 443}"#, + ) + .unwrap(); + let config = parse_l7_config(&val).unwrap(); + assert!(!config.allow_encoded_slash); + } + + #[test] + fn parse_l7_config_allow_encoded_slash_opt_in() { + let val = regorus::Value::from_json_str( + r#"{"protocol": "rest", "host": "gitlab.example.com", "port": 443, "allow_encoded_slash": true}"#, + ) + .unwrap(); + let config = parse_l7_config(&val).unwrap(); + assert!(config.allow_encoded_slash); + } + #[test] fn validate_rules_and_access_mutual_exclusion() { let data = serde_json::json!({ diff --git a/crates/openshell-sandbox/src/l7/path.rs b/crates/openshell-sandbox/src/l7/path.rs new file mode 100644 index 000000000..1b425b5f4 --- /dev/null +++ b/crates/openshell-sandbox/src/l7/path.rs @@ -0,0 +1,563 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! HTTP request-target canonicalization for L7 policy enforcement. +//! +//! The L7 REST proxy evaluates OPA rules against the request path and +//! forwards the raw request line to the upstream server. If the path the +//! policy sees is not the path the upstream dispatches on, any path-based +//! allow rule can be bypassed with non-canonical encodings (`..`, `%2e%2e`, +//! `//`, `;params`). This module resolves that divergence by producing a +//! single canonical path that is both the input to policy evaluation and +//! the bytes written onto the wire. +//! +//! Behavior for v1: +//! - Percent-decode unreserved path bytes; preserve the rest as uppercase +//! `%HH`. +//! - Resolve `.` and `..` segments per RFC 3986 Section 5.2.4. `..` that +//! would escape the root is rejected rather than silently clamped to +//! `/` — non-canonical input is almost always adversarial. +//! - Collapse repeated slashes. +//! - Reject control bytes (`0x00..=0x1F`, `0x7F`), fragments in the +//! request-target, raw non-ASCII bytes, and paths that cannot be parsed +//! as origin-form. +//! - Strip trailing `;params` from each segment by default (Tomcat-class +//! `;jsessionid` ACL-bypass mitigation). +//! - Reject `%2F` (encoded slash) inside a segment by default. Operators +//! can opt in per-endpoint for APIs that rely on encoded slashes in +//! slugs. + +use thiserror::Error; + +/// Reasons a request-target can be rejected at the canonicalization boundary. +#[derive(Debug, Clone, PartialEq, Eq, Error)] +pub enum CanonicalizeError { + #[error("request-target contains a null or control byte")] + NullOrControlByte, + #[error("request-target contains an invalid percent-encoded sequence")] + InvalidPercentEncoding, + #[error("request-target contains an encoded '/' (%2F) which is not allowed on this endpoint")] + EncodedSlashNotAllowed, + #[error("request-target contains a fragment")] + FragmentInRequestTarget, + #[error("request-target contains raw non-ASCII bytes; non-ASCII must be percent-encoded")] + NonAscii, + #[error("request-target's `..` segment would escape the path root")] + TraversalAboveRoot, + #[error("request-target exceeds the configured maximum length")] + PathTooLong, + #[error("request-target is not a valid origin-form path")] + MalformedTarget, +} + +/// Options controlling canonicalization strictness. +/// +/// Produced by the endpoint configuration. Defaults are intentionally strict: +/// operators opt in to looser behavior per-endpoint when the upstream API +/// requires it. +#[derive(Debug, Clone, Copy)] +pub struct CanonicalizeOptions { + /// When `true`, `%2F` inside a segment is preserved (re-emitted as + /// `%2F` on the wire) rather than rejected. Defaults to `false`. + pub allow_encoded_slash: bool, + /// When `true`, RFC 3986 path parameters (`;param`) are stripped from + /// each segment before policy evaluation and before forwarding. + /// Defaults to `true`: path parameters are an ambiguity surface + /// historically used to bypass ACLs and are not part of any policy + /// we author. + pub strip_path_parameters: bool, +} + +impl Default for CanonicalizeOptions { + fn default() -> Self { + Self { + allow_encoded_slash: false, + strip_path_parameters: true, + } + } +} + +/// Result of a successful canonicalization. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CanonicalPath { + /// The canonical path. Always starts with `/`. Contains no `.`/`..` + /// segments, no doubled slashes, and no `;params` (when stripping is + /// enabled). + pub path: String, + /// `true` if the canonical form differs from the input. Callers use + /// this to decide whether to rewrite the outbound request line. + pub rewritten: bool, +} + +/// Maximum accepted length of an origin-form path (bytes). +pub(crate) const MAX_PATH_LEN: usize = 4 * 1024; + +/// Sentinel byte used to represent a `%2F`-decoded slash inside a segment. +/// Chosen from the C0 control range so no legitimate decoded byte collides +/// with it; any raw `0x01` in the input is rejected up front. +const ENCODED_SLASH_SENTINEL: u8 = 0x01; + +/// Canonicalize an HTTP request-target's path component. +/// +/// Accepts origin-form (`"/a/b?q=1"`) or absolute-form (`"http://h/a/b"`) +/// targets. Asterisk-form (`"*"`, used only for `OPTIONS *`) is rejected +/// because the L7 enforcement pipeline does not handle it. +/// +/// Returns the canonical path plus the original query suffix (byte-for-byte +/// as supplied by the client). Query-parameter parsing is left to the +/// caller — this function only operates on the path component. +pub fn canonicalize_request_target( + target: &str, + opts: &CanonicalizeOptions, +) -> Result<(CanonicalPath, Option), CanonicalizeError> { + // 1. Reject control bytes and raw non-ASCII outright. These tests also + // cover CR/LF which are never legal in a single-line request-target. + for &b in target.as_bytes() { + if b == 0 || b == b'\n' || b == b'\r' || b == b'\t' || b == 0x7F { + return Err(CanonicalizeError::NullOrControlByte); + } + if b < 0x20 { + return Err(CanonicalizeError::NullOrControlByte); + } + if b >= 0x80 { + return Err(CanonicalizeError::NonAscii); + } + } + + // 2. Reject fragments — forbidden in request-target per RFC 7230. + if target.contains('#') { + return Err(CanonicalizeError::FragmentInRequestTarget); + } + + // 3. Split off query at the first `?`. Query is preserved verbatim. + let (path_part, query_part) = match target.split_once('?') { + Some((p, q)) => (p, Some(q.to_string())), + None => (target, None), + }; + + // 4. Handle absolute-form by stripping scheme://authority. + let raw_path = if let Some(idx) = path_part.find("://") { + let after_scheme = &path_part[idx + 3..]; + match after_scheme.find('/') { + Some(slash) => &after_scheme[slash..], + None => "/", + } + } else { + path_part + }; + + // 5. Empty is equivalent to "/". + let raw_path = if raw_path.is_empty() { "/" } else { raw_path }; + + // 6. Must begin with '/' (origin-form). + if !raw_path.starts_with('/') { + return Err(CanonicalizeError::MalformedTarget); + } + + // 7. Length bound. + if raw_path.len() > MAX_PATH_LEN { + return Err(CanonicalizeError::PathTooLong); + } + + // 8. Percent-decode the path into bytes. `%2F` is replaced by a + // sentinel byte so that subsequent `/` splitting cannot confuse it + // with a real path separator. + let decoded = percent_decode_with_sentinel(raw_path.as_bytes(), opts.allow_encoded_slash)?; + + // 9. Split on literal `/` and resolve dot-segments. + let segments = split_path_segments(&decoded); + let resolved = resolve_dot_segments(segments)?; + + // 10. Reconstruct. Strip `;params` per segment if requested; re-encode + // any byte that must be percent-encoded in the pchar set. + let canonical = build_canonical_path(&resolved, decoded.last().copied() == Some(b'/'), opts); + + let rewritten = canonical != raw_path; + Ok(( + CanonicalPath { + path: canonical, + rewritten, + }, + query_part, + )) +} + +// --------------------------------------------------------------------------- +// Internals +// --------------------------------------------------------------------------- + +fn percent_decode_with_sentinel( + raw: &[u8], + allow_encoded_slash: bool, +) -> Result, CanonicalizeError> { + let mut out = Vec::with_capacity(raw.len()); + let mut i = 0; + while i < raw.len() { + let b = raw[i]; + if b == ENCODED_SLASH_SENTINEL { + // Raw sentinel byte in input — already rejected by the C0 + // control-byte sweep above, but double-check here to avoid + // collisions in case the sweep is ever relaxed. + return Err(CanonicalizeError::NullOrControlByte); + } + if b == b'%' { + if i + 2 >= raw.len() { + return Err(CanonicalizeError::InvalidPercentEncoding); + } + let decoded = match (decode_hex(raw[i + 1]), decode_hex(raw[i + 2])) { + (Some(hi), Some(lo)) => (hi << 4) | lo, + _ => return Err(CanonicalizeError::InvalidPercentEncoding), + }; + if decoded == b'/' { + if !allow_encoded_slash { + return Err(CanonicalizeError::EncodedSlashNotAllowed); + } + out.push(ENCODED_SLASH_SENTINEL); + } else if decoded == 0 || decoded == 0x7F || (decoded < 0x20 && decoded != b'\t') { + return Err(CanonicalizeError::NullOrControlByte); + } else if decoded == b'\n' || decoded == b'\r' || decoded == b'\t' { + // %-encoded CR/LF/TAB are still control bytes; reject. + return Err(CanonicalizeError::NullOrControlByte); + } else { + out.push(decoded); + } + i += 3; + } else { + out.push(b); + i += 1; + } + } + Ok(out) +} + +fn split_path_segments(decoded: &[u8]) -> Vec<&[u8]> { + // decoded is guaranteed to start with `/`. Skip the leading `/` and + // split on subsequent `/` bytes. The sentinel byte for encoded slashes + // never matches, so it stays inside its segment. + decoded[1..].split(|&b| b == b'/').collect() +} + +fn resolve_dot_segments(segments: Vec<&[u8]>) -> Result>, CanonicalizeError> { + let mut stack: Vec> = Vec::with_capacity(segments.len()); + let last = segments.len().saturating_sub(1); + for (idx, seg) in segments.into_iter().enumerate() { + if seg == b".." { + if stack.pop().is_none() { + return Err(CanonicalizeError::TraversalAboveRoot); + } + if idx == last { + // A trailing `..` leaves a "directory" (trailing slash). + stack.push(Vec::new()); + } + continue; + } + if seg == b"." { + if idx == last { + stack.push(Vec::new()); + } + continue; + } + if seg.is_empty() && idx != last { + // Collapse repeated slashes except at the very end, where an + // empty trailing segment encodes a trailing `/`. + continue; + } + stack.push(seg.to_vec()); + } + Ok(stack) +} + +fn build_canonical_path( + segments: &[Vec], + _trailing_slash_hint: bool, + opts: &CanonicalizeOptions, +) -> String { + let mut out = String::from("/"); + for (idx, seg) in segments.iter().enumerate() { + if idx > 0 { + out.push('/'); + } + let trimmed: &[u8] = if opts.strip_path_parameters { + match seg.iter().position(|&b| b == b';') { + Some(pos) => &seg[..pos], + None => seg, + } + } else { + seg + }; + for &b in trimmed { + if b == ENCODED_SLASH_SENTINEL { + out.push_str("%2F"); + } else if is_pchar_unreserved(b) { + out.push(b as char); + } else { + out.push('%'); + out.push(upper_hex_nibble(b >> 4)); + out.push(upper_hex_nibble(b & 0x0F)); + } + } + } + out +} + +fn is_pchar_unreserved(b: u8) -> bool { + // RFC 3986 pchar without the percent-encoded slot — i.e. bytes we emit + // literally. Unreserved plus RFC 3986 sub-delims plus `:` and `@`. + b.is_ascii_alphanumeric() + || matches!( + b, + b'-' | b'.' + | b'_' + | b'~' + | b'!' + | b'$' + | b'&' + | b'\'' + | b'(' + | b')' + | b'*' + | b'+' + | b',' + | b';' + | b'=' + | b':' + | b'@' + ) +} + +fn decode_hex(b: u8) -> Option { + match b { + b'0'..=b'9' => Some(b - b'0'), + b'a'..=b'f' => Some(b - b'a' + 10), + b'A'..=b'F' => Some(b - b'A' + 10), + _ => None, + } +} + +fn upper_hex_nibble(n: u8) -> char { + match n { + 0..=9 => (b'0' + n) as char, + 10..=15 => (b'A' + (n - 10)) as char, + _ => unreachable!("nibble out of range"), + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + fn canon(input: &str) -> Result { + let opts = CanonicalizeOptions::default(); + canonicalize_request_target(input, &opts).map(|(p, _)| p.path) + } + + fn canon_with(input: &str, opts: CanonicalizeOptions) -> Result { + canonicalize_request_target(input, &opts).map(|(p, _)| p.path) + } + + #[test] + fn literal_dot_segments_resolve() { + assert_eq!(canon("/a/./b").unwrap(), "/a/b"); + assert_eq!(canon("/a/b/.").unwrap(), "/a/b/"); + assert_eq!(canon("/a/../b").unwrap(), "/b"); + assert_eq!(canon("/a/b/..").unwrap(), "/a/"); + } + + #[test] + fn percent_encoded_dot_segments_resolve_the_same_way() { + assert_eq!(canon("/public/%2e%2e/secret").unwrap(), "/secret"); + assert_eq!(canon("/public/%2E%2E/secret").unwrap(), "/secret"); + assert_eq!(canon("/public/%2e/secret").unwrap(), "/public/secret"); + } + + #[test] + fn traversal_above_root_is_rejected() { + assert_eq!(canon("/.."), Err(CanonicalizeError::TraversalAboveRoot)); + assert_eq!( + canon("/a/../.."), + Err(CanonicalizeError::TraversalAboveRoot) + ); + assert_eq!( + canon("/a/%2e%2e/%2e%2e"), + Err(CanonicalizeError::TraversalAboveRoot) + ); + } + + #[test] + fn doubled_slashes_collapse() { + assert_eq!(canon("//").unwrap(), "/"); + assert_eq!(canon("//public//../secret").unwrap(), "/secret"); + assert_eq!(canon("/public//secret").unwrap(), "/public/secret"); + } + + #[test] + fn encoded_slash_rejected_by_default() { + assert_eq!( + canon("/a/%2f/b"), + Err(CanonicalizeError::EncodedSlashNotAllowed) + ); + assert_eq!( + canon("/public/..%2fsecret"), + Err(CanonicalizeError::EncodedSlashNotAllowed) + ); + } + + #[test] + fn encoded_slash_preserved_when_opted_in() { + let opts = CanonicalizeOptions { + allow_encoded_slash: true, + ..CanonicalizeOptions::default() + }; + assert_eq!(canon_with("/a/%2f/b", opts).unwrap(), "/a/%2F/b"); + assert_eq!(canon_with("/a/%2F/b", opts).unwrap(), "/a/%2F/b"); + } + + #[test] + fn null_and_control_bytes_rejected() { + assert_eq!(canon("/a%00b"), Err(CanonicalizeError::NullOrControlByte)); + assert_eq!(canon("/a%0Ab"), Err(CanonicalizeError::NullOrControlByte)); + assert_eq!(canon("/a%0Db"), Err(CanonicalizeError::NullOrControlByte)); + assert_eq!(canon("/a%7Fb"), Err(CanonicalizeError::NullOrControlByte)); + // Raw CR/LF/TAB in input should also fail. Build strings via + // byte-level concatenation since the literals in the source are + // otherwise flagged as control bytes in CI. + let mut raw = String::from("/a"); + raw.push('\n'); + raw.push('b'); + assert_eq!(canon(&raw), Err(CanonicalizeError::NullOrControlByte)); + } + + #[test] + fn fragment_rejected() { + assert_eq!( + canon("/a#b"), + Err(CanonicalizeError::FragmentInRequestTarget) + ); + } + + #[test] + fn absolute_form_strips_authority() { + assert_eq!(canon("http://host/a/../b").unwrap(), "/b"); + assert_eq!(canon("https://host").unwrap(), "/"); + assert_eq!(canon("http://host:443/foo").unwrap(), "/foo"); + } + + #[test] + fn legitimate_percent_encoded_bytes_round_trip() { + assert_eq!( + canon("/files/hello%20world.txt").unwrap(), + "/files/hello%20world.txt" + ); + assert_eq!(canon("/search/a%3Fb").unwrap(), "/search/a%3Fb"); + assert_eq!(canon("/users/%40alice").unwrap(), "/users/@alice"); + } + + #[test] + fn path_parameters_stripped_by_default() { + assert_eq!(canon("/a;jsessionid=xyz/b").unwrap(), "/a/b"); + assert_eq!(canon("/public;x=1/../secret").unwrap(), "/secret"); + } + + #[test] + fn path_parameters_preserved_when_disabled() { + let opts = CanonicalizeOptions { + strip_path_parameters: false, + ..CanonicalizeOptions::default() + }; + assert_eq!( + canon_with("/a;jsessionid=xyz/b", opts).unwrap(), + "/a;jsessionid=xyz/b" + ); + } + + #[test] + fn non_ascii_raw_byte_rejected() { + let mut raw = String::from("/a"); + raw.push('é'); + assert_eq!(canon(&raw), Err(CanonicalizeError::NonAscii)); + } + + #[test] + fn percent_encoded_non_ascii_bytes_round_trip() { + // `é` in UTF-8 is 0xC3 0xA9. The proxy treats the path as opaque + // bytes; percent-encoded high bytes pass through unchanged. + assert_eq!(canon("/users/caf%C3%A9").unwrap(), "/users/caf%C3%A9"); + } + + #[test] + fn empty_and_root_equivalent() { + assert_eq!(canon("").unwrap(), "/"); + assert_eq!(canon("/").unwrap(), "/"); + } + + #[test] + fn path_too_long_rejected() { + let long = format!("/{}", "a".repeat(MAX_PATH_LEN)); + assert_eq!(canon(&long), Err(CanonicalizeError::PathTooLong)); + } + + #[test] + fn mixed_case_percent_normalizes_to_uppercase() { + // Request comes in with lowercase %c3 — after canonicalization we + // emit %C3 so policy authors don't need to enumerate both cases. + assert_eq!(canon("/a/caf%c3%a9").unwrap(), "/a/caf%C3%A9"); + } + + #[test] + fn rewritten_flag_reflects_transformation() { + let (canon, _) = + canonicalize_request_target("/a", &CanonicalizeOptions::default()).unwrap(); + assert!(!canon.rewritten); + let (canon, _) = + canonicalize_request_target("/a/../b", &CanonicalizeOptions::default()).unwrap(); + assert!(canon.rewritten); + } + + #[test] + fn query_suffix_is_returned_separately() { + let (canon, query) = + canonicalize_request_target("/a?q=1&r=2", &CanonicalizeOptions::default()).unwrap(); + assert_eq!(canon.path, "/a"); + assert_eq!(query.as_deref(), Some("q=1&r=2")); + } + + // --------------------------------------------------------------------- + // Regression tests for the documented attack payloads. Every one of + // these used to bypass a `/public/**` allow rule because the proxy and + // the OPA policy never agreed with the upstream on what path was being + // dispatched. + // --------------------------------------------------------------------- + + #[test] + fn regression_public_slash_dotdot_secret() { + assert_eq!(canon("/public/../secret").unwrap(), "/secret"); + } + + #[test] + fn regression_public_slash_percent_dotdot_secret() { + assert_eq!(canon("/public/%2e%2e/secret").unwrap(), "/secret"); + assert_eq!(canon("/public/%2E%2E/secret").unwrap(), "/secret"); + } + + #[test] + fn regression_percent_encoded_slash_in_dotdot_rejected() { + assert_eq!( + canon("/public/%2E%2E%2Fsecret"), + Err(CanonicalizeError::EncodedSlashNotAllowed) + ); + } + + #[test] + fn regression_double_slash_prefix() { + assert_eq!(canon("//public/../secret").unwrap(), "/secret"); + } + + #[test] + fn regression_dot_slash_dotdot() { + assert_eq!(canon("/public/./../secret").unwrap(), "/secret"); + } +} diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index 110f777e9..a0e54062d 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -128,9 +128,17 @@ where C: AsyncRead + AsyncWrite + Unpin + Send, U: AsyncRead + AsyncWrite + Unpin + Send, { + // Build a provider carrying the per-endpoint canonicalization options so + // request parsing honors the endpoint's `allow_encoded_slash` setting + // (e.g. APIs like GitLab that embed `%2F` in path segments). + let provider = + crate::l7::rest::RestProvider::with_options(crate::l7::path::CanonicalizeOptions { + allow_encoded_slash: config.allow_encoded_slash, + ..Default::default() + }); loop { // Parse one HTTP request from client - let req = match crate::l7::rest::RestProvider.parse_request(client).await { + let req = match provider.parse_request(client).await { Ok(Some(req)) => req, Ok(None) => return Ok(()), // Client closed connection Err(e) => { @@ -274,7 +282,7 @@ where } } else { // Enforce mode: deny with 403 and close connection (use redacted target) - crate::l7::rest::RestProvider + provider .deny_with_redacted_target( &req, &ctx.policy_name, @@ -374,7 +382,11 @@ where C: AsyncRead + AsyncWrite + Unpin + Send, U: AsyncRead + AsyncWrite + Unpin + Send, { - let provider = crate::l7::rest::RestProvider; + // Passthrough path: no L7 policy is enforced here, so use default + // (strict) canonicalization options. Calls to GitLab-style APIs that + // need `%2F` must be configured as L7 endpoints so the per-endpoint + // `allow_encoded_slash` opt-in applies. + let provider = crate::l7::rest::RestProvider::default(); let mut request_count: u64 = 0; let resolver = ctx.secret_resolver.as_deref(); diff --git a/crates/openshell-sandbox/src/l7/rest.rs b/crates/openshell-sandbox/src/l7/rest.rs index 6bbf7be4e..4d8909b9e 100644 --- a/crates/openshell-sandbox/src/l7/rest.rs +++ b/crates/openshell-sandbox/src/l7/rest.rs @@ -22,14 +22,33 @@ const RELAY_BUF_SIZE: usize = 8192; const RELAY_EOF_IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); /// HTTP/1.1 REST protocol provider. -pub struct RestProvider; +/// +/// Carries the path-canonicalization options derived from the endpoint +/// config so that different endpoints (e.g. one backed by GitLab that needs +/// `%2F` in paths and one backed by a strict API) can apply different +/// canonicalization strictness to the same `RestProvider` call surface. +#[derive(Debug, Clone, Default)] +pub struct RestProvider { + canonicalize_options: crate::l7::path::CanonicalizeOptions, +} + +impl RestProvider { + /// Construct a provider with explicit canonicalization options. Used by + /// `relay_rest` so endpoint config can opt in to looser behavior such + /// as `allow_encoded_slash`. + pub fn with_options(canonicalize_options: crate::l7::path::CanonicalizeOptions) -> Self { + Self { + canonicalize_options, + } + } +} impl L7Provider for RestProvider { async fn parse_request( &self, client: &mut C, ) -> Result> { - parse_http_request(client).await + parse_http_request(client, &self.canonicalize_options).await } async fn relay( @@ -78,7 +97,10 @@ impl RestProvider { /// forwarded upstream without L7 policy evaluation -- a request /// smuggling vulnerability. Byte-at-a-time overhead is negligible for /// the typical 200-800 byte headers on L7-inspected REST endpoints. -async fn parse_http_request(client: &mut C) -> Result> { +async fn parse_http_request( + client: &mut C, + canonicalize_options: &crate::l7::path::CanonicalizeOptions, +) -> Result> { let mut buf = Vec::with_capacity(4096); loop { @@ -149,17 +171,68 @@ async fn parse_http_request(client: &mut C) -> Result parse_query_params(q)?, + None => HashMap::new(), + }; + + if canonical.rewritten { + buf = rewrite_request_line_target( + &buf, + &method, + &canonical.path, + raw_query.as_deref(), + version, + )?; + } Ok(Some(L7Request { action: method, - target: path, + target: canonical.path, query_params, raw_header: buf, // exact header bytes up to and including \r\n\r\n body_length, })) } +/// Rebuild the request line in a raw HTTP header block with a canonicalized +/// target. Called when the canonical path differs from what the client sent, +/// so the upstream dispatches on the exact bytes the policy engine evaluated. +fn rewrite_request_line_target( + raw: &[u8], + method: &str, + canonical_path: &str, + raw_query: Option<&str>, + version: &str, +) -> Result> { + let eol = raw + .windows(2) + .position(|w| w == b"\r\n") + .ok_or_else(|| miette!("request line missing CRLF"))?; + let rest = &raw[eol..]; + let new_target = match raw_query { + Some(q) if !q.is_empty() => format!("{canonical_path}?{q}"), + _ => canonical_path.to_string(), + }; + let new_request_line = format!("{method} {new_target} {version}"); + let mut out = Vec::with_capacity(new_request_line.len() + rest.len()); + out.extend_from_slice(new_request_line.as_bytes()); + out.extend_from_slice(rest); + Ok(out) +} + pub(crate) fn parse_target_query(target: &str) -> Result<(String, HashMap>)> { match target.split_once('?') { Some((path, query)) => Ok((path.to_string(), parse_query_params(query)?)), @@ -167,7 +240,7 @@ pub(crate) fn parse_target_query(target: &str) -> Result<(String, HashMap Result>> { +pub(crate) fn parse_query_params(query: &str) -> Result>> { let mut params: HashMap> = HashMap::new(); if query.is_empty() { return Ok(params); @@ -1064,7 +1137,11 @@ mod tests { .await .unwrap(); }); - let result = parse_http_request(&mut client).await; + let result = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await; assert!(result.is_err(), "Must reject headers with bare LF"); } @@ -1077,7 +1154,11 @@ mod tests { raw.extend_from_slice(b"GET /api HTTP/1.1\r\nHost: x\r\nX-Bad: \xc0\xaf\r\n\r\n"); writer.write_all(&raw).await.unwrap(); }); - let result = parse_http_request(&mut client).await; + let result = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await; assert!(result.is_err(), "Must reject headers with invalid UTF-8"); } @@ -1091,10 +1172,180 @@ mod tests { .await .unwrap(); }); - let result = parse_http_request(&mut client).await; + let result = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await; assert!(result.is_err(), "Must reject unsupported HTTP version"); } + #[tokio::test] + async fn parse_http_request_canonicalizes_target_and_rewrites_raw_header() { + let (mut client, mut writer) = tokio::io::duplex(4096); + tokio::spawn(async move { + writer + .write_all(b"GET /public/../secret HTTP/1.1\r\nHost: api.example.com\r\n\r\n") + .await + .unwrap(); + }); + let req = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await + .expect("request should parse") + .expect("request should exist"); + // Path fed to OPA evaluation is canonical. + assert_eq!(req.target, "/secret"); + // raw_header (forwarded byte-for-byte to upstream) is also canonical + // — this is the invariant the L7 canonicalization PR must uphold. + assert_eq!( + req.raw_header, b"GET /secret HTTP/1.1\r\nHost: api.example.com\r\n\r\n", + "outbound request line must carry the canonical path" + ); + } + + #[tokio::test] + async fn parse_http_request_canonicalization_preserves_query_string() { + let (mut client, mut writer) = tokio::io::duplex(4096); + tokio::spawn(async move { + writer + .write_all(b"GET /public/../v1/list?limit=10&sort=asc HTTP/1.1\r\nHost: h\r\n\r\n") + .await + .unwrap(); + }); + let req = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await + .unwrap() + .unwrap(); + assert_eq!(req.target, "/v1/list"); + assert_eq!( + req.raw_header, b"GET /v1/list?limit=10&sort=asc HTTP/1.1\r\nHost: h\r\n\r\n", + "canonical rewrite must preserve the query string verbatim" + ); + } + + #[tokio::test] + async fn parse_http_request_leaves_canonical_input_byte_for_byte() { + // When the input is already canonical, the raw_header must pass + // through unchanged — otherwise legitimate traffic pays a rewrite + // cost on every request. + let (mut client, mut writer) = tokio::io::duplex(4096); + tokio::spawn(async move { + writer + .write_all(b"GET /api/v1/users HTTP/1.1\r\nHost: api.example.com\r\n\r\n") + .await + .unwrap(); + }); + let req = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await + .unwrap() + .unwrap(); + assert_eq!(req.target, "/api/v1/users"); + assert_eq!( + req.raw_header, + b"GET /api/v1/users HTTP/1.1\r\nHost: api.example.com\r\n\r\n", + ); + } + + #[tokio::test] + async fn parse_http_request_rejects_traversal_above_root() { + let (mut client, mut writer) = tokio::io::duplex(4096); + tokio::spawn(async move { + writer + .write_all(b"GET /.. HTTP/1.1\r\nHost: h\r\n\r\n") + .await + .unwrap(); + }); + let result = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await; + assert!( + result.is_err(), + "a target that escapes the path root must be rejected at the parser" + ); + } + + #[tokio::test] + async fn parse_http_request_accepts_encoded_slash_when_endpoint_opts_in() { + // GitLab-style endpoints legitimately embed `%2F` in path segments + // (e.g. `/api/v4/projects/group%2Fproject`). Passing a provider + // constructed with `allow_encoded_slash: true` models the + // endpoint-config wiring that flows from `L7EndpointConfig`. + let (mut client, mut writer) = tokio::io::duplex(4096); + tokio::spawn(async move { + writer + .write_all(b"GET /api/v4/projects/group%2Fproject HTTP/1.1\r\nHost: g\r\n\r\n") + .await + .unwrap(); + }); + let options = crate::l7::path::CanonicalizeOptions { + allow_encoded_slash: true, + ..Default::default() + }; + let req = parse_http_request(&mut client, &options) + .await + .unwrap() + .unwrap(); + assert_eq!(req.target, "/api/v4/projects/group%2Fproject"); + } + + #[tokio::test] + async fn parse_http_request_rejects_encoded_slash_by_default() { + // Default strict options must reject `%2F` — this is the security + // posture for endpoints where an encoded slash would let an + // attacker disagree with the upstream on segment boundaries. + let (mut client, mut writer) = tokio::io::duplex(4096); + tokio::spawn(async move { + writer + .write_all(b"GET /api/v4/projects/group%2Fproject HTTP/1.1\r\nHost: g\r\n\r\n") + .await + .unwrap(); + }); + let result = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await; + assert!( + result.is_err(), + "default options must reject encoded slashes in the path" + ); + } + + #[tokio::test] + async fn parse_http_request_preserves_http_10_version_on_rewrite() { + let (mut client, mut writer) = tokio::io::duplex(4096); + tokio::spawn(async move { + writer + .write_all(b"GET /a/./b HTTP/1.0\r\nHost: h\r\n\r\n") + .await + .unwrap(); + }); + let req = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await + .unwrap() + .unwrap(); + assert_eq!(req.target, "/a/b"); + assert!( + req.raw_header.starts_with(b"GET /a/b HTTP/1.0\r\n"), + "rewrite must preserve the original HTTP version, got: {:?}", + String::from_utf8_lossy(&req.raw_header) + ); + } + #[tokio::test] async fn parse_http_request_splits_path_and_query_params() { let (mut client, mut writer) = tokio::io::duplex(4096); @@ -1106,10 +1357,13 @@ mod tests { .await .unwrap(); }); - let req = parse_http_request(&mut client) - .await - .expect("request should parse") - .expect("request should exist"); + let req = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await + .expect("request should parse") + .expect("request should exist"); assert_eq!(req.target, "/download"); assert_eq!( req.query_params.get("slug").cloned(), @@ -1139,10 +1393,13 @@ mod tests { .unwrap(); }); - let first = parse_http_request(&mut client) - .await - .expect("first request should parse") - .expect("expected first request"); + let first = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await + .expect("first request should parse") + .expect("expected first request"); assert_eq!(first.action, "GET"); assert_eq!(first.target, "/allowed"); assert!(first.query_params.is_empty()); @@ -1151,10 +1408,13 @@ mod tests { "raw_header must contain only the first request's headers" ); - let second = parse_http_request(&mut client) - .await - .expect("second request should parse") - .expect("expected second request"); + let second = parse_http_request( + &mut client, + &crate::l7::path::CanonicalizeOptions::default(), + ) + .await + .expect("second request should parse") + .expect("expected second request"); assert_eq!(second.action, "POST"); assert_eq!(second.target, "/blocked"); assert!(second.query_params.is_empty()); diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index f91f2c551..d1310cd57 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -1735,15 +1735,21 @@ fn query_allowed_ips( } } +/// Canonicalize the request-target for inference pattern detection. +/// +/// Falls back to the raw path on canonicalization error: the request is then +/// routed through the normal forward path, where `rest.rs::parse_http_request` +/// will reject it properly. Returning the raw path here prevents a crafted +/// target from bypassing inference routing without our detection logic having +/// to implement a second, duplicate error-response surface. fn normalize_inference_path(path: &str) -> String { - if let Some(scheme_idx) = path.find("://") { - let after_scheme = &path[scheme_idx + 3..]; - if let Some(path_start) = after_scheme.find('/') { - return after_scheme[path_start..].to_string(); - } - return "/".to_string(); + match crate::l7::path::canonicalize_request_target( + path, + &crate::l7::path::CanonicalizeOptions::default(), + ) { + Ok((canon, _)) => canon.path, + Err(_) => path.to_string(), } - path.to_string() } /// Extract the hostname from an absolute-form URI used in plain HTTP proxy requests. @@ -1974,8 +1980,11 @@ async fn handle_forward_proxy( secret_resolver: Option>, denial_tx: Option<&mpsc::UnboundedSender>, ) -> Result<()> { - // 1. Parse the absolute-form URI - let (scheme, host, port, path) = match parse_proxy_uri(target_uri) { + // 1. Parse the absolute-form URI. `path` is marked `mut` so that, when an + // L7 config applies, the canonicalized form produced below replaces it + // in-place — keeping OPA evaluation and the bytes written onto the wire + // in sync. See the L7 block below. + let (scheme, host, port, mut path) = match parse_proxy_uri(target_uri) { Ok(parsed) => parsed, Err(e) => { let event = HttpActivityBuilder::new(crate::ocsf_ctx()) @@ -2154,11 +2163,54 @@ async fn handle_forward_proxy( secret_resolver: secret_resolver.clone(), }; - let (target_path, query_params) = crate::l7::rest::parse_target_query(&path) - .unwrap_or_else(|_| (path.clone(), std::collections::HashMap::new())); + // Canonicalize the request-target. The canonical form is fed to OPA + // AND reassigned to the outer `path` variable so the later call to + // `rewrite_forward_request` writes canonical bytes to the upstream. + // This closes the policy/upstream parser-differential at this site; + // without this reassignment, OPA would evaluate the canonical form + // while the upstream re-normalizes the raw input and dispatches on a + // potentially different path. + let canonicalize_options = crate::l7::path::CanonicalizeOptions { + allow_encoded_slash: l7_config.allow_encoded_slash, + ..Default::default() + }; + let query_params = + match crate::l7::path::canonicalize_request_target(&path, &canonicalize_options) { + Ok((canon, query)) => { + let params = match query.as_deref() { + Some(q) => crate::l7::rest::parse_query_params(q).unwrap_or_default(), + None => std::collections::HashMap::new(), + }; + path = canon.path; + params + } + Err(e) => { + let event = NetworkActivityBuilder::new(crate::ocsf_ctx()) + .activity(ActivityId::Fail) + .severity(SeverityId::Medium) + .status(StatusId::Failure) + .dst_endpoint(Endpoint::from_domain(&host_lc, port)) + .message(format!( + "FORWARD_L7 rejecting non-canonical request-target: {e}" + )) + .build(); + ocsf_emit!(event); + respond( + client, + &build_json_error_response( + 400, + "Bad Request", + "invalid_request_target", + "request-target must be canonical", + ), + ) + .await?; + return Ok(()); + } + }; let request_info = crate::l7::L7RequestInfo { action: method.to_string(), - target: target_path, + target: path.clone(), query_params, }; @@ -3453,6 +3505,35 @@ mod tests { assert!(!result_str.contains("Via: 1.1 openshell-sandbox")); } + #[test] + fn test_rewrite_forward_request_uses_canonical_path_on_the_wire() { + // Regression: the forward-proxy caller must canonicalize first and + // then pass the canonical form to rewrite_forward_request so that + // OPA's policy evaluation and the bytes dispatched to the upstream + // agree. Prior to this guarantee, OPA saw the canonical form while + // the upstream re-normalized the raw path independently, re-opening + // the parser-differential this PR closes. + let raw = b"GET http://host/public/../secret HTTP/1.1\r\nHost: host\r\n\r\n"; + let (canon, _) = crate::l7::path::canonicalize_request_target( + "/public/../secret", + &crate::l7::path::CanonicalizeOptions::default(), + ) + .expect("canonicalization should succeed for the attack payload"); + assert_eq!(canon.path, "/secret"); + + let rewritten = rewrite_forward_request(raw, raw.len(), &canon.path, None) + .expect("rewrite_forward_request should succeed"); + let rewritten_str = String::from_utf8_lossy(&rewritten); + assert!( + rewritten_str.starts_with("GET /secret HTTP/1.1\r\n"), + "outbound request line must use canonical path, got: {rewritten_str:?}" + ); + assert!( + !rewritten_str.contains(".."), + "outbound bytes must not leak the pre-canonical form, got: {rewritten_str:?}" + ); + } + #[test] fn test_rewrite_resolves_placeholder_auth_headers() { let (_, resolver) = SecretResolver::from_provider_env( diff --git a/crates/openshell-sandbox/tests/websocket_upgrade.rs b/crates/openshell-sandbox/tests/websocket_upgrade.rs index ec226c9cf..81eb46e1f 100644 --- a/crates/openshell-sandbox/tests/websocket_upgrade.rs +++ b/crates/openshell-sandbox/tests/websocket_upgrade.rs @@ -111,7 +111,7 @@ async fn websocket_upgrade_through_l7_relay_exchanges_message() { // Run the relay in a background task (simulates what relay_rest does) let relay_handle = tokio::spawn(async move { - let outcome = RestProvider + let outcome = RestProvider::default() .relay(&req, &mut client_proxy, &mut upstream) .await .expect("relay should succeed"); @@ -239,7 +239,7 @@ async fn normal_http_request_still_works_after_relay_changes() { let outcome = tokio::time::timeout( std::time::Duration::from_secs(5), - RestProvider.relay(&req, &mut client_proxy, &mut upstream), + RestProvider::default().relay(&req, &mut client_proxy, &mut upstream), ) .await .expect("should not deadlock") diff --git a/proto/sandbox.proto b/proto/sandbox.proto index 974ef91b7..8a9abbce2 100644 --- a/proto/sandbox.proto +++ b/proto/sandbox.proto @@ -97,6 +97,11 @@ message NetworkEndpoint { // are blocked even if they match an allow rule or access preset. // Deny rules take precedence over allow rules. repeated L7DenyRule deny_rules = 10; + // When true, percent-encoded '/' (%2F) is preserved in path segments + // rather than rejected by the L7 path canonicalizer. Required for + // upstreams like GitLab that embed %2F in namespaced resource paths. + // Defaults to false (strict). + bool allow_encoded_slash = 11; } // An L7 deny rule that blocks specific requests.