From e22d2cdba2cc152b6eb56423fd081b85ac729e66 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Wed, 27 May 2026 08:40:08 -0700 Subject: [PATCH 1/2] workload-replay: redact query literals with the Mz parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The regex literal redaction only handles single-quoted strings, so numeric literals (account numbers, SSNs, ids), dollar-quoted strings, and escape strings in query predicates were emitted verbatim. Use Materialize's own parser instead, which handles every literal form the dialect supports. Add `mz-sql-anonymize`, a small CLI that reads a JSON array of SQL strings on stdin and writes back each statement run through `to_ast_string_redacted()` (or null when it does not parse). It depends only on the standalone `mz-sql-parser` crate. The anonymizer locates the built binary (via MZ_SQL_ANONYMIZE_BIN or target/{release,debug}), redacts all query SQL in one batch, and falls back per-statement to the regex when the binary is unavailable or a statement does not parse, printing a warning that points at `cargo build --release -p mz-sql-anonymize`. Scope: only query SQL goes through the parser. DDL create_sql keeps the blanket regex, because `to_ast_string_redacted()` deliberately does not redact DDL option strings (connection hosts/users, sink topics, source options) — routing those through the parser would regress the connection and sink leak fix from the parent commit. The verify pass accepts both the parser's `'<REDACTED>'` and the regex's `'literal_N'` placeholders. This addresses the "wrap the Mz parser" TODO for literals. Identifier renaming still uses the regex: the parser alone cannot resolve which object a bare name refers to (it has no catalog), so scoped renaming remains future work. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 8 ++ Cargo.toml | 2 + .../materialize/cli/mz_workload_anonymize.py | 85 +++++++++++++++++- src/sql-anonymize/Cargo.toml | 18 ++++ src/sql-anonymize/src/main.rs | 89 +++++++++++++++++++ test/workload-replay/README.md | 22 +++-- 6 files changed, 215 insertions(+), 9 deletions(-) create mode 100644 src/sql-anonymize/Cargo.toml create mode 100644 src/sql-anonymize/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index a20cca5d2f06b..bf6d8cd462823 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7809,6 +7809,14 @@ dependencies = [ "version-compare", ] +[[package]] +name = "mz-sql-anonymize" +version = "0.0.0" +dependencies = [ + "mz-sql-parser", + "serde_json", +] + [[package]] name = "mz-sql-lexer" version = "0.0.0" diff --git a/Cargo.toml b/Cargo.toml index c85279ae3cac7..746a233f6c1d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,6 +97,7 @@ members = [ "src/server-core", "src/service", "src/sql", + "src/sql-anonymize", "src/sql-lexer", "src/sql-parser", "src/sql-pretty", @@ -221,6 +222,7 @@ default-members = [ "src/server-core", "src/service", "src/sql", + "src/sql-anonymize", "src/sql-lexer", "src/sql-parser", "src/sql-pretty", diff --git a/misc/python/materialize/cli/mz_workload_anonymize.py b/misc/python/materialize/cli/mz_workload_anonymize.py index 7ff7afb234b4a..d1516b4bcdf45 100644 --- a/misc/python/materialize/cli/mz_workload_anonymize.py +++ b/misc/python/materialize/cli/mz_workload_anonymize.py @@ -8,8 +8,12 @@ # by the Apache License, Version 2.0. import argparse +import json +import os import re +import subprocess import sys +from pathlib import Path from typing import Any import yaml @@ -17,6 +21,48 @@ from materialize import MZ_ROOT +def _locate_redactor() -> list[str] | None: + """Locate the mz-sql-anonymize helper binary, if it has been built. + + Honors MZ_SQL_ANONYMIZE_BIN, then looks for a release or debug build in the + Cargo target directory. Returns the argv prefix to run it, or None. 
+ """ + override = os.environ.get("MZ_SQL_ANONYMIZE_BIN") + if override and Path(override).exists(): + return [override] + for profile in ("release", "debug"): + candidate = MZ_ROOT / "target" / profile / "mz-sql-anonymize" + if candidate.exists(): + return [str(candidate)] + return None + + +def redact_literals_via_parser(sqls: list[str]) -> list[str | None] | None: + """Redact literals in each SQL string using Materialize's own parser. + + Returns a list aligned with the input, where each element is the redacted + SQL or None if that statement could not be parsed. Returns None for the + whole batch if the helper binary is unavailable or errors, signaling the + caller to fall back to regex-based redaction. + """ + cmd = _locate_redactor() + if cmd is None: + return None + proc = subprocess.run( + cmd, + input=json.dumps(sqls), + capture_output=True, + text=True, + ) + if proc.returncode != 0: + print( + f"warning: {cmd[0]} failed, falling back to regex redaction:\n{proc.stderr}", + file=sys.stderr, + ) + return None + return json.loads(proc.stdout) + + def keywords() -> set[str]: with open(MZ_ROOT / "src" / "sql-lexer" / "src" / "keywords.txt") as f: result = set( @@ -55,9 +101,10 @@ def verify_anonymized( This is a backstop for the heuristic text substitution, not a proof: it catches whole-word survivals of original identifiers and any single-quoted - literal that was not reduced to a 'literal_N' placeholder. It cannot detect + literal that was not reduced to a placeholder ('' from the + parser-based path, or 'literal_N' from the regex fallback). It cannot detect sensitive data hidden in dollar-quoted strings, comments, or numeric - literals, which the anonymizer does not handle. + literals when the regex fallback is in use. 
Cluster create_sql is exempt from the literal check: its literals (SIZE, replication factor, availability zones) are non-sensitive configuration that @@ -78,7 +125,7 @@ def verify_anonymized( identifier_checks.append((original, pattern)) string_literal = re.compile(r"'(?:[^']|'')*'") - placeholder = re.compile(r"^'literal_\d+'$") + placeholder = re.compile(r"^'(?:literal_\d+|<REDACTED>)'$") for location, sql in _iter_sql(new): for original, pattern in identifier_checks: @@ -331,6 +378,11 @@ def replace_identifiers(d: dict[str, Any], entry: str) -> None: if args.identifiers: d[entry] = pattern.sub(lambda m: mapping[m.group(0)], d[entry]) + # DDL create_sql is redacted with the blanket regex: option strings like + # connection hosts, sink topics, and source options must be scrubbed, and + # the parser's to_ast_string_redacted() intentionally does NOT redact those + # (it only redacts expression/value literals, treating DDL options as + # config). Query SQL is handled separately, via the parser (see below). def replace_literals(d: dict[str, Any], entry: str) -> None: if args.literals: d[entry] = anonymize_literals_in_sql(d[entry]) @@ -390,6 +442,7 @@ def replace_literals(d: dict[str, Any], entry: str) -> None: # Sink create_sql carries topic names, broker lists, and # bucket/path URLs as string literals; anonymize them. replace_literals(sink, "create_sql") + query_literal_targets: list[dict[str, Any]] = [] for query in workload["queries"]: if args.identifiers: query["cluster"] = mapping.get(query["cluster"], query["cluster"]) @@ -399,9 +452,33 @@ def replace_literals(d: dict[str, Any], entry: str) -> None: ] replace_identifiers(query, "sql") if args.literals: - replace_literals(query, "sql") + query_literal_targets.append(query) new["queries"].append(query) + # Redact literals in query SQL with Materialize's own parser, in one batch. 
+ # The parser handles every literal form the dialect supports (numbers, hex + # strings, intervals, dollar-quoted and escape strings) where the regex only + # caught single-quoted strings. Fall back to the regex per-statement when + # the helper binary is unavailable or cannot parse a given statement. + if query_literal_targets: + sqls = [q["sql"] for q in query_literal_targets] + redacted = redact_literals_via_parser(sqls) + if redacted is None: + print( + "warning: mz-sql-anonymize helper not found; using regex literal " + "redaction for queries, which misses numbers, dollar-quoted " + "strings, and comments. Build it for exact redaction:\n" + " cargo build --release -p mz-sql-anonymize", + file=sys.stderr, + ) + for q in query_literal_targets: + q["sql"] = anonymize_literals_in_sql(q["sql"]) + else: + for q, red in zip(query_literal_targets, redacted): + q["sql"] = ( + red if red is not None else anonymize_literals_in_sql(q["sql"]) + ) + if args.verify: problems = verify_anonymized(new, mapping, args) if problems: diff --git a/src/sql-anonymize/Cargo.toml b/src/sql-anonymize/Cargo.toml new file mode 100644 index 0000000000000..b6ba6ac17cb64 --- /dev/null +++ b/src/sql-anonymize/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "mz-sql-anonymize" +description = "A small CLI that redacts literals from SQL using Materialize's own parser." +version = "0.0.0" +edition.workspace = true +rust-version.workspace = true +publish = false + +[lints] +workspace = true + +[[bin]] +name = "mz-sql-anonymize" +path = "src/main.rs" + +[dependencies] +mz-sql-parser = { path = "../sql-parser", default-features = false } +serde_json.workspace = true diff --git a/src/sql-anonymize/src/main.rs b/src/sql-anonymize/src/main.rs new file mode 100644 index 0000000000000..a448dc158d7aa --- /dev/null +++ b/src/sql-anonymize/src/main.rs @@ -0,0 +1,89 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Redacts literals from SQL using Materialize's own parser. +//! +//! This is a helper for `bin/mz-workload-anonymize`. Doing literal redaction +//! with the real parser (rather than a regex) handles every literal form the +//! dialect supports — quoted strings with `''` escapes, escape strings, +//! dollar-quoted strings, numbers, hex strings, and intervals — and reuses the +//! exact `'<REDACTED>'` placeholder the rest of Materialize uses to turn +//! "customer data" into "usage data". +//! +//! Protocol: reads a JSON array of SQL strings on stdin and writes a JSON +//! array of the same length on stdout. Each output element is either the +//! redacted SQL or `null` when the input could not be parsed, in which case +//! the caller should fall back to its own redaction for that element. + +use std::io::{self, Read, Write}; + +use mz_sql_parser::ast::display::AstDisplay; +use mz_sql_parser::parser; + +/// Redacts every literal in `sql`, or returns `None` if it does not parse. +/// +/// A workload may hold more than one statement per entry, so we redact each +/// parsed statement and rejoin them with `; `. 
+fn redact(sql: &str) -> Option<String> { + let stmts = parser::parse_statements(sql).ok()?; + let redacted: Vec<String> = stmts + .iter() + .map(|s| s.ast.to_ast_string_redacted()) + .collect(); + Some(redacted.join("; ")) +} + +fn main() -> io::Result<()> { + let mut input = String::new(); + io::stdin().read_to_string(&mut input)?; + + let sqls: Vec<String> = + serde_json::from_str(&input).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + let redacted: Vec<Option<String>> = sqls.iter().map(|sql| redact(sql)).collect(); + + let out = serde_json::to_string(&redacted)?; + io::stdout().write_all(out.as_bytes())?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::redact; + + #[test] + fn redacts_strings_and_numbers() { + let redacted = redact("SELECT 'secret', 42 FROM t WHERE name = 'alice'").expect("parses"); + assert!(!redacted.contains("secret"), "{redacted}"); + assert!(!redacted.contains("alice"), "{redacted}"); + assert!(!redacted.contains("42"), "{redacted}"); + assert!(redacted.contains("<REDACTED>"), "{redacted}"); + // Identifiers must be preserved; only literals are redacted here. + assert!(redacted.contains("name"), "{redacted}"); + } + + #[test] + fn preserves_quoted_identifier_that_looks_like_a_literal() { + // A column named with embedded spaces is double-quoted, not a literal, + // and must survive redaction. 
+ let redacted = redact(r#"SELECT "my col" FROM t"#).expect("parses"); + assert!(redacted.contains(r#""my col""#), "{redacted}"); + } + + #[test] + fn returns_none_on_parse_error() { + assert_eq!(redact("SELEC not valid sql"), None); + } + + #[test] + fn redacts_each_statement_in_a_multi_statement_entry() { + let redacted = redact("SELECT 'a'; SELECT 'b'").expect("parses"); + assert!(!redacted.contains("'a'"), "{redacted}"); + assert!(!redacted.contains("'b'"), "{redacted}"); + } +} diff --git a/test/workload-replay/README.md b/test/workload-replay/README.md index c0adccb4cb649..02dbc98a54af3 100644 --- a/test/workload-replay/README.md +++ b/test/workload-replay/README.md @@ -78,9 +78,19 @@ Anonymizes identifiers and literals in workload captures for sharing without exp - All identifiers in `create_sql` definitions and queries *Literals (`--literals`, enabled by default):* -- String literals in SQL → `'literal_1'`, `'literal_2'`, ... -- String default values in table/source/child columns -- String literals in queries +- Query SQL is redacted with Materialize's own parser (`mz-sql-anonymize`), + replacing all literals — strings, numbers, hex strings, intervals — with + `'<REDACTED>'`. If the helper binary is not built, the tool falls back to a + regex that only catches single-quoted strings (and prints a warning). +- `create_sql` strings (including connection hosts/users, sink topics, source + options, and column defaults) → `'literal_1'`, `'literal_2'`, ... via regex. + The parser is not used here because `to_ast_string_redacted()` intentionally + does not redact DDL option strings. 
+ +For exact, parser-based query redaction, build the helper once: +```bash +cargo build --release -p mz-sql-anonymize +``` **Usage:** ```bash @@ -90,9 +100,11 @@ bin/mz-workload-anonymize [OPTIONS] **Options:** | Option | Description | Default | |--------|-------------|---------| -| `-o, --output` | Path to write output | overwrites input file | +| `-o, --output` | Path to write output (`-` for stdout); required unless `--in-place` | — | +| `--in-place` | Overwrite the input file (destroys the original capture) | off | | `--identifiers` / `--no-identifiers` | Anonymize object names | enabled | -| `--literals` / `--no-literals` | Anonymize string literals | enabled | +| `--literals` / `--no-literals` | Anonymize literals | enabled | +| `--verify` / `--no-verify` | Re-scan output for leaks and refuse to write if any are found | enabled | **Examples:** ```bash From cfd23c8863410a0b321f948065e687b7fc934cfd Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Wed, 27 May 2026 22:58:24 -0700 Subject: [PATCH 2/2] sql-anonymize: use #[mz_ore::test] for the lint CI's check-rust-test-attributes.sh rejects plain #[test]. Add mz-ore as a dev-dependency with the 'test' feature (matching sql-parser's pattern) and switch the unit tests over. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + src/sql-anonymize/Cargo.toml | 3 +++ src/sql-anonymize/src/main.rs | 8 ++++---- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bf6d8cd462823..4c9e8217ac32b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7813,6 +7813,7 @@ dependencies = [ name = "mz-sql-anonymize" version = "0.0.0" dependencies = [ + "mz-ore", "mz-sql-parser", "serde_json", ] diff --git a/src/sql-anonymize/Cargo.toml b/src/sql-anonymize/Cargo.toml index b6ba6ac17cb64..e4eee1c042a93 100644 --- a/src/sql-anonymize/Cargo.toml +++ b/src/sql-anonymize/Cargo.toml @@ -16,3 +16,6 @@ path = "src/main.rs" [dependencies] mz-sql-parser = { path = "../sql-parser", default-features = false } serde_json.workspace = true + +[dev-dependencies] +mz-ore = { path = "../ore", default-features = false, features = ["test"] } diff --git a/src/sql-anonymize/src/main.rs b/src/sql-anonymize/src/main.rs index a448dc158d7aa..e31cfbd36ab6f 100644 --- a/src/sql-anonymize/src/main.rs +++ b/src/sql-anonymize/src/main.rs @@ -56,7 +56,7 @@ fn main() -> io::Result<()> { mod tests { use super::redact; - #[test] + #[mz_ore::test] fn redacts_strings_and_numbers() { let redacted = redact("SELECT 'secret', 42 FROM t WHERE name = 'alice'").expect("parses"); assert!(!redacted.contains("secret"), "{redacted}"); @@ -67,7 +67,7 @@ mod tests { assert!(redacted.contains("name"), "{redacted}"); } - #[test] + #[mz_ore::test] fn preserves_quoted_identifier_that_looks_like_a_literal() { // A column named with embedded spaces is double-quoted, not a literal, // and must survive redaction. 
@@ -75,12 +75,12 @@ mod tests { assert!(redacted.contains(r#""my col""#), "{redacted}"); } - #[test] + #[mz_ore::test] fn returns_none_on_parse_error() { assert_eq!(redact("SELEC not valid sql"), None); } - #[test] + #[mz_ore::test] fn redacts_each_statement_in_a_multi_statement_entry() { let redacted = redact("SELECT 'a'; SELECT 'b'").expect("parses"); assert!(!redacted.contains("'a'"), "{redacted}");