From e22d2cdba2cc152b6eb56423fd081b85ac729e66 Mon Sep 17 00:00:00 2001
From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com>
Date: Wed, 27 May 2026 08:40:08 -0700
Subject: [PATCH 1/2] workload-replay: redact query literals with the Mz parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The regex literal redaction only handles single-quoted strings, so numeric
literals (account numbers, SSNs, ids), dollar-quoted strings, and escape
strings in query predicates were emitted verbatim. Use Materialize's own
parser instead, which handles every literal form the dialect supports.

Add `mz-sql-anonymize`, a small CLI that reads a JSON array of SQL strings
on stdin and writes back each statement run through
`to_ast_string_redacted()` (or null when it does not parse). It depends only
on the standalone `mz-sql-parser` crate. The anonymizer locates the built
binary (via MZ_SQL_ANONYMIZE_BIN or target/{release,debug}) and redacts all
query SQL in one batch. When the binary is unavailable or fails, it falls back
to the regex for the whole batch and prints a warning that points at
`cargo build --release -p mz-sql-anonymize`. When an individual statement does
not parse, only that statement falls back to the regex, without a warning.

Scope: only query SQL goes through the parser. DDL create_sql keeps the
blanket regex, because `to_ast_string_redacted()` deliberately does not
redact DDL option strings (connection hosts/users, sink topics, source
options) — routing those through the parser would regress the connection and
sink leak fix from the parent commit. The verify pass accepts both the
parser's `'<REDACTED>'` and the regex's `'literal_N'` placeholders.

This addresses the "wrap the Mz parser" TODO for literals. Identifier
renaming still uses the regex: the parser alone cannot resolve which object a
bare name refers to (it has no catalog), so scoped renaming remains future
work.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                                    |  8 ++
 Cargo.toml                                    |  2 +
 .../materialize/cli/mz_workload_anonymize.py  | 85 +++++++++++++++++-
 src/sql-anonymize/Cargo.toml                  | 18 ++++
 src/sql-anonymize/src/main.rs                 | 89 +++++++++++++++++++
 test/workload-replay/README.md                | 22 +++--
 6 files changed, 215 insertions(+), 9 deletions(-)
 create mode 100644 src/sql-anonymize/Cargo.toml
 create mode 100644 src/sql-anonymize/src/main.rs
diff --git a/Cargo.lock b/Cargo.lock
index a20cca5d2f06b..bf6d8cd462823 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7809,6 +7809,14 @@ dependencies = [
  "version-compare",
 ]
 
+[[package]]
+name = "mz-sql-anonymize"
+version = "0.0.0"
+dependencies = [
+ "mz-sql-parser",
+ "serde_json",
+]
+
 [[package]]
 name = "mz-sql-lexer"
 version = "0.0.0"
diff --git a/Cargo.toml b/Cargo.toml
index c85279ae3cac7..746a233f6c1d4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -97,6 +97,7 @@ members = [
     "src/server-core",
     "src/service",
     "src/sql",
+    "src/sql-anonymize",
     "src/sql-lexer",
     "src/sql-parser",
     "src/sql-pretty",
@@ -221,6 +222,7 @@ default-members = [
     "src/server-core",
     "src/service",
     "src/sql",
+    "src/sql-anonymize",
     "src/sql-lexer",
     "src/sql-parser",
     "src/sql-pretty",
diff --git a/misc/python/materialize/cli/mz_workload_anonymize.py b/misc/python/materialize/cli/mz_workload_anonymize.py
index 7ff7afb234b4a..d1516b4bcdf45 100644
--- a/misc/python/materialize/cli/mz_workload_anonymize.py
+++ b/misc/python/materialize/cli/mz_workload_anonymize.py
@@ -8,8 +8,12 @@
 # by the Apache License, Version 2.0.
 
 import argparse
+import json
+import os
 import re
+import subprocess
 import sys
+from pathlib import Path
 from typing import Any
 
 import yaml
@@ -17,6 +21,48 @@
 from materialize import MZ_ROOT
 
 
+def _locate_redactor() -> list[str] | None:
+    """Locate the mz-sql-anonymize helper binary, if it has been built.
+
+    Honors MZ_SQL_ANONYMIZE_BIN, then looks for a release or debug build in the
+    Cargo target directory. Returns the argv prefix to run it, or None.
+    """
+    override = os.environ.get("MZ_SQL_ANONYMIZE_BIN")
+    if override and Path(override).exists():
+        return [override]
+    for profile in ("release", "debug"):
+        candidate = MZ_ROOT / "target" / profile / "mz-sql-anonymize"
+        if candidate.exists():
+            return [str(candidate)]
+    return None
+
+
+def redact_literals_via_parser(sqls: list[str]) -> list[str | None] | None:
+    """Redact literals in each SQL string using Materialize's own parser.
+
+    Returns a list aligned with the input, where each element is the redacted
+    SQL or None if that statement could not be parsed. Returns None for the
+    whole batch if the helper binary is unavailable or errors, signaling the
+    caller to fall back to regex-based redaction.
+    """
+    cmd = _locate_redactor()
+    if cmd is None:
+        return None
+    proc = subprocess.run(
+        cmd,
+        input=json.dumps(sqls),
+        capture_output=True,
+        text=True,
+    )
+    if proc.returncode != 0:
+        print(
+            f"warning: {cmd[0]} failed, falling back to regex redaction:\n{proc.stderr}",
+            file=sys.stderr,
+        )
+        return None
+    return json.loads(proc.stdout)
+
+
 def keywords() -> set[str]:
     with open(MZ_ROOT / "src" / "sql-lexer" / "src" / "keywords.txt") as f:
         result = set(
@@ -55,9 +101,10 @@ def verify_anonymized(
 
     This is a backstop for the heuristic text substitution, not a proof: it
     catches whole-word survivals of original identifiers and any single-quoted
-    literal that was not reduced to a 'literal_N' placeholder. It cannot detect
+    literal that was not reduced to a placeholder ('<REDACTED>' from the
+    parser-based path, or 'literal_N' from the regex fallback). It cannot detect
     sensitive data hidden in dollar-quoted strings, comments, or numeric
-    literals, which the anonymizer does not handle.
+    literals when the regex fallback is in use.
 
     Cluster create_sql is exempt from the literal check: its literals (SIZE,
     replication factor, availability zones) are non-sensitive configuration that
@@ -78,7 +125,7 @@ def verify_anonymized(
             identifier_checks.append((original, pattern))
 
     string_literal = re.compile(r"'(?:[^']|'')*'")
-    placeholder = re.compile(r"^'literal_\d+'$")
+    placeholder = re.compile(r"^'(?:literal_\d+|<REDACTED>)'$")
 
     for location, sql in _iter_sql(new):
         for original, pattern in identifier_checks:
@@ -331,6 +378,11 @@ def replace_identifiers(d: dict[str, Any], entry: str) -> None:
         if args.identifiers:
             d[entry] = pattern.sub(lambda m: mapping[m.group(0)], d[entry])
 
+    # DDL create_sql is redacted with the blanket regex: option strings like
+    # connection hosts, sink topics, and source options must be scrubbed, and
+    # the parser's to_ast_string_redacted() intentionally does NOT redact those
+    # (it only redacts expression/value literals, treating DDL options as
+    # config). Query SQL is handled separately, via the parser (see below).
     def replace_literals(d: dict[str, Any], entry: str) -> None:
         if args.literals:
             d[entry] = anonymize_literals_in_sql(d[entry])
@@ -390,6 +442,7 @@ def replace_literals(d: dict[str, Any], entry: str) -> None:
                 # Sink create_sql carries topic names, broker lists, and
                 # bucket/path URLs as string literals; anonymize them.
                 replace_literals(sink, "create_sql")
+    query_literal_targets: list[dict[str, Any]] = []
     for query in workload["queries"]:
         if args.identifiers:
             query["cluster"] = mapping.get(query["cluster"], query["cluster"])
@@ -399,9 +452,33 @@ def replace_literals(d: dict[str, Any], entry: str) -> None:
             ]
             replace_identifiers(query, "sql")
         if args.literals:
-            replace_literals(query, "sql")
+            query_literal_targets.append(query)
         new["queries"].append(query)
 
+    # Redact literals in query SQL with Materialize's own parser, in one batch.
+    # The parser handles every literal form the dialect supports (numbers, hex
+    # strings, intervals, dollar-quoted and escape strings) where the regex only
+    # caught single-quoted strings. Fall back to the regex per-statement when
+    # the helper binary is unavailable or cannot parse a given statement.
+    if query_literal_targets:
+        sqls = [q["sql"] for q in query_literal_targets]
+        redacted = redact_literals_via_parser(sqls)
+        if redacted is None:
+            print(
+                "warning: mz-sql-anonymize helper not found; using regex literal "
+                "redaction for queries, which misses numbers, dollar-quoted "
+                "strings, and comments. Build it for exact redaction:\n"
+                "    cargo build --release -p mz-sql-anonymize",
+                file=sys.stderr,
+            )
+            for q in query_literal_targets:
+                q["sql"] = anonymize_literals_in_sql(q["sql"])
+        else:
+            for q, red in zip(query_literal_targets, redacted):
+                q["sql"] = (
+                    red if red is not None else anonymize_literals_in_sql(q["sql"])
+                )
+
     if args.verify:
         problems = verify_anonymized(new, mapping, args)
         if problems:
diff --git a/src/sql-anonymize/Cargo.toml b/src/sql-anonymize/Cargo.toml
new file mode 100644
index 0000000000000..b6ba6ac17cb64
--- /dev/null
+++ b/src/sql-anonymize/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "mz-sql-anonymize"
+description = "A small CLI that redacts literals from SQL using Materialize's own parser."
+version = "0.0.0"
+edition.workspace = true
+rust-version.workspace = true
+publish = false
+
+[lints]
+workspace = true
+
+[[bin]]
+name = "mz-sql-anonymize"
+path = "src/main.rs"
+
+[dependencies]
+mz-sql-parser = { path = "../sql-parser", default-features = false }
+serde_json.workspace = true
diff --git a/src/sql-anonymize/src/main.rs b/src/sql-anonymize/src/main.rs
new file mode 100644
index 0000000000000..a448dc158d7aa
--- /dev/null
+++ b/src/sql-anonymize/src/main.rs
@@ -0,0 +1,89 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Redacts literals from SQL using Materialize's own parser.
+//!
+//! This is a helper for `bin/mz-workload-anonymize`. Doing literal redaction
+//! with the real parser (rather than a regex) handles every literal form the
+//! dialect supports — quoted strings with `''` escapes, escape strings,
+//! dollar-quoted strings, numbers, hex strings, and intervals — and reuses the
+//! exact `'<REDACTED>'` placeholder the rest of Materialize uses to turn
+//! "customer data" into "usage data".
+//!
+//! Protocol: reads a JSON array of SQL strings on stdin and writes a JSON
+//! array of the same length on stdout. Each output element is either the
+//! redacted SQL or `null` when the input could not be parsed, in which case
+//! the caller should fall back to its own redaction for that element.
+
+use std::io::{self, Read, Write};
+
+use mz_sql_parser::ast::display::AstDisplay;
+use mz_sql_parser::parser;
+
+/// Returns `sql` with every literal redacted, or `None` when parsing fails.
+///
+/// One workload entry can contain several statements; each parsed statement
+/// is rendered in redacted form and the results are joined with `; `.
+fn redact(sql: &str) -> Option<String> {
+    let parsed = parser::parse_statements(sql).ok()?;
+    let mut pieces = Vec::with_capacity(parsed.len());
+    for stmt in &parsed {
+        pieces.push(stmt.ast.to_ast_string_redacted());
+    }
+    Some(pieces.join("; "))
+}
+
+fn main() -> io::Result<()> {
+    // Protocol: JSON array of SQL strings on stdin, JSON array (string or
+    // null per input, same order) on stdout; any error exits non-zero.
+    let mut input = String::new();
+    io::stdin().read_to_string(&mut input)?;
+    // `serde_json::Error` converts into `io::Error`, so `?` suffices here.
+    let sqls: Vec<String> = serde_json::from_str(&input)?;
+    let redacted: Vec<Option<String>> = sqls.iter().map(|sql| redact(sql)).collect();
+    let mut stdout = io::stdout().lock();
+    serde_json::to_writer(&mut stdout, &redacted)?;
+    stdout.flush() // explicit, so a failed write is reported, not dropped at exit
+}
+
+#[cfg(test)]
+mod tests {
+    use super::redact;
+
+    #[test]
+    fn redacts_strings_and_numbers() {
+        let redacted = redact("SELECT 'secret', 42 FROM t WHERE name = 'alice'").expect("parses");
+        assert!(!redacted.contains("secret"), "{redacted}");
+        assert!(!redacted.contains("alice"), "{redacted}");
+        assert!(!redacted.contains("42"), "{redacted}");
+        assert!(redacted.contains("<REDACTED>"), "{redacted}");
+        // Identifiers must be preserved; only literals are redacted here.
+        assert!(redacted.contains("name"), "{redacted}");
+    }
+
+    #[test]
+    fn preserves_quoted_identifier_that_looks_like_a_literal() {
+        // A column named with embedded spaces is double-quoted, not a literal,
+        // and must survive redaction.
+        let redacted = redact(r#"SELECT "my col" FROM t"#).expect("parses");
+        assert!(redacted.contains(r#""my col""#), "{redacted}");
+    }
+
+    #[test]
+    fn returns_none_on_parse_error() {
+        assert_eq!(redact("SELEC not valid sql"), None);
+    }
+
+    #[test]
+    fn redacts_each_statement_in_a_multi_statement_entry() {
+        let redacted = redact("SELECT 'a'; SELECT 'b'").expect("parses");
+        assert!(!redacted.contains("'a'"), "{redacted}");
+        assert!(!redacted.contains("'b'"), "{redacted}");
+    }
+}
diff --git a/test/workload-replay/README.md b/test/workload-replay/README.md
index c0adccb4cb649..02dbc98a54af3 100644
--- a/test/workload-replay/README.md
+++ b/test/workload-replay/README.md
@@ -78,9 +78,19 @@ Anonymizes identifiers and literals in workload captures for sharing without exp
 - All identifiers in `create_sql` definitions and queries
 
 *Literals (`--literals`, enabled by default):*
-- String literals in SQL → `'literal_1'`, `'literal_2'`, ...
-- String default values in table/source/child columns
-- String literals in queries
+- Query SQL is redacted with Materialize's own parser (`mz-sql-anonymize`),
+  replacing all literals — strings, numbers, hex strings, intervals — with
+  `'<REDACTED>'`. If the helper is not built (a warning is printed) or a query
+  does not parse, a regex that only catches single-quoted strings is used.
+- `create_sql` strings (including connection hosts/users, sink topics, source
+  options, and column defaults) → `'literal_1'`, `'literal_2'`, ... via regex.
+  The parser is not used here because `to_ast_string_redacted()` intentionally
+  does not redact DDL option strings.
+
+For exact, parser-based query redaction, build the helper once:
+```bash
+cargo build --release -p mz-sql-anonymize
+```
 
 **Usage:**
 ```bash
@@ -90,9 +100,11 @@ bin/mz-workload-anonymize <file> [OPTIONS]
 **Options:**
 | Option | Description | Default |
 |--------|-------------|---------|
-| `-o, --output` | Path to write output | overwrites input file |
+| `-o, --output` | Path to write output (`-` for stdout); required unless `--in-place` | — |
+| `--in-place` | Overwrite the input file (destroys the original capture) | off |
 | `--identifiers` / `--no-identifiers` | Anonymize object names | enabled |
-| `--literals` / `--no-literals` | Anonymize string literals | enabled |
+| `--literals` / `--no-literals` | Anonymize literals | enabled |
+| `--verify` / `--no-verify` | Re-scan output for leaks and refuse to write if any are found | enabled |
 
 **Examples:**
 ```bash

From cfd23c8863410a0b321f948065e687b7fc934cfd Mon Sep 17 00:00:00 2001
From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com>
Date: Wed, 27 May 2026 22:58:24 -0700
Subject: [PATCH 2/2] sql-anonymize: use #[mz_ore::test] for the lint

CI's check-rust-test-attributes.sh rejects plain #[test]. Add mz-ore as a
dev-dependency with the 'test' feature (matching sql-parser's pattern) and
switch the unit tests over.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                    | 1 +
 src/sql-anonymize/Cargo.toml  | 3 +++
 src/sql-anonymize/src/main.rs | 8 ++++----
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bf6d8cd462823..4c9e8217ac32b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7813,6 +7813,7 @@ dependencies = [
 name = "mz-sql-anonymize"
 version = "0.0.0"
 dependencies = [
+ "mz-ore",
  "mz-sql-parser",
  "serde_json",
 ]
diff --git a/src/sql-anonymize/Cargo.toml b/src/sql-anonymize/Cargo.toml
index b6ba6ac17cb64..e4eee1c042a93 100644
--- a/src/sql-anonymize/Cargo.toml
+++ b/src/sql-anonymize/Cargo.toml
@@ -16,3 +16,6 @@ path = "src/main.rs"
 [dependencies]
 mz-sql-parser = { path = "../sql-parser", default-features = false }
 serde_json.workspace = true
+
+[dev-dependencies]
+mz-ore = { path = "../ore", default-features = false, features = ["test"] }
diff --git a/src/sql-anonymize/src/main.rs b/src/sql-anonymize/src/main.rs
index a448dc158d7aa..e31cfbd36ab6f 100644
--- a/src/sql-anonymize/src/main.rs
+++ b/src/sql-anonymize/src/main.rs
@@ -56,7 +56,7 @@ fn main() -> io::Result<()> {
 mod tests {
     use super::redact;
 
-    #[test]
+    #[mz_ore::test]
     fn redacts_strings_and_numbers() {
         let redacted = redact("SELECT 'secret', 42 FROM t WHERE name = 'alice'").expect("parses");
         assert!(!redacted.contains("secret"), "{redacted}");
@@ -67,7 +67,7 @@ mod tests {
         assert!(redacted.contains("name"), "{redacted}");
     }
 
-    #[test]
+    #[mz_ore::test]
     fn preserves_quoted_identifier_that_looks_like_a_literal() {
         // A column named with embedded spaces is double-quoted, not a literal,
         // and must survive redaction.
@@ -75,12 +75,12 @@ mod tests {
         assert!(redacted.contains(r#""my col""#), "{redacted}");
     }
 
-    #[test]
+    #[mz_ore::test]
     fn returns_none_on_parse_error() {
         assert_eq!(redact("SELEC not valid sql"), None);
     }
 
-    #[test]
+    #[mz_ore::test]
     fn redacts_each_statement_in_a_multi_statement_entry() {
         let redacted = redact("SELECT 'a'; SELECT 'b'").expect("parses");
         assert!(!redacted.contains("'a'"), "{redacted}");