MaterializeInc · jasonhernandez · May 27, 2026 · May 28, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -97,6 +97,7 @@ members = [
     "src/server-core",
     "src/service",
     "src/sql",
+    "src/sql-anonymize",
     "src/sql-lexer",
     "src/sql-parser",
     "src/sql-pretty",
@@ -221,6 +222,7 @@ default-members = [
     "src/server-core",
     "src/service",
     "src/sql",
+    "src/sql-anonymize",
     "src/sql-lexer",
     "src/sql-parser",
     "src/sql-pretty",

diff --git a/misc/python/materialize/cli/mz_workload_anonymize.py b/misc/python/materialize/cli/mz_workload_anonymize.py
@@ -8,15 +8,61 @@
 # by the Apache License, Version 2.0.
 
 import argparse
+import json
+import os
 import re
+import subprocess
 import sys
+from pathlib import Path
 from typing import Any
 
 import yaml
 
 from materialize import MZ_ROOT
 
 
+def _locate_redactor() -> list[str] | None:
+    """Locate the mz-sql-anonymize helper binary, if it has been built.
+
+    Honors MZ_SQL_ANONYMIZE_BIN, then looks for a release or debug build in the
+    Cargo target directory. Returns the argv prefix to run it, or None.
+    """
+    override = os.environ.get("MZ_SQL_ANONYMIZE_BIN")
+    if override and Path(override).exists():
+        return [override]
+    for profile in ("release", "debug"):
+        candidate = MZ_ROOT / "target" / profile / "mz-sql-anonymize"
+        if candidate.exists():
+            return [str(candidate)]
+    return None
+
+
+def redact_literals_via_parser(sqls: list[str]) -> list[str | None] | None:
+    """Redact literals in each SQL string using Materialize's own parser.
+
+    Returns a list aligned with the input, where each element is the redacted
+    SQL or None if that statement could not be parsed. Returns None for the
+    whole batch if the helper binary is unavailable or errors, signaling the
+    caller to fall back to regex-based redaction.
+    """
+    cmd = _locate_redactor()
+    if cmd is None:
+        return None
+    proc = subprocess.run(
+        cmd,
+        input=json.dumps(sqls),
+        capture_output=True,
+        text=True,
+    )
+    if proc.returncode != 0:
+        print(
+            f"warning: {cmd[0]} failed, falling back to regex redaction:\n{proc.stderr}",
+            file=sys.stderr,
+        )
+        return None
+    return json.loads(proc.stdout)
+
+
 def keywords() -> set[str]:
     with open(MZ_ROOT / "src" / "sql-lexer" / "src" / "keywords.txt") as f:
         result = set(
@@ -55,9 +101,10 @@ def verify_anonymized(
 
     This is a backstop for the heuristic text substitution, not a proof: it
     catches whole-word survivals of original identifiers and any single-quoted
-    literal that was not reduced to a 'literal_N' placeholder. It cannot detect
+    literal that was not reduced to a placeholder ('<REDACTED>' from the
+    parser-based path, or 'literal_N' from the regex fallback). It cannot detect
     sensitive data hidden in dollar-quoted strings, comments, or numeric
-    literals, which the anonymizer does not handle.
+    literals when the regex fallback is in use.
 
     Cluster create_sql is exempt from the literal check: its literals (SIZE,
     replication factor, availability zones) are non-sensitive configuration that
@@ -78,7 +125,7 @@ def verify_anonymized(
             identifier_checks.append((original, pattern))
 
     string_literal = re.compile(r"'(?:[^']|'')*'")
-    placeholder = re.compile(r"^'literal_\d+'$")
+    placeholder = re.compile(r"^'(?:literal_\d+|<REDACTED>)'$")
 
     for location, sql in _iter_sql(new):
         for original, pattern in identifier_checks:
@@ -331,6 +378,11 @@ def replace_identifiers(d: dict[str, Any], entry: str) -> None:
         if args.identifiers:
             d[entry] = pattern.sub(lambda m: mapping[m.group(0)], d[entry])
 
+    # DDL create_sql is redacted with the blanket regex: option strings like
+    # connection hosts, sink topics, and source options must be scrubbed, and
+    # the parser's to_ast_string_redacted() intentionally does NOT redact those
+    # (it only redacts expression/value literals, treating DDL options as
+    # config). Query SQL is handled separately, via the parser (see below).
     def replace_literals(d: dict[str, Any], entry: str) -> None:
         if args.literals:
             d[entry] = anonymize_literals_in_sql(d[entry])
@@ -390,6 +442,7 @@ def replace_literals(d: dict[str, Any], entry: str) -> None:
                 # Sink create_sql carries topic names, broker lists, and
                 # bucket/path URLs as string literals; anonymize them.
                 replace_literals(sink, "create_sql")
+    query_literal_targets: list[dict[str, Any]] = []
     for query in workload["queries"]:
         if args.identifiers:
             query["cluster"] = mapping.get(query["cluster"], query["cluster"])
@@ -399,9 +452,33 @@ def replace_literals(d: dict[str, Any], entry: str) -> None:
             ]
             replace_identifiers(query, "sql")
         if args.literals:
-            replace_literals(query, "sql")
+            query_literal_targets.append(query)
         new["queries"].append(query)
 
+    # Redact literals in query SQL with Materialize's own parser, in one batch.
+    # The parser handles every literal form the dialect supports (numbers, hex
+    # strings, intervals, dollar-quoted and escape strings) where the regex only
+    # caught single-quoted strings. Fall back to the regex per-statement when
+    # the helper binary is unavailable or cannot parse a given statement.
+    if query_literal_targets:
+        sqls = [q["sql"] for q in query_literal_targets]
+        redacted = redact_literals_via_parser(sqls)
+        if redacted is None:
+            print(
+                "warning: mz-sql-anonymize helper not found; using regex literal "
+                "redaction for queries, which misses numbers, dollar-quoted "
+                "strings, and comments. Build it for exact redaction:\n"
+                "    cargo build --release -p mz-sql-anonymize",
+                file=sys.stderr,
+            )
+            for q in query_literal_targets:
+                q["sql"] = anonymize_literals_in_sql(q["sql"])
+        else:
+            for q, red in zip(query_literal_targets, redacted):
+                q["sql"] = (
+                    red if red is not None else anonymize_literals_in_sql(q["sql"])
+                )
+
     if args.verify:
         problems = verify_anonymized(new, mapping, args)
         if problems:

diff --git a/src/sql-anonymize/Cargo.toml b/src/sql-anonymize/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "mz-sql-anonymize"
+description = "A small CLI that redacts literals from SQL using Materialize's own parser."
+version = "0.0.0"
+edition.workspace = true
+rust-version.workspace = true
+publish = false
+
+[lints]
+workspace = true
+
+[[bin]]
+name = "mz-sql-anonymize"
+path = "src/main.rs"
+
+[dependencies]
+mz-sql-parser = { path = "../sql-parser", default-features = false }
+serde_json.workspace = true
+
+[dev-dependencies]
+mz-ore = { path = "../ore", default-features = false, features = ["test"] }
diff --git a/src/sql-anonymize/src/main.rs b/src/sql-anonymize/src/main.rs
@@ -0,0 +1,89 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Redacts literals from SQL using Materialize's own parser.
+//!
+//! This is a helper for `bin/mz-workload-anonymize`. Doing literal redaction
+//! with the real parser (rather than a regex) handles every literal form the
+//! dialect supports — quoted strings with `''` escapes, escape strings,
+//! dollar-quoted strings, numbers, hex strings, and intervals — and reuses the
+//! exact `'<REDACTED>'` placeholder the rest of Materialize uses to turn
+//! "customer data" into "usage data".
+//!
+//! Protocol: reads a JSON array of SQL strings on stdin and writes a JSON
+//! array of the same length on stdout. Each output element is either the
+//! redacted SQL or `null` when the input could not be parsed, in which case
+//! the caller should fall back to its own redaction for that element.
+
+use std::io::{self, Read, Write};
+
+use mz_sql_parser::ast::display::AstDisplay;
+use mz_sql_parser::parser;
+
+/// Returns `sql` with every literal redacted, or `None` when `sql` fails to
+/// parse.
+///
+/// A single workload entry may contain several statements; each parsed
+/// statement is rendered in redacted form and the results are joined with
+/// `; `.
+fn redact(sql: &str) -> Option<String> {
+    let Ok(parsed) = parser::parse_statements(sql) else {
+        return None;
+    };
+    let mut out = String::new();
+    for (idx, stmt) in parsed.iter().enumerate() {
+        // Separator goes between statements only, never before the first.
+        if idx > 0 {
+            out.push_str("; ");
+        }
+        out.push_str(&stmt.ast.to_ast_string_redacted());
+    }
+    Some(out)
+}
+
+/// Reads a JSON array of SQL strings from stdin and writes a JSON array of
+/// the same length to stdout, where each element is the redacted SQL or
+/// `null` if that input did not parse.
+///
+/// Returns an error (and therefore a non-zero exit status) if stdin cannot be
+/// read, the input is not a JSON array of strings, or stdout cannot be
+/// written.
+fn main() -> io::Result<()> {
+    let mut input = String::new();
+    io::stdin().read_to_string(&mut input)?;
+
+    let sqls: Vec<String> =
+        serde_json::from_str(&input).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+    let redacted: Vec<Option<String>> = sqls.iter().map(|sql| redact(sql)).collect();
+
+    let out = serde_json::to_string(&redacted)?;
+    let mut stdout = io::stdout().lock();
+    stdout.write_all(out.as_bytes())?;
+    // The output has no trailing newline, so it may still sit in stdout's
+    // buffer here. Flush explicitly so a write failure (e.g. a closed pipe)
+    // is reported through the exit status rather than lost at process exit,
+    // where the caller would see success with truncated JSON.
+    stdout.flush()?;
+    Ok(())
+}
+
+// Unit tests for `redact`. They assert on substrings of the redacted output
+// rather than on its exact text, so they do not pin the parser's formatting.
+#[cfg(test)]
+mod tests {
+    use super::redact;
+
+    // String and numeric literals must both disappear from the output, and
+    // the `<REDACTED>` placeholder must appear in their place.
+    #[mz_ore::test]
+    fn redacts_strings_and_numbers() {
+        let redacted = redact("SELECT 'secret', 42 FROM t WHERE name = 'alice'").expect("parses");
+        assert!(!redacted.contains("secret"), "{redacted}");
+        assert!(!redacted.contains("alice"), "{redacted}");
+        assert!(!redacted.contains("42"), "{redacted}");
+        assert!(redacted.contains("<REDACTED>"), "{redacted}");
+        // Identifiers must be preserved; only literals are redacted here.
+        assert!(redacted.contains("name"), "{redacted}");
+    }
+
+    // Double-quoted identifiers are not literals and must pass through.
+    #[mz_ore::test]
+    fn preserves_quoted_identifier_that_looks_like_a_literal() {
+        // A column named with embedded spaces is double-quoted, not a literal,
+        // and must survive redaction.
+        let redacted = redact(r#"SELECT "my col" FROM t"#).expect("parses");
+        assert!(redacted.contains(r#""my col""#), "{redacted}");
+    }
+
+    // Unparseable input yields `None`, which tells the caller to fall back to
+    // its own redaction for that entry.
+    #[mz_ore::test]
+    fn returns_none_on_parse_error() {
+        assert_eq!(redact("SELEC not valid sql"), None);
+    }
+
+    // Every statement in a `;`-separated entry is redacted, not just the
+    // first one.
+    #[mz_ore::test]
+    fn redacts_each_statement_in_a_multi_statement_entry() {
+        let redacted = redact("SELECT 'a'; SELECT 'b'").expect("parses");
+        assert!(!redacted.contains("'a'"), "{redacted}");
+        assert!(!redacted.contains("'b'"), "{redacted}");
+    }
+}
diff --git a/test/workload-replay/README.md b/test/workload-replay/README.md
@@ -78,9 +78,19 @@ Anonymizes identifiers and literals in workload captures for sharing without exp
 - All identifiers in `create_sql` definitions and queries
 
 *Literals (`--literals`, enabled by default):*
-- String literals in SQL → `'literal_1'`, `'literal_2'`, ...
-- String default values in table/source/child columns
-- String literals in queries
+- Query SQL is redacted with Materialize's own parser (`mz-sql-anonymize`),
+  replacing all literals — strings, numbers, hex strings, intervals — with
+  `'<REDACTED>'`. If the helper binary is not built or fails, the tool prints
+  a warning and falls back to a regex that only catches single-quoted strings.
+  The same regex is used for any individual query the parser cannot parse.
+- `create_sql` strings (including connection hosts/users, sink topics, source
+  options, and column defaults) → `'literal_1'`, `'literal_2'`, ... via regex.
+  The parser is not used here because `to_ast_string_redacted()` intentionally
+  does not redact DDL option strings.
+
+For exact, parser-based query redaction, build the helper once:
+```bash
+cargo build --release -p mz-sql-anonymize
+```
 
 **Usage:**
 ```bash
@@ -90,9 +100,11 @@ bin/mz-workload-anonymize <file> [OPTIONS]
 **Options:**
 | Option | Description | Default |
 |--------|-------------|---------|
-| `-o, --output` | Path to write output | overwrites input file |
+| `-o, --output` | Path to write output (`-` for stdout); required unless `--in-place` | — |
+| `--in-place` | Overwrite the input file (destroys the original capture) | off |
 | `--identifiers` / `--no-identifiers` | Anonymize object names | enabled |
-| `--literals` / `--no-literals` | Anonymize string literals | enabled |
+| `--literals` / `--no-literals` | Anonymize literals | enabled |
+| `--verify` / `--no-verify` | Re-scan output for leaks and refuse to write if any are found | enabled |
 
 **Examples:**
 ```bash