Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ members = [
"src/server-core",
"src/service",
"src/sql",
"src/sql-anonymize",
"src/sql-lexer",
"src/sql-parser",
"src/sql-pretty",
Expand Down Expand Up @@ -221,6 +222,7 @@ default-members = [
"src/server-core",
"src/service",
"src/sql",
"src/sql-anonymize",
"src/sql-lexer",
"src/sql-parser",
"src/sql-pretty",
Expand Down
85 changes: 81 additions & 4 deletions misc/python/materialize/cli/mz_workload_anonymize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,61 @@
# by the Apache License, Version 2.0.

import argparse
import json
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import Any

import yaml

from materialize import MZ_ROOT


def _locate_redactor() -> list[str] | None:
"""Locate the mz-sql-anonymize helper binary, if it has been built.

Honors MZ_SQL_ANONYMIZE_BIN, then looks for a release or debug build in the
Cargo target directory. Returns the argv prefix to run it, or None.
"""
override = os.environ.get("MZ_SQL_ANONYMIZE_BIN")
if override and Path(override).exists():
return [override]
for profile in ("release", "debug"):
candidate = MZ_ROOT / "target" / profile / "mz-sql-anonymize"
if candidate.exists():
return [str(candidate)]
return None


def redact_literals_via_parser(sqls: list[str]) -> list[str | None] | None:
    """Redact literals in each SQL string using Materialize's own parser.

    Runs the mz-sql-anonymize helper once for the whole batch: the statements
    are sent as a JSON array on stdin and a JSON array is read back from
    stdout.

    Args:
        sqls: The SQL strings to redact.

    Returns:
        A list aligned with ``sqls``, where each element is the redacted SQL or
        None if that statement could not be parsed. Returns None for the whole
        batch if the helper binary is unavailable, cannot be executed, exits
        with a non-zero status, or replies with anything other than a JSON
        array of ``len(sqls)`` strings/nulls, signaling the caller to fall back
        to regex-based redaction.
    """
    cmd = _locate_redactor()
    if cmd is None:
        return None

    try:
        proc = subprocess.run(
            cmd,
            input=json.dumps(sqls),
            capture_output=True,
            text=True,
            # The helper writes UTF-8 regardless of the local locale.
            encoding="utf-8",
        )
    except (OSError, UnicodeError) as e:
        # OSError: the located path is not runnable (e.g. a directory or a file
        # without the execute bit). UnicodeError: the output was not UTF-8.
        print(
            f"warning: could not run {cmd[0]}, falling back to regex redaction:\n{e}",
            file=sys.stderr,
        )
        return None

    if proc.returncode != 0:
        print(
            f"warning: {cmd[0]} failed, falling back to regex redaction:\n{proc.stderr}",
            file=sys.stderr,
        )
        return None

    try:
        redacted = json.loads(proc.stdout)
    except json.JSONDecodeError as e:
        print(
            f"warning: {cmd[0]} produced invalid JSON, falling back to regex "
            f"redaction:\n{e}",
            file=sys.stderr,
        )
        return None

    # Callers pair the reply with their inputs positionally, so a reply of the
    # wrong shape or length would silently leave some statements unredacted.
    # Treat it as a failure of the whole batch instead.
    well_formed = (
        isinstance(redacted, list)
        and len(redacted) == len(sqls)
        and all(item is None or isinstance(item, str) for item in redacted)
    )
    if not well_formed:
        print(
            f"warning: {cmd[0]} returned an unexpected result for {len(sqls)} "
            "statement(s), falling back to regex redaction",
            file=sys.stderr,
        )
        return None
    return redacted


def keywords() -> set[str]:
with open(MZ_ROOT / "src" / "sql-lexer" / "src" / "keywords.txt") as f:
result = set(
Expand Down Expand Up @@ -55,9 +101,10 @@ def verify_anonymized(

This is a backstop for the heuristic text substitution, not a proof: it
catches whole-word survivals of original identifiers and any single-quoted
literal that was not reduced to a 'literal_N' placeholder. It cannot detect
literal that was not reduced to a placeholder ('<REDACTED>' from the
parser-based path, or 'literal_N' from the regex fallback). It cannot detect
sensitive data hidden in dollar-quoted strings, comments, or numeric
literals, which the anonymizer does not handle.
literals when the regex fallback is in use.

Cluster create_sql is exempt from the literal check: its literals (SIZE,
replication factor, availability zones) are non-sensitive configuration that
Expand All @@ -78,7 +125,7 @@ def verify_anonymized(
identifier_checks.append((original, pattern))

string_literal = re.compile(r"'(?:[^']|'')*'")
placeholder = re.compile(r"^'literal_\d+'$")
placeholder = re.compile(r"^'(?:literal_\d+|<REDACTED>)'$")

for location, sql in _iter_sql(new):
for original, pattern in identifier_checks:
Expand Down Expand Up @@ -331,6 +378,11 @@ def replace_identifiers(d: dict[str, Any], entry: str) -> None:
if args.identifiers:
d[entry] = pattern.sub(lambda m: mapping[m.group(0)], d[entry])

# DDL create_sql is redacted with the blanket regex: option strings like
# connection hosts, sink topics, and source options must be scrubbed, and
# the parser's to_ast_string_redacted() intentionally does NOT redact those
# (it only redacts expression/value literals, treating DDL options as
# config). Query SQL is handled separately, via the parser (see below).
def replace_literals(d: dict[str, Any], entry: str) -> None:
if args.literals:
d[entry] = anonymize_literals_in_sql(d[entry])
Expand Down Expand Up @@ -390,6 +442,7 @@ def replace_literals(d: dict[str, Any], entry: str) -> None:
# Sink create_sql carries topic names, broker lists, and
# bucket/path URLs as string literals; anonymize them.
replace_literals(sink, "create_sql")
query_literal_targets: list[dict[str, Any]] = []
for query in workload["queries"]:
if args.identifiers:
query["cluster"] = mapping.get(query["cluster"], query["cluster"])
Expand All @@ -399,9 +452,33 @@ def replace_literals(d: dict[str, Any], entry: str) -> None:
]
replace_identifiers(query, "sql")
if args.literals:
replace_literals(query, "sql")
query_literal_targets.append(query)
new["queries"].append(query)

# Redact literals in query SQL with Materialize's own parser, in one batch.
# The parser handles every literal form the dialect supports (numbers, hex
# strings, intervals, dollar-quoted and escape strings) where the regex only
# caught single-quoted strings. Fall back to the regex per-statement when
# the helper binary is unavailable or cannot parse a given statement.
if query_literal_targets:
sqls = [q["sql"] for q in query_literal_targets]
redacted = redact_literals_via_parser(sqls)
if redacted is None:
print(
"warning: mz-sql-anonymize helper not found; using regex literal "
"redaction for queries, which misses numbers, dollar-quoted "
"strings, and comments. Build it for exact redaction:\n"
" cargo build --release -p mz-sql-anonymize",
file=sys.stderr,
)
for q in query_literal_targets:
q["sql"] = anonymize_literals_in_sql(q["sql"])
else:
for q, red in zip(query_literal_targets, redacted):
q["sql"] = (
red if red is not None else anonymize_literals_in_sql(q["sql"])
)

if args.verify:
problems = verify_anonymized(new, mapping, args)
if problems:
Expand Down
21 changes: 21 additions & 0 deletions src/sql-anonymize/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Manifest for the mz-sql-anonymize helper binary (see src/main.rs).
[package]
name = "mz-sql-anonymize"
description = "A small CLI that redacts literals from SQL using Materialize's own parser."
version = "0.0.0"
edition.workspace = true
rust-version.workspace = true
publish = false

[lints]
workspace = true

# The crate builds a single executable; there is no library target.
[[bin]]
name = "mz-sql-anonymize"
path = "src/main.rs"

[dependencies]
# Supplies `parser::parse_statements` and the `AstDisplay` trait used for
# redacted rendering.
mz-sql-parser = { path = "../sql-parser", default-features = false }
# Encodes and decodes the JSON arrays exchanged over stdin/stdout.
serde_json.workspace = true

[dev-dependencies]
# Supplies the `#[mz_ore::test]` attribute used by the unit tests.
mz-ore = { path = "../ore", default-features = false, features = ["test"] }
89 changes: 89 additions & 0 deletions src/sql-anonymize/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Copyright Materialize, Inc. and contributors. All rights reserved.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0.

//! Redacts literals from SQL using Materialize's own parser.
//!
//! This is a helper for `bin/mz-workload-anonymize`. Doing literal redaction
//! with the real parser (rather than a regex) handles every literal form the
//! dialect supports — quoted strings with `''` escapes, escape strings,
//! dollar-quoted strings, numbers, hex strings, and intervals — and reuses the
//! exact `'<REDACTED>'` placeholder the rest of Materialize uses to turn
//! "customer data" into "usage data".
//!
//! Protocol: reads a JSON array of SQL strings on stdin and writes a JSON
//! array of the same length on stdout. Each output element is either the
//! redacted SQL or `null` when the input could not be parsed, in which case
//! the caller should fall back to its own redaction for that element.

use std::io::{self, Read, Write};

use mz_sql_parser::ast::display::AstDisplay;
use mz_sql_parser::parser;

/// Returns `sql` with every literal redacted, or `None` if `sql` fails to
/// parse.
///
/// One workload entry may contain several statements. Each parsed statement
/// is rendered in redacted form and the results are joined with `; `.
fn redact(sql: &str) -> Option<String> {
    let statements = match parser::parse_statements(sql) {
        Ok(statements) => statements,
        // The caller decides how to handle unparseable input.
        Err(_) => return None,
    };

    let mut rendered: Vec<String> = Vec::new();
    for statement in statements.iter() {
        rendered.push(statement.ast.to_ast_string_redacted());
    }
    Some(rendered.join("; "))
}

/// Runs the stdin/stdout JSON protocol for the redaction helper.
///
/// Reads all of stdin, decodes it as a JSON array of SQL strings, redacts each
/// element with [`redact`], and writes a JSON array of the same length to
/// stdout. Elements that fail to parse are emitted as `null`.
///
/// # Errors
///
/// Returns an error if stdin cannot be read, if the input is not a JSON array
/// of strings (reported as `InvalidData`), if the result cannot be serialized,
/// or if stdout cannot be written and flushed.
fn main() -> io::Result<()> {
    let mut input = String::new();
    io::stdin().read_to_string(&mut input)?;

    let sqls: Vec<String> =
        serde_json::from_str(&input).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
    let redacted: Vec<Option<String>> = sqls.iter().map(|sql| redact(sql)).collect();

    let out = serde_json::to_string(&redacted)?;
    let mut stdout = io::stdout();
    stdout.write_all(out.as_bytes())?;
    // Flush explicitly. The output has no trailing newline, so part of it can
    // still sit in stdout's buffer, and the implicit flush at process exit
    // discards write errors. Without this, a truncated reply could be reported
    // as success.
    stdout.flush()
}

#[cfg(test)]
mod tests {
    use super::redact;

    /// Redacts `sql`, panicking if it does not parse.
    #[track_caller]
    fn must_redact(sql: &str) -> String {
        redact(sql).expect("parses")
    }

    #[mz_ore::test]
    fn redacts_strings_and_numbers() {
        let redacted = must_redact("SELECT 'secret', 42 FROM t WHERE name = 'alice'");
        // No literal value may survive, whether string or number.
        for leaked in ["secret", "alice", "42"].iter() {
            assert!(!redacted.contains(*leaked), "{redacted}");
        }
        assert!(redacted.contains("<REDACTED>"), "{redacted}");
        // Only literals are redacted here; identifiers stay as written.
        assert!(redacted.contains("name"), "{redacted}");
    }

    #[mz_ore::test]
    fn preserves_quoted_identifier_that_looks_like_a_literal() {
        // Double quotes mark an identifier (here a column name with a space),
        // not a literal, so the text must come through unchanged.
        let redacted = must_redact(r#"SELECT "my col" FROM t"#);
        assert!(redacted.contains(r#""my col""#), "{redacted}");
    }

    #[mz_ore::test]
    fn returns_none_on_parse_error() {
        assert!(redact("SELEC not valid sql").is_none());
    }

    #[mz_ore::test]
    fn redacts_each_statement_in_a_multi_statement_entry() {
        let redacted = must_redact("SELECT 'a'; SELECT 'b'");
        // Both statements must be redacted, not just the first.
        for leaked in ["'a'", "'b'"].iter() {
            assert!(!redacted.contains(*leaked), "{redacted}");
        }
    }
}
22 changes: 17 additions & 5 deletions test/workload-replay/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,19 @@ Anonymizes identifiers and literals in workload captures for sharing without exp
- All identifiers in `create_sql` definitions and queries

*Literals (`--literals`, enabled by default):*
- String literals in SQL → `'literal_1'`, `'literal_2'`, ...
- String default values in table/source/child columns
- String literals in queries
- Query SQL is redacted with Materialize's own parser (`mz-sql-anonymize`),
replacing all literals — strings, numbers, hex strings, intervals — with
`'<REDACTED>'`. If the helper binary is not built or fails, the tool prints a
warning and falls back to a regex that only catches single-quoted strings
(replacing them with `'literal_N'`). Individual statements that the parser
cannot parse fall back to the same regex without a warning.
- `create_sql` strings (including connection hosts/users, sink topics, source
options, and column defaults) → `'literal_1'`, `'literal_2'`, ... via regex.
The parser is not used here because `to_ast_string_redacted()` intentionally
does not redact DDL option strings.

For exact, parser-based query redaction, build the helper once:
```bash
cargo build --release -p mz-sql-anonymize
```

**Usage:**
```bash
Expand All @@ -90,9 +100,11 @@ bin/mz-workload-anonymize <file> [OPTIONS]
**Options:**
| Option | Description | Default |
|--------|-------------|---------|
| `-o, --output` | Path to write output | overwrites input file |
| `-o, --output` | Path to write output (`-` for stdout); required unless `--in-place` | — |
| `--in-place` | Overwrite the input file (destroys the original capture) | off |
| `--identifiers` / `--no-identifiers` | Anonymize object names | enabled |
| `--literals` / `--no-literals` | Anonymize string literals | enabled |
| `--literals` / `--no-literals` | Anonymize literals | enabled |
| `--verify` / `--no-verify` | Re-scan output for leaks and refuse to write if any are found | enabled |

**Examples:**
```bash
Expand Down
Loading