diff --git a/README.md b/README.md index ddf4377..9f4c9a4 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ sanitizeData(input); - [Parse JSON strings](#parse-json-strings) - [Remove fields instead of masking](#remove-fields-instead-of-masking) - [Sanitize PII and PHI with custom patterns](#sanitize-pii-and-phi-with-custom-patterns) + - [Sanitize Maps and Sets](#sanitize-maps-and-sets) - [Options](#options) - [Default patterns](#default-patterns) - [Default matchers](#default-matchers) @@ -224,19 +225,58 @@ sanitizeData(patient, { // => { accountId: 'acct_123' } ``` +### Sanitize Maps and Sets + +Enable `sanitizeCollections: true` to traverse `Map` and `Set` instances. +Each collection is sanitized and returned as a new instance — the original +is never mutated. + +```typescript +const session = new Map([ + ['token', 'abc123'], + ['username', 'mark'], +]); + +sanitizeData({ session }, { sanitizeCollections: true }); +// => { session: Map { 'token' => '**********', 'username' => 'mark' } } +``` + +```typescript +const tags = new Set(['api_key=hunter2', 'env=production']); + +sanitizeData({ tags }, { sanitizeCollections: true }); +// => { tags: Set { 'api_key=**********', 'env=production' } } +``` + +> [!TIP] +> `Map` and `Set` are not JSON-serializable by default — `JSON.stringify` turns +> both into `{}`, silently dropping their contents. 
To include them in structured logs, spread them first: +> +> ```typescript +> // Map with string keys → plain object +> JSON.stringify(Object.fromEntries(sanitizedMap)); +> +> // Map with mixed or object keys → entries array +> JSON.stringify([...sanitizedMap.entries()]); +> +> // Set → array +> JSON.stringify([...sanitizedSet]); +> ``` + ## Options -| Option | Type | Default | Description | -| -------------------- | --------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `patternMask` | `string` | `**********` | String used to replace matched string field values | -| `numericMask` | `number` | `9999999999` | Number used to replace matched number field values | -| `removeMatches` | `boolean` | `false` | Remove matched fields entirely instead of masking | -| `scanStringValues` | `boolean` | `true` | Scan string values on non-sensitive keys for embedded patterns. Applies to object input and to string input when `parseJsonStrings` is enabled; has no effect on raw string input. | -| `parseJsonStrings` | `boolean` | `false` | Parse valid JSON string inputs as structured data and sanitize by field name. Re-serializes with `JSON.stringify`, discarding original whitespace. | -| `customPatterns` | `string[]` | `[]` | Additional field name patterns to match | -| `customMatchers` | `DataSanitizationMatcher[]` | `[]` | Additional regex matchers for custom string formats | -| `useDefaultPatterns` | `boolean` | `true` | Set to `false` to use only your custom patterns, ignoring the built-in defaults. | -| `useDefaultMatchers` | `boolean` | `true` | Set to `false` to use only your custom matchers, ignoring the built-in defaults. 
| +| Option | Type | Default | Description | +| --------------------- | --------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `patternMask` | `string` | `**********` | String used to replace matched string field values | +| `numericMask` | `number` | `9999999999` | Number used to replace matched number field values | +| `removeMatches` | `boolean` | `false` | Remove matched fields entirely instead of masking | +| `sanitizeCollections` | `boolean` | `false` | Sanitize `Map` and `Set` instances by traversing their entries and returning a new sanitized copy. When false, these pass through unchanged like other non-plain object instances. | +| `scanStringValues` | `boolean` | `true` | Scan string values on non-sensitive keys for embedded patterns. Applies to object input and to string input when `parseJsonStrings` is enabled; has no effect on raw string input. | +| `parseJsonStrings` | `boolean` | `false` | Parse valid JSON string inputs as structured data and sanitize by field name. Re-serializes with `JSON.stringify`, discarding original whitespace. | +| `customPatterns` | `string[]` | `[]` | Additional field name patterns to match | +| `customMatchers` | `DataSanitizationMatcher[]` | `[]` | Additional regex matchers for custom string formats | +| `useDefaultPatterns` | `boolean` | `true` | Set to `false` to use only your custom patterns, ignoring the built-in defaults. | +| `useDefaultMatchers` | `boolean` | `true` | Set to `false` to use only your custom matchers, ignoring the built-in defaults. | ## Default patterns @@ -359,13 +399,19 @@ original input payload. their values are strings, numbers, arrays, objects, or other primitives. 3. **Plain nested objects and arrays** are cloned as they are sanitized. 
Non-plain object instances are preserved without modification to avoid - corrupting their prototypes. -4. **Null input** is accepted and returns `null`. -5. **For object input**, each pattern is matched case-insensitively against key + corrupting their prototypes. Enable `sanitizeCollections: true` to instead + traverse `Map` and `Set` instances, producing a new sanitized copy. +4. **Object property names and Map string keys** are used for pattern matching + but are not themselves sanitized. If a property name or string Map key + happens to contain sensitive data, it will appear unsanitized in the output. + Map keys that are objects are recursed into and sanitized like any other + nested object. +5. **Null input** is accepted and returns `null`. +6. **For object input**, each pattern is matched case-insensitively against key names. By default (`scanStringValues: true`), string values on non-sensitive keys are also scanned, which catches credentials embedded in log messages or other free-text fields. -6. **For string input**, each pattern is tested against each matcher to find and +7. **For string input**, each pattern is tested against each matcher to find and replace sensitive values in the raw string directly. ## Performance @@ -446,8 +492,8 @@ yarn bench Bug reports and pull requests are welcome. Open an issue or PR on [GitHub](https://github.com/ioncache/data-sanitization). -See [docs/development.md](docs/development.md) for setup, build, test, and -release instructions, and [docs/ROADMAP.md](docs/ROADMAP.md) for planned work. +See [docs/development.md](docs/development.md) for setup, build, and test +instructions, and [docs/ROADMAP.md](docs/ROADMAP.md) for planned work. 
## License diff --git a/bench/sanitize-data.bench.ts b/bench/sanitize-data.bench.ts index 209960d..c00b789 100644 --- a/bench/sanitize-data.bench.ts +++ b/bench/sanitize-data.bench.ts @@ -21,6 +21,7 @@ * - Object with an array-of-strings field (per-item string scan) * - String input variants (form-encoded, escaped JSON) * - Deeply nested objects with many non-sensitive strings per level + * - Map and Set with sanitizeCollections: true */ import { bench, describe } from 'vitest'; @@ -637,6 +638,48 @@ describe('sanitizeData — deeply nested, many non-sensitive strings (5 × 10 fi }); }); +// --------------------------------------------------------------------------- +// Map — shallow (1 sensitive key, 3 safe keys) +// Comparable to the shallow object benchmark above. +// --------------------------------------------------------------------------- + +describe('sanitizeData — Map, shallow (1 sensitive key)', () => { + const input = { + session: new Map([ + ['api_key', SENSITIVE_STRING_VALUE], + ['email', 'user@example.com'], + ['region', 'us-east-1'], + ['username', 'mark'], + ]), + }; + + bench('sanitizeCollections enabled', () => { + sanitizeData(input, { sanitizeCollections: true }); + }); + bench('sanitizeCollections enabled, scanStringValues disabled', () => { + sanitizeData(input, { sanitizeCollections: true, scanStringValues: false }); + }); +}); + +// --------------------------------------------------------------------------- +// Set — small (1 embedded-pattern string, 2 clean strings) +// Comparable to the array-of-strings benchmark above. 
+// --------------------------------------------------------------------------- + +describe('sanitizeData — Set, small string values (1 embedded pattern)', () => { + const input = { + tags: new Set([ + `api_key=${SENSITIVE_STRING_VALUE}`, + 'env=production', + 'region=us-east-1', + ]), + }; + + bench('sanitizeCollections enabled', () => { + sanitizeData(input, { sanitizeCollections: true }); + }); +}); + // --------------------------------------------------------------------------- // parseJsonStrings: true vs false — string input containing JSON // --------------------------------------------------------------------------- diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 292860d..e590d71 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -144,8 +144,8 @@ Current behavior: non-plain objects (custom prototypes) pass through untouched. | Type | Difficulty | Value | Breaking Change | Notes | | ----------------- | ---------- | ------ | --------------------- | ---------------------------------------------------- | -| Map | Medium | High | No (additive, opt-in) | v1.x candidate; traverse entries behind opt-in flag | -| Set | Low | Low | No (additive, opt-in) | Consider alongside Map; limited real-world demand | +| Map | — | — | No (additive, opt-in) | Implemented; enabled via `sanitizeCollections: true` | +| Set | — | — | No (additive, opt-in) | Implemented; enabled via `sanitizeCollections: true` | | Date | None | None | No | Already preserved correctly; no work needed | | TypedArrays | None | None | No | Already preserved correctly; no work needed | | Class instances | High | Medium | Yes (without opt-in) | v2 candidate; needs opt-in flag and prototype safety | @@ -153,20 +153,17 @@ Current behavior: non-plain objects (custom prototypes) pass through untouched. ### Collect Usage Signals -Collect real v1.x usage signals before planning Map/Set or class instance -support. +Collect real-world signals before committing to class instance support. 
Concrete evidence of user impact should drive timing. Signals to watch for: -- GitHub issues where users report Map, Set, or class instance values passing - through without sanitization unexpectedly -- Feature requests for Map or Set traversal +- GitHub issues where users report class instance values passing through without + sanitization unexpectedly - Repeated questions about why non-plain objects are not sanitized To collect: monitor the GitHub Issues tracker; periodically search open and - closed issues for keywords like `Map`, `Set`, `class`, `instance`, or - `non-plain`. A cluster of independent requests for the same type is the - clearest signal that v1.x work on that type is justified. + closed issues for keywords like `class`, `instance`, or `non-plain`. A cluster + of independent requests is the clearest signal that the work is justified. ## Planning Workflow diff --git a/docs/plans/012-map-set-sanitization.md b/docs/plans/012-map-set-sanitization.md new file mode 100644 index 0000000..08b9695 --- /dev/null +++ b/docs/plans/012-map-set-sanitization.md @@ -0,0 +1,82 @@ +# Map and Set Sanitization + +## Approach + +Add opt-in sanitization support for `Map` and `Set` instances via a new +`sanitizeCollections` option (default `false`). When enabled, `objectReplacer` +traverses each collection and returns a new sanitized copy rather than passing +the original through unchanged. For `Map`, string keys are matched against the +active field-name patterns and their values are masked or removed accordingly; +object keys are recursed into and sanitized like any other nested object. String +keys themselves are not sanitized, matching the existing behaviour for plain +object property names. For `Set`, each value is recursed into and sanitized. +Defaulting to `false` preserves the current pass-through behaviour for all +non-plain object instances. + +## Steps + +1. 
Add `sanitizeCollections?: boolean` to `DataSanitizationReplacerOptions` in + `src/types.ts`, between `removeMatches` and `scanStringValues`. + +2. Destructure `sanitizeCollections = false` in `objectReplacer` in + `src/replacers.ts` and add a `Map` branch in `sanitizeValue` between the + array check and the non-plain-object pass-through. The branch builds a new + `Map` from sanitized entries: string keys are tested against `keyMatchers` to + decide masking; object keys are sanitized recursively via `sanitizeValue`. + +3. Add a `Set` branch in `sanitizeValue` in `src/replacers.ts`, immediately + after the `Map` branch. The branch iterates each item through `sanitizeValue` + and returns a new `Set`. + +4. Add `describe('Map sanitization')` and `describe('Set sanitization')` blocks + inside the `objectReplacer` describe block in `test/replacers.test.ts`. + Cover: pass-through when option is off, new instance returned, string-key + masking, numeric masking, `removeMatches`, embedded-pattern scanning, nested + object value recursion, object key recursion, nested `Map`-in-`Map`, and + `Map`-in-`Set`. + +5. Add `sanitizeCollections` to the options table in `README.md`, add a + dedicated "Sanitize Maps and Sets" usage subsection with a serialization tip, + and update the "How it works" section to document the key-sanitization + limitation (string keys and object property names used for matching only, not + sanitized themselves; Map object keys are recursed into). + +6. Add Map and Set benchmark cases to `bench/sanitize-data.bench.ts`. + +7. Update the Map and Set rows in `docs/ROADMAP.md` to reflect implemented + status. 
+ +## Relevant Files + +- `src/types.ts` — updated; adds `sanitizeCollections` option +- `src/replacers.ts` — updated; adds Map and Set branches in `sanitizeValue` +- `test/replacers.test.ts` — updated; adds Map and Set sanitization tests +- `bench/sanitize-data.bench.ts` — updated; adds Map and Set benchmark cases +- `README.md` — updated; adds option to table, usage section, and How it works note +- `docs/ROADMAP.md` — updated; marks Map/Set as implemented + +## Verification + +Run `yarn test:coverage` and confirm 100% coverage is maintained and all tests +pass. + +## Decisions + +**Option B — new copy rather than in-place mutation:** Mutating the caller's +original `Map` or `Set` would be a surprising side-effect and could corrupt +data the caller still holds a reference to. Returning a new sanitized copy is +consistent with how `objectReplacer` already handles plain objects and arrays. + +**`sanitizeCollections` defaults to `false`:** The current behaviour for +non-plain object instances is pass-through. Changing that default would be a +breaking change for callers whose Maps or Sets contain data that would be +altered. An opt-in flag lets callers adopt the feature without risk. + +**String Map keys are used for matching but not sanitized:** Plain object +property names are never sanitized today — only their values are. Map string +keys follow the same rule for consistency. Keys that are objects, however, are +recursed into and sanitized because they carry real structured data that could +contain sensitive fields. + +**WeakMap and WeakSet are not supported:** These types are not iterable by +design and cannot be traversed. They continue to pass through unchanged. 
diff --git a/src/replacers.ts b/src/replacers.ts index 1f794ff..7256b05 100644 --- a/src/replacers.ts +++ b/src/replacers.ts @@ -206,6 +206,7 @@ const objectReplacer: DataSanitizationReplacer = (data, options = {}) => { numericMask, patternMask, removeMatches = false, + sanitizeCollections = false, scanStringValues = true, useDefaultMatchers = true, useDefaultPatterns = true, @@ -260,6 +261,39 @@ const objectReplacer: DataSanitizationReplacer = (data, options = {}) => { return nextArray; } + if (sanitizeCollections && value instanceof Map) { + const entries: [unknown, unknown][] = []; + for (const [k, v] of value) { + const sanitizedKey = + typeof k === 'object' && k !== null ? sanitizeValue(k) : k; + const isSensitiveStringKey = + typeof k === 'string' && keyMatchers.some((m) => m.test(k)); + if (isSensitiveStringKey) { + if (!removeMatches) { + entries.push([ + sanitizedKey, + typeof v === 'number' + ? (numericMask ?? DEFAULT_NUMERIC_MASK) + : mask, + ]); + } + } else { + entries.push([sanitizedKey, sanitizeValue(v)]); + } + } + seen.delete(value); + return new Map(entries); + } + + if (sanitizeCollections && value instanceof Set) { + const items: unknown[] = []; + for (const item of value) { + items.push(sanitizeValue(item)); + } + seen.delete(value); + return new Set(items); + } + const prototype = Object.getPrototypeOf(value); if (prototype !== Object.prototype && prototype !== null) { seen.delete(value); diff --git a/src/types.ts b/src/types.ts index b4a13fc..6bf852f 100644 --- a/src/types.ts +++ b/src/types.ts @@ -54,6 +54,15 @@ interface DataSanitizationReplacerOptions { * Whether to remove fields matched instead of masking them. Default: false */ removeMatches?: boolean; + /** + * Whether to sanitize `Map` and `Set` instances by traversing their entries + * and returning a new sanitized copy. When false (the default), these pass + * through without modification, matching the behaviour for other non-plain + * object instances. 
+ * + * Default: false + */ + sanitizeCollections?: boolean; /** * Whether to scan string values on non-sensitive-key fields for embedded * sensitive patterns. Disabling this improves performance on object diff --git a/test/replacers.test.ts b/test/replacers.test.ts index e98848c..356aff7 100644 --- a/test/replacers.test.ts +++ b/test/replacers.test.ts @@ -1032,5 +1032,245 @@ describe('DataSanitizationReplacers', () => { expect(result.username).toEqual('mark'); }); }); + + describe('Map sanitization', () => { + it('should pass through Map unchanged when sanitizeCollections is not enabled', () => { + // Arrange + const map = new Map([['password', 'secret']]); + + // Act + const result = objectReplacer({ data: map }) as Record; + + // Assert + expect(result.data).toBe(map); + }); + + it('should return a new Map instance when sanitizeCollections is true', () => { + // Arrange + const map = new Map([['username', 'mark']]); + + // Act + const result = objectReplacer( + { data: map }, + { sanitizeCollections: true }, + ) as Record; + + // Assert + expect(result.data).not.toBe(map); + expect(result.data).toBeInstanceOf(Map); + }); + + it('should mask string value when string key matches sensitive field pattern', () => { + // Arrange + const map = new Map([ + ['password', 'secret'], + ['username', 'mark'], + ]); + + // Act + const result = objectReplacer( + { data: map }, + { sanitizeCollections: true }, + ) as Record; + const sanitized = result.data as Map; + + // Assert + expect(sanitized.get('password')).toBe(DEFAULT_PATTERN_MASK); + expect(sanitized.get('username')).toBe('mark'); + }); + + it('should mask numeric value with numericMask when string key matches sensitive field pattern', () => { + // Arrange + const map = new Map([['token', 42]]); + + // Act + const result = objectReplacer( + { data: map }, + { sanitizeCollections: true }, + ) as Record; + const sanitized = result.data as Map; + + // Assert + expect(sanitized.get('token')).toBe(DEFAULT_NUMERIC_MASK); + 
}); + + it('should omit entry when removeMatches is true and string key matches', () => { + // Arrange + const map = new Map([ + ['password', 'secret'], + ['username', 'mark'], + ]); + + // Act + const result = objectReplacer( + { data: map }, + { removeMatches: true, sanitizeCollections: true }, + ) as Record; + const sanitized = result.data as Map; + + // Assert + expect(sanitized.has('password')).toBe(false); + expect(sanitized.get('username')).toBe('mark'); + }); + + it('should scan string values on non-sensitive keys for embedded patterns', () => { + // Arrange + const map = new Map([['message', 'api_key=hunter2']]); + + // Act + const result = objectReplacer( + { data: map }, + { sanitizeCollections: true }, + ) as Record; + const sanitized = result.data as Map; + + // Assert + expect(sanitized.get('message')).toBe( + `api_key=${DEFAULT_PATTERN_MASK}`, + ); + }); + + it('should sanitize plain object values recursively', () => { + // Arrange + const map = new Map([ + ['data', { password: 'secret', username: 'mark' }], + ]); + + // Act + const result = objectReplacer( + { map }, + { sanitizeCollections: true }, + ) as Record; + const sanitized = result.map as Map; + const nested = sanitized.get('data') as Record; + + // Assert + expect(nested.password).toBe(DEFAULT_PATTERN_MASK); + expect(nested.username).toBe('mark'); + }); + + it('should sanitize object keys recursively', () => { + // Arrange + const sensitiveKey = { id: 1, password: 'secret' }; + const map = new Map([[sensitiveKey, 'value']]); + + // Act + const result = objectReplacer( + { data: map }, + { sanitizeCollections: true }, + ) as Record; + const sanitized = result.data as Map, string>; + const [sanitizedKey] = sanitized.keys(); + + // Assert + expect(sanitizedKey.password).toBe(DEFAULT_PATTERN_MASK); + expect(sanitizedKey.id).toBe(1); + expect(sanitized.get(sanitizedKey)).toBe('value'); + }); + + it('should sanitize nested Maps', () => { + // Arrange + const inner = new Map([['token', 'abc']]); 
+ const outer = new Map([['nested', inner]]); + + // Act + const result = objectReplacer( + { data: outer }, + { sanitizeCollections: true }, + ) as Record; + const sanitizedOuter = result.data as Map; + const sanitizedInner = sanitizedOuter.get('nested') as Map< + string, + unknown + >; + + // Assert + expect(sanitizedInner).toBeInstanceOf(Map); + expect(sanitizedInner).not.toBe(inner); + expect(sanitizedInner.get('token')).toBe(DEFAULT_PATTERN_MASK); + }); + }); + + describe('Set sanitization', () => { + it('should pass through Set unchanged when sanitizeCollections is not enabled', () => { + // Arrange + const set = new Set(['secret']); + + // Act + const result = objectReplacer({ data: set }) as Record; + + // Assert + expect(result.data).toBe(set); + }); + + it('should return a new Set instance when sanitizeCollections is true', () => { + // Arrange + const set = new Set(['mark']); + + // Act + const result = objectReplacer( + { data: set }, + { sanitizeCollections: true }, + ) as Record; + + // Assert + expect(result.data).not.toBe(set); + expect(result.data).toBeInstanceOf(Set); + }); + + it('should scan string values for embedded sensitive patterns', () => { + // Arrange + const set = new Set(['api_key=hunter2', 'safe-value']); + + // Act + const result = objectReplacer( + { data: set }, + { sanitizeCollections: true }, + ) as Record; + const sanitized = result.data as Set; + + // Assert + expect(sanitized.has(`api_key=${DEFAULT_PATTERN_MASK}`)).toBe(true); + expect(sanitized.has('safe-value')).toBe(true); + expect(sanitized.has('api_key=hunter2')).toBe(false); + }); + + it('should sanitize plain object values recursively', () => { + // Arrange + const obj = { password: 'secret', username: 'mark' }; + const set = new Set([obj]); + + // Act + const result = objectReplacer( + { data: set }, + { sanitizeCollections: true }, + ) as Record; + const sanitized = result.data as Set>; + const [sanitizedObj] = sanitized; + + // Assert + 
expect(sanitizedObj.password).toBe(DEFAULT_PATTERN_MASK); + expect(sanitizedObj.username).toBe('mark'); + }); + + it('should sanitize Map values within a Set', () => { + // Arrange + const map = new Map([['token', 'abc']]); + const set = new Set([map]); + + // Act + const result = objectReplacer( + { data: set }, + { sanitizeCollections: true }, + ) as Record; + const sanitized = result.data as Set>; + const [sanitizedMap] = sanitized; + + // Assert + expect(sanitizedMap).toBeInstanceOf(Map); + expect(sanitizedMap).not.toBe(map); + expect(sanitizedMap.get('token')).toBe(DEFAULT_PATTERN_MASK); + }); + }); }); });