From de4a29779294252020bc8efc5529b430256de012 Mon Sep 17 00:00:00 2001 From: Berend Klein Haneveld Date: Tue, 26 May 2026 11:09:16 +0200 Subject: [PATCH 1/2] perf: trim common prefix/suffix in diff_lists Before running the Levenshtein DP, strip the common prefix and suffix from the two lists and run the DP only over the changed region. The DP table goes from O(m_full * n_full) to O((m_full - prefix - suffix) * (n_full - prefix - suffix)), which is a massive win for the common case of lists that are mostly identical with localized edits. Op indexes from the traceback are shifted by `prefix` so they refer to positions in the original input/output, leaving the padding logic and patch output format unchanged. Co-Authored-By: Claude Opus 4.7 --- benchmarks/benchmark.py | 22 ++++++ patchdiff/diff.py | 51 +++++++++---- tests/test_diff_prefix_suffix.py | 124 +++++++++++++++++++++++++++++++ 3 files changed, 182 insertions(+), 15 deletions(-) create mode 100644 tests/test_diff_prefix_suffix.py diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index 063abf2..c4d35af 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -124,6 +124,28 @@ def test_list_diff_identical(benchmark): benchmark(diff, a, b) +def _make_localized_change_list(n: int, n_changes: int = 5) -> tuple[list, list]: + """Build a pair of lists that share a long common prefix and suffix with + a few replacements clustered in the middle. 
Exercises the prefix/suffix + trim path in `diff_lists`.""" + rng = random.Random(42) + a = [rng.randint(0, 100) for _ in range(n)] + b = a.copy() + mid = n // 2 + for i in range(n_changes): + b[mid + i] = -1 + return a, b + + +@pytest.mark.benchmark(group="list-diff-similar") +@pytest.mark.parametrize("size", [1000, 5000, 10000]) +def test_list_diff_similar_localized_changes(benchmark, size): + """Benchmark: lists of given size sharing a long common prefix/suffix + with a small cluster of localized changes in the middle.""" + a, b = _make_localized_change_list(size, n_changes=5) + benchmark(diff, a, b) + + # ======================================== # Dict Diff Benchmarks # ======================================== diff --git a/patchdiff/diff.py b/patchdiff/diff.py index e74cba8..1a62f66 100644 --- a/patchdiff/diff.py +++ b/patchdiff/diff.py @@ -7,10 +7,29 @@ def diff_lists(input: List, output: List, ptr: Pointer) -> Tuple[List, List]: - m, n = len(input), len(output) + m_full, n_full = len(input), len(output) + + # Strip common prefix so the DP table only covers the changed region. + prefix = 0 + prefix_limit = min(m_full, n_full) + while prefix < prefix_limit and input[prefix] == output[prefix]: + prefix += 1 + + # Strip common suffix without crossing into the prefix region. 
+ suffix = 0 + while ( + suffix < (m_full - prefix) + and suffix < (n_full - prefix) + and input[m_full - 1 - suffix] == output[n_full - 1 - suffix] + ): + suffix += 1 + + sub_input = input[prefix : m_full - suffix] + sub_output = output[prefix : n_full - suffix] + m, n = len(sub_input), len(sub_output) # Build DP table bottom-up (iterative approach) - # dp[i][j] = cost of transforming input[0:i] to output[0:j] + # dp[i][j] = cost of transforming sub_input[0:i] to sub_output[0:j] dp = [[0] * (n + 1) for _ in range(m + 1)] # Initialize base cases @@ -22,7 +41,7 @@ def diff_lists(input: List, output: List, ptr: Pointer) -> Tuple[List, List]: # Fill DP table for i in range(1, m + 1): for j in range(1, n + 1): - if input[i - 1] == output[j - 1]: + if sub_input[i - 1] == sub_output[j - 1]: # Elements match, no operation needed dp[i][j] = dp[i - 1][j - 1] else: @@ -33,42 +52,44 @@ def diff_lists(input: List, output: List, ptr: Pointer) -> Tuple[List, List]: dp[i - 1][j - 1] + 1, # Replace ) - # Traceback to extract operations + # Traceback to extract operations. Indexes are emitted in sub-list + # coordinates and shifted by `prefix` below so they refer to positions + # in the original input/output. 
ops = [] rops = [] i, j = m, n while i > 0 or j > 0: - if i > 0 and j > 0 and input[i - 1] == output[j - 1]: + if i > 0 and j > 0 and sub_input[i - 1] == sub_output[j - 1]: # Elements match, no operation i -= 1 j -= 1 elif i > 0 and (j == 0 or dp[i][j] == dp[i - 1][j] + 1): # Remove from input - ops.append({"op": "remove", "idx": i - 1}) - rops.append({"op": "add", "idx": j - 1, "value": input[i - 1]}) + ops.append({"op": "remove", "idx": i - 1 + prefix}) + rops.append({"op": "add", "idx": j - 1 + prefix, "value": sub_input[i - 1]}) i -= 1 elif j > 0 and (i == 0 or dp[i][j] == dp[i][j - 1] + 1): # Add from output - ops.append({"op": "add", "idx": i - 1, "value": output[j - 1]}) - rops.append({"op": "remove", "idx": j - 1}) + ops.append({"op": "add", "idx": i - 1 + prefix, "value": sub_output[j - 1]}) + rops.append({"op": "remove", "idx": j - 1 + prefix}) j -= 1 else: # Replace ops.append( { "op": "replace", - "idx": i - 1, - "original": input[i - 1], - "value": output[j - 1], + "idx": i - 1 + prefix, + "original": sub_input[i - 1], + "value": sub_output[j - 1], } ) rops.append( { "op": "replace", - "idx": j - 1, - "original": output[j - 1], - "value": input[i - 1], + "idx": j - 1 + prefix, + "original": sub_output[j - 1], + "value": sub_input[i - 1], } ) i -= 1 diff --git a/tests/test_diff_prefix_suffix.py b/tests/test_diff_prefix_suffix.py new file mode 100644 index 0000000..a9fc830 --- /dev/null +++ b/tests/test_diff_prefix_suffix.py @@ -0,0 +1,124 @@ +"""Round-trip property tests for prefix/suffix-heavy list pairs. + +These exercise lists that share long common prefixes and/or suffixes with +localized edits in the middle — the case that the prefix/suffix trimming +optimization targets in `diff_lists`. Each randomized case asserts the +bi-directional round-trip property: applying `ops` to `a` yields `b`, and +applying `rops` to `b` yields `a`. 
+""" + +import random + +from patchdiff import apply, diff + + +def _random_list(rng: random.Random, n: int) -> list: + return [rng.randint(0, 100) for _ in range(n)] + + +def _mutate_middle( + rng: random.Random, + base: list, + n_changes: int, + kinds: tuple, +) -> list: + """Apply `n_changes` localized edits in the middle of `base`.""" + out = list(base) + mid = len(out) // 2 + for i in range(n_changes): + idx = mid + i + kind = rng.choice(kinds) + if kind == "replace" and idx < len(out): + out[idx] = -(rng.randint(1, 10_000)) + elif kind == "insert": + out.insert(idx, -(rng.randint(1, 10_000))) + elif kind == "delete" and idx < len(out): + del out[idx] + return out + + +def test_round_trip_prefix_suffix_heavy_lists(): + """Generates 20+ randomized prefix/suffix-heavy list pairs and asserts + bi-directional round-trip apply correctness.""" + rng = random.Random(0xC0FFEE) + cases = 0 + for seed in range(25): + rng = random.Random(seed) + n = rng.choice([50, 200, 500, 1000]) + base = _random_list(rng, n) + n_changes = rng.choice([1, 2, 5, 10]) + kinds = rng.choice( + [ + ("replace",), + ("insert",), + ("delete",), + ("replace", "insert"), + ("replace", "insert", "delete"), + ] + ) + mutated = _mutate_middle(rng, base, n_changes, kinds) + + ops, rops = diff(base, mutated) + assert apply(base, ops) == mutated, ( + f"forward apply failed for seed={seed}, n={n}" + ) + assert apply(mutated, rops) == base, ( + f"reverse apply failed for seed={seed}, n={n}" + ) + cases += 1 + + assert cases >= 20 + + +def test_round_trip_pure_common_prefix(): + """Pair with only a common prefix (suffix differs).""" + rng = random.Random(7) + for seed in range(5): + rng = random.Random(seed) + prefix = _random_list(rng, 100) + a = prefix + _random_list(rng, 10) + b = prefix + _random_list(rng, 12) + + ops, rops = diff(a, b) + assert apply(a, ops) == b + assert apply(b, rops) == a + + +def test_round_trip_pure_common_suffix(): + """Pair with only a common suffix (prefix differs).""" + for 
seed in range(5): + rng = random.Random(seed) + suffix = _random_list(rng, 100) + a = _random_list(rng, 10) + suffix + b = _random_list(rng, 12) + suffix + + ops, rops = diff(a, b) + assert apply(a, ops) == b + assert apply(b, rops) == a + + +def test_round_trip_identical_lists_no_ops(): + """Identical lists must produce no operations regardless of trim path.""" + rng = random.Random(42) + a = _random_list(rng, 200) + ops, rops = diff(a, list(a)) + assert ops == [] + assert rops == [] + + +def test_round_trip_full_prefix_match_one_appended(): + """`b` extends `a` with extra trailing element.""" + a = list(range(50)) + b = [*a, 999] + ops, rops = diff(a, b) + assert apply(a, ops) == b + assert apply(b, rops) == a + + +def test_round_trip_full_suffix_match_one_prepended(): + """`b` prepends a single element to `a`.""" + a = list(range(50)) + b = [-1, *a] + ops, rops = diff(a, b) + assert apply(a, ops) == b + assert apply(b, rops) == a From 24f8c5548dbf390660ea22e66854438cbcef4c95 Mon Sep 17 00:00:00 2001 From: Berend Klein Haneveld Date: Tue, 26 May 2026 13:57:42 +0200 Subject: [PATCH 2/2] test+bench: lock in nested-item lists for diff_lists trim Adds tests and a benchmark group that exercise the prefix/suffix trim on lists whose items have non-trivial structural equality (nested dicts, lists, sets, and mixed objects that are equal-by-value but distinct identities). Tests added in tests/test_diff_prefix_suffix.py: - test_round_trip_lists_of_nested_dicts - test_round_trip_lists_with_nested_lists_as_items - test_round_trip_lists_of_sets - test_prefix_suffix_with_equal_but_distinct_nested_objects Benchmark group added in benchmarks/benchmark.py: - list-diff-similar-nested @ n in {500, 1000, 2000} These guard against any future regression where the trim's `==` is swapped for an identity check or shortcut that would silently misbehave for items with structural equality. 
Co-Authored-By: Claude Opus 4.7 --- benchmarks/benchmark.py | 36 +++++++++++ tests/test_diff_prefix_suffix.py | 104 +++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index a8ac2c1..2ef9111 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -146,6 +146,42 @@ def test_list_diff_similar_localized_changes(benchmark, size): benchmark(diff, a, b) +def _nested_dict_item(i: int) -> dict: + """Build a dict whose equality is a deep structural compare.""" + return { + "id": i, + "name": f"item_{i}", + "tags": [f"t{i}_{k}" for k in range(4)], + "meta": {"a": i % 7, "b": i % 11, "nested": {"x": [i, i + 1, i + 2]}}, + } + + +def _make_nested_localized_change_lists( + n: int, n_changes: int = 5 +) -> tuple[list, list]: + """Pair of lists of nested dicts with a localized cluster of changes + in the middle. Items in the common prefix/suffix are equal-by-value + but distinct objects, so `==` (not `is`) is the deciding factor.""" + a = [_nested_dict_item(i) for i in range(n)] + b = [_nested_dict_item(i) for i in range(n)] + mid = n // 2 + for k in range(n_changes): + b[mid + k] = _nested_dict_item(-1 - k) + return a, b + + +@pytest.mark.benchmark(group="list-diff-similar-nested") +@pytest.mark.parametrize("size", [500, 1000, 2000]) +def test_list_diff_similar_nested_localized_changes(benchmark, size): + """Benchmark: lists of nested dicts (deep `==`) sharing a long common + prefix/suffix with a small cluster of localized changes in the middle. 
+ + Locks in the regression target from the prefix/suffix trim for items + whose equality is a non-trivial structural compare.""" + a, b = _make_nested_localized_change_lists(size, n_changes=5) + benchmark(diff, a, b) + + # ======================================== # Dict Diff Benchmarks # ======================================== diff --git a/tests/test_diff_prefix_suffix.py b/tests/test_diff_prefix_suffix.py index a9fc830..68d70c5 100644 --- a/tests/test_diff_prefix_suffix.py +++ b/tests/test_diff_prefix_suffix.py @@ -122,3 +122,107 @@ def test_round_trip_full_suffix_match_one_prepended(): ops, rops = diff(a, b) assert apply(a, ops) == b assert apply(b, rops) == a + + +# --------------------------------------------------------------------------- +# Nested-item lists +# +# The prefix/suffix trim uses the same `==` operator the DP uses, so for any +# value type the trim's "matched" decision agrees with the DP's diagonal-step +# decision. These cases lock that in for items whose equality is a deep +# structural compare (dicts, lists, sets, and mixed nested containers). +# --------------------------------------------------------------------------- + + +def _nested_dict(rng: random.Random, i: int) -> dict: + """Build a dict whose equality is a non-trivial structural compare.""" + return { + "id": i, + "name": f"item_{i}", + "tags": [f"t{i}_{k}" for k in range(rng.randint(2, 5))], + "meta": {"a": i % 7, "b": i % 11, "nested": {"x": [i, i + 1, i + 2]}}, + "flags": {f"f{i % 5}", f"f{i % 3}"}, + } + + +def test_round_trip_lists_of_nested_dicts(): + """Round-trip property test for lists whose items are nested dicts.""" + for seed in range(10): + rng = random.Random(seed) + n = rng.choice([20, 50, 100]) + base = [_nested_dict(rng, i) for i in range(n)] + + # Mutate a small slice in the middle: replace, insert, delete one each. 
+ mutated = [ + {**d, "meta": dict(d["meta"])} for d in base + ] # copy each item and its "meta" dict so edits below leave `base` untouched + mid = n // 2 + # Replace: change a deeply-nested field + mutated[mid]["meta"]["nested"] = {"x": [-1, -2, -3]} + # Insert: brand-new dict at index mid+1 (right after the replaced item) + mutated.insert(mid + 1, _nested_dict(rng, 10_000 + seed)) + # Delete: drop the item after the inserted one + del mutated[mid + 2] + + ops, rops = diff(base, mutated) + assert apply(base, ops) == mutated, f"forward failed for seed={seed}" + assert apply(mutated, rops) == base, f"reverse failed for seed={seed}" + + +def test_round_trip_lists_with_nested_lists_as_items(): + """Items are themselves lists — equality is recursive.""" + rng = random.Random(99) + base = [[rng.randint(0, 9) for _ in range(rng.randint(3, 8))] for _ in range(60)] + mutated = [list(row) for row in base] + # Localized change deep inside one item. + mutated[30][0] = 999 + # And replace one whole item. + mutated[31] = [-1, -2, -3] + + ops, rops = diff(base, mutated) + assert apply(base, ops) == mutated + assert apply(mutated, rops) == base + + +def test_round_trip_lists_of_sets(): + """Items are sets — `==` is set-equality, not identity.""" + base = [{i, i + 1, i + 2} for i in range(40)] + # Equal-by-value but distinct objects in the prefix/suffix region + # ensures we exercise structural equality, not `is`. + mutated = [{i, i + 1, i + 2} for i in range(40)] + mutated[20] = {-1, -2, -3} # one change in the middle + + ops, rops = diff(base, mutated) + assert apply(base, ops) == mutated + assert apply(mutated, rops) == base + + +def test_prefix_suffix_with_equal_but_distinct_nested_objects(): + """Items in the common prefix/suffix are *equal* but not the *same* + object. 
The trim must rely on `==`, not `is`.""" + shared_prefix_a = [{"k": i, "v": [i, i + 1]} for i in range(20)] + shared_prefix_b = [{"k": i, "v": [i, i + 1]} for i in range(20)] + assert shared_prefix_a == shared_prefix_b + assert all(x is not y for x, y in zip(shared_prefix_a, shared_prefix_b)) + + middle_a = [{"k": "a_only", "v": [1]}] + middle_b = [{"k": "b_only", "v": [2]}] + + shared_suffix_a = [{"k": i + 100, "v": [i]} for i in range(20)] + shared_suffix_b = [{"k": i + 100, "v": [i]} for i in range(20)] + + a = shared_prefix_a + middle_a + shared_suffix_a + b = shared_prefix_b + middle_b + shared_suffix_b + + ops, rops = diff(a, b) + assert apply(a, ops) == b + assert apply(b, rops) == a + + # The change should be confined to the middle position (index 20). + # We don't assert the exact op shape, but every emitted op's path must + # start with /20 (the middle index in the full list). + for op in ops: + path_tokens = op["path"].tokens + assert path_tokens and path_tokens[0] == 20, ( + f"unexpected op outside the middle region: {op}" + )