From 9d120f67cc33cab86d359f3cba95c529238873ec Mon Sep 17 00:00:00 2001
From: Ross Kukard <ross.kukard@revvity.com>
Date: Wed, 13 May 2026 09:54:29 -0700
Subject: [PATCH 1/8] Add RossK1 to the README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 44e0723..179cbcb 100644
--- a/README.md
+++ b/README.md
@@ -91,3 +91,4 @@ scripts/
 ```
 
 Each round's `data/` directory is generated locally and gitignored.
+This is RossK1's PR

From d99cc18bc365fec61094b17371549b9b5a742149 Mon Sep 17 00:00:00 2001
From: Ross Kukard <ross.kukard@revvity.com>
Date: Wed, 13 May 2026 10:07:59 -0700
Subject: [PATCH 2/8] Replace the baseline delegation with an inline implementation

---
 rounds/1_histogram/solution.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py
index dffbee5..49ea272 100644
--- a/rounds/1_histogram/solution.py
+++ b/rounds/1_histogram/solution.py
@@ -8,7 +8,13 @@
 
 def compute_histogram(path: str) -> dict[bytes, int]:
     """Frequency of every 2-byte bigram in the file at ``path``."""
-    # TODO: remove this delegation and write your own implementation here.
-    from .baseline import compute_histogram as _baseline
-
-    return _baseline(path)
+    with open(path, "rb") as f:
+        data = f.read()
+    counts: dict[bytes, int] = {}
+    for i in range(len(data) - 1):
+        bigram = data[i : i + 2]
+        if bigram in counts:
+            counts[bigram] += 1
+        else:
+            counts[bigram] = 1
+    return counts

From 6d156e09f65f99b2a151bd6546ce84ad81ee386d Mon Sep 17 00:00:00 2001
From: Ross Kukard <ross.kukard@revvity.com>
Date: Wed, 13 May 2026 10:15:32 -0700
Subject: [PATCH 3/8] Switching to keeping track with set instead

---
 rounds/1_histogram/solution.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py
index 49ea272..8921703 100644
--- a/rounds/1_histogram/solution.py
+++ b/rounds/1_histogram/solution.py
@@ -11,10 +11,12 @@ def compute_histogram(path: str) -> dict[bytes, int]:
     with open(path, "rb") as f:
         data = f.read()
     counts: dict[bytes, int] = {}
+    bigrams_seen = set()
     for i in range(len(data) - 1):
         bigram = data[i : i + 2]
-        if bigram in counts:
+        if bigram in bigrams_seen:
             counts[bigram] += 1
         else:
+            bigrams_seen.add(bigram)
             counts[bigram] = 1
     return counts

From df1dad00e532c604d1a3c71f20e5a2a079ac84c8 Mon Sep 17 00:00:00 2001
From: Ross Kukard <ross.kukard@revvity.com>
Date: Wed, 13 May 2026 10:54:59 -0700
Subject: [PATCH 4/8] Round 3 first try

---
 rounds/3_dna/solution.py | 49 ++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py
index 8b917da..3fe38dc 100644
--- a/rounds/3_dna/solution.py
+++ b/rounds/3_dna/solution.py
@@ -1,11 +1,8 @@
-"""Your Round 3 solution — DNA sequence matcher.
+"""Your Round 3 solution — DNA sequence matcher."""
 
-**Edit this file.** It currently delegates to ``baseline.py`` so everything
-passes out of the box. Replace the body of ``find_matches`` with your
-own faster implementation.
-"""
-
-from .baseline import find_matches as _baseline
+from __future__ import annotations
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
@@ -13,5 +10,39 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]
 
     Returns ``[(record_id, [positions...]), ...]`` in file order.
     """
-    # TODO: remove this delegation and write your own implementation here.
-    return _baseline(fasta_path, pattern)
+    # Read as bytes — no decode overhead, pattern stays as bytes.
+    with open(fasta_path, "rb") as f:
+        data = f.read()
+
+    # Pre-compile a lookahead regex so overlapping matches are found in one pass.
+    regex = re.compile(b"(?=" + re.escape(pattern) + b")")
+
+    def process_record(record: bytes) -> tuple[str, list[int]] | None:
+        if not record.strip():
+            return None
+        lines = record.split(b"\n")
+        record_id = lines[0].strip().decode("ascii")
+        sequence = b"".join(lines[1:]).replace(b" ", b"")
+        positions = [m.start() for m in regex.finditer(sequence)]
+        if positions:
+            return (record_id, positions)
+        return None
+
+    # Split on b'>' — first chunk is empty for well-formed files.
+    records = data.split(b">")[1:]  # skip leading empty chunk
+
+    results: list[tuple[str, list[int]]] = []
+
+    # re operations release the GIL, so ThreadPoolExecutor gives real parallelism.
+    with ThreadPoolExecutor() as executor:
+        # Submit in order, preserve file order via index.
+        futures = {executor.submit(process_record, r): i for i, r in enumerate(records)}
+        ordered: list[tuple[int, tuple[str, list[int]]]] = []
+        for future in as_completed(futures):
+            result = future.result()
+            if result is not None:
+                ordered.append((futures[future], result))
+
+    ordered.sort(key=lambda x: x[0])
+    results = [r for _, r in ordered]
+    return results

From c0ae909edd3361fbaca2b5d64126ef7459ffb218 Mon Sep 17 00:00:00 2001
From: Ross Kukard <ross.kukard@revvity.com>
Date: Wed, 13 May 2026 11:19:07 -0700
Subject: [PATCH 5/8] Round 3 using mmap

---
 rounds/3_dna/solution.py | 59 +++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py
index 3fe38dc..96eff93 100644
--- a/rounds/3_dna/solution.py
+++ b/rounds/3_dna/solution.py
@@ -3,6 +3,8 @@
 from __future__ import annotations
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
+import mmap
+import os
 
 
 def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
@@ -11,38 +13,39 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]
     Returns ``[(record_id, [positions...]), ...]`` in file order.
     """
     # Read as bytes — no decode overhead, pattern stays as bytes.
-    with open(fasta_path, "rb") as f:
-        data = f.read()
-
-    # Pre-compile a lookahead regex so overlapping matches are found in one pass.
     regex = re.compile(b"(?=" + re.escape(pattern) + b")")
-
-    def process_record(record: bytes) -> tuple[str, list[int]] | None:
-        if not record.strip():
-            return None
-        lines = record.split(b"\n")
-        record_id = lines[0].strip().decode("ascii")
+    with open(fasta_path, "rb") as f:
+        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
+            data = bytes(mm)
+        size = len(data)
+
+    # Find record boundaries without copying data
+    offsets = [0]
+    pos = data.find(b">", 1)
+    while pos != -1:
+        offsets.append(pos)
+        pos = data.find(b">", pos + 1)
+    offsets.append(size)
+
+    def process_record(
+        start: int, end: int, idx: int
+    ) -> tuple[int, tuple[str, list[int]]] | None:
+        chunk = data[start:end]
+        lines = chunk.split(b"\n")
+        record_id = lines[0][1:].strip().decode("ascii")
         sequence = b"".join(lines[1:]).replace(b" ", b"")
         positions = [m.start() for m in regex.finditer(sequence)]
         if positions:
-            return (record_id, positions)
+            return (idx, (record_id, positions))
         return None
 
-    # Split on b'>' — first chunk is empty for well-formed files.
-    records = data.split(b">")[1:]  # skip leading empty chunk
-
-    results: list[tuple[str, list[int]]] = []
-
-    # re operations release the GIL, so ThreadPoolExecutor gives real parallelism.
-    with ThreadPoolExecutor() as executor:
-        # Submit in order, preserve file order via index.
-        futures = {executor.submit(process_record, r): i for i, r in enumerate(records)}
-        ordered: list[tuple[int, tuple[str, list[int]]]] = []
-        for future in as_completed(futures):
-            result = future.result()
-            if result is not None:
-                ordered.append((futures[future], result))
+    max_workers = min(32, (os.cpu_count() or 1) * 2)
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(process_record, offsets[i], offsets[i + 1], i)
+            for i in range(len(offsets) - 1)
+        ]
+        results = [r for f in as_completed(futures) if (r := f.result()) is not None]
 
-    ordered.sort(key=lambda x: x[0])
-    results = [r for _, r in ordered]
-    return results
+    results.sort(key=lambda x: x[0])
+    return [r for _, r in results]

From a02fce2fe8e46130db676063689d39ac1ed42339 Mon Sep 17 00:00:00 2001
From: Ross Kukard <ross.kukard@revvity.com>
Date: Wed, 13 May 2026 11:30:01 -0700
Subject: [PATCH 6/8] Round 1 fixes

---
 rounds/1_histogram/solution.py | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py
index 8921703..fe9bc2c 100644
--- a/rounds/1_histogram/solution.py
+++ b/rounds/1_histogram/solution.py
@@ -1,22 +1,32 @@
-"""Your Round 1 solution — byte-pair histogram.
+"""Your Round 1 solution — byte-pair histogram."""
 
-**Edit this file.** It currently delegates to ``baseline.py`` so everything
-passes out of the box. Replace the body of ``compute_histogram`` with your
-own faster implementation.
-"""
+import mmap
+import os
+
+import numpy as np
 
 
 def compute_histogram(path: str) -> dict[bytes, int]:
-    """Frequency of every 2-byte bigram in the file at ``path``."""
+    """Count every overlapping 2-byte bigram in the file at ``path``.
+
+    Returns a dict mapping each bigram that occurs (as a 2-byte ``bytes``
+    key) to its number of occurrences. Files shorter than two bytes,
+    including empty files, yield ``{}``.
+    """
     with open(path, "rb") as f:
-        data = f.read()
-    counts: dict[bytes, int] = {}
-    bigrams_seen = set()
-    for i in range(len(data) - 1):
-        bigram = data[i : i + 2]
-        if bigram in bigrams_seen:
-            counts[bigram] += 1
-        else:
-            bigrams_seen.add(bigram)
-            counts[bigram] = 1
-    return counts
+        # mmap raises ValueError for a zero-length file, and a file with
+        # fewer than two bytes has no bigrams, so answer those up front.
+        if os.fstat(f.fileno()).st_size < 2:
+            return {}
+        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
+            # Copy while the map is open: a live frombuffer view would
+            # keep an exported buffer and make closing the mmap fail.
+            data = np.frombuffer(mm, dtype=np.uint8).copy()
+    # Pack each adjacent byte pair into one 16-bit key: (first << 8) | second.
+    keys = (data[:-1].astype(np.uint16) << 8) | data[1:]
+    # minlength keeps counts indexable by any key up to 0xFFFF.
+    counts = np.bincount(keys, minlength=65536)
+    return {
+        bytes((int(k) >> 8, int(k) & 0xFF)): int(counts[k])
+        for k in np.flatnonzero(counts)
+    }

From 3e9f823b9a25afcf434a98871c0147cd61b72bbc Mon Sep 17 00:00:00 2001
From: Ross Kukard <ross.kukard@revvity.com>
Date: Wed, 13 May 2026 12:08:50 -0700
Subject: [PATCH 7/8] Round 3 - Switching from regex to find

---
 rounds/3_dna/solution.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py
index 96eff93..74d2393 100644
--- a/rounds/3_dna/solution.py
+++ b/rounds/3_dna/solution.py
@@ -34,7 +34,11 @@ def process_record(
         lines = chunk.split(b"\n")
         record_id = lines[0][1:].strip().decode("ascii")
         sequence = b"".join(lines[1:]).replace(b" ", b"")
-        positions = [m.start() for m in regex.finditer(sequence)]
+        positions = []
+        start_pos = 0
+        while (hit := sequence.find(pattern, start_pos)) != -1:
+            positions.append(hit)
+            start_pos = hit + 1
         if positions:
             return (idx, (record_id, positions))
         return None

From 6c050fa680d1265755fdb89763b466fde51e552d Mon Sep 17 00:00:00 2001
From: Ross Kukard <ross.kukard@revvity.com>
Date: Wed, 13 May 2026 12:18:35 -0700
Subject: [PATCH 8/8] Round 2 - Corruptions cleaned up

---
 rounds/2_corruption/solution.py | 34 ++++++++++++++++++++++++---------
 rounds/3_dna/solution.py        |  6 ++----
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/rounds/2_corruption/solution.py b/rounds/2_corruption/solution.py
index a5b752a..9f86315 100644
--- a/rounds/2_corruption/solution.py
+++ b/rounds/2_corruption/solution.py
@@ -1,14 +1,40 @@
-"""Your Round 2 solution — corruption scanner.
+"""Your Round 2 solution — corruption scanner."""
 
-**Edit this file.** It currently delegates to ``baseline.py`` so everything
-passes out of the box. Replace the body of ``find_corruptions`` with your
-own faster implementation.
-"""
+from __future__ import annotations
 
-from .baseline import find_corruptions as _baseline
+import numpy as np
 
 
 def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]:
-    """Return ``[(offset, length), ...]`` for every differing byte range."""
-    # TODO: remove this delegation and write your own implementation here.
-    return _baseline(ref_path, cor_path)
+    """Return ``[(offset, length), ...]`` for every differing byte range.
+
+    Both files are read fully into memory and compared byte by byte; each
+    maximal run of consecutive differing bytes becomes one tuple, in
+    ascending offset order.
+
+    Raises:
+        ValueError: if the two files are not the same length.
+    """
+    with open(ref_path, "rb") as f:
+        ref = np.frombuffer(f.read(), dtype=np.uint8)
+    with open(cor_path, "rb") as f:
+        cor = np.frombuffer(f.read(), dtype=np.uint8)
+
+    if len(ref) != len(cor):
+        raise ValueError("reference and corrupted files differ in length")
+
+    # True wherever the two files disagree.
+    mask = ref != cor
+    if not mask.any():
+        return []
+
+    # Zero-pad both ends so a run touching the first or last byte still
+    # yields a rising and a falling edge.
+    padded = np.zeros(len(mask) + 2, dtype=np.int8)
+    padded[1:-1] = mask
+    # diff is +1 at the first byte of a run and -1 one past its last byte.
+    edges = np.diff(padded)
+    starts = np.flatnonzero(edges == 1)
+    ends = np.flatnonzero(edges == -1)
+
+    return [(int(s), int(e - s)) for s, e in zip(starts, ends)]
diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py
index 74d2393..2a60ef6 100644
--- a/rounds/3_dna/solution.py
+++ b/rounds/3_dna/solution.py
@@ -1,7 +1,6 @@
 """Your Round 3 solution — DNA sequence matcher."""
 
 from __future__ import annotations
-import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import mmap
 import os
@@ -13,7 +12,6 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]
     Returns ``[(record_id, [positions...]), ...]`` in file order.
     """
     # Read as bytes — no decode overhead, pattern stays as bytes.
-    regex = re.compile(b"(?=" + re.escape(pattern) + b")")
     with open(fasta_path, "rb") as f:
         with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
             data = bytes(mm)
@@ -32,8 +30,8 @@ def process_record(
     ) -> tuple[int, tuple[str, list[int]]] | None:
         chunk = data[start:end]
         lines = chunk.split(b"\n")
-        record_id = lines[0][1:].strip().decode("ascii")
-        sequence = b"".join(lines[1:]).replace(b" ", b"")
+        record_id = lines[0][1:].rstrip().decode("ascii")
+        sequence = b"".join(lines[1:])
         positions = []
         start_pos = 0
         while (hit := sequence.find(pattern, start_pos)) != -1: