From 9d120f67cc33cab86d359f3cba95c529238873ec Mon Sep 17 00:00:00 2001 From: Ross Kukard Date: Wed, 13 May 2026 09:54:29 -0700 Subject: [PATCH 1/8] Add RossK1 to the README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 44e0723..179cbcb 100644 --- a/README.md +++ b/README.md @@ -91,3 +91,4 @@ scripts/ ``` Each round's `data/` directory is generated locally and gitignored. +This is RossK1's PR From d99cc18bc365fec61094b17371549b9b5a742149 Mon Sep 17 00:00:00 2001 From: Ross Kukard Date: Wed, 13 May 2026 10:07:59 -0700 Subject: [PATCH 2/8] Just switching out to code instead of the function --- rounds/1_histogram/solution.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py index dffbee5..49ea272 100644 --- a/rounds/1_histogram/solution.py +++ b/rounds/1_histogram/solution.py @@ -8,7 +8,13 @@ def compute_histogram(path: str) -> dict[bytes, int]: """Frequency of every 2-byte bigram in the file at ``path``.""" - # TODO: remove this delegation and write your own implementation here. - from .baseline import compute_histogram as _baseline - - return _baseline(path) + with open(path, "rb") as f: + data = f.read() + counts: dict[bytes, int] = {} + for i in range(len(data) - 1): + bigram = data[i : i + 2] + if bigram in counts: + counts[bigram] += 1 + else: + counts[bigram] = 1 + return counts From 6d156e09f65f99b2a151bd6546ce84ad81ee386d Mon Sep 17 00:00:00 2001 From: Ross Kukard Date: Wed, 13 May 2026 10:15:32 -0700 Subject: [PATCH 3/8] Switching to keeping track with set instead --- rounds/1_histogram/solution.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py index 49ea272..8921703 100644 --- a/rounds/1_histogram/solution.py +++ b/rounds/1_histogram/solution.py @@ -11,10 +11,12 @@ def compute_histogram(path: str) -> dict[bytes, int]: with open(path, "rb") as f: data = f.read() counts: dict[bytes, int] = {} + bigrams_seen = set() for i in range(len(data) - 1): bigram = data[i : i + 2] - if bigram in counts: + if bigram in bigrams_seen: counts[bigram] += 1 else: + bigrams_seen.add(bigram) counts[bigram] = 1 return counts From df1dad00e532c604d1a3c71f20e5a2a079ac84c8 Mon Sep 17 00:00:00 2001 From: Ross Kukard Date: Wed, 13 May 2026 10:54:59 -0700 Subject: [PATCH 4/8] Round 3 first try --- rounds/3_dna/solution.py | 49 ++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 8b917da..3fe38dc 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -1,11 +1,8 @@ -"""Your Round 3 solution — DNA sequence matcher. +"""Your Round 3 solution — DNA sequence matcher.""" -**Edit this file.** It currently delegates to ``baseline.py`` so everything -passes out of the box. Replace the body of ``find_matches`` with your -own faster implementation. -""" - -from .baseline import find_matches as _baseline +from __future__ import annotations +import re +from concurrent.futures import ThreadPoolExecutor, as_completed def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: @@ -13,5 +10,39 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] Returns ``[(record_id, [positions...]), ...]`` in file order. """ - # TODO: remove this delegation and write your own implementation here. - return _baseline(fasta_path, pattern) + # Read as bytes — no decode overhead, pattern stays as bytes. + with open(fasta_path, "rb") as f: + data = f.read() + + # Pre-compile a lookahead regex so overlapping matches are found in one pass. + regex = re.compile(b"(?=" + re.escape(pattern) + b")") + + def process_record(record: bytes) -> tuple[str, list[int]] | None: + if not record.strip(): + return None + lines = record.split(b"\n") + record_id = lines[0].strip().decode("ascii") + sequence = b"".join(lines[1:]).replace(b" ", b"") + positions = [m.start() for m in regex.finditer(sequence)] + if positions: + return (record_id, positions) + return None + + # Split on b'>' — first chunk is empty for well-formed files. + records = data.split(b">")[1:] # skip leading empty chunk + + results: list[tuple[str, list[int]]] = [] + + # re operations release the GIL, so ThreadPoolExecutor gives real parallelism. + with ThreadPoolExecutor() as executor: + # Submit in order, preserve file order via index. + futures = {executor.submit(process_record, r): i for i, r in enumerate(records)} + ordered: list[tuple[int, tuple[str, list[int]]]] = [] + for future in as_completed(futures): + result = future.result() + if result is not None: + ordered.append((futures[future], result)) + + ordered.sort(key=lambda x: x[0]) + results = [r for _, r in ordered] + return results From c0ae909edd3361fbaca2b5d64126ef7459ffb218 Mon Sep 17 00:00:00 2001 From: Ross Kukard Date: Wed, 13 May 2026 11:19:07 -0700 Subject: [PATCH 5/8] Round 3 using mmap --- rounds/3_dna/solution.py | 59 +++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 3fe38dc..96eff93 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -3,6 +3,8 @@ from __future__ import annotations import re from concurrent.futures import ThreadPoolExecutor, as_completed +import mmap +import os def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: @@ -11,38 +13,39 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] Returns ``[(record_id, [positions...]), ...]`` in file order. """ # Read as bytes — no decode overhead, pattern stays as bytes. - with open(fasta_path, "rb") as f: - data = f.read() - - # Pre-compile a lookahead regex so overlapping matches are found in one pass. regex = re.compile(b"(?=" + re.escape(pattern) + b")") - - def process_record(record: bytes) -> tuple[str, list[int]] | None: - if not record.strip(): - return None - lines = record.split(b"\n") - record_id = lines[0].strip().decode("ascii") + with open(fasta_path, "rb") as f: + with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + data = bytes(mm) + size = len(data) + + # Find record boundaries without copying data + offsets = [0] + pos = data.find(b">", 1) + while pos != -1: + offsets.append(pos) + pos = data.find(b">", pos + 1) + offsets.append(size) + + def process_record( + start: int, end: int, idx: int + ) -> tuple[int, tuple[str, list[int]]] | None: + chunk = data[start:end] + lines = chunk.split(b"\n") + record_id = lines[0][1:].strip().decode("ascii") sequence = b"".join(lines[1:]).replace(b" ", b"") positions = [m.start() for m in regex.finditer(sequence)] if positions: - return (record_id, positions) + return (idx, (record_id, positions)) return None - # Split on b'>' — first chunk is empty for well-formed files. - records = data.split(b">")[1:] # skip leading empty chunk - - results: list[tuple[str, list[int]]] = [] - - # re operations release the GIL, so ThreadPoolExecutor gives real parallelism. - with ThreadPoolExecutor() as executor: - # Submit in order, preserve file order via index. - futures = {executor.submit(process_record, r): i for i, r in enumerate(records)} - ordered: list[tuple[int, tuple[str, list[int]]]] = [] - for future in as_completed(futures): - result = future.result() - if result is not None: - ordered.append((futures[future], result)) + max_workers = min(32, (os.cpu_count() or 1) * 2) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit(process_record, offsets[i], offsets[i + 1], i) + for i in range(len(offsets) - 1) + ] + results = [r for f in as_completed(futures) if (r := f.result()) is not None] - ordered.sort(key=lambda x: x[0]) - results = [r for _, r in ordered] - return results + results.sort(key=lambda x: x[0]) + return [r for _, r in results] From a02fce2fe8e46130db676063689d39ac1ed42339 Mon Sep 17 00:00:00 2001 From: Ross Kukard Date: Wed, 13 May 2026 11:30:01 -0700 Subject: [PATCH 6/8] Round 1 fixes --- rounds/1_histogram/solution.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py index 8921703..fe9bc2c 100644 --- a/rounds/1_histogram/solution.py +++ b/rounds/1_histogram/solution.py @@ -1,22 +1,15 @@ -"""Your Round 1 solution — byte-pair histogram. +"""Your Round 1 solution — byte-pair histogram.""" -**Edit this file.** It currently delegates to ``baseline.py`` so everything -passes out of the box. Replace the body of ``compute_histogram`` with your -own faster implementation. -""" +import numpy as np +import mmap def compute_histogram(path: str) -> dict[bytes, int]: """Frequency of every 2-byte bigram in the file at ``path``.""" with open(path, "rb") as f: - data = f.read() - counts: dict[bytes, int] = {} - bigrams_seen = set() - for i in range(len(data) - 1): - bigram = data[i : i + 2] - if bigram in bigrams_seen: - counts[bigram] += 1 - else: - bigrams_seen.add(bigram) - counts[bigram] = 1 - return counts + with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + # Copy while mmap is still open — no exported pointer issue + data = np.frombuffer(mm, dtype=np.uint8).copy() + keys = data[:-1].astype(np.uint16) << 8 | data[1:].astype(np.uint16) + counts = np.bincount(keys, minlength=65536) + return {bytes([k >> 8, k & 0xFF]): int(counts[k]) for k in np.nonzero(counts)[0]} From 3e9f823b9a25afcf434a98871c0147cd61b72bbc Mon Sep 17 00:00:00 2001 From: Ross Kukard Date: Wed, 13 May 2026 12:08:50 -0700 Subject: [PATCH 7/8] Round 3 - Switching from regex to find --- rounds/3_dna/solution.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 96eff93..74d2393 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -34,7 +34,11 @@ def process_record( lines = chunk.split(b"\n") record_id = lines[0][1:].strip().decode("ascii") sequence = b"".join(lines[1:]).replace(b" ", b"") - positions = [m.start() for m in regex.finditer(sequence)] + positions = [] + start_pos = 0 + while (hit := sequence.find(pattern, start_pos)) != -1: + positions.append(hit) + start_pos = hit + 1 if positions: return (idx, (record_id, positions)) return None From 6c050fa680d1265755fdb89763b466fde51e552d Mon Sep 17 00:00:00 2001 From: Ross Kukard Date: Wed, 13 May 2026 12:18:35 -0700 Subject: [PATCH 8/8] Round 2 - Corruptions cleaned up --- rounds/2_corruption/solution.py | 34 ++++++++++++++++++++++++--------- rounds/3_dna/solution.py | 6 ++---- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/rounds/2_corruption/solution.py b/rounds/2_corruption/solution.py index a5b752a..9f86315 100644 --- a/rounds/2_corruption/solution.py +++ b/rounds/2_corruption/solution.py @@ -1,14 +1,30 @@ -"""Your Round 2 solution — corruption scanner. +from __future__ import annotations -**Edit this file.** It currently delegates to ``baseline.py`` so everything -passes out of the box. Replace the body of ``find_corruptions`` with your -own faster implementation. -""" - -from .baseline import find_corruptions as _baseline +import numpy as np def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]: """Return ``[(offset, length), ...]`` for every differing byte range.""" - # TODO: remove this delegation and write your own implementation here. - return _baseline(ref_path, cor_path) + with open(ref_path, "rb") as f: + ref = np.frombuffer(f.read(), dtype=np.uint8) + with open(cor_path, "rb") as f: + cor = np.frombuffer(f.read(), dtype=np.uint8) + + if len(ref) != len(cor): + raise ValueError("reference and corrupted files differ in length") + + # Boolean mask of differing positions + mask = ref != cor + if not mask.any(): + return [] + + # Find run boundaries using diff on the mask + padded = np.empty(len(mask) + 2, dtype=np.int8) + padded[0] = 0 + padded[1:-1] = mask.view(np.int8) + padded[-1] = 0 + d = np.diff(padded.astype(np.int8)) + starts = np.where(d == 1)[0] + ends = np.where(d == -1)[0] + + return [(int(s), int(e - s)) for s, e in zip(starts, ends)] diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 74d2393..2a60ef6 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -1,7 +1,6 @@ """Your Round 3 solution — DNA sequence matcher.""" from __future__ import annotations -import re from concurrent.futures import ThreadPoolExecutor, as_completed import mmap import os @@ -13,7 +12,6 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] Returns ``[(record_id, [positions...]), ...]`` in file order. """ # Read as bytes — no decode overhead, pattern stays as bytes. - regex = re.compile(b"(?=" + re.escape(pattern) + b")") with open(fasta_path, "rb") as f: with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: data = bytes(mm) @@ -32,8 +30,8 @@ def process_record( ) -> tuple[int, tuple[str, list[int]]] | None: chunk = data[start:end] lines = chunk.split(b"\n") - record_id = lines[0][1:].strip().decode("ascii") - sequence = b"".join(lines[1:]).replace(b" ", b"") + record_id = lines[0][1:].rstrip().decode("ascii") + sequence = b"".join(lines[1:]) positions = [] start_pos = 0 while (hit := sequence.find(pattern, start_pos)) != -1: