From 025c52c718f6900d6d08d6b1bccf880238048e44 Mon Sep 17 00:00:00 2001 From: Tim Schilling Date: Wed, 13 May 2026 09:17:27 -0700 Subject: [PATCH 1/7] Add Tim Schilling to the README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 44e0723..aa94959 100644 --- a/README.md +++ b/README.md @@ -91,3 +91,4 @@ scripts/ ``` Each round's `data/` directory is generated locally and gitignored. +This is Tim Schilling's PR From 4bca2683345a173c5b14d376a02610dda55d41c1 Mon Sep 17 00:00:00 2001 From: Tim Schilling Date: Wed, 13 May 2026 09:47:46 -0700 Subject: [PATCH 2/7] Round 1 1.45s, collections.Counter --- rounds/1_histogram/solution.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py index dffbee5..e59f210 100644 --- a/rounds/1_histogram/solution.py +++ b/rounds/1_histogram/solution.py @@ -4,11 +4,26 @@ passes out of the box. Replace the body of ``compute_histogram`` with your own faster implementation. """ +from collections import Counter + + +def get_biagrams(data): + i = 0 + max_len = len(data) -1 + while i < max_len: + biagram = data[i: i + 2] + yield biagram + i += 1 def compute_histogram(path: str) -> dict[bytes, int]: """Frequency of every 2-byte bigram in the file at ``path``.""" - # TODO: remove this delegation and write your own implementation here. - from .baseline import compute_histogram as _baseline + # Step 1: read the whole file into memory as a single bytes object. + with open(path, "rb") as f: + data = f.read() - return _baseline(path) + # Step 2: slide a 2-byte window across the buffer. For ``b"ABCD"`` the + # iterations produce ``b"AB"``, ``b"BC"``, then ``b"CD"``. For each window, + # bump the matching bucket in a ``dict`` keyed by the bigram itself. + counts = Counter(get_biagrams(data)) + return counts From 6af57c3497e4330875b9947e74aadad6fcd0fae0 Mon Sep 17 00:00:00 2001 From: Tim Schilling Date: Wed, 13 May 2026 10:14:33 -0700 Subject: [PATCH 3/7] Use struct.unpack and iterate over rather than creating new lists. --- rounds/1_histogram/solution.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py index e59f210..6f78811 100644 --- a/rounds/1_histogram/solution.py +++ b/rounds/1_histogram/solution.py @@ -5,15 +5,16 @@ own faster implementation. """ from collections import Counter +from struct import unpack def get_biagrams(data): - i = 0 - max_len = len(data) -1 - while i < max_len: - biagram = data[i: i + 2] - yield biagram - i += 1 + data_iter = iter(unpack(f'{len(data)}c', data)) + val_0, val_1 = next(data_iter), next(data_iter) + for value in data_iter: + yield val_0+val_1 + val_0, val_1 = val_1, value + yield val_0+val_1 def compute_histogram(path: str) -> dict[bytes, int]: From ee4959fca47d340f3cfc9c4631dce1a0c41efce4 Mon Sep 17 00:00:00 2001 From: Tim Schilling Date: Wed, 13 May 2026 11:11:27 -0700 Subject: [PATCH 4/7] Use threads to collect the matches. --- rounds/3_dna/solution.py | 50 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 8b917da..9f3ce95 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -4,14 +4,60 @@ passes out of the box. Replace the body of ``find_matches`` with your own faster implementation. """ +import re +from concurrent.futures import ThreadPoolExecutor, as_completed from .baseline import find_matches as _baseline +def find_record_matches(pattern_str, sequence): + # Step 4: walk the sequence with ``str.find()``, advancing one byte + # past each hit so overlapping matches are reported too. + positions: list[int] = [] + start = 0 + while True: + pos = sequence.find(pattern_str, start) + if pos == -1: + break + positions.append(pos) + start = pos + 1 + return positions + def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: """Find every FASTA record whose sequence contains ``pattern``. Returns ``[(record_id, [positions...]), ...]`` in file order. """ - # TODO: remove this delegation and write your own implementation here. - return _baseline(fasta_path, pattern) + # Step 1: read the whole FASTA file as text and decode the pattern so the + # search below can use a single ``str`` API. + pattern_str = pattern.decode("ascii") + with open(fasta_path, "r") as f: + text = f.read() + + # Step 2: split the file on '>' to peel off one record at a time. The + # first element is the chunk before any header (empty for well-formed + # files) and is skipped by the ``.strip()`` guard below. + + futures = {} + + with ThreadPoolExecutor(max_workers=None) as executor: + + for record in text.split(">"): + if not record.strip(): + continue + + # Step 3: a record looks like ``"\n\n\n..."``. + # The id is the first line; the remaining lines are joined back into a + # single contiguous sequence string. + lines = record.split("\n") + record_id = lines[0].strip() + sequence = "".join(lines[1:]).replace(" ", "") + + futures[executor.submit(find_record_matches, pattern_str, sequence)] = record_id + + + return [ + (record_id, positions) + for future, record_id in futures.items() + if (positions := future.result()) + ] From 2510d94ecb403b570892a58e44e2a09c5d536f90 Mon Sep 17 00:00:00 2001 From: Tim Schilling Date: Wed, 13 May 2026 11:20:47 -0700 Subject: [PATCH 5/7] Use Numpy vectorized comparisons. --- rounds/3_dna/solution.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 9f3ce95..31cf0d7 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -4,10 +4,35 @@ passes out of the box. Replace the body of ``find_matches`` with your own faster implementation. """ -import re -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import ThreadPoolExecutor -from .baseline import find_matches as _baseline +import numpy as np + + +def find_record_matches(pattern_str, sequence): + if not pattern_str or not sequence: + return [] + + # Convert strings to numpy byte arrays + seq_arr = np.frombuffer(sequence.encode(), dtype=np.uint8) + pat_arr = np.frombuffer(pattern_str.encode(), dtype=np.uint8) + + pat_len = len(pat_arr) + seq_len = len(seq_arr) + + if pat_len > seq_len: + return [] + + # Create a 2D view of the sequence using a sliding window (no data copy) + # Shape: (seq_len - pat_len + 1, pat_len) + shape = (seq_len - pat_len + 1, pat_len) + strides = (seq_arr.strides[0], seq_arr.strides[0]) + windows = np.lib.stride_tricks.as_strided(seq_arr, shape=shape, strides=strides) + + # Compare every window against the pattern in one vectorized operation + matches = np.all(windows == pat_arr, axis=1) + + return np.where(matches)[0].tolist() def find_record_matches(pattern_str, sequence): From 30446b32d1b9ec487cb6d8605dc6630af738f779 Mon Sep 17 00:00:00 2001 From: Tim Schilling Date: Wed, 13 May 2026 11:56:59 -0700 Subject: [PATCH 6/7] Remove numpy. Switch to chunking the file into separate threads. --- rounds/3_dna/solution.py | 117 +++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 61 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 31cf0d7..02d9dff 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -4,40 +4,10 @@ passes out of the box. Replace the body of ``find_matches`` with your own faster implementation. """ +import os from concurrent.futures import ThreadPoolExecutor -import numpy as np - - -def find_record_matches(pattern_str, sequence): - if not pattern_str or not sequence: - return [] - - # Convert strings to numpy byte arrays - seq_arr = np.frombuffer(sequence.encode(), dtype=np.uint8) - pat_arr = np.frombuffer(pattern_str.encode(), dtype=np.uint8) - - pat_len = len(pat_arr) - seq_len = len(seq_arr) - - if pat_len > seq_len: - return [] - - # Create a 2D view of the sequence using a sliding window (no data copy) - # Shape: (seq_len - pat_len + 1, pat_len) - shape = (seq_len - pat_len + 1, pat_len) - strides = (seq_arr.strides[0], seq_arr.strides[0]) - windows = np.lib.stride_tricks.as_strided(seq_arr, shape=shape, strides=strides) - - # Compare every window against the pattern in one vectorized operation - matches = np.all(windows == pat_arr, axis=1) - - return np.where(matches)[0].tolist() - - -def find_record_matches(pattern_str, sequence): - # Step 4: walk the sequence with ``str.find()``, advancing one byte - # past each hit so overlapping matches are reported too. +def _find_record_matches(pattern_str, sequence): positions: list[int] = [] start = 0 while True: @@ -48,41 +18,66 @@ def find_record_matches(pattern_str, sequence): start = pos + 1 return positions + +def _search_chunk(fasta_path, chunk_start, chunk_end, pattern_str): + with open(fasta_path, "r") as f: + f.seek(chunk_start) + if chunk_end is None: + text = f.read() + else: + # One bulk read for the chunk, then a few readline() calls to + # complete the last record that extends past our boundary. + text = f.read(chunk_end - chunk_start) + while True: + line = f.readline() + if not line or line.startswith(">"): + break + text += line + + # For chunks that don't start at byte 0, skip the partial-record fragment + # at the front (bytes belonging to the previous chunk's last record). + if chunk_start > 0: + if not text.startswith(">"): + idx = text.find("\n>") + if idx == -1: + return [] + text = text[idx + 1:] # keep the ">" + + results = [] + for record in text.split(">"): + if not record.strip(): + continue + lines = record.split("\n") + record_id = lines[0].strip() + sequence = "".join(lines[1:]).replace(" ", "") + positions = _find_record_matches(pattern_str, sequence) + if positions: + results.append((record_id, positions)) + return results + + def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: """Find every FASTA record whose sequence contains ``pattern``. Returns ``[(record_id, [positions...]), ...]`` in file order. """ - # Step 1: read the whole FASTA file as text and decode the pattern so the - # search below can use a single ``str`` API. pattern_str = pattern.decode("ascii") - with open(fasta_path, "r") as f: - text = f.read() - - # Step 2: split the file on '>' to peel off one record at a time. The - # first element is the chunk before any header (empty for well-formed - # files) and is skipped by the ``.strip()`` guard below. - - futures = {} + num_threads = os.cpu_count() or 4 + file_size = os.path.getsize(fasta_path) + chunk_size = max(1, file_size // num_threads) - with ThreadPoolExecutor(max_workers=None) as executor: - - for record in text.split(">"): - if not record.strip(): - continue - - # Step 3: a record looks like ``"\n\n\n..."``. - # The id is the first line; the remaining lines are joined back into a - # single contiguous sequence string. - lines = record.split("\n") - record_id = lines[0].strip() - sequence = "".join(lines[1:]).replace(" ", "") - - futures[executor.submit(find_record_matches, pattern_str, sequence)] = record_id + chunks = [ + (i * chunk_size, (i + 1) * chunk_size if i < num_threads - 1 else None) + for i in range(num_threads) + ] + with ThreadPoolExecutor(max_workers=num_threads) as executor: + futures = [ + executor.submit(_search_chunk, fasta_path, start, end, pattern_str) + for start, end in chunks + ] - return [ - (record_id, positions) - for future, record_id in futures.items() - if (positions := future.result()) - ] + results = [] + for future in futures: + results.extend(future.result()) + return results From 810aa8da1b3ac49680fe66b8d8825b0a077cb065 Mon Sep 17 00:00:00 2001 From: Tim Schilling Date: Wed, 13 May 2026 12:11:06 -0700 Subject: [PATCH 7/7] Switch to bytes and away from strings. --- rounds/3_dna/solution.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 02d9dff..816092f 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -7,11 +7,12 @@ import os from concurrent.futures import ThreadPoolExecutor -def _find_record_matches(pattern_str, sequence): + +def _find_record_matches(pattern, sequence): positions: list[int] = [] start = 0 while True: - pos = sequence.find(pattern_str, start) + pos = sequence.find(pattern, start) if pos == -1: break positions.append(pos) @@ -19,38 +20,40 @@ def _find_record_matches(pattern_str, sequence): return positions -def _search_chunk(fasta_path, chunk_start, chunk_end, pattern_str): - with open(fasta_path, "r") as f: +def _search_chunk(fasta_path, chunk_start, chunk_end, pattern): + with open(fasta_path, "rb") as f: f.seek(chunk_start) if chunk_end is None: text = f.read() else: # One bulk read for the chunk, then a few readline() calls to # complete the last record that extends past our boundary. - text = f.read(chunk_end - chunk_start) + # Collect parts in a list to avoid O(n²) bytes concatenation. + parts = [f.read(chunk_end - chunk_start)] while True: line = f.readline() - if not line or line.startswith(">"): + if not line or line.startswith(b">"): break - text += line + parts.append(line) + text = b"".join(parts) # For chunks that don't start at byte 0, skip the partial-record fragment # at the front (bytes belonging to the previous chunk's last record). if chunk_start > 0: - if not text.startswith(">"): - idx = text.find("\n>") + if not text.startswith(b">"): + idx = text.find(b"\n>") if idx == -1: return [] text = text[idx + 1:] # keep the ">" results = [] - for record in text.split(">"): + for record in text.split(b">"): if not record.strip(): continue - lines = record.split("\n") - record_id = lines[0].strip() - sequence = "".join(lines[1:]).replace(" ", "") - positions = _find_record_matches(pattern_str, sequence) + lines = record.split(b"\n") + record_id = lines[0].strip().decode("ascii") + sequence = b"".join(lines[1:]).replace(b" ", b"") + positions = _find_record_matches(pattern, sequence) if positions: results.append((record_id, positions)) return results @@ -61,7 +64,6 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] Returns ``[(record_id, [positions...]), ...]`` in file order. """ - pattern_str = pattern.decode("ascii") num_threads = os.cpu_count() or 4 file_size = os.path.getsize(fasta_path) chunk_size = max(1, file_size // num_threads) @@ -73,7 +75,7 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] with ThreadPoolExecutor(max_workers=num_threads) as executor: futures = [ - executor.submit(_search_chunk, fasta_path, start, end, pattern_str) + executor.submit(_search_chunk, fasta_path, start, end, pattern) for start, end in chunks ]