diff --git a/README.md b/README.md
index 44e0723..179cbcb 100644
--- a/README.md
+++ b/README.md
@@ -91,3 +91,4 @@ scripts/
 ```
 Each round's `data/` directory is generated locally and gitignored.
 
+This is RossK1's PR
diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py
index dffbee5..fe9bc2c 100644
--- a/rounds/1_histogram/solution.py
+++ b/rounds/1_histogram/solution.py
@@ -1,14 +1,19 @@
-"""Your Round 1 solution — byte-pair histogram.
+"""Your Round 1 solution — byte-pair histogram."""
 
-**Edit this file.** It currently delegates to ``baseline.py`` so everything
-passes out of the box. Replace the body of ``compute_histogram`` with your
-own faster implementation.
-"""
+import numpy as np
+import mmap
 
 
 def compute_histogram(path: str) -> dict[bytes, int]:
     """Frequency of every 2-byte bigram in the file at ``path``."""
-    # TODO: remove this delegation and write your own implementation here.
-    from .baseline import compute_histogram as _baseline
-
-    return _baseline(path)
+    with open(path, "rb") as f:
+        if f.seek(0, 2) == 0:
+            # mmap.mmap raises ValueError on zero-length files; an empty
+            # file has no bigrams.
+            return {}
+        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
+            # Copy while mmap is still open — no exported pointer issue
+            data = np.frombuffer(mm, dtype=np.uint8).copy()
+    keys = data[:-1].astype(np.uint16) << 8 | data[1:].astype(np.uint16)
+    counts = np.bincount(keys, minlength=65536)
+    return {bytes([k >> 8, k & 0xFF]): int(counts[k]) for k in np.nonzero(counts)[0]}
-""" - -from .baseline import find_corruptions as _baseline +import numpy as np def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]: """Return ``[(offset, length), ...]`` for every differing byte range.""" - # TODO: remove this delegation and write your own implementation here. - return _baseline(ref_path, cor_path) + with open(ref_path, "rb") as f: + ref = np.frombuffer(f.read(), dtype=np.uint8) + with open(cor_path, "rb") as f: + cor = np.frombuffer(f.read(), dtype=np.uint8) + + if len(ref) != len(cor): + raise ValueError("reference and corrupted files differ in length") + + # Boolean mask of differing positions + mask = ref != cor + if not mask.any(): + return [] + + # Find run boundaries using diff on the mask + padded = np.empty(len(mask) + 2, dtype=np.int8) + padded[0] = 0 + padded[1:-1] = mask.view(np.int8) + padded[-1] = 0 + d = np.diff(padded.astype(np.int8)) + starts = np.where(d == 1)[0] + ends = np.where(d == -1)[0] + + return [(int(s), int(e - s)) for s, e in zip(starts, ends)] diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 8b917da..2a60ef6 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -1,11 +1,9 @@ -"""Your Round 3 solution — DNA sequence matcher. +"""Your Round 3 solution — DNA sequence matcher.""" -**Edit this file.** It currently delegates to ``baseline.py`` so everything -passes out of the box. Replace the body of ``find_matches`` with your -own faster implementation. -""" - -from .baseline import find_matches as _baseline +from __future__ import annotations +from concurrent.futures import ThreadPoolExecutor, as_completed +import mmap +import os def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: @@ -13,5 +11,43 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] Returns ``[(record_id, [positions...]), ...]`` in file order. """ - # TODO: remove this delegation and write your own implementation here. 
-    return _baseline(fasta_path, pattern)
+    # Read as bytes — no decode overhead, pattern stays as bytes.
+    if os.path.getsize(fasta_path) == 0:
+        # mmap.mmap raises ValueError on zero-length files; an empty
+        # FASTA simply has no records.
+        return []
+    with open(fasta_path, "rb") as f:
+        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
+            data = bytes(mm)
+    size = len(data)
+
+    # Find record boundaries: every record starts at a ">" header.
+    # Anchor at the first ">" so any content before it is skipped
+    # rather than parsed as a record with a mangled id.
+    first = data.find(b">")
+    if first == -1:
+        return []
+    offsets = [first]
+    pos = data.find(b">", first + 1)
+    while pos != -1:
+        offsets.append(pos)
+        pos = data.find(b">", pos + 1)
+    offsets.append(size)
+
+    def process_record(
+        start: int, end: int, idx: int
+    ) -> tuple[int, tuple[str, list[int]]] | None:
+        # A record is "<header line>\n<sequence lines...>"; positions are
+        # indices into the newline-stripped, concatenated sequence.
+        chunk = data[start:end]
+        lines = chunk.split(b"\n")
+        record_id = lines[0][1:].rstrip().decode("ascii")
+        sequence = b"".join(lines[1:])
+        positions = []
+        start_pos = 0
+        # Advance by 1 (not len(pattern)) so overlapping matches count.
+        while (hit := sequence.find(pattern, start_pos)) != -1:
+            positions.append(hit)
+            start_pos = hit + 1
+        if positions:
+            return (idx, (record_id, positions))
+        return None
+
+    # NOTE(review): bytes.find runs under the GIL, so the pool mostly
+    # overlaps the C-level scans — confirm the speedup vs a serial loop.
+    max_workers = min(32, (os.cpu_count() or 1) * 2)
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(process_record, offsets[i], offsets[i + 1], i)
+            for i in range(len(offsets) - 1)
+        ]
+        results = [r for f in as_completed(futures) if (r := f.result()) is not None]
+
+    results.sort(key=lambda x: x[0])
+    return [r for _, r in results]