Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,6 @@ __pycache__/

# CodSpeed
.codspeed/

# Mine
codes
2 changes: 1 addition & 1 deletion .python-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.15t
3.14
23 changes: 20 additions & 3 deletions rounds/1_histogram/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,28 @@
passes out of the box. Replace the body of ``compute_histogram`` with your
own faster implementation.
"""
import numpy as np


def compute_histogram(path: str) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    Returns a dict mapping each bigram that occurs at least once (as a
    2-byte ``bytes`` object) to its count. Windows overlap: ``b"ABCD"``
    yields ``b"AB"``, ``b"BC"``, ``b"CD"``. Empty and 1-byte files
    produce an empty dict.
    """
    from collections import Counter

    # Read the whole file into memory once; bigram counting needs the
    # full contiguous buffer anyway.
    with open(path, "rb") as f:
        data = f.read()

    # zip(data, data[1:]) walks the overlapping 2-byte windows as
    # (int, int) pairs entirely at C speed; Counter tallies them in one
    # pass. This replaces the hand-rolled per-byte Python loop over a
    # 256x256 table.
    pair_counts = Counter(zip(data, data[1:]))

    # Convert (int, int) keys back to the 2-byte bytes keys the callers
    # expect. Only observed bigrams appear, matching the sparse output
    # of the previous implementation.
    return {bytes(pair): count for pair, count in pair_counts.items()}
49 changes: 47 additions & 2 deletions rounds/3_dna/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,57 @@
"""

from .baseline import find_matches as _baseline
from threading import Thread
import numpy as np


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
"""Find every FASTA record whose sequence contains ``pattern``.

Returns ``[(record_id, [positions...]), ...]`` in file order.
"""
# TODO: remove this delegation and write your own implementation here.
return _baseline(fasta_path, pattern)
# Step 1: read the whole FASTA file as text and decode the pattern so the
# search below can use a single ``str`` API.
pattern_str = pattern.decode("ascii")
with open(fasta_path, "rb") as f:
text = f.read()
matches: list[tuple[str, list[int]]] = []

# Step 2: split the file on '>' to peel off one record at a time. The
# first element is the chunk before any header (empty for well-formed
# files) and is skipped by the ``.strip()`` guard below.
sequences = []
for record in text.split(b">"):
if not record.strip():
continue

# Step 3: a record looks like ``"<id>\n<seq line 1>\n<seq line 2>\n..."``.
# The id is the first line; the remaining lines are joined back into a
# single contiguous sequence string.
lines = record.split(b"\n")
record_id = lines[0].strip().decode("ascii")
sequence = b"".join(lines[1:]).replace(b" ", b"").decode("ascii")
sequences.append((record_id, sequence))

threads = []
for record_id, sequence in sequences:
thread = Thread(target=match_record, args=(record_id, sequence, pattern_str, matches))
thread.start()
threads.append(thread)
for thread in threads:
thread.join()

return matches

def match_record(record_id, sequence, pattern_str, matches):
    """Scan ``sequence`` for every (overlapping) occurrence of ``pattern_str``.

    Appends ``(record_id, [positions...])`` to ``matches`` when at least
    one occurrence exists; records with no hits are not appended.
    """
    hits: list[int] = []
    # Restart one character past each hit so overlapping matches are
    # all reported; find() returns -1 when nothing remains.
    offset = sequence.find(pattern_str)
    while offset != -1:
        hits.append(offset)
        offset = sequence.find(pattern_str, offset + 1)

    if hits:
        matches.append((record_id, hits))
Loading