2 changes: 1 addition & 1 deletion .python-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.15t
3.15
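
(Context, not part of the diff: in `.python-version` files consumed by tools like uv and pyenv, a trailing `t`, as in `3.15t`, selects the free-threaded CPython build, while plain `3.15` selects the default GIL build. This is relevant to the round-3 solution below, whose `ThreadPoolExecutor` scan can only run Python-level work in parallel on a free-threaded interpreter.)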
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Python Performance Lab: Sharpening Your Instincts
# Python Performance Lab: Sharpening Your Instincts - `adriencaccia`

A PyCon US 2026 hands-on tutorial. You optimize intentionally slow Python code
across three rounds plus a team challenge, measuring every change with
Expand Down
20 changes: 17 additions & 3 deletions rounds/1_histogram/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,21 @@

def compute_histogram(path: str) -> dict[bytes, int]:
"""Frequency of every 2-byte bigram in the file at ``path``."""
# TODO: remove this delegation and write your own implementation here.
from .baseline import compute_histogram as _baseline
# Step 1: read the whole file into memory as a single bytes object.
with open(path, "rb") as f:
data = f.read()

return _baseline(path)
# Create a 2D matrix to count bigrams
counts = [[0] * 256 for _ in range(256)]

for i in range(len(data) - 1):
# Increment the count in each cell
counts[data[i]][data[i + 1]] += 1

# Convert the matrix to the original format
output = {}
for i in range(256):
for j in range(256):
if counts[i][j] > 0:
output[bytes([i, j])] = counts[i][j]
return output
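
As a quick sanity check of the matrix-counting idea in that diff, here is a minimal standalone sketch; the `b"abab"` payload is made up for illustration and is not one of the lab's inputs:

# Illustrative sketch: same counting technique as the diff above, but on an
# in-memory payload instead of a file. b"abab" yields bigrams ab, ba, ab.
data = b"abab"

counts = [[0] * 256 for _ in range(256)]
for i in range(len(data) - 1):
    counts[data[i]][data[i + 1]] += 1

result = {
    bytes([i, j]): counts[i][j]
    for i in range(256)
    for j in range(256)
    if counts[i][j] > 0
}
assert result == {b"ab": 2, b"ba": 1}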
79 changes: 73 additions & 6 deletions rounds/3_dna/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,80 @@
own faster implementation.
"""

from .baseline import find_matches as _baseline
from __future__ import annotations

import os
from concurrent.futures import ThreadPoolExecutor

_NL = 0x0A # b"\n"


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
"""Find every FASTA record whose sequence contains ``pattern``.
with open(fasta_path, "rb") as f:
data = f.read()

# Step 1: locate every record start. A record starts with ``>`` either at
# offset 0 or immediately after a ``\n``.
starts: list[int] = []
i = 0
while True:
p = data.find(b">", i)
if p == -1:
break
if p == 0 or data[p - 1] == _NL:
starts.append(p)
i = p + 1
starts.append(len(data)) # sentinel marking the end of the last record.

num_records = len(starts) - 1
if num_records <= 0:
return []

# Step 2: parallel scan. Choose enough batches to keep workers balanced
# even when record sizes vary.
n_workers = max(1, os.cpu_count() or 1)
batches = max(1, n_workers * 4)
batch_size = max(1, (num_records + batches - 1) // batches)

def scan_batch(start_idx: int, end_idx: int) -> list[tuple[int, str, list[int]]]:
out: list[tuple[int, str, list[int]]] = []
for j in range(start_idx, end_idx):
rec_start = starts[j]
rec_end = starts[j + 1]

# Locate the end of the header line within this record's slice.
nl = data.find(b"\n", rec_start, rec_end)
if nl <= rec_start:
continue # Malformed or header-only.

record_id = data[rec_start + 1 : nl].decode("ascii").strip()

# Contiguous sequence: drop the newlines so matches that straddle
# line breaks are still found by ``bytes.find``.
sequence = data[nl + 1 : rec_end].replace(b"\n", b"")

positions: list[int] = []
s = 0
while True:
p = sequence.find(pattern, s)
if p == -1:
break
positions.append(p)
s = p + 1

if positions:
out.append((j, record_id, positions))
return out

with ThreadPoolExecutor(max_workers=n_workers) as pool:
futures = [
pool.submit(scan_batch, lo, min(lo + batch_size, num_records))
for lo in range(0, num_records, batch_size)
]
chunks = [f.result() for f in futures]

Returns ``[(record_id, [positions...]), ...]`` in file order.
"""
# TODO: remove this delegation and write your own implementation here.
return _baseline(fasta_path, pattern)
# Step 3: flatten and restore file order (record index is monotonic per
# batch, but batches finish in arbitrary order).
flat = [item for chunk in chunks for item in chunk]
flat.sort(key=lambda triple: triple[0])
return [(rid, positions) for _, rid, positions in flat]
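
One subtlety in `scan_batch` worth calling out: resuming the search at `p + 1` rather than `p + len(pattern)` preserves overlapping occurrences. A tiny standalone sketch of that behavior; the `b"ATATAT"` sequence is invented for illustration, not taken from the lab's fixtures:

# Illustrative sketch of the overlap-preserving scan used in scan_batch.
sequence = b"ATATAT"
pattern = b"ATA"

positions = []
s = 0
while True:
    p = sequence.find(pattern, s)
    if p == -1:
        break
    positions.append(p)
    s = p + 1  # one byte past the hit, so the overlapping match at 2 survives

assert positions == [0, 2]  # advancing by len(pattern) would find [0] only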