From c6de5a7edefcb963c02c092e242036026fc15a3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= Date: Thu, 2 Jul 2026 11:22:28 +0900 Subject: [PATCH] Add MS MARCO large-tier benchmark path --- .github/workflows/public-scale.yml | 34 +++- .gitignore | 1 + .../diagnostics/public_scale_20260702.md | 20 ++ examples/ablation/download_benchmarks.py | 182 +++++++++++++++++- examples/ablation/run_tier1_benchmarks.py | 145 +++++++++++--- tests/test_download_benchmarks.py | 66 +++++++ tests/test_tier1_benchmarks.py | 57 ++++++ 7 files changed, 476 insertions(+), 29 deletions(-) create mode 100644 tests/test_download_benchmarks.py diff --git a/.github/workflows/public-scale.yml b/.github/workflows/public-scale.yml index 1c231c7..86992b0 100644 --- a/.github/workflows/public-scale.yml +++ b/.github/workflows/public-scale.yml @@ -8,7 +8,18 @@ name: Public Scale Guard on: schedule: - cron: "30 6 * * 2" # Tuesdays 06:30 UTC - workflow_dispatch: {} + workflow_dispatch: + inputs: + run_msmarco: + description: "Also run the manual MS MARCO large-tier smoke" + required: false + type: boolean + default: false + msmarco_corpus_limit: + description: "MS MARCO shard/search corpus limit" + required: false + type: string + default: "100000" jobs: public-scale: @@ -67,6 +78,25 @@ jobs: cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \ /tmp/trec_covid_scale_guard.md + - name: Download MS MARCO large-tier shard + if: ${{ github.event_name == 'workflow_dispatch' && inputs.run_msmarco }} + run: | + uv run --extra eval python examples/ablation/download_benchmarks.py \ + --only msmarco_passage \ + --large-corpus-limit "${{ inputs.msmarco_corpus_limit }}" + + - name: Run MS MARCO large-tier smoke + if: ${{ github.event_name == 'workflow_dispatch' && inputs.run_msmarco }} + run: | + set -o pipefail + PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py \ + --only msmarco \ + --subset 50 \ + --corpus-limit "${{ inputs.msmarco_corpus_limit }}" \ + --use-sqlite-graph | tee /tmp/msmarco_large_guard.log + cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \ + /tmp/msmarco_large_guard.md + - name: Upload public scale results if: always() uses: actions/upload-artifact@v4 @@ -77,4 +107,6 @@ jobs: /tmp/fiqa_scale_guard.md /tmp/trec_covid_scale_guard.log /tmp/trec_covid_scale_guard.md + /tmp/msmarco_large_guard.log + /tmp/msmarco_large_guard.md if-no-files-found: ignore diff --git a/.gitignore b/.gitignore index 1b2d1c2..29ce6ce 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.pyc tests/benchmark/data/*.json +tests/benchmark/data/*.jsonl .claude/ tmp/ 마사회/ diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md index 07e33c5..9ca4b94 100644 --- a/examples/ablation/diagnostics/public_scale_20260702.md +++ b/examples/ablation/diagnostics/public_scale_20260702.md @@ -6,6 +6,7 @@ |---------|----------------|-------:|--------:|-------------| | BEIR FiQA test | `tests/benchmark/data/fiqa.json` | 57,638 docs | 648 | 5-10 queries | | BEIR TREC-COVID test | `tests/benchmark/data/trec_covid.json` | 171,332 docs | 50 | 10 queries | +| BEIR MS MARCO passage dev | `tests/benchmark/data/msmarco_passage.json` + `.corpus.jsonl` | 1M shard by default from ~8.8M source passages | validation qrels | manual large tier | Mode: embedder-free `graph.search()` with `SqliteGraphBackend`. @@ -22,6 +23,9 @@ PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benc PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --corpus-limit 50000 --use-sqlite-graph PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --corpus-limit 100000 --use-sqlite-graph PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --use-sqlite-graph + +uv run --extra eval python examples/ablation/download_benchmarks.py --only msmarco_passage --large-corpus-limit 1000000 +PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only msmarco --subset 50 --corpus-limit 1000000 --use-sqlite-graph ``` ## FiQA Results @@ -55,9 +59,24 @@ After the SQLite batch FTS optimization: TREC-COVID has many relevant documents per query, so R@5/R@10 is naturally small in this smoke even when Hit@10 is perfect. +## MS MARCO Passage Results + +Manual large-tier shard from BEIR/MS MARCO passage validation: + +| Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search | +|-----:|--------:|-------:|----:|-----:|-------:|------:|-------:| +| 100,000 | 50 | 0.673 | 0.740 | 0.770 | 39/50 | 81.9s | 5.4s | + +The local artifacts are gitignored: + +- `tests/benchmark/data/msmarco_passage.json` - 511 KB manifest +- `tests/benchmark/data/msmarco_passage.corpus.jsonl` - 35 MB corpus shard + ## Interpretation - Search latency remains usable at 171k docs: 5.2s over 10 queries. +- MS MARCO confirms the large-tier path on a web passage corpus: 100k docs, + 50 queries, 5.4s total search, and 0.673 MRR@10 without embeddings or reranking. - The main large-corpus bottleneck is still initial FTS/index build, not retrieval. - Avoiding unnecessary FTS deletes for newly inserted nodes reduced full FiQA build time by about 9.9x. - Raising benchmark ingest batches to 20k reduced full TREC-COVID build time by about 2.7x. @@ -67,6 +86,7 @@ small in this smoke even when Hit@10 is perfect. - `.github/workflows/public-scale.yml` runs weekly/manual FiQA 10k and TREC-COVID 50k staged smokes. - FiQA 25k/full and TREC-COVID 100k/full remain manual checks because they are multi-minute runs and depend on ignored local benchmark data. +- MS MARCO passage is the manual large tier: the downloader writes metadata JSON plus a gitignored corpus JSONL shard so 100k/1M/8.8M-style scale can be tested without committing giant artifacts. - If 100k+ docs becomes a required routine gate, the next target is faster initial FTS/index build. ## Remote Guard Dispatch diff --git a/examples/ablation/download_benchmarks.py b/examples/ablation/download_benchmarks.py index fc6d160..0949838 100644 --- a/examples/ablation/download_benchmarks.py +++ b/examples/ablation/download_benchmarks.py @@ -38,6 +38,7 @@ REPO_ROOT = Path(__file__).resolve().parents[2] OUT_DIR = REPO_ROOT / "tests" / "benchmark" / "data" +MSMARCO_DEFAULT_CORPUS_LIMIT = 1_000_000 def _hash_doc(title: str, text: str) -> str: @@ -45,17 +46,39 @@ def _hash_doc(title: str, text: str) -> str: return hashlib.blake2b((title + "||" + text).encode("utf-8"), digest_size=8).hexdigest() +def _display_path(path: Path) -> Path: + try: + return path.relative_to(REPO_ROOT) + except ValueError: + return path + + def _write(path: Path, obj: dict) -> None: path.parent.mkdir(parents=True, exist_ok=True) with open(path, "w", encoding="utf-8") as f: json.dump(obj, f, ensure_ascii=False) size_mb = path.stat().st_size / (1024 * 1024) print( - f" → {path.relative_to(REPO_ROOT)} " + f" → {_display_path(path)} " f"({size_mb:.1f} MB, {len(obj['corpus'])} docs, {len(obj['queries'])} queries)" ) +def _write_manifest(path: Path, obj: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + json.dump(obj, f, ensure_ascii=False) + corpus_path = path.parent / str(obj.get("corpus_path", "")) + corpus_size_mb = corpus_path.stat().st_size / (1024 * 1024) + manifest_size_kb = path.stat().st_size / 1024 + print( + f" → {_display_path(path)} " + f"({manifest_size_kb:.1f} KB manifest, " + f"{corpus_size_mb:.1f} MB corpus jsonl, " + f"{obj['corpus_size']} docs, {len(obj['queries'])} queries)" + ) + + # --- HotPotQA -------------------------------------------------------- @@ -322,6 +345,140 @@ def build_scifact(out_path: Path) -> None: _build_beir("BeIR/scifact", "test", "scifact", out_path) +def _build_beir_jsonl_shard( + corpus_repo: str, + split: str, + label: str, + out_path: Path, + *, + corpus_limit: int, + numeric_docid_index: bool = False, +) -> None: + """Build a large BEIR shard as metadata JSON + corpus JSONL. + + The small BEIR datasets fit comfortably in one JSON object. MS MARCO + does not: the source corpus has millions of passages. This writer + keeps all positive qrel docs for the selected split, then fills the + shard with corpus-order distractors up to ``corpus_limit``. + """ + from datasets import load_dataset + + if corpus_limit <= 0: + raise ValueError("--large-corpus-limit must be positive") + + print( + f"Loading BEIR {label} ({corpus_repo}, split={split}, " + f"jsonl shard limit={corpus_limit:,})..." + ) + + queries_ds = load_dataset(corpus_repo, "queries", split="queries") + qrels_ds = load_dataset(f"BeIR/{label}-qrels", split=split) + + qrels: dict[str, dict[str, int]] = {} + for row in qrels_ds: + qid = str(row["query-id"]) + did = str(row["corpus-id"]) + score = int(row.get("score") or 0) + if score <= 0: + continue + qrels.setdefault(qid, {})[did] = score + + queries: dict[str, str] = {} + for row in queries_ds: + qid = str(row["_id"]) + text = str(row.get("text") or "").strip() + if text and qid in qrels: + queries[qid] = text + + qrels = {qid: rel for qid, rel in qrels.items() if qid in queries} + gold_doc_ids = {did for rel in qrels.values() for did in rel} + filler_budget = max(corpus_limit - len(gold_doc_ids), 0) + + def row_payload(row: dict, doc_id: str) -> dict[str, str]: + return { + "_id": doc_id, + "title": str(row.get("title") or ""), + "text": str(row.get("text") or ""), + } + + corpus_path = out_path.with_suffix(".corpus.jsonl") + written_gold: set[str] = set() + written_docs = 0 + filler_docs = 0 + + corpus_ds = load_dataset( + corpus_repo, + "corpus", + split="corpus", + streaming=not numeric_docid_index, + ) + out_path.parent.mkdir(parents=True, exist_ok=True) + with open(corpus_path, "w", encoding="utf-8") as f: + if numeric_docid_index: + for did in sorted(gold_doc_ids, key=lambda value: int(value)): + row = corpus_ds[int(did)] + if str(row.get("_id")) != did: + continue + f.write(json.dumps(row_payload(row, did), ensure_ascii=False) + "\n") + written_gold.add(did) + written_docs += 1 + + for row in corpus_ds: + did = str(row["_id"]) + if did in gold_doc_ids: + if did in written_gold: + continue + written_gold.add(did) + elif filler_docs < filler_budget: + filler_docs += 1 + else: + continue + + f.write(json.dumps(row_payload(row, did), ensure_ascii=False) + "\n") + written_docs += 1 + + if filler_docs >= filler_budget and len(written_gold) >= len(gold_doc_ids): + break + + missing_gold = sorted(gold_doc_ids - written_gold) + _write_manifest( + out_path, + { + "name": f"BEIR {label} {split} large shard", + "schema": "beir_jsonl_v1", + "source": f"huggingface: {corpus_repo}", + "source_corpus": "MS MARCO passage ranking (~8.8M passages)", + "corpus_path": corpus_path.name, + "corpus_limit": corpus_limit, + "corpus_size": written_docs, + "query_size": len(queries), + "qrels_size": len(qrels), + "qrels_rows": sum(len(rel) for rel in qrels.values()), + "preserved_gold_docs": len(written_gold), + "missing_gold_docs": missing_gold[:100], + "queries": queries, + "qrels": qrels, + }, + ) + + +def build_msmarco_passage(out_path: Path, *, corpus_limit: int) -> None: + """BEIR MS MARCO passage dev — web-scale passage retrieval. + + The full source corpus is ~8.8M passages. The default local shard is + 1M passages, preserving validation positives before filling with + distractors. Increase ``--large-corpus-limit`` for heavier runs. + """ + _build_beir_jsonl_shard( + "BeIR/msmarco", + "validation", + "msmarco", + out_path, + corpus_limit=corpus_limit, + numeric_docid_index=True, + ) + + BUILDERS = { "hotpotqa_full": (build_hotpotqa, "hotpotqa_full.json"), "musique": (build_musique, "musique_dev.json"), @@ -330,6 +487,9 @@ def build_scifact(out_path: Path) -> None: "fiqa": (build_fiqa, "fiqa.json"), "scifact": (build_scifact, "scifact.json"), } +LARGE_BUILDERS = { + "msmarco_passage": (build_msmarco_passage, "msmarco_passage.json"), +} def main() -> None: @@ -339,15 +499,31 @@ def main() -> None: default=",".join(BUILDERS), help="comma-separated dataset names (default: all)", ) + p.add_argument( + "--large-corpus-limit", + type=int, + default=MSMARCO_DEFAULT_CORPUS_LIMIT, + help=( + "Corpus rows to keep for large JSONL-sharded datasets such as " + f"msmarco_passage (default: {MSMARCO_DEFAULT_CORPUS_LIMIT:,})." + ), + ) args = p.parse_args() names = [n.strip() for n in args.only.split(",") if n.strip()] - unknown = [n for n in names if n not in BUILDERS] + available = {**BUILDERS, **LARGE_BUILDERS} + unknown = [n for n in names if n not in available] if unknown: - print(f"Unknown datasets: {unknown}; available: {list(BUILDERS)}") + print(f"Unknown datasets: {unknown}; available: {list(available)}") sys.exit(1) for name in names: + if name in LARGE_BUILDERS: + builder, filename = LARGE_BUILDERS[name] + out_path = OUT_DIR / filename + print(f"\n=== {name} ===") + builder(out_path, corpus_limit=args.large_corpus_limit) + continue builder, filename = BUILDERS[name] out_path = OUT_DIR / filename print(f"\n=== {name} ===") diff --git a/examples/ablation/run_tier1_benchmarks.py b/examples/ablation/run_tier1_benchmarks.py index 7af75d8..2fdba61 100644 --- a/examples/ablation/run_tier1_benchmarks.py +++ b/examples/ablation/run_tier1_benchmarks.py @@ -5,7 +5,9 @@ large BEIR-style retrieval corpora (FiQA, TREC-COVID, SciFact). The multi-hop sets are the datasets HippoRAG2, GraphRAG, and the broader KG-RAG line use for head-to-head comparisons; the BEIR sets are useful -large-corpus scale checks with query/qrels ground truth. +large-corpus scale checks with query/qrels ground truth. MS MARCO uses +a metadata JSON + corpus JSONL shard so 1M+ passage runs do not require +committing giant benchmark artifacts. Two modes: @@ -30,6 +32,8 @@ python examples/ablation/run_tier1_benchmarks.py python examples/ablation/run_tier1_benchmarks.py --only hotpotqa python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 100 + python examples/ablation/run_tier1_benchmarks.py --only msmarco --subset 50 \\ + --corpus-limit 1000000 --use-sqlite-graph python examples/ablation/run_tier1_benchmarks.py --subset 200 # Full pipeline with Ollama embedder + TEI cross-encoder @@ -114,9 +118,121 @@ class Dataset: path=BENCH / "scifact.json", reference="BEIR SciFact: ~5k docs / 300 test queries", ), + Dataset( + name="MS MARCO passage dev", + path=BENCH / "msmarco_passage.json", + reference="BEIR/MS MARCO passage: ~8.8M source passages; JSONL shard", + ), ] +CorpusItem = tuple[str, str, str] + + +def _selected_gold_doc_ids( + qrels: dict, + query_items: list[tuple[str, str]], +) -> set[str]: + gold_doc_ids: set[str] = set() + for qid, _qtext in query_items: + rel = qrels.get(qid, {}) + if isinstance(rel, dict): + gold_doc_ids.update(str(doc_id) for doc_id in rel) + else: + gold_doc_ids.update(str(doc_id) for doc_id in rel) + return gold_doc_ids + + +def _load_inline_corpus_items( + corpus: dict, + qrels: dict, + query_items: list[tuple[str, str]], + corpus_limit: int | None, +) -> list[CorpusItem]: + items_all = [ + ( + doc_id, + str(doc.get("title", "") or doc_id), + str(doc.get("text", "")), + ) + for doc_id, doc in corpus.items() + ] + items_all = [(d, t, x) for d, t, x in items_all if t or x] + if corpus_limit is not None and 0 < corpus_limit < len(items_all): + gold_doc_ids = _selected_gold_doc_ids(qrels, query_items) + gold_items = [item for item in items_all if item[0] in gold_doc_ids] + filler_items = [item for item in items_all if item[0] not in gold_doc_ids] + if len(gold_items) >= corpus_limit: + return gold_items + return [*gold_items, *filler_items[: corpus_limit - len(gold_items)]] + return items_all + + +def _load_jsonl_corpus_items( + data: dict, + dataset_path: Path, + qrels: dict, + query_items: list[tuple[str, str]], + corpus_limit: int | None, +) -> list[CorpusItem]: + raw_corpus_path = data.get("corpus_path") + if not raw_corpus_path: + raise ValueError(f"{dataset_path} is missing corpus_path") + corpus_path = dataset_path.parent / str(raw_corpus_path) + if not corpus_path.exists(): + raise FileNotFoundError( + f"{corpus_path} missing. Run: " + "python examples/ablation/download_benchmarks.py --only msmarco_passage" + ) + + downloaded_size = int(data.get("corpus_size") or 0) + target_limit = corpus_limit if corpus_limit and corpus_limit > 0 else downloaded_size + gold_doc_ids = _selected_gold_doc_ids(qrels, query_items) + filler_budget = max(target_limit - len(gold_doc_ids), 0) + + gold_items: list[CorpusItem] = [] + filler_items: list[CorpusItem] = [] + seen_gold: set[str] = set() + with open(corpus_path, encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + row = json.loads(line) + doc_id = str(row.get("_id") or row.get("id") or "") + if not doc_id: + continue + title = str(row.get("title") or doc_id) + text = str(row.get("text") or "") + if not (title or text): + continue + item = (doc_id, title, text) + if doc_id in gold_doc_ids: + if doc_id not in seen_gold: + gold_items.append(item) + seen_gold.add(doc_id) + elif len(filler_items) < filler_budget: + filler_items.append(item) + + if len(filler_items) >= filler_budget and seen_gold >= gold_doc_ids: + break + + if len(gold_items) >= target_limit: + return gold_items + return [*gold_items, *filler_items[: target_limit - len(gold_items)]] + + +def _load_corpus_items( + data: dict, + dataset_path: Path, + qrels: dict, + query_items: list[tuple[str, str]], + corpus_limit: int | None, +) -> list[CorpusItem]: + if data.get("schema") == "beir_jsonl_v1": + return _load_jsonl_corpus_items(data, dataset_path, qrels, query_items, corpus_limit) + return _load_inline_corpus_items(data["corpus"], qrels, query_items, corpus_limit) + + def _reciprocal_rank(retrieved: list[str], relevant: set[str]) -> float: for i, did in enumerate(retrieved): if did in relevant: @@ -166,7 +282,6 @@ async def run_one( with open(ds.path, encoding="utf-8") as f: data = json.load(f) - corpus = data["corpus"] queries_all = data["queries"] qrels = data["qrels"] @@ -200,28 +315,7 @@ async def run_one( # Pre-compute embeddings in large batches (GPU-friendly). # ``graph.add()`` accepts an ``embedding`` arg; if we pass it we # avoid the per-node single embed call that bottlenecks at batch=1. - items_all = [ - ( - doc_id, - str(doc.get("title", "") or doc_id), - str(doc.get("text", "")), - ) - for doc_id, doc in corpus.items() - ] - items_all = [(d, t, x) for d, t, x in items_all if t or x] - if corpus_limit is not None and 0 < corpus_limit < len(items_all): - gold_doc_ids: set[str] = set() - for qid, _qtext in query_items: - rel = qrels.get(qid, {}) - if isinstance(rel, dict): - gold_doc_ids.update(str(doc_id) for doc_id in rel) - else: - gold_doc_ids.update(str(doc_id) for doc_id in rel) - gold_items = [item for item in items_all if item[0] in gold_doc_ids] - filler_items = [item for item in items_all if item[0] not in gold_doc_ids] - items = [*gold_items, *filler_items][:corpus_limit] - else: - items = items_all + items = _load_corpus_items(data, ds.path, qrels, query_items, corpus_limit) embeddings: list[list[float] | None] = [None] * len(items) if embedder is not None: @@ -422,7 +516,7 @@ async def amain(argv: list[str]) -> int: default=",".join(["hotpotqa", "musique", "2wiki"]), help=( "comma-separated dataset keys " - "(hotpotqa | musique | 2wiki | fiqa | trec_covid | scifact)" + "(hotpotqa | musique | 2wiki | fiqa | trec_covid | scifact | msmarco)" ), ) p.add_argument("--subset", type=int, default=None) @@ -650,6 +744,7 @@ async def amain(argv: list[str]) -> int: "fiqa": DATASETS[3], "trec_covid": DATASETS[4], "scifact": DATASETS[5], + "msmarco": DATASETS[6], } selected = [] for raw_key in args.only.split(","): diff --git a/tests/test_download_benchmarks.py b/tests/test_download_benchmarks.py new file mode 100644 index 0000000..86726d6 --- /dev/null +++ b/tests/test_download_benchmarks.py @@ -0,0 +1,66 @@ +"""Tests for benchmark downloader helpers.""" + +from __future__ import annotations + +import importlib.util +import json +import sys +import types +from pathlib import Path + +DOWNLOADER_PATH = ( + Path(__file__).resolve().parents[1] / "examples" / "ablation" / "download_benchmarks.py" +) +SPEC = importlib.util.spec_from_file_location("download_benchmarks", DOWNLOADER_PATH) +assert SPEC is not None +assert SPEC.loader is not None +downloader = importlib.util.module_from_spec(SPEC) +sys.modules[SPEC.name] = downloader +SPEC.loader.exec_module(downloader) + + +class _FakeDataset(list): + pass + + +def test_msmarco_jsonl_shard_preserves_gold_before_filler(monkeypatch, tmp_path): + corpus = _FakeDataset( + [ + {"_id": "0", "title": "", "text": "filler zero"}, + {"_id": "1", "title": "", "text": "filler one"}, + {"_id": "2", "title": "", "text": "filler two"}, + {"_id": "3", "title": "", "text": "needle gold"}, + ] + ) + + def fake_load_dataset(repo, config=None, *, split, streaming=False): + if repo == "BeIR/msmarco" and config == "queries": + return _FakeDataset([{"_id": "q1", "text": "needle"}]) + if repo == "BeIR/msmarco-qrels": + return _FakeDataset([{"query-id": "q1", "corpus-id": "3", "score": 1}]) + if repo == "BeIR/msmarco" and config == "corpus": + assert streaming is False + return corpus + raise AssertionError((repo, config, split, streaming)) + + monkeypatch.setitem( + sys.modules, + "datasets", + types.SimpleNamespace(load_dataset=fake_load_dataset), + ) + + manifest_path = tmp_path / "msmarco_passage.json" + downloader.build_msmarco_passage(manifest_path, corpus_limit=3) + + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + rows = [ + json.loads(line) + for line in (tmp_path / "msmarco_passage.corpus.jsonl") + .read_text(encoding="utf-8") + .splitlines() + ] + + assert manifest["schema"] == "beir_jsonl_v1" + assert manifest["corpus_size"] == 3 + assert manifest["preserved_gold_docs"] == 1 + assert [row["_id"] for row in rows] == ["3", "0", "1"] diff --git a/tests/test_tier1_benchmarks.py b/tests/test_tier1_benchmarks.py index 7c4d65c..0f6d7e8 100644 --- a/tests/test_tier1_benchmarks.py +++ b/tests/test_tier1_benchmarks.py @@ -49,6 +49,63 @@ async def test_corpus_limit_keeps_selected_query_gold_docs(tmp_path): assert report.recall_at_10 == 1.0 +@pytest.mark.asyncio +async def test_jsonl_corpus_limit_keeps_selected_query_gold_docs(tmp_path): + manifest = tmp_path / "large_bench.json" + corpus_path = tmp_path / "large_bench.corpus.jsonl" + corpus_path.write_text( + "\n".join( + [ + json.dumps( + { + "_id": "filler_a", + "title": "Filler A", + "text": "unrelated alpha", + } + ), + json.dumps( + { + "_id": "filler_b", + "title": "Filler B", + "text": "unrelated beta", + } + ), + json.dumps( + { + "_id": "gold_doc", + "title": "Gold", + "text": "needle targetterm", + } + ), + ] + ), + encoding="utf-8", + ) + manifest.write_text( + json.dumps( + { + "name": "Tiny JSONL", + "schema": "beir_jsonl_v1", + "corpus_path": corpus_path.name, + "corpus_size": 3, + "queries": {"q1": "targetterm"}, + "qrels": {"q1": {"gold_doc": 1}}, + } + ), + encoding="utf-8", + ) + + report = await runner.run_one( + runner.Dataset(name="Tiny JSONL", path=manifest, reference="unit"), + subset=1, + corpus_limit=2, + ) + + assert report.n_docs == 2 + assert report.hit_at_10 == 1 + assert report.recall_at_10 == 1.0 + + def test_threshold_violations_report_scale_regressions(): report = runner.Report( name="Tiny",