From c6de5a7edefcb963c02c092e242036026fc15a3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= <sonsj97@gmail.com>
Date: Thu, 2 Jul 2026 11:22:28 +0900
Subject: [PATCH] Add MS MARCO large-tier benchmark path

---
 .github/workflows/public-scale.yml            |  34 +++-
 .gitignore                                    |   1 +
 .../diagnostics/public_scale_20260702.md      |  20 ++
 examples/ablation/download_benchmarks.py      | 182 +++++++++++++++++-
 examples/ablation/run_tier1_benchmarks.py     | 145 +++++++++++---
 tests/test_download_benchmarks.py             |  66 +++++++
 tests/test_tier1_benchmarks.py                |  57 ++++++
 7 files changed, 476 insertions(+), 29 deletions(-)
 create mode 100644 tests/test_download_benchmarks.py

diff --git a/.github/workflows/public-scale.yml b/.github/workflows/public-scale.yml
index 1c231c7..86992b0 100644
--- a/.github/workflows/public-scale.yml
+++ b/.github/workflows/public-scale.yml
@@ -8,7 +8,18 @@ name: Public Scale Guard
 on:
   schedule:
     - cron: "30 6 * * 2" # Tuesdays 06:30 UTC
-  workflow_dispatch: {}
+  workflow_dispatch:
+    inputs:
+      run_msmarco:
+        description: "Also run the manual MS MARCO large-tier smoke"
+        required: false
+        type: boolean
+        default: false
+      msmarco_corpus_limit:
+        description: "MS MARCO shard/search corpus limit"
+        required: false
+        type: string
+        default: "100000"
 
 jobs:
   public-scale:
@@ -67,6 +78,25 @@ jobs:
           cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \
             /tmp/trec_covid_scale_guard.md
 
+      - name: Download MS MARCO large-tier shard
+        if: ${{ github.event_name == 'workflow_dispatch' && inputs.run_msmarco }}
+        env: # dispatch input goes through env so it is never spliced into the script text
+          MSMARCO_CORPUS_LIMIT: ${{ inputs.msmarco_corpus_limit }}
+        run: |
+          uv run --extra eval python examples/ablation/download_benchmarks.py --only msmarco_passage --large-corpus-limit "$MSMARCO_CORPUS_LIMIT"
+
+      - name: Run MS MARCO large-tier smoke
+        if: ${{ github.event_name == 'workflow_dispatch' && inputs.run_msmarco }}
+        env: # dispatch input goes through env so it is never spliced into the script text
+          MSMARCO_CORPUS_LIMIT: ${{ inputs.msmarco_corpus_limit }}
+        run: |
+          set -o pipefail
+          PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py \
+            --only msmarco --subset 50 --corpus-limit "$MSMARCO_CORPUS_LIMIT" \
+            --use-sqlite-graph | tee /tmp/msmarco_large_guard.log
+          cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \
+            /tmp/msmarco_large_guard.md
+
       - name: Upload public scale results
         if: always()
         uses: actions/upload-artifact@v4
@@ -77,4 +107,6 @@ jobs:
             /tmp/fiqa_scale_guard.md
             /tmp/trec_covid_scale_guard.log
             /tmp/trec_covid_scale_guard.md
+            /tmp/msmarco_large_guard.log
+            /tmp/msmarco_large_guard.md
           if-no-files-found: ignore
diff --git a/.gitignore b/.gitignore
index 1b2d1c2..29ce6ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 __pycache__/
 *.pyc
 tests/benchmark/data/*.json
+tests/benchmark/data/*.jsonl
 .claude/
 tmp/
 마사회/
diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md
index 07e33c5..9ca4b94 100644
--- a/examples/ablation/diagnostics/public_scale_20260702.md
+++ b/examples/ablation/diagnostics/public_scale_20260702.md
@@ -6,6 +6,7 @@
 |---------|----------------|-------:|--------:|-------------|
 | BEIR FiQA test | `tests/benchmark/data/fiqa.json` | 57,638 docs | 648 | 5-10 queries |
 | BEIR TREC-COVID test | `tests/benchmark/data/trec_covid.json` | 171,332 docs | 50 | 10 queries |
+| BEIR MS MARCO passage dev | `tests/benchmark/data/msmarco_passage.json` + `.corpus.jsonl` | 1M shard by default from ~8.8M source passages | validation qrels | manual large tier |
 
 Mode: embedder-free `graph.search()` with `SqliteGraphBackend`.
 
@@ -22,6 +23,9 @@ PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benc
 PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --corpus-limit 50000 --use-sqlite-graph
 PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --corpus-limit 100000 --use-sqlite-graph
 PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --use-sqlite-graph
+
+uv run --extra eval python examples/ablation/download_benchmarks.py --only msmarco_passage --large-corpus-limit 1000000
+PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only msmarco --subset 50 --corpus-limit 1000000 --use-sqlite-graph
 ```
 
 ## FiQA Results
@@ -55,9 +59,24 @@ After the SQLite batch FTS optimization:
 TREC-COVID has many relevant documents per query, so R@5/R@10 is naturally
 small in this smoke even when Hit@10 is perfect.
 
+## MS MARCO Passage Results
+
+Manual large-tier shard from BEIR/MS MARCO passage validation:
+
+| Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search |
+|-----:|--------:|-------:|----:|-----:|-------:|------:|-------:|
+| 100,000 | 50 | 0.673 | 0.740 | 0.770 | 39/50 | 81.9s | 5.4s |
+
+The local artifacts are gitignored:
+
+- `tests/benchmark/data/msmarco_passage.json` - 511 KB manifest
+- `tests/benchmark/data/msmarco_passage.corpus.jsonl` - 35 MB corpus shard
+
 ## Interpretation
 
 - Search latency remains usable at 171k docs: 5.2s over 10 queries.
+- MS MARCO confirms the large-tier path on a web passage corpus: 100k docs,
+  50 queries, 5.4s total search, and 0.673 MRR@10 without embeddings or reranking.
 - The main large-corpus bottleneck is still initial FTS/index build, not retrieval.
 - Avoiding unnecessary FTS deletes for newly inserted nodes reduced full FiQA build time by about 9.9x.
 - Raising benchmark ingest batches to 20k reduced full TREC-COVID build time by about 2.7x.
@@ -67,6 +86,7 @@ small in this smoke even when Hit@10 is perfect.
 
 - `.github/workflows/public-scale.yml` runs weekly/manual FiQA 10k and TREC-COVID 50k staged smokes.
 - FiQA 25k/full and TREC-COVID 100k/full remain manual checks because they are multi-minute runs and depend on ignored local benchmark data.
+- MS MARCO passage is the manual large tier: the downloader writes a metadata JSON manifest plus a gitignored corpus JSONL shard, so 100k, 1M, or full ~8.8M-passage runs can be tested without committing giant artifacts.
 - If 100k+ docs becomes a required routine gate, the next target is faster initial FTS/index build.
 
 ## Remote Guard Dispatch
diff --git a/examples/ablation/download_benchmarks.py b/examples/ablation/download_benchmarks.py
index fc6d160..0949838 100644
--- a/examples/ablation/download_benchmarks.py
+++ b/examples/ablation/download_benchmarks.py
@@ -38,6 +38,7 @@
 
 REPO_ROOT = Path(__file__).resolve().parents[2]
 OUT_DIR = REPO_ROOT / "tests" / "benchmark" / "data"
+MSMARCO_DEFAULT_CORPUS_LIMIT = 1_000_000  # default for --large-corpus-limit
 
 
 def _hash_doc(title: str, text: str) -> str:
@@ -45,17 +46,39 @@ def _hash_doc(title: str, text: str) -> str:
     return hashlib.blake2b((title + "||" + text).encode("utf-8"), digest_size=8).hexdigest()
 
 
+def _display_path(path: Path) -> Path:
+    try:
+        return path.relative_to(REPO_ROOT)  # repo-relative form for printed summaries
+    except ValueError:  # relative_to() raises when path is not under REPO_ROOT
+        return path
+
+
 def _write(path: Path, obj: dict) -> None:
     path.parent.mkdir(parents=True, exist_ok=True)
     with open(path, "w", encoding="utf-8") as f:
         json.dump(obj, f, ensure_ascii=False)
     size_mb = path.stat().st_size / (1024 * 1024)
     print(
-        f"  → {path.relative_to(REPO_ROOT)}  "
+        f"  → {_display_path(path)}  "
         f"({size_mb:.1f} MB, {len(obj['corpus'])} docs, {len(obj['queries'])} queries)"
     )
 
 
+def _write_manifest(path: Path, obj: dict) -> None:
+    """Write the shard manifest ``obj`` as JSON and print manifest/corpus sizes."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(obj, ensure_ascii=False), encoding="utf-8")
+    corpus_file = path.parent / str(obj.get("corpus_path", ""))
+    corpus_mb = corpus_file.stat().st_size / (1024 * 1024)
+    manifest_kb = path.stat().st_size / 1024
+    print(
+        f"  → {_display_path(path)}  "
+        f"({manifest_kb:.1f} KB manifest, "
+        f"{corpus_mb:.1f} MB corpus jsonl, "
+        f"{obj['corpus_size']} docs, {len(obj['queries'])} queries)"
+    )
+
+
 # --- HotPotQA --------------------------------------------------------
 
 
@@ -322,6 +345,140 @@ def build_scifact(out_path: Path) -> None:
     _build_beir("BeIR/scifact", "test", "scifact", out_path)
 
 
+def _build_beir_jsonl_shard(
+    corpus_repo: str,
+    split: str,
+    label: str,
+    out_path: Path,
+    *,
+    corpus_limit: int,
+    numeric_docid_index: bool = False,
+) -> None:
+    """Build a large BEIR shard as metadata JSON + corpus JSONL.
+
+    The small BEIR datasets fit comfortably in one JSON object. MS MARCO
+    does not: the source corpus has millions of passages. This writer
+    keeps all positive qrel docs for the selected split, then fills the
+    shard with corpus-order distractors up to ``corpus_limit``.
+
+    ``numeric_docid_index`` loads the corpus non-streaming and fetches gold rows
+    by index; ids that cannot be used as a row index fall back to the scan.
+    """
+    from datasets import load_dataset
+
+    if corpus_limit <= 0:
+        raise ValueError("--large-corpus-limit must be positive")
+
+    print(
+        f"Loading BEIR {label} ({corpus_repo}, split={split}, "
+        f"jsonl shard limit={corpus_limit:,})..."
+    )
+
+    queries_ds = load_dataset(corpus_repo, "queries", split="queries")
+    qrels_ds = load_dataset(f"BeIR/{label}-qrels", split=split)
+
+    qrels: dict[str, dict[str, int]] = {}
+    for row in qrels_ds:
+        qid, did = str(row["query-id"]), str(row["corpus-id"])
+        score = int(row.get("score") or 0)
+        if score > 0:  # only positive judgements count as gold
+            qrels.setdefault(qid, {})[did] = score
+
+    queries: dict[str, str] = {}
+    for row in queries_ds:
+        qid = str(row["_id"])
+        text = str(row.get("text") or "").strip()
+        if text and qid in qrels:
+            queries[qid] = text
+
+    qrels = {qid: rel for qid, rel in qrels.items() if qid in queries}
+    gold_doc_ids = {did for rel in qrels.values() for did in rel}
+    filler_budget = max(corpus_limit - len(gold_doc_ids), 0)
+
+    def row_payload(row: dict, doc_id: str) -> dict[str, str]:
+        return {
+            "_id": doc_id,
+            "title": str(row.get("title") or ""),
+            "text": str(row.get("text") or ""),
+        }
+
+    corpus_path = out_path.with_suffix(".corpus.jsonl")
+    written_gold: set[str] = set()
+    written_docs = filler_docs = 0
+
+    corpus_ds = load_dataset(
+        corpus_repo,
+        "corpus",
+        split="corpus",
+        streaming=not numeric_docid_index,
+    )
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(corpus_path, "w", encoding="utf-8") as f:
+        if numeric_docid_index:
+            total_rows = len(corpus_ds)
+            for did in sorted(filter(str.isdecimal, gold_doc_ids), key=int):
+                if int(did) >= total_rows:
+                    break  # ascending order: every remaining id is out of range too
+                row = corpus_ds[int(did)]
+                if str(row.get("_id")) != did:
+                    continue
+                f.write(json.dumps(row_payload(row, did), ensure_ascii=False) + "\n")
+                written_gold.add(did)
+                written_docs += 1
+
+        for row in corpus_ds:
+            did = str(row["_id"])
+            if did in gold_doc_ids:
+                if did in written_gold:
+                    continue
+                written_gold.add(did)
+            elif filler_docs < filler_budget:
+                filler_docs += 1
+            else:
+                continue
+            f.write(json.dumps(row_payload(row, did), ensure_ascii=False) + "\n")
+            written_docs += 1
+            if filler_docs >= filler_budget and len(written_gold) >= len(gold_doc_ids):
+                break
+
+    _write_manifest(
+        out_path,
+        {
+            "name": f"BEIR {label} {split} large shard",
+            "schema": "beir_jsonl_v1",
+            "source": f"huggingface: {corpus_repo}",
+            "source_corpus": "MS MARCO passage ranking (~8.8M passages)",
+            "corpus_path": corpus_path.name,
+            "corpus_limit": corpus_limit,
+            "corpus_size": written_docs,
+            "query_size": len(queries),
+            "qrels_size": len(qrels),
+            "qrels_rows": sum(len(rel) for rel in qrels.values()),
+            "preserved_gold_docs": len(written_gold),
+            "missing_gold_docs": sorted(gold_doc_ids - written_gold)[:100],
+            "queries": queries,
+            "qrels": qrels,
+        },
+    )
+
+
+def build_msmarco_passage(out_path: Path, *, corpus_limit: int) -> None:
+    """Write the BEIR MS MARCO passage (validation split) shard to ``out_path``.
+
+    Delegates to ``_build_beir_jsonl_shard`` with indexed gold-doc lookup, so
+    validation positives are kept before distractors fill the shard up to
+    ``corpus_limit`` rows (the CLI default is 1,000,000).
+    """
+    _build_beir_jsonl_shard(
+        corpus_repo="BeIR/msmarco",
+        split="validation",
+        label="msmarco",
+        out_path=out_path,
+        corpus_limit=corpus_limit,
+        numeric_docid_index=True,
+    )
+
+
 BUILDERS = {
     "hotpotqa_full": (build_hotpotqa, "hotpotqa_full.json"),
     "musique": (build_musique, "musique_dev.json"),
@@ -330,6 +487,9 @@ def build_scifact(out_path: Path) -> None:
     "fiqa": (build_fiqa, "fiqa.json"),
     "scifact": (build_scifact, "scifact.json"),
 }
+LARGE_BUILDERS = {  # builders that are also called with a keyword-only corpus_limit
+    "msmarco_passage": (build_msmarco_passage, "msmarco_passage.json"),
+}
 
 
 def main() -> None:
@@ -339,15 +499,31 @@ def main() -> None:
         default=",".join(BUILDERS),
         help="comma-separated dataset names (default: all)",
     )
+    p.add_argument(
+        "--large-corpus-limit",
+        type=int,
+        default=MSMARCO_DEFAULT_CORPUS_LIMIT,
+        help=(
+            "Corpus rows to keep for large JSONL-sharded datasets such as "
+            f"msmarco_passage (default: {MSMARCO_DEFAULT_CORPUS_LIMIT:,})."
+        ),
+    )
     args = p.parse_args()
 
     names = [n.strip() for n in args.only.split(",") if n.strip()]
-    unknown = [n for n in names if n not in BUILDERS]
+    available = {**BUILDERS, **LARGE_BUILDERS}
+    unknown = [n for n in names if n not in available]
     if unknown:
-        print(f"Unknown datasets: {unknown}; available: {list(BUILDERS)}")
+        print(f"Unknown datasets: {unknown}; available: {list(available)}")
         sys.exit(1)
 
     for name in names:
+        if name in LARGE_BUILDERS:
+            builder, filename = LARGE_BUILDERS[name]
+            out_path = OUT_DIR / filename
+            print(f"\n=== {name} ===")
+            builder(out_path, corpus_limit=args.large_corpus_limit)
+            continue
         builder, filename = BUILDERS[name]
         out_path = OUT_DIR / filename
         print(f"\n=== {name} ===")
diff --git a/examples/ablation/run_tier1_benchmarks.py b/examples/ablation/run_tier1_benchmarks.py
index 7af75d8..2fdba61 100644
--- a/examples/ablation/run_tier1_benchmarks.py
+++ b/examples/ablation/run_tier1_benchmarks.py
@@ -5,7 +5,9 @@
 large BEIR-style retrieval corpora (FiQA, TREC-COVID, SciFact). The
 multi-hop sets are the datasets HippoRAG2, GraphRAG, and the broader
 KG-RAG line use for head-to-head comparisons; the BEIR sets are useful
-large-corpus scale checks with query/qrels ground truth.
+large-corpus scale checks with query/qrels ground truth. MS MARCO uses
+a metadata JSON + corpus JSONL shard so 1M+ passage runs do not require
+committing giant benchmark artifacts.
 
 Two modes:
 
@@ -30,6 +32,8 @@
     python examples/ablation/run_tier1_benchmarks.py
     python examples/ablation/run_tier1_benchmarks.py --only hotpotqa
     python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 100
+    python examples/ablation/run_tier1_benchmarks.py --only msmarco --subset 50 \\
+        --corpus-limit 1000000 --use-sqlite-graph
     python examples/ablation/run_tier1_benchmarks.py --subset 200
 
     # Full pipeline with Ollama embedder + TEI cross-encoder
@@ -114,9 +118,121 @@ class Dataset:
         path=BENCH / "scifact.json",
         reference="BEIR SciFact: ~5k docs / 300 test queries",
     ),
+    Dataset(
+        name="MS MARCO passage dev",
+        path=BENCH / "msmarco_passage.json",
+        reference="BEIR/MS MARCO passage: ~8.8M source passages; JSONL shard",
+    ),
 ]
 
 
+CorpusItem = tuple[str, str, str]  # (doc_id, title, text)
+
+
+def _selected_gold_doc_ids(
+    qrels: dict,
+    query_items: list[tuple[str, str]],
+) -> set[str]:
+    """Collect the relevant doc ids (as strings) for the selected queries.
+
+    A qrels entry may be a ``{doc_id: score}`` mapping or a plain iterable of
+    doc ids; iterating either form yields the ids, so both share one path.
+    Queries without a qrels entry contribute nothing.
+    """
+    rels = (qrels.get(qid, {}) for qid, _qtext in query_items)
+    return {str(doc_id) for rel in rels for doc_id in rel}
+
+
+def _load_inline_corpus_items(
+    corpus: dict,
+    qrels: dict,
+    query_items: list[tuple[str, str]],
+    corpus_limit: int | None,
+) -> list[CorpusItem]:
+    """Build ``(doc_id, title, text)`` items from an inline ``corpus`` mapping.
+
+    With a positive ``corpus_limit`` below the corpus size, gold docs lead, then fillers.
+    """
+    docs = (
+        (doc_id, str(doc.get("title", "") or doc_id), str(doc.get("text", "")))
+        for doc_id, doc in corpus.items()
+    )
+    kept = [entry for entry in docs if entry[1] or entry[2]]
+    if corpus_limit is None or not 0 < corpus_limit < len(kept):
+        return kept
+    gold_ids = _selected_gold_doc_ids(qrels, query_items)
+    gold = [entry for entry in kept if entry[0] in gold_ids]
+    if len(gold) >= corpus_limit:
+        return gold
+    fillers = [entry for entry in kept if entry[0] not in gold_ids]
+    return [*gold, *fillers[: corpus_limit - len(gold)]]
+
+
+def _load_jsonl_corpus_items(
+    data: dict,
+    dataset_path: Path,
+    qrels: dict,
+    query_items: list[tuple[str, str]],
+    corpus_limit: int | None,
+) -> list[CorpusItem]:
+    """Load ``(doc_id, title, text)`` items from the manifest's sidecar JSONL corpus.
+
+    Gold docs of the selected queries come first, then corpus-order fillers. The
+    target size is ``corpus_limit`` when positive, else the manifest's
+    ``corpus_size``; when neither is known the whole shard is loaded.
+    Raises ``ValueError``/``FileNotFoundError`` if the corpus path or file is missing.
+    """
+    raw_corpus_path = data.get("corpus_path")
+    if not raw_corpus_path:
+        raise ValueError(f"{dataset_path} is missing corpus_path")
+    corpus_path = dataset_path.parent / str(raw_corpus_path)
+    if not corpus_path.exists():
+        raise FileNotFoundError(
+            f"{corpus_path} missing. Run: "
+            "python examples/ablation/download_benchmarks.py --only msmarco_passage"
+        )
+
+    downloaded_size = int(data.get("corpus_size") or 0)
+    target_limit = corpus_limit if corpus_limit and corpus_limit > 0 else downloaded_size
+    gold_doc_ids = _selected_gold_doc_ids(qrels, query_items)
+    # An unknown target (0) means "no cap"; it used to drop every filler doc.
+    filler_budget = max(target_limit - len(gold_doc_ids), 0) if target_limit > 0 else float("inf")
+
+    gold_items: list[CorpusItem] = []
+    filler_items: list[CorpusItem] = []
+    seen_gold: set[str] = set()
+    with open(corpus_path, encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            row = json.loads(line)
+            doc_id = str(row.get("_id") or row.get("id") or "")
+            if not doc_id:
+                continue
+            item = (doc_id, str(row.get("title") or doc_id), str(row.get("text") or ""))
+            if doc_id in gold_doc_ids:
+                if doc_id not in seen_gold:
+                    gold_items.append(item)
+                    seen_gold.add(doc_id)
+            elif len(filler_items) < filler_budget:
+                filler_items.append(item)
+            if len(filler_items) >= filler_budget and seen_gold >= gold_doc_ids:
+                break
+    return [*gold_items, *filler_items]
+
+
+def _load_corpus_items(
+    data: dict,
+    dataset_path: Path,
+    qrels: dict,
+    query_items: list[tuple[str, str]],
+    corpus_limit: int | None,
+) -> list[CorpusItem]:
+    if data.get("schema") != "beir_jsonl_v1":  # inline corpus stored in the JSON itself
+        return _load_inline_corpus_items(data["corpus"], qrels, query_items, corpus_limit)
+    return _load_jsonl_corpus_items(data, dataset_path, qrels, query_items, corpus_limit)
+
+
 def _reciprocal_rank(retrieved: list[str], relevant: set[str]) -> float:
     for i, did in enumerate(retrieved):
         if did in relevant:
@@ -166,7 +282,6 @@ async def run_one(
     with open(ds.path, encoding="utf-8") as f:
         data = json.load(f)
 
-    corpus = data["corpus"]
     queries_all = data["queries"]
     qrels = data["qrels"]
 
@@ -200,28 +315,7 @@ async def run_one(
     # Pre-compute embeddings in large batches (GPU-friendly).
     # ``graph.add()`` accepts an ``embedding`` arg; if we pass it we
     # avoid the per-node single embed call that bottlenecks at batch=1.
-    items_all = [
-        (
-            doc_id,
-            str(doc.get("title", "") or doc_id),
-            str(doc.get("text", "")),
-        )
-        for doc_id, doc in corpus.items()
-    ]
-    items_all = [(d, t, x) for d, t, x in items_all if t or x]
-    if corpus_limit is not None and 0 < corpus_limit < len(items_all):
-        gold_doc_ids: set[str] = set()
-        for qid, _qtext in query_items:
-            rel = qrels.get(qid, {})
-            if isinstance(rel, dict):
-                gold_doc_ids.update(str(doc_id) for doc_id in rel)
-            else:
-                gold_doc_ids.update(str(doc_id) for doc_id in rel)
-        gold_items = [item for item in items_all if item[0] in gold_doc_ids]
-        filler_items = [item for item in items_all if item[0] not in gold_doc_ids]
-        items = [*gold_items, *filler_items][:corpus_limit]
-    else:
-        items = items_all
+    items = _load_corpus_items(data, ds.path, qrels, query_items, corpus_limit)
 
     embeddings: list[list[float] | None] = [None] * len(items)
     if embedder is not None:
@@ -422,7 +516,7 @@ async def amain(argv: list[str]) -> int:
         default=",".join(["hotpotqa", "musique", "2wiki"]),
         help=(
             "comma-separated dataset keys "
-            "(hotpotqa | musique | 2wiki | fiqa | trec_covid | scifact)"
+            "(hotpotqa | musique | 2wiki | fiqa | trec_covid | scifact | msmarco)"
         ),
     )
     p.add_argument("--subset", type=int, default=None)
@@ -650,6 +744,7 @@ async def amain(argv: list[str]) -> int:
         "fiqa": DATASETS[3],
         "trec_covid": DATASETS[4],
         "scifact": DATASETS[5],
+        "msmarco": DATASETS[6],
     }
     selected = []
     for raw_key in args.only.split(","):
diff --git a/tests/test_download_benchmarks.py b/tests/test_download_benchmarks.py
new file mode 100644
index 0000000..86726d6
--- /dev/null
+++ b/tests/test_download_benchmarks.py
@@ -0,0 +1,66 @@
+"""Tests for benchmark downloader helpers."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import sys
+import types
+from pathlib import Path
+
+DOWNLOADER_PATH = (
+    Path(__file__).resolve().parents[1] / "examples" / "ablation" / "download_benchmarks.py"
+)  # the downloader script is loaded by file path rather than imported by name
+SPEC = importlib.util.spec_from_file_location("download_benchmarks", DOWNLOADER_PATH)
+assert SPEC is not None
+assert SPEC.loader is not None
+downloader = importlib.util.module_from_spec(SPEC)
+sys.modules[SPEC.name] = downloader  # registered before the module body is executed
+SPEC.loader.exec_module(downloader)
+
+
+class _FakeDataset(list):
+    """List-backed stand-in for the dataset objects returned by ``load_dataset``."""
+
+
+def test_msmarco_jsonl_shard_preserves_gold_before_filler(monkeypatch, tmp_path):
+    corpus = _FakeDataset(  # each row's position equals its numeric _id
+        [
+            {"_id": "0", "title": "", "text": "filler zero"},
+            {"_id": "1", "title": "", "text": "filler one"},
+            {"_id": "2", "title": "", "text": "filler two"},
+            {"_id": "3", "title": "", "text": "needle gold"},
+        ]
+    )
+
+    def fake_load_dataset(repo, config=None, *, split, streaming=False):
+        if repo == "BeIR/msmarco" and config == "queries":
+            return _FakeDataset([{"_id": "q1", "text": "needle"}])
+        if repo == "BeIR/msmarco-qrels":
+            return _FakeDataset([{"query-id": "q1", "corpus-id": "3", "score": 1}])
+        if repo == "BeIR/msmarco" and config == "corpus":
+            assert streaming is False  # the MS MARCO builder indexes rows by position
+            return corpus
+        raise AssertionError((repo, config, split, streaming))  # unexpected dataset request
+
+    monkeypatch.setitem(  # stub the ``datasets`` module the builder imports lazily
+        sys.modules,
+        "datasets",
+        types.SimpleNamespace(load_dataset=fake_load_dataset),
+    )
+
+    manifest_path = tmp_path / "msmarco_passage.json"
+    downloader.build_msmarco_passage(manifest_path, corpus_limit=3)
+
+    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    rows = [
+        json.loads(line)
+        for line in (tmp_path / "msmarco_passage.corpus.jsonl")
+        .read_text(encoding="utf-8")
+        .splitlines()
+    ]
+
+    assert manifest["schema"] == "beir_jsonl_v1"
+    assert manifest["corpus_size"] == 3
+    assert manifest["preserved_gold_docs"] == 1
+    assert [row["_id"] for row in rows] == ["3", "0", "1"]  # gold first, then two fillers
diff --git a/tests/test_tier1_benchmarks.py b/tests/test_tier1_benchmarks.py
index 7c4d65c..0f6d7e8 100644
--- a/tests/test_tier1_benchmarks.py
+++ b/tests/test_tier1_benchmarks.py
@@ -49,6 +49,63 @@ async def test_corpus_limit_keeps_selected_query_gold_docs(tmp_path):
     assert report.recall_at_10 == 1.0
 
 
+@pytest.mark.asyncio
+async def test_jsonl_corpus_limit_keeps_selected_query_gold_docs(tmp_path):
+    manifest = tmp_path / "large_bench.json"
+    corpus_path = tmp_path / "large_bench.corpus.jsonl"
+    corpus_path.write_text(  # the gold doc is written last, after both fillers
+        "\n".join(
+            [
+                json.dumps(
+                    {
+                        "_id": "filler_a",
+                        "title": "Filler A",
+                        "text": "unrelated alpha",
+                    }
+                ),
+                json.dumps(
+                    {
+                        "_id": "filler_b",
+                        "title": "Filler B",
+                        "text": "unrelated beta",
+                    }
+                ),
+                json.dumps(
+                    {
+                        "_id": "gold_doc",
+                        "title": "Gold",
+                        "text": "needle targetterm",
+                    }
+                ),
+            ]
+        ),
+        encoding="utf-8",
+    )
+    manifest.write_text(
+        json.dumps(
+            {
+                "name": "Tiny JSONL",
+                "schema": "beir_jsonl_v1",
+                "corpus_path": corpus_path.name,
+                "corpus_size": 3,
+                "queries": {"q1": "targetterm"},
+                "qrels": {"q1": {"gold_doc": 1}},
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    report = await runner.run_one(
+        runner.Dataset(name="Tiny JSONL", path=manifest, reference="unit"),
+        subset=1,
+        corpus_limit=2,  # smaller than the 3-row shard, so one filler must be dropped
+    )
+
+    assert report.n_docs == 2
+    assert report.hit_at_10 == 1  # the gold doc survived the limit and was retrieved
+    assert report.recall_at_10 == 1.0
+
+
 def test_threshold_violations_report_scale_regressions():
     report = runner.Report(
         name="Tiny",