From a1da6a9167842aab1bae36541dc719c0d38618c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= Date: Thu, 2 Jul 2026 10:42:09 +0900 Subject: [PATCH 1/3] Add public large-corpus scale guard --- .github/workflows/public-scale.yml | 72 +++++++++ .../diagnostics/public_scale_20260702.md | 69 +++++++++ examples/ablation/run_tier1_benchmarks.py | 138 +++++++++++++++--- src/synaptic/backends/sqlite.py | 19 ++- src/synaptic/graph.py | 22 +-- tests/test_backend_sqlite.py | 33 +++++ tests/test_memory_operating_layer.py | 11 ++ tests/test_tier1_benchmarks.py | 49 +++++++ 8 files changed, 379 insertions(+), 34 deletions(-) create mode 100644 .github/workflows/public-scale.yml create mode 100644 examples/ablation/diagnostics/public_scale_20260702.md create mode 100644 tests/test_tier1_benchmarks.py diff --git a/.github/workflows/public-scale.yml b/.github/workflows/public-scale.yml new file mode 100644 index 0000000..9a8fcd1 --- /dev/null +++ b/.github/workflows/public-scale.yml @@ -0,0 +1,72 @@ +name: Public Scale Guard + +# Scheduled, public large-corpus smoke for the SQLite/EvidenceSearch path. +# Benchmark JSON files are gitignored, so this workflow regenerates public +# datasets from HuggingFace and runs staged guards that keep selected query +# gold docs in each indexed corpus. + +on: + schedule: + - cron: "30 6 * * 2" # Tuesdays 06:30 UTC + workflow_dispatch: {} + +jobs: + public-scale: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + + - name: Set up Python 3.12 + run: uv python install 3.12 + + - name: Cache uv + uses: actions/cache@v4 + with: + path: ~/.cache/uv + key: uv-${{ runner.os }}-${{ hashFiles('uv.lock') }} + restore-keys: uv-${{ runner.os }}- + + - name: Install dependencies + run: uv sync --extra sqlite --extra eval + + - name: Download public scale benchmarks + run: | + uv run --extra eval python examples/ablation/download_benchmarks.py \ + --only fiqa,trec_covid + + - name: Run FiQA 10k scale smoke + run: | + set -o pipefail + PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py \ + --only fiqa \ + --subset 5 \ + --corpus-limit 10000 \ + --use-sqlite-graph | tee /tmp/fiqa_scale_guard.log + cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \ + /tmp/fiqa_scale_guard.md + + - name: Run TREC-COVID 50k scale smoke + run: | + set -o pipefail + PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py \ + --only trec_covid \ + --subset 10 \ + --corpus-limit 50000 \ + --use-sqlite-graph | tee /tmp/trec_covid_scale_guard.log + cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \ + /tmp/trec_covid_scale_guard.md + + - name: Upload public scale results + if: always() + uses: actions/upload-artifact@v4 + with: + name: public-scale-results + path: | + /tmp/fiqa_scale_guard.log + /tmp/fiqa_scale_guard.md + /tmp/trec_covid_scale_guard.log + /tmp/trec_covid_scale_guard.md + if-no-files-found: ignore diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md new file mode 100644 index 0000000..5fb02f8 --- /dev/null +++ b/examples/ablation/diagnostics/public_scale_20260702.md @@ -0,0 +1,69 @@ +# Public Large-Corpus Scale Smoke - 2026-07-02 + +## Datasets + +| Dataset | Local artifact | Corpus | Queries | Smoke scope | +|---------|----------------|-------:|--------:|-------------| +| BEIR FiQA test | `tests/benchmark/data/fiqa.json` | 57,638 docs | 648 | 5-10 queries | +| BEIR TREC-COVID test | `tests/benchmark/data/trec_covid.json` | 171,332 docs | 50 | 10 queries | + +Mode: embedder-free `graph.search()` with `SqliteGraphBackend`. + +## Commands + +```bash +uv run --extra eval python examples/ablation/download_benchmarks.py --only fiqa +uv run --extra eval python examples/ablation/download_benchmarks.py --only trec_covid + +PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 5 --corpus-limit 10000 --use-sqlite-graph +PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 10 --corpus-limit 25000 --use-sqlite-graph +PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 10 --use-sqlite-graph + +PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --corpus-limit 50000 --use-sqlite-graph +PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --corpus-limit 100000 --use-sqlite-graph +PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --use-sqlite-graph +``` + +## FiQA Results + +Before the SQLite batch FTS optimization: + +| Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search | +|-----:|--------:|-------:|----:|-----:|-------:|------:|-------:| +| 10,000 | 5 | 0.425 | 0.300 | 0.400 | 3/5 | 13.2s | 0.1s | +| 25,000 | 10 | 0.353 | 0.333 | 0.383 | 5/10 | 101.7s | 0.6s | +| 57,638 | 10 | 0.202 | 0.233 | 0.333 | 5/10 | 577.4s | 1.5s | + +After the SQLite batch FTS optimization: + +| Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search | +|-----:|--------:|-------:|----:|-----:|-------:|------:|-------:| +| 10,000 | 5 | 0.425 | 0.300 | 0.400 | 3/5 | 3.2s | 0.1s | +| 25,000 | 10 | 0.353 | 0.333 | 0.383 | 5/10 | 9.3s | 0.6s | +| 57,638 | 10 | 0.202 | 0.233 | 0.333 | 5/10 | 58.4s | 1.4s | + +## TREC-COVID Results + +After the SQLite batch FTS optimization: + +| Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search | +|-----:|--------:|-------:|----:|-----:|-------:|------:|-------:| +| 50,000 | 10 | 0.933 | 0.008 | 0.015 | 10/10 | 49.3s | 1.3s | +| 100,000 | 10 | 0.750 | 0.007 | 0.012 | 10/10 | 138.2s | 2.8s | +| 171,332 | 10 | 0.598 | 0.004 | 0.011 | 10/10 | 370.4s | 5.2s | + +TREC-COVID has many relevant documents per query, so R@5/R@10 is naturally +small in this smoke even when Hit@10 is perfect. + +## Interpretation + +- Search latency remains usable at 171k docs: 5.2s over 10 queries. +- The main large-corpus bottleneck is still initial FTS/index build, not retrieval. +- Avoiding unnecessary FTS deletes for newly inserted nodes reduced full FiQA build time by about 9.9x. +- `--corpus-limit` provides practical staged scale gates while preserving selected query gold docs. + +## Guard Policy + +- `.github/workflows/public-scale.yml` runs weekly/manual FiQA 10k and TREC-COVID 50k staged smokes. +- FiQA 25k/full and TREC-COVID 100k/full remain manual checks because they are multi-minute runs and depend on ignored local benchmark data. +- If 100k+ docs becomes a required routine gate, the next target is faster initial FTS/index build. diff --git a/examples/ablation/run_tier1_benchmarks.py b/examples/ablation/run_tier1_benchmarks.py index fc0cd74..f0dea4a 100644 --- a/examples/ablation/run_tier1_benchmarks.py +++ b/examples/ablation/run_tier1_benchmarks.py @@ -1,9 +1,11 @@ -"""Tier-1 English multi-hop benchmark runner. +"""Tier-1 English retrieval benchmark runner. -Runs Synaptic's retrieval pipeline over three standard multi-hop -corpora: HotPotQA-dev (full), MuSiQue-Ans-dev, and 2WikiMultiHopQA-dev. -These are the datasets HippoRAG2, GraphRAG, and the broader KG-RAG -line use for head-to-head comparisons. +Runs Synaptic's retrieval pipeline over standard multi-hop corpora +(HotPotQA-dev full, MuSiQue-Ans-dev, and 2WikiMultiHopQA-dev) and +large BEIR-style retrieval corpora (FiQA, TREC-COVID, SciFact). The +multi-hop sets are the datasets HippoRAG2, GraphRAG, and the broader +KG-RAG line use for head-to-head comparisons; the BEIR sets are useful +large-corpus scale checks with query/qrels ground truth. Two modes: @@ -27,6 +29,7 @@ # Embedder-free baseline (current published numbers) python examples/ablation/run_tier1_benchmarks.py python examples/ablation/run_tier1_benchmarks.py --only hotpotqa + python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 100 python examples/ablation/run_tier1_benchmarks.py --subset 200 # Full pipeline with Ollama embedder + TEI cross-encoder @@ -44,6 +47,7 @@ import argparse import asyncio +import hashlib import json import tempfile import time @@ -58,6 +62,7 @@ ) from synaptic.extensions.reranker_cross import TEIReranker from synaptic.graph import SynapticGraph +from synaptic.models import Node, NodeKind REPO_ROOT = Path(__file__).resolve().parents[2] BENCH = REPO_ROOT / "tests" / "benchmark" / "data" @@ -66,6 +71,11 @@ TOP_K = 10 +def _benchmark_node_id(doc_id: str) -> str: + digest = hashlib.blake2b(doc_id.encode("utf-8"), digest_size=16).hexdigest() + return f"bench_{digest}" + + @dataclass class Dataset: name: str @@ -89,6 +99,21 @@ class Dataset: path=BENCH / "2wiki_dev.json", reference="HippoRAG2: R@5 90.4 %", ), + Dataset( + name="FiQA test", + path=BENCH / "fiqa.json", + reference="BEIR FiQA: ~57k docs / 648 test queries", + ), + Dataset( + name="TREC-COVID test", + path=BENCH / "trec_covid.json", + reference="BEIR TREC-COVID: ~171k docs / 50 test queries", + ), + Dataset( + name="SciFact test", + path=BENCH / "scifact.json", + reference="BEIR SciFact: ~5k docs / 300 test queries", + ), ] @@ -131,6 +156,8 @@ async def run_one( entity_linker_cfg: tuple[int, float] | None = None, use_sqlite_graph: bool = False, embed_batch: int = 256, + ingest_batch: int = 5000, + corpus_limit: int | None = None, ) -> Report: if not ds.path.exists(): raise FileNotFoundError( @@ -173,7 +200,7 @@ async def run_one( # Pre-compute embeddings in large batches (GPU-friendly). # ``graph.add()`` accepts an ``embedding`` arg; if we pass it we # avoid the per-node single embed call that bottlenecks at batch=1. - items = [ + items_all = [ ( doc_id, str(doc.get("title", "") or doc_id), @@ -181,7 +208,20 @@ async def run_one( ) for doc_id, doc in corpus.items() ] - items = [(d, t, x) for d, t, x in items if t or x] + items_all = [(d, t, x) for d, t, x in items_all if t or x] + if corpus_limit is not None and 0 < corpus_limit < len(items_all): + gold_doc_ids: set[str] = set() + for qid, _qtext in query_items: + rel = qrels.get(qid, {}) + if isinstance(rel, dict): + gold_doc_ids.update(str(doc_id) for doc_id in rel) + else: + gold_doc_ids.update(str(doc_id) for doc_id in rel) + gold_items = [item for item in items_all if item[0] in gold_doc_ids] + filler_items = [item for item in items_all if item[0] not in gold_doc_ids] + items = [*gold_items, *filler_items][:corpus_limit] + else: + items = items_all embeddings: list[list[float] | None] = [None] * len(items) if embedder is not None: @@ -192,13 +232,33 @@ async def run_one( for j, v in enumerate(vecs): embeddings[i + j] = v if v else None - for (doc_id, title, text), emb in zip(items, embeddings): - await graph.add( - title=title, - content=text, - properties={"doc_id": doc_id}, - embedding=emb, - ) + save_nodes_batch = getattr(backend, "save_nodes_batch", None) + if phrase_extractor is None and callable(save_nodes_batch): + for i in range(0, len(items), ingest_batch): + batch = [ + Node( + id=_benchmark_node_id(doc_id), + kind=NodeKind.CONCEPT, + title=title, + content=text, + properties={"doc_id": doc_id}, + embedding=emb or [], + ) + for (doc_id, title, text), emb in zip( + items[i : i + ingest_batch], + embeddings[i : i + ingest_batch], + ) + ] + await save_nodes_batch(batch) + else: + for (doc_id, title, text), emb in zip(items, embeddings): + await graph.add( + title=title, + content=text, + properties={"doc_id": doc_id}, + embedding=emb, + record_memory_event=False, + ) # Post-hoc DF-filtered entity linking (opt-in via --entity-linker). # Runs AFTER ingest because the DF filter needs global corpus @@ -262,9 +322,9 @@ async def run_one( search_sec = time.perf_counter() - t_search n = max(len(query_items), 1) - return Report( + report = Report( name=ds.name, - n_docs=len(corpus), + n_docs=len(items), n_queries=len(query_items), mrr=mrr_total / n, recall_at_5=r5_total / n, @@ -274,6 +334,10 @@ async def run_one( search_sec=search_sec, reference=ds.reference, ) + close = getattr(backend, "close", None) + if callable(close): + await close() + return report def _emit_markdown( @@ -285,15 +349,17 @@ def _emit_markdown( decomposer_label: str = "none", phrase_extractor_label: str = "none", entity_linker_label: str = "none", + corpus_limit: int | None = None, ) -> Path: OUT_DIR.mkdir(parents=True, exist_ok=True) stamp = time.strftime("%Y%m%d_%H%M%S") path = OUT_DIR / f"tier1_{stamp}.md" lines = [ - "# Tier-1 English multi-hop benchmark — Synaptic v0.16.0", + "# Tier-1 English retrieval benchmark — Synaptic", "", f"- Run at: {time.strftime('%Y-%m-%d %H:%M:%S %Z')}", f"- Subset: {subset if subset else 'full'}", + f"- Corpus limit: {corpus_limit if corpus_limit else 'full'}", f"- Embedder: {embedder_label}", f"- Reranker: {reranker_label}", f"- Decomposer: {decomposer_label}", @@ -324,7 +390,10 @@ async def amain(argv: list[str]) -> int: p.add_argument( "--only", default=",".join(["hotpotqa", "musique", "2wiki"]), - help="comma-separated dataset keys (hotpotqa | musique | 2wiki)", + help=( + "comma-separated dataset keys " + "(hotpotqa | musique | 2wiki | fiqa | trec_covid | scifact)" + ), ) p.add_argument("--subset", type=int, default=None) p.add_argument( @@ -376,6 +445,21 @@ async def amain(argv: list[str]) -> int: "(default: 64 - safe under 6 GB free VRAM). Bump to 128-256 " "if more headroom.", ) + p.add_argument( + "--ingest-batch", + type=int, + default=5000, + help="Batch size for benchmark corpus node writes (default: 5000).", + ) + p.add_argument( + "--corpus-limit", + type=int, + default=None, + help=( + "Index at most this many docs for staged scale smoke. The selected " + "queries' gold docs are kept first, then distractors are filled in." + ), + ) p.add_argument( "--llm-decomposer-url", default=None, @@ -509,18 +593,29 @@ async def amain(argv: list[str]) -> int: "hotpotqa": DATASETS[0], "musique": DATASETS[1], "2wiki": DATASETS[2], + "fiqa": DATASETS[3], + "trec_covid": DATASETS[4], + "scifact": DATASETS[5], } - selected = [by_key[k.strip()] for k in args.only.split(",") if k.strip()] + selected = [] + for raw_key in args.only.split(","): + key = raw_key.strip() + if not key: + continue + if key not in by_key: + raise SystemExit(f"Unknown dataset key: {key}; available: {', '.join(by_key)}") + selected.append(by_key[key]) mode = "full pipeline" if embedder or reranker else "embedder-free" backend_label = "SqliteGraphBackend (HNSW)" if args.use_sqlite_graph else "MemoryBackend" - print(f"Tier-1 multi-hop English benchmarks — Synaptic v0.16.0 {mode}") + print(f"Tier-1 English retrieval benchmarks — Synaptic {mode}") print(f" backend: {backend_label}") print(f" embedder: {embedder_label}") print(f" reranker: {reranker_label}") print(f" decomposer: {decomposer_label}") print(f" phrase hub: {phrase_extractor_label}") print(f" entity linker: {entity_linker_label}") + print(f" corpus limit: {args.corpus_limit if args.corpus_limit else 'full'}") if embedder is not None: print(f" embed batch: {args.embed_batch}") print() @@ -541,6 +636,8 @@ async def amain(argv: list[str]) -> int: entity_linker_cfg=entity_linker_cfg, use_sqlite_graph=args.use_sqlite_graph, embed_batch=args.embed_batch, + ingest_batch=args.ingest_batch, + corpus_limit=args.corpus_limit, ) except FileNotFoundError as e: print(f"{ds.name:<24} SKIP — {e}") @@ -561,6 +658,7 @@ async def amain(argv: list[str]) -> int: decomposer_label=decomposer_label, phrase_extractor_label=phrase_extractor_label, entity_linker_label=entity_linker_label, + corpus_limit=args.corpus_limit, ) print() print(f"Markdown report → {out.relative_to(REPO_ROOT)}") diff --git a/src/synaptic/backends/sqlite.py b/src/synaptic/backends/sqlite.py index be29501..12b2fe2 100644 --- a/src/synaptic/backends/sqlite.py +++ b/src/synaptic/backends/sqlite.py @@ -1680,6 +1680,16 @@ async def save_nodes_batch(self, nodes: Sequence[Node]) -> None: if not nodes: return db = self._db() + node_ids = [node.id for node in nodes] + existing_ids: set[str] = set() + for offset in range(0, len(node_ids), 500): + chunk = node_ids[offset : offset + 500] + placeholders = ",".join("?" for _ in chunk) + async with db.execute( + f"SELECT id FROM syn_nodes WHERE id IN ({placeholders})", + chunk, + ) as cur: + existing_ids.update(str(row["id"]) for row in await cur.fetchall()) node_rows = [] fts_rows = [] for node in nodes: @@ -1726,10 +1736,11 @@ async def save_nodes_batch(self, nodes: Sequence[Node]) -> None: node_rows, ) # FTS sync: delete then re-insert - await db.executemany( - "DELETE FROM syn_nodes_fts WHERE node_id = ?", - [(n.id,) for n in nodes], - ) + if existing_ids: + await db.executemany( + "DELETE FROM syn_nodes_fts WHERE node_id = ?", + [(node_id,) for node_id in existing_ids], + ) await db.executemany( "INSERT INTO syn_nodes_fts(node_id, title, content) VALUES (?, ?, ?)", fts_rows, diff --git a/src/synaptic/graph.py b/src/synaptic/graph.py index ba88f9e..3759062 100644 --- a/src/synaptic/graph.py +++ b/src/synaptic/graph.py @@ -1559,6 +1559,7 @@ async def add( embedding: list[float] | None = None, properties: dict[str, str] | None = None, node_id: str | None = None, + record_memory_event: bool = True, ) -> Node: # NFC-normalize all user-provided text. Korean on macOS HFS+ arrives # as NFD, which breaks substring / FTS matching against NFC queries. @@ -1639,17 +1640,18 @@ async def add( content, ) - await self._save_memory_event( - MemoryEvent( - kind=MemoryEventKind.INGEST, - source=node.source or "graph", - source_id=node.id, - content_hash=_node_content_hash(node), - node_ids=[node.id], - edge_ids=await self._touching_edge_ids(node.id), - properties={"operation": "SynapticGraph.add", "kind": str(node.kind)}, + if record_memory_event: + await self._save_memory_event( + MemoryEvent( + kind=MemoryEventKind.INGEST, + source=node.source or "graph", + source_id=node.id, + content_hash=_node_content_hash(node), + node_ids=[node.id], + edge_ids=await self._touching_edge_ids(node.id), + properties={"operation": "SynapticGraph.add", "kind": str(node.kind)}, + ) ) - ) return node async def add_document( diff --git a/tests/test_backend_sqlite.py b/tests/test_backend_sqlite.py index 3b351cb..ea68032 100644 --- a/tests/test_backend_sqlite.py +++ b/tests/test_backend_sqlite.py @@ -53,6 +53,39 @@ async def test_list_filter(self, sqlite: SQLiteBackend) -> None: assert len(lessons) == 1 assert lessons[0].kind == NodeKind.LESSON + async def test_save_nodes_batch_indexes_new_nodes_for_fts( + self, + sqlite: SQLiteBackend, + ) -> None: + await sqlite.save_nodes_batch( + [ + Node(id="batch_a", title="Alpha", content="fresh corpus term"), + Node(id="batch_b", title="Beta", content="other text"), + ] + ) + + results = await sqlite.search_fts("fresh") + + assert [node.id for node in results] == ["batch_a"] + + async def test_save_nodes_batch_refreshes_existing_fts_rows( + self, + sqlite: SQLiteBackend, + ) -> None: + await sqlite.save_nodes_batch( + [Node(id="batch_update", title="Original", content="old searchable term")] + ) + + await sqlite.save_nodes_batch( + [Node(id="batch_update", title="Updated", content="new searchable term")] + ) + + old_results = await sqlite.search_fts("old") + new_results = await sqlite.search_fts("new") + + assert "batch_update" not in {node.id for node in old_results} + assert "batch_update" in {node.id for node in new_results} + class TestSQLiteEdges: async def test_save_and_get(self, sqlite: SQLiteBackend) -> None: diff --git a/tests/test_memory_operating_layer.py b/tests/test_memory_operating_layer.py index 61149d4..f084b0e 100644 --- a/tests/test_memory_operating_layer.py +++ b/tests/test_memory_operating_layer.py @@ -733,6 +733,17 @@ async def test_graph_mutations_record_memory_events(): assert delete.properties["operation"] == "SynapticGraph.remove" +@pytest.mark.asyncio +async def test_graph_add_can_skip_memory_event_for_bulk_loads(): + backend = MemoryBackend() + graph = SynapticGraph(backend) + + node = await graph.add("Bulk Node", "Body", record_memory_event=False) + + assert node.id + assert await backend.list_memory_events(kind=MemoryEventKind.INGEST, limit=10) == [] + + @pytest.mark.asyncio async def test_graph_edge_mutations_record_memory_events(): backend = MemoryBackend() diff --git a/tests/test_tier1_benchmarks.py b/tests/test_tier1_benchmarks.py new file mode 100644 index 0000000..a8b28da --- /dev/null +++ b/tests/test_tier1_benchmarks.py @@ -0,0 +1,49 @@ +"""Tests for Tier-1 benchmark runner helpers.""" + +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path + +import pytest + +RUNNER_PATH = ( + Path(__file__).resolve().parents[1] / "examples" / "ablation" / "run_tier1_benchmarks.py" +) +SPEC = importlib.util.spec_from_file_location("run_tier1_benchmarks", RUNNER_PATH) +assert SPEC is not None +assert SPEC.loader is not None +runner = importlib.util.module_from_spec(SPEC) +sys.modules[SPEC.name] = runner +SPEC.loader.exec_module(runner) + + +@pytest.mark.asyncio +async def test_corpus_limit_keeps_selected_query_gold_docs(tmp_path): + path = tmp_path / "tiny_bench.json" + path.write_text( + json.dumps( + { + "corpus": { + "filler_a": {"title": "Filler A", "text": "unrelated alpha"}, + "filler_b": {"title": "Filler B", "text": "unrelated beta"}, + "gold_doc": {"title": "Gold", "text": "needle targetterm"}, + }, + "queries": {"q1": "targetterm"}, + "qrels": {"q1": {"gold_doc": 1}}, + } + ), + encoding="utf-8", + ) + + report = await runner.run_one( + runner.Dataset(name="Tiny", path=path, reference="unit"), + subset=1, + corpus_limit=2, + ) + + assert report.n_docs == 2 + assert report.hit_at_10 == 1 + assert report.recall_at_10 == 1.0 From 7a8478c60a2fee315bb22bdc6957e05fd4ed41d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= Date: Thu, 2 Jul 2026 10:50:40 +0900 Subject: [PATCH 2/3] Tune public scale ingest batch --- examples/ablation/diagnostics/public_scale_20260702.md | 7 ++++--- examples/ablation/run_tier1_benchmarks.py | 10 +++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md index 5fb02f8..ed23f0c 100644 --- a/examples/ablation/diagnostics/public_scale_20260702.md +++ b/examples/ablation/diagnostics/public_scale_20260702.md @@ -48,9 +48,9 @@ After the SQLite batch FTS optimization: | Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search | |-----:|--------:|-------:|----:|-----:|-------:|------:|-------:| -| 50,000 | 10 | 0.933 | 0.008 | 0.015 | 10/10 | 49.3s | 1.3s | -| 100,000 | 10 | 0.750 | 0.007 | 0.012 | 10/10 | 138.2s | 2.8s | -| 171,332 | 10 | 0.598 | 0.004 | 0.011 | 10/10 | 370.4s | 5.2s | +| 50,000 | 10 | 0.933 | 0.008 | 0.015 | 10/10 | 20.6s | 1.4s | +| 100,000 | 10 | 0.750 | 0.007 | 0.012 | 10/10 | 55.2s | 2.8s | +| 171,332 | 10 | 0.598 | 0.004 | 0.011 | 10/10 | 135.1s | 5.2s | TREC-COVID has many relevant documents per query, so R@5/R@10 is naturally small in this smoke even when Hit@10 is perfect. @@ -60,6 +60,7 @@ small in this smoke even when Hit@10 is perfect. - Search latency remains usable at 171k docs: 5.2s over 10 queries. - The main large-corpus bottleneck is still initial FTS/index build, not retrieval. - Avoiding unnecessary FTS deletes for newly inserted nodes reduced full FiQA build time by about 9.9x. +- Raising benchmark ingest batches to 20k reduced full TREC-COVID build time by about 2.7x. - `--corpus-limit` provides practical staged scale gates while preserving selected query gold docs. ## Guard Policy diff --git a/examples/ablation/run_tier1_benchmarks.py b/examples/ablation/run_tier1_benchmarks.py index f0dea4a..51b5980 100644 --- a/examples/ablation/run_tier1_benchmarks.py +++ b/examples/ablation/run_tier1_benchmarks.py @@ -156,7 +156,7 @@ async def run_one( entity_linker_cfg: tuple[int, float] | None = None, use_sqlite_graph: bool = False, embed_batch: int = 256, - ingest_batch: int = 5000, + ingest_batch: int = 20000, corpus_limit: int | None = None, ) -> Report: if not ds.path.exists(): @@ -350,6 +350,7 @@ def _emit_markdown( phrase_extractor_label: str = "none", entity_linker_label: str = "none", corpus_limit: int | None = None, + ingest_batch: int = 20000, ) -> Path: OUT_DIR.mkdir(parents=True, exist_ok=True) stamp = time.strftime("%Y%m%d_%H%M%S") @@ -360,6 +361,7 @@ def _emit_markdown( f"- Run at: {time.strftime('%Y-%m-%d %H:%M:%S %Z')}", f"- Subset: {subset if subset else 'full'}", f"- Corpus limit: {corpus_limit if corpus_limit else 'full'}", + f"- Ingest batch: {ingest_batch}", f"- Embedder: {embedder_label}", f"- Reranker: {reranker_label}", f"- Decomposer: {decomposer_label}", @@ -448,8 +450,8 @@ async def amain(argv: list[str]) -> int: p.add_argument( "--ingest-batch", type=int, - default=5000, - help="Batch size for benchmark corpus node writes (default: 5000).", + default=20000, + help="Batch size for benchmark corpus node writes (default: 20000).", ) p.add_argument( "--corpus-limit", @@ -616,6 +618,7 @@ async def amain(argv: list[str]) -> int: print(f" phrase hub: {phrase_extractor_label}") print(f" entity linker: {entity_linker_label}") print(f" corpus limit: {args.corpus_limit if args.corpus_limit else 'full'}") + print(f" ingest batch: {args.ingest_batch}") if embedder is not None: print(f" embed batch: {args.embed_batch}") print() @@ -659,6 +662,7 @@ async def amain(argv: list[str]) -> int: phrase_extractor_label=phrase_extractor_label, entity_linker_label=entity_linker_label, corpus_limit=args.corpus_limit, + ingest_batch=args.ingest_batch, ) print() print(f"Markdown report → {out.relative_to(REPO_ROOT)}") From e61963c408826b37e6023dc5d0e90896561ec5ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= Date: Thu, 2 Jul 2026 10:54:45 +0900 Subject: [PATCH 3/3] Fail public scale guard on regressions --- .github/workflows/public-scale.yml | 12 ++++- examples/ablation/run_tier1_benchmarks.py | 65 +++++++++++++++++++++++ tests/test_tier1_benchmarks.py | 56 +++++++++++++++++++ 3 files changed, 131 insertions(+), 2 deletions(-) diff --git a/.github/workflows/public-scale.yml b/.github/workflows/public-scale.yml index 9a8fcd1..1c231c7 100644 --- a/.github/workflows/public-scale.yml +++ b/.github/workflows/public-scale.yml @@ -44,7 +44,11 @@ jobs: --only fiqa \ --subset 5 \ --corpus-limit 10000 \ - --use-sqlite-graph | tee /tmp/fiqa_scale_guard.log + --use-sqlite-graph \ + --max-build-sec 120 \ + --max-search-sec 20 \ + --min-hit-rate-at-10 0.40 \ + --min-mrr 0.20 | tee /tmp/fiqa_scale_guard.log cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \ /tmp/fiqa_scale_guard.md @@ -55,7 +59,11 @@ jobs: --only trec_covid \ --subset 10 \ --corpus-limit 50000 \ - --use-sqlite-graph | tee /tmp/trec_covid_scale_guard.log + --use-sqlite-graph \ + --max-build-sec 240 \ + --max-search-sec 30 \ + --min-hit-rate-at-10 0.80 \ + --min-mrr 0.50 | tee /tmp/trec_covid_scale_guard.log cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \ /tmp/trec_covid_scale_guard.md diff --git a/examples/ablation/run_tier1_benchmarks.py b/examples/ablation/run_tier1_benchmarks.py index 51b5980..7af75d8 100644 --- a/examples/ablation/run_tier1_benchmarks.py +++ b/examples/ablation/run_tier1_benchmarks.py @@ -387,6 +387,34 @@ def _emit_markdown( return path +def _threshold_violations( + reports: list[Report], + *, + max_build_sec: float | None = None, + max_search_sec: float | None = None, + min_hit_rate_at_10: float | None = None, + min_mrr: float | None = None, +) -> list[str]: + violations: list[str] = [] + for report in reports: + hit_rate = report.hit_at_10 / max(report.n_queries, 1) + if max_build_sec is not None and report.build_sec > max_build_sec: + violations.append( + f"{report.name}: build {report.build_sec:.1f}s > {max_build_sec:.1f}s" + ) + if max_search_sec is not None and report.search_sec > max_search_sec: + violations.append( + f"{report.name}: search {report.search_sec:.1f}s > {max_search_sec:.1f}s" + ) + if min_hit_rate_at_10 is not None and hit_rate < min_hit_rate_at_10: + violations.append( + f"{report.name}: hit@10 rate {hit_rate:.3f} < {min_hit_rate_at_10:.3f}" + ) + if min_mrr is not None and report.mrr < min_mrr: + violations.append(f"{report.name}: MRR@10 {report.mrr:.3f} < {min_mrr:.3f}") + return violations + + async def amain(argv: list[str]) -> int: p = argparse.ArgumentParser() p.add_argument( @@ -462,6 +490,30 @@ async def amain(argv: list[str]) -> int: "queries' gold docs are kept first, then distractors are filled in." ), ) + p.add_argument( + "--max-build-sec", + type=float, + default=None, + help="Fail if any dataset build takes longer than this many seconds.", + ) + p.add_argument( + "--max-search-sec", + type=float, + default=None, + help="Fail if any dataset search phase takes longer than this many seconds.", + ) + p.add_argument( + "--min-hit-rate-at-10", + type=float, + default=None, + help="Fail if any dataset Hit@10 / queries is below this value.", + ) + p.add_argument( + "--min-mrr", + type=float, + default=None, + help="Fail if any dataset MRR@10 is below this value.", + ) p.add_argument( "--llm-decomposer-url", default=None, @@ -666,6 +718,19 @@ async def amain(argv: list[str]) -> int: ) print() print(f"Markdown report → {out.relative_to(REPO_ROOT)}") + violations = _threshold_violations( + reports, + max_build_sec=args.max_build_sec, + max_search_sec=args.max_search_sec, + min_hit_rate_at_10=args.min_hit_rate_at_10, + min_mrr=args.min_mrr, + ) + if violations: + print() + print("Threshold violations:") + for violation in violations: + print(f" - {violation}") + return 1 return 0 diff --git a/tests/test_tier1_benchmarks.py b/tests/test_tier1_benchmarks.py index a8b28da..7c4d65c 100644 --- a/tests/test_tier1_benchmarks.py +++ b/tests/test_tier1_benchmarks.py @@ -47,3 +47,59 @@ async def test_corpus_limit_keeps_selected_query_gold_docs(tmp_path): assert report.n_docs == 2 assert report.hit_at_10 == 1 assert report.recall_at_10 == 1.0 + + +def test_threshold_violations_report_scale_regressions(): + report = runner.Report( + name="Tiny", + n_docs=100, + n_queries=10, + mrr=0.25, + recall_at_5=0.1, + recall_at_10=0.2, + hit_at_10=3, + build_sec=12.0, + search_sec=4.0, + reference="unit", + ) + + violations = runner._threshold_violations( + [report], + max_build_sec=10.0, + max_search_sec=3.0, + min_hit_rate_at_10=0.5, + min_mrr=0.3, + ) + + assert violations == [ + "Tiny: build 12.0s > 10.0s", + "Tiny: search 4.0s > 3.0s", + "Tiny: hit@10 rate 0.300 < 0.500", + "Tiny: MRR@10 0.250 < 0.300", + ] + + +def test_threshold_violations_accept_passing_report(): + report = runner.Report( + name="Tiny", + n_docs=100, + n_queries=10, + mrr=0.5, + recall_at_5=0.1, + recall_at_10=0.2, + hit_at_10=8, + build_sec=2.0, + search_sec=1.0, + reference="unit", + ) + + assert ( + runner._threshold_violations( + [report], + max_build_sec=10.0, + max_search_sec=3.0, + min_hit_rate_at_10=0.5, + min_mrr=0.3, + ) + == [] + )