From a1da6a9167842aab1bae36541dc719c0d38618c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= <sonsj97@gmail.com>
Date: Thu, 2 Jul 2026 10:42:09 +0900
Subject: [PATCH 1/3] Add public large-corpus scale guard

---
 .github/workflows/public-scale.yml            |  72 +++++++++
 .../diagnostics/public_scale_20260702.md      |  69 +++++++++
 examples/ablation/run_tier1_benchmarks.py     | 138 +++++++++++++++---
 src/synaptic/backends/sqlite.py               |  19 ++-
 src/synaptic/graph.py                         |  22 +--
 tests/test_backend_sqlite.py                  |  33 +++++
 tests/test_memory_operating_layer.py          |  11 ++
 tests/test_tier1_benchmarks.py                |  49 +++++++
 8 files changed, 379 insertions(+), 34 deletions(-)
 create mode 100644 .github/workflows/public-scale.yml
 create mode 100644 examples/ablation/diagnostics/public_scale_20260702.md
 create mode 100644 tests/test_tier1_benchmarks.py

diff --git a/.github/workflows/public-scale.yml b/.github/workflows/public-scale.yml
new file mode 100644
index 0000000..9a8fcd1
--- /dev/null
+++ b/.github/workflows/public-scale.yml
@@ -0,0 +1,72 @@
+name: Public Scale Guard
+
+# Scheduled, public large-corpus smoke test for the SQLite/EvidenceSearch path.
+# Benchmark JSON files are gitignored, so this workflow regenerates the public
+# datasets from HuggingFace and runs staged guards whose size-limited corpora
+# always retain the gold docs of the selected queries.
+
+on:
+  schedule:
+    - cron: "30 6 * * 2" # Tuesdays 06:30 UTC
+  workflow_dispatch: {}
+
+jobs:
+  public-scale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Set up Python 3.12
+        run: uv python install 3.12
+
+      - name: Cache uv
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/uv
+          key: uv-${{ runner.os }}-${{ hashFiles('uv.lock') }}
+          restore-keys: uv-${{ runner.os }}-
+
+      - name: Install dependencies
+        run: uv sync --extra sqlite --extra eval
+
+      - name: Download public scale benchmarks
+        run: |
+          uv run --extra eval python examples/ablation/download_benchmarks.py \
+            --only fiqa,trec_covid
+
+      - name: Run FiQA 10k scale smoke
+        run: |
+          set -o pipefail
+          PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py \
+            --only fiqa \
+            --subset 5 \
+            --corpus-limit 10000 \
+            --use-sqlite-graph | tee /tmp/fiqa_scale_guard.log
+          cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \
+            /tmp/fiqa_scale_guard.md
+
+      - name: Run TREC-COVID 50k scale smoke
+        run: |
+          set -o pipefail
+          PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py \
+            --only trec_covid \
+            --subset 10 \
+            --corpus-limit 50000 \
+            --use-sqlite-graph | tee /tmp/trec_covid_scale_guard.log
+          cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \
+            /tmp/trec_covid_scale_guard.md
+
+      - name: Upload public scale results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: public-scale-results
+          path: |
+            /tmp/fiqa_scale_guard.log
+            /tmp/fiqa_scale_guard.md
+            /tmp/trec_covid_scale_guard.log
+            /tmp/trec_covid_scale_guard.md
+          if-no-files-found: ignore
diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md
new file mode 100644
index 0000000..5fb02f8
--- /dev/null
+++ b/examples/ablation/diagnostics/public_scale_20260702.md
@@ -0,0 +1,69 @@
+# Public Large-Corpus Scale Smoke - 2026-07-02
+
+## Datasets
+
+| Dataset | Local artifact | Corpus | Queries | Smoke scope |
+|---------|----------------|-------:|--------:|-------------|
+| BEIR FiQA test | `tests/benchmark/data/fiqa.json` | 57,638 docs | 648 | 5-10 queries |
+| BEIR TREC-COVID test | `tests/benchmark/data/trec_covid.json` | 171,332 docs | 50 | 10 queries |
+
+Mode: embedder-free `graph.search()` with `SqliteGraphBackend`.
+
+## Commands
+
+```bash
+uv run --extra eval python examples/ablation/download_benchmarks.py --only fiqa
+uv run --extra eval python examples/ablation/download_benchmarks.py --only trec_covid
+
+PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 5 --corpus-limit 10000 --use-sqlite-graph
+PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 10 --corpus-limit 25000 --use-sqlite-graph
+PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 10 --use-sqlite-graph
+
+PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --corpus-limit 50000 --use-sqlite-graph
+PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --corpus-limit 100000 --use-sqlite-graph
+PYTHONUNBUFFERED=1 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only trec_covid --subset 10 --use-sqlite-graph
+```
+
+## FiQA Results
+
+Before the SQLite batch FTS optimization:
+
+| Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search |
+|-----:|--------:|-------:|----:|-----:|-------:|------:|-------:|
+| 10,000 | 5 | 0.425 | 0.300 | 0.400 | 3/5 | 13.2s | 0.1s |
+| 25,000 | 10 | 0.353 | 0.333 | 0.383 | 5/10 | 101.7s | 0.6s |
+| 57,638 | 10 | 0.202 | 0.233 | 0.333 | 5/10 | 577.4s | 1.5s |
+
+After the SQLite batch FTS optimization:
+
+| Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search |
+|-----:|--------:|-------:|----:|-----:|-------:|------:|-------:|
+| 10,000 | 5 | 0.425 | 0.300 | 0.400 | 3/5 | 3.2s | 0.1s |
+| 25,000 | 10 | 0.353 | 0.333 | 0.383 | 5/10 | 9.3s | 0.6s |
+| 57,638 | 10 | 0.202 | 0.233 | 0.333 | 5/10 | 58.4s | 1.4s |
+
+## TREC-COVID Results
+
+After the SQLite batch FTS optimization:
+
+| Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search |
+|-----:|--------:|-------:|----:|-----:|-------:|------:|-------:|
+| 50,000 | 10 | 0.933 | 0.008 | 0.015 | 10/10 | 49.3s | 1.3s |
+| 100,000 | 10 | 0.750 | 0.007 | 0.012 | 10/10 | 138.2s | 2.8s |
+| 171,332 | 10 | 0.598 | 0.004 | 0.011 | 10/10 | 370.4s | 5.2s |
+
+TREC-COVID has many relevant documents per query, so R@5/R@10 is naturally
+small in this smoke even when Hit@10 is perfect.
+
+## Interpretation
+
+- Search latency remains usable at 171k docs: 5.2s over 10 queries.
+- The main large-corpus bottleneck is still initial FTS/index build, not retrieval.
+- Avoiding unnecessary FTS deletes for newly inserted nodes reduced full FiQA build time by about 9.9x.
+- `--corpus-limit` provides practical staged scale gates while preserving selected query gold docs.
+
+## Guard Policy
+
+- `.github/workflows/public-scale.yml` runs weekly/manual FiQA 10k and TREC-COVID 50k staged smokes.
+- FiQA 25k/full and TREC-COVID 100k/full remain manual checks because they are multi-minute runs and depend on gitignored local benchmark data that must be downloaded first.
+- If a 100k+ doc corpus becomes a required routine gate, the next target is a faster initial FTS/index build.
diff --git a/examples/ablation/run_tier1_benchmarks.py b/examples/ablation/run_tier1_benchmarks.py
index fc0cd74..f0dea4a 100644
--- a/examples/ablation/run_tier1_benchmarks.py
+++ b/examples/ablation/run_tier1_benchmarks.py
@@ -1,9 +1,11 @@
-"""Tier-1 English multi-hop benchmark runner.
+"""Tier-1 English retrieval benchmark runner.
 
-Runs Synaptic's retrieval pipeline over three standard multi-hop
-corpora: HotPotQA-dev (full), MuSiQue-Ans-dev, and 2WikiMultiHopQA-dev.
-These are the datasets HippoRAG2, GraphRAG, and the broader KG-RAG
-line use for head-to-head comparisons.
+Runs Synaptic's retrieval pipeline over standard multi-hop corpora
+(HotPotQA-dev full, MuSiQue-Ans-dev, and 2WikiMultiHopQA-dev) and
+large BEIR-style retrieval corpora (FiQA, TREC-COVID, SciFact). The
+multi-hop sets are the datasets HippoRAG2, GraphRAG, and the broader
+KG-RAG line use for head-to-head comparisons; the BEIR sets are useful
+large-corpus scale checks with query/qrels ground truth.
 
 Two modes:
 
@@ -27,6 +29,7 @@
     # Embedder-free baseline (current published numbers)
     python examples/ablation/run_tier1_benchmarks.py
     python examples/ablation/run_tier1_benchmarks.py --only hotpotqa
+    python examples/ablation/run_tier1_benchmarks.py --only fiqa --subset 100
     python examples/ablation/run_tier1_benchmarks.py --subset 200
 
     # Full pipeline with Ollama embedder + TEI cross-encoder
@@ -44,6 +47,7 @@
 
 import argparse
 import asyncio
+import hashlib
 import json
 import tempfile
 import time
@@ -58,6 +62,7 @@
 )
 from synaptic.extensions.reranker_cross import TEIReranker
 from synaptic.graph import SynapticGraph
+from synaptic.models import Node, NodeKind
 
 REPO_ROOT = Path(__file__).resolve().parents[2]
 BENCH = REPO_ROOT / "tests" / "benchmark" / "data"
@@ -66,6 +71,11 @@
 TOP_K = 10
 
 
+def _benchmark_node_id(doc_id: str) -> str:
+    digest = hashlib.blake2b(doc_id.encode("utf-8"), digest_size=16).hexdigest()
+    return f"bench_{digest}"
+
+
 @dataclass
 class Dataset:
     name: str
@@ -89,6 +99,21 @@ class Dataset:
         path=BENCH / "2wiki_dev.json",
         reference="HippoRAG2: R@5 90.4 %",
     ),
+    Dataset(
+        name="FiQA test",
+        path=BENCH / "fiqa.json",
+        reference="BEIR FiQA: ~57k docs / 648 test queries",
+    ),
+    Dataset(
+        name="TREC-COVID test",
+        path=BENCH / "trec_covid.json",
+        reference="BEIR TREC-COVID: ~171k docs / 50 test queries",
+    ),
+    Dataset(
+        name="SciFact test",
+        path=BENCH / "scifact.json",
+        reference="BEIR SciFact: ~5k docs / 300 test queries",
+    ),
 ]
 
 
@@ -131,6 +156,8 @@ async def run_one(
     entity_linker_cfg: tuple[int, float] | None = None,
     use_sqlite_graph: bool = False,
     embed_batch: int = 256,
+    ingest_batch: int = 5000,
+    corpus_limit: int | None = None,
 ) -> Report:
     if not ds.path.exists():
         raise FileNotFoundError(
@@ -173,7 +200,7 @@ async def run_one(
     # Pre-compute embeddings in large batches (GPU-friendly).
     # ``graph.add()`` accepts an ``embedding`` arg; if we pass it we
     # avoid the per-node single embed call that bottlenecks at batch=1.
-    items = [
+    items_all = [
         (
             doc_id,
             str(doc.get("title", "") or doc_id),
@@ -181,7 +208,20 @@ async def run_one(
         )
         for doc_id, doc in corpus.items()
     ]
-    items = [(d, t, x) for d, t, x in items if t or x]
+    items_all = [(d, t, x) for d, t, x in items_all if t or x]
+    if corpus_limit is not None and 0 < corpus_limit < len(items_all):
+        gold_doc_ids: set[str] = set()
+        for qid, _qtext in query_items:
+            rel = qrels.get(qid, {})
+            if isinstance(rel, dict):
+                gold_doc_ids.update(str(doc_id) for doc_id in rel)
+            else:
+                gold_doc_ids.update(str(doc_id) for doc_id in rel)
+        gold_items = [item for item in items_all if item[0] in gold_doc_ids]
+        filler_items = [item for item in items_all if item[0] not in gold_doc_ids]
+        items = [*gold_items, *filler_items][:corpus_limit]
+    else:
+        items = items_all
 
     embeddings: list[list[float] | None] = [None] * len(items)
     if embedder is not None:
@@ -192,13 +232,33 @@ async def run_one(
             for j, v in enumerate(vecs):
                 embeddings[i + j] = v if v else None
 
-    for (doc_id, title, text), emb in zip(items, embeddings):
-        await graph.add(
-            title=title,
-            content=text,
-            properties={"doc_id": doc_id},
-            embedding=emb,
-        )
+    save_nodes_batch = getattr(backend, "save_nodes_batch", None)
+    if phrase_extractor is None and callable(save_nodes_batch):
+        for i in range(0, len(items), ingest_batch):
+            batch = [
+                Node(
+                    id=_benchmark_node_id(doc_id),
+                    kind=NodeKind.CONCEPT,
+                    title=title,
+                    content=text,
+                    properties={"doc_id": doc_id},
+                    embedding=emb or [],
+                )
+                for (doc_id, title, text), emb in zip(
+                    items[i : i + ingest_batch],
+                    embeddings[i : i + ingest_batch],
+                )
+            ]
+            await save_nodes_batch(batch)
+    else:
+        for (doc_id, title, text), emb in zip(items, embeddings):
+            await graph.add(
+                title=title,
+                content=text,
+                properties={"doc_id": doc_id},
+                embedding=emb,
+                record_memory_event=False,
+            )
 
     # Post-hoc DF-filtered entity linking (opt-in via --entity-linker).
     # Runs AFTER ingest because the DF filter needs global corpus
@@ -262,9 +322,9 @@ async def run_one(
     search_sec = time.perf_counter() - t_search
 
     n = max(len(query_items), 1)
-    return Report(
+    report = Report(
         name=ds.name,
-        n_docs=len(corpus),
+        n_docs=len(items),
         n_queries=len(query_items),
         mrr=mrr_total / n,
         recall_at_5=r5_total / n,
@@ -274,6 +334,10 @@ async def run_one(
         search_sec=search_sec,
         reference=ds.reference,
     )
+    close = getattr(backend, "close", None)
+    if callable(close):
+        await close()
+    return report
 
 
 def _emit_markdown(
@@ -285,15 +349,17 @@ def _emit_markdown(
     decomposer_label: str = "none",
     phrase_extractor_label: str = "none",
     entity_linker_label: str = "none",
+    corpus_limit: int | None = None,
 ) -> Path:
     OUT_DIR.mkdir(parents=True, exist_ok=True)
     stamp = time.strftime("%Y%m%d_%H%M%S")
     path = OUT_DIR / f"tier1_{stamp}.md"
     lines = [
-        "# Tier-1 English multi-hop benchmark — Synaptic v0.16.0",
+        "# Tier-1 English retrieval benchmark — Synaptic",
         "",
         f"- Run at: {time.strftime('%Y-%m-%d %H:%M:%S %Z')}",
         f"- Subset: {subset if subset else 'full'}",
+        f"- Corpus limit: {corpus_limit if corpus_limit else 'full'}",
         f"- Embedder: {embedder_label}",
         f"- Reranker: {reranker_label}",
         f"- Decomposer: {decomposer_label}",
@@ -324,7 +390,10 @@ async def amain(argv: list[str]) -> int:
     p.add_argument(
         "--only",
         default=",".join(["hotpotqa", "musique", "2wiki"]),
-        help="comma-separated dataset keys (hotpotqa | musique | 2wiki)",
+        help=(
+            "comma-separated dataset keys "
+            "(hotpotqa | musique | 2wiki | fiqa | trec_covid | scifact)"
+        ),
     )
     p.add_argument("--subset", type=int, default=None)
     p.add_argument(
@@ -376,6 +445,21 @@ async def amain(argv: list[str]) -> int:
         "(default: 64 - safe under 6 GB free VRAM). Bump to 128-256 "
         "if more headroom.",
     )
+    p.add_argument(
+        "--ingest-batch",
+        type=int,
+        default=5000,
+        help="Batch size for benchmark corpus node writes (default: 5000).",
+    )
+    p.add_argument(
+        "--corpus-limit",
+        type=int,
+        default=None,
+        help=(
+            "Index at most this many docs for staged scale smoke. The selected "
+            "queries' gold docs are kept first, then distractors are filled in."
+        ),
+    )
     p.add_argument(
         "--llm-decomposer-url",
         default=None,
@@ -509,18 +593,29 @@ async def amain(argv: list[str]) -> int:
         "hotpotqa": DATASETS[0],
         "musique": DATASETS[1],
         "2wiki": DATASETS[2],
+        "fiqa": DATASETS[3],
+        "trec_covid": DATASETS[4],
+        "scifact": DATASETS[5],
     }
-    selected = [by_key[k.strip()] for k in args.only.split(",") if k.strip()]
+    selected = []
+    for raw_key in args.only.split(","):
+        key = raw_key.strip()
+        if not key:
+            continue
+        if key not in by_key:
+            raise SystemExit(f"Unknown dataset key: {key}; available: {', '.join(by_key)}")
+        selected.append(by_key[key])
 
     mode = "full pipeline" if embedder or reranker else "embedder-free"
     backend_label = "SqliteGraphBackend (HNSW)" if args.use_sqlite_graph else "MemoryBackend"
-    print(f"Tier-1 multi-hop English benchmarks — Synaptic v0.16.0 {mode}")
+    print(f"Tier-1 English retrieval benchmarks — Synaptic {mode}")
     print(f"  backend:  {backend_label}")
     print(f"  embedder: {embedder_label}")
     print(f"  reranker: {reranker_label}")
     print(f"  decomposer: {decomposer_label}")
     print(f"  phrase hub: {phrase_extractor_label}")
     print(f"  entity linker: {entity_linker_label}")
+    print(f"  corpus limit: {args.corpus_limit if args.corpus_limit else 'full'}")
     if embedder is not None:
         print(f"  embed batch: {args.embed_batch}")
     print()
@@ -541,6 +636,8 @@ async def amain(argv: list[str]) -> int:
                 entity_linker_cfg=entity_linker_cfg,
                 use_sqlite_graph=args.use_sqlite_graph,
                 embed_batch=args.embed_batch,
+                ingest_batch=args.ingest_batch,
+                corpus_limit=args.corpus_limit,
             )
         except FileNotFoundError as e:
             print(f"{ds.name:<24}  SKIP — {e}")
@@ -561,6 +658,7 @@ async def amain(argv: list[str]) -> int:
             decomposer_label=decomposer_label,
             phrase_extractor_label=phrase_extractor_label,
             entity_linker_label=entity_linker_label,
+            corpus_limit=args.corpus_limit,
         )
         print()
         print(f"Markdown report → {out.relative_to(REPO_ROOT)}")
diff --git a/src/synaptic/backends/sqlite.py b/src/synaptic/backends/sqlite.py
index be29501..12b2fe2 100644
--- a/src/synaptic/backends/sqlite.py
+++ b/src/synaptic/backends/sqlite.py
@@ -1680,6 +1680,16 @@ async def save_nodes_batch(self, nodes: Sequence[Node]) -> None:
         if not nodes:
             return
         db = self._db()
+        node_ids = [node.id for node in nodes]
+        existing_ids: set[str] = set()
+        for offset in range(0, len(node_ids), 500):
+            chunk = node_ids[offset : offset + 500]
+            placeholders = ",".join("?" for _ in chunk)
+            async with db.execute(
+                f"SELECT id FROM syn_nodes WHERE id IN ({placeholders})",
+                chunk,
+            ) as cur:
+                existing_ids.update(str(row["id"]) for row in await cur.fetchall())
         node_rows = []
         fts_rows = []
         for node in nodes:
@@ -1726,10 +1736,11 @@ async def save_nodes_batch(self, nodes: Sequence[Node]) -> None:
                 node_rows,
             )
             # FTS sync: delete then re-insert
-            await db.executemany(
-                "DELETE FROM syn_nodes_fts WHERE node_id = ?",
-                [(n.id,) for n in nodes],
-            )
+            if existing_ids:
+                await db.executemany(
+                    "DELETE FROM syn_nodes_fts WHERE node_id = ?",
+                    [(node_id,) for node_id in existing_ids],
+                )
             await db.executemany(
                 "INSERT INTO syn_nodes_fts(node_id, title, content) VALUES (?, ?, ?)",
                 fts_rows,
diff --git a/src/synaptic/graph.py b/src/synaptic/graph.py
index ba88f9e..3759062 100644
--- a/src/synaptic/graph.py
+++ b/src/synaptic/graph.py
@@ -1559,6 +1559,7 @@ async def add(
         embedding: list[float] | None = None,
         properties: dict[str, str] | None = None,
         node_id: str | None = None,
+        record_memory_event: bool = True,
     ) -> Node:
         # NFC-normalize all user-provided text. Korean on macOS HFS+ arrives
         # as NFD, which breaks substring / FTS matching against NFC queries.
@@ -1639,17 +1640,18 @@ async def add(
                 content,
             )
 
-        await self._save_memory_event(
-            MemoryEvent(
-                kind=MemoryEventKind.INGEST,
-                source=node.source or "graph",
-                source_id=node.id,
-                content_hash=_node_content_hash(node),
-                node_ids=[node.id],
-                edge_ids=await self._touching_edge_ids(node.id),
-                properties={"operation": "SynapticGraph.add", "kind": str(node.kind)},
+        if record_memory_event:
+            await self._save_memory_event(
+                MemoryEvent(
+                    kind=MemoryEventKind.INGEST,
+                    source=node.source or "graph",
+                    source_id=node.id,
+                    content_hash=_node_content_hash(node),
+                    node_ids=[node.id],
+                    edge_ids=await self._touching_edge_ids(node.id),
+                    properties={"operation": "SynapticGraph.add", "kind": str(node.kind)},
+                )
             )
-        )
         return node
 
     async def add_document(
diff --git a/tests/test_backend_sqlite.py b/tests/test_backend_sqlite.py
index 3b351cb..ea68032 100644
--- a/tests/test_backend_sqlite.py
+++ b/tests/test_backend_sqlite.py
@@ -53,6 +53,39 @@ async def test_list_filter(self, sqlite: SQLiteBackend) -> None:
         assert len(lessons) == 1
         assert lessons[0].kind == NodeKind.LESSON
 
+    async def test_save_nodes_batch_indexes_new_nodes_for_fts(
+        self,
+        sqlite: SQLiteBackend,
+    ) -> None:
+        await sqlite.save_nodes_batch(
+            [
+                Node(id="batch_a", title="Alpha", content="fresh corpus term"),
+                Node(id="batch_b", title="Beta", content="other text"),
+            ]
+        )
+
+        results = await sqlite.search_fts("fresh")
+
+        assert [node.id for node in results] == ["batch_a"]
+
+    async def test_save_nodes_batch_refreshes_existing_fts_rows(
+        self,
+        sqlite: SQLiteBackend,
+    ) -> None:
+        await sqlite.save_nodes_batch(
+            [Node(id="batch_update", title="Original", content="old searchable term")]
+        )
+
+        await sqlite.save_nodes_batch(
+            [Node(id="batch_update", title="Updated", content="new searchable term")]
+        )
+
+        old_results = await sqlite.search_fts("old")
+        new_results = await sqlite.search_fts("new")
+
+        assert "batch_update" not in {node.id for node in old_results}
+        assert "batch_update" in {node.id for node in new_results}
+
 
 class TestSQLiteEdges:
     async def test_save_and_get(self, sqlite: SQLiteBackend) -> None:
diff --git a/tests/test_memory_operating_layer.py b/tests/test_memory_operating_layer.py
index 61149d4..f084b0e 100644
--- a/tests/test_memory_operating_layer.py
+++ b/tests/test_memory_operating_layer.py
@@ -733,6 +733,17 @@ async def test_graph_mutations_record_memory_events():
     assert delete.properties["operation"] == "SynapticGraph.remove"
 
 
+@pytest.mark.asyncio
+async def test_graph_add_can_skip_memory_event_for_bulk_loads():
+    backend = MemoryBackend()
+    graph = SynapticGraph(backend)
+
+    node = await graph.add("Bulk Node", "Body", record_memory_event=False)
+
+    assert node.id
+    assert await backend.list_memory_events(kind=MemoryEventKind.INGEST, limit=10) == []
+
+
 @pytest.mark.asyncio
 async def test_graph_edge_mutations_record_memory_events():
     backend = MemoryBackend()
diff --git a/tests/test_tier1_benchmarks.py b/tests/test_tier1_benchmarks.py
new file mode 100644
index 0000000..a8b28da
--- /dev/null
+++ b/tests/test_tier1_benchmarks.py
@@ -0,0 +1,49 @@
+"""Tests for Tier-1 benchmark runner helpers."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import sys
+from pathlib import Path
+
+import pytest
+
+RUNNER_PATH = (
+    Path(__file__).resolve().parents[1] / "examples" / "ablation" / "run_tier1_benchmarks.py"
+)
+SPEC = importlib.util.spec_from_file_location("run_tier1_benchmarks", RUNNER_PATH)
+assert SPEC is not None
+assert SPEC.loader is not None
+runner = importlib.util.module_from_spec(SPEC)
+sys.modules[SPEC.name] = runner
+SPEC.loader.exec_module(runner)
+
+
+@pytest.mark.asyncio
+async def test_corpus_limit_keeps_selected_query_gold_docs(tmp_path):
+    path = tmp_path / "tiny_bench.json"
+    path.write_text(
+        json.dumps(
+            {
+                "corpus": {
+                    "filler_a": {"title": "Filler A", "text": "unrelated alpha"},
+                    "filler_b": {"title": "Filler B", "text": "unrelated beta"},
+                    "gold_doc": {"title": "Gold", "text": "needle targetterm"},
+                },
+                "queries": {"q1": "targetterm"},
+                "qrels": {"q1": {"gold_doc": 1}},
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    report = await runner.run_one(
+        runner.Dataset(name="Tiny", path=path, reference="unit"),
+        subset=1,
+        corpus_limit=2,
+    )
+
+    assert report.n_docs == 2
+    assert report.hit_at_10 == 1
+    assert report.recall_at_10 == 1.0

From 7a8478c60a2fee315bb22bdc6957e05fd4ed41d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= <sonsj97@gmail.com>
Date: Thu, 2 Jul 2026 10:50:40 +0900
Subject: [PATCH 2/3] Tune public scale ingest batch

---
 examples/ablation/diagnostics/public_scale_20260702.md |  7 ++++---
 examples/ablation/run_tier1_benchmarks.py              | 10 +++++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md
index 5fb02f8..ed23f0c 100644
--- a/examples/ablation/diagnostics/public_scale_20260702.md
+++ b/examples/ablation/diagnostics/public_scale_20260702.md
@@ -48,9 +48,9 @@ After the SQLite batch FTS optimization:
 
 | Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search |
 |-----:|--------:|-------:|----:|-----:|-------:|------:|-------:|
-| 50,000 | 10 | 0.933 | 0.008 | 0.015 | 10/10 | 49.3s | 1.3s |
-| 100,000 | 10 | 0.750 | 0.007 | 0.012 | 10/10 | 138.2s | 2.8s |
-| 171,332 | 10 | 0.598 | 0.004 | 0.011 | 10/10 | 370.4s | 5.2s |
+| 50,000 | 10 | 0.933 | 0.008 | 0.015 | 10/10 | 20.6s | 1.4s |
+| 100,000 | 10 | 0.750 | 0.007 | 0.012 | 10/10 | 55.2s | 2.8s |
+| 171,332 | 10 | 0.598 | 0.004 | 0.011 | 10/10 | 135.1s | 5.2s |
 
 TREC-COVID has many relevant documents per query, so R@5/R@10 is naturally
 small in this smoke even when Hit@10 is perfect.
@@ -60,6 +60,7 @@ small in this smoke even when Hit@10 is perfect.
 - Search latency remains usable at 171k docs: 5.2s over 10 queries.
 - The main large-corpus bottleneck is still initial FTS/index build, not retrieval.
 - Avoiding unnecessary FTS deletes for newly inserted nodes reduced full FiQA build time by about 9.9x.
+- Raising benchmark ingest batches to 20k reduced full TREC-COVID build time by about 2.7x.
 - `--corpus-limit` provides practical staged scale gates while preserving selected query gold docs.
 
 ## Guard Policy
diff --git a/examples/ablation/run_tier1_benchmarks.py b/examples/ablation/run_tier1_benchmarks.py
index f0dea4a..51b5980 100644
--- a/examples/ablation/run_tier1_benchmarks.py
+++ b/examples/ablation/run_tier1_benchmarks.py
@@ -156,7 +156,7 @@ async def run_one(
     entity_linker_cfg: tuple[int, float] | None = None,
     use_sqlite_graph: bool = False,
     embed_batch: int = 256,
-    ingest_batch: int = 5000,
+    ingest_batch: int = 20000,
     corpus_limit: int | None = None,
 ) -> Report:
     if not ds.path.exists():
@@ -350,6 +350,7 @@ def _emit_markdown(
     phrase_extractor_label: str = "none",
     entity_linker_label: str = "none",
     corpus_limit: int | None = None,
+    ingest_batch: int = 20000,
 ) -> Path:
     OUT_DIR.mkdir(parents=True, exist_ok=True)
     stamp = time.strftime("%Y%m%d_%H%M%S")
@@ -360,6 +361,7 @@ def _emit_markdown(
         f"- Run at: {time.strftime('%Y-%m-%d %H:%M:%S %Z')}",
         f"- Subset: {subset if subset else 'full'}",
         f"- Corpus limit: {corpus_limit if corpus_limit else 'full'}",
+        f"- Ingest batch: {ingest_batch}",
         f"- Embedder: {embedder_label}",
         f"- Reranker: {reranker_label}",
         f"- Decomposer: {decomposer_label}",
@@ -448,8 +450,8 @@ async def amain(argv: list[str]) -> int:
     p.add_argument(
         "--ingest-batch",
         type=int,
-        default=5000,
-        help="Batch size for benchmark corpus node writes (default: 5000).",
+        default=20000,
+        help="Batch size for benchmark corpus node writes (default: 20000).",
     )
     p.add_argument(
         "--corpus-limit",
@@ -616,6 +618,7 @@ async def amain(argv: list[str]) -> int:
     print(f"  phrase hub: {phrase_extractor_label}")
     print(f"  entity linker: {entity_linker_label}")
     print(f"  corpus limit: {args.corpus_limit if args.corpus_limit else 'full'}")
+    print(f"  ingest batch: {args.ingest_batch}")
     if embedder is not None:
         print(f"  embed batch: {args.embed_batch}")
     print()
@@ -659,6 +662,7 @@ async def amain(argv: list[str]) -> int:
             phrase_extractor_label=phrase_extractor_label,
             entity_linker_label=entity_linker_label,
             corpus_limit=args.corpus_limit,
+            ingest_batch=args.ingest_batch,
         )
         print()
         print(f"Markdown report → {out.relative_to(REPO_ROOT)}")

From e61963c408826b37e6023dc5d0e90896561ec5ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= <sonsj97@gmail.com>
Date: Thu, 2 Jul 2026 10:54:45 +0900
Subject: [PATCH 3/3] Fail public scale guard on regressions

---
 .github/workflows/public-scale.yml        | 12 ++++-
 examples/ablation/run_tier1_benchmarks.py | 65 +++++++++++++++++++++++
 tests/test_tier1_benchmarks.py            | 56 +++++++++++++++++++
 3 files changed, 131 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/public-scale.yml b/.github/workflows/public-scale.yml
index 9a8fcd1..1c231c7 100644
--- a/.github/workflows/public-scale.yml
+++ b/.github/workflows/public-scale.yml
@@ -44,7 +44,11 @@ jobs:
             --only fiqa \
             --subset 5 \
             --corpus-limit 10000 \
-            --use-sqlite-graph | tee /tmp/fiqa_scale_guard.log
+            --use-sqlite-graph \
+            --max-build-sec 120 \
+            --max-search-sec 20 \
+            --min-hit-rate-at-10 0.40 \
+            --min-mrr 0.20 | tee /tmp/fiqa_scale_guard.log
           cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \
             /tmp/fiqa_scale_guard.md
 
@@ -55,7 +59,11 @@ jobs:
             --only trec_covid \
             --subset 10 \
             --corpus-limit 50000 \
-            --use-sqlite-graph | tee /tmp/trec_covid_scale_guard.log
+            --use-sqlite-graph \
+            --max-build-sec 240 \
+            --max-search-sec 30 \
+            --min-hit-rate-at-10 0.80 \
+            --min-mrr 0.50 | tee /tmp/trec_covid_scale_guard.log
           cp "$(ls -t examples/ablation/diagnostics/tier1_*.md | head -1)" \
             /tmp/trec_covid_scale_guard.md
 
diff --git a/examples/ablation/run_tier1_benchmarks.py b/examples/ablation/run_tier1_benchmarks.py
index 51b5980..7af75d8 100644
--- a/examples/ablation/run_tier1_benchmarks.py
+++ b/examples/ablation/run_tier1_benchmarks.py
@@ -387,6 +387,34 @@ def _emit_markdown(
     return path
 
 
+def _threshold_violations(
+    reports: list[Report],
+    *,
+    max_build_sec: float | None = None,
+    max_search_sec: float | None = None,
+    min_hit_rate_at_10: float | None = None,
+    min_mrr: float | None = None,
+) -> list[str]:
+    """List each limit a report breaks; a limit left as ``None`` is not checked.
+
+    Messages follow report order, then build, search, hit@10 rate, MRR@10.
+    """
+    out: list[str] = []
+    for r in reports:
+        # Clamp the denominator so a report with zero queries cannot divide by zero.
+        rate = r.hit_at_10 / max(r.n_queries, 1)
+        # The four checks are independent, so one report can add several messages.
+        if max_build_sec is not None and max_build_sec < r.build_sec:
+            out.append(f"{r.name}: build {r.build_sec:.1f}s > {max_build_sec:.1f}s")
+        if max_search_sec is not None and max_search_sec < r.search_sec:
+            out.append(f"{r.name}: search {r.search_sec:.1f}s > {max_search_sec:.1f}s")
+        if min_hit_rate_at_10 is not None and min_hit_rate_at_10 > rate:
+            out.append(f"{r.name}: hit@10 rate {rate:.3f} < {min_hit_rate_at_10:.3f}")
+        if min_mrr is not None and min_mrr > r.mrr:
+            out.append(f"{r.name}: MRR@10 {r.mrr:.3f} < {min_mrr:.3f}")
+    return out
+
+
 async def amain(argv: list[str]) -> int:
     p = argparse.ArgumentParser()
     p.add_argument(
@@ -462,6 +490,30 @@ async def amain(argv: list[str]) -> int:
             "queries' gold docs are kept first, then distractors are filled in."
         ),
     )
+    p.add_argument(
+        "--max-build-sec",
+        type=float,
+        default=None,
+        help="Fail if any dataset build takes longer than this many seconds.",
+    )
+    p.add_argument(
+        "--max-search-sec",
+        type=float,
+        default=None,
+        help="Fail if any dataset search phase takes longer than this many seconds.",
+    )
+    p.add_argument(
+        "--min-hit-rate-at-10",
+        type=float,
+        default=None,
+        help="Fail if any dataset Hit@10 / queries is below this value.",
+    )
+    p.add_argument(
+        "--min-mrr",
+        type=float,
+        default=None,
+        help="Fail if any dataset MRR@10 is below this value.",
+    )
     p.add_argument(
         "--llm-decomposer-url",
         default=None,
@@ -666,6 +718,19 @@ async def amain(argv: list[str]) -> int:
         )
         print()
         print(f"Markdown report → {out.relative_to(REPO_ROOT)}")
+    violations = _threshold_violations(
+        reports,
+        max_build_sec=args.max_build_sec,
+        max_search_sec=args.max_search_sec,
+        min_hit_rate_at_10=args.min_hit_rate_at_10,
+        min_mrr=args.min_mrr,
+    )
+    if violations:
+        print()
+        print("Threshold violations:")
+        for violation in violations:
+            print(f"  - {violation}")
+        return 1
     return 0
 
 
diff --git a/tests/test_tier1_benchmarks.py b/tests/test_tier1_benchmarks.py
index a8b28da..7c4d65c 100644
--- a/tests/test_tier1_benchmarks.py
+++ b/tests/test_tier1_benchmarks.py
@@ -47,3 +47,59 @@ async def test_corpus_limit_keeps_selected_query_gold_docs(tmp_path):
     assert report.n_docs == 2
     assert report.hit_at_10 == 1
     assert report.recall_at_10 == 1.0
+
+
+def test_threshold_violations_report_scale_regressions():
+    """A report that breaks all four limits yields four messages in fixed order."""
+    slow_and_weak = runner.Report(
+        name="Tiny",
+        n_docs=100,
+        n_queries=10,
+        mrr=0.25,
+        recall_at_5=0.1,
+        recall_at_10=0.2,
+        hit_at_10=3,
+        build_sec=12.0,
+        search_sec=4.0,
+        reference="unit",
+    )
+    expected = [
+        "Tiny: build 12.0s > 10.0s",
+        "Tiny: search 4.0s > 3.0s",
+        "Tiny: hit@10 rate 0.300 < 0.500",
+        "Tiny: MRR@10 0.250 < 0.300",
+    ]
+    actual = runner._threshold_violations(
+        [slow_and_weak],
+        max_build_sec=10.0,
+        max_search_sec=3.0,
+        min_hit_rate_at_10=0.5,
+        min_mrr=0.3,
+    )
+    assert actual == expected
+
+
+def test_threshold_violations_accept_passing_report():
+    """A report inside every configured limit produces no violation messages."""
+    healthy = runner.Report(
+        name="Tiny",
+        n_docs=100,
+        n_queries=10,
+        mrr=0.5,
+        recall_at_5=0.1,
+        recall_at_10=0.2,
+        hit_at_10=8,
+        build_sec=2.0,
+        search_sec=1.0,
+        reference="unit",
+    )
+
+    flagged = runner._threshold_violations(
+        [healthy],
+        max_build_sec=10.0,
+        max_search_sec=3.0,
+        min_hit_rate_at_10=0.5,
+        min_mrr=0.3,
+    )
+    # Hit rate 8/10 and MRR 0.5 clear the floors; both timings sit under the caps.
+    assert flagged == []