Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions examples/ablation/diagnostics/public_scale_20260702.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,14 @@ Manual large-tier shard from BEIR/MS MARCO passage validation:
| persistent SQLite build | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 2184.3s | 71.0s |
| persistent SQLite reuse | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 0.0s | 70.1s |
| persistent SQLite reuse + English query filter | 1,000,000 | 50 | 0.479 | 0.553 | 0.600 | 31/50 | 0.0s | 9.1s |
| persistent SQLite reuse + tag-filtered anchors | 1,000,000 | 50 | 0.479 | 0.553 | 0.600 | 31/50 | 0.0s | 7.5s |

The local artifacts are gitignored:

- `tests/benchmark/data/msmarco_passage.json` - 511 KB manifest
- `tests/benchmark/data/msmarco_passage.corpus.jsonl` - 35 MB at 100k, 361 MB at 1M
- `tests/benchmark/data/msmarco_passage_5m.json` - 511 KB 5M manifest
- `tests/benchmark/data/msmarco_passage_5m.corpus.jsonl` - 1.8 GB, 5,000,000 rows
- `tests/benchmark/data/msmarco_1m.db` - 1.2 GB persistent SQLite DB
- `tests/benchmark/data/msmarco_1m.db.tier1.json` - 535 byte reuse sidecar

Expand All @@ -105,6 +108,16 @@ The local artifacts are gitignored:
(`how/is/the/of/to` etc.) before FTS5 `OR` matching. On the persistent 1M
DB this reduced 50-query search time from 70.1s to 9.1s while improving
MRR@10 from 0.462 to 0.479.
- QueryAnchor category loading now asks backends for nodes tagged `category`
instead of materializing the first 500 `CONCEPT` rows and filtering in
Python. On the persistent 1M MS MARCO DB, first anchor extraction dropped
from roughly 1.7-2.0s to 0.218s, and the 50-query reuse smoke improved from
9.1s to 7.5s with unchanged quality.
- A 5M-passage MS MARCO corpus can now be built locally as a separate shard alongside the default 1M shard.
Generate it with:
`uv run --extra eval python examples/ablation/download_benchmarks.py --only msmarco_passage --large-corpus-limit 5000000 --large-output-suffix _5m`
and run it with:
`uv run python examples/ablation/run_tier1_benchmarks.py --only msmarco --msmarco-path tests/benchmark/data/msmarco_passage_5m.json --corpus-limit 5000000 --use-sqlite-graph --sqlite-db-path tests/benchmark/data/msmarco_5m.db --overwrite-sqlite-db`.

## Guard Policy

Expand Down
23 changes: 23 additions & 0 deletions examples/ablation/diagnostics/tier1_20260702_131442.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Tier-1 English retrieval benchmark — Synaptic

- Run at: 2026-07-02 13:14:42 KST
- Subset: 50
- Corpus limit: 1000000
- Ingest batch: 20000
- Progress every: 100000
- SQLite DB path: tests/benchmark/data/msmarco_1m.db
- SQLite DB reuse: yes
- Embedder: none (FTS-only baseline)
- Reranker: none
- Decomposer: none
- Phrase hub (inline): none
- Entity linker (post-hoc): none
- Engine: `graph.search()` default (EvidenceSearch)

| Dataset | Docs | Queries | MRR@10 | R@5 | R@10 | Hit@10 | Build | Search |
|---------|-----:|--------:|-------:|----:|-----:|-------:|------:|-------:|
| MS MARCO passage dev | 1000000 | 50 | 0.479 | 0.553 | 0.600 | 31/50 | 0.0s | 7.5s |

## Context

- **MS MARCO passage dev** — published baseline: BEIR/MS MARCO passage: ~8.8M source passages; JSONL shard
13 changes: 13 additions & 0 deletions examples/ablation/download_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,15 @@ def main() -> None:
f"msmarco_passage (default: {MSMARCO_DEFAULT_CORPUS_LIMIT:,})."
),
)
p.add_argument(
"--large-output-suffix",
default="",
help=(
"Optional suffix for large JSONL-sharded output manifests, e.g. "
"'_5m' writes msmarco_passage_5m.json and "
"msmarco_passage_5m.corpus.jsonl without replacing the default 1M shard."
),
)
args = p.parse_args()

names = [n.strip() for n in args.only.split(",") if n.strip()]
Expand All @@ -521,6 +530,10 @@ def main() -> None:
if name in LARGE_BUILDERS:
builder, filename = LARGE_BUILDERS[name]
out_path = OUT_DIR / filename
if args.large_output_suffix:
out_path = out_path.with_name(
f"{out_path.stem}{args.large_output_suffix}{out_path.suffix}"
)
print(f"\n=== {name} ===")
builder(out_path, corpus_limit=args.large_corpus_limit)
continue
Expand Down
39 changes: 30 additions & 9 deletions examples/ablation/run_tier1_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,26 @@ class Dataset:
CorpusItem = tuple[str, str, str]


def _dataset_key_map(msmarco_path: Path | None = None) -> dict[str, Dataset]:
    """Build the lookup from CLI dataset keys to ``Dataset`` entries.

    Each key is bound to the ``DATASETS`` entry at a fixed position, so the
    order of ``DATASETS`` is significant here.

    Args:
        msmarco_path: Optional manifest path. When given, the ``"msmarco"``
            entry is replaced by a new ``Dataset`` that keeps the default
            entry's ``name`` and ``reference`` but points at this path.

    Returns:
        A fresh dict keyed by the short dataset names.
    """
    # Position in this tuple == index into DATASETS.
    ordered_keys = (
        "hotpotqa",
        "musique",
        "2wiki",
        "fiqa",
        "trec_covid",
        "scifact",
        "msmarco",
    )
    mapping = {key: DATASETS[position] for position, key in enumerate(ordered_keys)}

    if msmarco_path is None:
        return mapping

    default_entry = mapping["msmarco"]
    mapping["msmarco"] = Dataset(
        name=default_entry.name,
        path=msmarco_path,
        reference=default_entry.reference,
    )
    return mapping


def _selected_gold_doc_ids(
qrels: dict,
query_items: list[tuple[str, str]],
Expand Down Expand Up @@ -748,6 +768,15 @@ async def amain(argv: list[str]) -> int:
"--use-sqlite-graph to keep a built large-corpus index."
),
)
p.add_argument(
"--msmarco-path",
type=Path,
default=None,
help=(
"Override the MS MARCO manifest path, e.g. "
"tests/benchmark/data/msmarco_passage_5m.json for side-by-side large tiers."
),
)
p.add_argument(
"--reuse-sqlite-db",
action="store_true",
Expand Down Expand Up @@ -956,15 +985,7 @@ async def amain(argv: list[str]) -> int:
reranker = TEIReranker(base_url=args.reranker_url)
reranker_label = f"TEI cross-encoder @ {args.reranker_url}"

by_key = {
"hotpotqa": DATASETS[0],
"musique": DATASETS[1],
"2wiki": DATASETS[2],
"fiqa": DATASETS[3],
"trec_covid": DATASETS[4],
"scifact": DATASETS[5],
"msmarco": DATASETS[6],
}
by_key = _dataset_key_map(args.msmarco_path)
selected = []
for raw_key in args.only.split(","):
key = raw_key.strip()
Expand Down
13 changes: 13 additions & 0 deletions src/synaptic/backends/composite.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,19 @@ async def list_nodes(
) -> list[Node]:
return await self._graph.list_nodes(kind=kind, level=level, limit=limit)

async def list_nodes_by_tag(
    self,
    tag: str,
    *,
    kind: str | NodeKind | None = None,
    limit: int = 100,
) -> list[Node]:
    """Return nodes carrying ``tag``, preferring the graph backend's own lookup.

    Args:
        tag: Tag that must appear in a node's ``tags``.
        kind: Optional node kind forwarded to the graph backend.
        limit: Maximum row count forwarded to the graph backend.

    Returns:
        The graph backend's ``list_nodes_by_tag`` result when that method
        exists; otherwise the ``list_nodes`` result filtered by ``tag``.
    """
    native_lookup = getattr(self._graph, "list_nodes_by_tag", None)
    if not callable(native_lookup):
        # NOTE(review): in this fallback ``limit`` is applied by list_nodes
        # *before* the tag filter, so fewer than ``limit`` matches may come
        # back even when more tagged nodes exist.
        candidates = await self._graph.list_nodes(kind=kind, limit=limit)
        return [candidate for candidate in candidates if tag in (candidate.tags or [])]
    return await native_lookup(tag, kind=kind, limit=limit)

# --- Edge CRUD (all to Neo4j) ---

async def save_edge(self, edge: Edge) -> None:
Expand Down
18 changes: 18 additions & 0 deletions src/synaptic/backends/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,24 @@ async def list_nodes(
break
return result

async def list_nodes_by_tag(
    self,
    tag: str,
    *,
    kind: str | NodeKind | None = None,
    limit: int = 100,
) -> list[Node]:
    """Return up to ``limit`` stored nodes whose tags contain ``tag``.

    Nodes are visited in the iteration order of ``self._nodes``.

    Args:
        tag: Tag that must be present in ``node.tags``; a node whose
            ``tags`` is falsy is treated as having no tags.
        kind: When not ``None``, only nodes whose ``kind`` equals this
            value are returned.
        limit: Maximum number of nodes to return. A non-positive limit
            yields an empty list.

    Returns:
        The matching nodes, at most ``limit`` of them.
    """
    # Without this guard the append-then-check loop below would hand back
    # one node for ``limit=0`` (or a negative limit).
    if limit <= 0:
        return []

    result: list[Node] = []
    for node in self._nodes.values():
        if kind is not None and node.kind != kind:
            continue
        if tag not in (node.tags or []):
            continue
        result.append(node)
        if len(result) >= limit:
            break
    return result

async def get_nodes_batch(self, node_ids: list[str]) -> list[Node]:
return [self._nodes[nid] for nid in node_ids if nid in self._nodes]

Expand Down
20 changes: 20 additions & 0 deletions src/synaptic/backends/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,26 @@ async def list_nodes(
rows = await cur.fetchall()
return [_row_to_node(r) for r in rows]

async def list_nodes_by_tag(
    self,
    tag: str,
    *,
    kind: str | NodeKind | None = None,
    limit: int = 100,
) -> list[Node]:
    """Fetch nodes whose ``tags_json`` array contains ``tag``.

    The tag test is an exact-value match done in SQL through ``json_each``;
    rows come back ordered by ``updated_at`` descending.

    Args:
        tag: Tag value to look for.
        kind: Optional kind filter; bound as ``str(kind)``.
        limit: Value bound to the query's ``LIMIT`` placeholder.

    Returns:
        The selected rows converted with ``_row_to_node``.
    """
    db = self._db()

    # Clauses and their bind values are appended in lock-step so the
    # placeholder order always matches.
    clauses = ["EXISTS (SELECT 1 FROM json_each(syn_nodes.tags_json) WHERE value = ?)"]
    bindings: list[str | int] = [tag]
    if kind is not None:
        clauses.append("kind = ?")
        bindings.append(str(kind))

    query = (
        "SELECT * FROM syn_nodes WHERE "
        + " AND ".join(clauses)
        + " ORDER BY updated_at DESC LIMIT ?"
    )
    async with db.execute(query, [*bindings, limit]) as cursor:
        fetched = await cursor.fetchall()
    return [_row_to_node(row) for row in fetched]

async def find_nodes_by_property(
self, key: str, value: str, *, limit: int = 1000
) -> list[Node]:
Expand Down
21 changes: 14 additions & 7 deletions src/synaptic/extensions/query_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,18 +339,25 @@ async def _load_categories(self) -> list[tuple[str, str]]:
return self._category_cache

try:
nodes = await self._backend.list_nodes(
kind=NodeKind.CONCEPT,
limit=self._category_cache_limit,
)
list_by_tag = getattr(self._backend, "list_nodes_by_tag", None)
if callable(list_by_tag):
nodes = await list_by_tag(
"category",
kind=NodeKind.CONCEPT,
limit=self._category_cache_limit,
)
else:
nodes = await self._backend.list_nodes(
kind=NodeKind.CONCEPT,
limit=self._category_cache_limit,
)
nodes = [n for n in nodes if "category" in (n.tags or [])]
except Exception as exc:
logger.warning("query-anchor: failed to load categories — %s", exc)
self._category_cache = []
return []

pairs = [
(_nfc(n.title or ""), n.id) for n in nodes if n.title and "category" in (n.tags or [])
]
pairs = [(_nfc(n.title or ""), n.id) for n in nodes if n.title]
self._category_cache = pairs
logger.debug("query-anchor: cached %d category nodes", len(pairs))
return pairs
12 changes: 12 additions & 0 deletions tests/test_backend_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ async def test_list_with_limit(self, backend: MemoryBackend) -> None:
limited = await backend.list_nodes(limit=3)
assert len(limited) == 3

async def test_list_nodes_by_tag_filters_kind_and_limit(self, backend: MemoryBackend) -> None:
    """Tag lookup honours both the kind filter and the limit.

    Seeds two tagged nodes with no explicit kind, one node with a different
    tag and one tagged LESSON node, then expects only the first-saved tagged
    node when asking for CONCEPT nodes with ``limit=1``.
    """
    seed_specs = (
        {"id": "cat_a", "title": "A", "tags": ["category"]},
        {"id": "cat_b", "title": "B", "tags": ["category"]},
        {"id": "doc", "title": "Doc", "tags": ["document"]},
        {"id": "lesson", "title": "Lesson", "kind": NodeKind.LESSON, "tags": ["category"]},
    )
    for spec in seed_specs:
        await backend.save_node(Node(**spec))

    matched = await backend.list_nodes_by_tag("category", kind=NodeKind.CONCEPT, limit=1)

    matched_ids = [node.id for node in matched]
    assert matched_ids == ["cat_a"]


class TestMemoryBackendEdges:
async def test_save_and_get(self, backend: MemoryBackend) -> None:
Expand Down
21 changes: 21 additions & 0 deletions tests/test_backend_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,3 +525,24 @@ async def test_respects_limit(self, sqlite: SQLiteBackend) -> None:
for i in range(5):
await sqlite.save_node(Node(id=f"n{i}", properties={"k": "v"}))
assert len(await sqlite.find_nodes_by_property("k", "v", limit=3)) == 3


class TestListNodesByTag:
    """Checks for ``SQLiteBackend.list_nodes_by_tag``."""

    async def test_finds_exact_tag_with_kind_filter(self, sqlite: SQLiteBackend) -> None:
        """Only the node with the requested tag and the CONCEPT kind is returned."""
        await sqlite.save_node(Node(id="cat", title="Category", tags=["category"]))
        await sqlite.save_node(Node(id="doc", title="Doc", tags=["document"]))
        # Same tag but a different kind: must be excluded by the kind filter.
        lesson_node = Node(id="lesson", title="Lesson", kind=NodeKind.LESSON, tags=["category"])
        await sqlite.save_node(lesson_node)

        found = await sqlite.list_nodes_by_tag("category", kind=NodeKind.CONCEPT)

        found_ids = [node.id for node in found]
        assert found_ids == ["cat"]

    async def test_respects_limit(self, sqlite: SQLiteBackend) -> None:
        """With five tagged nodes stored, ``limit=3`` yields exactly three."""
        for index in range(5):
            tagged = Node(id=f"cat_{index}", title=str(index), tags=["category"])
            await sqlite.save_node(tagged)

        found = await sqlite.list_nodes_by_tag("category", limit=3)

        assert len(found) == 3
28 changes: 28 additions & 0 deletions tests/test_download_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,31 @@ def fake_load_dataset(repo, config=None, *, split, streaming=False):
assert manifest["corpus_size"] == 3
assert manifest["preserved_gold_docs"] == 1
assert [row["_id"] for row in rows] == ["3", "0", "1"]


def test_large_output_suffix_keeps_default_shard(monkeypatch, tmp_path):
    """``--large-output-suffix`` redirects the manifest path given to the builder.

    The real builder is swapped for a recorder so the test only observes
    which output path ``main()`` computes.
    """
    recorded_paths: list[Path] = []

    def record_builder(out_path, *, corpus_limit):
        # Stand-in for the large builder: remember where it was told to write.
        recorded_paths.append(out_path)

    monkeypatch.setattr(downloader, "OUT_DIR", tmp_path)
    monkeypatch.setitem(
        downloader.LARGE_BUILDERS,
        "msmarco_passage",
        (record_builder, "msmarco_passage.json"),
    )

    cli_args = [
        "download_benchmarks.py",
        "--only",
        "msmarco_passage",
        "--large-corpus-limit",
        "5000000",
        "--large-output-suffix",
        "_5m",
    ]
    monkeypatch.setattr(sys, "argv", cli_args)

    downloader.main()

    expected_manifest = tmp_path / "msmarco_passage_5m.json"
    assert recorded_paths == [expected_manifest]
11 changes: 9 additions & 2 deletions tests/test_query_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,11 +231,16 @@ class _CountingBackend:
def __init__(self, inner: MemoryBackend) -> None:
    """Wrap ``inner`` and start both call counters at zero."""
    # Counters are bumped by list_nodes / list_nodes_by_tag respectively.
    self.list_calls = self.list_by_tag_calls = 0
    self._inner = inner

async def list_nodes(self, **kwargs):
    """Count the call, then forward the keyword arguments to the wrapped backend."""
    self.list_calls = self.list_calls + 1
    forwarded = await self._inner.list_nodes(**kwargs)
    return forwarded

async def list_nodes_by_tag(self, *args, **kwargs):
    """Count the call, then forward all arguments to the wrapped backend."""
    self.list_by_tag_calls = self.list_by_tag_calls + 1
    forwarded = await self._inner.list_nodes_by_tag(*args, **kwargs)
    return forwarded

# Anything not defined on the wrapper itself is looked up on the wrapped
# backend, so the extractor can call other backend methods unchanged.
def __getattr__(self, name: str):
    wrapped = self._inner
    return getattr(wrapped, name)
Expand All @@ -254,7 +259,8 @@ async def test_cache_reuse_across_calls(self):
await extractor.extract("규정 및 지침")
await extractor.extract("규정 및 지침")

assert backend.list_calls == 1
assert backend.list_calls == 0
assert backend.list_by_tag_calls == 1

async def test_invalidate_cache_forces_reload(self):
inner = MemoryBackend()
Expand All @@ -267,7 +273,8 @@ async def test_invalidate_cache_forces_reload(self):
extractor.invalidate_cache()
await extractor.extract("규정 및 지침")

assert backend.list_calls == 2
assert backend.list_calls == 0
assert backend.list_by_tag_calls == 2


# --- Integration: full anchor shape ---
Expand Down
8 changes: 8 additions & 0 deletions tests/test_tier1_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@
SPEC.loader.exec_module(runner)


def test_msmarco_path_override_retargets_dataset(tmp_path):
    """An explicit manifest path replaces the MS MARCO path but keeps its name."""
    override = tmp_path / "msmarco_passage_5m.json"

    msmarco_entry = runner._dataset_key_map(override)["msmarco"]
    default_entry = runner.DATASETS[6]

    assert msmarco_entry.path == override
    assert msmarco_entry.name == default_entry.name


@pytest.mark.asyncio
async def test_corpus_limit_keeps_selected_query_gold_docs(tmp_path):
path = tmp_path / "tiny_bench.json"
Expand Down
Loading