Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions examples/ablation/diagnostics/public_scale_20260702.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Manual large-tier shard from BEIR/MS MARCO passage validation:
| temp SQLite | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 1913.3s | 69.9s |
| persistent SQLite build | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 2184.3s | 71.0s |
| persistent SQLite reuse | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 0.0s | 70.1s |
| persistent SQLite reuse + English query filter | 1,000,000 | 50 | 0.479 | 0.553 | 0.600 | 31/50 | 0.0s | 9.1s |

The local artifacts are gitignored:

Expand Down Expand Up @@ -100,6 +101,10 @@ The local artifacts are gitignored:
- The persistent 1M DB is now built locally. A reuse run validates the sidecar,
reports 1,000,000 docs, skips ingest, and preserves identical quality while
reducing build time from 2184.3s to 0.0s.
- English query-term filtering removes high-frequency question glue words
(`how`, `is`, `the`, `of`, `to`, etc.) from the query before FTS5 `OR`
matching. On the persistent 1M DB this reduced 50-query search time from
70.1s to 9.1s while improving MRR@10 from 0.462 to 0.479.

## Guard Policy

Expand Down
116 changes: 116 additions & 0 deletions src/synaptic/backends/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,121 @@ def _get_kiwi():
}
)

# One ASCII word-like chunk: starts with a letter or digit and may continue
# with letters, digits, underscores, apostrophes, or hyphens (so "isn't" and
# "covid-19" stay in one piece while "?" and other punctuation are excluded).
_EN_QUERY_TOKEN = re.compile(r"[A-Za-z0-9][A-Za-z0-9_'-]*")

# Lower-case English words treated as query noise, grouped by role for
# readability only; membership is what matters.
_EN_QUERY_STOPWORDS: frozenset[str] = frozenset(
    (
        # Articles
        "a", "an", "the",
        # Conjunctions and comparatives
        "and", "as", "or", "than", "then",
        # Prepositions
        "about", "at", "by", "for", "from", "in", "into", "of", "on", "to", "with",
        # Auxiliary and modal verbs
        "am", "are", "be", "is", "was", "were",
        "can", "could", "did", "do", "does", "should", "will", "would",
        # Pronouns and determiners
        "i", "it", "that", "their", "them", "there", "they", "this", "you", "your",
        # Question words
        "how", "what", "when", "where", "which", "who", "why",
    )
)


def _prepare_fts_query_terms(terms: list[str]) -> list[str]:
    """Clean query-time FTS terms without changing indexed content.

    Natural English questions often include high-frequency glue words
    (``how/is/the/of/to``) and punctuation. With FTS5 ``OR`` queries on
    million-document corpora, those terms dominate candidate generation
    and BM25 sorting. Keep meaningful English tokens, preserve Korean
    and other non-ASCII terms, and fall back to the cleaned original
    terms when a query is entirely stopwords.

    Args:
        terms: Candidate query terms. Blank entries are ignored.

    Returns:
        Deduplicated terms in first-seen order. Pure-ASCII terms are
        split into ``_EN_QUERY_TOKEN`` chunks with trailing ``'``, ``_``
        and ``-`` removed (original casing kept); terms containing any
        non-ASCII character are returned verbatim after ``strip()``.
        Stopwords and single letters are dropped whenever at least one
        meaningful term exists; otherwise every cleaned term is returned
        so filtering alone never empties the query.
    """
    # (term, is_meaningful) pairs in first-seen order, deduplicated by key.
    candidates: list[tuple[str, bool]] = []
    seen: set[str] = set()

    for raw in terms:
        raw = raw.strip()
        if not raw:
            continue

        if not raw.isascii():
            # Korean, accented Latin, etc.: keep the term untouched and
            # always treat it as meaningful.
            key = raw.casefold()
            if key not in seen:
                seen.add(key)
                candidates.append((raw, True))
            continue

        for chunk in _EN_QUERY_TOKEN.findall(raw):
            # Emit exactly what the dedup key is derived from, so dangling
            # punctuation ("rock-", "dogs'") never reaches the FTS query.
            # A chunk always starts with a letter or digit, so the stripped
            # term is never empty.
            term = chunk.rstrip("'_-")
            key = term.lower()
            if key in seen:
                continue
            seen.add(key)
            # Single letters are noise, single digits are not.
            is_meaningful = key not in _EN_QUERY_STOPWORDS and (
                len(key) > 1 or key.isdigit()
            )
            candidates.append((term, is_meaningful))

    meaningful = [term for term, keep in candidates if keep]
    if meaningful:
        return meaningful
    # Stopword-only query (e.g. "to be"): searching for noise beats
    # searching for nothing.
    return [term for term, _ in candidates]


def _normalize_korean(text: str, *, query_mode: bool = False) -> str:
"""Normalize Korean text for FTS indexing/querying.
Expand Down Expand Up @@ -1314,6 +1429,7 @@ async def search_fts(
if any(c.isdigit() or ("a" <= c.lower() <= "z") for c in t):
terms.append(t)
term_seen.add(t)
terms = _prepare_fts_query_terms(terms)
if not terms:
return []

Expand Down
69 changes: 68 additions & 1 deletion tests/test_backend_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import pytest

from synaptic.backends.sqlite import SQLiteBackend
from synaptic.backends.sqlite import SQLiteBackend, _prepare_fts_query_terms
from synaptic.models import Edge, EdgeKind, Node, NodeKind


Expand Down Expand Up @@ -86,6 +86,73 @@ async def test_save_nodes_batch_refreshes_existing_fts_rows(
assert "batch_update" not in {node.id for node in old_results}
assert "batch_update" in {node.id for node in new_results}

def test_prepare_fts_query_terms_drops_english_question_noise(self) -> None:
    """Glue words and the trailing "?" are removed; term order is preserved."""
    question_terms = [
        "how", "many", "years", "did", "william", "bradford",
        "serve", "as", "governor", "of", "plymouth", "colony?",
    ]
    expected = [
        "many", "years", "william", "bradford",
        "serve", "governor", "plymouth", "colony",
    ]

    assert _prepare_fts_query_terms(question_terms) == expected

def test_prepare_fts_query_terms_keeps_stopword_only_query(self) -> None:
    """A query made only of stopwords comes back intact rather than empty."""
    stopword_only = ["to", "be"]

    prepared = _prepare_fts_query_terms(stopword_only)

    assert prepared == ["to", "be"]

def test_prepare_fts_query_terms_preserves_korean_terms(self) -> None:
    """Korean terms pass through unchanged and in their original order."""
    korean_terms = ["경마산업", "관리", "규정"]

    prepared = _prepare_fts_query_terms(korean_terms)

    assert prepared == ["경마산업", "관리", "규정"]

def test_prepare_fts_query_terms_preserves_non_ascii_latin_terms(self) -> None:
    """An accented Latin term is kept verbatim next to a plain ASCII term."""
    mixed_terms = ["café", "prices"]

    prepared = _prepare_fts_query_terms(mixed_terms)

    assert prepared == ["café", "prices"]

async def test_search_fts_filters_english_question_noise(
    self,
    sqlite: SQLiteBackend,
) -> None:
    """The on-topic node ranks first for a natural-language question.

    The second node consists mostly of the question's glue words, so it
    is the one that would benefit if those words were still searched.
    """
    relevant = Node(
        id="plymouth",
        title="William Bradford",
        content="William Bradford served as governor of Plymouth Colony.",
    )
    distractor = Node(
        id="question_glue",
        title="Question Glue",
        content="how did is the of to as many generic words",
    )
    await sqlite.save_nodes_batch([relevant, distractor])

    results = await sqlite.search_fts(
        "how many years did william bradford serve as governor of plymouth colony?"
    )

    ranked_ids = [node.id for node in results]
    # Slice instead of indexing so an empty result fails the assertion
    # rather than raising IndexError.
    assert ranked_ids[:1] == ["plymouth"]


class TestSQLiteEdges:
async def test_save_and_get(self, sqlite: SQLiteBackend) -> None:
Expand Down
Loading