PlateerLab · SonAIengine · Jul 2, 2026 · Jul 2, 2026
diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md
@@ -73,6 +73,7 @@ Manual large-tier shard from BEIR/MS MARCO passage validation:
 | temp SQLite | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 1913.3s | 69.9s |
 | persistent SQLite build | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 2184.3s | 71.0s |
 | persistent SQLite reuse | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 0.0s | 70.1s |
+| persistent SQLite reuse + English query filter | 1,000,000 | 50 | 0.479 | 0.553 | 0.600 | 31/50 | 0.0s | 9.1s |
 
 The local artifacts are gitignored:
 
@@ -100,6 +101,10 @@ The local artifacts are gitignored:
 - The persistent 1M DB is now built locally. A reuse run validates the sidecar,
   reports 1,000,000 docs, skips ingest, and preserves identical quality while
   reducing build time from 2184.3s to 0.0s.
+- English query-term filtering removes high-frequency question glue
+  (`how/is/the/of/to` etc.) before FTS5 `OR` matching. On the persistent 1M
+  DB this reduced 50-query search time from 70.1s to 9.1s while improving
+  MRR@10 from 0.462 to 0.479.
 
 ## Guard Policy
 

diff --git a/src/synaptic/backends/sqlite.py b/src/synaptic/backends/sqlite.py
@@ -96,6 +96,121 @@ def _get_kiwi():
     }
 )
 
+# One ASCII word-like token: it starts with a letter or digit and continues
+# with letters, digits, underscores, apostrophes, or hyphens.
+_EN_QUERY_TOKEN = re.compile(r"[A-Za-z0-9][A-Za-z0-9_'-]*")
+# Lowercase English function and question words that
+# ``_prepare_fts_query_terms`` drops from a query whenever at least one
+# meaningful term remains.
+_EN_QUERY_STOPWORDS: frozenset[str] = frozenset(
+    {
+        "a",
+        "about",
+        "am",
+        "an",
+        "and",
+        "are",
+        "as",
+        "at",
+        "be",
+        "by",
+        "can",
+        "could",
+        "did",
+        "do",
+        "does",
+        "for",
+        "from",
+        "how",
+        "i",
+        "in",
+        "into",
+        "is",
+        "it",
+        "of",
+        "on",
+        "or",
+        "should",
+        "than",
+        "that",
+        "the",
+        "their",
+        "them",
+        "then",
+        "there",
+        "they",
+        "this",
+        "to",
+        "was",
+        "were",
+        "what",
+        "when",
+        "where",
+        "which",
+        "who",
+        "why",
+        "will",
+        "with",
+        "would",
+        "you",
+        "your",
+    }
+)
+
+
+def _prepare_fts_query_terms(terms: list[str]) -> list[str]:
+    """Drop English question noise from FTS query terms.
+
+    Only the query is rewritten; indexed content is not touched. Natural
+    English questions carry high-frequency glue words
+    (``how/is/the/of/to``) and punctuation, and with FTS5 ``OR`` queries
+    on million-document corpora those terms dominate candidate
+    generation and BM25 sorting.
+
+    Each term is whitespace-stripped, then handled by one of two rules:
+
+    * A term with any non-ASCII character (Korean, accented Latin, ...)
+      is kept whole and always counts as meaningful.
+    * An ASCII-only term is reduced to its ``_EN_QUERY_TOKEN`` matches,
+      which discards characters outside that pattern such as ``?``. A
+      match counts as meaningful unless it is a stopword or a single
+      non-digit character.
+
+    Duplicates are removed case-insensitively and the first spelling
+    seen is the one returned.
+
+    The result is the meaningful terms in input order. When there are
+    none, every cleaned term is returned instead, so a stopword-only
+    query such as ``to be`` still has something to match.
+    """
+    cleaned_terms: list[str] = []
+    meaningful_terms: list[str] = []
+    seen_keys: set[str] = set()
+
+    for term in terms:
+        text = term.strip()
+        if not text:
+            continue
+
+        if not text.isascii():
+            folded = text.casefold()
+            if folded not in seen_keys:
+                seen_keys.add(folded)
+                cleaned_terms.append(text)
+                meaningful_terms.append(text)
+            continue
+
+        for token in _EN_QUERY_TOKEN.findall(text):
+            # Edge apostrophes, underscores, and hyphens are ignored when
+            # comparing tokens, but the token itself is emitted as matched.
+            key = token.lower().strip("'_-")
+            if not key or key in seen_keys:
+                continue
+            seen_keys.add(key)
+            cleaned_terms.append(token)
+
+            is_lone_non_digit = len(key) <= 1 and not any(c.isdigit() for c in key)
+            if key not in _EN_QUERY_STOPWORDS and not is_lone_non_digit:
+                meaningful_terms.append(token)
+
+    return meaningful_terms or cleaned_terms
+
 
 def _normalize_korean(text: str, *, query_mode: bool = False) -> str:
     """Normalize Korean text for FTS indexing/querying.
@@ -1314,6 +1429,7 @@ async def search_fts(
             if any(c.isdigit() or ("a" <= c.lower() <= "z") for c in t):
                 terms.append(t)
                 term_seen.add(t)
+        terms = _prepare_fts_query_terms(terms)
         if not terms:
             return []
 

diff --git a/tests/test_backend_sqlite.py b/tests/test_backend_sqlite.py
@@ -7,7 +7,7 @@
 
 import pytest
 
-from synaptic.backends.sqlite import SQLiteBackend
+from synaptic.backends.sqlite import SQLiteBackend, _prepare_fts_query_terms
 from synaptic.models import Edge, EdgeKind, Node, NodeKind
 
 
@@ -86,6 +86,73 @@ async def test_save_nodes_batch_refreshes_existing_fts_rows(
         assert "batch_update" not in {node.id for node in old_results}
         assert "batch_update" in {node.id for node in new_results}
 
+    def test_prepare_fts_query_terms_drops_english_question_noise(self) -> None:
+        """Glue words are dropped and the trailing ``?`` is trimmed, order kept."""
+        question_terms = [
+            "how",
+            "many",
+            "years",
+            "did",
+            "william",
+            "bradford",
+            "serve",
+            "as",
+            "governor",
+            "of",
+            "plymouth",
+            "colony?",
+        ]
+        expected = [
+            "many",
+            "years",
+            "william",
+            "bradford",
+            "serve",
+            "governor",
+            "plymouth",
+            "colony",
+        ]
+
+        assert _prepare_fts_query_terms(question_terms) == expected
+
+    def test_prepare_fts_query_terms_keeps_stopword_only_query(self) -> None:
+        """With no meaningful term left, the stopwords themselves are returned."""
+        stopword_only = ["to", "be"]
+
+        assert _prepare_fts_query_terms(stopword_only) == ["to", "be"]
+
+    def test_prepare_fts_query_terms_preserves_korean_terms(self) -> None:
+        """Korean terms come back unchanged and in their original order."""
+        korean_terms = ["경마산업", "관리", "규정"]
+        result = _prepare_fts_query_terms(korean_terms)
+
+        assert result == ["경마산업", "관리", "규정"]
+
+    def test_prepare_fts_query_terms_preserves_non_ascii_latin_terms(self) -> None:
+        """An accented Latin term is kept whole next to a plain ASCII one."""
+        result = _prepare_fts_query_terms(["café", "prices"])
+
+        assert result == ["café", "prices"]
+
+    async def test_search_fts_filters_english_question_noise(
+        self,
+        sqlite: SQLiteBackend,
+    ) -> None:
+        """The answer node ranks first, ahead of a node of mostly glue words."""
+        answer_node = Node(
+            id="plymouth",
+            title="William Bradford",
+            content="William Bradford served as governor of Plymouth Colony.",
+        )
+        glue_node = Node(
+            id="question_glue",
+            title="Question Glue",
+            content="how did is the of to as many generic words",
+        )
+        await sqlite.save_nodes_batch([answer_node, glue_node])
+
+        question = (
+            "how many years did william bradford serve as governor of plymouth colony?"
+        )
+        hits = await sqlite.search_fts(question)
+
+        # Slice rather than index so an empty result fails the assertion
+        # instead of raising IndexError.
+        assert [hit.id for hit in hits][:1] == ["plymouth"]
+
 
 class TestSQLiteEdges:
     async def test_save_and_get(self, sqlite: SQLiteBackend) -> None: