From d9fded3f839592becb643cc61278e717159d74f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= Date: Thu, 2 Jul 2026 13:06:49 +0900 Subject: [PATCH] Filter English FTS query noise --- .../diagnostics/public_scale_20260702.md | 5 + src/synaptic/backends/sqlite.py | 116 ++++++++++++++++++ tests/test_backend_sqlite.py | 69 ++++++++++- 3 files changed, 189 insertions(+), 1 deletion(-) diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md index 4d82515..6fb3200 100644 --- a/examples/ablation/diagnostics/public_scale_20260702.md +++ b/examples/ablation/diagnostics/public_scale_20260702.md @@ -73,6 +73,7 @@ Manual large-tier shard from BEIR/MS MARCO passage validation: | temp SQLite | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 1913.3s | 69.9s | | persistent SQLite build | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 2184.3s | 71.0s | | persistent SQLite reuse | 1,000,000 | 50 | 0.462 | 0.543 | 0.580 | 30/50 | 0.0s | 70.1s | +| persistent SQLite reuse + English query filter | 1,000,000 | 50 | 0.479 | 0.553 | 0.600 | 31/50 | 0.0s | 9.1s | The local artifacts are gitignored: @@ -100,6 +101,10 @@ The local artifacts are gitignored: - The persistent 1M DB is now built locally. A reuse run validates the sidecar, reports 1,000,000 docs, skips ingest, and preserves identical quality while reducing build time from 2184.3s to 0.0s. +- English query-term filtering removes high-frequency question glue + (`how/is/the/of/to` etc.) before FTS5 `OR` matching. On the persistent 1M + DB this reduced 50-query search time from 70.1s to 9.1s while improving + MRR@10 from 0.462 to 0.479. ## Guard Policy diff --git a/src/synaptic/backends/sqlite.py b/src/synaptic/backends/sqlite.py index 12b2fe2..2e5f20d 100644 --- a/src/synaptic/backends/sqlite.py +++ b/src/synaptic/backends/sqlite.py @@ -96,6 +96,121 @@ def _get_kiwi(): } ) +_EN_QUERY_TOKEN = re.compile(r"[A-Za-z0-9][A-Za-z0-9_'-]*") +_EN_QUERY_STOPWORDS: frozenset[str] = frozenset( + { + "a", + "about", + "am", + "an", + "and", + "are", + "as", + "at", + "be", + "by", + "can", + "could", + "did", + "do", + "does", + "for", + "from", + "how", + "i", + "in", + "into", + "is", + "it", + "of", + "on", + "or", + "should", + "than", + "that", + "the", + "their", + "them", + "then", + "there", + "they", + "this", + "to", + "was", + "were", + "what", + "when", + "where", + "which", + "who", + "why", + "will", + "with", + "would", + "you", + "your", + } +) + + +def _prepare_fts_query_terms(terms: list[str]) -> list[str]: + """Clean query-time FTS terms without changing indexed content. + + Natural English questions often include high-frequency glue words + (``how/is/the/of/to``) and punctuation. With FTS5 ``OR`` queries on + million-document corpora, those terms dominate candidate generation + and BM25 sorting. Keep meaningful English tokens, preserve Korean + and other non-ASCII terms, and fall back to the cleaned original + terms when a query is entirely stopwords. + """ + cleaned: list[str] = [] + meaningful_keys: set[str] = set() + seen: set[str] = set() + + for raw in terms: + raw = raw.strip() + if not raw: + continue + if any(ord(c) > 127 for c in raw): + key = raw.casefold() + if key not in seen: + seen.add(key) + cleaned.append(raw) + meaningful_keys.add(key) + continue + chunks = _EN_QUERY_TOKEN.findall(raw) + if chunks: + for chunk in chunks: + key = chunk.lower().strip("'_-") + if not key or key in seen: + continue + seen.add(key) + cleaned.append(chunk) + if key not in _EN_QUERY_STOPWORDS and ( + len(key) > 1 or any(c.isdigit() for c in key) + ): + meaningful_keys.add(key) + continue + + if not meaningful_keys: + return cleaned + + filtered: list[str] = [] + seen_filtered: set[str] = set() + for term in cleaned: + if _EN_QUERY_TOKEN.fullmatch(term): + key = term.lower().strip("'_-") + if key in _EN_QUERY_STOPWORDS: + continue + if len(key) <= 1 and not any(c.isdigit() for c in key): + continue + else: + key = term.casefold() + if key and key not in seen_filtered: + seen_filtered.add(key) + filtered.append(term) + return filtered or cleaned + def _normalize_korean(text: str, *, query_mode: bool = False) -> str: """Normalize Korean text for FTS indexing/querying. @@ -1314,6 +1429,7 @@ async def search_fts( if any(c.isdigit() or ("a" <= c.lower() <= "z") for c in t): terms.append(t) term_seen.add(t) + terms = _prepare_fts_query_terms(terms) if not terms: return [] diff --git a/tests/test_backend_sqlite.py b/tests/test_backend_sqlite.py index ea68032..1157f2c 100644 --- a/tests/test_backend_sqlite.py +++ b/tests/test_backend_sqlite.py @@ -7,7 +7,7 @@ import pytest -from synaptic.backends.sqlite import SQLiteBackend +from synaptic.backends.sqlite import SQLiteBackend, _prepare_fts_query_terms from synaptic.models import Edge, EdgeKind, Node, NodeKind @@ -86,6 +86,73 @@ async def test_save_nodes_batch_refreshes_existing_fts_rows( assert "batch_update" not in {node.id for node in old_results} assert "batch_update" in {node.id for node in new_results} + def test_prepare_fts_query_terms_drops_english_question_noise(self) -> None: + terms = _prepare_fts_query_terms( + [ + "how", + "many", + "years", + "did", + "william", + "bradford", + "serve", + "as", + "governor", + "of", + "plymouth", + "colony?", + ] + ) + + assert terms == [ + "many", + "years", + "william", + "bradford", + "serve", + "governor", + "plymouth", + "colony", + ] + + def test_prepare_fts_query_terms_keeps_stopword_only_query(self) -> None: + assert _prepare_fts_query_terms(["to", "be"]) == ["to", "be"] + + def test_prepare_fts_query_terms_preserves_korean_terms(self) -> None: + assert _prepare_fts_query_terms(["경마산업", "관리", "규정"]) == [ + "경마산업", + "관리", + "규정", + ] + + def test_prepare_fts_query_terms_preserves_non_ascii_latin_terms(self) -> None: + assert _prepare_fts_query_terms(["café", "prices"]) == ["café", "prices"] + + async def test_search_fts_filters_english_question_noise( + self, + sqlite: SQLiteBackend, + ) -> None: + await sqlite.save_nodes_batch( + [ + Node( + id="plymouth", + title="William Bradford", + content="William Bradford served as governor of Plymouth Colony.", + ), + Node( + id="question_glue", + title="Question Glue", + content="how did is the of to as many generic words", + ), + ] + ) + + results = await sqlite.search_fts( + "how many years did william bradford serve as governor of plymouth colony?" + ) + + assert [node.id for node in results][:1] == ["plymouth"] + class TestSQLiteEdges: async def test_save_and_get(self, sqlite: SQLiteBackend) -> None: