diff --git a/examples/ablation/diagnostics/agent_loop_deepseek_limit10_targeted_20260702.md b/examples/ablation/diagnostics/agent_loop_deepseek_limit10_targeted_20260702.md new file mode 100644 index 0000000..d4db788 --- /dev/null +++ b/examples/ablation/diagnostics/agent_loop_deepseek_limit10_targeted_20260702.md @@ -0,0 +1,27 @@ +# DeepSeek agent loop targeted limit-10 check + +- Run at: 2026-07-02 20:47 KST +- Model: `deepseek-v4-flash` +- Base corpus: `tests/benchmark/data/msmarco_passage_full.json` +- SQLite DB: `tests/benchmark/data/msmarco_full.db` +- Corpus limit: 8,841,823 passages +- Target set: six high-call misses from the prior 50-query DeepSeek run +- Change under test: expose `limit`/`read_top_k` to the agent tool schema and widen the default `deep_search` evidence pool from 5 to 10. + +## Result + +On the targeted high-call miss set, the number of queries that reached a relevant passage improved from `0/6` in the prior 50-query run to `2/6` with the wider `deep_search` evidence pool. + +| QID | Query | Prior 50-query run | Limit-10 targeted run | First relevant | +| --- | --- | ---: | ---: | --- | +| 54544 | blood diseases that are sexually transmitted | no | no | - | +| 293992 | how many product lines does coca cola have | no | yes | turn 2 / call 3 | +| 208145 | how bicycle tire tubes are sized | no | no | - | +| 14151 | age requirements for name change | no | yes | turn 1 / call 1 | +| 91711 | child psychiatrist salary 2016 | no | no | - | +| 237373 | how is soil created from rocks | no | no | - | + +## Interpretation + +This is not a full replacement for the 50-query gate, but it supports the suspected bottleneck behind the high-call misses: some questions (2 of the 6 here) needed a wider first evidence pool rather than more repeated follow-up searches. The change keeps a hard runtime cap (`limit <= 20`, `read_top_k <= 5`) so broad searches can recover more candidates without letting the LLM request unbounded context. 
+ diff --git a/src/synaptic/agent_loop.py b/src/synaptic/agent_loop.py index dcd92bb..6be9ac4 100644 --- a/src/synaptic/agent_loop.py +++ b/src/synaptic/agent_loop.py @@ -219,6 +219,20 @@ def _is_enumeration_query(query: str) -> bool: "properties": { "query": {"type": "string"}, "category": {"type": "string"}, + "limit": { + "type": "integer", + "description": ( + "Evidence items to return; use 10-20 for broad public-web " + "questions where the first few hits may miss the answer." + ), + }, + "read_top_k": { + "type": "integer", + "description": ( + "How many top parent documents to read; keep 1-3 unless " + "the snippets are inconclusive." + ), + }, }, "required": ["query"], }, @@ -231,7 +245,13 @@ def _is_enumeration_query(query: str) -> bool: "description": "Basic text search. Returns top candidate nodes.", "parameters": { "type": "object", - "properties": {"query": {"type": "string"}}, + "properties": { + "query": {"type": "string"}, + "limit": { + "type": "integer", + "description": "Candidate evidence items to return; capped by the runtime.", + }, + }, "required": ["query"], }, }, @@ -456,6 +476,14 @@ class AgentSearchResult: # --- Internals ----------------------------------------------------- +def _bounded_int(value: Any, *, default: int, minimum: int, maximum: int) -> int: + try: + parsed = int(value) + except (TypeError, ValueError): + parsed = default + return max(minimum, min(parsed, maximum)) + + async def _dispatch_tool( name: str, args: dict, @@ -486,10 +514,18 @@ async def _dispatch_tool( session, args.get("query", ""), category=args.get("category"), + limit=_bounded_int(args.get("limit"), default=10, minimum=1, maximum=20), + read_top_k=_bounded_int(args.get("read_top_k"), default=2, minimum=0, maximum=5), embedder=embedder, ) elif name == "search": - r = await search_tool(backend, session, args.get("query", ""), embedder=embedder) + r = await search_tool( + backend, + session, + args.get("query", ""), + limit=_bounded_int(args.get("limit"), 
default=10, minimum=1, maximum=20), + embedder=embedder, + ) elif name == "expand": r = await expand_tool( backend, diff --git a/src/synaptic/agent_tools_v2.py b/src/synaptic/agent_tools_v2.py index f1acf8d..e9e5238 100644 --- a/src/synaptic/agent_tools_v2.py +++ b/src/synaptic/agent_tools_v2.py @@ -43,12 +43,20 @@ logger = logging.getLogger("agent-tools-v2") +def _bounded_int(value: object, *, default: int, minimum: int, maximum: int) -> int: + try: + parsed = int(value) + except (TypeError, ValueError): + parsed = default + return max(minimum, min(parsed, maximum)) + + async def deep_search_tool( backend: StorageBackend, session: SearchSession, query: str, *, - limit: int = 5, + limit: int = 10, category: str | None = None, read_top_k: int = 2, embedder: object | None = None, @@ -74,6 +82,8 @@ async def deep_search_tool( embedder: Optional embedder for EvidenceSearch. reranker: Optional cross-encoder reranker. """ + limit = _bounded_int(limit, default=10, minimum=1, maximum=20) + read_top_k = _bounded_int(read_top_k, default=2, minimum=0, maximum=5) budget = _budget_check(session, "deep_search") if budget is not None: return budget diff --git a/tests/test_agent_efficiency.py b/tests/test_agent_efficiency.py index 93b41b9..a3aa524 100644 --- a/tests/test_agent_efficiency.py +++ b/tests/test_agent_efficiency.py @@ -15,10 +15,14 @@ from synaptic.agent_loop import ( _EFFICIENCY_DIRECTIVE, AGENT_SYSTEM, + AGENT_TOOLS, _args_key, + _bounded_int, + _dispatch_tool, _result_count, run_agent_loop, ) +from synaptic.agent_tools import ToolResult from synaptic.backends.sqlite_graph import SqliteGraphBackend from synaptic.models import ConsolidationLevel, Node, NodeKind @@ -45,6 +49,72 @@ def test_agent_system_preserves_query_constraints_for_followups(): assert "named entity, attribute, and relation" in AGENT_SYSTEM +def _tool_schema(name: str) -> dict: + for tool in AGENT_TOOLS: + fn = tool["function"] + if fn["name"] == name: + return fn + raise AssertionError(f"missing tool 
schema: {name}") + + +def test_agent_tool_schema_exposes_search_limits(): + deep_props = _tool_schema("deep_search")["parameters"]["properties"] + assert {"query", "category", "limit", "read_top_k"}.issubset(deep_props) + + search_props = _tool_schema("search")["parameters"]["properties"] + assert {"query", "limit"}.issubset(search_props) + + +def test_bounded_int_clamps_tool_limits(): + assert _bounded_int("15", default=10, minimum=1, maximum=20) == 15 + assert _bounded_int(" 15 ", default=10, minimum=1, maximum=20) == 15 + assert _bounded_int(None, default=10, minimum=1, maximum=20) == 10 + assert _bounded_int("invalid", default=10, minimum=1, maximum=20) == 10 + assert _bounded_int(99, default=10, minimum=1, maximum=20) == 20 + assert _bounded_int(-5, default=10, minimum=1, maximum=20) == 1 + + +@pytest.mark.asyncio +async def test_dispatch_passes_agent_requested_search_limits(monkeypatch): + import synaptic.agent_tools as agent_tools + import synaptic.agent_tools_v2 as agent_tools_v2 + + calls: list[tuple] = [] + + async def fake_deep_search_tool( + backend, + session, + query, + *, + category=None, + limit=10, + read_top_k=2, + embedder=None, + ): + calls.append(("deep_search", query, category, limit, read_top_k)) + return ToolResult(tool="deep_search", ok=True, data={"evidence": []}) + + async def fake_search_tool(backend, session, query, *, limit=10, embedder=None): + calls.append(("search", query, limit)) + return ToolResult(tool="search", ok=True, data={"evidence": []}) + + monkeypatch.setattr(agent_tools_v2, "deep_search_tool", fake_deep_search_tool) + monkeypatch.setattr(agent_tools, "search_tool", fake_search_tool) + + await _dispatch_tool( + "deep_search", + {"query": "q", "category": "cat", "limit": 99, "read_top_k": 9}, + None, + None, + ) + await _dispatch_tool("search", {"query": "q", "limit": -5}, None, None) + + assert calls == [ + ("deep_search", "q", "cat", 20, 5), + ("search", "q", 1), + ] + + # --- fake client (reused shape) 
--------------------------------------- diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py index f218f7c..1e28858 100644 --- a/tests/test_agent_tools.py +++ b/tests/test_agent_tools.py @@ -14,6 +14,7 @@ import pytest +import synaptic.agent_tools_v2 as tools_v2 from synaptic.agent_tools import ( ToolResult, count_tool, @@ -274,6 +275,139 @@ async def test_search_hints_on_empty(self): assert len(result.hints) > 0 +# --- deep_search_tool --- + + +@pytest.mark.asyncio +async def test_deep_search_defaults_to_wider_evidence_pool(monkeypatch): + captured_limits: list[int] = [] + + async def fake_search_tool( + backend, + session, + query, + *, + limit, + category=None, + embedder=None, + **kwargs, + ): + captured_limits.append(limit) + return ToolResult( + tool="search", + ok=True, + data={"evidence": [], "anchors": {}}, + session=session.summary(), + ) + + monkeypatch.setattr(tools_v2, "search_tool", fake_search_tool) + backend = MemoryBackend() + await backend.connect() + + result = await tools_v2.deep_search_tool(backend, SearchSession(), "broad question") + + assert result.ok is True + assert captured_limits == [10] + + +@pytest.mark.asyncio +async def test_deep_search_caps_evidence_pool(monkeypatch): + captured_limits: list[int] = [] + + async def fake_search_tool( + backend, + session, + query, + *, + limit, + category=None, + embedder=None, + **kwargs, + ): + captured_limits.append(limit) + return ToolResult( + tool="search", + ok=True, + data={"evidence": [], "anchors": {}}, + session=session.summary(), + ) + + monkeypatch.setattr(tools_v2, "search_tool", fake_search_tool) + backend = MemoryBackend() + await backend.connect() + + result = await tools_v2.deep_search_tool( + backend, + SearchSession(), + "broad question", + limit=99, + read_top_k="invalid", + ) + + assert result.ok is True + assert captured_limits == [20] + + +@pytest.mark.asyncio +async def test_deep_search_caps_document_reads(monkeypatch): + document_ids: list[str] = [] + + async 
def fake_search_tool( + backend, + session, + query, + *, + limit, + category=None, + embedder=None, + **kwargs, + ): + return ToolResult( + tool="search", + ok=True, + data={ + "evidence": [ + {"id": f"chunk_{idx}", "document_id": f"doc_{idx}"} for idx in range(6) + ], + "anchors": {}, + }, + session=session.summary(), + ) + + async def fake_expand(backend, session, node_id): + return ToolResult( + tool="expand", + ok=True, + data={"seed": {"id": node_id}, "neighbours": []}, + session=session.summary(), + ) + + async def fake_get_doc(backend, session, doc_id, query): + document_ids.append(doc_id) + return ToolResult( + tool="get_document", + ok=True, + data={"document": {"id": doc_id}, "chunks": [], "chunk_count": 0}, + session=session.summary(), + ) + + monkeypatch.setattr(tools_v2, "search_tool", fake_search_tool) + monkeypatch.setattr(tools_v2, "_safe_expand", fake_expand) + monkeypatch.setattr(tools_v2, "_safe_get_doc", fake_get_doc) + backend = MemoryBackend() + await backend.connect() + + result = await tools_v2.deep_search_tool( + backend, + SearchSession(), + "broad question", + read_top_k=99, + ) + + assert result.ok is True + assert document_ids == ["doc_0", "doc_1", "doc_2", "doc_3", "doc_4"] + + # --- expand_tool ---