From aa0f3f319b023bc6390d2591a680a1d97944c27b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= Date: Thu, 2 Jul 2026 19:13:27 +0900 Subject: [PATCH] Document DeepSeek agent loop benchmark path --- .../ablation/diagnostics/public_scale_20260702.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md index 0ba80a1..7a6430c 100644 --- a/examples/ablation/diagnostics/public_scale_20260702.md +++ b/examples/ablation/diagnostics/public_scale_20260702.md @@ -43,9 +43,15 @@ PYTHONUNBUFFERED=1 SYNAPTIC_SQLITE_FTS_AND_FIRST_THRESHOLD=20 uv run --extra sql PYTHONUNBUFFERED=1 SYNAPTIC_SQLITE_FTS_AND_FIRST_THRESHOLD=20 SYNAPTIC_SQLITE_FTS_LEXICAL_RERANK_POOL=500 uv run --extra sqlite python examples/ablation/run_tier1_benchmarks.py --only msmarco --subset 500 --msmarco-path tests/benchmark/data/msmarco_passage_full.json --corpus-limit 8841823 --use-sqlite-graph --sqlite-db-path tests/benchmark/data/msmarco_full.db --reuse-sqlite-db PYTHONUNBUFFERED=1 SYNAPTIC_SQLITE_FTS_AND_FIRST_THRESHOLD=20 SYNAPTIC_SQLITE_FTS_LEXICAL_RERANK_POOL=500 uv run python examples/ablation/run_agent_search_benchmarks.py --subset 50 --msmarco-path tests/benchmark/data/msmarco_passage_full.json --corpus-limit 8841823 --sqlite-db-path tests/benchmark/data/msmarco_full.db --modes graph_search,deep_search,scripted_session --result-limit 20 --tool-limit 10 --read-top-k 0 --scripted-turns 2 PYTHONUNBUFFERED=1 SYNAPTIC_SQLITE_FTS_AND_FIRST_THRESHOLD=20 SYNAPTIC_SQLITE_FTS_LEXICAL_RERANK_POOL=500 uv run python examples/ablation/run_agent_search_benchmarks.py --subset 10 --msmarco-path tests/benchmark/data/msmarco_passage_full.json --corpus-limit 8841823 --sqlite-db-path tests/benchmark/data/msmarco_full.db --modes agent_search --result-limit 20 --tool-limit 10 --intent context_explore -PYTHONUNBUFFERED=1 SYNAPTIC_SQLITE_FTS_AND_FIRST_THRESHOLD=20 SYNAPTIC_SQLITE_FTS_LEXICAL_RERANK_POOL=500 uv run python examples/ablation/run_agent_loop_benchmarks.py --subset 20 --msmarco-path tests/benchmark/data/msmarco_passage_full.json --corpus-limit 8841823 --sqlite-db-path tests/benchmark/data/msmarco_full.db --llm-base-url "$LLM_BASE_URL" --model "$LLM_MODEL" --api-key-env LLM_API_KEY --max-turns 5 -# Local Ollama fallback smoke when the H100/Qwen3.6 tunnel is down. +# DeepSeek Flash agent-loop quality path. +# Put DEEPSEEK_API_KEY in shell env, the repo .env, or the parent workspace .env. +# Do not put the key in docs, commands, JSONL, or DBs. +PYTHONUNBUFFERED=1 SYNAPTIC_SQLITE_FTS_AND_FIRST_THRESHOLD=20 SYNAPTIC_SQLITE_FTS_LEXICAL_RERANK_POOL=500 uv run python examples/ablation/run_agent_loop_benchmarks.py --llm-preset deepseek --subset 20 --msmarco-path tests/benchmark/data/msmarco_passage_full.json --corpus-limit 8841823 --sqlite-db-path tests/benchmark/data/msmarco_full.db --max-turns 5 --llm-timeout 180 --preflight-timeout 15 --out-jsonl examples/ablation/diagnostics/agent_loop_deepseek_v4_flash_20.jsonl --resume + +# Historical local Ollama fallback smoke when the H100/Qwen3.6 tunnel is down. +# This is functional/navigation-only and should not be used as the quality +# reference for the agent loop. # Terminal 1: ssh -N -L 18134:127.0.0.1:11434 go243 # Terminal 2: @@ -142,6 +148,10 @@ with `No route to host`), so this smoke used the `go243` Ollama fallback `qwen3:14b`. Treat it as a functional navigation smoke, not a Qwen3.6 quality reference. +The preferred quality path for follow-up runs is now DeepSeek Flash via +`--llm-preset deepseek`, with the API key supplied only through runtime env or a +gitignored `.env` file. + | Mode | Model | Docs | Queries | Reach | Mean turns | Mean calls | Mean first rel turn | Mean elapsed | Mean unique tools | Mean search targets | Mean rewrites | Multi-tool | Rewrites | Zero-tool | |------|-------|-----:|--------:|------:|-----------:|-----------:|--------------------:|-------------:|------------------:|--------------------:|--------------:|-----------:|---------:|----------:| | historical zero-tool allowed | `qwen3:14b` via Ollama | 8,841,823 | 20 | 6/20 | 2.50 | 1.90 | 1.17 | 41.3s | 1.90 | 1.85 | 1.20 | 12/20 | 14/20 | 2/20 |