From 4a2fa98ed496dc507744dcdb5710c6ba0c73a437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=EC=84=B1=EC=A4=80?= Date: Thu, 2 Jul 2026 19:22:21 +0900 Subject: [PATCH] Add agent loop LLM preflight-only mode --- .../diagnostics/public_scale_20260702.md | 1 + .../ablation/run_agent_loop_benchmarks.py | 39 ++++++++++++++----- tests/test_agent_search_benchmarks.py | 27 +++++++++++++ 3 files changed, 58 insertions(+), 9 deletions(-) diff --git a/examples/ablation/diagnostics/public_scale_20260702.md b/examples/ablation/diagnostics/public_scale_20260702.md index 65c5ace..5881657 100644 --- a/examples/ablation/diagnostics/public_scale_20260702.md +++ b/examples/ablation/diagnostics/public_scale_20260702.md @@ -48,6 +48,7 @@ PYTHONUNBUFFERED=1 SYNAPTIC_SQLITE_FTS_AND_FIRST_THRESHOLD=20 SYNAPTIC_SQLITE_FT # Put DEEPSEEK_API_KEY in shell env, the repo .env, or the parent workspace .env. # Do not put the key in docs, commands, JSONL, or DBs. uv run python examples/ablation/configure_deepseek_env.py +uv run python examples/ablation/run_agent_loop_benchmarks.py --llm-preset deepseek --preflight-only --preflight-timeout 15 PYTHONUNBUFFERED=1 SYNAPTIC_SQLITE_FTS_AND_FIRST_THRESHOLD=20 SYNAPTIC_SQLITE_FTS_LEXICAL_RERANK_POOL=500 uv run python examples/ablation/run_agent_loop_benchmarks.py --llm-preset deepseek --subset 20 --msmarco-path tests/benchmark/data/msmarco_passage_full.json --corpus-limit 8841823 --sqlite-db-path tests/benchmark/data/msmarco_full.db --max-turns 5 --llm-timeout 180 --preflight-timeout 15 --out-jsonl examples/ablation/diagnostics/agent_loop_deepseek_v4_flash_20.jsonl --resume # Historical local Ollama fallback smoke when the H100/Qwen3.6 tunnel is down. diff --git a/examples/ablation/run_agent_loop_benchmarks.py b/examples/ablation/run_agent_loop_benchmarks.py index 4add631..44e6c61 100644 --- a/examples/ablation/run_agent_loop_benchmarks.py +++ b/examples/ablation/run_agent_loop_benchmarks.py @@ -341,6 +341,11 @@ def _llm_preflight_error_message(base_url: str, model: str, exc: BaseException) ) +def _validate_preflight_flags(*, preflight_only: bool, skip_preflight: bool) -> None: + if preflight_only and skip_preflight: + raise SystemExit("--preflight-only cannot be combined with --skip-preflight") + + async def _preflight_llm_endpoint( client: object, *, @@ -517,7 +522,7 @@ async def amain(argv: list[str] | None = None) -> int: _load_local_env() parser = argparse.ArgumentParser() parser.add_argument("--msmarco-path", type=Path, default=DEFAULT_MSMARCO_PATH) - parser.add_argument("--sqlite-db-path", type=Path, required=True) + parser.add_argument("--sqlite-db-path", type=Path, default=None) parser.add_argument("--subset", type=int, default=20) parser.add_argument("--corpus-limit", type=int, default=0) parser.add_argument( @@ -550,6 +555,11 @@ async def amain(argv: list[str] | None = None) -> int: action="store_true", help="Skip the initial /v1/models endpoint check.", ) + parser.add_argument( + "--preflight-only", + action="store_true", + help="Validate the resolved LLM endpoint/key/model and exit before loading data.", + ) parser.add_argument("--no-sufficiency-gate", action="store_true") parser.add_argument( "--allow-zero-tool-answer", @@ -579,23 +589,19 @@ async def amain(argv: list[str] | None = None) -> int: raise SystemExit("--preflight-timeout must be positive") if args.resume and args.out_jsonl is None: raise SystemExit("--resume requires --out-jsonl") + _validate_preflight_flags( + preflight_only=args.preflight_only, + skip_preflight=args.skip_preflight, + ) args.llm_base_url, args.model, args.api_key_env = _resolve_llm_settings( preset=args.llm_preset, llm_base_url=args.llm_base_url, model=args.model, api_key_env=args.api_key_env, ) - if not args.msmarco_path.exists(): - raise SystemExit(f"{args.msmarco_path} does not exist") - if not args.sqlite_db_path.exists(): - raise SystemExit(f"{args.sqlite_db_path} does not exist") from openai import AsyncOpenAI - data = json.loads(args.msmarco_path.read_text(encoding="utf-8")) - query_items = list(data["queries"].items())[: args.subset] - qrels = data["qrels"] - n_docs = args.corpus_limit or int(data.get("corpus_size") or 0) api_key = os.environ.get(args.api_key_env) or "" local_endpoint = any(marker in args.llm_base_url for marker in ("localhost", "127.0.0.1")) if not api_key and not local_endpoint: @@ -610,6 +616,21 @@ async def amain(argv: list[str] | None = None) -> int: model=args.model, timeout_sec=args.preflight_timeout, ) + if args.preflight_only: + print(f"LLM preflight OK: base_url={args.llm_base_url} model={args.model}") + return 0 + + if not args.msmarco_path.exists(): + raise SystemExit(f"{args.msmarco_path} does not exist") + if args.sqlite_db_path is None: + raise SystemExit("--sqlite-db-path is required unless --preflight-only is set") + if not args.sqlite_db_path.exists(): + raise SystemExit(f"{args.sqlite_db_path} does not exist") + + data = json.loads(args.msmarco_path.read_text(encoding="utf-8")) + query_items = list(data["queries"].items())[: args.subset] + qrels = data["qrels"] + n_docs = args.corpus_limit or int(data.get("corpus_size") or 0) backend = SqliteGraphBackend(str(args.sqlite_db_path)) await backend.connect() diff --git a/tests/test_agent_search_benchmarks.py b/tests/test_agent_search_benchmarks.py index 02d39f6..5515faa 100644 --- a/tests/test_agent_search_benchmarks.py +++ b/tests/test_agent_search_benchmarks.py @@ -8,6 +8,8 @@ from pathlib import Path from types import SimpleNamespace +import pytest + RUNNER_PATH = ( Path(__file__).resolve().parents[1] / "examples" / "ablation" / "run_agent_search_benchmarks.py" ) @@ -212,6 +214,31 @@ def test_agent_loop_local_preset_keeps_existing_defaults() -> None: ) +def test_agent_loop_preflight_only_rejects_skip_preflight() -> None: + with pytest.raises(SystemExit, match="cannot be combined"): + loop_runner._validate_preflight_flags(preflight_only=True, skip_preflight=True) + + +@pytest.mark.asyncio +async def test_agent_loop_preflight_only_skips_benchmark_inputs(monkeypatch, capsys) -> None: + calls = [] + + async def fake_preflight(client, *, base_url, model, timeout_sec) -> None: + calls.append((base_url, model, timeout_sec)) + + key_value = "value-for-test" + monkeypatch.setenv("DEEPSEEK_API_KEY", key_value) + monkeypatch.setattr(loop_runner, "_preflight_llm_endpoint", fake_preflight) + + assert await loop_runner.amain(["--llm-preset", "deepseek", "--preflight-only"]) == 0 + + captured = capsys.readouterr() + assert calls == [("https://api.deepseek.com/v1", "deepseek-v4-flash", 10.0)] + assert "LLM preflight OK" in captured.out + assert key_value not in captured.out + assert key_value not in captured.err + + def test_llm_preflight_error_message_names_endpoint_and_skip_hint() -> None: msg = loop_runner._llm_preflight_error_message( "http://127.0.0.1:18012/v1",