diff --git a/.gitignore b/.gitignore index 45dc651..f3c3ed5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,25 @@ +# Runtime artifacts +.supervisor.log +.supervisor.lock +.supervisor_snapshot/ +.logger_runs/ +.log/ +.mle_log.jsonl +gym_log.json + +# Python/editor cruft __pycache__/ -*.egg-info/ -dist/ -build/ -.DS_Store *.pyc +.DS_Store + +# gym-environment +.claudeignore +.copilotignore +.cursorignore +.cursorrules +.geminiignore +.github +.gitignore +AGENTS.md +CLAUDE.md +GEMINI.md diff --git a/README.md b/README.md index f64b306..bfb6d22 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,10 @@ aicodinggym configure --user-id USER_ID [--workspace-dir DIR] #### `aicodinggym swe fetch PROBLEM_ID` -Fetch a problem and clone the repo locally. +Fetch a problem and clone the repo locally. After a successful `swe fetch`, `mle download`, or `cr fetch`, the CLI downloads agent instruction files from [AICodingGym/gym-environment](https://github.com/AICodingGym/gym-environment) via the GitHub Contents API. By default it uses the **`test` branch**. Override with environment variables: + +- `AICODINGGYM_GYM_ENV_REPO` — `owner/repo` (default: `AICodingGym/gym-environment`) +- `AICODINGGYM_GYM_ENV_REF` — branch, tag, or commit SHA for `?ref=` (default: `test` when unset) ``` aicodinggym swe fetch PROBLEM_ID [--user-id ID] [--workspace-dir DIR] diff --git a/__init__.py b/__init__.py index c5f223c..40ec2de 100644 --- a/__init__.py +++ b/__init__.py @@ -1,3 +1,44 @@ -"""AI Coding Gym CLI.""" +"""AI Coding Gym CLI. -__version__ = "0.3.0" +Imports are lazy so tooling that loads this file without package context +(e.g. some pytest collection paths) does not fail on relative imports. 
+""" + +from __future__ import annotations + +import importlib +import importlib.metadata +from typing import TYPE_CHECKING, Any + +try: + __version__ = importlib.metadata.version("aicodinggym-cli") +except importlib.metadata.PackageNotFoundError: # pragma: no cover - dev without install + __version__ = "0.0.0" + +__all__ = [ + "__version__", + "ExperimentLog", + "LogEntry", + "capture_mle_provenance", + "log_entry", + "print_summary", + "set_log_path", + "gym_logger", +] + + +def __getattr__(name: str) -> Any: + if name in ("ExperimentLog", "LogEntry", "capture_mle_provenance"): + m = importlib.import_module("aicodinggym.experiment_log") + return getattr(m, name) + if name in ("log_entry", "print_summary", "set_log_path"): + m = importlib.import_module("aicodinggym.gym_logger") + return getattr(m, name) + if name == "gym_logger": + return importlib.import_module("aicodinggym.gym_logger") + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +if TYPE_CHECKING: + from .experiment_log import ExperimentLog, LogEntry, capture_mle_provenance + from .gym_logger import log_entry, print_summary, set_log_path diff --git a/api.py b/api.py index 08c6e85..67db924 100644 --- a/api.py +++ b/api.py @@ -76,8 +76,11 @@ def fetch_problem(user_id: str, problem_id: str) -> dict: def submit_notification(problem_id: str, user_id: str, commit_hash: str, - branch: str, commit_message: str, timestamp: str) -> dict: - """Notify backend of a submission.""" + branch: str, commit_message: str, timestamp: str, + tool: str | None = None, + tool_version: str | None = None, + ai_model: str | None = None) -> dict: + """Notify backend of a SWE submission, optionally attributing the tool/model used.""" return _post("submissions", { "problem_id": problem_id, "user_id": user_id, @@ -85,6 +88,9 @@ def submit_notification(problem_id: str, user_id: str, commit_hash: str, "branch": branch, "commit_message": commit_message, "timestamp": timestamp, + "tool": tool, + "tool_version": 
tool_version,
+        "ai_model": ai_model,
     })
 
 
@@ -93,18 +99,78 @@ def fetch_pr(user_id: str, problem_id: str) -> dict:
     return _post("code-review-fetch", {"user_id": user_id, "problem_id": problem_id})
 
 
-def cr_submit_review(user_id: str, problem_id: str, review: str) -> dict:
+def cr_submit_review(user_id: str, problem_id: str, review: str,
+                     tool: str | None = None,
+                     tool_version: str | None = None,
+                     ai_model: str | None = None) -> dict:
     """Submit a code review."""
     return _post("code-review-submit", {
         "user_id": user_id,
         "problem_id": problem_id,
         "review": review,
+        "tool": tool,
+        "tool_version": tool_version,
+        "ai_model": ai_model,
     })
 
 
+def notify_mle_progress(user_id: str, problem_slug: str, best_percentile: float,
+                        tool: str | None = None,
+                        tool_version: str | None = None,
+                        ai_model: str | None = None) -> dict:
+    """After an MLE-bench grade is returned, log tool/model attribution and
+    bestPercentile against the Prisma UserProgress row so the leaderboard
+    aggregator can pick it up."""
+    payload = {
+        "problemSlug": problem_slug,
+        "status": "solved",
+        "bestPercentile": best_percentile,
+        "tool": tool,
+        "tool_version": tool_version,
+        "ai_model": ai_model,
+    }
+    return _post(f"users/{user_id}/progress", payload)
+
+
 def mlebench_download_info(user_id: str, competition_id: str, dest_path: str) -> None:
-    """Download dataset for an MLE-bench competition directly to dest_path."""
-    resp = _get(f"competitions/{competition_id}/download", stream=True)
-    with open(dest_path, "wb") as f:
-        for chunk in resp.iter_content(chunk_size=8192):
-            f.write(chunk)
+    """Download dataset for an MLE-bench competition directly to dest_path.
+
+    Uses a long read timeout: large zips can take many minutes between chunks
+    over slow links; the default 30s read timeout would abort mid-stream.
+    Override it with ``AICODINGGYM_DOWNLOAD_READ_TIMEOUT`` (seconds); a
+    missing, non-numeric, or non-positive value falls back to 7200.
+
+    Raises:
+        APIError: on connection, timeout, HTTP, or mid-stream failures.
+    """
+    try:
+        read_s = int(os.environ.get("AICODINGGYM_DOWNLOAD_READ_TIMEOUT", "0"))
+    except ValueError:
+        read_s = 0  # malformed override: use the default below
+    if read_s <= 0:
+        read_s = 7200  # seconds between reads; large zips need headroom
+    url = f"{API_BASE}/competitions/{competition_id}/download"
+    try:
+        resp = requests.get(url, stream=True, timeout=(120, read_s))
+        resp.raise_for_status()
+        # Stream inside the try so a failure between chunks surfaces as
+        # APIError; ``with resp`` releases the connection when done.
+        with resp, open(dest_path, "wb") as f:
+            for chunk in resp.iter_content(chunk_size=8192):
+                f.write(chunk)
+    except requests.ConnectionError:
+        raise APIError(
+            f"Cannot connect to {API_BASE}.\n"
+            "Check your internet connection and try again."
+        )
+    except requests.Timeout:
+        raise APIError(f"Download from {url} timed out.")
+    except requests.HTTPError as e:
+        body = ""
+        try:
+            body = e.response.json().get("detail", e.response.text)
+        except Exception:
+            body = e.response.text
+        raise APIError(f"API error (HTTP {e.response.status_code}): {body}")
+    except requests.RequestException as e:
+        raise APIError(f"Request failed: {e}")
@@ -122,15 +188,25 @@ def mlebench_download_file(url: str, dest_path: str, timeout: int = 300) -> None
         raise APIError(f"Download failed: {e}")
 
 
-def mlebench_submit_csv(user_id: str, competition_id: str, csv_path: str) -> dict:
+def mlebench_submit_csv(user_id: str, competition_id: str, csv_path: str,
+                        tool: str | None = None,
+                        tool_version: str | None = None,
+                        ai_model: str | None = None) -> dict:
     """Upload a prediction CSV for an MLE-bench competition."""
     try:
         csv_name = Path(csv_path).name
         with open(csv_path, "rb") as f:
             compressed = gzip.compress(f.read())
+        form = {
+            "user_id": user_id,
+            "competition_id": competition_id,
+            "tool": tool or "",
+            "tool_version": tool_version or "",
+            "ai_model": ai_model or "",
+        }
         resp = requests.post(
             f"{API_BASE}/competitions/{competition_id}/submit",
-            data={"user_id": user_id, "competition_id": competition_id},
+            data=form,
             files={"file": (csv_name + ".gz", compressed, "application/gzip")},
             timeout=120,
         )
diff --git a/cli.py b/cli.py
index ebc1c78..2738dcd 100644
--- a/cli.py
+++ b/cli.py
@@ -28,9 +28,11 @@
 import re
 import subprocess
 import sys
+import time
 import urllib.request
 from datetime import datetime
 from pathlib import Path
+from typing import Any
 
 import click
 
@@ -44,8 +46,10 @@
     mlebench_download_file,
     mlebench_download_info,
     mlebench_submit_csv,
+    notify_mle_progress,
     submit_notification,
 )
+from .cli_env import read_solution_log_model, resolve as resolve_env
 from .config import (
     load_config,
     load_credentials,
@@ -80,14 +84,58 @@ def _warn(msg: str) -> None:
     click.echo(f"Warning: {msg}", err=True)
 
 
-_GYM_ENV_API = "https://api.github.com/repos/AICodingGym/gym-environment/contents"
 _GYM_ENV_SKIP = {"README.md"}
+_GYM_ENV_MLE_ONLY: set[str] = set()
 
 
-def _install_gym_environment(dest: Path) -> None:
-    """Download gym-environment files into dest and add them to .gitignore."""
+def _gym_env_repo() -> str:
+    """GitHub ``owner/repo`` for gym-environment assets (override with env)."""
+    return os.environ.get("AICODINGGYM_GYM_ENV_REPO", "").strip() or "AICodingGym/gym-environment"
+
+
+def _gym_env_ref() -> str:
+    """Git ref (branch, tag, or commit) for Contents API ``?ref=``.
+
+    If ``AICODINGGYM_GYM_ENV_REF`` is unset or empty, defaults to ``test`` so
+    fetched problems get the same supervisor/dashboard stack as CI/staging.
+    Set ``AICODINGGYM_GYM_ENV_REF=main`` (or another branch) to override.
+    """
+    ref = os.environ.get("AICODINGGYM_GYM_ENV_REF", "")
+    ref = ref.strip()
+    if ref:
+        return ref
+    return "test"
+
+
+def _gym_env_contents_api_url(subpath: str = "") -> str:
+    """GitHub Contents API URL for gym-environment at the configured ref.
+
+    ``subpath`` is an optional path inside the repo; the ref is
+    percent-encoded so values from ``AICODINGGYM_GYM_ENV_REF`` containing
+    ``#``, ``&`` or spaces cannot truncate or corrupt the query string.
+    """
+    from urllib.parse import quote
+
+    base = f"https://api.github.com/repos/{_gym_env_repo()}/contents"
+    subpath = subpath.strip("/")
+    if subpath:
+        base = f"{base}/{subpath}"
+    return f"{base}?ref={quote(_gym_env_ref(), safe='/')}"
+
+
+def _install_gym_environment(dest: Path, challenge: str | None = None) -> None:
+    """Download gym-environment files from GitHub into dest and add to .gitignore.
+
+    Ref and repo are configurable via ``AICODINGGYM_GYM_ENV_REF`` and
+    ``AICODINGGYM_GYM_ENV_REPO``. When ref is unset, the ``test`` branch is used.
+    ``challenge`` ("swe", "cr" or "mle") is recorded as ``problem_type`` in the
+    seeded ``solution_log.json``; it defaults to "mle" when omitted.
+    """
     try:
-        req = urllib.request.Request(_GYM_ENV_API, headers={"Accept": "application/vnd.github.v3+json"})
+        req = urllib.request.Request(
+            _gym_env_contents_api_url(),
+            headers={"Accept": "application/vnd.github.v3+json"},
+        )
         with urllib.request.urlopen(req, timeout=15) as resp:
             entries = json.loads(resp.read())
     except Exception as e:
@@ -117,7 +165,7 @@ def _install_gym_environment(dest: Path) -> None:
             # Fetch subdirectory contents recursively (one level deep)
             try:
                 sub_req = urllib.request.Request(
-                    f"{_GYM_ENV_API}/{name}",
+                    _gym_env_contents_api_url(name),
                     headers={"Accept": "application/vnd.github.v3+json"},
                 )
                 with urllib.request.urlopen(sub_req, timeout=15) as r:
@@ -140,19 +188,136 @@ def _install_gym_environment(dest: Path) -> None:
                     _warn(f"Failed to download {name}/{sub_name}: {e}")
             downloaded.append(name)
 
-    if not downloaded:
-        return
+    # Seed empty solution_log.json if absent (AI agent populates it after each prompt)
+    log_file = dest / "solution_log.json"
+    if not log_file.exists():
+        seed = {
+            "version": "1.0",
+            "problem": "",
+            # Record the real challenge type; "mle" stays the default for
+            # callers that do not pass ``challenge``.
+            "problem_type": challenge or "mle",
+            "prompts": [],
+        }
+        log_file.write_text(json.dumps(seed) + "\n", encoding="utf-8")
 
     # Append to .gitignore
     gitignore = dest / ".gitignore"
     existing = gitignore.read_text(encoding="utf-8") if gitignore.exists() else ""
     existing_lines = set(existing.splitlines())
 
-    new_entries = [f for f in downloaded if f not in existing_lines and f"/{f}" not in existing_lines]
+    gym_artifacts = [".gym_watcher.lock", ".gym_watcher.log", "solution_log.json", ".dashboard.tmp"]
+    # Runtime artifacts are ignored even when nothing was downloaded:
+    # solution_log.json is seeded above regardless, and submit stages all changes.
+    new_entries = [
+        f for f in dict.fromkeys([*downloaded, *gym_artifacts])
+        if f not in existing_lines and f"/{f}" not in existing_lines
+    ]
     if new_entries:
         block = "\n# gym-environment\n" + "\n".join(new_entries) + "\n"
         with open(gitignore, "a", encoding="utf-8", newline="\n") as fh:
             fh.write(block)
+
+
+def _open_in_browser(path: Path) -> bool:
+    """Best-effort open a local file in the user's default browser.
+
+    Returns True if the open call was dispatched, False otherwise. Never
+    raises - a missing display / headless box should not break ``fetch``.
+    """
+    try:
+        if not path.exists():
+            # Create a minimal placeholder so the browser has something to load;
+            # the watcher will overwrite it moments later.
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text('<!doctype html><meta charset="utf-8"><title>AI Coding Gym</title><p>Preparing dashboard\u2026</p>\n', encoding="utf-8")
+        import webbrowser
+        return bool(webbrowser.open(path.resolve().as_uri()))
+    except Exception:
+        return False
+
+
+def _autostart_watcher(problem_dir: Path) -> None:
+    """Launch gym_watcher.py in background inside problem_dir. Non-fatal."""
+    problem_dir = Path(problem_dir)
+    watcher = problem_dir / "gym_watcher.py"
+    if not watcher.exists():
+        return
+    lock = problem_dir / ".gym_watcher.lock"
+    if lock.exists():
+        try:
+            pid = int(lock.read_text(encoding="utf-8").strip())
+        except (OSError, ValueError):
+            pid = None
+        if pid and _pid_alive(pid):
+            click.echo("Gym watcher already running; skipping auto-start.")
+            return
+        try:
+            lock.unlink()
+        except OSError:
+            pass  # best effort: a stale lock must not block the launch
+    log_path = problem_dir / ".gym_watcher.log"
+    try:
+        cmd = [sys.executable, str(watcher), str(problem_dir)]
+        # The child gets its own copy of the log handle; close ours once it
+        # has been spawned so the CLI does not leak a descriptor.
+        with open(log_path, "ab", buffering=0) as log_fh:
+            kwargs: dict[str, Any] = {
+                "stdout": log_fh,
+                "stderr": log_fh,
+                "stdin": subprocess.DEVNULL,
+                "cwd": str(problem_dir),
+            }
+            if platform.system() == "Windows":
+                DETACHED_PROCESS = 0x00000008
+                CREATE_NEW_PROCESS_GROUP = 0x00000200
+                kwargs["creationflags"] = DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP
+                kwargs["close_fds"] = False
+            else:
+                kwargs["start_new_session"] = True
+            subprocess.Popen(cmd, **kwargs)  # type: ignore[arg-type]
+        dashboard = problem_dir / "dashboard.html"
+        opened = _open_in_browser(dashboard)
+        msg = "Gym watcher started (logs: .gym_watcher.log)."
+        if not opened:
+            msg += f" Open {dashboard} to view dashboard."
+        click.echo(msg)
+    except Exception as exc:
+        _warn(f"Could not auto-start gym_watcher.py: {exc}.")
+
+
+def _pid_alive(pid: int) -> bool:
+    """Cross-platform ``kill(0)`` equivalent: True if ``pid`` names a live process."""
+    if pid <= 0:
+        # os.kill() treats 0 / negative pids as process groups, not processes.
+        return False
+    if platform.system() == "Windows":
+        try:
+            out = subprocess.run(
+                ["tasklist", "/FI", f"PID eq {pid}", "/NH"],
+                capture_output=True, text=True, check=False, timeout=5,
+            )
+        except (OSError, subprocess.TimeoutExpired):
+            return False
+        # Match the PID as a whole column so e.g. 12 does not match "3124".
+        return str(pid) in out.stdout.split()
+    try:
+        os.kill(pid, 0)
+    except PermissionError:
+        # Signal was refused, which means the process exists (other owner).
+        return True
+    except OSError:
+        return False
+    return True
+
+
+def _shquote(text: str) -> str:
+    """Minimal POSIX-shell quoting sufficient for paths used by the autostart shim."""
+    if not text:
+        return "''"
+    if all(ch.isalnum() or ch in "@%+=:,./-_" for ch in text):
+        return text
+    return "'" + text.replace("'", "'\"'\"'") + "'"
 
 
 def _resolve_user_id(config: dict, user_id: str | None) -> str:
@@ -528,7 +693,8 @@ def swe_fetch(problem_id: str, user_id: str | None, workspace_dir: str | None):
     if not success:
         _error(msg)
 
-    _install_gym_environment(workspace / problem_id)
+    _install_gym_environment(workspace / problem_id, "swe")
+    _autostart_watcher(workspace / problem_id)
 
     click.echo(
         f"\nSuccessfully fetched problem: {problem_id}\n"
@@ -555,8 +721,21 @@ def swe_fetch(problem_id: str, user_id: str | None, workspace_dir: str | None):
     "--workspace-dir", default=None, type=click.Path(),
     help="Workspace directory. Overrides configured/cached value.",
 )
+@click.option(
+    "--tool", default=None,
+    help="Override detected coding tool (e.g. claude-code, cursor, antigravity).",
+)
+@click.option(
+    "--tool-version", default=None,
+    help="Override detected tool version string.",
+)
+@click.option(
+    "--ai-model", default=None,
+    help="Override detected AI model (e.g. opus-4.7, gpt-5, gemini-2.5-pro).",
+)
 def swe_submit(problem_id: str, user_id: str | None, message: str | None,
-               force: bool, workspace_dir: str | None):
+               force: bool, workspace_dir: str | None,
+               tool: str | None, tool_version: str | None, ai_model: str | None):
     """Submit your SWE-bench solution by committing and pushing changes.
Stages all changes, commits them, pushes to the remote, and notifies @@ -626,6 +778,8 @@ def swe_submit(problem_id: str, user_id: str | None, message: str | None, if not success: _error(msg) + env = resolve_env(tool, tool_version, ai_model) + # Notify backend try: submit_notification( @@ -635,13 +789,24 @@ def swe_submit(problem_id: str, user_id: str | None, message: str | None, branch=branch, commit_message=commit_msg, timestamp=datetime.now().isoformat(), + **env, ) except APIError as e: _warn(f"Changes pushed, but failed to notify backend: {e}") + tool_line = "" + if env["tool"] or env["ai_model"]: + bits = [] + if env["tool"]: + bits.append(env["tool"] + (f" {env['tool_version']}" if env["tool_version"] else "")) + if env["ai_model"]: + bits.append(f"model={env['ai_model']}") + tool_line = f" Tool: {' · '.join(bits)}\n" + click.echo( f"\nSuccessfully submitted solution for {problem_id}\n" f"\n" + f"{tool_line}" f" Commit: {commit_hash[:8]}\n" f" Branch: {branch}\n" f" Status: Pushed and backend notified\n" @@ -1021,7 +1186,8 @@ def cr_fetch(problem_id: str, user_id: str | None, workspace_dir: str | None): if not success: _error(msg) - _install_gym_environment(workspace / problem_id) + _install_gym_environment(workspace / problem_id, "cr") + _autostart_watcher(workspace / problem_id) problem_dir = workspace / problem_id @@ -1072,8 +1238,21 @@ def cr_fetch(problem_id: str, user_id: str | None, workspace_dir: str | None): "-m", "--message", "review_text", help="Inline review text.", ) +@click.option( + "--tool", default=None, + help="Override detected coding tool (e.g. claude-code, cursor, antigravity).", +) +@click.option( + "--tool-version", default=None, + help="Override detected tool version string.", +) +@click.option( + "--ai-model", default=None, + help="Override detected AI model (e.g. 
opus-4.7, gpt-5, gemini-2.5-pro).", +) def cr_submit(problem_id: str, user_id: str | None, review_file: str | None, - review_text: str | None): + review_text: str | None, + tool: str | None, tool_version: str | None, ai_model: str | None): """Submit a code review for a Code Review challenge. Reads your review from a file (-f), inline text (-m), or piped stdin, @@ -1112,14 +1291,25 @@ def cr_submit(problem_id: str, user_id: str | None, review_file: str | None, f" aicodinggym cr submit {problem_id} -f review.md" ) + env = resolve_env(tool, tool_version, ai_model) try: - result = cr_submit_review(uid, problem_id, review.strip()) + result = cr_submit_review(uid, problem_id, review.strip(), **env) except APIError as e: _error(str(e)) + tool_line = "" + if env["tool"] or env["ai_model"]: + bits = [] + if env["tool"]: + bits.append(env["tool"] + (f" {env['tool_version']}" if env["tool_version"] else "")) + if env["ai_model"]: + bits.append(f"model={env['ai_model']}") + tool_line = f" Tool: {' · '.join(bits)}\n" + click.echo( f"\nSuccessfully submitted code review for {problem_id}\n" f"\n" + f"{tool_line}" f" Status: {result.get('status', 'COMPLETED')}\n" f"\n" f"View results at: {_hyperlink(f'https://aicodinggym.com/challenges/cr/{problem_id}')}" @@ -1182,7 +1372,8 @@ def mle_download(competition_id: str, user_id: str | None, workspace_dir: str | except APIError as e: _error(str(e)) - _install_gym_environment(workspace / competition_id) + _install_gym_environment(workspace / competition_id, "mle") + _autostart_watcher(workspace / competition_id) click.echo( f"\nDataset downloaded to: {dest_path}\n" @@ -1202,8 +1393,21 @@ def mle_download(competition_id: str, user_id: str | None, workspace_dir: str | "--message", "-m", default=None, help="Description of your submission (optional).", ) +@click.option( + "--tool", default=None, + help="Override detected coding tool (e.g. 
claude-code, cursor, antigravity).", +) +@click.option( + "--tool-version", default=None, + help="Override detected tool version string.", +) +@click.option( + "--ai-model", default=None, + help="Override detected AI model (e.g. opus-4.7, gpt-5, gemini-2.5-pro).", +) def mle_submit(competition_id: str, csv_path: str, user_id: str | None, - message: str | None): + message: str | None, + tool: str | None, tool_version: str | None, ai_model: str | None): """Submit a prediction CSV for an MLE-bench competition. Uploads your prediction CSV directly to the AI Coding Gym server @@ -1239,22 +1443,47 @@ def mle_submit(competition_id: str, csv_path: str, user_id: str | None, csv_src = Path(csv_path).resolve() + # solution_log.json (per CLAUDE.md) is the most accurate model record for MLE + log_model = read_solution_log_model(csv_src.parent) + env = resolve_env(tool, tool_version, ai_model or log_model) + click.echo(f"Uploading {csv_src.name} for '{competition_id}'...") try: - result = mlebench_submit_csv(uid, competition_id, str(csv_src)) + result = mlebench_submit_csv(uid, competition_id, str(csv_src), **env) except APIError as e: _error(str(e)) score_msg = result.get("message", "Submission received for scoring.") score = result.get("score") + percentile = result.get("leaderboard_percentile") + + # Forward percentile + attribution to the Prisma backend so the leaderboard + # aggregator can rank tools/models. Fire-and-forget — never fail the submit. 
+    if percentile is not None:
+        try:
+            notify_mle_progress(uid, competition_id, float(percentile), **env)
+        except (APIError, TypeError, ValueError) as e:
+            _warn(f"Submitted, but failed to log progress: {e}")
+
+    tool_line = ""
+    if env["tool"] or env["ai_model"]:
+        bits = []
+        if env["tool"]:
+            bits.append(env["tool"] + (f" {env['tool_version']}" if env["tool_version"] else ""))
+        if env["ai_model"]:
+            bits.append(f"model={env['ai_model']}")
+        tool_line = f" Tool: {' · '.join(bits)}\n"
 
     click.echo(
         f"\nSuccessfully submitted prediction for {competition_id}\n"
         f"\n"
+        f"{tool_line}"
         f" CSV: {csv_src.name}\n"
         f" Status: {score_msg}\n"
     )
     if score is not None:
         click.echo(f" Score: {score}\n")
+    if percentile is not None:
+        click.echo(f" Top %: {percentile}\n")
 
     click.echo(f"View results at: {_hyperlink(f'https://aicodinggym.com/challenges/mle/{competition_id}')}")
diff --git a/cli_env.py b/cli_env.py
new file mode 100644
index 0000000..71df63d
--- /dev/null
+++ b/cli_env.py
@@ -0,0 +1,139 @@
+"""Detect AI coding tool + model used for the current shell session.
+
+Reads only an allowlist of well-known env vars — never the full environment —
+so secrets cannot accidentally leak into the submission payload.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+ALLOWED_TOOLS = (
+    "claude-code",
+    "cursor",
+    "antigravity",
+    "aider",
+    "codex-cli",
+    "copilot-cli",
+    "windsurf",
+    "continue",
+    "cline",
+    "gemini-cli",
+)
+
+
+def detect_tool() -> tuple[str | None, str | None]:
+    """Return (tool_name, version) inferred from env signals. None if unknown."""
+    if os.environ.get("CLAUDECODE") == "1":
+        return ("claude-code", _version("claude"))
+    if os.environ.get("CURSOR_TRACE_ID") or os.environ.get("TERM_PROGRAM") == "cursor":
+        return ("cursor", os.environ.get("CURSOR_VERSION"))
+    if os.environ.get("ANTIGRAVITY"):
+        return ("antigravity", os.environ.get("ANTIGRAVITY_VERSION"))
+    # Only the env var counts: aider merely being installed on PATH says
+    # nothing about this session and would shadow every check below.
+    if os.environ.get("AIDER_MODEL"):
+        return ("aider", _version("aider"))
+    if os.environ.get("CODEX_CLI"):
+        return ("codex-cli", _version("codex"))
+    if os.environ.get("WINDSURF"):
+        return ("windsurf", os.environ.get("WINDSURF_VERSION"))
+    if os.environ.get("CONTINUE_CLI"):
+        return ("continue", _version("continue"))
+    if os.environ.get("CLINE_CLI"):
+        return ("cline", _version("cline"))
+    if os.environ.get("GEMINI_CLI"):
+        return ("gemini-cli", _version("gemini"))
+    return (None, None)
+
+
+def detect_model() -> str | None:
+    """Best-effort model detection from env. Lowercase, trimmed."""
+    raw = (
+        os.environ.get("ANTHROPIC_MODEL")
+        or os.environ.get("CLAUDE_CODE_MODEL")
+        or os.environ.get("OPENAI_MODEL")
+        or os.environ.get("AIDER_MODEL")
+        or os.environ.get("GEMINI_MODEL")
+        or os.environ.get("CURSOR_MODEL")
+    )
+    if not raw:
+        return None
+    return raw.strip().lower()
+
+
+def resolve(
+    cli_tool: str | None,
+    cli_version: str | None,
+    cli_model: str | None,
+) -> dict[str, str | None]:
+    """CLI flags win; env detection fills the gaps. Returns kwargs for api.py."""
+    auto_tool, auto_ver = detect_tool()
+    return {
+        "tool": cli_tool or auto_tool,
+        "tool_version": cli_version or auto_ver,
+        "ai_model": cli_model or detect_model(),
+    }
+
+
+def _model_field(record: object) -> str | None:
+    """Return the normalized ``model``/``model_id`` string of a log record, if any."""
+    if isinstance(record, dict):
+        for key in ("model", "model_id"):
+            value = record.get(key)
+            if isinstance(value, str) and value.strip():
+                return value.strip().lower()
+    return None
+
+
+def read_solution_log_model(problem_dir: Path) -> str | None:
+    """Return the model recorded in ``solution_log.json``, lowercased, or None.
+
+    For MLE this log (set by the agent after each prompt per CLAUDE.md) is the
+    preferred model record. A missing, unreadable, or malformed file yields
+    None instead of raising.
+    """
+    log_path = problem_dir / "solution_log.json"
+    try:
+        # The CLI seeds this file as UTF-8, so read it the same way rather
+        # than relying on the platform default encoding.
+        data = json.loads(log_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        return None
+
+    found = _model_field(data)
+    if found or not isinstance(data, dict):
+        return found
+    # Per-prompt records: "prompts" is the list the CLI seeds the log with,
+    # "entries" is tolerated for other writers. Walk newest-first so the most
+    # recent record that names a model is used.
+    for list_key in ("prompts", "entries"):
+        records = data.get(list_key)
+        if isinstance(records, list):
+            for record in reversed(records):
+                found = _model_field(record)
+                if found:
+                    return found
+    return None
+
+
+def _version(cmd: str) -> str | None:
+    """Return the last token of ``cmd --version``'s first line (max 32 chars), or None."""
+    if not shutil.which(cmd):
+        return None
+    try:
+        out = subprocess.check_output(
+            [cmd, "--version"],
+            text=True,
+            timeout=3,
+            stderr=subprocess.DEVNULL,
+        )
+        # Keep the last token (often "1.2.3") and cap length.
+        token = out.strip().splitlines()[0].split()[-1]
+        return token[:32]
+    except Exception:
+        return None
diff --git a/pyproject.toml b/pyproject.toml
index 9f749e9..a3c733b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "aicodinggym-cli"
-version = "0.5.1"
+version = "0.6.0"
 description = "CLI tool for AI Coding Gym platform"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -37,3 +37,7 @@ packages = ["aicodinggym"]
 
 [tool.setuptools.package-dir]
 aicodinggym = "."
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]