-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathsetup_codex.py
More file actions
240 lines (206 loc) Β· 9.04 KB
/
setup_codex.py
File metadata and controls
240 lines (206 loc) Β· 9.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#!/usr/bin/env python
"""Configure OpenAI Codex CLI with Databricks Model Serving.
Codex CLI is OpenAI's coding agent that uses OpenAI-compatible chat endpoints.
Databricks provides an OpenAI-compatible endpoint at /serving-endpoints/openai
or via the AI Gateway at /openai/v1.
Config: ~/.codex/config.toml with custom model_providers for Databricks.
Auth: Bearer token via DATABRICKS_TOKEN environment variable.
"""
import json
import os
import shutil
import subprocess
import time
from pathlib import Path

from utils import (
    adapt_instructions_file,
    ensure_https,
    get_gateway_host,
    get_npm_version,
    resolve_mlflow_experiment_id,
)
# Repair HOME when the runtime left it unset or pointing at "/" (as some
# containerised app hosts do) so that "~"-style paths resolve sensibly.
_home_env = os.environ.get("HOME")
if not _home_env or _home_env == "/":
    os.environ["HOME"] = "/app/python/source_code"
home = Path(os.environ["HOME"])

# Databricks connection details and the model Codex should default to.
host = os.environ.get("DATABRICKS_HOST", "")
token = os.environ.get("DATABRICKS_TOKEN", "")
codex_model = os.environ.get("CODEX_MODEL", "databricks-gpt-5-5")

# 1. Install Codex CLI into ~/.local/bin (always, even without token)
local_bin = home / ".local" / "bin"
local_bin.mkdir(parents=True, exist_ok=True)
codex_bin = local_bin / "codex"

# npm install retry policy.
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds
# Install @openai/codex globally under ~/.local via npm, retrying transient
# registry/network failures. Skipped when the binary is already present.
if not codex_bin.exists():
    npm_prefix = str(home / ".local")
    # Pin to a resolved version when the registry lookup succeeds; otherwise
    # fall back to the unpinned package spec ("latest").
    codex_version = get_npm_version("@openai/codex")
    codex_pkg = f"@openai/codex@{codex_version}" if codex_version else "@openai/codex"
    for attempt in range(1, MAX_RETRIES + 1):
        print(f"Installing {codex_pkg} (attempt {attempt}/{MAX_RETRIES})...")
        result = subprocess.run(
            ["npm", "install", "-g", f"--prefix={npm_prefix}", codex_pkg],
            capture_output=True,
            text=True,
            # Force the repaired HOME so npm writes caches under our prefix.
            env={**os.environ, "HOME": str(home)},
        )
        # Success requires both a zero exit code AND the binary on disk.
        if result.returncode == 0 and codex_bin.exists():
            print(f"Codex CLI installed to {codex_bin}")
            break
        stderr = result.stderr.strip()
        print(f"Codex CLI install failed (attempt {attempt}/{MAX_RETRIES}, rc={result.returncode})")
        if stderr:
            print(f" stderr: {stderr[:500]}")
        if result.stdout.strip():
            print(f" stdout: {result.stdout.strip()[:500]}")
        if attempt < MAX_RETRIES:
            print(f" Retrying in {RETRY_DELAY}s...")
            time.sleep(RETRY_DELAY)
        else:
            # Best-effort: report the manual recovery command but keep going,
            # so the rest of the setup (config, skills) still happens.
            print(f"ERROR: Codex CLI installation failed after {MAX_RETRIES} attempts. "
                  f"Run manually: npm install -g --prefix=$HOME/.local @openai/codex")
else:
    print(f"Codex CLI already installed at {codex_bin}")
# 2. Without a host/token we cannot write auth config yet; the install above
# already ran, and config will be generated after PAT setup. Use SystemExit
# rather than exit(): exit() is injected by the `site` module and is not
# guaranteed under `python -S` or in frozen/embedded interpreters.
if not host or not token:
    print("Codex CLI installed — config will be set after PAT setup")
    raise SystemExit(0)

# Normalize the workspace URL: strip trailing slash, ensure https:// prefix.
host = ensure_https(host.rstrip("/"))

# Prefer the AI Gateway endpoint when one is resolvable; it speaks the
# OpenAI /openai/v1 wire format. Otherwise fall back to /serving-endpoints.
gateway_host = get_gateway_host()
gateway_token = os.environ.get("DATABRICKS_TOKEN", "") if gateway_host else ""
if gateway_host and not gateway_token:
    print("Warning: AI Gateway resolved but DATABRICKS_TOKEN missing, falling back to DATABRICKS_HOST")
    gateway_host = ""
if gateway_host:
    codex_base_url = f"{gateway_host}/openai/v1"
    auth_token = gateway_token
    print(f"Using Databricks AI Gateway: {gateway_host}")
else:
    codex_base_url = f"{host}/serving-endpoints"
    auth_token = token
    print(f"Using Databricks Host: {host}")
# 3. Prepare ~/.codex and stage the bundled model catalog next to config.toml
# (codex resolves relative catalog paths against CODEX_HOME).
codex_dir = home / ".codex"
codex_dir.mkdir(exist_ok=True)

catalog_src = Path(__file__).parent / ".codex" / "databricks-models.json"
catalog_dst = codex_dir / "databricks-models.json"
# Copy only when the bundled catalog exists and is not already the same file
# (HOME may equal the repo root, making src and dst identical).
if catalog_src.exists() and catalog_src.resolve() != catalog_dst.resolve():
    shutil.copyfile(catalog_src, catalog_dst)
    print(f"Codex model catalog copied: {catalog_dst}")

# Optional: MLflow tracing notify hook (one switch enables Claude + Codex)
tracing_enabled = os.environ.get("MLFLOW_TRACING_ENABLED", "false").lower() == "true"
notify_line = 'notify = ["mlflow-codex", "notify-hook"]\n' if tracing_enabled else ""
# Render the Codex CLI TOML config: active model + a custom Databricks
# model_provider pointing at the OpenAI-compatible endpoint chosen above.
toml_path = codex_dir / "config.toml"
toml_body = f"""# Databricks Model Serving Configuration for Codex CLI
# Generated by setup_codex.py
# Active model and provider
model = "{codex_model}"
model_provider = "databricks"
model_catalog_json = "databricks-models.json"
# Disable web_search - not supported by Databricks Responses API
web_search = "disabled"
{notify_line}
# Databricks custom provider
[model_providers.databricks]
name = "Databricks Model Serving"
base_url = "{codex_base_url}"
env_key = "OPENAI_API_KEY"
wire_api = "responses"
"""
toml_path.write_text(toml_body)
print(f"Codex CLI configured: {toml_path}")
# 4. Persist OPENAI_API_KEY for Codex (config.toml's env_key points at it).
# The app process exports it in the environment too; the .env file is a
# backup for interactive shells.
env_lines = [
    "# Databricks token for Codex CLI (OpenAI-compatible endpoint)",
    f"OPENAI_API_KEY={auth_token}",
]

# MLflow tracing env vars (read by @mlflow/codex notify hook)
app_owner = os.environ.get("APP_OWNER", "")
app_name = os.environ.get("DATABRICKS_APP_NAME", "coding-agents")
experiment_name = f"/Users/{app_owner}/{app_name}" if app_owner else ""
if tracing_enabled and experiment_name:
    experiment_id = resolve_mlflow_experiment_id(host, token, experiment_name)
    # Install @mlflow/codex (provides the `mlflow-codex` binary used by the notify hook)
    mlflow_codex_bin = local_bin / "mlflow-codex"
    if not mlflow_codex_bin.exists():
        npm_prefix = str(home / ".local")
        print("Installing @mlflow/codex for MLflow tracing...")
        result = subprocess.run(
            ["npm", "install", "-g", f"--prefix={npm_prefix}", "@mlflow/codex"],
            capture_output=True, text=True,
            env={**os.environ, "HOME": str(home)},
        )
        if result.returncode == 0:
            print(f"@mlflow/codex installed to {mlflow_codex_bin}")
        else:
            print(f"WARNING: @mlflow/codex install failed (rc={result.returncode}): {result.stderr.strip()[:300]}")
    # Pass MLflow connection details via env (override mlflow-tracing.json).
    # DATABRICKS_HOST/TOKEN are inherited from the app process — no need to
    # re-write them here (and that avoids stale-token bugs on PAT rotation).
    env_lines.extend([
        "",
        "# MLflow tracing (enabled by MLFLOW_TRACING_ENABLED=true)",
        "MLFLOW_TRACKING_URI=databricks",
    ])
    if experiment_id:
        env_lines.append(f"MLFLOW_EXPERIMENT_ID={experiment_id}")
    else:
        env_lines.append(f"MLFLOW_EXPERIMENT_NAME={experiment_name}")
    # mlflow-tracing.json as a fallback for the notify hook when env isn't loaded
    tracing_cfg = {"trackingUri": "databricks"}
    if experiment_id:
        tracing_cfg["experimentId"] = experiment_id
    (codex_dir / "mlflow-tracing.json").write_text(json.dumps(tracing_cfg, indent=2))
    print(f"Codex MLflow tracing configured: experiment_id={experiment_id or 'unresolved'}")

env_path = codex_dir / ".env"
# Create the file with 0o600 from the start so the token is never exposed
# through a default-umask file, even briefly (write_text-then-chmod leaves
# a window where the file can be world-readable).
fd = os.open(env_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "w") as fh:
    fh.write("\n".join(env_lines) + "\n")
env_path.chmod(0o600)  # tighten perms if the file pre-existed with looser mode
print(f"Codex CLI env configured: {env_path}")
# 5. Mirror Claude skills into ~/.agents/skills/ where Codex discovers them.
# Codex searches `$HOME/.agents/skills/` plus `.agents/skills/` walking up
# from cwd; both resolve to the same path on the deployed app since
# HOME == repo root, and the user-level lookup also covers local dev.
claude_skills_dir = home / ".claude" / "skills"
codex_skills_dir = home / ".agents" / "skills"
if not claude_skills_dir.exists():
    print(f"No Claude skills found at {claude_skills_dir}, skipping copy")
else:
    codex_skills_dir.parent.mkdir(exist_ok=True)
    # Replace any stale copy wholesale so removed skills don't linger.
    if codex_skills_dir.exists():
        shutil.rmtree(codex_skills_dir)
    shutil.copytree(claude_skills_dir, codex_skills_dir)
    print(f"Skills copied: {claude_skills_dir} -> {codex_skills_dir}")
# 6. Adapt CLAUDE.md to AGENTS.md for Codex, searching the usual spots in
# priority order.
claude_md_locations = [
    Path(__file__).parent / "CLAUDE.md",  # Same directory as setup script
    home / ".claude" / "CLAUDE.md",  # User's Claude config
    Path("/app/python/source_code/CLAUDE.md"),  # Databricks App location
]
# First existing candidate wins; None when no copy is found anywhere.
claude_md_path = next((p for p in claude_md_locations if p.exists()), None)
agents_path = codex_dir / "AGENTS.md"
adapt_instructions_file(
    source_path=claude_md_path or claude_md_locations[0],
    target_path=agents_path,
    new_header="# Codex Agent Instructions",
    cli_name="Codex",
)
# Operator-facing summary: how to launch Codex and what it is wired to.
summary = (
    "\nCodex CLI ready! Usage:\n"
    " codex # Start Codex CLI\n"
    " codex 'explain this codebase' # Run with prompt\n"
    f"\nEndpoint: {codex_base_url}\n"
    f"Model: {codex_model}\n"
    "Auth: Bearer token (Databricks PAT via OPENAI_API_KEY)"
)
print(summary)