diff --git a/docs/en/kubeflow/how_to/mlflow-python-sdk.mdx b/docs/en/kubeflow/how_to/mlflow-python-sdk.mdx new file mode 100644 index 0000000..bcceb67 --- /dev/null +++ b/docs/en/kubeflow/how_to/mlflow-python-sdk.mdx @@ -0,0 +1,216 @@ +--- +weight: 46 +--- + +# Using the MLflow Python SDK with Authentication and RBAC + +On Alauda AI the [MLflow Tracking Server](./mlflow.mdx) runs behind single sign-on and multi-tenancy: an OAuth proxy authenticates every caller, and the server records each run under the calling user and authorizes it against Kubernetes RBAC. This guide drives the stock **MLflow Python SDK** through that OAuth proxy with your own identity, **browser-free**, using the OAuth2 **authorization code** flow (with PKCE) scripted against the platform login — no password grant, and never the MLflow container port. + +There are two browser-free ways to present your identity; pick one: + +- **Bearer token (recommended).** Obtain a Dex **id token** from the CLI or Python and pass it as `MLFLOW_TRACKING_TOKEN`; renew it with the refresh token. Needs one platform setting ([below](#platform-setup)). +- **Session cookie (no platform changes).** Drive the proxy's own login to obtain its `_oauth2_proxy` cookie and attach it to requests. Works on any install as-is ([below](#cookie-method)). + +## How authentication works + +Two layers sit in front of your runs: + +1. The **OAuth proxy** (`oauth2-proxy`) authenticates the request — either a Dex **id token** sent as `Authorization: Bearer …` (token method) or its `_oauth2_proxy` **session cookie** (cookie method). +2. The MLflow server's `kubernetes-auth` plugin reads your identity from that credential, records it as the run **owner**, and authorizes it against your Kubernetes permissions in the workspace. + +The client always goes through the OAuth proxy — never connect to the MLflow container port directly. + +## Prerequisites + +- `mlflow` **3.10 or later** (`pip install "mlflow>=3.10"`). 
Workspace selection (`mlflow.set_workspace`) is a 3.10+ feature. The Python token helper also uses `requests` and `cryptography`. +- A platform **username and password** — ideally a dedicated service account, not a person's login — that can access the target workspace (see [Workspace Access](./mlflow.mdx)). +- The platform's **OAuth client id and secret** — the client the MLflow proxy uses (from your administrator). On Alauda this is the platform auth client, e.g. `alauda-auth`; its secret lives in a Kubernetes `Secret` (e.g. `cpaas-oidc-secret`). + +## Platform setup for the token method (administrator, one-time) \{#platform-setup} + +The bearer-token method needs the MLflow OAuth proxy to accept Dex id tokens. Add `--skip-jwt-bearer-tokens=true` to the **MLflow plugin** — this is the MLflow proxy on the workload cluster, **not** the platform's global auth server: + +```yaml +# MLflow plugin values +auth: + oauth: + extraArgs: + - --skip-jwt-bearer-tokens=true +``` + +No Dex or global-auth change is required: the login below uses the `authorization_code` grant the platform client already allows. The **cookie method** needs no setting at all — skip this section if you use it. + +## Get a token from the command line (browser-free) \{#get-a-token} + +The platform login is an SSO page, but its API supports the standard OAuth **authorization code** flow with PKCE, so you can complete it from a script — no browser redirect. The password is RSA-encrypted with the login service's public key (`/dex/pubkey`), exactly as the login page does it, then exchanged for an **id token** (and a **refresh token** for headless renewal). 
+ +### Python helper + +```python +import base64, hashlib, json, os, secrets +from urllib.parse import urlparse, parse_qs +import requests +from cryptography.hazmat.primitives.asymmetric import padding +from cryptography.hazmat.primitives.serialization import load_pem_public_key + +PLATFORM = os.environ["PLATFORM_ADDRESS"].rstrip("/") # https:// +CLIENT_ID = os.environ["DEX_CLIENT_ID"] # the MLflow proxy's client, e.g. alauda-auth +CLIENT_SECRET = os.environ["DEX_CLIENT_SECRET"] +USERNAME = os.environ["MLFLOW_USERNAME"] +PASSWORD = os.environ["MLFLOW_PASSWORD"] +REDIRECT_URI = f"{PLATFORM}/oauth2/callback" # any URI the client has registered +VERIFY_TLS = os.environ.get("PLATFORM_CA", False) # CA bundle path, or False to skip (lab only) + +s = requests.Session(); s.verify = VERIFY_TLS +_b64url = lambda b: base64.urlsafe_b64encode(b).rstrip(b"=").decode() + +def get_tokens() -> dict: + """Run the authorization-code + PKCE flow headlessly. Returns the Dex token response.""" + verifier = _b64url(secrets.token_bytes(48)) + challenge = _b64url(hashlib.sha256(verifier.encode()).digest()) + # 1) start the flow -> auth-request id + req = s.get(f"{PLATFORM}/dex/api/v1/authorize", params={ + "client_id": CLIENT_ID, "redirect_uri": REDIRECT_URI, "response_type": "code", + "scope": "openid email groups offline_access", "state": "cli", + "code_challenge": challenge, "code_challenge_method": "S256"}).json()["req"] + # 2) RSA-encrypt the password, then log in via the local connector -> auth code + pk = s.get(f"{PLATFORM}/dex/pubkey").json() # {"ts": ..., "pubkey": ""} + payload = json.dumps({"ts": pk["ts"], "password": PASSWORD}, separators=(",", ":")).encode() + enc = base64.b64encode(load_pem_public_key(pk["pubkey"].encode()).encrypt(payload, padding.PKCS1v15())).decode() + redirect = s.post(f"{PLATFORM}/dex/api/v1/authorize/local", params={"req": req}, + json={"account": USERNAME, "password": enc}).json()["redirect_url"] + code = parse_qs(urlparse(redirect).query)["code"][0] + 
# 3) exchange the code (with the PKCE verifier) -> id_token + refresh_token + return s.post(f"{PLATFORM}/dex/token", data={ + "grant_type": "authorization_code", "code": code, "redirect_uri": REDIRECT_URI, + "code_verifier": verifier, "client_id": CLIENT_ID, "client_secret": CLIENT_SECRET}).json() + +def refresh(refresh_token: str) -> str: + """Mint a fresh id token from a refresh token — no login, no browser.""" + return s.post(f"{PLATFORM}/dex/token", data={ + "grant_type": "refresh_token", "refresh_token": refresh_token, + "client_id": CLIENT_ID, "client_secret": CLIENT_SECRET, + "scope": "openid email groups"}).json()["id_token"] +``` + +### Shell equivalent (curl + openssl, no Python dependencies) + +```bash +PLATFORM=https://; CLIENT_ID=; CLIENT_SECRET= +USERNAME=''; PASSWORD=''; REDIRECT_URI="$PLATFORM/oauth2/callback" + +V=$(openssl rand -base64 48 | tr '+/' '-_' | tr -d '=' | cut -c1-64) # PKCE verifier +C=$(printf %s "$V" | openssl dgst -sha256 -binary | openssl base64 -A | tr '+/' '-_' | tr -d '=') +RU=$(jq -rn --arg u "$REDIRECT_URI" '$u|@uri'); SC=$(jq -rn '"openid email groups offline_access"|@uri') +REQ=$(curl -sk "$PLATFORM/dex/api/v1/authorize?client_id=$CLIENT_ID&redirect_uri=$RU&response_type=code&scope=$SC&state=cli&code_challenge=$C&code_challenge_method=S256" | jq -r .req) +PK=$(curl -sk "$PLATFORM/dex/pubkey"); TS=$(echo "$PK"|jq -r .ts); echo "$PK"|jq -r .pubkey >/tmp/dex_pub.pem +ENC=$(printf '{"ts":"%s","password":"%s"}' "$TS" "$PASSWORD" | openssl pkeyutl -encrypt -pubin -inkey /tmp/dex_pub.pem -pkeyopt rsa_padding_mode:pkcs1 | openssl base64 -A) +CODE=$(curl -sk -X POST "$PLATFORM/dex/api/v1/authorize/local?req=$REQ" -H 'Content-Type: application/json' \ + --data "$(jq -nc --arg a "$USERNAME" --arg p "$ENC" '{account:$a,password:$p}')" | jq -r .redirect_url | sed -E 's/.*code=([^&]+).*/\1/') +curl -sk "$PLATFORM/dex/token" -d grant_type=authorization_code -d code="$CODE" \ + --data-urlencode redirect_uri="$REDIRECT_URI" -d 
code_verifier="$V" \ + -d client_id="$CLIENT_ID" --data-urlencode client_secret="$CLIENT_SECRET" | jq -r .id_token +``` + +## Connect the SDK + +```python +import os, mlflow + +tok = get_tokens() +os.environ["MLFLOW_TRACKING_TOKEN"] = tok["id_token"].strip() # → Authorization: Bearer <id_token> +mlflow.set_tracking_uri("http://mlflow-tracking-server.kubeflow:5000") # in-cluster Service (fronted by the OAuth proxy) +mlflow.set_workspace("team-a") # workspace namespace → X-MLFLOW-WORKSPACE +mlflow.set_experiment("my-experiment") + +with mlflow.start_run(run_name="sdk-quickstart") as run: + mlflow.log_param("learning_rate", 2e-4) + mlflow.log_metric("loss", 0.123) + print("run:", run.info.run_id) +``` + +The run appears under **Alauda AI → Tools → MLFlow**, owned by the user you authenticated as. (Verified end-to-end on a secured install: the run owner is the token's user identity.) + +Use the in-cluster Service URL `http://mlflow-tracking-server.kubeflow:5000` when the client runs **inside** the cluster (pipeline components, Workbench notebooks). From **outside** the cluster, point at the platform route `https://<platform-address>/clusters/<cluster>/mlflow` instead — both reach the same OAuth proxy (set `MLFLOW_TRACKING_INSECURE_TLS=true` if the platform certificate is not trusted by your machine). + +:::warning +Use a **dedicated service-account user** and keep its credentials and the client secret in a Kubernetes `Secret`, never in code. Always `.strip()` the token (a trailing newline produces `Invalid … character(s) in header value: 'Bearer …\n'`). id tokens expire (24 h by default); for long-running jobs renew with `refresh(tok["refresh_token"])` instead of logging in again. +::: + +## Selecting a workspace + +Runs are recorded in the workspace you select; if you select none, the server's default workspace is used. Any of these set it (the SDK turns them into the `X-MLFLOW-WORKSPACE` header): + +- `mlflow.set_workspace("team-a")` in code, +- the `MLFLOW_WORKSPACE=team-a` environment variable. 
+ +You can only use a workspace your account has access to; see [Workspace Access](./mlflow.mdx). + +## Registering models + +The model registry is workspace-scoped and authorized the same way, so the usual SDK calls work once connected: + +```python +mlflow.set_workspace("team-a") +with mlflow.start_run(): + mlflow.sklearn.log_model(sk_model, name="model", registered_model_name="fraud-detector") +``` + +Promote the registered version to **Staging** or **Production** from the MLflow UI. + +## Alternative: session cookie (no platform changes) \{#cookie-method} + +If you cannot enable `--skip-jwt-bearer-tokens`, drive the proxy's own login flow to obtain its `_oauth2_proxy` cookie and attach it to requests — this works on any install unchanged. The proxy starts the OAuth flow for you (its own PKCE and `redirect_uri`); you just replay that through the same scripted login and hand the code back to the proxy callback: + +```bash +PLATFORM=https://; CLUSTER= +USERNAME=''; PASSWORD='' +JAR=$(mktemp) +# 1) start the MLflow proxy login -> the Dex auth query it wants +LOC=$(curl -sk -c "$JAR" -D - -o /dev/null "$PLATFORM/clusters/$CLUSTER/mlflow/" \ + | awk 'BEGIN{IGNORECASE=1}/^location:/{print $2}' | tr -d '\r') +QS=${LOC#*\?} +# 2) authorize -> req, then 3) scripted local login -> the proxy callback URL +REQ=$(curl -sk -b "$JAR" -c "$JAR" "$PLATFORM/dex/api/v1/authorize?$QS" | jq -r .req) +PK=$(curl -sk "$PLATFORM/dex/pubkey"); TS=$(echo "$PK"|jq -r .ts); echo "$PK"|jq -r .pubkey >/tmp/dex_pub.pem +ENC=$(printf '{"ts":"%s","password":"%s"}' "$TS" "$PASSWORD" | openssl pkeyutl -encrypt -pubin -inkey /tmp/dex_pub.pem -pkeyopt rsa_padding_mode:pkcs1 | openssl base64 -A) +CB=$(curl -sk -b "$JAR" -c "$JAR" -X POST "$PLATFORM/dex/api/v1/authorize/local?req=$REQ" -H 'Content-Type: application/json' \ + --data "$(jq -nc --arg a "$USERNAME" --arg p "$ENC" '{account:$a,password:$p}')" | jq -r .redirect_url) +# 4) the proxy callback exchanges the code and sets the _oauth2_proxy 
cookie +curl -sk -b "$JAR" -c "$JAR" -o /dev/null "$CB" +COOKIE=$(awk -F'\t' '$6 ~ /^_oauth2_proxy/{printf "%s=%s; ",$6,$7}' "$JAR" | sed 's/; $//') # includes any _oauth2_proxy_N chunks +echo "$COOKIE" +``` + +Then attach the cookie with a header provider (the cookie carries your identity — no token, no platform setting): + +```python +import os, mlflow +from mlflow.tracking.request_header.abstract_request_header_provider import RequestHeaderProvider +from mlflow.tracking.request_header.registry import _request_header_provider_registry + +class ProxySessionHeader(RequestHeaderProvider): + def in_context(self): + return bool(os.environ.get("MLFLOW_PROXY_COOKIE")) # export MLFLOW_PROXY_COOKIE='_oauth2_proxy=' + def request_headers(self): + return {"Cookie": os.environ["MLFLOW_PROXY_COOKIE"]} + +_request_header_provider_registry.register(ProxySessionHeader) +mlflow.set_tracking_uri("https:///clusters//mlflow") +mlflow.set_workspace("team-a") +``` + +You can also copy the `_oauth2_proxy` cookie from a browser session (DevTools → **Application/Storage → Cookies**). The session cookie expires — re-mint it when calls start returning a login redirect. + +## Troubleshooting + +| Symptom | Check | +|---------|-------| +| `/dex/api/v1/authorize` returns `PKCE code_challenge is required` | The client enforces PKCE. Send `code_challenge` and `code_challenge_method=S256` (the helper does this). | +| Local login returns a captcha challenge / `CaptchaError` | Too many recent failed logins triggered the retry-captcha. Wait, fix the credentials, then retry — a clean first login needs no captcha. | +| `/dex/token` returns `invalid_grant` | The auth code or PKCE verifier is stale or reused. Re-run the flow from the start (`authorize` → login → token); codes are single-use. 
| +| Call returns HTML or a redirect (`302` to the login page) | **Token method:** the proxy rejected the bearer token — confirm `--skip-jwt-bearer-tokens` is enabled and the token is a valid Dex id token (`aud` = the proxy's client). **Cookie method:** the `_oauth2_proxy` cookie is missing or expired. | +| `Invalid … character(s) in header value: 'Bearer …\n'` | The token has trailing whitespace. Set `MLFLOW_TRACKING_TOKEN` to the `.strip()`-ed value. | +| `Failed to query /api/3.0/mlflow/server-info` | The SDK could not reach the server through the proxy — verify the tracking URI and that the token/cookie is valid. | +| `403 PERMISSION_DENIED` | Your account lacks access to the workspace namespace. Request access to the workspace (see [Workspace Access](./mlflow.mdx)); no ServiceAccount is involved. | +| Run shows the wrong owner or workspace | The owner is your authenticated identity; the workspace is `set_workspace()` / `MLFLOW_WORKSPACE` (else the server default). Check both. | diff --git a/docs/en/kubeflow/how_to/mlflow.mdx b/docs/en/kubeflow/how_to/mlflow.mdx index eaf2e76..d7580db 100644 --- a/docs/en/kubeflow/how_to/mlflow.mdx +++ b/docs/en/kubeflow/how_to/mlflow.mdx @@ -69,6 +69,8 @@ subjects: ## Client Configuration +For authenticating the MLflow Python SDK with a user identity token — including the in-cluster connection details and RBAC — see [Using the MLflow Python SDK with Authentication and RBAC](./mlflow-python-sdk.mdx). 
+ Set the MLflow tracking URI to the platform route and select the workspace: ```python diff --git a/docs/en/training_guides/fine-tune-with-trainer-v2.ipynb b/docs/en/training_guides/fine-tune-with-trainer-v2.ipynb index e904165..10fb95d 100644 --- a/docs/en/training_guides/fine-tune-with-trainer-v2.ipynb +++ b/docs/en/training_guides/fine-tune-with-trainer-v2.ipynb @@ -947,15 +947,7 @@ "cell_type": "markdown", "id": "27d2b476", "metadata": {}, - "source": [ - "## Step 5: View Training Metrics in MLflow\n", - "\n", - "If `MLFLOW_TRACKING_URI` is set and the MLflow server is reachable from the training pod, LlamaFactory will log metrics (loss, learning rate, etc.) to MLflow automatically via `report_to: mlflow` in the training config.\n", - "\n", - "To open the MLflow UI, go to **Alauda AI** - **Tools** - **MLFlow** (need MLFlow Cluster plugin installed). Look for the experiment named by `MLFLOW_EXPERIMENT_NAME`.\n", - "\n", - "Each `TrainJob` run will appear as a separate MLflow **run** under the same experiment, making it easy to compare training curves across different models and hyperparameters." - ] + "source": "## Step 5: View Training Metrics in MLflow\n\nIf `MLFLOW_TRACKING_URI` is set and the MLflow server is reachable from the training pod, LlamaFactory will log metrics (loss, learning rate, etc.) to MLflow automatically via `report_to: mlflow` in the training config.\n\nOn a secured (SSO + multi-tenant) MLflow install the trainer must also authenticate — set `MLFLOW_TRACKING_TOKEN` and select a workspace. See [Using the MLflow Python SDK with Authentication and RBAC](../kubeflow/how_to/mlflow-python-sdk.mdx) for how to obtain the token and how authorization/RBAC work.\n\nTo open the MLflow UI, go to **Alauda AI** - **Tools** - **MLFlow** (need MLFlow Cluster plugin installed). 
Look for the experiment named by `MLFLOW_EXPERIMENT_NAME`.\n\nEach `TrainJob` run will appear as a separate MLflow **run** under the same experiment, making it easy to compare training curves across different models and hyperparameters." }, { "cell_type": "markdown", @@ -1060,4 +1052,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/en/training_guides/fine-tuning-using-notebooks.mdx b/docs/en/training_guides/fine-tuning-using-notebooks.mdx index a2bba02..93d7653 100644 --- a/docs/en/training_guides/fine-tuning-using-notebooks.mdx +++ b/docs/en/training_guides/fine-tuning-using-notebooks.mdx @@ -325,7 +325,9 @@ After success the merged model is pushed to a date-stamped branch (`sft-YYYYMMDD ## 8. Experiment tracking -Setting `report_to: mlflow` in the LLaMA-Factory config plus the `MLFLOW_TRACKING_URI` / `MLFLOW_EXPERIMENT_NAME` env vars routes metrics to MLflow. Find runs in **Alauda AI → Advanced → MLFlow**, compare loss curves, and pin the winning run. +Setting `report_to: mlflow` in the LLaMA-Factory config plus the `MLFLOW_TRACKING_URI` / `MLFLOW_EXPERIMENT_NAME` env vars routes metrics to MLflow. Find runs in **Alauda AI → Tools → MLFlow**, compare loss curves, and pin the winning run. + +On a secured (SSO + multi-tenant) MLflow install the job must also authenticate — supply an `MLFLOW_TRACKING_TOKEN` and select a workspace. See [Using the MLflow Python SDK with Authentication and RBAC](../kubeflow/how_to/mlflow-python-sdk.mdx) for how to obtain the token and configure the client. ## 9. Publish the fine-tuned model @@ -412,4 +414,4 @@ spec: ### Experiment tracking on other devices -LLaMA-Factory and Transformers integrate with MLflow / wandb directly. Set the destination in the framework config (e.g. `report_to: mlflow` for LLaMA-Factory) and supply `MLFLOW_TRACKING_URI` and `MLFLOW_EXPERIMENT_NAME` env vars. View results under **Alauda AI → Advanced → MLFlow**. 
+LLaMA-Factory and Transformers integrate with MLflow / wandb directly. Set the destination in the framework config (e.g. `report_to: mlflow` for LLaMA-Factory) and supply `MLFLOW_TRACKING_URI` and `MLFLOW_EXPERIMENT_NAME` env vars (plus `MLFLOW_TRACKING_TOKEN` on a secured install — see [Using the MLflow Python SDK with Authentication and RBAC](../kubeflow/how_to/mlflow-python-sdk.mdx)). View results under **Alauda AI → Tools → MLFlow**. diff --git a/docs/en/training_guides/pipelines-mlflow-integration.mdx b/docs/en/training_guides/pipelines-mlflow-integration.mdx new file mode 100644 index 0000000..65b1a55 --- /dev/null +++ b/docs/en/training_guides/pipelines-mlflow-integration.mdx @@ -0,0 +1,183 @@ +--- +weight: 55 +--- + +# Kubeflow Pipeline + MLflow Integration + +This guide shows how Kubeflow Pipelines (KFP) components log parameters, metrics, and models to [MLflow on Kubeflow](../kubeflow/how_to/mlflow.mdx) with the **MLflow Python client**. Authentication and workspace/RBAC follow [Using the MLflow Python SDK with Authentication and RBAC](../kubeflow/how_to/mlflow-python-sdk.mdx) — each component authenticates with a user identity token and the server records the run under that user. + +## Scope + +- Alauda AI 2.5 and later. +- Kubeflow Pipelines and the MLflow cluster plugin are installed. +- The MLflow workspace is a namespace labelled `mlflow-enabled=true`. +- For the bearer-token method, the MLflow OAuth proxy must accept Dex id tokens (`--skip-jwt-bearer-tokens`) — see [Platform setup](../kubeflow/how_to/mlflow-python-sdk.mdx#platform-setup) in the SDK guide. No global-auth change is needed, and the cookie method needs no setup at all. + +## Prerequisites + +- `kfp` and `kfp-kubernetes` Python SDKs (`pip install kfp kfp-kubernetes`). +- Access to a KFP endpoint (see [Use Kubeflow Pipelines](../kubeflow/how_to/pipelines.mdx)). 
+- A **Dex id token** for a dedicated service account, minted with the OAuth2 authorization code flow with PKCE (see the [SDK guide](../kubeflow/how_to/mlflow-python-sdk.mdx)). Store it in a Kubernetes `Secret` and inject it into the component. +- An MLflow workspace (a namespace with `mlflow-enabled=true`) the account can access. + +## How components reach MLflow + +A pipeline component runs **inside** the cluster, so it talks to MLflow through the in-cluster Service `http://mlflow-tracking-server.kubeflow:5000` (which is fronted by the OAuth proxy — components never use the MLflow container port directly). It authenticates exactly like any other MLflow client: + +- `MLFLOW_TRACKING_TOKEN` — a Dex id token; the MLflow client sends it as `Authorization: Bearer …`. +- `mlflow.set_workspace(...)` — selects the workspace (`X-MLFLOW-WORKSPACE`). + +The server reads the identity from the token and records the run under that user. See the [SDK guide](../kubeflow/how_to/mlflow-python-sdk.mdx) for how the token is obtained and how authorization works. + +## Complete example: training pipeline with MLflow + +The component uses the MLflow client and reads `MLFLOW_TRACKING_TOKEN` from a `Secret` injected with [`kfp-kubernetes`](https://kubeflow-pipelines.readthedocs.io/en/stable/source/kubernetes.html). KFP v2 packages each component from its own source, so `import mlflow` lives **inside** the function. 
+ +```python +from kfp import dsl, compiler +from kfp import kubernetes + + +@dsl.component(base_image="python:3.11-slim", packages_to_install=["mlflow>=3.10"]) +def train_model( + workspace: str, + model_name: str, + learning_rate: float, + epochs: int, + run_id: str, +) -> dict: + """Simulated training component that logs to MLflow as the calling user.""" + import mlflow # MLFLOW_TRACKING_TOKEN is injected from a Secret (see the pipeline below) + + mlflow.set_tracking_uri("http://mlflow-tracking-server.kubeflow:5000") # in-cluster Service, via the OAuth proxy + mlflow.set_workspace(workspace) + mlflow.set_experiment("kfp-training-experiment") + + metrics = {} + with mlflow.start_run(run_name=f"run-{run_id}"): + mlflow.log_param("model_name", model_name) + mlflow.log_param("learning_rate", learning_rate) + mlflow.log_param("epochs", epochs) + for epoch in range(1, epochs + 1): + loss = 2.0 * (0.95 ** epoch) + accuracy = 1.0 - loss + mlflow.log_metric("loss", loss, step=epoch) + mlflow.log_metric("accuracy", accuracy, step=epoch) + metrics = {"final_loss": loss, "final_accuracy": accuracy} + + print("logged run:", mlflow.last_active_run().info.run_id) + return metrics + + +@dsl.pipeline(name="mlflow-training-pipeline", description="Train with MLflow tracking") +def training_pipeline( + workspace: str = "team-a", + model_name: str = "qwen3-0.6b", + learning_rate: float = 2e-4, + epochs: int = 10, +): + task = train_model( + workspace=workspace, + model_name=model_name, + learning_rate=learning_rate, + epochs=epochs, + # PIPELINE_JOB_ID_PLACEHOLDER resolves to the run's job id at runtime; + # pass it in as an argument (a component cannot reference dsl.* itself). + run_id=dsl.PIPELINE_JOB_ID_PLACEHOLDER, + ) + # Inject the Dex id token from a Secret as MLFLOW_TRACKING_TOKEN. 
+ kubernetes.use_secret_as_env( + task, secret_name="mlflow-token", secret_key_to_env={"token": "MLFLOW_TRACKING_TOKEN"} + ) + + +compiler.Compiler().compile(training_pipeline, "pipeline.yaml") +``` + +Create the `mlflow-token` Secret with a Dex id token. Mint `ID_TOKEN` browser-free with the authorization-code flow from the SDK guide — see [Get a token from the command line](../kubeflow/how_to/mlflow-python-sdk.mdx#get-a-token): + +```bash +# ID_TOKEN: mint it with the curl/Python flow in the SDK guide (browser-free, current grants) +kubectl -n <namespace> create secret generic mlflow-token --from-literal=token="$ID_TOKEN" +``` + +:::warning +id tokens expire (24 h by default), so refresh the `mlflow-token` Secret before submitting long pipelines — or mint the token **inside** the component from service-account credentials kept in a Secret (the [token flow](../kubeflow/how_to/mlflow-python-sdk.mdx#get-a-token) in the SDK guide) and renew it with the refresh token, so each run gets a fresh token. +::: + +## Upload and run + +### Via the KFP UI + +1. Go to **Kubeflow Dashboard → Pipelines → Upload Pipeline** and select `pipeline.yaml`. +2. Click **Create Run** and fill in the parameters (workspace, model name, epochs). +3. After the run starts, check the MLflow UI under **Alauda AI → Tools → MLFlow** — the run owner is the token's user. + +### Via the KFP SDK + +```python +from kfp.client import Client + +client = Client(host="<kfp-endpoint>") +run = client.create_run_from_pipeline_package( + "pipeline.yaml", + arguments=dict(workspace="team-a", model_name="qwen3-0.6b", epochs=10), +) +print(f"Run ID: {run.run_id}") +``` + +## Using MLflow in Trainer v2 pipelines + +If you fine-tune with [Kubeflow Trainer v2](./fine-tune-with-trainer-v2.mdx), the framework's MLflow integration (for example `report_to: mlflow` in LLaMA-Factory) authenticates the same way. Trainer v2 uses `apiVersion: trainer.kubeflow.org/v1alpha1`, `kind: TrainJob`, and a `spec.runtimeRef` + `spec.trainer` shape. 
Point it at the in-cluster Service and inject the id token from a `Secret`: + +```yaml +apiVersion: trainer.kubeflow.org/v1alpha1 +kind: TrainJob +metadata: + name: mlflow-finetune +spec: + runtimeRef: + name: torch-distributed # a TrainingRuntime / ClusterTrainingRuntime + trainer: + image: alaudadockerhub/fine_tune_with_llamafactory:v0.1.1 + env: + - name: MLFLOW_TRACKING_URI + value: "http://mlflow-tracking-server.kubeflow:5000" + - name: MLFLOW_EXPERIMENT_NAME + value: "trainer-v2-finetune" + - name: MLFLOW_TRACKING_TOKEN + valueFrom: + secretKeyRef: + name: mlflow-token # a Secret holding a Dex id token + key: token +``` + +See [Fine-tuning LLMs using Workbench](./fine-tuning-using-notebooks.mdx) for a full Trainer v2 + MLflow example. + +## Best practices + +### Use the pipeline job ID in MLflow + +KFP v2 provides `dsl.PIPELINE_JOB_ID_PLACEHOLDER` (the v1 `dsl.RUN_ID_PLACEHOLDER` was removed). It is a pipeline-level placeholder, so pass it into the component as an argument — a component cannot reference `dsl.*` from inside its own body. Use the received string in the run name to keep runs distinct per pipeline execution. + +### Keep credentials in a Secret and refresh tokens + +Never hardcode the token or service-account credentials in `pipeline.yaml` — compiled pipelines are stored and shared. Inject them from a `Secret`, and refresh the id token (or mint it inside the component) before it expires. + +### Log metrics inside a run + +Each metric belongs to a `mlflow.start_run()` block. If a component has multiple logical stages, open a run per stage rather than logging outside a run context. + +### Artifact storage for production + +Logging large model artifacts requires durable object storage. Configure S3-compatible storage in the MLflow plugin settings (see [MLflow Tracking Server](../kubeflow/how_to/mlflow.mdx) → High Availability And Storage) so artifact uploads do not hit pod disk limits. 
+ +## Troubleshooting + +| Symptom | Check | +|---------|-------| +| Component fails with an HTML/redirect (`302`) response | The OAuth proxy rejected the token. Confirm the proxy has `--skip-jwt-bearer-tokens` and `MLFLOW_TRACKING_TOKEN` is a valid Dex id token (see the [SDK guide](../kubeflow/how_to/mlflow-python-sdk.mdx)). | +| `401 UNAUTHENTICATED` | `MLFLOW_TRACKING_TOKEN` is unset, empty, or expired — refresh the `mlflow-token` Secret. | +| `403 PERMISSION_DENIED` | The token's user lacks access to the workspace namespace. Grant access to the MLflow workspace (see [Workspace Access](../kubeflow/how_to/mlflow.mdx)); no ServiceAccount is involved. | +| Run shows up under the wrong owner / workspace | The owner is the token's identity; the workspace is `set_workspace()` (else the server default). Check both. | +| MLflow metrics not appearing in KFP UI | KFP and MLflow are separate systems. Metrics logged to MLflow appear in the MLflow UI (**Alauda AI → Tools → MLFlow**), not in the KFP run output. | diff --git a/e2e/lib.sh b/e2e/lib.sh index 16a8d42..435887a 100644 --- a/e2e/lib.sh +++ b/e2e/lib.sh @@ -161,14 +161,14 @@ _retry_kubectl_stdin() { local kfn="$1" verb="$2"; shift 2 local data data="$(cat)" - local attempts=0 max=20 delay=30 rc out + local attempts=0 max=20 delay=120 rc out while [ "${attempts}" -lt "${max}" ]; do if out="$(printf '%s' "${data}" | $kfn "${verb}" -f - "$@" 2>&1)"; then printf '%s' "${out}" return 0 fi rc=$? - if ! echo "${out}" | grep -qE 'failed calling webhook|x509|connection refused|EOF|context deadline exceeded|webhook.* connect: connection refused'; then + if ! 
echo "${out}" | grep -qE 'failed calling webhook|x509|connection refused|EOF|context deadline exceeded|webhook.* connect: connection refused|failed to download openapi|openapi'; then printf '%s\n' "${out}" >&2 return "${rc}" fi @@ -181,7 +181,7 @@ _retry_kubectl_stdin() { } retry_create() { _retry_kubectl_stdin "$1" create "${@:2}"; } -retry_apply() { _retry_kubectl_stdin "$1" apply "${@:2}"; } +retry_apply() { _retry_kubectl_stdin "$1" apply "${@:2}"; } # Locate a TrainJob's pod. Trainer v2 builds a JobSet named after the TrainJob, # with one Job per `replicatedJobs[*]` named `${trainjob}--0`. The first diff --git a/e2e/mlflow-user-identity-smoke.sh b/e2e/mlflow-user-identity-smoke.sh new file mode 100755 index 0000000..721e4b6 --- /dev/null +++ b/e2e/mlflow-user-identity-smoke.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash +# Smoke test: log to MLflow as a real user, THROUGH the OAuth proxy — browser-free. +# +# Drives the platform's standard OAuth **authorization code** flow (with PKCE) +# from the shell: it starts the flow, logs in via the local connector with an +# RSA-encrypted password (exactly as the login page does), and gets back an auth +# code. From that code it derives, and exercises, both documented credentials: +# +# 1. Bearer token — exchange the code for a Dex id token, send it as +# Authorization: Bearer (needs --skip-jwt-bearer-tokens on +# the MLflow proxy; the test SKIPs this leg if it is off). +# 2. Session cookie — hand the code to the MLflow proxy callback to obtain the +# _oauth2_proxy cookie (works with no platform changes). +# +# Each leg logs a run over the platform route (i.e. through oauth2-proxy, never +# the container port) and asserts the run owner equals the caller's identity. +# No ROPC/password grant, no ServiceAccount, no direct container-port access. +# +# Required env: +# PLATFORM_ADDRESS e.g. https://192.168.142.163 +# CLUSTER e.g. 
g1-c1-x86 +# MLFLOW_USERNAME platform username (ideally a dedicated service account) +# MLFLOW_PASSWORD that user's password +# Optional env: +# DEX_CLIENT_ID OAuth client id (enables the bearer-token leg; default: alauda-auth) +# DEX_CLIENT_SECRET that client's secret (enables the bearer-token leg) +# MLFLOW_WORKSPACE target workspace namespace (default: mlops-demo-e2e) +set -euo pipefail + +: "${PLATFORM_ADDRESS:?set PLATFORM_ADDRESS, e.g. https://192.168.142.163}" +: "${CLUSTER:?set CLUSTER, e.g. g1-c1-x86}" +: "${MLFLOW_USERNAME:?set MLFLOW_USERNAME}" +: "${MLFLOW_PASSWORD:?set MLFLOW_PASSWORD}" +DEX_CLIENT_ID="${DEX_CLIENT_ID:-alauda-auth}" +WORKSPACE="${MLFLOW_WORKSPACE:-mlops-demo-e2e}" +P="${PLATFORM_ADDRESS%/}" +REDIRECT_URI="$P/oauth2/callback" # any URI the client has registered +BASE="$P/clusters/${CLUSTER}/mlflow/api/2.0/mlflow" + +TMP="$(mktemp -d)" +CLEAN_HDR=(); CLEAN_EID=() # parallel arrays of (auth header, experiment id) to delete on exit +cleanup() { + local i + for i in "${!CLEAN_EID[@]}"; do + curl -fsSk -H "${CLEAN_HDR[$i]}" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" -H 'Content-Type: application/json' \ + -X POST "$BASE/experiments/delete" -d "{\"experiment_id\":\"${CLEAN_EID[$i]}\"}" >/dev/null 2>&1 || true + done + rm -rf "$TMP" +} +trap cleanup EXIT + +b64url_decode() { local d="$1"; d="${d//-/+}"; d="${d//_/\/}"; printf '%s%s' "$d" "$(printf '%*s' $(((4 - ${#d} % 4) % 4)) '' | tr ' ' '=')" | base64 -d 2>/dev/null; } + +# RSA-encrypt {"ts","password"} with a fresh /dex/pubkey (PKCS#1 v1.5), as the login page does. +rsa_password() { + local pk ts + pk="$(curl -fsSk "$P/dex/pubkey")"; ts="$(echo "$pk" | jq -r .ts)" + echo "$pk" | jq -r .pubkey > "$TMP/pub.pem" + printf '{"ts":"%s","password":"%s"}' "$ts" "$MLFLOW_PASSWORD" \ + | openssl pkeyutl -encrypt -pubin -inkey "$TMP/pub.pem" -pkeyopt rsa_padding_mode:pkcs1 | openssl base64 -A +} + +# Log a run + assert the owner. 
# run_and_assert: create an experiment and a run through the proxy, log one
# parameter and one metric, finish the run, then read it back and check it.
#   $1=label  $2=auth header (Authorization/Cookie)  $3=expected owner
# Returns 1 (after printing FAIL[label]) when a step yields no id or an
# assertion does not hold. The experiment is queued for deletion on exit.
run_and_assert() {
  local label="$1" header="$2" expect="$3" exp eid rid owner status param run
  # Header sets shared by every request; POSTs additionally declare a JSON body.
  local -a auth=(-H "$header" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}")
  local -a json=("${auth[@]}" -H 'Content-Type: application/json')

  exp="uit-${label}-$$-${RANDOM}"
  # `|| true`: under `set -e` + pipefail a failing curl would abort the script
  # right here, before the explicit FAIL message below could be printed.
  eid="$(curl -fsSk "${json[@]}" -X POST "$BASE/experiments/create" \
    -d "{\"name\":\"${exp}\"}" | jq -r '.experiment_id // empty')" || true
  [ -n "$eid" ] || { echo "FAIL[$label]: experiment not created"; return 1; }
  CLEAN_HDR+=("$header"); CLEAN_EID+=("$eid")

  rid="$(curl -fsSk "${json[@]}" -X POST "$BASE/runs/create" \
    -d "{\"experiment_id\":\"${eid}\",\"start_time\":1700000000000}" | jq -r '.run.info.run_id // empty')" || true
  [ -n "$rid" ] || { echo "FAIL[$label]: run not created"; return 1; }

  curl -fsSk "${json[@]}" -X POST "$BASE/runs/log-parameter" \
    -d "{\"run_id\":\"${rid}\",\"key\":\"model_name\",\"value\":\"qwen3-0.6b\"}" >/dev/null
  curl -fsSk "${json[@]}" -X POST "$BASE/runs/log-metric" \
    -d "{\"run_id\":\"${rid}\",\"key\":\"loss\",\"value\":0.123,\"timestamp\":1700000000000,\"step\":1}" >/dev/null
  curl -fsSk "${json[@]}" -X POST "$BASE/runs/update" \
    -d "{\"run_id\":\"${rid}\",\"status\":\"FINISHED\",\"end_time\":1700000005000}" >/dev/null

  run="$(curl -fsSk "${auth[@]}" "$BASE/runs/get?run_id=${rid}")"
  owner="$(printf '%s' "$run" | jq -r '.run.info.user_id')"
  status="$(printf '%s' "$run" | jq -r '.run.info.status')"
  # `[]?` keeps jq from erroring when the run has no params array, so the
  # "param not logged" assertion below reports the problem.
  param="$(printf '%s' "$run" | jq -r '.run.data.params[]? | select(.key=="model_name") | .value')"
  echo "  [$label] run_id=${rid} owner=${owner} status=${status} model_name=${param}"
  [ "$status" = "FINISHED" ] || { echo "FAIL[$label]: run not FINISHED"; return 1; }
  [ "$param" = "qwen3-0.6b" ] || { echo "FAIL[$label]: param not logged"; return 1; }
  [ "$owner" = "$expect" ] || { echo "FAIL[$label]: owner '${owner}' != expected '${expect}'"; return 1; }
}

# Owner to assert on; leg 1 replaces it with the identity claim from the id token.
EXPECT_OWNER="$MLFLOW_USERNAME"

# ---------------------------------------------------------------------------
# Leg 1: bearer token (authorization_code + PKCE -> id_token)
# ---------------------------------------------------------------------------
if [ -n "${DEX_CLIENT_SECRET:-}" ]; then
  echo "== leg 1: mint id token via authorization_code + PKCE =="
  # PKCE: V is the code verifier, C its S256 challenge (base64url, unpadded).
  V="$(openssl rand -base64 48 | tr '+/' '-_' | tr -d '=' | cut -c1-64)"
  C="$(printf %s "$V" | openssl dgst -sha256 -binary | openssl base64 -A | tr '+/' '-_' | tr -d '=')"
  RU="$(jq -rn --arg u "$REDIRECT_URI" '$u|@uri')"
  SC="$(jq -rn '"openid email groups offline_access"|@uri')"
  REQ="$(curl -fsSk "$P/dex/api/v1/authorize?client_id=${DEX_CLIENT_ID}&redirect_uri=${RU}&response_type=code&scope=${SC}&state=cli&code_challenge=${C}&code_challenge_method=S256" \
    | jq -r '.req // empty')" || true
  [ -n "$REQ" ] || { echo "FAIL: authorize returned no req (PKCE/client issue?)"; exit 1; }
  ENC="$(rsa_password)"
  # `sed -n ... p` prints only when a code= parameter is really present; the
  # plain substitution echoed a non-matching redirect URL back as the "code".
  # The value stays URL-encoded, which is what `-d code=...` below needs.
  CODE="$(curl -fsSk -X POST "$P/dex/api/v1/authorize/local?req=${REQ}" -H 'Content-Type: application/json' \
    --data "$(jq -nc --arg a "$MLFLOW_USERNAME" --arg p "$ENC" '{account:$a,password:$p}')" \
    | jq -r '.redirect_url // empty' | sed -nE 's/.*[?&]code=([^&#]+).*/\1/p')" || true
  [ -n "$CODE" ] || { echo "FAIL: login returned no auth code (captcha triggered or bad credentials?)"; exit 1; }
  ID_TOKEN="$(curl -fsSk "$P/dex/token" -d grant_type=authorization_code -d code="$CODE" \
    --data-urlencode redirect_uri="$REDIRECT_URI" -d code_verifier="$V" \
    -d client_id="${DEX_CLIENT_ID}" --data-urlencode client_secret="${DEX_CLIENT_SECRET}" \
    | jq -r '.id_token // empty')" || true
  [ -n "$ID_TOKEN" ] || { echo "FAIL: token exchange returned no id_token"; exit 1; }
  # The JWT payload is the second dot-separated segment; take the first identity claim present.
  EXPECT_OWNER="$(b64url_decode "$(printf '%s' "$ID_TOKEN" | cut -d. -f2)" | jq -r '.email // .preferred_username // .name // .sub')"
  echo "  caller identity: ${EXPECT_OWNER}"
  # Is the proxy configured to accept bearer tokens?
  HTTP="$(curl -sk -o /dev/null -w '%{http_code}' -H "Authorization: Bearer ${ID_TOKEN}" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" "$BASE/experiments/search?max_results=1")"
  if [ "$HTTP" = "200" ]; then
    run_and_assert "token" "Authorization: Bearer ${ID_TOKEN}" "$EXPECT_OWNER"
    echo "PASS: bearer-token method (authorization_code + PKCE)"
  else
    echo "SKIP: bearer-token method — proxy returned HTTP ${HTTP} (enable --skip-jwt-bearer-tokens on the MLflow proxy)"
  fi
else
  echo "SKIP: bearer-token method — set DEX_CLIENT_SECRET to exercise it"
fi

# ---------------------------------------------------------------------------
# Leg 2: session cookie (no platform changes)
# ---------------------------------------------------------------------------
echo "== leg 2: mint _oauth2_proxy cookie via the proxy login =="
JAR="$TMP/proxyjar.txt"; : > "$JAR"
# Match the Location header case-insensitively with tolower(): IGNORECASE is a
# gawk extension and is silently ignored by mawk/BSD awk, where a capitalised
# "Location:" header would never match.
LOC="$(curl -sk -c "$JAR" -D - -o /dev/null "$P/clusters/${CLUSTER}/mlflow/" \
  | awk 'tolower($1) == "location:" {print $2}' | tr -d '\r')"
QS="${LOC#*\?}"   # query string of the login redirect; equals LOC when there is no '?'
[ "$QS" != "$LOC" ] || { echo "FAIL: MLflow route did not redirect to login"; exit 1; }
REQ="$(curl -sk -b "$JAR" -c "$JAR" "$P/dex/api/v1/authorize?${QS}" | jq -r '.req // empty')" || true
[ -n "$REQ" ] || { echo "FAIL: proxy authorize returned no req"; exit 1; }
ENC="$(rsa_password)"
CB="$(curl -sk -b "$JAR" -c "$JAR" -X POST "$P/dex/api/v1/authorize/local?req=${REQ}" -H 'Content-Type: application/json' \
  --data "$(jq -nc --arg a "$MLFLOW_USERNAME" --arg p "$ENC" '{account:$a,password:$p}')" \
  | jq -r '.redirect_url // empty')" || true
[ -n "$CB" ] || { echo "FAIL: proxy login returned no callback url"; exit 1; }
# Visit the callback so the proxy writes its session cookie into the jar.
curl -sk -b "$JAR" -c "$JAR" -o /dev/null "$CB"
# Cookie-jar columns are tab-separated: field 6 is the cookie name, field 7 its value.
COOKIE="$(awk -F'\t' '$6 ~ /^_oauth2_proxy/{printf "%s=%s; ",$6,$7}' "$JAR" | sed 's/; $//')"
[ -n "$COOKIE" ] || { echo "FAIL: no _oauth2_proxy cookie minted"; exit 1; }
run_and_assert "cookie" "Cookie: ${COOKIE}" "$EXPECT_OWNER"
echo "PASS: session-cookie method (no platform changes)"

echo "DONE: authenticated to MLflow through the OAuth proxy as '${EXPECT_OWNER}' — browser-free, no container-port access"