diff --git a/docs/en/kubeflow/how_to/mlflow-python-sdk.mdx b/docs/en/kubeflow/how_to/mlflow-python-sdk.mdx
new file mode 100644
index 0000000..bcceb67
--- /dev/null
+++ b/docs/en/kubeflow/how_to/mlflow-python-sdk.mdx
@@ -0,0 +1,216 @@
+---
+weight: 46
+---
+
+# Using the MLflow Python SDK with Authentication and RBAC
+
+On Alauda AI the [MLflow Tracking Server](./mlflow.mdx) runs behind single sign-on and multi-tenancy: an OAuth proxy authenticates every caller, and the server records each run under the calling user and authorizes it against Kubernetes RBAC. This guide drives the stock **MLflow Python SDK** through that OAuth proxy with your own identity, **browser-free**, using the OAuth2 **authorization code** flow (with PKCE) scripted against the platform login — no password grant, and never the MLflow container port.
+
+There are two browser-free ways to present your identity; pick one:
+
+- **Bearer token (recommended).** Obtain a Dex **id token** from the CLI or Python and pass it as `MLFLOW_TRACKING_TOKEN`; renew it with the refresh token. Needs one platform setting ([below](#platform-setup)).
+- **Session cookie (no platform changes).** Drive the proxy's own login to obtain its `_oauth2_proxy` cookie and attach it to requests. Works on any install as-is ([below](#cookie-method)).
+
+## How authentication works
+
+Two layers sit in front of your runs:
+
+1. The **OAuth proxy** (`oauth2-proxy`) authenticates the request — either a Dex **id token** sent as `Authorization: Bearer …` (token method) or its `_oauth2_proxy` **session cookie** (cookie method).
+2. The MLflow server's `kubernetes-auth` plugin reads your identity from that credential, records it as the run **owner**, and authorizes it against your Kubernetes permissions in the workspace.
+
+The client always goes through the OAuth proxy — never connect to the MLflow container port directly.
+
+## Prerequisites
+
+- `mlflow` **3.10 or later** (`pip install "mlflow>=3.10"`). Workspace selection (`mlflow.set_workspace`) is a 3.10+ feature. The Python token helper also uses `requests` and `cryptography`.
+- A platform **username and password** — ideally a dedicated service-account *user* (a platform login reserved for automation: not a person's login, and not a Kubernetes `ServiceAccount`) — that can access the target workspace (see [Workspace Access](./mlflow.mdx)).
+- The platform's **OAuth client id and secret** — the client the MLflow proxy uses (from your administrator). On Alauda this is the platform auth client, e.g. `alauda-auth`; its secret lives in a Kubernetes `Secret` (e.g. `cpaas-oidc-secret`).
+
+## Platform setup for the token method (administrator, one-time) \{#platform-setup}
+
+The bearer-token method needs the MLflow OAuth proxy to accept Dex id tokens. Add `--skip-jwt-bearer-tokens=true` to the **MLflow plugin** — this is the MLflow proxy on the workload cluster, **not** the platform's global auth server:
+
+```yaml
+# MLflow plugin values
+auth:
+  oauth:
+    extraArgs:
+      - --skip-jwt-bearer-tokens=true
+```
+
+No Dex or global-auth change is required: the login below uses the `authorization_code` grant the platform client already allows. The **cookie method** needs no setting at all — skip this section if you use it.
+
+## Get a token from the command line (browser-free) \{#get-a-token}
+
+The platform login is an SSO page, but its API supports the standard OAuth **authorization code** flow with PKCE, so you can complete it from a script — no browser redirect. The password is RSA-encrypted with the login service's public key (`/dex/pubkey`), exactly as the login page does it; the login returns an authorization code, which is then exchanged at `/dex/token` for an **id token** (and, because the `offline_access` scope is requested, a **refresh token** for headless renewal).
+
+### Python helper
+
+```python
+import base64, hashlib, json, os, secrets
+from urllib.parse import urlparse, parse_qs
+import requests
+from cryptography.hazmat.primitives.asymmetric import padding
+from cryptography.hazmat.primitives.serialization import load_pem_public_key
+
+PLATFORM      = os.environ["PLATFORM_ADDRESS"].rstrip("/")    # https://<platform>
+CLIENT_ID     = os.environ["DEX_CLIENT_ID"]                   # the MLflow proxy's client, e.g. alauda-auth
+CLIENT_SECRET = os.environ["DEX_CLIENT_SECRET"]
+USERNAME      = os.environ["MLFLOW_USERNAME"]
+PASSWORD      = os.environ["MLFLOW_PASSWORD"]
+REDIRECT_URI  = f"{PLATFORM}/oauth2/callback"                 # any URI the client has registered
+VERIFY_TLS    = os.environ.get("PLATFORM_CA", False)         # CA bundle path; if PLATFORM_CA is unset, TLS verification is OFF (lab only)
+
+s = requests.Session(); s.verify = VERIFY_TLS                 # one session, so every call below uses the same TLS setting
+_b64url = lambda b: base64.urlsafe_b64encode(b).rstrip(b"=").decode()   # unpadded base64url (used for the PKCE verifier/challenge)
+
+def get_tokens() -> dict:
+    """Run the authorization-code + PKCE flow headlessly; return the parsed /dex/token JSON (HTTP status is not checked)."""
+    verifier  = _b64url(secrets.token_bytes(48))              # PKCE code_verifier (48 random bytes -> 64 chars)
+    challenge = _b64url(hashlib.sha256(verifier.encode()).digest())   # S256 code_challenge
+    # 1) start the flow -> auth-request id (raises KeyError if the response carries no "req")
+    req = s.get(f"{PLATFORM}/dex/api/v1/authorize", params={
+        "client_id": CLIENT_ID, "redirect_uri": REDIRECT_URI, "response_type": "code",
+        "scope": "openid email groups offline_access", "state": "cli",
+        "code_challenge": challenge, "code_challenge_method": "S256"}).json()["req"]
+    # 2) RSA-encrypt the password (PKCS#1 v1.5), then log in via the local connector -> auth code
+    pk  = s.get(f"{PLATFORM}/dex/pubkey").json()              # {"ts": ..., "pubkey": "<PEM>"}
+    payload = json.dumps({"ts": pk["ts"], "password": PASSWORD}, separators=(",", ":")).encode()
+    enc = base64.b64encode(load_pem_public_key(pk["pubkey"].encode()).encrypt(payload, padding.PKCS1v15())).decode()
+    redirect = s.post(f"{PLATFORM}/dex/api/v1/authorize/local", params={"req": req},
+        json={"account": USERNAME, "password": enc}).json()["redirect_url"]
+    code = parse_qs(urlparse(redirect).query)["code"][0]      # parse_qs also URL-decodes the code
+    # 3) exchange the code (with the PKCE verifier) -> id_token + refresh_token
+    return s.post(f"{PLATFORM}/dex/token", data={
+        "grant_type": "authorization_code", "code": code, "redirect_uri": REDIRECT_URI,
+        "code_verifier": verifier, "client_id": CLIENT_ID, "client_secret": CLIENT_SECRET}).json()
+
+def refresh(refresh_token: str) -> str:
+    """Mint a fresh id token from a refresh token — no login, no browser. Returns only `id_token`; other response fields are dropped."""
+    return s.post(f"{PLATFORM}/dex/token", data={
+        "grant_type": "refresh_token", "refresh_token": refresh_token,
+        "client_id": CLIENT_ID, "client_secret": CLIENT_SECRET,
+        "scope": "openid email groups"}).json()["id_token"]   # NOTE(review): if Dex rotates refresh tokens, the response's new refresh_token must be kept too — confirm
+```
+
+### Shell equivalent (curl + openssl, no Python dependencies)
+
+```bash
+PLATFORM=https://<platform>; CLIENT_ID=<client>; CLIENT_SECRET=<secret>
+USERNAME='<user>'; PASSWORD='<password>'; REDIRECT_URI="$PLATFORM/oauth2/callback"
+# NOTE: curl -k below disables TLS verification — lab only; drop -k (or pass --cacert <bundle>) in production.
+V=$(openssl rand -base64 48 | tr '+/' '-_' | tr -d '=' | cut -c1-64)                       # PKCE verifier
+C=$(printf %s "$V" | openssl dgst -sha256 -binary | openssl base64 -A | tr '+/' '-_' | tr -d '=')   # S256 challenge
+RU=$(jq -rn --arg u "$REDIRECT_URI" '$u|@uri'); SC=$(jq -rn '"openid email groups offline_access"|@uri')
+REQ=$(curl -sk "$PLATFORM/dex/api/v1/authorize?client_id=$CLIENT_ID&redirect_uri=$RU&response_type=code&scope=$SC&state=cli&code_challenge=$C&code_challenge_method=S256" | jq -r .req)
+PK=$(curl -sk "$PLATFORM/dex/pubkey"); TS=$(echo "$PK"|jq -r .ts); echo "$PK"|jq -r .pubkey >/tmp/dex_pub.pem
+ENC=$(jq -jnc --arg ts "$TS" --arg p "$PASSWORD" '{ts:$ts,password:$p}' | openssl pkeyutl -encrypt -pubin -inkey /tmp/dex_pub.pem -pkeyopt rsa_padding_mode:pkcs1 | openssl base64 -A)   # jq JSON-escapes quotes/backslashes in the password; -j = no trailing newline
+CODE=$(curl -sk -X POST "$PLATFORM/dex/api/v1/authorize/local?req=$REQ" -H 'Content-Type: application/json' \
+  --data "$(jq -nc --arg a "$USERNAME" --arg p "$ENC" '{account:$a,password:$p}')" | jq -r .redirect_url | sed -E 's/.*code=([^&]+).*/\1/')
+curl -sk "$PLATFORM/dex/token" -d grant_type=authorization_code -d code="$CODE" \
+  --data-urlencode redirect_uri="$REDIRECT_URI" -d code_verifier="$V" \
+  -d client_id="$CLIENT_ID" --data-urlencode client_secret="$CLIENT_SECRET" | jq -r .id_token
+```
+
+## Connect the SDK
+
+```python
+import os, mlflow
+
+tok = get_tokens()                                                      # get_tokens() is the Python helper defined above
+os.environ["MLFLOW_TRACKING_TOKEN"] = tok["id_token"].strip()           # → Authorization: Bearer
+mlflow.set_tracking_uri("http://mlflow-tracking-server.kubeflow:5000")  # in-cluster Service (fronted by the OAuth proxy)
+mlflow.set_workspace("team-a")                                          # workspace namespace → X-MLFLOW-WORKSPACE
+mlflow.set_experiment("my-experiment")
+
+with mlflow.start_run(run_name="sdk-quickstart") as run:
+    mlflow.log_param("learning_rate", 2e-4)
+    mlflow.log_metric("loss", 0.123)
+    print("run:", run.info.run_id)
+```
+
+The run appears under **Alauda AI → Tools → MLFlow**, owned by the user you authenticated as. (Verified end-to-end on a secured install: the run owner is the token's user identity.)
+
+Use the in-cluster Service URL `http://mlflow-tracking-server.kubeflow:5000` when the client runs **inside** the cluster (pipeline components, Workbench notebooks). From **outside** the cluster, point at the platform route `https://<platform>/clusters/<cluster>/mlflow` instead — both reach the same OAuth proxy (set `MLFLOW_TRACKING_INSECURE_TLS=true` if the platform certificate is not trusted by your machine).
+
+:::warning
+Use a **dedicated service-account user** and keep its credentials and the client secret in a Kubernetes `Secret`, never in code. Always `.strip()` the token (a trailing newline produces `Invalid … character(s) in header value: 'Bearer …\n'`). id tokens expire (24 h by default); for long-running jobs renew with `refresh(tok["refresh_token"])` instead of logging in again.
+:::
+
+## Selecting a workspace
+
+Runs are recorded in the workspace you select; if you select none, the server's default workspace is used. Any of these set it (the SDK turns them into the `X-MLFLOW-WORKSPACE` header):
+
+- `mlflow.set_workspace("team-a")` in code,
+- the `MLFLOW_WORKSPACE=team-a` environment variable.
+
+You can only use a workspace your account has access to; see [Workspace Access](./mlflow.mdx).
+
+## Registering models
+
+The model registry is workspace-scoped and authorized the same way, so the usual SDK calls work once connected:
+
+```python
+mlflow.set_workspace("team-a")
+with mlflow.start_run():
+    mlflow.sklearn.log_model(sk_model, name="model", registered_model_name="fraud-detector")   # sk_model: your fitted scikit-learn estimator
+```
+
+Promote the registered version to **Staging** or **Production** from the MLflow UI.
+
+## Alternative: session cookie (no platform changes) \{#cookie-method}
+
+If you cannot enable `--skip-jwt-bearer-tokens`, drive the proxy's own login flow to obtain its `_oauth2_proxy` cookie and attach it to requests — this works on any install unchanged. The proxy starts the OAuth flow for you (its own PKCE and `redirect_uri`); you just replay that through the same scripted login and hand the code back to the proxy callback:
+
+```bash
+PLATFORM=https://<platform>; CLUSTER=<cluster>   # NOTE: curl -k below skips TLS verification (lab only)
+USERNAME='<user>'; PASSWORD='<password>'
+JAR=$(mktemp)   # cookie jar; it will hold the session cookie — remove it when done
+# 1) start the MLflow proxy login -> the Dex auth query it wants
+LOC=$(curl -sk -c "$JAR" -D - -o /dev/null "$PLATFORM/clusters/$CLUSTER/mlflow/" \
+  | awk 'tolower($1)=="location:"{print $2}' | tr -d '\r')   # tolower() works in any awk; IGNORECASE is gawk-only
+QS=${LOC#*\?}
+# 2) authorize -> req, then 3) scripted local login -> the proxy callback URL
+REQ=$(curl -sk -b "$JAR" -c "$JAR" "$PLATFORM/dex/api/v1/authorize?$QS" | jq -r .req)
+PK=$(curl -sk "$PLATFORM/dex/pubkey"); TS=$(echo "$PK"|jq -r .ts); echo "$PK"|jq -r .pubkey >/tmp/dex_pub.pem
+ENC=$(jq -jnc --arg ts "$TS" --arg p "$PASSWORD" '{ts:$ts,password:$p}' | openssl pkeyutl -encrypt -pubin -inkey /tmp/dex_pub.pem -pkeyopt rsa_padding_mode:pkcs1 | openssl base64 -A)   # jq JSON-escapes quotes/backslashes in the password; -j = no trailing newline
+CB=$(curl -sk -b "$JAR" -c "$JAR" -X POST "$PLATFORM/dex/api/v1/authorize/local?req=$REQ" -H 'Content-Type: application/json' \
+  --data "$(jq -nc --arg a "$USERNAME" --arg p "$ENC" '{account:$a,password:$p}')" | jq -r .redirect_url)
+# 4) the proxy callback exchanges the code and sets the _oauth2_proxy cookie
+curl -sk -b "$JAR" -c "$JAR" -o /dev/null "$CB"
+COOKIE=$(awk -F'\t' '$6 ~ /^_oauth2_proxy/{printf "%s=%s; ",$6,$7}' "$JAR" | sed 's/; $//')   # includes any _oauth2_proxy_N chunks
+echo "$COOKIE"
+```
+
+Then attach the cookie with a header provider (the cookie carries your identity — no token, no platform setting):
+
+```python
+import os, mlflow
+from mlflow.tracking.request_header.abstract_request_header_provider import RequestHeaderProvider
+from mlflow.tracking.request_header.registry import _request_header_provider_registry
+
+class ProxySessionHeader(RequestHeaderProvider):              # sends the proxy session cookie on MLflow requests
+    def in_context(self):                                      # provider is active only when the cookie env var is set
+        return bool(os.environ.get("MLFLOW_PROXY_COOKIE"))     # export MLFLOW_PROXY_COOKIE='_oauth2_proxy=<value>'
+    def request_headers(self):                                 # extra headers contributed by this provider
+        return {"Cookie": os.environ["MLFLOW_PROXY_COOKIE"]}
+
+_request_header_provider_registry.register(ProxySessionHeader)   # NOTE(review): underscore-prefixed (private) MLflow name — re-check on MLflow upgrades
+mlflow.set_tracking_uri("https://<platform>/clusters/<cluster>/mlflow")   # platform route (through the OAuth proxy)
+mlflow.set_workspace("team-a")                                   # workspace namespace
+```
+
+You can also copy the `_oauth2_proxy` cookie from a browser session (DevTools → **Application/Storage → Cookies**). The session cookie expires — re-mint it when calls start returning a login redirect.
+
+## Troubleshooting
+
+| Symptom | Check |
+|---------|-------|
+| `/dex/api/v1/authorize` returns `PKCE code_challenge is required` | The client enforces PKCE. Send `code_challenge` and `code_challenge_method=S256` (the helper does this). |
+| Local login returns a captcha challenge / `CaptchaError` | Too many recent failed logins triggered the retry-captcha. Wait, fix the credentials, then retry — a clean first login needs no captcha. |
+| `/dex/token` returns `invalid_grant` | The auth code or PKCE verifier is stale or reused. Re-run the flow from the start (`authorize` → login → token); codes are single-use. |
+| Call returns HTML or a redirect (`302` to the login page) | **Token method:** the proxy rejected the bearer token — confirm `--skip-jwt-bearer-tokens` is enabled and the token is a valid Dex id token (`aud` = the proxy's client). **Cookie method:** the `_oauth2_proxy` cookie is missing or expired. |
+| `Invalid … character(s) in header value: 'Bearer …\n'` | The token has trailing whitespace. Set `MLFLOW_TRACKING_TOKEN` to the `.strip()`-ed value. |
+| `Failed to query /api/3.0/mlflow/server-info` | The SDK could not reach the server through the proxy — verify the tracking URI and that the token/cookie is valid. |
+| `403 PERMISSION_DENIED` | Your account lacks access to the workspace namespace. Request access to the workspace (see [Workspace Access](./mlflow.mdx)); authorization is evaluated for your platform user — no Kubernetes `ServiceAccount` is involved. |
+| Run shows the wrong owner or workspace | The owner is your authenticated identity; the workspace is `set_workspace()` / `MLFLOW_WORKSPACE` (else the server default). Check both. |
diff --git a/docs/en/kubeflow/how_to/mlflow.mdx b/docs/en/kubeflow/how_to/mlflow.mdx
index eaf2e76..d7580db 100644
--- a/docs/en/kubeflow/how_to/mlflow.mdx
+++ b/docs/en/kubeflow/how_to/mlflow.mdx
@@ -69,6 +69,8 @@ subjects:
 
 ## Client Configuration
 
+For authenticating the MLflow Python SDK with a user identity token — including the in-cluster connection details and RBAC — see [Using the MLflow Python SDK with Authentication and RBAC](./mlflow-python-sdk.mdx).
+
 Set the MLflow tracking URI to the platform route and select the workspace:
 
 ```python
diff --git a/docs/en/training_guides/fine-tune-with-trainer-v2.ipynb b/docs/en/training_guides/fine-tune-with-trainer-v2.ipynb
index e904165..10fb95d 100644
--- a/docs/en/training_guides/fine-tune-with-trainer-v2.ipynb
+++ b/docs/en/training_guides/fine-tune-with-trainer-v2.ipynb
@@ -947,15 +947,7 @@
    "cell_type": "markdown",
    "id": "27d2b476",
    "metadata": {},
-   "source": [
-    "## Step 5: View Training Metrics in MLflow\n",
-    "\n",
-    "If `MLFLOW_TRACKING_URI` is set and the MLflow server is reachable from the training pod, LlamaFactory will log metrics (loss, learning rate, etc.) to MLflow automatically via `report_to: mlflow` in the training config.\n",
-    "\n",
-    "To open the MLflow UI, go to **Alauda AI** - **Tools** - **MLFlow** (need MLFlow Cluster plugin installed). Look for the experiment named by `MLFLOW_EXPERIMENT_NAME`.\n",
-    "\n",
-    "Each `TrainJob` run will appear as a separate MLflow **run** under the same experiment, making it easy to compare training curves across different models and hyperparameters."
-   ]
+   "source": "## Step 5: View Training Metrics in MLflow\n\nIf `MLFLOW_TRACKING_URI` is set and the MLflow server is reachable from the training pod, LlamaFactory will log metrics (loss, learning rate, etc.) to MLflow automatically via `report_to: mlflow` in the training config.\n\nOn a secured (SSO + multi-tenant) MLflow install the trainer must also authenticate — set `MLFLOW_TRACKING_TOKEN` and select a workspace. See [Using the MLflow Python SDK with Authentication and RBAC](../kubeflow/how_to/mlflow-python-sdk.mdx) for how to obtain the token and how authorization/RBAC work.\n\nTo open the MLflow UI, go to **Alauda AI** - **Tools** - **MLFlow** (need MLFlow Cluster plugin installed). Look for the experiment named by `MLFLOW_EXPERIMENT_NAME`.\n\nEach `TrainJob` run will appear as a separate MLflow **run** under the same experiment, making it easy to compare training curves across different models and hyperparameters."
   },
   {
    "cell_type": "markdown",
@@ -1060,4 +1052,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/docs/en/training_guides/fine-tuning-using-notebooks.mdx b/docs/en/training_guides/fine-tuning-using-notebooks.mdx
index a2bba02..93d7653 100644
--- a/docs/en/training_guides/fine-tuning-using-notebooks.mdx
+++ b/docs/en/training_guides/fine-tuning-using-notebooks.mdx
@@ -325,7 +325,9 @@ After success the merged model is pushed to a date-stamped branch (`sft-YYYYMMDD
 
 ## 8. Experiment tracking
 
-Setting `report_to: mlflow` in the LLaMA-Factory config plus the `MLFLOW_TRACKING_URI` / `MLFLOW_EXPERIMENT_NAME` env vars routes metrics to MLflow. Find runs in **Alauda AI → Advanced → MLFlow**, compare loss curves, and pin the winning run.
+Setting `report_to: mlflow` in the LLaMA-Factory config plus the `MLFLOW_TRACKING_URI` / `MLFLOW_EXPERIMENT_NAME` env vars routes metrics to MLflow. Find runs in **Alauda AI → Tools → MLFlow**, compare loss curves, and pin the winning run.
+
+On a secured (SSO + multi-tenant) MLflow install the job must also authenticate — supply an `MLFLOW_TRACKING_TOKEN` and select a workspace. See [Using the MLflow Python SDK with Authentication and RBAC](../kubeflow/how_to/mlflow-python-sdk.mdx) for how to obtain the token and configure the client.
 
 ## 9. Publish the fine-tuned model
 
@@ -412,4 +414,4 @@ spec:
 
 ### Experiment tracking on other devices
 
-LLaMA-Factory and Transformers integrate with MLflow / wandb directly. Set the destination in the framework config (e.g. `report_to: mlflow` for LLaMA-Factory) and supply `MLFLOW_TRACKING_URI` and `MLFLOW_EXPERIMENT_NAME` env vars. View results under **Alauda AI → Advanced → MLFlow**.
+LLaMA-Factory and Transformers integrate with MLflow / wandb directly. Set the destination in the framework config (e.g. `report_to: mlflow` for LLaMA-Factory) and supply `MLFLOW_TRACKING_URI` and `MLFLOW_EXPERIMENT_NAME` env vars (plus `MLFLOW_TRACKING_TOKEN` on a secured install — see [Using the MLflow Python SDK with Authentication and RBAC](../kubeflow/how_to/mlflow-python-sdk.mdx)). View results under **Alauda AI → Tools → MLFlow**.
diff --git a/docs/en/training_guides/pipelines-mlflow-integration.mdx b/docs/en/training_guides/pipelines-mlflow-integration.mdx
new file mode 100644
index 0000000..65b1a55
--- /dev/null
+++ b/docs/en/training_guides/pipelines-mlflow-integration.mdx
@@ -0,0 +1,183 @@
+---
+weight: 55
+---
+
+# Kubeflow Pipeline + MLflow Integration
+
+This guide shows how Kubeflow Pipelines (KFP) components log parameters, metrics, and models to [MLflow on Kubeflow](../kubeflow/how_to/mlflow.mdx) with the **MLflow Python client**. Authentication and workspace/RBAC follow [Using the MLflow Python SDK with Authentication and RBAC](../kubeflow/how_to/mlflow-python-sdk.mdx) — each component authenticates with a user identity token and the server records the run under that user.
+
+## Scope
+
+- Alauda AI 2.5 and later.
+- Kubeflow Pipelines and the MLflow cluster plugin are installed.
+- The MLflow workspace is a namespace labelled `mlflow-enabled=true`.
+- For the bearer-token method, the MLflow OAuth proxy must accept Dex id tokens (`--skip-jwt-bearer-tokens`) — see [Platform setup](../kubeflow/how_to/mlflow-python-sdk.mdx#platform-setup) in the SDK guide. No global-auth change is needed, and the cookie method needs no setup at all.
+
+## Prerequisites
+
+- `kfp` and `kfp-kubernetes` Python SDKs (`pip install kfp kfp-kubernetes`).
+- Access to a KFP endpoint (see [Use Kubeflow Pipelines](../kubeflow/how_to/pipelines.mdx)).
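+- A **Dex id token** for a dedicated service-account user, minted browser-free with the OAuth2 authorization-code flow (with PKCE) — not the password grant (see [Get a token from the command line](../kubeflow/how_to/mlflow-python-sdk.mdx#get-a-token) in the SDK guide). Store it in a Kubernetes `Secret` and inject it into the component.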
+- An MLflow workspace (a namespace with `mlflow-enabled=true`) the account can access.
+
+## How components reach MLflow
+
+A pipeline component runs **inside** the cluster, so it talks to MLflow through the in-cluster Service `http://mlflow-tracking-server.kubeflow:5000` (which is fronted by the OAuth proxy — components never use the MLflow container port directly). It authenticates exactly like any other MLflow client:
+
+- `MLFLOW_TRACKING_TOKEN` — a Dex id token; the MLflow client sends it as `Authorization: Bearer …`.
+- `mlflow.set_workspace(...)` — selects the workspace (`X-MLFLOW-WORKSPACE`).
+
+The server reads the identity from the token and records the run under that user. See the [SDK guide](../kubeflow/how_to/mlflow-python-sdk.mdx) for how the token is obtained and how authorization works.
+
+## Complete example: training pipeline with MLflow
+
+The component uses the MLflow client and reads `MLFLOW_TRACKING_TOKEN` from a `Secret` injected with [`kfp-kubernetes`](https://kubeflow-pipelines.readthedocs.io/en/stable/source/kubernetes.html). KFP v2 packages each component from its own source, so `import mlflow` lives **inside** the function.
+
+```python
+from kfp import dsl, compiler
+from kfp import kubernetes
+
+
+@dsl.component(base_image="python:3.11-slim", packages_to_install=["mlflow>=3.10"])
+def train_model(
+    workspace: str,
+    model_name: str,
+    learning_rate: float,
+    epochs: int,
+    run_id: str,
+) -> dict:
+    """Simulated training component that logs to MLflow as the calling user; returns the last epoch's metrics ({} if epochs < 1)."""
+    import mlflow   # MLFLOW_TRACKING_TOKEN is injected from a Secret (see the pipeline below)
+
+    mlflow.set_tracking_uri("http://mlflow-tracking-server.kubeflow:5000")  # in-cluster Service, via the OAuth proxy
+    mlflow.set_workspace(workspace)
+    mlflow.set_experiment("kfp-training-experiment")
+
+    metrics = {}
+    with mlflow.start_run(run_name=f"run-{run_id}"):
+        mlflow.log_param("model_name", model_name)
+        mlflow.log_param("learning_rate", learning_rate)
+        mlflow.log_param("epochs", epochs)
+        for epoch in range(1, epochs + 1):
+            loss = 2.0 * (0.95 ** epoch)
+            accuracy = 1.0 - loss / 2.0   # scale loss into (0, 1] first so the simulated accuracy stays within [0, 1)
+            mlflow.log_metric("loss", loss, step=epoch)
+            mlflow.log_metric("accuracy", accuracy, step=epoch)
+            metrics = {"final_loss": loss, "final_accuracy": accuracy}
+
+    print("logged run:", mlflow.last_active_run().info.run_id)
+    return metrics
+
+
+@dsl.pipeline(name="mlflow-training-pipeline", description="Train with MLflow tracking")
+def training_pipeline(
+    workspace: str = "team-a",
+    model_name: str = "qwen3-0.6b",
+    learning_rate: float = 2e-4,
+    epochs: int = 10,
+):
+    task = train_model(
+        workspace=workspace,
+        model_name=model_name,
+        learning_rate=learning_rate,
+        epochs=epochs,
+        # PIPELINE_JOB_ID_PLACEHOLDER resolves to the run's job id at runtime;
+        # pass it in as an argument (a component cannot reference dsl.* itself).
+        run_id=dsl.PIPELINE_JOB_ID_PLACEHOLDER,
+    )
+    # Inject the Dex id token from the `mlflow-token` Secret (key `token`) as MLFLOW_TRACKING_TOKEN.
+    kubernetes.use_secret_as_env(
+        task, secret_name="mlflow-token", secret_key_to_env={"token": "MLFLOW_TRACKING_TOKEN"}
+    )
+
+
+compiler.Compiler().compile(training_pipeline, "pipeline.yaml")   # writes pipeline.yaml, the package uploaded in "Upload and run"
+```
+
+Create the `mlflow-token` Secret with a Dex id token. Mint `ID_TOKEN` browser-free with the authorization-code flow from the SDK guide — see [Get a token from the command line](../kubeflow/how_to/mlflow-python-sdk.mdx#get-a-token):
+
+```bash
+# ID_TOKEN: mint it browser-free with the curl or Python authorization-code flow from the SDK guide
+kubectl -n <pipeline-namespace> create secret generic mlflow-token --from-literal=token="$ID_TOKEN"
+```
+
+:::warning
+id tokens expire (24 h by default), so refresh the `mlflow-token` Secret before submitting long pipelines — or mint the token **inside** the component from service-account credentials kept in a Secret (the [token flow](../kubeflow/how_to/mlflow-python-sdk.mdx#get-a-token) in the SDK guide) and renew it with the refresh token, so each run gets a fresh token.
+:::
+
+## Upload and run
+
+### Via the KFP UI
+
+1. Go to **Kubeflow Dashboard → Pipelines → Upload Pipeline** and select `pipeline.yaml`.
+2. Click **Create Run** and fill in the parameters (workspace, model name, epochs).
+3. After the run starts, check the MLflow UI under **Alauda AI → Tools → MLFlow** — the run owner is the token's user.
+
+### Via the KFP SDK
+
+```python
+from kfp.client import Client
+
+client = Client(host="<MY-KFP-ENDPOINT>")
+run = client.create_run_from_pipeline_package(
+    "pipeline.yaml",
+    arguments=dict(workspace="team-a", model_name="qwen3-0.6b", epochs=10),   # learning_rate omitted -> the pipeline's default is used
+)
+print(f"Run ID: {run.run_id}")
+```
+
+## Using MLflow in Trainer v2 pipelines
+
+If you fine-tune with [Kubeflow Trainer v2](./fine-tune-with-trainer-v2.mdx), the framework's MLflow integration (for example `report_to: mlflow` in LLaMA-Factory) authenticates the same way. Trainer v2 uses `apiVersion: trainer.kubeflow.org/v1alpha1`, `kind: TrainJob`, and a `spec.runtimeRef` + `spec.trainer` shape. Point it at the in-cluster Service and inject the id token from a `Secret`:
+
+```yaml
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: TrainJob
+metadata:
+  name: mlflow-finetune
+spec:
+  runtimeRef:
+    name: torch-distributed        # a TrainingRuntime / ClusterTrainingRuntime
+  trainer:
+    image: alaudadockerhub/fine_tune_with_llamafactory:v0.1.1
+    env:
+      - name: MLFLOW_TRACKING_URI
+        value: "http://mlflow-tracking-server.kubeflow:5000"
+      - name: MLFLOW_EXPERIMENT_NAME
+        value: "trainer-v2-finetune"
+      - name: MLFLOW_TRACKING_TOKEN
+        valueFrom:
+          secretKeyRef:
+            name: mlflow-token       # a Secret holding a Dex id token
+            key: token
+```
+
+See [Fine-tuning LLMs using Workbench](./fine-tuning-using-notebooks.mdx) for a full Trainer v2 + MLflow example.
+
+## Best practices
+
+### Use the pipeline job ID in MLflow
+
+KFP v2 provides `dsl.PIPELINE_JOB_ID_PLACEHOLDER` (the v1 `dsl.RUN_ID_PLACEHOLDER` was removed). It is a pipeline-level placeholder, so pass it into the component as an argument — a component cannot reference `dsl.*` from inside its own body. Use the received string in the run name to keep runs distinct per pipeline execution.
+
+### Keep credentials in a Secret and refresh tokens
+
+Never hardcode the token or service-account credentials in `pipeline.yaml` — compiled pipelines are stored and shared. Inject them from a `Secret`, and refresh the id token (or mint it inside the component) before it expires.
+
+### Log metrics inside a run
+
+Each metric belongs to a `mlflow.start_run()` block. If a component has multiple logical stages, open a run per stage rather than logging outside a run context.
+
+### Artifact storage for production
+
+Logging large model artifacts requires durable object storage. Configure S3-compatible storage in the MLflow plugin settings (see [MLflow Tracking Server](../kubeflow/how_to/mlflow.mdx) → High Availability And Storage) so artifact uploads do not hit pod disk limits.
+
+## Troubleshooting
+
+| Symptom | Check |
+|---------|-------|
+| Component fails with an HTML/redirect (`302`) response | The OAuth proxy rejected the token. Confirm the proxy has `--skip-jwt-bearer-tokens` and `MLFLOW_TRACKING_TOKEN` is a valid Dex id token (see the [SDK guide](../kubeflow/how_to/mlflow-python-sdk.mdx)). |
+| `401 UNAUTHENTICATED` | `MLFLOW_TRACKING_TOKEN` is unset, empty, or expired — refresh the `mlflow-token` Secret. |
+| `403 PERMISSION_DENIED` | The token's user lacks access to the workspace namespace. Grant access to the MLflow workspace (see [Workspace Access](../kubeflow/how_to/mlflow.mdx)); authorization is evaluated for the token's user — no Kubernetes `ServiceAccount` is involved. |
+| Run shows up under the wrong owner / workspace | The owner is the token's identity; the workspace is `set_workspace()` (else the server default). Check both. |
+| MLflow metrics not appearing in KFP UI | KFP and MLflow are separate systems. Metrics logged to MLflow appear in the MLflow UI (**Alauda AI → Tools → MLFlow**), not in the KFP run output. |
diff --git a/e2e/lib.sh b/e2e/lib.sh
index 16a8d42..435887a 100644
--- a/e2e/lib.sh
+++ b/e2e/lib.sh
@@ -161,14 +161,14 @@ _retry_kubectl_stdin() {
   local kfn="$1" verb="$2"; shift 2
   local data
   data="$(cat)"
-  local attempts=0 max=20 delay=30 rc out
+  local attempts=0 max=20 delay=120 rc out
   while [ "${attempts}" -lt "${max}" ]; do
     if out="$(printf '%s' "${data}" | $kfn "${verb}" -f - "$@" 2>&1)"; then
       printf '%s' "${out}"
       return 0
     fi
     rc=$?
-    if ! echo "${out}" | grep -qE 'failed calling webhook|x509|connection refused|EOF|context deadline exceeded|webhook.* connect: connection refused'; then
+    if ! echo "${out}" | grep -qE 'failed calling webhook|x509|connection refused|EOF|context deadline exceeded|webhook.* connect: connection refused|failed to download openapi|openapi'; then
       printf '%s\n' "${out}" >&2
       return "${rc}"
     fi
@@ -181,7 +181,7 @@ _retry_kubectl_stdin() {
 }
 
 retry_create() { _retry_kubectl_stdin "$1" create "${@:2}"; }
-retry_apply()  { _retry_kubectl_stdin "$1" apply  "${@:2}"; }
+retry_apply()  { _retry_kubectl_stdin "$1" apply "${@:2}"; }
 
 # Locate a TrainJob's pod. Trainer v2 builds a JobSet named after the TrainJob,
 # with one Job per `replicatedJobs[*]` named `${trainjob}-<rjob>-0`. The first
diff --git a/e2e/mlflow-user-identity-smoke.sh b/e2e/mlflow-user-identity-smoke.sh
new file mode 100755
index 0000000..721e4b6
--- /dev/null
+++ b/e2e/mlflow-user-identity-smoke.sh
@@ -0,0 +1,145 @@
+#!/usr/bin/env bash
+# Smoke test: log to MLflow as a real user, THROUGH the OAuth proxy — browser-free.
+#
+# Drives the platform's standard OAuth **authorization code** flow (with PKCE)
+# from the shell: it starts the flow, logs in via the local connector with an
+# RSA-encrypted password (exactly as the login page does), and gets back an auth
+# code. From that code it derives, and exercises, both documented credentials:
+#
+#   1. Bearer token  — exchange the code for a Dex id token, send it as
+#                      Authorization: Bearer (needs --skip-jwt-bearer-tokens on
+#                      the MLflow proxy; the test SKIPs this leg if it is off).
+#   2. Session cookie — hand the code to the MLflow proxy callback to obtain the
+#                      _oauth2_proxy cookie (works with no platform changes).
+#
+# Each leg logs a run over the platform route (i.e. through oauth2-proxy, never
+# the container port) and asserts the run owner equals the caller's identity.
+# No ROPC/password grant, no ServiceAccount, no direct container-port access.
+#
+# Required env:
+#   PLATFORM_ADDRESS   e.g. https://192.168.142.163
+#   CLUSTER            e.g. g1-c1-x86
+#   MLFLOW_USERNAME    platform username (ideally a dedicated service account)
+#   MLFLOW_PASSWORD    that user's password
+# Optional env:
+#   DEX_CLIENT_ID      OAuth client id used by the bearer-token leg (default: alauda-auth)
+#   DEX_CLIENT_SECRET  that client's secret (enables the bearer-token leg)
+#   MLFLOW_WORKSPACE   target workspace namespace (default: mlops-demo-e2e)
+set -euo pipefail
+
+: "${PLATFORM_ADDRESS:?set PLATFORM_ADDRESS, e.g. https://192.168.142.163}"  # ':?' aborts with this message when unset or empty
+: "${CLUSTER:?set CLUSTER, e.g. g1-c1-x86}"
+: "${MLFLOW_USERNAME:?set MLFLOW_USERNAME}"
+: "${MLFLOW_PASSWORD:?set MLFLOW_PASSWORD}"
+DEX_CLIENT_ID="${DEX_CLIENT_ID:-alauda-auth}"
+WORKSPACE="${MLFLOW_WORKSPACE:-mlops-demo-e2e}"  # sent as the X-MLFLOW-WORKSPACE header on every API call
+P="${PLATFORM_ADDRESS%/}"  # platform base URL with one trailing slash removed
+REDIRECT_URI="$P/oauth2/callback"           # any URI the client has registered
+BASE="$P/clusters/${CLUSTER}/mlflow/api/2.0/mlflow"  # MLflow REST API root on the platform route
+
+TMP="$(mktemp -d)"  # scratch dir for the fetched public key and the cookie jar; removed by cleanup()
+CLEAN_HDR=(); CLEAN_EID=()                  # parallel arrays of (auth header, experiment id) to delete on exit
+cleanup() {  # EXIT handler: delete every recorded experiment (best effort), then remove $TMP
+  local i
+  for i in "${!CLEAN_EID[@]}"; do  # iterate indices so both arrays stay in step
+    curl -fsSk -H "${CLEAN_HDR[$i]}" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" -H 'Content-Type: application/json' \
+      -X POST "$BASE/experiments/delete" -d "{\"experiment_id\":\"${CLEAN_EID[$i]}\"}" >/dev/null 2>&1 || true
+  done
+  rm -rf "$TMP"
+}
+trap cleanup EXIT
+
+b64url_decode() { local d="$1"; d="${d//-/+}"; d="${d//_/\/}"; printf '%s%s' "$d" "$(printf '%*s' $(((4 - ${#d} % 4) % 4)) '' | tr ' ' '=')" | base64 -d 2>/dev/null; }  # $1=base64url text: map -_ to +/, pad with '=' to a multiple of 4, decode to stdout (decoder errors are silenced)
+
+# RSA-encrypt {"ts","password"} with a fresh /dex/pubkey (PKCS#1 v1.5), as the login page does. jq builds the JSON so quotes/backslashes in the password are escaped.
+rsa_password() {  # no args; reads $P, $TMP, $MLFLOW_PASSWORD; prints the base64 ciphertext (single line) on stdout
+  local pk ts
+  pk="$(curl -fsSk "$P/dex/pubkey")"; ts="$(echo "$pk" | jq -r .ts)"
+  echo "$pk" | jq -r .pubkey > "$TMP/pub.pem"
+  jq -jnc --arg ts "$ts" --arg password "$MLFLOW_PASSWORD" '{ts:$ts,password:$password}' \
+    | openssl pkeyutl -encrypt -pubin -inkey "$TMP/pub.pem" -pkeyopt rsa_padding_mode:pkcs1 | openssl base64 -A
+}
+
+# Log a run + assert the owner. $1=label  $2=auth header (Authorization/Cookie)  $3=expected owner. Prints FAIL[label] and returns 1 on a failed check.
+run_and_assert() {
+  local label="$1" header="$2" expect="$3" exp eid rid owner status param run
+  exp="uit-${label}-$$-${RANDOM}"  # experiment name made unique with the shell PID and $RANDOM
+  eid="$(curl -fsSk -H "$header" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" -H 'Content-Type: application/json' \
+         -X POST "$BASE/experiments/create" -d "{\"name\":\"${exp}\"}" | jq -r '.experiment_id // empty')"
+  [ -n "$eid" ] || { echo "FAIL[$label]: experiment not created"; return 1; }
+  CLEAN_HDR+=("$header"); CLEAN_EID+=("$eid")  # register the experiment so cleanup() deletes it on exit
+  rid="$(curl -fsSk -H "$header" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" -H 'Content-Type: application/json' \
+         -X POST "$BASE/runs/create" -d "{\"experiment_id\":\"${eid}\",\"start_time\":1700000000000}" | jq -r '.run.info.run_id // empty')"
+  [ -n "$rid" ] || { echo "FAIL[$label]: run not created"; return 1; }
+  curl -fsSk -H "$header" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" -H 'Content-Type: application/json' \
+    -X POST "$BASE/runs/log-parameter" -d "{\"run_id\":\"${rid}\",\"key\":\"model_name\",\"value\":\"qwen3-0.6b\"}" >/dev/null
+  curl -fsSk -H "$header" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" -H 'Content-Type: application/json' \
+    -X POST "$BASE/runs/log-metric" -d "{\"run_id\":\"${rid}\",\"key\":\"loss\",\"value\":0.123,\"timestamp\":1700000000000,\"step\":1}" >/dev/null
+  curl -fsSk -H "$header" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" -H 'Content-Type: application/json' \
+    -X POST "$BASE/runs/update" -d "{\"run_id\":\"${rid}\",\"status\":\"FINISHED\",\"end_time\":1700000005000}" >/dev/null
+  run="$(curl -fsSk -H "$header" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" "$BASE/runs/get?run_id=${rid}")"  # read the run back; the checks below use the server's copy
+  owner="$(printf '%s' "$run" | jq -r '.run.info.user_id')"
+  status="$(printf '%s' "$run" | jq -r '.run.info.status')"
+  param="$(printf '%s' "$run" | jq -r '.run.data.params[] | select(.key=="model_name") | .value')"
+  echo "  [$label] run_id=${rid} owner=${owner} status=${status} model_name=${param}"
+  [ "$status" = "FINISHED" ]  || { echo "FAIL[$label]: run not FINISHED"; return 1; }
+  [ "$param" = "qwen3-0.6b" ] || { echo "FAIL[$label]: param not logged"; return 1; }
+  [ "$owner" = "$expect" ]    || { echo "FAIL[$label]: owner '${owner}' != expected '${expect}'"; return 1; }
+}
+
+EXPECT_OWNER="$MLFLOW_USERNAME"
+
+# ---------------------------------------------------------------------------
+# Leg 1: bearer token (authorization_code + PKCE -> id_token)
+# ---------------------------------------------------------------------------
+if [ -n "${DEX_CLIENT_SECRET:-}" ]; then  # the secret alone gates this leg; DEX_CLIENT_ID always has a default
+  echo "== leg 1: mint id token via authorization_code + PKCE =="
+  V="$(openssl rand -base64 48 | tr '+/' '-_' | tr -d '=' | cut -c1-64)"  # PKCE code_verifier
+  C="$(printf %s "$V" | openssl dgst -sha256 -binary | openssl base64 -A | tr '+/' '-_' | tr -d '=')"  # S256 code_challenge = base64url(sha256(verifier))
+  RU="$(jq -rn --arg u "$REDIRECT_URI" '$u|@uri')"; SC="$(jq -rn '"openid email groups offline_access"|@uri')"
+  REQ="$(curl -fsSk "$P/dex/api/v1/authorize?client_id=${DEX_CLIENT_ID}&redirect_uri=${RU}&response_type=code&scope=${SC}&state=cli&code_challenge=${C}&code_challenge_method=S256" | jq -r '.req // empty')"
+  [ -n "$REQ" ] || { echo "FAIL: authorize returned no req (PKCE/client issue?)"; exit 1; }
+  ENC="$(rsa_password)"
+  CODE="$(curl -fsSk -X POST "$P/dex/api/v1/authorize/local?req=${REQ}" -H 'Content-Type: application/json' \
+          --data "$(jq -nc --arg a "$MLFLOW_USERNAME" --arg p "$ENC" '{account:$a,password:$p}')" \
+          | jq -r '.redirect_url // empty' | sed -nE 's/.*[?&]code=([^&]+).*/\1/p')"
+  [ -n "$CODE" ] || { echo "FAIL: login returned no auth code (captcha triggered or bad credentials?)"; exit 1; }  # sed -n .../p leaves CODE empty when the redirect carries no code= parameter
+  ID_TOKEN="$(curl -fsSk "$P/dex/token" -d grant_type=authorization_code -d code="$CODE" \
+              --data-urlencode redirect_uri="$REDIRECT_URI" -d code_verifier="$V" \
+              -d client_id="${DEX_CLIENT_ID}" --data-urlencode client_secret="${DEX_CLIENT_SECRET}" | jq -r '.id_token // empty')"
+  [ -n "$ID_TOKEN" ] || { echo "FAIL: token exchange returned no id_token"; exit 1; }
+  EXPECT_OWNER="$(b64url_decode "$(printf '%s' "$ID_TOKEN" | cut -d. -f2)" | jq -r '.email // .preferred_username // .name // .sub')"  # identity claim from the JWT payload (2nd dot-separated segment)
+  echo "  caller identity: ${EXPECT_OWNER}"
+  # Is the proxy configured to accept bearer tokens?
+  HTTP="$(curl -sk -o /dev/null -w '%{http_code}' -H "Authorization: Bearer ${ID_TOKEN}" -H "X-MLFLOW-WORKSPACE: ${WORKSPACE}" "$BASE/experiments/search?max_results=1")"
+  if [ "$HTTP" = "200" ]; then
+    run_and_assert "token" "Authorization: Bearer ${ID_TOKEN}" "$EXPECT_OWNER"
+    echo "PASS: bearer-token method (authorization_code + PKCE)"
+  else
+    echo "SKIP: bearer-token method — proxy returned HTTP ${HTTP} (enable --skip-jwt-bearer-tokens on the MLflow proxy)"
+  fi
+else
+  echo "SKIP: bearer-token method — set DEX_CLIENT_SECRET to exercise it"
+fi
+
+# ---------------------------------------------------------------------------
+# Leg 2: session cookie (no platform changes)
+# ---------------------------------------------------------------------------
+echo "== leg 2: mint _oauth2_proxy cookie via the proxy login =="
+JAR="$TMP/proxyjar.txt"; : > "$JAR"
+LOC="$(curl -sk -c "$JAR" -D - -o /dev/null "$P/clusters/${CLUSTER}/mlflow/" | awk 'tolower($0) ~ /^location:/ {print $2}' | tr -d '\r')"  # tolower(): IGNORECASE is gawk-only, so mawk/BSD awk would miss "Location:"
+QS="${LOC#*\?}"  # query string of the login redirect; equals $LOC when there is no '?'
+[ "$QS" != "$LOC" ] || { echo "FAIL: MLflow route did not redirect to login"; exit 1; }
+REQ="$(curl -sk -b "$JAR" -c "$JAR" "$P/dex/api/v1/authorize?${QS}" | jq -r '.req // empty')"
+[ -n "$REQ" ] || { echo "FAIL: proxy authorize returned no req"; exit 1; }
+ENC="$(rsa_password)"
+CB="$(curl -sk -b "$JAR" -c "$JAR" -X POST "$P/dex/api/v1/authorize/local?req=${REQ}" -H 'Content-Type: application/json' \
+      --data "$(jq -nc --arg a "$MLFLOW_USERNAME" --arg p "$ENC" '{account:$a,password:$p}')" | jq -r '.redirect_url // empty')"
+[ -n "$CB" ] || { echo "FAIL: proxy login returned no callback url"; exit 1; }
+curl -sk -b "$JAR" -c "$JAR" -o /dev/null "$CB"  # follow the callback so the cookies it sets land in the jar
+COOKIE="$(awk -F'\t' '$6 ~ /^_oauth2_proxy/{printf "%s=%s; ",$6,$7}' "$JAR" | sed 's/; $//')"  # jar is tab-separated: field 6 = cookie name, field 7 = value
+[ -n "$COOKIE" ] || { echo "FAIL: no _oauth2_proxy cookie minted"; exit 1; }
+run_and_assert "cookie" "Cookie: ${COOKIE}" "$EXPECT_OWNER"
+echo "PASS: session-cookie method (no platform changes)"
+
+echo "DONE: authenticated to MLflow through the OAuth proxy as '${EXPECT_OWNER}' — browser-free, no container-port access"