diff --git a/README.md b/README.md index 8d9aab59..57cbf39a 100644 --- a/README.md +++ b/README.md @@ -429,6 +429,105 @@ hyp get-operator-logs hyp-pytorch-job --since-hours 0.5 hyp delete hyp-pytorch-job --job-name ``` +### Recipe Job + +Use `hyp-recipe-job` to submit fine-tuning and evaluation jobs using pre-built recipes from SageMaker JumpStart Hub — no YAML authoring required. + + +#### Initialize Recipe Job Configuration + +```bash +mkdir my-recipe-job && cd my-recipe-job + +# Option A: HuggingFace model ID +hyp init hyp-recipe-job . \ + --huggingface-model-id Qwen/Qwen3-0.6B \ + --technique SFT \ + --instance-type ml.g5.48xlarge + +# Option B: JumpStart model ID +hyp init hyp-recipe-job . \ + --model-id huggingface-reasoning-qwen3-06b \ + --technique SFT \ + --instance-type ml.g5.48xlarge +``` + +Supported job types: +- **Fine-tuning**: `SFT`, `DPO`, `CPT`, `PPO`, `RLAIF`, `RLVR` +- **Evaluation**: `deterministic`, `LLMAJ` + +> **Note**: If you omit `--instance-type`, the CLI will automatically query your HyperPod clusters and find clusters with instance types supported by the selected recipe and technique. You will be presented with a list of compatible clusters to choose from. 
+ +#### Configure Recipe Job Parameters + +```bash +hyp configure \ + --name my-recipe-job \ + --namespace default \ + --data-path /data/recipes-data/sft/train.jsonl \ + --global-batch-size 8 \ + --learning-rate 0.0001 \ + --max-epochs 1 \ + --output-path /data/output/my-model \ + --instance-type ml.g5.48xlarge +``` + +#### Validate Configuration + +```bash +hyp validate +``` + +#### Reset Configuration + +To reset `config.yaml` back to its default values: + +```bash +hyp reset +``` + +#### Submit Recipe Job + +```bash +hyp create +``` + +#### List Recipe Jobs + +```bash +hyp list hyp-recipe-job --namespace default +``` + +#### Describe a Recipe Job + +```bash +hyp describe hyp-recipe-job --job-name --namespace default +``` + +#### List Pods for a Recipe Job + +```bash +hyp list-pods hyp-recipe-job --job-name --namespace default +``` + +#### Get Logs from a Recipe Job Pod + +```bash +hyp get-logs hyp-recipe-job --job-name --pod-name --namespace default +``` + +#### Get Operator Logs + +```bash +hyp get-operator-logs hyp-recipe-job +``` + +#### Delete a Recipe Job + +```bash +hyp delete hyp-recipe-job --job-name --namespace default +``` + ### Inference ### Jumpstart Endpoint Creation diff --git a/doc/examples.md b/doc/examples.md index 18d4b392..319075b5 100644 --- a/doc/examples.md +++ b/doc/examples.md @@ -66,6 +66,13 @@ For detailed examples of training with HyperPod, see: **Training Examples** Refer the Training SDK Example. ::: +:::{grid-item-card} Recipe Job CLI Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/end_to_end_walkthrough/01-training-job-submission/02-recipe-job-cli.ipynb +:class-card: sd-border-primary + +**Recipe Job Example** Submit a fine-tuning job using `hyp-recipe-job` — pre-built recipes from SageMaker JumpStart Hub, no YAML required. 
+::: + :::: diff --git a/doc/getting_started/training.md b/doc/getting_started/training.md index 5e888c20..4bbdf42e 100644 --- a/doc/getting_started/training.md +++ b/doc/getting_started/training.md @@ -104,6 +104,107 @@ This will: - Initialize the job creation process +## Creating Training Jobs -- Recipe Job Init Experience + +The `hyp-recipe-job` experience lets you submit fine-tuning and evaluation jobs using pre-built recipes published to SageMaker JumpStart Hub. No YAML authoring required — the CLI fetches the Kubernetes job template and parameter spec automatically. + +### 1. Initialize a Recipe Job + +`````{tab-set} +````{tab-item} CLI (HuggingFace model ID) +```bash +mkdir my-recipe-job +cd my-recipe-job +hyp init hyp-recipe-job . \ + --huggingface-model-id Qwen/Qwen3-0.6B \ + --technique SFT \ + --instance-type ml.g5.48xlarge +``` +```` +````{tab-item} CLI (JumpStart model ID) +```bash +mkdir my-recipe-job +cd my-recipe-job +hyp init hyp-recipe-job . \ + --model-id huggingface-reasoning-qwen3-06b \ + --technique SFT \ + --instance-type ml.g5.48xlarge +``` +```` +````` + +Supported job types: +- **Fine-tuning**: `SFT`, `DPO`, `CPT`, `PPO`, `RLAIF`, `RLVR` +- **Evaluation**: `deterministic`, `LLMAJ` + +```{note} +If you omit `--instance-type`, the CLI will automatically query your HyperPod clusters and find clusters with instance types supported by the selected recipe and technique. You will be presented with a list of compatible clusters to choose from. Note that this interactive prompt requires a terminal and is not supported in Jupyter notebooks. +``` + +This creates three files in your job directory: +- `config.yaml` — your editable training parameters +- `.override_spec.json` — the parameter schema +- `k8s.jinja` — the Kubernetes job template + +### 3. 
Configure Recipe Job Parameters + +```bash +hyp configure \ + --name my-recipe-job \ + --namespace default \ + --data-path /data/recipes-data/sft/train.jsonl \ + --global-batch-size 8 \ + --learning-rate 0.0001 \ + --max-epochs 1 \ + --output-path /data/output/my-model \ + --instance-type ml.g5.48xlarge +``` + +### 4. Validate Configuration + +```bash +hyp validate +``` + +### 4a. Reset Configuration (Optional) + +To reset `config.yaml` back to its default values: + +```bash +hyp reset +``` + +### 5. Submit the Recipe Job + +```bash +hyp create +``` + +### 6. Manage Recipe Jobs + +```bash +# List jobs +hyp list hyp-recipe-job --namespace default + +# Describe a job +hyp describe hyp-recipe-job --job-name --namespace default + +# List pods +hyp list-pods hyp-recipe-job --job-name --namespace default + +# Get logs +hyp get-logs hyp-recipe-job --job-name --pod-name --namespace default + +# Get operator logs +hyp get-operator-logs hyp-recipe-job + +# Exec into pods +hyp exec hyp-recipe-job --job-name --namespace default --all-pods -- echo hello + +# Delete job +hyp delete hyp-recipe-job --job-name --namespace default +``` + ## Creating Training Jobs -- CLI/SDK You can create training jobs using either the CLI or SDK approach: @@ -295,5 +396,6 @@ For detailed examples of training with HyperPod, see: - CLI Training Init Experience Example - CLI Training Example - SDK Training Example +- Recipe Job CLI Example These examples demonstrate end-to-end workflows for creating and managing training jobs using both the CLI and SDK approaches. 
diff --git a/examples/end_to_end_walkthrough/01-training-job-submission/02-recipe-job-cli.ipynb b/examples/end_to_end_walkthrough/01-training-job-submission/02-recipe-job-cli.ipynb new file mode 100644 index 00000000..ac723f6d --- /dev/null +++ b/examples/end_to_end_walkthrough/01-training-job-submission/02-recipe-job-cli.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1b2c3d4", + "metadata": {}, + "source": [ + "# Submitting a Recipe Fine-Tuning Job - HyperPod CLI End-to-End Walkthrough\n", + "\n", + "This example shows how to fine-tune a model using the **HyperPod CLI recipe job** experience (`hyp-recipe-job`). Recipes are pre-built fine-tuning configurations published to SageMaker JumpStart Hub — they include a Kubernetes job template and parameter spec, so you don't need to write any YAML.\n", + "\n", + "The workflow is:\n", + "1. **`hyp init`** — fetch the recipe from SageMaker Hub and scaffold your job directory\n", + "2. **`hyp configure`** — set your training parameters\n", + "3. **`hyp validate`** — verify the configuration is complete and valid\n", + "4. **`hyp create`** — render and submit the Kubernetes job\n", + "\n", + "This example assumes you have completed the **Setup instructions** in [00-getting-started/00-setup.md](../00-getting-started/00-setup.md) and have a HyperPod EKS cluster with your kubeconfig configured." 
+ ] + }, + { + "cell_type": "markdown", + "id": "b2c3d4e5", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "- HyperPod EKS cluster with kubeconfig configured\n", + "- `sagemaker-hyperpod` CLI installed (`pip install sagemaker-hyperpod`)\n", + "- FSx for Lustre volume mounted at `/data` with training data available\n", + "- AWS credentials with `sagemaker:DescribeHubContent`, `sagemaker:ListHubContents`, and `s3:GetObject` permissions" + ] + }, + { + "cell_type": "markdown", + "id": "set_cluster_md", + "metadata": {}, + "source": [ + "## Set Cluster Context\n", + "\n", + "Configure to point at your HyperPod EKS cluster before running any other commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "set_cluster_code", + "metadata": {}, + "outputs": [], + "source": [ + "CLUSTER_NAME = \"\" # Replace with your HyperPod cluster name\n", + "!hyp set-cluster-context --cluster-name {CLUSTER_NAME}" + ] + }, + { + "cell_type": "markdown", + "id": "c3d4e5f6", + "metadata": {}, + "source": [ + "## Step 0: Configuration\n", + "\n", + "Set your job name and working directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e5f6a7", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "JOB_NAME = \"qwen3-sft-recipe-job\" # Change to a unique name\n", + "JOB_DIR = f\"./{JOB_NAME}\" # Local directory for job files\n", + "NAMESPACE = \"default\"\n", + "\n", + "os.makedirs(JOB_DIR, exist_ok=True)\n", + "print(f\"Job directory: {os.path.abspath(JOB_DIR)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e5f6a7b8", + "metadata": {}, + "source": [ + "## Step 1: Initialize the Recipe Job\n", + "\n", + "The `hyp init hyp-recipe-job` command fetches the recipe from SageMaker JumpStart Hub and creates three files in your job directory:\n", + "- `config.yaml` — your editable training parameters\n", + "- `.override_spec.json` — the parameter schema (used by `configure` and `validate`)\n", + "- `k8s.jinja` — the Kubernetes job template\n", + "\n", + "### Specifying your model\n", + "\n", + "You can identify the model using either a **JumpStart model ID** or a **HuggingFace model ID**:\n", + "\n", + "```bash\n", + "# Option A: JumpStart model ID\n", + "hyp init hyp-recipe-job --model-id huggingface-reasoning-qwen3-06b --technique SFT --instance-type ml.g5.48xlarge\n", + "\n", + "# Option B: HuggingFace model ID (resolved automatically via JumpStart Hub search)\n", + "hyp init hyp-recipe-job --huggingface-model-id Qwen/Qwen3-0.6B --technique SFT --instance-type ml.g5.48xlarge\n", + "```\n", + "\n", + "### Instance type selection\n", + "\n", + "If you omit `--instance-type`, the CLI will **automatically query your HyperPod clusters** and find clusters with instance types supported by the recipe and technique you selected. 
You will be presented with a list of compatible clusters to choose from.\n", + "\n", + "Supported job types:\n", + "- **Fine-tuning**: `SFT`, `DPO`, `CPT`, `PPO`, `RLAIF`, `RLVR`\n", + "- **Evaluation**: `deterministic`, `LLMAJ`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "435a652c", + "metadata": {}, + "outputs": [], + "source": [ + "# Option A: JumpStart model ID\n", + "!hyp init hyp-recipe-job {JOB_DIR} \\\n", + " --model-id huggingface-reasoning-qwen3-06b \\\n", + " --technique SFT \\\n", + " --instance-type ml.g5.48xlarge" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6a7b8c9", + "metadata": {}, + "outputs": [], + "source": [ + "# Option B: HuggingFace model ID (uncomment to use)\n", + "# !hyp init hyp-recipe-job {JOB_DIR} \\\n", + "# --huggingface-model-id Qwen/Qwen3-0.6B \\\n", + "# --technique SFT \\\n", + "# --instance-type ml.g5.48xlarge" + ] + }, + { + "cell_type": "markdown", + "id": "b084517f", + "metadata": {}, + "source": [ + "> **Note — Option C requires a terminal.** Omitting `--instance-type` triggers an interactive cluster selection prompt, which is not supported in Jupyter notebooks. Run this command in a terminal instead:\n", + ">\n", + "> ```bash\n", + "> hyp init hyp-recipe-job ./ \\\n", + "> --huggingface-model-id Qwen/Qwen3-0.6B \\\n", + "> --technique SFT\n", + "> ```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7b8c9d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify files were created\n", + "!ls -la {JOB_DIR}" + ] + }, + { + "cell_type": "markdown", + "id": "b8c9d0e1", + "metadata": {}, + "source": [ + "## Step 2: Configure Training Parameters\n", + "\n", + "Use `hyp configure` to set your training parameters. Run this from inside the job directory.\n", + "\n", + "You can see all available parameters with `hyp configure --help` (run from inside the job directory)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9d0e1f2", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$JOB_NAME\" \"$NAMESPACE\"\n", + "cd $1\n", + "hyp configure \\\n", + " --name $1 \\\n", + " --namespace $2 \\\n", + " --data-path /data/recipes-data/sft/zc_train_256.jsonl \\\n", + " --global-batch-size 8 \\\n", + " --learning-rate 0.0001 \\\n", + " --lr-warmup-ratio 0.1 \\\n", + " --max-epochs 5 \\\n", + " --output-path /data/output/qwen3-sft \\\n", + " --results-directory /data/results/qwen3-sft \\\n", + " --resume-from-path /data/output/qwen3-sft \\\n", + " --training-data-name zc_train_256 \\\n", + " --validation-data-name zc_train_256 \\\n", + " --validation-data-path /data/recipes-data/sft/zc_train_256.jsonl \\\n", + " --train-val-split-ratio 0.9 \\\n", + " --instance-type ml.g5.48xlarge" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0e1f2a3", + "metadata": {}, + "outputs": [], + "source": [ + "# Review the generated config\n", + "!cat {JOB_DIR}/config.yaml" + ] + }, + { + "cell_type": "markdown", + "id": "e1f2a3b4", + "metadata": {}, + "source": [ + "## Step 3: Validate Configuration\n", + "\n", + "Validate that all required fields are set and values are within allowed ranges." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2a3b4c5", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$JOB_NAME\"\n", + "cd $1\n", + "hyp validate" + ] + }, + { + "cell_type": "markdown", + "id": "a3b4c5d6", + "metadata": {}, + "source": [ + "## Step 4: Submit the Job\n", + "\n", + "`hyp create` renders the Kubernetes YAML from `k8s.jinja` + `config.yaml` and submits it to your cluster. The rendered files are saved under `run//` for reference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4c5d6e7", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$JOB_NAME\"\n", + "cd $1\n", + "hyp create" + ] + }, + { + "cell_type": "markdown", + "id": "c5d6e7f8", + "metadata": {}, + "source": [ + "## Step 5: Monitor the Job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6e7f8a9", + "metadata": {}, + "outputs": [], + "source": [ + "# Check job status\n", + "!kubectl get hyperpodpytorchjob {JOB_NAME} -n {NAMESPACE}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7f8a9b0", + "metadata": {}, + "outputs": [], + "source": [ + "# List pods\n", + "!hyp list-pods hyp-recipe-job --job-name {JOB_NAME} --namespace {NAMESPACE}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8a9b0c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Stream logs from the first pod (replace pod name from list-pods output above)\n", + "# !kubectl logs -f -n {NAMESPACE}" + ] + }, + { + "cell_type": "markdown", + "id": "a9b0c1d2", + "metadata": {}, + "source": [ + "## Step 6: Clean Up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0c1d2e3", + "metadata": {}, + "outputs": [], + "source": [ + "!hyp delete hyp-recipe-job --job-name {JOB_NAME} --namespace {NAMESPACE}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/end_to_end_walkthrough/01-training-job-submission/README.MD b/examples/end_to_end_walkthrough/01-training-job-submission/README.MD index f23919b9..e099c613 100644 --- a/examples/end_to_end_walkthrough/01-training-job-submission/README.MD +++ b/examples/end_to_end_walkthrough/01-training-job-submission/README.MD @@ -2,4 +2,5 @@ This folder contains the following files: - 
[00-pytorch-training-job.md](00-pytorch-training-job.md) - Instructions on how to create and submit a **Qwen3 4B Lora** fine-tuning job to the HyperPod cluster through the HyperPod CLI. Additionally, an example for instance failure recovery. -- [01-pytorch-training-job-sdk.ipynb](01-pytorch-training-job-sdk.ipynb) - Instructions on how to to utilize the HyperPod Python SDK to create and submit the equivalent job to the HyperPod cluster. \ No newline at end of file +- [01-pytorch-training-job-sdk.ipynb](01-pytorch-training-job-sdk.ipynb) - Instructions on how to utilize the HyperPod Python SDK to create and submit the equivalent job to the HyperPod cluster. +- [02-recipe-job-cli.ipynb](02-recipe-job-cli.ipynb) - Instructions on how to submit a fine-tuning job using the HyperPod CLI recipe job experience (`hyp-recipe-job`). Recipes are pre-built fine-tuning configurations fetched from SageMaker JumpStart Hub — no YAML authoring required. \ No newline at end of file diff --git a/examples/end_to_end_walkthrough/README.md b/examples/end_to_end_walkthrough/README.md index 2d6f5d6e..7b1c618b 100644 --- a/examples/end_to_end_walkthrough/README.md +++ b/examples/end_to_end_walkthrough/README.md @@ -9,6 +9,7 @@ A recording of the full walkthrough as part of re:invent 2025 session 371 is ava - [**Training Job Submission**](./01-training-job-submission/) - [00-pytorch-training-job.md](./01-training-job-submission/00-pytorch-training-job.md) - Instructions on how to create and submit a Qwen3 4B Lora fine-tuning job to the HyperPod cluster through the HyperPod CLI. Additionally, an example for instance failure recovery. - [01-pytorch-training-job-sdk.ipynb](./01-training-job-submission/01-pytorch-training-job-sdk.ipynb) - Instructions on how to to utilize the HyperPod Python SDK to create and submit the equivalent job to the HyperPod cluster. 
+ - [02-recipe-job-cli.ipynb](./01-training-job-submission/02-recipe-job-cli.ipynb) - Instructions on how to submit a fine-tuning job using the HyperPod CLI recipe job experience (`hyp-recipe-job`). Recipes are pre-built fine-tuning configurations fetched from SageMaker JumpStart Hub — no YAML authoring required. - [**Inference Deployment**](./02-inference-deployment/) - [00-jumpstart-endpoint.md](./02-inference-deployment/00-jumpstart-endpoint.md) - Instructions on how to deploy models available on SageMaker JumpStart to the HyperPod cluster. - [01-custom-model-endpoint.md](./02-inference-deployment/01-custom-model-endpoint.md) - Instructions on how to deploy a custom model from an S3 bucket (TinyLlama) to the HyperPod cluster and how to utilize the autoscaling functionality. diff --git a/setup.py b/setup.py index 49097c3f..fcb88a8e 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ "awscli-cwlogs>=1.4.6", "boto3>=1.35.3,<2.0", "botocore>=1.35.6 ", - "kubernetes==33.1.0", + "kubernetes>=33.1.0", "kr8s>=0.20.0", "pyyaml==6.0.2", "ratelimit==2.2.1", @@ -76,6 +76,8 @@ "omegaconf==2.3", "pynvml==11.4.1", "requests==2.32.4", + "urllib3>=1.21.1,<3", + "charset-normalizer>=2,<4", "tqdm==4.66.5", "zstandard==0.15.2", # Test dependencies diff --git a/src/sagemaker/hyperpod/cli/commands/cluster.py b/src/sagemaker/hyperpod/cli/commands/cluster.py index 289a827a..a66d3a33 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster.py @@ -358,6 +358,12 @@ def rate_limited_operation( ) cluster_capacities.append(capacities) return cluster_capacities + except RuntimeError as e: + if "not found" in str(e): + logger.debug(f"Skipping cluster {cluster_name}: {e}") + else: + logger.error(f"Error processing cluster {cluster_name}: {e}, continue...") + return None except Exception as e: logger.error(f"Error processing cluster {cluster_name}: {e}, continue...") return None @@ -831,8 +837,11 @@ def _update_kube_config( try: # Execute 
the command to update kubeconfig - subprocess.run(command, check=True) + subprocess.run(command, check=True, capture_output=True) except subprocess.CalledProcessError as e: + stderr = e.stderr.decode() if e.stderr else "" + if "ResourceNotFoundException" in stderr or "No cluster found" in stderr: + raise RuntimeError(f"EKS cluster '{eks_name}' not found (may have been deleted)") raise RuntimeError(f"Failed to update kubeconfig: {e}") except (OSError, ValueError) as e: raise RuntimeError(f"Invalid command execution: {e}") diff --git a/src/sagemaker/hyperpod/cli/commands/inference.py b/src/sagemaker/hyperpod/cli/commands/inference.py index 49c14fa0..85a1fcd0 100644 --- a/src/sagemaker/hyperpod/cli/commands/inference.py +++ b/src/sagemaker/hyperpod/cli/commands/inference.py @@ -29,9 +29,7 @@ @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_js_endpoint_cli") @handle_cli_exceptions() def js_create(version, debug, js_endpoint): - """ - Create a jumpstart model endpoint. - """ + """Create a jumpstart model endpoint""" click.echo(f"Using version: {version}") js_endpoint.create(debug=debug) @@ -46,9 +44,7 @@ def js_create(version, debug, js_endpoint): @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_custom_endpoint_cli") @handle_cli_exceptions() def custom_create(version, debug, custom_endpoint): - """ - Create a custom model endpoint. - """ + """Create a custom model endpoint""" click.echo(f"Using version: {version}") custom_endpoint.create(debug=debug) @@ -134,7 +130,7 @@ def js_list( namespace: Optional[str], ): """ - List all Hyperpod Jumpstart model endpoints. + List all HyperPod Jumpstart model endpoints. """ endpoints = HPJumpStartEndpoint.model_construct().list(namespace) data = [ep.model_dump() for ep in endpoints] @@ -177,7 +173,7 @@ def custom_list( namespace: Optional[str], ): """ - List all Hyperpod custom model endpoints. + List all HyperPod custom model endpoints. 
""" endpoints = HPEndpoint.model_construct().list(namespace) data = [ep.model_dump() for ep in endpoints] @@ -236,7 +232,7 @@ def js_describe( full: bool ): """ - Describe a Hyperpod Jumpstart model endpoint. + Describe a HyperPod Jumpstart model endpoint. """ my_endpoint = HPJumpStartEndpoint.model_construct().get(name, namespace) data = my_endpoint.model_dump() @@ -385,7 +381,7 @@ def custom_describe( full: bool ): """ - Describe a Hyperpod custom model endpoint. + Describe a HyperPod custom model endpoint. """ my_endpoint = HPEndpoint.model_construct().get(name, namespace) data = my_endpoint.model_dump() diff --git a/src/sagemaker/hyperpod/cli/commands/init.py b/src/sagemaker/hyperpod/cli/commands/init.py index c3a54d16..8cfb15c1 100644 --- a/src/sagemaker/hyperpod/cli/commands/init.py +++ b/src/sagemaker/hyperpod/cli/commands/init.py @@ -1,10 +1,10 @@ import click import yaml import sys +import shutil from pathlib import Path from datetime import datetime from jinja2 import Template -import shutil from sagemaker.hyperpod.cli.constants.init_constants import ( USAGE_GUIDE_TEXT_CFN, USAGE_GUIDE_TEXT_CRD, @@ -24,51 +24,96 @@ build_config_from_schema, save_template, get_default_version_for_template, - create_from_k8s_yaml + create_from_k8s_yaml, + is_dynamic_template ) from sagemaker.hyperpod.common.utils import get_aws_default_region from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( _hyperpod_telemetry_emitter, ) from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.cli.commands.training_recipe import _init_training_job, _configure_dynamic_template, _create_dynamic_template +from sagemaker.hyperpod.cli.recipe_utils import _validate_dynamic_template, _generate_dynamic_config_yaml + @click.command("init") @click.argument("template", type=click.Choice(list(TEMPLATES.keys()))) @click.argument("directory", type=click.Path(file_okay=False), default=".") @click.option("--version", "-v", default=None, help="Schema 
version") +@click.option("--model-id", hidden=True, help="JumpStart model ID (e.g. meta-textgeneration-llama-3-2-1b)") +@click.option("--huggingface-model-id", hidden=True, help="HuggingFace model ID (e.g. meta-llama/Llama-2-7b)") +@click.option("--technique", hidden=True, help="Customization technique (for hyp-recipe-job only)") +@click.option("--instance-type", hidden=True, help="Instance type (optional - if not provided, interactive cluster selection will be used)") @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_template_cli") def init( template: str, directory: str, version: str, + model_id: str, + huggingface_model_id: str, + technique: str, + instance_type: str, ): """ Initialize a TEMPLATE scaffold in DIRECTORY. - + This command creates a complete project scaffold for the specified template type. It performs the following steps: - + + \b 1. Checks if the directory already contains a config.yaml and handles existing configurations 2. Creates the target directory if it doesn't exist 3. Generates a config.yaml file with schema-based default values 4. Creates a template file (.jinja) for the specified template type 5. Adds a README.md with usage instructions - + The generated files provide a starting point for configuring and submitting jobs to SageMaker HyperPod clusters orchestrated by Amazon EKS. 
+ + Available templates: + + \b + hyp-pytorch-job PyTorch distributed training job + hyp-jumpstart-endpoint JumpStart model inference endpoint + hyp-custom-endpoint Custom model inference endpoint + cluster-stack HyperPod EKS cluster CloudFormation stack + hyp-recipe-job Fine-tuning/evaluation job from JumpStart Hub recipe + + For hyp-recipe-job, the following options are available: + + \b + --model-id JumpStart model ID (required, or use --huggingface-model-id) + --huggingface-model-id HuggingFace model ID (required, or use --model-id) + --technique Fine-tuning: SFT, DPO, RLAIF, RLVR, CPT, PPO + Evaluation: deterministic, LLMAJ (required) + --instance-type Instance type to use. If not provided, an interactive + cluster selection will be launched (optional) """ + # Original template initialization logic dir_path = Path(directory).resolve() config_file = dir_path / "config.yaml" skip_readme = False + # Validate required params for hyp-recipe-job before any output + if template in ["hyp-recipe-job"]: + if not model_id and not huggingface_model_id: + click.secho(f"❌ --model-id or --huggingface-model-id is required for {template}", fg="red") + return + if model_id and huggingface_model_id: + click.secho("❌ Specify either --model-id or --huggingface-model-id, not both", fg="red") + return + if not technique: + click.secho(f"❌ --technique is required for {template} (e.g. 
SFT, DPO, deterministic, LLMAJ)", fg="red") + return + # 1) Inspect existing config.yaml try: if config_file.is_file(): try: - existing = yaml.safe_load(config_file.read_text()) or {} - existing_template = existing.get("template") + # Use load_config to properly read commented template + _, existing_template, _ = load_config(dir_path) except Exception as e: - click.echo("Could not parse existing config.yaml: %s", e) + click.secho(f"⚠️ Could not parse existing config.yaml: {e}", fg="yellow") existing_template = None if existing_template == template: @@ -90,7 +135,7 @@ def init( else: click.echo(f"Initializing new scaffold for '{template}'…") except Exception as e: - click.secho("💥 Initialization aborted due to error: %s", e, fg="red") + click.secho(f"💥 Initialization aborted due to error: {e}", fg="red") sys.exit(1) # 2) Ensure directory exists @@ -100,6 +145,14 @@ def init( click.secho(f"❌ Could not create directory {dir_path}: {e}", fg="red") sys.exit(1) + # Handle dynamic job templates + if template in ["hyp-recipe-job"]: + resolved_model_id = huggingface_model_id if huggingface_model_id else model_id + if _init_training_job(directory, template, resolved_model_id, technique, instance_type, is_huggingface=bool(huggingface_model_id)): + click.secho(f"✔️ {template.replace('-', ' ').title()} initialized successfully", fg="green") + click.secho("📄 Created: config.yaml, k8s.jinja", fg="green") + return + # 3) Build config dict + comment map, then write config.yaml try: # Determine version: use user-provided version or default to latest @@ -162,9 +215,19 @@ def reset(): # 1) Load and validate config data, template, version = load_config(dir_path) - # 2) Build config with default values from schema + # 2) Check if this is a dynamic template + if is_dynamic_template(template, dir_path): + # For dynamic templates, reset using the helper function + try: + _generate_dynamic_config_yaml(dir_path, template, version) + click.secho("✔️ config.yaml reset: all fields set to 
default values.", fg="green") + except Exception as e: + click.secho(f"💥 Could not reset config.yaml: {e}", fg="red") + sys.exit(1) + return + + # 3) Standard template reset logic full_cfg, comment_map = build_config_from_schema(template, version) - # 3) Overwrite config.yaml try: save_config_yaml( prefill=full_cfg, @@ -185,7 +248,7 @@ def reset(): @generate_click_command() @click.pass_context @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_configure_cli") -def configure(ctx, model_config): +def configure(ctx, option, value, model_config): """ Update any subset of fields in ./config.yaml by passing -- flags. @@ -201,16 +264,27 @@ def configure(ctx, model_config): # Update multiple fields at once hyp configure --stack-name my-stack --create-fsx-stack: False - + # Update complex fields with JSON object hyp configure --availability-zone-ids '["id1", "id2"]' - """ # 1) Load existing config without validation dir_path = Path(".").resolve() data, template, version = load_config(dir_path) - # 2) Determine which fields the user actually provided + # 2) Check if this is a dynamic template (recipe) + if is_dynamic_template(template, dir_path): + # Handle recipe configure logic + _configure_dynamic_template(ctx, option, value, dir_path) + return + + # 3) Handle standard template configure logic + _configure_standard_template(ctx, model_config, dir_path, data, template, version) + + +def _configure_standard_template(ctx, model_config, dir_path, data, template, version): + """Handle configure for standard templates""" + # Determine which fields the user actually provided # Use Click's parameter source tracking to identify command-line provided parameters user_input_fields = set() @@ -223,10 +297,10 @@ def configure(ctx, model_config): user_input_fields.add(param_name) if not user_input_fields: - click.secho("⚠️ No arguments provided to configure.", fg="yellow") - return + click.echo(ctx.get_help()) + ctx.exit(0) - # 3) Build merged config with user input + # Build 
merged config with user input full_cfg, comment_map = build_config_from_schema( template=template, version=version, @@ -235,7 +309,7 @@ def configure(ctx, model_config): user_provided_fields=user_input_fields ) - # 4) Validate the merged config, but only check user-provided fields + # Validate the merged config, but only check user-provided fields all_validation_errors = validate_config_against_model(full_cfg, template, version) user_input_errors = filter_validation_errors_for_user_input(all_validation_errors, user_input_fields) @@ -249,7 +323,7 @@ def configure(ctx, model_config): click.secho("❌ config.yaml was not updated due to invalid input.", fg="red") sys.exit(1) - # 5) Write out the updated config.yaml (only if user input is valid) + # Write out the updated config.yaml (only if user input is valid) try: save_config_yaml( prefill=full_cfg, @@ -268,7 +342,26 @@ def validate(): Validate this directory's config.yaml against the appropriate schema. """ dir_path = Path(".").resolve() - load_config_and_validate(dir_path) + + try: + # Load config to determine template type + data, template, version = load_config(dir_path) + + # Check if this is a dynamic template + if is_dynamic_template(template, dir_path): + # Validate dynamic template + _validate_dynamic_template(dir_path) + click.secho("✔️ Configuration validated successfully", fg="green") + else: + # Use standard validation + load_config_and_validate(dir_path) + click.secho("✔️ Configuration validated successfully", fg="green") + except (FileNotFoundError, ValueError) as e: + click.secho(f"❌ {e}", fg="red") + sys.exit(1) + except Exception as e: + click.secho(f"❌ Validation failed: {e}", fg="red") + sys.exit(1) @click.command(name="_default_create") @@ -310,6 +403,17 @@ def _default_create(region, template_version, debug): # 1) Load config to determine template type data, template, version = load_config_and_validate(dir_path) + # Check if this is a dynamic template (recipe) + if is_dynamic_template(template, 
dir_path): + _create_dynamic_template(dir_path, data) + return + + # Handle standard templates (existing logic) + _create_standard_template(dir_path, data, template, version, region, template_version, debug) + + +def _create_standard_template(dir_path: Path, data: dict, template: str, version: str, region: str, template_version: int, debug: bool = False): + """Handle create for standard templates""" # Check if region flag is used for non-cluster-stack templates if region and template != "cluster-stack": click.secho(f"❌ --region flag is only available for cluster-stack template, not for {template}.", fg="red") @@ -324,6 +428,7 @@ def _default_create(region, template_version, debug): jinja_file = dir_path / 'k8s.jinja' # 3) Ensure files exist + config_file = dir_path / 'config.yaml' if not config_file.is_file() or not jinja_file.is_file(): click.secho(f"❌ Missing config.yaml or {jinja_file.name}. Run `hyp init` first.", fg="red") sys.exit(1) @@ -387,7 +492,6 @@ def _default_create(region, template_version, debug): k8s_file = out_dir / 'k8s.yaml' create_from_k8s_yaml(str(k8s_file), debug=debug) - except Exception as e: click.secho(f"❌ Failed to submit the command: {e}", fg="red") sys.exit(1) \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index 5d3153ab..b59f874b 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -21,7 +21,7 @@ @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_pytorchjob_cli") @handle_cli_exceptions() def pytorch_create(version, debug, job): - """Create a PyTorch job.""" + """Create a PyTorch job""" click.echo(f"Using version: {version}") # Create job job.create(debug=debug) diff --git a/src/sagemaker/hyperpod/cli/commands/training_recipe.py b/src/sagemaker/hyperpod/cli/commands/training_recipe.py new file mode 100644 index 00000000..1c5f954f --- /dev/null +++ 
b/src/sagemaker/hyperpod/cli/commands/training_recipe.py @@ -0,0 +1,321 @@ +from concurrent.futures import ThreadPoolExecutor +import threading +from datetime import datetime +from kubernetes import client, config +from kubernetes.client.rest import ApiException +import yaml +import json +import sys +import click +from pathlib import Path +from sagemaker.hyperpod.cli.init_utils import load_dynamic_schema +from sagemaker.hyperpod.common.utils import handle_exception +from sagemaker.hyperpod.cli.type_handler_utils import is_undefined_value +from sagemaker.hyperpod.cli.recipe_utils import ( + _fetch_recipe_from_hub, _download_s3_content, _download_s3_json, + _validate_and_convert_value, _collect_all_parameters_interactively, + _submit_k8s_resources, _render_k8s_template, _get_sagemaker_client, + _get_s3_client, _get_k8s_custom_client, _validate_dynamic_template, + _generate_dynamic_config_yaml, _update_config_field +) +import shutil +from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.common.telemetry.telemetry_logging import _hyperpod_telemetry_emitter +from sagemaker.hyperpod.common.cli_decorators import handle_cli_exceptions + + +def _interactive_cluster_selection(sagemaker_client, model_id: str, job_type: str, technique: str = None, is_huggingface: bool = False): + """Interactive cluster and instance type selection.""" + try: + matching_recipe = _fetch_recipe_from_hub(sagemaker_client, model_id, job_type, technique, None, is_huggingface=is_huggingface) + supported_instance_types = set(matching_recipe.get('SupportedInstanceTypes', [])) + + if not supported_instance_types: + click.secho("❌ No supported instance types found in recipe", fg="red") + return None, None + + click.secho("🔍 Fetching available clusters...", fg="blue") + + try: + from sagemaker.hyperpod.cli.commands.cluster import _get_hyperpod_clusters + from sagemaker.hyperpod.common.utils import get_current_cluster + + cluster_names = 
_get_hyperpod_clusters(sagemaker_client) + if not cluster_names: + click.secho("❌ No HyperPod clusters found", fg="red") + return None, None + + # Build {cluster_name: [(instance_type, node_count), ...]} for compatible types only + clusters_map: dict = {} + lock = threading.Lock() + + def _fetch_cluster(cluster_name: str): + try: + cluster_response = sagemaker_client.describe_cluster(ClusterName=cluster_name) + compatible = [ + (g.get('InstanceType'), g.get('CurrentCount', 0)) + for g in cluster_response.get('InstanceGroups', []) + if g.get('InstanceType') in supported_instance_types + ] + if compatible: + with lock: + clusters_map[cluster_name] = compatible + except Exception as e: + click.secho(f"⚠️ Warning: Could not get details for cluster {cluster_name}: {e}", fg="yellow") + + with ThreadPoolExecutor(max_workers=min(len(cluster_names), 10)) as executor: + list(executor.map(_fetch_cluster, cluster_names)) + + if not clusters_map: + click.secho("❌ No cluster details could be retrieved", fg="red") + return None, None + + except Exception as e: + click.secho(f"❌ Error fetching clusters: {e}", fg="red") + return None, None + + if not clusters_map: + click.secho( + f"❌ No compatible clusters found. The '{technique or job_type}' recipe for " + f"'{model_id}' requires one of: {sorted(supported_instance_types)}", + fg="red", + ) + click.secho( + " To skip cluster auto-detection, specify the instance type directly: --instance-type ", + fg="yellow", + ) + return None, None + + # Detect current cluster context + current_cluster = None + try: + current_cluster = get_current_cluster() + except Exception: + pass + + def _prompt_instance_type(cluster_name: str) -> str | None: + instance_types = clusters_map[cluster_name] + click.secho(f"\n📋 Compatible instance types for {cluster_name}:", fg="green") + for i, (itype, nodes) in enumerate(instance_types, 1): + click.secho(f" {i}. 
{itype:<22} ({nodes} nodes)", fg="white") + while True: + try: + choice = click.prompt(f"Select an instance type (1-{len(instance_types)})", type=int) + if 1 <= choice <= len(instance_types): + return instance_types[choice - 1][0] + click.secho(f"❌ Please enter a number between 1 and {len(instance_types)}", fg="red") + except (ValueError, click.Abort): + click.secho("❌ Operation cancelled", fg="red") + return None + + # If current context cluster is compatible, offer it as default + if current_cluster and current_cluster in clusters_map: + click.secho(f"\nCurrent cluster context: {current_cluster}", fg="cyan") + instance_type = _prompt_instance_type(current_cluster) + if instance_type is None: + return None, None + # Ask if they want to use a different cluster + try: + use_different = click.confirm("Use a different cluster?", default=False) + except click.Abort: + click.secho("❌ Operation cancelled", fg="red") + return None, None + if not use_different: + click.secho(f"✔️ Selected: {current_cluster} ({instance_type})", fg="green") + return current_cluster, instance_type + + # Full cluster selection + cluster_list = list(clusters_map.keys()) + click.secho(f"\n📋 Compatible clusters ({len(cluster_list)} found):", fg="green") + click.secho("-" * 80, fg="blue") + for i, name in enumerate(cluster_list, 1): + types_summary = ", ".join(f"{t} ({n} nodes)" for t, n in clusters_map[name]) + click.secho(f"{i}. 
{name:<40} {types_summary}", fg="cyan") + + while True: + try: + choice = click.prompt(f"\nSelect a cluster (1-{len(cluster_list)})", type=int) + if 1 <= choice <= len(cluster_list): + selected_cluster = cluster_list[choice - 1] + break + click.secho(f"❌ Please enter a number between 1 and {len(cluster_list)}", fg="red") + except (ValueError, click.Abort): + click.secho("❌ Operation cancelled", fg="red") + return None, None + + instance_type = _prompt_instance_type(selected_cluster) + if instance_type is None: + return None, None + + click.secho(f"✔️ Selected: {selected_cluster} ({instance_type})", fg="green") + return selected_cluster, instance_type + + except ValueError as e: + click.secho(f"❌ {e}", fg="red") + return None, None + except Exception as e: + click.secho(f"❌ Error during cluster selection: {e}", fg="red") + return None, None + + +def _init_training_job(directory: str, job_type: str, model_id: str, technique: str, instance_type: str = None, is_huggingface: bool = False) -> bool: + """Initialize training job configuration.""" + try: + sagemaker_client = _get_sagemaker_client() + s3_client = _get_s3_client() + + # If instance_type not provided, use interactive selection + cluster_name = None + if not instance_type: + cluster_name, instance_type = _interactive_cluster_selection( + sagemaker_client, model_id, job_type, technique, is_huggingface=is_huggingface + ) + if not instance_type: + return False + + # Update kubeconfig to point at the selected cluster + if cluster_name: + from sagemaker.hyperpod.cli.commands.cluster import set_cluster_context + click.secho(f"🔧 Connecting to cluster: {cluster_name}", fg="blue") + set_cluster_context.main(["--cluster-name", cluster_name], standalone_mode=False) + + # Fetch and validate recipe + matching_recipe = _fetch_recipe_from_hub(sagemaker_client, model_id, job_type, technique, instance_type, is_huggingface=is_huggingface) + + override_params_uri = matching_recipe.get('HpEksOverrideParamsS3Uri') + 
k8s_template_uri = matching_recipe.get('HpEksPayloadTemplateS3Uri') + + if not override_params_uri or not k8s_template_uri: + click.secho("❌ Missing S3 URIs in recipe", fg="red") + return False + + # Create directory + dir_path = Path(directory).resolve() + dir_path.mkdir(parents=True, exist_ok=True) + + # Download and save override params + override_data = _download_s3_json(s3_client, override_params_uri) + with open(dir_path / '.override_spec.json', 'w') as f: + json.dump(override_data, f, indent=2) + + # Create config.yaml + _generate_dynamic_config_yaml(dir_path, job_type, model_name=model_id, technique=technique, instance_type=instance_type) + + # Download and save k8s template + k8s_content = _download_s3_content(s3_client, k8s_template_uri) + with open(dir_path / 'k8s.jinja', 'w') as f: + f.write(k8s_content) + + return True + + except Exception as e: + click.secho(f"❌ Error: {e}", fg="red") + return False + +def _configure_dynamic_template(ctx, option, value, dir_path): + """Handle configure for dynamic templates (recipe)""" + config_path = dir_path / "config.yaml" + spec_path = dir_path / ".override_spec.json" + + if not spec_path.exists(): + click.secho(f"❌ .override_spec.json not found", fg="red") + ctx.exit(1) + + # Load spec + spec = load_dynamic_schema(dir_path) + + # Check if user provided --option flags (only those explicitly provided, not defaults) + provided_options = {} + for param_name, param_value in ctx.params.items(): + if param_name not in ['option', 'value', 'model_config']: + # Check if this parameter was actually provided by the user (not a default) + param_source = ctx.get_parameter_source(param_name) + if param_source and param_source.name == 'COMMANDLINE' and param_value is not None: + # Convert back to original key format + original_key = param_name.replace('-', '_') + if original_key in spec: + provided_options[original_key] = param_value + + # If --option flags were used, process them + if provided_options: + for key, value in 
provided_options.items(): + _update_config_field(config_path, spec, key, value) + click.secho("✔️ config.yaml updated successfully.", fg="green") + return + + # If no arguments, show help + click.echo(ctx.get_help()) + ctx.exit(0) + + +def _warn_if_instance_type_unavailable(instance_type: str) -> None: + """Warn if the requested instance type has no ready nodes in the current cluster.""" + try: + config.load_kube_config() + v1 = client.CoreV1Api() + nodes = v1.list_node().items + available = { + n.metadata.labels.get("node.kubernetes.io/instance-type") + for n in nodes + if n.metadata.labels + } + available.discard(None) + if instance_type and instance_type not in available: + click.secho( + f"⚠️ Instance type '{instance_type}' not found in the current cluster.\n" + f" Available: {', '.join(sorted(available)) or 'none'}\n" + f" The job will be submitted but pods may remain Pending.", + fg="yellow" + ) + except Exception as e: + click.secho(f"⚠️ Could not verify instance type availability: {e}", fg="yellow") + + +def _create_dynamic_template(dir_path: Path, config_data: dict): + """Handle create for dynamic templates (recipe)""" + try: + # Validate config first + _validate_dynamic_template(dir_path) + click.secho("✔️ Configuration validated successfully", fg="green") + + # Warn if instance type isn't available in the cluster + _warn_if_instance_type_unavailable(config_data.get('instance_type')) + + k8s_template_file = dir_path / 'k8s.jinja' + if not k8s_template_file.exists(): + raise FileNotFoundError("k8s.jinja template not found") + + # Read and render template + template_content = k8s_template_file.read_text() + rendered = _render_k8s_template(template_content, config_data) + + # Create run directory + run_root = dir_path / 'run' + run_root.mkdir(exist_ok=True) + timestamp = datetime.now().strftime('%Y%m%dT%H%M%S') + out_dir = run_root / timestamp + out_dir.mkdir() + + # Save files + shutil.copy(dir_path / 'config.yaml', out_dir / 'config.yaml') + (out_dir / 
'k8s.yaml').write_text(rendered) + + relative_out_dir = Path("run") / timestamp + click.secho(f"✔️ Files written to {relative_out_dir}", fg="green") + + # Submit to Kubernetes + custom_api = _get_k8s_custom_client() + _submit_k8s_resources(custom_api, rendered) + + click.secho("✔️ Successfully submitted to HyperPod", fg="green") + + except (FileNotFoundError, ValueError) as e: + click.secho(f"❌ {e}", fg="red") + sys.exit(1) + except Exception as e: + try: + resource_name = config_data.get('name', 'unknown') + handle_exception(e, resource_name, 'default') + except Exception as handled_e: + click.secho(f"❌ {handled_e}", fg="red") + sys.exit(1) diff --git a/src/sagemaker/hyperpod/cli/constants/init_constants.py b/src/sagemaker/hyperpod/cli/constants/init_constants.py index 3168484d..4a1ba4df 100644 --- a/src/sagemaker/hyperpod/cli/constants/init_constants.py +++ b/src/sagemaker/hyperpod/cli/constants/init_constants.py @@ -38,6 +38,13 @@ "schema_pkg": "hyperpod_cluster_stack_template", "schema_type": CFN, 'type': "jinja" + }, + "hyp-recipe-job": { + "registry": {}, + "template_registry": {}, + "schema_pkg": None, + "schema_type": CRD, + 'type': "dynamic" } } diff --git a/src/sagemaker/hyperpod/cli/hyp_cli.py b/src/sagemaker/hyperpod/cli/hyp_cli.py index a33aee29..4b2107c0 100644 --- a/src/sagemaker/hyperpod/cli/hyp_cli.py +++ b/src/sagemaker/hyperpod/cli/hyp_cli.py @@ -1,11 +1,7 @@ import click -import yaml -import json -import os -import subprocess -from pydantic import BaseModel, ValidationError, Field -from typing import Optional, Union +from typing import Union from importlib.metadata import version, PackageNotFoundError +import copy from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \ get_monitoring, describe_cluster @@ -62,8 +58,8 @@ from sagemaker.hyperpod.cli.commands.init import ( init, reset, - configure, validate, + configure, _default_create ) @@ -74,6 +70,7 @@ def get_package_version(package_name): 
except PackageNotFoundError: return "Not installed" + def print_version(ctx, param, value): if not value or ctx.resilient_parsing: return @@ -91,7 +88,8 @@ def print_version(ctx, param, value): @click.group(context_settings={'max_content_width': 200}) -@click.option('--version', is_flag=True, callback=print_version, expose_value=False, is_eager=True, help='Show version information') +@click.option('--version', is_flag=True, callback=print_version, expose_value=False, is_eager=True, + help='Show version information') def cli(): pass @@ -142,11 +140,13 @@ def describe(): """Describe endpoints, pytorch jobs or cluster stacks, spaces or space template.""" pass + @cli.group(cls=CLICommand) def update(): """Update an existing HyperPod cluster configuration, space, or space template.""" pass + @cli.group(cls=CLICommand) def delete(): """Delete endpoints, pytorch jobs, space, space access or space template.""" @@ -173,7 +173,7 @@ def portforward(): @cli.group(cls=CLICommand) def list_pods(): - """List pods for endpoints or pytorch jobs.""" + """List pods for endpoints, pytorch jobs, or recipe jobs.""" pass @@ -197,7 +197,7 @@ def get_operator_logs(): @cli.group(cls=CLICommand) def exec(): - """Execute commands in pods for endpoints or pytorch jobs.""" + """Execute commands in pods for endpoints, pytorch jobs, or recipe jobs.""" pass @@ -207,6 +207,7 @@ def exec(): cli.add_command(validate) create.add_command(pytorch_create) +# create.add_command(create_recipe_job_interactive) create.add_command(js_create) create.add_command(custom_create) @@ -217,6 +218,9 @@ def exec(): create.add_command(space_access_create) list.add_command(list_jobs) +recipe_list_cmd = copy.copy(list_jobs) +recipe_list_cmd.help = "List all HyperPod recipe jobs" +list.add_command(recipe_list_cmd, name="hyp-recipe-job") list.add_command(js_list) list.add_command(custom_list) list.add_command(list_cluster_stacks) @@ -224,6 +228,9 @@ def exec(): list.add_command(space_template_list) 
describe.add_command(pytorch_describe) +recipe_describe_cmd = copy.copy(pytorch_describe) +recipe_describe_cmd.help = "Describe a HyperPod recipe job." +describe.add_command(recipe_describe_cmd, name="hyp-recipe-job") describe.add_command(js_describe) describe.add_command(custom_describe) describe.add_command(describe_cluster_stack) @@ -237,6 +244,9 @@ def exec(): update.add_command(space_template_update) delete.add_command(pytorch_delete) +recipe_delete_cmd = copy.copy(pytorch_delete) +recipe_delete_cmd.help = "Delete a HyperPod recipe job." +delete.add_command(recipe_delete_cmd, name="hyp-recipe-job") delete.add_command(js_delete) delete.add_command(custom_delete) delete.add_command(delete_cluster_stack) @@ -248,10 +258,16 @@ def exec(): stop.add_command(space_stop) list_pods.add_command(pytorch_list_pods) +recipe_list_pods_cmd = copy.copy(pytorch_list_pods) +recipe_list_pods_cmd.help = "List all HyperPod PyTorch pods related to the recipe job." +list_pods.add_command(recipe_list_pods_cmd, name="hyp-recipe-job") list_pods.add_command(js_list_pods) list_pods.add_command(custom_list_pods) get_logs.add_command(pytorch_get_logs) +recipe_get_logs_cmd = copy.copy(pytorch_get_logs) +recipe_get_logs_cmd.help = "Get specific pod log for HyperPod recipe job." +get_logs.add_command(recipe_get_logs_cmd, name="hyp-recipe-job") get_logs.add_command(js_get_logs) get_logs.add_command(custom_get_logs) get_logs.add_command(space_get_logs) @@ -259,11 +275,16 @@ def exec(): portforward.add_command(space_portforward) get_operator_logs.add_command(pytorch_get_operator_logs) +recipe_get_operator_logs_cmd = copy.copy(pytorch_get_operator_logs) +recipe_get_operator_logs_cmd.help = "Get operator logs for HyperPod recipe jobs." 
+get_operator_logs.add_command(recipe_get_operator_logs_cmd, name="hyp-recipe-job") get_operator_logs.add_command(js_get_operator_logs) get_operator_logs.add_command(custom_get_operator_logs) invoke.add_command(custom_invoke) -invoke.add_command(custom_invoke, name="hyp-jumpstart-endpoint") +jumpstart_invoke_cmd = copy.copy(custom_invoke) +jumpstart_invoke_cmd.help = "Invoke a jumpstart model endpoint." +invoke.add_command(jumpstart_invoke_cmd, name="hyp-jumpstart-endpoint") cli.add_command(list_cluster) cli.add_command(set_cluster_context) @@ -273,6 +294,9 @@ def exec(): cli.add_command(list_accelerator_partition_type) exec.add_command(pytorch_exec) +recipe_exec_cmd = copy.copy(pytorch_exec) +recipe_exec_cmd.help = "Execute commands in pods associated with a HyperPod recipe job." +exec.add_command(recipe_exec_cmd, name="hyp-recipe-job") if __name__ == "__main__": cli() diff --git a/src/sagemaker/hyperpod/cli/inference_utils.py b/src/sagemaker/hyperpod/cli/inference_utils.py index eb38da16..29f3c6b3 100644 --- a/src/sagemaker/hyperpod/cli/inference_utils.py +++ b/src/sagemaker/hyperpod/cli/inference_utils.py @@ -96,6 +96,8 @@ def wrapped_func(*args, **kwargs): help=spec.get("description", ""), )(wrapped_func) + # Preserve the original function's docstring + wrapped_func.__doc__ = func.__doc__ return wrapped_func return decorator \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/init_utils.py b/src/sagemaker/hyperpod/cli/init_utils.py index f36837d7..6b502fe2 100644 --- a/src/sagemaker/hyperpod/cli/init_utils.py +++ b/src/sagemaker/hyperpod/cli/init_utils.py @@ -8,7 +8,7 @@ import yaml import sys from pathlib import Path -from sagemaker.hyperpod.cli.type_handler_utils import convert_cli_value, to_click_type, is_complex_type, DEFAULT_TYPE_HANDLER +from sagemaker.hyperpod.cli.type_handler_utils import convert_cli_value, to_click_type, is_complex_type, DEFAULT_TYPE_HANDLER, is_undefined_value from pydantic import ValidationError from typing import 
List, Any from sagemaker.hyperpod.cli.constants.init_constants import ( @@ -144,9 +144,15 @@ def _load_schema_for_version(version: str, schema_pkg: str) -> dict: return json.loads(raw) -def _get_handler_for_field(template_name, field_name): +def _get_handler_for_field(template_name, field_name, version=None): """Get appropriate handler for a field using template.field mapping.""" if template_name and field_name: + # Try version-scoped key first, then fall back to unversioned + if version: + scoped_key = f"{template_name}.{version}.{field_name}" + handler = SPECIAL_FIELD_HANDLERS.get(scoped_key) + if handler: + return handler scoped_key = f"{template_name}.{field_name}" handler = SPECIAL_FIELD_HANDLERS.get(scoped_key, DEFAULT_TYPE_HANDLER) return handler @@ -187,6 +193,7 @@ def generate_click_command() -> Callable: """ Decorator that: - injects -- for every property in the current template's schema (detected from config.yaml) + - supports both standard templates (Pydantic) and dynamic templates (.override_spec.json) - only works for configure command, returns minimal decorator for others """ @@ -204,7 +211,101 @@ def decorator(func: Callable) -> Callable: click.secho("❌ No config.yaml found. Run 'hyp init