diff --git a/README.md b/README.md index 8d9aab59..57cbf39a 100644 --- a/README.md +++ b/README.md @@ -429,6 +429,105 @@ hyp get-operator-logs hyp-pytorch-job --since-hours 0.5 hyp delete hyp-pytorch-job --job-name ``` +### Recipe Job + +Use `hyp-recipe-job` to submit fine-tuning and evaluation jobs using pre-built recipes from SageMaker JumpStart Hub — no YAML authoring required. + + +#### Initialize Recipe Job Configuration + +```bash +mkdir my-recipe-job && cd my-recipe-job + +# Option A: HuggingFace model ID +hyp init hyp-recipe-job . \ + --huggingface-model-id Qwen/Qwen3-0.6B \ + --technique SFT \ + --instance-type ml.g5.48xlarge + +# Option B: JumpStart model ID +hyp init hyp-recipe-job . \ + --model-id huggingface-reasoning-qwen3-06b \ + --technique SFT \ + --instance-type ml.g5.48xlarge +``` + +Supported job types: +- **Fine-tuning**: `SFT`, `DPO`, `CPT`, `PPO`, `RLAIF`, `RLVR` +- **Evaluation**: `deterministic`, `LLMAJ` + +> **Note**: If you omit `--instance-type`, the CLI will automatically query your HyperPod clusters and find clusters with instance types supported by the selected recipe and technique. You will be presented with a list of compatible clusters to choose from. 
+ +#### Configure Recipe Job Parameters + +```bash +hyp configure \ + --name my-recipe-job \ + --namespace default \ + --data-path /data/recipes-data/sft/train.jsonl \ + --global-batch-size 8 \ + --learning-rate 0.0001 \ + --max-epochs 1 \ + --output-path /data/output/my-model \ + --instance-type ml.g5.48xlarge +``` + +#### Validate Configuration + +```bash +hyp validate +``` + +#### Reset Configuration + +To reset `config.yaml` back to its default values: + +```bash +hyp reset +``` + +#### Submit Recipe Job + +```bash +hyp create +``` + +#### List Recipe Jobs + +```bash +hyp list hyp-recipe-job --namespace default +``` + +#### Describe a Recipe Job + +```bash +hyp describe hyp-recipe-job --job-name --namespace default +``` + +#### List Pods for a Recipe Job + +```bash +hyp list-pods hyp-recipe-job --job-name --namespace default +``` + +#### Get Logs from a Recipe Job Pod + +```bash +hyp get-logs hyp-recipe-job --job-name --pod-name --namespace default +``` + +#### Get Operator Logs + +```bash +hyp get-operator-logs hyp-recipe-job +``` + +#### Delete a Recipe Job + +```bash +hyp delete hyp-recipe-job --job-name --namespace default +``` + ### Inference ### Jumpstart Endpoint Creation diff --git a/doc/examples.md b/doc/examples.md index 18d4b392..319075b5 100644 --- a/doc/examples.md +++ b/doc/examples.md @@ -66,6 +66,13 @@ For detailed examples of training with HyperPod, see: **Training Examples** Refer the Training SDK Example. ::: +:::{grid-item-card} Recipe Job CLI Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/end_to_end_walkthrough/01-training-job-submission/02-recipe-job-cli.ipynb +:class-card: sd-border-primary + +**Recipe Job Example** Submit a fine-tuning job using `hyp-recipe-job` — pre-built recipes from SageMaker JumpStart Hub, no YAML required. 
+::: + :::: diff --git a/doc/getting_started/training.md b/doc/getting_started/training.md index 5e888c20..4bbdf42e 100644 --- a/doc/getting_started/training.md +++ b/doc/getting_started/training.md @@ -104,6 +104,107 @@ This will: - Initialize the job creation process +## Creating Training Jobs -- Recipe Job Init Experience + +The `hyp-recipe-job` experience lets you submit fine-tuning and evaluation jobs using pre-built recipes published to SageMaker JumpStart Hub. No YAML authoring required — the CLI fetches the Kubernetes job template and parameter spec automatically. + +### 1. Initialize a Recipe Job + +`````{tab-set} +````{tab-item} CLI (HuggingFace model ID) +```bash +mkdir my-recipe-job +cd my-recipe-job +hyp init hyp-recipe-job . \ + --huggingface-model-id Qwen/Qwen3-0.6B \ + --technique SFT \ + --instance-type ml.g5.48xlarge +``` +```` +````{tab-item} CLI (JumpStart model ID) +```bash +mkdir my-recipe-job +cd my-recipe-job +hyp init hyp-recipe-job . \ + --model-id huggingface-reasoning-qwen3-06b \ + --technique SFT \ + --instance-type ml.g5.48xlarge +``` +```` +````` + +Supported job types: +- **Fine-tuning**: `SFT`, `DPO`, `CPT`, `PPO`, `RLAIF`, `RLVR` +- **Evaluation**: `deterministic`, `LLMAJ` + +```{note} +If you omit `--instance-type`, the CLI will automatically query your HyperPod clusters and find clusters with instance types supported by the selected recipe and technique. You will be presented with a list of compatible clusters to choose from. Note that this interactive prompt requires a terminal and is not supported in Jupyter notebooks. +``` + +This creates three files in your job directory: +- `config.yaml` — your editable training parameters +- `.override_spec.json` — the parameter schema +- `k8s.jinja` — the Kubernetes job template + +### 3. 
Configure Recipe Job Parameters + +```bash +hyp configure \ + --name my-recipe-job \ + --namespace default \ + --data-path /data/recipes-data/sft/train.jsonl \ + --global-batch-size 8 \ + --learning-rate 0.0001 \ + --max-epochs 1 \ + --output-path /data/output/my-model \ + --instance-type ml.g5.48xlarge +``` + +### 4. Validate Configuration + +```bash +hyp validate +``` + +### 4a. Reset Configuration (Optional) + +To reset `config.yaml` back to its default values: + +```bash +hyp reset +``` + +### 5. Submit the Recipe Job + +```bash +hyp create +``` + +### 6. Manage Recipe Jobs + +```bash +# List jobs +hyp list hyp-recipe-job --namespace default + +# Describe a job +hyp describe hyp-recipe-job --job-name --namespace default + +# List pods +hyp list-pods hyp-recipe-job --job-name --namespace default + +# Get logs +hyp get-logs hyp-recipe-job --job-name --pod-name --namespace default + +# Get operator logs +hyp get-operator-logs hyp-recipe-job + +# Exec into pods +hyp exec hyp-recipe-job --job-name --namespace default --all-pods -- echo hello + +# Delete job +hyp delete hyp-recipe-job --job-name --namespace default +``` + ## Creating Training Jobs -- CLI/SDK You can create training jobs using either the CLI or SDK approach: @@ -295,5 +396,6 @@ For detailed examples of training with HyperPod, see: - CLI Training Init Experience Example - CLI Training Example - SDK Training Example +- Recipe Job CLI Example These examples demonstrate end-to-end workflows for creating and managing training jobs using both the CLI and SDK approaches. 
diff --git a/examples/end_to_end_walkthrough/01-training-job-submission/02-recipe-job-cli.ipynb b/examples/end_to_end_walkthrough/01-training-job-submission/02-recipe-job-cli.ipynb new file mode 100644 index 00000000..ac723f6d --- /dev/null +++ b/examples/end_to_end_walkthrough/01-training-job-submission/02-recipe-job-cli.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1b2c3d4", + "metadata": {}, + "source": [ + "# Submitting a Recipe Fine-Tuning Job - HyperPod CLI End-to-End Walkthrough\n", + "\n", + "This example shows how to fine-tune a model using the **HyperPod CLI recipe job** experience (`hyp-recipe-job`). Recipes are pre-built fine-tuning configurations published to SageMaker JumpStart Hub — they include a Kubernetes job template and parameter spec, so you don't need to write any YAML.\n", + "\n", + "The workflow is:\n", + "1. **`hyp init`** — fetch the recipe from SageMaker Hub and scaffold your job directory\n", + "2. **`hyp configure`** — set your training parameters\n", + "3. **`hyp validate`** — verify the configuration is complete and valid\n", + "4. **`hyp create`** — render and submit the Kubernetes job\n", + "\n", + "This example assumes you have completed the **Setup instructions** in [00-getting-started/00-setup.md](../00-getting-started/00-setup.md) and have a HyperPod EKS cluster with your kubeconfig configured." 
+ ] + }, + { + "cell_type": "markdown", + "id": "b2c3d4e5", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "- HyperPod EKS cluster with kubeconfig configured\n", + "- `sagemaker-hyperpod` CLI installed (`pip install sagemaker-hyperpod`)\n", + "- FSx for Lustre volume mounted at `/data` with training data available\n", + "- AWS credentials with `sagemaker:DescribeHubContent`, `sagemaker:ListHubContents`, and `s3:GetObject` permissions" + ] + }, + { + "cell_type": "markdown", + "id": "set_cluster_md", + "metadata": {}, + "source": [ + "## Set Cluster Context\n", + "\n", + "Configure to point at your HyperPod EKS cluster before running any other commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "set_cluster_code", + "metadata": {}, + "outputs": [], + "source": [ + "CLUSTER_NAME = \"\" # Replace with your HyperPod cluster name\n", + "!hyp set-cluster-context --cluster-name {CLUSTER_NAME}" + ] + }, + { + "cell_type": "markdown", + "id": "c3d4e5f6", + "metadata": {}, + "source": [ + "## Step 0: Configuration\n", + "\n", + "Set your job name and working directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e5f6a7", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "JOB_NAME = \"qwen3-sft-recipe-job\" # Change to a unique name\n", + "JOB_DIR = f\"./{JOB_NAME}\" # Local directory for job files\n", + "NAMESPACE = \"default\"\n", + "\n", + "os.makedirs(JOB_DIR, exist_ok=True)\n", + "print(f\"Job directory: {os.path.abspath(JOB_DIR)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e5f6a7b8", + "metadata": {}, + "source": [ + "## Step 1: Initialize the Recipe Job\n", + "\n", + "The `hyp init hyp-recipe-job` command fetches the recipe from SageMaker JumpStart Hub and creates three files in your job directory:\n", + "- `config.yaml` — your editable training parameters\n", + "- `.override_spec.json` — the parameter schema (used by `configure` and `validate`)\n", + "- `k8s.jinja` — the Kubernetes job template\n", + "\n", + "### Specifying your model\n", + "\n", + "You can identify the model using either a **JumpStart model ID** or a **HuggingFace model ID**:\n", + "\n", + "```bash\n", + "# Option A: JumpStart model ID\n", + "hyp init hyp-recipe-job --model-id huggingface-reasoning-qwen3-06b --technique SFT --instance-type ml.g5.48xlarge\n", + "\n", + "# Option B: HuggingFace model ID (resolved automatically via JumpStart Hub search)\n", + "hyp init hyp-recipe-job --huggingface-model-id Qwen/Qwen3-0.6B --technique SFT --instance-type ml.g5.48xlarge\n", + "```\n", + "\n", + "### Instance type selection\n", + "\n", + "If you omit `--instance-type`, the CLI will **automatically query your HyperPod clusters** and find clusters with instance types supported by the recipe and technique you selected. 
You will be presented with a list of compatible clusters to choose from.\n", + "\n", + "Supported job types:\n", + "- **Fine-tuning**: `SFT`, `DPO`, `CPT`, `PPO`, `RLAIF`, `RLVR`\n", + "- **Evaluation**: `deterministic`, `LLMAJ`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "435a652c", + "metadata": {}, + "outputs": [], + "source": [ + "# Option A: JumpStart model ID\n", + "!hyp init hyp-recipe-job {JOB_DIR} \\\n", + " --model-id huggingface-reasoning-qwen3-06b \\\n", + " --technique SFT \\\n", + " --instance-type ml.g5.48xlarge" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6a7b8c9", + "metadata": {}, + "outputs": [], + "source": [ + "# Option B: HuggingFace model ID (uncomment to use)\n", + "# !hyp init hyp-recipe-job {JOB_DIR} \\\n", + "# --huggingface-model-id Qwen/Qwen3-0.6B \\\n", + "# --technique SFT \\\n", + "# --instance-type ml.g5.48xlarge" + ] + }, + { + "cell_type": "markdown", + "id": "b084517f", + "metadata": {}, + "source": [ + "> **Note — Option C requires a terminal.** Omitting `--instance-type` triggers an interactive cluster selection prompt, which is not supported in Jupyter notebooks. Run this command in a terminal instead:\n", + ">\n", + "> ```bash\n", + "> hyp init hyp-recipe-job ./ \\\n", + "> --huggingface-model-id Qwen/Qwen3-0.6B \\\n", + "> --technique SFT\n", + "> ```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7b8c9d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify files were created\n", + "!ls -la {JOB_DIR}" + ] + }, + { + "cell_type": "markdown", + "id": "b8c9d0e1", + "metadata": {}, + "source": [ + "## Step 2: Configure Training Parameters\n", + "\n", + "Use `hyp configure` to set your training parameters. Run this from inside the job directory.\n", + "\n", + "You can see all available parameters with `hyp configure --help` (run from inside the job directory)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9d0e1f2", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$JOB_NAME\" \"$NAMESPACE\"\n", + "cd $1\n", + "hyp configure \\\n", + " --name $1 \\\n", + " --namespace $2 \\\n", + " --data-path /data/recipes-data/sft/zc_train_256.jsonl \\\n", + " --global-batch-size 8 \\\n", + " --learning-rate 0.0001 \\\n", + " --lr-warmup-ratio 0.1 \\\n", + " --max-epochs 5 \\\n", + " --output-path /data/output/qwen3-sft \\\n", + " --results-directory /data/results/qwen3-sft \\\n", + " --resume-from-path /data/output/qwen3-sft \\\n", + " --training-data-name zc_train_256 \\\n", + " --validation-data-name zc_train_256 \\\n", + " --validation-data-path /data/recipes-data/sft/zc_train_256.jsonl \\\n", + " --train-val-split-ratio 0.9 \\\n", + " --instance-type ml.g5.48xlarge" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0e1f2a3", + "metadata": {}, + "outputs": [], + "source": [ + "# Review the generated config\n", + "!cat {JOB_DIR}/config.yaml" + ] + }, + { + "cell_type": "markdown", + "id": "e1f2a3b4", + "metadata": {}, + "source": [ + "## Step 3: Validate Configuration\n", + "\n", + "Validate that all required fields are set and values are within allowed ranges." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2a3b4c5", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$JOB_NAME\"\n", + "cd $1\n", + "hyp validate" + ] + }, + { + "cell_type": "markdown", + "id": "a3b4c5d6", + "metadata": {}, + "source": [ + "## Step 4: Submit the Job\n", + "\n", + "`hyp create` renders the Kubernetes YAML from `k8s.jinja` + `config.yaml` and submits it to your cluster. The rendered files are saved under `run//` for reference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4c5d6e7", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$JOB_NAME\"\n", + "cd $1\n", + "hyp create" + ] + }, + { + "cell_type": "markdown", + "id": "c5d6e7f8", + "metadata": {}, + "source": [ + "## Step 5: Monitor the Job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6e7f8a9", + "metadata": {}, + "outputs": [], + "source": [ + "# Check job status\n", + "!kubectl get hyperpodpytorchjob {JOB_NAME} -n {NAMESPACE}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7f8a9b0", + "metadata": {}, + "outputs": [], + "source": [ + "# List pods\n", + "!hyp list-pods hyp-recipe-job --job-name {JOB_NAME} --namespace {NAMESPACE}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8a9b0c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Stream logs from the first pod (replace pod name from list-pods output above)\n", + "# !kubectl logs -f -n {NAMESPACE}" + ] + }, + { + "cell_type": "markdown", + "id": "a9b0c1d2", + "metadata": {}, + "source": [ + "## Step 6: Clean Up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0c1d2e3", + "metadata": {}, + "outputs": [], + "source": [ + "!hyp delete hyp-recipe-job --job-name {JOB_NAME} --namespace {NAMESPACE}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/end_to_end_walkthrough/01-training-job-submission/README.MD b/examples/end_to_end_walkthrough/01-training-job-submission/README.MD index f23919b9..e099c613 100644 --- a/examples/end_to_end_walkthrough/01-training-job-submission/README.MD +++ b/examples/end_to_end_walkthrough/01-training-job-submission/README.MD @@ -2,4 +2,5 @@ This folder contains the following files: - 
[00-pytorch-training-job.md](00-pytorch-training-job.md) - Instructions on how to create and submit a **Qwen3 4B Lora** fine-tuning job to the HyperPod cluster through the HyperPod CLI. Additionally, an example for instance failure recovery. -- [01-pytorch-training-job-sdk.ipynb](01-pytorch-training-job-sdk.ipynb) - Instructions on how to to utilize the HyperPod Python SDK to create and submit the equivalent job to the HyperPod cluster. \ No newline at end of file +- [01-pytorch-training-job-sdk.ipynb](01-pytorch-training-job-sdk.ipynb) - Instructions on how to utilize the HyperPod Python SDK to create and submit the equivalent job to the HyperPod cluster. +- [02-recipe-job-cli.ipynb](02-recipe-job-cli.ipynb) - Instructions on how to submit a fine-tuning job using the HyperPod CLI recipe job experience (`hyp-recipe-job`). Recipes are pre-built fine-tuning configurations fetched from SageMaker JumpStart Hub — no YAML authoring required. \ No newline at end of file diff --git a/examples/end_to_end_walkthrough/README.md b/examples/end_to_end_walkthrough/README.md index 2d6f5d6e..7b1c618b 100644 --- a/examples/end_to_end_walkthrough/README.md +++ b/examples/end_to_end_walkthrough/README.md @@ -9,6 +9,7 @@ A recording of the full walkthrough as part of re:invent 2025 session 371 is ava - [**Training Job Submission**](./01-training-job-submission/) - [00-pytorch-training-job.md](./01-training-job-submission/00-pytorch-training-job.md) - Instructions on how to create and submit a Qwen3 4B Lora fine-tuning job to the HyperPod cluster through the HyperPod CLI. Additionally, an example for instance failure recovery. - [01-pytorch-training-job-sdk.ipynb](./01-training-job-submission/01-pytorch-training-job-sdk.ipynb) - Instructions on how to to utilize the HyperPod Python SDK to create and submit the equivalent job to the HyperPod cluster. 
+ - [02-recipe-job-cli.ipynb](./01-training-job-submission/02-recipe-job-cli.ipynb) - Instructions on how to submit a fine-tuning job using the HyperPod CLI recipe job experience (`hyp-recipe-job`). Recipes are pre-built fine-tuning configurations fetched from SageMaker JumpStart Hub — no YAML authoring required. - [**Inference Deployment**](./02-inference-deployment/) - [00-jumpstart-endpoint.md](./02-inference-deployment/00-jumpstart-endpoint.md) - Instructions on how to deploy models available on SageMaker JumpStart to the HyperPod cluster. - [01-custom-model-endpoint.md](./02-inference-deployment/01-custom-model-endpoint.md) - Instructions on how to deploy a custom model from an S3 bucket (TinyLlama) to the HyperPod cluster and how to utilize the autoscaling functionality. diff --git a/setup.py b/setup.py index 49097c3f..fcb88a8e 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ "awscli-cwlogs>=1.4.6", "boto3>=1.35.3,<2.0", "botocore>=1.35.6 ", - "kubernetes==33.1.0", + "kubernetes>=33.1.0", "kr8s>=0.20.0", "pyyaml==6.0.2", "ratelimit==2.2.1", @@ -76,6 +76,8 @@ "omegaconf==2.3", "pynvml==11.4.1", "requests==2.32.4", + "urllib3>=1.21.1,<3", + "charset-normalizer>=2,<4", "tqdm==4.66.5", "zstandard==0.15.2", # Test dependencies diff --git a/src/sagemaker/hyperpod/cli/commands/cluster.py b/src/sagemaker/hyperpod/cli/commands/cluster.py index 289a827a..a66d3a33 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster.py @@ -358,6 +358,12 @@ def rate_limited_operation( ) cluster_capacities.append(capacities) return cluster_capacities + except RuntimeError as e: + if "not found" in str(e): + logger.debug(f"Skipping cluster {cluster_name}: {e}") + else: + logger.error(f"Error processing cluster {cluster_name}: {e}, continue...") + return None except Exception as e: logger.error(f"Error processing cluster {cluster_name}: {e}, continue...") return None @@ -831,8 +837,11 @@ def _update_kube_config( try: # Execute 
the command to update kubeconfig - subprocess.run(command, check=True) + subprocess.run(command, check=True, capture_output=True) except subprocess.CalledProcessError as e: + stderr = e.stderr.decode() if e.stderr else "" + if "ResourceNotFoundException" in stderr or "No cluster found" in stderr: + raise RuntimeError(f"EKS cluster '{eks_name}' not found (may have been deleted)") raise RuntimeError(f"Failed to update kubeconfig: {e}") except (OSError, ValueError) as e: raise RuntimeError(f"Invalid command execution: {e}") diff --git a/src/sagemaker/hyperpod/cli/commands/inference.py b/src/sagemaker/hyperpod/cli/commands/inference.py index 49c14fa0..85a1fcd0 100644 --- a/src/sagemaker/hyperpod/cli/commands/inference.py +++ b/src/sagemaker/hyperpod/cli/commands/inference.py @@ -29,9 +29,7 @@ @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_js_endpoint_cli") @handle_cli_exceptions() def js_create(version, debug, js_endpoint): - """ - Create a jumpstart model endpoint. - """ + """Create a jumpstart model endpoint""" click.echo(f"Using version: {version}") js_endpoint.create(debug=debug) @@ -46,9 +44,7 @@ def js_create(version, debug, js_endpoint): @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_custom_endpoint_cli") @handle_cli_exceptions() def custom_create(version, debug, custom_endpoint): - """ - Create a custom model endpoint. - """ + """Create a custom model endpoint""" click.echo(f"Using version: {version}") custom_endpoint.create(debug=debug) @@ -134,7 +130,7 @@ def js_list( namespace: Optional[str], ): """ - List all Hyperpod Jumpstart model endpoints. + List all HyperPod Jumpstart model endpoints. """ endpoints = HPJumpStartEndpoint.model_construct().list(namespace) data = [ep.model_dump() for ep in endpoints] @@ -177,7 +173,7 @@ def custom_list( namespace: Optional[str], ): """ - List all Hyperpod custom model endpoints. + List all HyperPod custom model endpoints. 
""" endpoints = HPEndpoint.model_construct().list(namespace) data = [ep.model_dump() for ep in endpoints] @@ -236,7 +232,7 @@ def js_describe( full: bool ): """ - Describe a Hyperpod Jumpstart model endpoint. + Describe a HyperPod Jumpstart model endpoint. """ my_endpoint = HPJumpStartEndpoint.model_construct().get(name, namespace) data = my_endpoint.model_dump() @@ -385,7 +381,7 @@ def custom_describe( full: bool ): """ - Describe a Hyperpod custom model endpoint. + Describe a HyperPod custom model endpoint. """ my_endpoint = HPEndpoint.model_construct().get(name, namespace) data = my_endpoint.model_dump() diff --git a/src/sagemaker/hyperpod/cli/commands/init.py b/src/sagemaker/hyperpod/cli/commands/init.py index c3a54d16..8cfb15c1 100644 --- a/src/sagemaker/hyperpod/cli/commands/init.py +++ b/src/sagemaker/hyperpod/cli/commands/init.py @@ -1,10 +1,10 @@ import click import yaml import sys +import shutil from pathlib import Path from datetime import datetime from jinja2 import Template -import shutil from sagemaker.hyperpod.cli.constants.init_constants import ( USAGE_GUIDE_TEXT_CFN, USAGE_GUIDE_TEXT_CRD, @@ -24,51 +24,96 @@ build_config_from_schema, save_template, get_default_version_for_template, - create_from_k8s_yaml + create_from_k8s_yaml, + is_dynamic_template ) from sagemaker.hyperpod.common.utils import get_aws_default_region from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( _hyperpod_telemetry_emitter, ) from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.cli.commands.training_recipe import _init_training_job, _configure_dynamic_template, _create_dynamic_template +from sagemaker.hyperpod.cli.recipe_utils import _validate_dynamic_template, _generate_dynamic_config_yaml + @click.command("init") @click.argument("template", type=click.Choice(list(TEMPLATES.keys()))) @click.argument("directory", type=click.Path(file_okay=False), default=".") @click.option("--version", "-v", default=None, help="Schema 
version") +@click.option("--model-id", hidden=True, help="JumpStart model ID (e.g. meta-textgeneration-llama-3-2-1b)") +@click.option("--huggingface-model-id", hidden=True, help="HuggingFace model ID (e.g. meta-llama/Llama-2-7b)") +@click.option("--technique", hidden=True, help="Customization technique (for hyp-recipe-job only)") +@click.option("--instance-type", hidden=True, help="Instance type (optional - if not provided, interactive cluster selection will be used)") @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_template_cli") def init( template: str, directory: str, version: str, + model_id: str, + huggingface_model_id: str, + technique: str, + instance_type: str, ): """ Initialize a TEMPLATE scaffold in DIRECTORY. - + This command creates a complete project scaffold for the specified template type. It performs the following steps: - + + \b 1. Checks if the directory already contains a config.yaml and handles existing configurations 2. Creates the target directory if it doesn't exist 3. Generates a config.yaml file with schema-based default values 4. Creates a template file (.jinja) for the specified template type 5. Adds a README.md with usage instructions - + The generated files provide a starting point for configuring and submitting jobs to SageMaker HyperPod clusters orchestrated by Amazon EKS. 
+ + Available templates: + + \b + hyp-pytorch-job PyTorch distributed training job + hyp-jumpstart-endpoint JumpStart model inference endpoint + hyp-custom-endpoint Custom model inference endpoint + cluster-stack HyperPod EKS cluster CloudFormation stack + hyp-recipe-job Fine-tuning/evaluation job from JumpStart Hub recipe + + For hyp-recipe-job, the following options are available: + + \b + --model-id JumpStart model ID (required, or use --huggingface-model-id) + --huggingface-model-id HuggingFace model ID (required, or use --model-id) + --technique Fine-tuning: SFT, DPO, RLAIF, RLVR, CPT, PPO + Evaluation: deterministic, LLMAJ (required) + --instance-type Instance type to use. If not provided, an interactive + cluster selection will be launched (optional) """ + # Original template initialization logic dir_path = Path(directory).resolve() config_file = dir_path / "config.yaml" skip_readme = False + # Validate required params for hyp-recipe-job before any output + if template in ["hyp-recipe-job"]: + if not model_id and not huggingface_model_id: + click.secho(f"❌ --model-id or --huggingface-model-id is required for {template}", fg="red") + return + if model_id and huggingface_model_id: + click.secho("❌ Specify either --model-id or --huggingface-model-id, not both", fg="red") + return + if not technique: + click.secho(f"❌ --technique is required for {template} (e.g. 
SFT, DPO, deterministic, LLMAJ)", fg="red") + return + # 1) Inspect existing config.yaml try: if config_file.is_file(): try: - existing = yaml.safe_load(config_file.read_text()) or {} - existing_template = existing.get("template") + # Use load_config to properly read commented template + _, existing_template, _ = load_config(dir_path) except Exception as e: - click.echo("Could not parse existing config.yaml: %s", e) + click.secho(f"⚠️ Could not parse existing config.yaml: {e}", fg="yellow") existing_template = None if existing_template == template: @@ -90,7 +135,7 @@ def init( else: click.echo(f"Initializing new scaffold for '{template}'…") except Exception as e: - click.secho("💥 Initialization aborted due to error: %s", e, fg="red") + click.secho(f"💥 Initialization aborted due to error: {e}", fg="red") sys.exit(1) # 2) Ensure directory exists @@ -100,6 +145,14 @@ def init( click.secho(f"❌ Could not create directory {dir_path}: {e}", fg="red") sys.exit(1) + # Handle dynamic job templates + if template in ["hyp-recipe-job"]: + resolved_model_id = huggingface_model_id if huggingface_model_id else model_id + if _init_training_job(directory, template, resolved_model_id, technique, instance_type, is_huggingface=bool(huggingface_model_id)): + click.secho(f"✔️ {template.replace('-', ' ').title()} initialized successfully", fg="green") + click.secho("📄 Created: config.yaml, k8s.jinja", fg="green") + return + # 3) Build config dict + comment map, then write config.yaml try: # Determine version: use user-provided version or default to latest @@ -162,9 +215,19 @@ def reset(): # 1) Load and validate config data, template, version = load_config(dir_path) - # 2) Build config with default values from schema + # 2) Check if this is a dynamic template + if is_dynamic_template(template, dir_path): + # For dynamic templates, reset using the helper function + try: + _generate_dynamic_config_yaml(dir_path, template, version) + click.secho("✔️ config.yaml reset: all fields set to 
default values.", fg="green") + except Exception as e: + click.secho(f"💥 Could not reset config.yaml: {e}", fg="red") + sys.exit(1) + return + + # 3) Standard template reset logic full_cfg, comment_map = build_config_from_schema(template, version) - # 3) Overwrite config.yaml try: save_config_yaml( prefill=full_cfg, @@ -185,7 +248,7 @@ def reset(): @generate_click_command() @click.pass_context @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_configure_cli") -def configure(ctx, model_config): +def configure(ctx, option, value, model_config): """ Update any subset of fields in ./config.yaml by passing -- flags. @@ -201,16 +264,27 @@ def configure(ctx, model_config): # Update multiple fields at once hyp configure --stack-name my-stack --create-fsx-stack: False - + # Update complex fields with JSON object hyp configure --availability-zone-ids '["id1", "id2"]' - """ # 1) Load existing config without validation dir_path = Path(".").resolve() data, template, version = load_config(dir_path) - # 2) Determine which fields the user actually provided + # 2) Check if this is a dynamic template (recipe) + if is_dynamic_template(template, dir_path): + # Handle recipe configure logic + _configure_dynamic_template(ctx, option, value, dir_path) + return + + # 3) Handle standard template configure logic + _configure_standard_template(ctx, model_config, dir_path, data, template, version) + + +def _configure_standard_template(ctx, model_config, dir_path, data, template, version): + """Handle configure for standard templates""" + # Determine which fields the user actually provided # Use Click's parameter source tracking to identify command-line provided parameters user_input_fields = set() @@ -223,10 +297,10 @@ def configure(ctx, model_config): user_input_fields.add(param_name) if not user_input_fields: - click.secho("⚠️ No arguments provided to configure.", fg="yellow") - return + click.echo(ctx.get_help()) + ctx.exit(0) - # 3) Build merged config with user input + # Build 
merged config with user input full_cfg, comment_map = build_config_from_schema( template=template, version=version, @@ -235,7 +309,7 @@ def configure(ctx, model_config): user_provided_fields=user_input_fields ) - # 4) Validate the merged config, but only check user-provided fields + # Validate the merged config, but only check user-provided fields all_validation_errors = validate_config_against_model(full_cfg, template, version) user_input_errors = filter_validation_errors_for_user_input(all_validation_errors, user_input_fields) @@ -249,7 +323,7 @@ def configure(ctx, model_config): click.secho("❌ config.yaml was not updated due to invalid input.", fg="red") sys.exit(1) - # 5) Write out the updated config.yaml (only if user input is valid) + # Write out the updated config.yaml (only if user input is valid) try: save_config_yaml( prefill=full_cfg, @@ -268,7 +342,26 @@ def validate(): Validate this directory's config.yaml against the appropriate schema. """ dir_path = Path(".").resolve() - load_config_and_validate(dir_path) + + try: + # Load config to determine template type + data, template, version = load_config(dir_path) + + # Check if this is a dynamic template + if is_dynamic_template(template, dir_path): + # Validate dynamic template + _validate_dynamic_template(dir_path) + click.secho("✔️ Configuration validated successfully", fg="green") + else: + # Use standard validation + load_config_and_validate(dir_path) + click.secho("✔️ Configuration validated successfully", fg="green") + except (FileNotFoundError, ValueError) as e: + click.secho(f"❌ {e}", fg="red") + sys.exit(1) + except Exception as e: + click.secho(f"❌ Validation failed: {e}", fg="red") + sys.exit(1) @click.command(name="_default_create") @@ -310,6 +403,17 @@ def _default_create(region, template_version, debug): # 1) Load config to determine template type data, template, version = load_config_and_validate(dir_path) + # Check if this is a dynamic template (recipe) + if is_dynamic_template(template, 
dir_path): + _create_dynamic_template(dir_path, data) + return + + # Handle standard templates (existing logic) + _create_standard_template(dir_path, data, template, version, region, template_version, debug) + + +def _create_standard_template(dir_path: Path, data: dict, template: str, version: str, region: str, template_version: int, debug: bool = False): + """Handle create for standard templates""" # Check if region flag is used for non-cluster-stack templates if region and template != "cluster-stack": click.secho(f"❌ --region flag is only available for cluster-stack template, not for {template}.", fg="red") @@ -324,6 +428,7 @@ def _default_create(region, template_version, debug): jinja_file = dir_path / 'k8s.jinja' # 3) Ensure files exist + config_file = dir_path / 'config.yaml' if not config_file.is_file() or not jinja_file.is_file(): click.secho(f"❌ Missing config.yaml or {jinja_file.name}. Run `hyp init` first.", fg="red") sys.exit(1) @@ -387,7 +492,6 @@ def _default_create(region, template_version, debug): k8s_file = out_dir / 'k8s.yaml' create_from_k8s_yaml(str(k8s_file), debug=debug) - except Exception as e: click.secho(f"❌ Failed to submit the command: {e}", fg="red") sys.exit(1) \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index 5d3153ab..b59f874b 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -21,7 +21,7 @@ @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_pytorchjob_cli") @handle_cli_exceptions() def pytorch_create(version, debug, job): - """Create a PyTorch job.""" + """Create a PyTorch job""" click.echo(f"Using version: {version}") # Create job job.create(debug=debug) diff --git a/src/sagemaker/hyperpod/cli/commands/training_recipe.py b/src/sagemaker/hyperpod/cli/commands/training_recipe.py new file mode 100644 index 00000000..1c5f954f --- /dev/null +++ 
b/src/sagemaker/hyperpod/cli/commands/training_recipe.py @@ -0,0 +1,321 @@ +from concurrent.futures import ThreadPoolExecutor +import threading +from datetime import datetime +from kubernetes import client, config +from kubernetes.client.rest import ApiException +import yaml +import json +import sys +import click +from pathlib import Path +from sagemaker.hyperpod.cli.init_utils import load_dynamic_schema +from sagemaker.hyperpod.common.utils import handle_exception +from sagemaker.hyperpod.cli.type_handler_utils import is_undefined_value +from sagemaker.hyperpod.cli.recipe_utils import ( + _fetch_recipe_from_hub, _download_s3_content, _download_s3_json, + _validate_and_convert_value, _collect_all_parameters_interactively, + _submit_k8s_resources, _render_k8s_template, _get_sagemaker_client, + _get_s3_client, _get_k8s_custom_client, _validate_dynamic_template, + _generate_dynamic_config_yaml, _update_config_field +) +import shutil +from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.common.telemetry.telemetry_logging import _hyperpod_telemetry_emitter +from sagemaker.hyperpod.common.cli_decorators import handle_cli_exceptions + + +def _interactive_cluster_selection(sagemaker_client, model_id: str, job_type: str, technique: str = None, is_huggingface: bool = False): + """Interactive cluster and instance type selection.""" + try: + matching_recipe = _fetch_recipe_from_hub(sagemaker_client, model_id, job_type, technique, None, is_huggingface=is_huggingface) + supported_instance_types = set(matching_recipe.get('SupportedInstanceTypes', [])) + + if not supported_instance_types: + click.secho("❌ No supported instance types found in recipe", fg="red") + return None, None + + click.secho("🔍 Fetching available clusters...", fg="blue") + + try: + from sagemaker.hyperpod.cli.commands.cluster import _get_hyperpod_clusters + from sagemaker.hyperpod.common.utils import get_current_cluster + + cluster_names = 
_get_hyperpod_clusters(sagemaker_client) + if not cluster_names: + click.secho("❌ No HyperPod clusters found", fg="red") + return None, None + + # Build {cluster_name: [(instance_type, node_count), ...]} for compatible types only + clusters_map: dict = {} + lock = threading.Lock() + + def _fetch_cluster(cluster_name: str): + try: + cluster_response = sagemaker_client.describe_cluster(ClusterName=cluster_name) + compatible = [ + (g.get('InstanceType'), g.get('CurrentCount', 0)) + for g in cluster_response.get('InstanceGroups', []) + if g.get('InstanceType') in supported_instance_types + ] + if compatible: + with lock: + clusters_map[cluster_name] = compatible + except Exception as e: + click.secho(f"⚠️ Warning: Could not get details for cluster {cluster_name}: {e}", fg="yellow") + + with ThreadPoolExecutor(max_workers=min(len(cluster_names), 10)) as executor: + list(executor.map(_fetch_cluster, cluster_names)) + + if not clusters_map: + click.secho("❌ No cluster details could be retrieved", fg="red") + return None, None + + except Exception as e: + click.secho(f"❌ Error fetching clusters: {e}", fg="red") + return None, None + + if not clusters_map: + click.secho( + f"❌ No compatible clusters found. The '{technique or job_type}' recipe for " + f"'{model_id}' requires one of: {sorted(supported_instance_types)}", + fg="red", + ) + click.secho( + " To skip cluster auto-detection, specify the instance type directly: --instance-type ", + fg="yellow", + ) + return None, None + + # Detect current cluster context + current_cluster = None + try: + current_cluster = get_current_cluster() + except Exception: + pass + + def _prompt_instance_type(cluster_name: str) -> str | None: + instance_types = clusters_map[cluster_name] + click.secho(f"\n📋 Compatible instance types for {cluster_name}:", fg="green") + for i, (itype, nodes) in enumerate(instance_types, 1): + click.secho(f" {i}. 
{itype:<22} ({nodes} nodes)", fg="white") + while True: + try: + choice = click.prompt(f"Select an instance type (1-{len(instance_types)})", type=int) + if 1 <= choice <= len(instance_types): + return instance_types[choice - 1][0] + click.secho(f"❌ Please enter a number between 1 and {len(instance_types)}", fg="red") + except (ValueError, click.Abort): + click.secho("❌ Operation cancelled", fg="red") + return None + + # If current context cluster is compatible, offer it as default + if current_cluster and current_cluster in clusters_map: + click.secho(f"\nCurrent cluster context: {current_cluster}", fg="cyan") + instance_type = _prompt_instance_type(current_cluster) + if instance_type is None: + return None, None + # Ask if they want to use a different cluster + try: + use_different = click.confirm("Use a different cluster?", default=False) + except click.Abort: + click.secho("❌ Operation cancelled", fg="red") + return None, None + if not use_different: + click.secho(f"✔️ Selected: {current_cluster} ({instance_type})", fg="green") + return current_cluster, instance_type + + # Full cluster selection + cluster_list = list(clusters_map.keys()) + click.secho(f"\n📋 Compatible clusters ({len(cluster_list)} found):", fg="green") + click.secho("-" * 80, fg="blue") + for i, name in enumerate(cluster_list, 1): + types_summary = ", ".join(f"{t} ({n} nodes)" for t, n in clusters_map[name]) + click.secho(f"{i}. 
{name:<40} {types_summary}", fg="cyan") + + while True: + try: + choice = click.prompt(f"\nSelect a cluster (1-{len(cluster_list)})", type=int) + if 1 <= choice <= len(cluster_list): + selected_cluster = cluster_list[choice - 1] + break + click.secho(f"❌ Please enter a number between 1 and {len(cluster_list)}", fg="red") + except (ValueError, click.Abort): + click.secho("❌ Operation cancelled", fg="red") + return None, None + + instance_type = _prompt_instance_type(selected_cluster) + if instance_type is None: + return None, None + + click.secho(f"✔️ Selected: {selected_cluster} ({instance_type})", fg="green") + return selected_cluster, instance_type + + except ValueError as e: + click.secho(f"❌ {e}", fg="red") + return None, None + except Exception as e: + click.secho(f"❌ Error during cluster selection: {e}", fg="red") + return None, None + + +def _init_training_job(directory: str, job_type: str, model_id: str, technique: str, instance_type: str = None, is_huggingface: bool = False) -> bool: + """Initialize training job configuration.""" + try: + sagemaker_client = _get_sagemaker_client() + s3_client = _get_s3_client() + + # If instance_type not provided, use interactive selection + cluster_name = None + if not instance_type: + cluster_name, instance_type = _interactive_cluster_selection( + sagemaker_client, model_id, job_type, technique, is_huggingface=is_huggingface + ) + if not instance_type: + return False + + # Update kubeconfig to point at the selected cluster + if cluster_name: + from sagemaker.hyperpod.cli.commands.cluster import set_cluster_context + click.secho(f"🔧 Connecting to cluster: {cluster_name}", fg="blue") + set_cluster_context.main(["--cluster-name", cluster_name], standalone_mode=False) + + # Fetch and validate recipe + matching_recipe = _fetch_recipe_from_hub(sagemaker_client, model_id, job_type, technique, instance_type, is_huggingface=is_huggingface) + + override_params_uri = matching_recipe.get('HpEksOverrideParamsS3Uri') + 
k8s_template_uri = matching_recipe.get('HpEksPayloadTemplateS3Uri') + + if not override_params_uri or not k8s_template_uri: + click.secho("❌ Missing S3 URIs in recipe", fg="red") + return False + + # Create directory + dir_path = Path(directory).resolve() + dir_path.mkdir(parents=True, exist_ok=True) + + # Download and save override params + override_data = _download_s3_json(s3_client, override_params_uri) + with open(dir_path / '.override_spec.json', 'w') as f: + json.dump(override_data, f, indent=2) + + # Create config.yaml + _generate_dynamic_config_yaml(dir_path, job_type, model_name=model_id, technique=technique, instance_type=instance_type) + + # Download and save k8s template + k8s_content = _download_s3_content(s3_client, k8s_template_uri) + with open(dir_path / 'k8s.jinja', 'w') as f: + f.write(k8s_content) + + return True + + except Exception as e: + click.secho(f"❌ Error: {e}", fg="red") + return False + +def _configure_dynamic_template(ctx, option, value, dir_path): + """Handle configure for dynamic templates (recipe)""" + config_path = dir_path / "config.yaml" + spec_path = dir_path / ".override_spec.json" + + if not spec_path.exists(): + click.secho(f"❌ .override_spec.json not found", fg="red") + ctx.exit(1) + + # Load spec + spec = load_dynamic_schema(dir_path) + + # Check if user provided --option flags (only those explicitly provided, not defaults) + provided_options = {} + for param_name, param_value in ctx.params.items(): + if param_name not in ['option', 'value', 'model_config']: + # Check if this parameter was actually provided by the user (not a default) + param_source = ctx.get_parameter_source(param_name) + if param_source and param_source.name == 'COMMANDLINE' and param_value is not None: + # Convert back to original key format + original_key = param_name.replace('-', '_') + if original_key in spec: + provided_options[original_key] = param_value + + # If --option flags were used, process them + if provided_options: + for key, value in 
provided_options.items(): + _update_config_field(config_path, spec, key, value) + click.secho("✔️ config.yaml updated successfully.", fg="green") + return + + # If no arguments, show help + click.echo(ctx.get_help()) + ctx.exit(0) + + +def _warn_if_instance_type_unavailable(instance_type: str) -> None: + """Warn if the requested instance type has no ready nodes in the current cluster.""" + try: + config.load_kube_config() + v1 = client.CoreV1Api() + nodes = v1.list_node().items + available = { + n.metadata.labels.get("node.kubernetes.io/instance-type") + for n in nodes + if n.metadata.labels + } + available.discard(None) + if instance_type and instance_type not in available: + click.secho( + f"⚠️ Instance type '{instance_type}' not found in the current cluster.\n" + f" Available: {', '.join(sorted(available)) or 'none'}\n" + f" The job will be submitted but pods may remain Pending.", + fg="yellow" + ) + except Exception as e: + click.secho(f"⚠️ Could not verify instance type availability: {e}", fg="yellow") + + +def _create_dynamic_template(dir_path: Path, config_data: dict): + """Handle create for dynamic templates (recipe)""" + try: + # Validate config first + _validate_dynamic_template(dir_path) + click.secho("✔️ Configuration validated successfully", fg="green") + + # Warn if instance type isn't available in the cluster + _warn_if_instance_type_unavailable(config_data.get('instance_type')) + + k8s_template_file = dir_path / 'k8s.jinja' + if not k8s_template_file.exists(): + raise FileNotFoundError("k8s.jinja template not found") + + # Read and render template + template_content = k8s_template_file.read_text() + rendered = _render_k8s_template(template_content, config_data) + + # Create run directory + run_root = dir_path / 'run' + run_root.mkdir(exist_ok=True) + timestamp = datetime.now().strftime('%Y%m%dT%H%M%S') + out_dir = run_root / timestamp + out_dir.mkdir() + + # Save files + shutil.copy(dir_path / 'config.yaml', out_dir / 'config.yaml') + (out_dir / 
'k8s.yaml').write_text(rendered) + + relative_out_dir = Path("run") / timestamp + click.secho(f"✔️ Files written to {relative_out_dir}", fg="green") + + # Submit to Kubernetes + custom_api = _get_k8s_custom_client() + _submit_k8s_resources(custom_api, rendered) + + click.secho("✔️ Successfully submitted to HyperPod", fg="green") + + except (FileNotFoundError, ValueError) as e: + click.secho(f"❌ {e}", fg="red") + sys.exit(1) + except Exception as e: + try: + resource_name = config_data.get('name', 'unknown') + handle_exception(e, resource_name, 'default') + except Exception as handled_e: + click.secho(f"❌ {handled_e}", fg="red") + sys.exit(1) diff --git a/src/sagemaker/hyperpod/cli/constants/init_constants.py b/src/sagemaker/hyperpod/cli/constants/init_constants.py index 3168484d..4a1ba4df 100644 --- a/src/sagemaker/hyperpod/cli/constants/init_constants.py +++ b/src/sagemaker/hyperpod/cli/constants/init_constants.py @@ -38,6 +38,13 @@ "schema_pkg": "hyperpod_cluster_stack_template", "schema_type": CFN, 'type': "jinja" + }, + "hyp-recipe-job": { + "registry": {}, + "template_registry": {}, + "schema_pkg": None, + "schema_type": CRD, + 'type': "dynamic" } } diff --git a/src/sagemaker/hyperpod/cli/hyp_cli.py b/src/sagemaker/hyperpod/cli/hyp_cli.py index a33aee29..4b2107c0 100644 --- a/src/sagemaker/hyperpod/cli/hyp_cli.py +++ b/src/sagemaker/hyperpod/cli/hyp_cli.py @@ -1,11 +1,7 @@ import click -import yaml -import json -import os -import subprocess -from pydantic import BaseModel, ValidationError, Field -from typing import Optional, Union +from typing import Union from importlib.metadata import version, PackageNotFoundError +import copy from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \ get_monitoring, describe_cluster @@ -62,8 +58,8 @@ from sagemaker.hyperpod.cli.commands.init import ( init, reset, - configure, validate, + configure, _default_create ) @@ -74,6 +70,7 @@ def get_package_version(package_name): 
except PackageNotFoundError: return "Not installed" + def print_version(ctx, param, value): if not value or ctx.resilient_parsing: return @@ -91,7 +88,8 @@ def print_version(ctx, param, value): @click.group(context_settings={'max_content_width': 200}) -@click.option('--version', is_flag=True, callback=print_version, expose_value=False, is_eager=True, help='Show version information') +@click.option('--version', is_flag=True, callback=print_version, expose_value=False, is_eager=True, + help='Show version information') def cli(): pass @@ -142,11 +140,13 @@ def describe(): """Describe endpoints, pytorch jobs or cluster stacks, spaces or space template.""" pass + @cli.group(cls=CLICommand) def update(): """Update an existing HyperPod cluster configuration, space, or space template.""" pass + @cli.group(cls=CLICommand) def delete(): """Delete endpoints, pytorch jobs, space, space access or space template.""" @@ -173,7 +173,7 @@ def portforward(): @cli.group(cls=CLICommand) def list_pods(): - """List pods for endpoints or pytorch jobs.""" + """List pods for endpoints, pytorch jobs, or recipe jobs.""" pass @@ -197,7 +197,7 @@ def get_operator_logs(): @cli.group(cls=CLICommand) def exec(): - """Execute commands in pods for endpoints or pytorch jobs.""" + """Execute commands in pods for endpoints, pytorch jobs, or recipe jobs.""" pass @@ -207,6 +207,7 @@ def exec(): cli.add_command(validate) create.add_command(pytorch_create) +# create.add_command(create_recipe_job_interactive) create.add_command(js_create) create.add_command(custom_create) @@ -217,6 +218,9 @@ def exec(): create.add_command(space_access_create) list.add_command(list_jobs) +recipe_list_cmd = copy.copy(list_jobs) +recipe_list_cmd.help = "List all HyperPod recipe jobs" +list.add_command(recipe_list_cmd, name="hyp-recipe-job") list.add_command(js_list) list.add_command(custom_list) list.add_command(list_cluster_stacks) @@ -224,6 +228,9 @@ def exec(): list.add_command(space_template_list) 
describe.add_command(pytorch_describe) +recipe_describe_cmd = copy.copy(pytorch_describe) +recipe_describe_cmd.help = "Describe a HyperPod recipe job." +describe.add_command(recipe_describe_cmd, name="hyp-recipe-job") describe.add_command(js_describe) describe.add_command(custom_describe) describe.add_command(describe_cluster_stack) @@ -237,6 +244,9 @@ def exec(): update.add_command(space_template_update) delete.add_command(pytorch_delete) +recipe_delete_cmd = copy.copy(pytorch_delete) +recipe_delete_cmd.help = "Delete a HyperPod recipe job." +delete.add_command(recipe_delete_cmd, name="hyp-recipe-job") delete.add_command(js_delete) delete.add_command(custom_delete) delete.add_command(delete_cluster_stack) @@ -248,10 +258,16 @@ def exec(): stop.add_command(space_stop) list_pods.add_command(pytorch_list_pods) +recipe_list_pods_cmd = copy.copy(pytorch_list_pods) +recipe_list_pods_cmd.help = "List all HyperPod PyTorch pods related to the recipe job." +list_pods.add_command(recipe_list_pods_cmd, name="hyp-recipe-job") list_pods.add_command(js_list_pods) list_pods.add_command(custom_list_pods) get_logs.add_command(pytorch_get_logs) +recipe_get_logs_cmd = copy.copy(pytorch_get_logs) +recipe_get_logs_cmd.help = "Get specific pod log for HyperPod recipe job." +get_logs.add_command(recipe_get_logs_cmd, name="hyp-recipe-job") get_logs.add_command(js_get_logs) get_logs.add_command(custom_get_logs) get_logs.add_command(space_get_logs) @@ -259,11 +275,16 @@ def exec(): portforward.add_command(space_portforward) get_operator_logs.add_command(pytorch_get_operator_logs) +recipe_get_operator_logs_cmd = copy.copy(pytorch_get_operator_logs) +recipe_get_operator_logs_cmd.help = "Get operator logs for HyperPod recipe jobs." 
+get_operator_logs.add_command(recipe_get_operator_logs_cmd, name="hyp-recipe-job") get_operator_logs.add_command(js_get_operator_logs) get_operator_logs.add_command(custom_get_operator_logs) invoke.add_command(custom_invoke) -invoke.add_command(custom_invoke, name="hyp-jumpstart-endpoint") +jumpstart_invoke_cmd = copy.copy(custom_invoke) +jumpstart_invoke_cmd.help = "Invoke a jumpstart model endpoint." +invoke.add_command(jumpstart_invoke_cmd, name="hyp-jumpstart-endpoint") cli.add_command(list_cluster) cli.add_command(set_cluster_context) @@ -273,6 +294,9 @@ def exec(): cli.add_command(list_accelerator_partition_type) exec.add_command(pytorch_exec) +recipe_exec_cmd = copy.copy(pytorch_exec) +recipe_exec_cmd.help = "Execute commands in pods associated with a HyperPod recipe job." +exec.add_command(recipe_exec_cmd, name="hyp-recipe-job") if __name__ == "__main__": cli() diff --git a/src/sagemaker/hyperpod/cli/inference_utils.py b/src/sagemaker/hyperpod/cli/inference_utils.py index eb38da16..29f3c6b3 100644 --- a/src/sagemaker/hyperpod/cli/inference_utils.py +++ b/src/sagemaker/hyperpod/cli/inference_utils.py @@ -96,6 +96,8 @@ def wrapped_func(*args, **kwargs): help=spec.get("description", ""), )(wrapped_func) + # Preserve the original function's docstring + wrapped_func.__doc__ = func.__doc__ return wrapped_func return decorator \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/init_utils.py b/src/sagemaker/hyperpod/cli/init_utils.py index f36837d7..6b502fe2 100644 --- a/src/sagemaker/hyperpod/cli/init_utils.py +++ b/src/sagemaker/hyperpod/cli/init_utils.py @@ -8,7 +8,7 @@ import yaml import sys from pathlib import Path -from sagemaker.hyperpod.cli.type_handler_utils import convert_cli_value, to_click_type, is_complex_type, DEFAULT_TYPE_HANDLER +from sagemaker.hyperpod.cli.type_handler_utils import convert_cli_value, to_click_type, is_complex_type, DEFAULT_TYPE_HANDLER, is_undefined_value from pydantic import ValidationError from typing import 
List, Any from sagemaker.hyperpod.cli.constants.init_constants import ( @@ -144,9 +144,15 @@ def _load_schema_for_version(version: str, schema_pkg: str) -> dict: return json.loads(raw) -def _get_handler_for_field(template_name, field_name): +def _get_handler_for_field(template_name, field_name, version=None): """Get appropriate handler for a field using template.field mapping.""" if template_name and field_name: + # Try version-scoped key first, then fall back to unversioned + if version: + scoped_key = f"{template_name}.{version}.{field_name}" + handler = SPECIAL_FIELD_HANDLERS.get(scoped_key) + if handler: + return handler scoped_key = f"{template_name}.{field_name}" handler = SPECIAL_FIELD_HANDLERS.get(scoped_key, DEFAULT_TYPE_HANDLER) return handler @@ -187,6 +193,7 @@ def generate_click_command() -> Callable: """ Decorator that: - injects -- for every property in the current template's schema (detected from config.yaml) + - supports both standard templates (Pydantic) and dynamic templates (.override_spec.json) - only works for configure command, returns minimal decorator for others """ @@ -204,7 +211,101 @@ def decorator(func: Callable) -> Callable: click.secho("❌ No config.yaml found. Run 'hyp init