Skip to content

Commit 019ca54

Browse files
authored
Merge pull request #104 from VectorInstitute/feature/add-slurm-account
* Add account option for model config to support deadline QoS * Removed the LaunchOptionsDict class; it was only used by the CLI launch function and still needed a type-check ignore, so instead of this extra middle step we just dump the CLI args into LaunchOptions and ignore the type check * Add short/long name mapping of vLLM engine args used in model config
2 parents ef628a3 + 906e9fd commit 019ca54

File tree

8 files changed

+40
-51
lines changed

8 files changed

+40
-51
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
----------------------------------------------------
44

55
[![PyPI](https://img.shields.io/pypi/v/vec-inf)](https://pypi.org/project/vec-inf)
6-
[![downloads](https://img.shields.io/pypi/dm/vec-inf)]
6+
[![downloads](https://img.shields.io/pypi/dm/vec-inf)](https://pypistats.org/packages/vec-inf)
77
[![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
88
[![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
99
[![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)

vec_inf/cli/_cli.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
MetricsResponseFormatter,
3232
StatusResponseFormatter,
3333
)
34-
from vec_inf.client import LaunchOptions, LaunchOptionsDict, VecInfClient
34+
from vec_inf.client import LaunchOptions, VecInfClient
3535

3636

3737
CONSOLE = Console()
@@ -62,6 +62,11 @@ def cli() -> None:
6262
type=int,
6363
help="Number of GPUs/node to use, default to suggested resource allocation for model",
6464
)
65+
@click.option(
66+
"--account",
67+
type=str,
68+
help="Charge resources used by this job to specified account.",
69+
)
6570
@click.option(
6671
"--qos",
6772
type=str,
@@ -141,18 +146,18 @@ def launch(
141146
"""
142147
try:
143148
# Convert cli_kwargs to LaunchOptions
144-
kwargs = {k: v for k, v in cli_kwargs.items() if k != "json_mode"}
145-
# Cast the dictionary to LaunchOptionsDict
146-
options_dict: LaunchOptionsDict = kwargs # type: ignore
147-
launch_options = LaunchOptions(**options_dict)
149+
json_mode = cli_kwargs["json_mode"]
150+
del cli_kwargs["json_mode"]
151+
152+
launch_options = LaunchOptions(**cli_kwargs) # type: ignore
148153

149154
# Start the client and launch model inference server
150155
client = VecInfClient()
151156
launch_response = client.launch_model(model_name, launch_options)
152157

153158
# Display launch information
154159
launch_formatter = LaunchResponseFormatter(model_name, launch_response.config)
155-
if cli_kwargs.get("json_mode"):
160+
if json_mode:
156161
click.echo(launch_response.config)
157162
else:
158163
launch_info_table = launch_formatter.format_table_output()

vec_inf/client/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from vec_inf.client.config import ModelConfig
1010
from vec_inf.client.models import (
1111
LaunchOptions,
12-
LaunchOptionsDict,
1312
LaunchResponse,
1413
MetricsResponse,
1514
ModelInfo,
@@ -28,6 +27,5 @@
2827
"ModelStatus",
2928
"ModelType",
3029
"LaunchOptions",
31-
"LaunchOptionsDict",
3230
"ModelConfig",
3331
]

vec_inf/client/_client_vars.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
SLURM_JOB_CONFIG_ARGS = {
5757
"job-name": "model_name",
5858
"partition": "partition",
59+
"account": "account",
5960
"qos": "qos",
6061
"time": "time",
6162
"nodes": "num_nodes",
@@ -66,6 +67,13 @@
6667
"error": "err_file",
6768
}
6869

70+
# vLLM engine args mapping between short and long names
71+
VLLM_SHORT_TO_LONG_MAP = {
72+
"-tp": "--tensor-parallel-size",
73+
"-pp": "--pipeline-parallel-size",
74+
"-O": "--compilation-config",
75+
}
76+
6977

7078
# Slurm script templates
7179
class ShebangConfig(TypedDict):

vec_inf/client/_helper.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
KEY_METRICS,
2020
REQUIRED_FIELDS,
2121
SRC_DIR,
22+
VLLM_SHORT_TO_LONG_MAP,
2223
)
2324
from vec_inf.client._exceptions import (
2425
MissingRequiredFieldsError,
@@ -156,9 +157,14 @@ def _process_vllm_args(self, arg_string: str) -> dict[str, Any]:
156157
for arg in arg_string.split(","):
157158
if "=" in arg:
158159
key, value = arg.split("=")
159-
vllm_args[key] = value
160+
if key.strip() in VLLM_SHORT_TO_LONG_MAP:
161+
key = VLLM_SHORT_TO_LONG_MAP[key.strip()]
162+
vllm_args[key.strip()] = value.strip()
163+
elif "-O" in arg.strip():
164+
key = VLLM_SHORT_TO_LONG_MAP["-O"]
165+
vllm_args[key] = arg.strip()[2:].strip()
160166
else:
161-
vllm_args[arg] = True
167+
vllm_args[arg.strip()] = True
162168
return vllm_args
163169

164170
def _get_launch_params(self) -> dict[str, Any]:
@@ -175,7 +181,7 @@ def _get_launch_params(self) -> dict[str, Any]:
175181
If required fields are missing or tensor parallel size is not specified
176182
when using multiple GPUs
177183
"""
178-
params = self.model_config.model_dump()
184+
params = self.model_config.model_dump(exclude_none=True)
179185

180186
# Override config defaults with CLI arguments
181187
if self.kwargs.get("vllm_args"):

vec_inf/client/_slurm_script_generator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ def _generate_shebang(self) -> str:
6868
"""
6969
shebang = [SLURM_SCRIPT_TEMPLATE["shebang"]["base"]]
7070
for arg, value in SLURM_JOB_CONFIG_ARGS.items():
71-
shebang.append(f"#SBATCH --{arg}={self.params[value]}")
71+
if self.params.get(value):
72+
shebang.append(f"#SBATCH --{arg}={self.params[value]}")
7273
if self.is_multinode:
7374
shebang += SLURM_SCRIPT_TEMPLATE["shebang"]["multinode"]
7475
return "\n".join(shebang)

vec_inf/client/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ class ModelConfig(BaseModel):
4747
Memory allocation per node in GB format (e.g., '32G')
4848
vocab_size : int
4949
Size of the model's vocabulary (1-1,000,000)
50+
account : Optional[str], optional
51+
Charge resources used by this job to specified account.
5052
qos : Union[QOS, str], optional
5153
Quality of Service tier for job scheduling
5254
time : str, optional
@@ -92,6 +94,9 @@ class ModelConfig(BaseModel):
9294
description="Memory per node",
9395
)
9496
vocab_size: int = Field(..., gt=0, le=1_000_000)
97+
account: Optional[str] = Field(
98+
default=None, description="Account name for job scheduling"
99+
)
95100
qos: Union[QOS, str] = Field(
96101
default=cast(str, DEFAULT_ARGS["qos"]), description="Quality of Service tier"
97102
)

vec_inf/client/models.py

Lines changed: 4 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
from dataclasses import dataclass, field
2727
from enum import Enum
28-
from typing import Any, Optional, TypedDict, Union
28+
from typing import Any, Optional, Union
2929

3030

3131
class ModelStatus(str, Enum):
@@ -164,6 +164,8 @@ class LaunchOptions:
164164
Number of nodes to allocate
165165
gpus_per_node : int, optional
166166
Number of GPUs per node
167+
account : str, optional
168+
Account name for job scheduling
167169
qos : str, optional
168170
Quality of Service level
169171
time : str, optional
@@ -187,6 +189,7 @@ class LaunchOptions:
187189
partition: Optional[str] = None
188190
num_nodes: Optional[int] = None
189191
gpus_per_node: Optional[int] = None
192+
account: Optional[str] = None
190193
qos: Optional[str] = None
191194
time: Optional[str] = None
192195
vocab_size: Optional[int] = None
@@ -197,43 +200,6 @@ class LaunchOptions:
197200
vllm_args: Optional[str] = None
198201

199202

200-
class LaunchOptionsDict(TypedDict):
201-
"""TypedDict for LaunchOptions.
202-
203-
A TypedDict representation of LaunchOptions for type checking and
204-
serialization purposes. All fields are optional and may be None.
205-
206-
Attributes
207-
----------
208-
model_family : str, optional
209-
Family/architecture of the model
210-
model_variant : str, optional
211-
Specific variant/version of the model
212-
partition : str, optional
213-
SLURM partition to use
214-
num_nodes : int, optional
215-
Number of nodes to allocate
216-
gpus_per_node : int, optional
217-
Number of GPUs per node
218-
qos : str, optional
219-
Quality of Service level
220-
time : str, optional
221-
Time limit for the job
222-
vocab_size : int, optional
223-
Size of model vocabulary
224-
data_type : str, optional
225-
Data type for model weights
226-
venv : str, optional
227-
Virtual environment to use
228-
log_dir : str, optional
229-
Directory for logs
230-
model_weights_parent_dir : str, optional
231-
Parent directory containing model weights
232-
vllm_args : str, optional
233-
Additional arguments for vLLM
234-
"""
235-
236-
237203
@dataclass
238204
class ModelInfo:
239205
"""Information about an available model.

0 commit comments

Comments (0)