From cb4458a97212394127bf3208286620b4f4f98a98 Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Tue, 5 May 2026 16:00:32 -0400 Subject: [PATCH 01/11] add stdin --- app/s3df/compute_adapter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index 47978cd0..6c9055d9 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -265,6 +265,7 @@ async def submit_job( name = job_spec.name executable = job_spec.executable cwd = str(job_spec.directory) if job_spec.directory else None + stdin = job_spec.stdin_path stdout = job_spec.stdout_path stderr = job_spec.stderr_path @@ -295,6 +296,7 @@ async def submit_job( account=account, environment=environment, current_working_directory=cwd, + standard_input=stdin, standard_output=stdout, standard_error=stderr, **custom_attributes From b0c599f56c33fcc57347b2afe6e3eb107ca2708a Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Tue, 5 May 2026 16:08:34 -0400 Subject: [PATCH 02/11] add process count --- app/s3df/compute_adapter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index 6c9055d9..ec64852a 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -257,6 +257,7 @@ async def submit_job( # --- resource fields with safe defaults --- node_count = 1 + tasks = None duration_mins = 60 partition = None account = None @@ -274,6 +275,7 @@ async def submit_job( if job_spec.resources: node_count = job_spec.resources.node_count or 1 + tasks = job_spec.resources.process_count if job_spec.attributes: if job_spec.attributes.duration is not None: @@ -289,6 +291,7 @@ async def submit_job( try: slurm_job = SlurmV0041PostJobSubmitRequestJobStrict( nodes=str(node_count), + tasks=tasks, time_limit=SlurmV0041PostJobSubmitRequestJobsInnerTimeLimit(set=True, number=duration_mins), name=name, script=executable, From 73a9b4de0a8d7b80b7a0fcc20cc11b9a0cbc4d44 Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Tue, 5 May 2026 16:09:20 -0400 Subject: [PATCH 03/11] add tasks_per_node --- app/s3df/compute_adapter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index ec64852a..2f1d4a75 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -258,6 +258,7 @@ async def submit_job( # --- resource fields with safe defaults --- node_count = 1 tasks = None + tasks_per_node = None duration_mins = 60 partition = None account = None @@ -276,6 +277,7 @@ async def submit_job( if job_spec.resources: node_count = job_spec.resources.node_count or 1 tasks = job_spec.resources.process_count + tasks_per_node = job_spec.resources.processes_per_node if job_spec.attributes: if job_spec.attributes.duration is not None: @@ -292,6 +294,7 @@ async def submit_job( slurm_job = SlurmV0041PostJobSubmitRequestJobStrict( nodes=str(node_count), tasks=tasks, + tasks_per_node=tasks_per_node, time_limit=SlurmV0041PostJobSubmitRequestJobsInnerTimeLimit(set=True, number=duration_mins), name=name, script=executable, From ebf88e18f1b0e42df79cdab9aa8ff317443eca89 Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Tue, 5 May 2026 16:10:10 -0400 Subject: [PATCH 04/11] add cpus_per_task --- app/s3df/compute_adapter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index 2f1d4a75..4937470f 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -259,6 +259,7 @@ async def submit_job( node_count = 1 tasks = None tasks_per_node = None + cpus_per_task = None duration_mins = 60 partition = None account = None @@ -278,6 +279,7 @@ async def submit_job( node_count = job_spec.resources.node_count or 1 tasks = job_spec.resources.process_count tasks_per_node = job_spec.resources.processes_per_node + cpus_per_task = job_spec.resources.cpu_cores_per_process if job_spec.attributes: if job_spec.attributes.duration is not None: @@ -295,6 +297,7 @@ async def submit_job( nodes=str(node_count), tasks=tasks, tasks_per_node=tasks_per_node, + cpus_per_task=cpus_per_task, time_limit=SlurmV0041PostJobSubmitRequestJobsInnerTimeLimit(set=True, number=duration_mins), name=name, script=executable, From 42513022b05720cd439a141b2e731c920b0ad3c4 Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Tue, 5 May 2026 16:11:20 -0400 Subject: [PATCH 05/11] add gpu_cores_per_process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit note, I am not too sure about this, it will have to be tested… --- app/s3df/compute_adapter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index 4937470f..0674b7a3 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -260,6 +260,7 @@ async def submit_job( tasks = None tasks_per_node = None cpus_per_task = None + tres_per_task = None duration_mins = 60 partition = None account = None @@ -280,6 +281,8 @@ async def submit_job( tasks = job_spec.resources.process_count tasks_per_node = job_spec.resources.processes_per_node cpus_per_task = job_spec.resources.cpu_cores_per_process + if job_spec.resources.gpu_cores_per_process: + tres_per_task = f"gres/gpu:{job_spec.resources.gpu_cores_per_process}" if job_spec.attributes: if job_spec.attributes.duration is not None: @@ -298,6 +301,7 @@ async def submit_job( tasks=tasks, tasks_per_node=tasks_per_node, cpus_per_task=cpus_per_task, + tres_per_task=tres_per_task, time_limit=SlurmV0041PostJobSubmitRequestJobsInnerTimeLimit(set=True, number=duration_mins), name=name, script=executable, From d5d1f8a0393f2cb3f02dec247a4fa869f44226f2 Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Tue, 5 May 2026 16:15:02 -0400 Subject: [PATCH 06/11] add exclusive --- app/s3df/compute_adapter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index 0674b7a3..78b6197a 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -261,6 +261,7 @@ async def submit_job( tasks_per_node = None cpus_per_task = None tres_per_task = None + exclusive = ["true"] duration_mins = 60 partition = None account = None @@ -283,6 +284,8 @@ async def submit_job( cpus_per_task = job_spec.resources.cpu_cores_per_process if job_spec.resources.gpu_cores_per_process: tres_per_task = f"gres/gpu:{job_spec.resources.gpu_cores_per_process}" + if not job_spec.resources.exclusive_node_use: + exclusive = ["false"] if job_spec.attributes: if job_spec.attributes.duration is not None: @@ -302,6 +305,7 @@ async def submit_job( tasks_per_node=tasks_per_node, cpus_per_task=cpus_per_task, tres_per_task=tres_per_task, + exclusive=exclusive, time_limit=SlurmV0041PostJobSubmitRequestJobsInnerTimeLimit(set=True, number=duration_mins), name=name, script=executable, From 416687ccb8772a8838acbc3368d68e4ebed1f2c6 Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Tue, 5 May 2026 16:16:43 -0400 Subject: [PATCH 07/11] add memory per node --- app/s3df/compute_adapter.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index 78b6197a..798e92ec 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -35,6 +35,9 @@ from slurmrestd_client.models.slurm_v0041_post_job_submit_request_jobs_inner_time_limit import ( SlurmV0041PostJobSubmitRequestJobsInnerTimeLimit, ) +from slurmrestd_client.models.slurm_v0041_post_job_submit_request_jobs_inner_memory_per_cpu import ( + SlurmV0041PostJobSubmitRequestJobsInnerMemoryPerCpu, +) from fastapi import HTTPException, Response from pydantic import ConfigDict, ValidationError @@ -262,6 +265,7 @@ async def submit_job( cpus_per_task = None tres_per_task = None exclusive = ["true"] + memory_per_node = None duration_mins = 60 partition = None account = None @@ -286,6 +290,9 @@ async def submit_job( tres_per_task = f"gres/gpu:{job_spec.resources.gpu_cores_per_process}" if not job_spec.resources.exclusive_node_use: exclusive = ["false"] + if job_spec.resources.memory: + memory_mb = max(1, job_spec.resources.memory // (1024 * 1024)) + memory_per_node = SlurmV0041PostJobSubmitRequestJobsInnerMemoryPerCpu(set=True, number=memory_mb) if job_spec.attributes: if job_spec.attributes.duration is not None: @@ -306,6 +313,7 @@ async def submit_job( cpus_per_task=cpus_per_task, tres_per_task=tres_per_task, exclusive=exclusive, + memory_per_node=memory_per_node, time_limit=SlurmV0041PostJobSubmitRequestJobsInnerTimeLimit(set=True, number=duration_mins), name=name, script=executable, From 0eb84e74df7caaef931c117bd97a1548fe11119a Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Tue, 5 May 2026 16:17:46 -0400 Subject: [PATCH 08/11] add reservation --- app/s3df/compute_adapter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index 798e92ec..97e9aa4e 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -269,6 +269,7 @@ async def submit_job( duration_mins = 60 partition = None account = None + reservation = None environment = ["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"] name = job_spec.name @@ -299,6 +300,7 @@ async def submit_job( duration_mins = max(1, int(job_spec.attributes.duration // 60)) partition = job_spec.attributes.queue_name account = job_spec.attributes.account + reservation = job_spec.attributes.reservation_id partition = partition or os.environ.get("SLURM_DEFAULT_PARTITION") account = account or os.environ.get("SLURM_DEFAULT_ACCOUNT") @@ -319,6 +321,7 @@ async def submit_job( script=executable, partition=partition, account=account, + reservation=reservation, environment=environment, current_working_directory=cwd, standard_input=stdin, From 870d3e46d492a72d9bf261782c7f9a0a9b77b14b Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Tue, 5 May 2026 16:18:34 -0400 Subject: [PATCH 09/11] add arguments --- app/s3df/compute_adapter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index 97e9aa4e..e7450395 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -274,6 +274,7 @@ async def submit_job( name = job_spec.name executable = job_spec.executable + argv = job_spec.arguments or None cwd = str(job_spec.directory) if job_spec.directory else None stdin = job_spec.stdin_path stdout = job_spec.stdout_path @@ -319,6 +320,7 @@ async def submit_job( time_limit=SlurmV0041PostJobSubmitRequestJobsInnerTimeLimit(set=True, number=duration_mins), name=name, script=executable, + argv=argv, partition=partition, account=account, reservation=reservation, From 5d26731f268bebc39a3d26db290f46cf7e4f4d7a Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Wed, 6 May 2026 11:52:49 -0400 Subject: [PATCH 10/11] move attributes check up in stack --- app/s3df/compute_adapter.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index e7450395..de163aba 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -283,6 +283,16 @@ async def submit_job( if job_spec.environment: environment = [f"{k}={v}" for k, v in job_spec.environment.items()] + if job_spec.attributes: + if job_spec.attributes.duration is not None: + duration_mins = max(1, int(job_spec.attributes.duration // 60)) + partition = job_spec.attributes.queue_name + account = job_spec.attributes.account + reservation = job_spec.attributes.reservation_id + + partition = partition or os.environ.get("SLURM_DEFAULT_PARTITION") + account = account or os.environ.get("SLURM_DEFAULT_ACCOUNT") + if job_spec.resources: node_count = job_spec.resources.node_count or 1 tasks = job_spec.resources.process_count @@ -296,16 +306,6 @@ async def submit_job( memory_mb = max(1, job_spec.resources.memory // (1024 * 1024)) memory_per_node = SlurmV0041PostJobSubmitRequestJobsInnerMemoryPerCpu(set=True, number=memory_mb) - if job_spec.attributes: - if job_spec.attributes.duration is not None: - duration_mins = max(1, int(job_spec.attributes.duration // 60)) - partition = job_spec.attributes.queue_name - account = job_spec.attributes.account - reservation = job_spec.attributes.reservation_id - - partition = partition or os.environ.get("SLURM_DEFAULT_PARTITION") - account = account or os.environ.get("SLURM_DEFAULT_ACCOUNT") - custom_attributes = job_spec.attributes.custom_attributes if job_spec.attributes else {} try: From 7ab00334dd8ebce60492cfa9d7f6608892e9cb56 Mon Sep 17 00:00:00 2001 From: Sam Welborn Date: Wed, 6 May 2026 11:53:39 -0400 Subject: [PATCH 11/11] fix gpu_cores_per_process --- app/s3df/compute_adapter.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/app/s3df/compute_adapter.py b/app/s3df/compute_adapter.py index de163aba..d9400022 100644 --- a/app/s3df/compute_adapter.py +++ b/app/s3df/compute_adapter.py @@ -89,6 +89,14 @@ class SlurmV0041PostJobSubmitRequestJobStrict(SlurmV0041PostJobSubmitRequestJob) "STOPPED": JobState.CANCELED, } +# Map from Slurm partition name → GPU type string for GRES +PARTITION_GPU_TYPE: dict[str, str] = { + "ampere": "a100", + "turing": "geforce_rtx_2080_ti", + "ada": "l40s", + "hopper": "h200", +} + # --------------------------------------------------------------------------- # JWT minting — IRI signs tokens using the shared key @@ -299,7 +307,11 @@ async def submit_job( tasks_per_node = job_spec.resources.processes_per_node cpus_per_task = job_spec.resources.cpu_cores_per_process if job_spec.resources.gpu_cores_per_process: - tres_per_task = f"gres/gpu:{job_spec.resources.gpu_cores_per_process}" + gpu_type = PARTITION_GPU_TYPE.get(partition or "") + if gpu_type: + tres_per_task = f"gres/gpu:{gpu_type}:{job_spec.resources.gpu_cores_per_process}" + else: + tres_per_task = f"gres/gpu:{job_spec.resources.gpu_cores_per_process}" if not job_spec.resources.exclusive_node_use: exclusive = ["false"] if job_spec.resources.memory: