diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index 037ec1c160..4c0fb4e39a 100644 --- a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -1244,7 +1244,6 @@ def _supported_instances(offer: InstanceOffer) -> bool: "p5e.", "p4d.", "p4de.", - "p3.", "g7e.", "g6.", "g6e.", diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py index 48c32375e4..67959a19c0 100644 --- a/src/dstack/_internal/core/backends/aws/resources.py +++ b/src/dstack/_internal/core/backends/aws/resources.py @@ -6,8 +6,6 @@ import dstack.version as version from dstack._internal.core.backends.aws.models import AWSOSImageConfig -from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules -from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES from dstack._internal.core.errors import BackendError, ComputeError, ComputeResourceNotFoundError from dstack._internal.utils.logging import get_logger @@ -31,17 +29,15 @@ def get_image_id_and_username( image_name = image.name image_owner = image.owner username = image.user - elif _supported_by_dlami(instance_type): + elif gpu_name is not None: + # AWS Deep Learning AMIs (DLAMI) support all GPU instance types currently supported by dstack. + # dstack's cuda AMI is still built but not used. + # It may be used again in case some instance types are not supported by DLAMI. image_name = "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04) *" image_owner = DLAMI_OWNER_ACCOUNT_ID username = "ubuntu" else: - if gpu_name is None: - image_name = f"dstack-{version.base_image}" - elif not requires_nvidia_proprietary_kernel_modules(gpu_name): - image_name = f"dstack-cuda-{version.base_image}" - else: - image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}" + image_name = f"dstack-{version.base_image}" image_owner = DSTACK_ACCOUNT_ID username = "ubuntu" response = ec2_client.describe_images( @@ -636,25 +632,6 @@ def _is_private_subnet_with_internet_egress( return False -def _supported_by_dlami(instance_type: str) -> bool: - # Currently only p3. instances are not supported by DLAMI among GPU instances. - return any( - instance_type.startswith(family) - for family in [ - "g4dn.", - "g5.", - "g6.", - "gr6.", - "g6e.", - "p4d.", - "p4de.", - "p5.", - "p5e.", - "p6-b200.", - ] - ) - - def get_reservation( ec2_client: botocore.client.BaseClient, reservation_id: str, diff --git a/src/dstack/_internal/core/models/fleets.py b/src/dstack/_internal/core/models/fleets.py index 5824d2a1f0..12869ed5f9 100644 --- a/src/dstack/_internal/core/models/fleets.py +++ b/src/dstack/_internal/core/models/fleets.py @@ -261,7 +261,7 @@ class BackendFleetConfiguraionProps(CoreModel): instance_types: Annotated[ Optional[List[str]], Field( - description="The cloud-specific instance types to consider for provisioning (e.g., `[p3.8xlarge, n1-standard-4]`)" + description="The cloud-specific instance types to consider for provisioning (e.g., `[g6e.24xlarge, n1-standard-4]`)" ), ] = None spot_policy: Annotated[ diff --git a/src/dstack/_internal/core/models/profiles.py b/src/dstack/_internal/core/models/profiles.py index 5175146947..c09a32f046 100644 --- a/src/dstack/_internal/core/models/profiles.py +++ b/src/dstack/_internal/core/models/profiles.py @@ -257,7 +257,7 @@ class ProfileParams(CoreModel): instance_types: Annotated[ Optional[List[str]], Field( - description="The cloud-specific instance types to consider for provisioning (e.g., `[p3.8xlarge, n1-standard-4]`)" + description="The cloud-specific instance types to consider for provisioning (e.g., `[g6e.24xlarge, n1-standard-4]`)" ), ] = None reservation: Annotated[ diff --git a/src/tests/_internal/core/backends/aws/test_resources.py b/src/tests/_internal/core/backends/aws/test_resources.py index 9dadd24f42..dcec84bf62 100644 --- a/src/tests/_internal/core/backends/aws/test_resources.py +++ b/src/tests/_internal/core/backends/aws/test_resources.py @@ -150,14 +150,23 @@ def test_raises_resource_not_found_if_none_available( assert "image 'dstack-0.0' not found" in caplog.text @pytest.mark.parametrize( - ["cuda", "expected"], + ["cuda", "expected_name", "expected_owner"], [ - [False, "dstack-0.0"], - [True, "dstack-cuda-0.0"], + [False, "dstack-0.0", "142421590066"], + [ + True, + "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04) *", + "898082745236", + ], ], ) - def test_uses_dstack_image_name_and_account_id_if_image_config_not_provided( - self, monkeypatch: pytest.MonkeyPatch, ec2_client_mock: Mock, cuda: bool, expected: str + def test_uses_default_image_name_and_account_id_if_image_config_not_provided( + self, + monkeypatch: pytest.MonkeyPatch, + ec2_client_mock: Mock, + cuda: bool, + expected_name: str, + expected_owner: str, ): monkeypatch.setattr("dstack.version.base_image", "0.0") _, username = get_image_id_and_username( @@ -167,7 +176,7 @@ def test_uses_dstack_image_name_and_account_id_if_image_config_not_provided( ) assert username == "ubuntu" ec2_client_mock.describe_images.assert_called_once_with( - Filters=[{"Name": "name", "Values": [expected]}], Owners=["142421590066"] + Filters=[{"Name": "name", "Values": [expected_name]}], Owners=[expected_owner] ) @pytest.mark.parametrize( diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index 7748708a1a..499460e300 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -1534,7 +1534,7 @@ async def test_errors_if_ssh_key_is_bad( [ pytest.param("backends", [BackendType.AWS], id="backends"), pytest.param("regions", ["eu-west-1"], id="regions"), - pytest.param("instance_types", ["p3.8xlarge"], id="instance_types"), + pytest.param("instance_types", ["g6e.24xlarge"], id="instance_types"), pytest.param("idle_duration", 60, id="idle_duration"), pytest.param("tags", {}, id="tags"), # falsy value ],