"""add status_reason column

Revision ID: c4d5e6f7a8b9
Revises: a1b2c3d4e5f6
Create Date: 2026-06-16 12:00:00.000000

"""
import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = 'c4d5e6f7a8b9'
down_revision = 'a1b2c3d4e5f6'
branch_labels = None
depends_on = None

# Location of the column this revision manages; shared by upgrade and downgrade
# so the two directions cannot drift apart.
_SCHEMA = 'hosted_model_inference'
_TABLE = 'endpoints'
_COLUMN = 'status_reason'


def upgrade() -> None:
    """Add the nullable text column ``status_reason`` to the endpoints table."""
    status_reason_column = sa.Column(_COLUMN, sa.Text, nullable=True)
    op.add_column(_TABLE, status_reason_column, schema=_SCHEMA)


def downgrade() -> None:
    """Drop the ``status_reason`` column again, reversing ``upgrade``."""
    op.drop_column(_TABLE, _COLUMN, schema=_SCHEMA)
+ status_reason = Column(Text, nullable=True) current_bundle = relationship("Bundle") owner = Column(String(SHORT_STRING)) public_inference = Column(Boolean, default=False) @@ -484,6 +487,7 @@ def __init__( endpoint_type: str = "async", destination: Optional[str] = None, endpoint_status: Optional[str] = "READY", # EndpointStatus.ready.value + status_reason: Optional[str] = None, owner: Optional[str] = None, public_inference: Optional[bool] = False, task_expires_seconds: Optional[int] = None, @@ -498,6 +502,7 @@ def __init__( self.endpoint_type = endpoint_type self.destination = destination self.endpoint_status = endpoint_status + self.status_reason = status_reason self.owner = owner self.public_inference = public_inference self.task_expires_seconds = task_expires_seconds diff --git a/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py b/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py index 7df8177ab..7de3bdb71 100644 --- a/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py +++ b/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py @@ -132,6 +132,7 @@ class ModelEndpointRecord(OwnedEntity): endpoint_type: ModelEndpointType destination: str status: ModelEndpointStatus + status_reason: Optional[str] = None current_model_bundle: ModelBundle owner: str public_inference: Optional[bool] = None diff --git a/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py index 357c58b87..bd4c4f0cc 100644 --- a/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py @@ -78,6 +78,7 @@ def model_endpoint_entity_to_get_model_endpoint_response( metadata=model_endpoint.record.metadata, bundle_name=model_endpoint.record.current_model_bundle.name, status=model_endpoint.record.status, + 
status_reason=model_endpoint.record.status_reason, post_inference_hooks=post_inference_hooks, default_callback_url=default_callback_url, # type: ignore default_callback_auth=default_callback_auth, diff --git a/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py b/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py index 1551e3fae..0ebddda57 100644 --- a/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py +++ b/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py @@ -51,6 +51,7 @@ def translate_model_endpoint_orm_to_model_endpoint_record( endpoint_type=model_endpoint_orm.endpoint_type, destination=model_endpoint_orm.destination, status=model_endpoint_orm.endpoint_status, + status_reason=model_endpoint_orm.status_reason, current_model_bundle=current_model_bundle, public_inference=model_endpoint_orm.public_inference, task_expires_seconds=model_endpoint_orm.task_expires_seconds, @@ -121,6 +122,7 @@ async def create_model_endpoint_record( creation_task_id: str, status: str, owner: str, + status_reason: Optional[str] = None, public_inference: Optional[bool] = False, task_expires_seconds: Optional[int] = None, queue_message_timeout_seconds: Optional[int] = None, @@ -134,6 +136,7 @@ async def create_model_endpoint_record( destination=destination, creation_task_id=creation_task_id, endpoint_status=status, + status_reason=status_reason, owner=owner, public_inference=public_inference, task_expires_seconds=task_expires_seconds, @@ -310,6 +313,8 @@ async def update_model_endpoint_record( creation_task_id: Optional[str] = None, destination: Optional[str] = None, status: Optional[str] = None, + status_reason: Optional[str] = None, + clear_status_reason: bool = False, public_inference: Optional[bool] = None, task_expires_seconds: Optional[int] = None, queue_message_timeout_seconds: Optional[int] = None, @@ -330,11 +335,16 @@ 
async def update_model_endpoint_record( creation_task_id=creation_task_id, destination=destination, endpoint_status=status, + status_reason=status_reason, last_updated_at=datetime.utcnow(), public_inference=public_inference, task_expires_seconds=task_expires_seconds, queue_message_timeout_seconds=queue_message_timeout_seconds, ) + # `status_reason=None` leaves the value unchanged (like every other field); + # callers reset it to NULL via the explicit clear_status_reason flag. + if clear_status_reason: + update_kwargs["status_reason"] = None await OrmModelEndpoint.update_by_name_owner( session=session, name=model_endpoint_orm.name, diff --git a/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py b/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py index a0f7e1482..efed86c5a 100644 --- a/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py +++ b/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py @@ -49,6 +49,7 @@ async def create_model_endpoint_record( creation_task_id: str, status: str, owner: str, + status_reason: Optional[str] = None, public_inference: Optional[bool] = False, task_expires_seconds: Optional[int] = None, queue_message_timeout_seconds: Optional[int] = None, @@ -67,6 +68,8 @@ async def create_model_endpoint_record( status: A status field on the endpoint, keeps track of endpoint state, used to coordinate edit operations on the endpoint owner: Team who owns endpoint + status_reason: Human-readable reason for the current status (e.g. 
the failure + cause when status is UPDATE_FAILED) public_inference: Whether the endpoint is publicly accessible task_expires_seconds: For async endpoints, how long a task can wait in queue before expiring queue_message_timeout_seconds: For async endpoints, queue message visibility/lock timeout @@ -85,6 +88,8 @@ async def update_model_endpoint_record( creation_task_id: Optional[str] = None, destination: Optional[str] = None, status: Optional[str] = None, + status_reason: Optional[str] = None, + clear_status_reason: bool = False, public_inference: Optional[bool] = None, task_expires_seconds: Optional[int] = None, queue_message_timeout_seconds: Optional[int] = None, @@ -99,6 +104,12 @@ async def update_model_endpoint_record( creation_task_id: The task id corresponding to endpoint creation destination: The destination where async tasks should be sent. status: Status field on the endpoint, used to coordinate endpoint edit operations + status_reason: Human-readable reason for the current status (e.g. the failure + cause when status is UPDATE_FAILED). None leaves the existing value + unchanged, consistent with the other fields here; pass + clear_status_reason=True to reset it to NULL. + clear_status_reason: When True, reset status_reason to NULL (e.g. once an + endpoint recovers to READY). 
# Upper bound on the length of a persisted status_reason, so a verbose
# traceback/string can't blow up the column.
MAX_STATUS_REASON_LEN = 500
# Fallback reason used whenever the failure is not a DomainException, so raw
# infra error text (k8s/AWS/DB internals) is not leaked into the public API.
GENERIC_STATUS_REASON = "Endpoint deployment failed due to an internal error."


def _status_reason_from_error(error: BaseException) -> str:
    """Map a build failure to a status_reason string that is safe to expose.

    Args:
        error: The exception that aborted the endpoint build.

    Returns:
        For a ``DomainException`` with a non-empty message: that message with
        every run of whitespace collapsed to a single space, cut to at most
        ``MAX_STATUS_REASON_LEN`` characters (the last one being an ellipsis
        when truncation happened). For any other exception, or an empty
        message: ``GENERIC_STATUS_REASON``.
    """
    # Only the codebase's own domain errors are treated as user-facing; anything
    # else may carry sensitive internals and is replaced by the generic text.
    if not isinstance(error, DomainException):
        return GENERIC_STATUS_REASON

    # split()/join folds newlines, tabs and repeated spaces into single spaces
    # and strips leading/trailing whitespace in one step.
    message = " ".join(str(error).split())
    if not message:
        return GENERIC_STATUS_REASON

    if len(message) <= MAX_STATUS_REASON_LEN:
        return message

    # Keep one character free for the ellipsis so the result never exceeds the
    # cap; rstrip avoids a dangling space right before the ellipsis.
    shortened = message[: MAX_STATUS_REASON_LEN - 1].rstrip()
    return shortened + "…"
if s3_logs_location is not None: help_url = self.filesystem_gateway.generate_signed_url( @@ -753,7 +782,11 @@ async def _build_image( users=[user_id], ) - raise DockerBuildFailedException(f"Image build failed ({endpoint_id=})") + logger_adapter.error(f"Image build failed ({endpoint_id=})") + raise DockerBuildFailedException( + "Image build failed. Check that the bundle's image and " + "dependencies are valid and accessible." + ) else: self.monitoring_metrics_gateway.emit_image_build_cache_hit_metric(image_type) diff --git a/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py b/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py index e329c761a..aa2fc92ec 100644 --- a/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py +++ b/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py @@ -410,6 +410,7 @@ async def delete_model_endpoint(self, model_endpoint_id: str) -> None: await self.model_endpoint_record_repository.update_model_endpoint_record( model_endpoint_id=model_endpoint_id, status=ModelEndpointStatus.UPDATE_FAILED, + status_reason="Failed to delete the endpoint's infrastructure.", ) raise EndpointDeleteFailedException diff --git a/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py b/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py index 4ad6247ba..61fc366da 100644 --- a/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py +++ b/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py @@ -18,7 +18,11 @@ STORAGE_LIMIT, ) from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.domain.entities import ModelBundle, ModelEndpoint +from model_engine_server.domain.entities import ( + ModelBundle, + ModelEndpoint, + ModelEndpointStatus, +) from model_engine_server.domain.exceptions import ( EndpointBillingTagsMalformedException, EndpointLabelsException, @@ -780,6 +784,25 @@ async 
@pytest.mark.asyncio
async def test_get_model_endpoint_use_case_surfaces_status_reason(
    test_api_key: str,
    fake_model_endpoint_service,
    model_endpoint_1: ModelEndpoint,
):
    """The status_reason stored on a failed endpoint is returned by the GET use case."""
    expected_reason = "CUDA out of memory"

    # Arrange: mark the endpoint record as failed with a recorded reason.
    record = model_endpoint_1.record
    record.status = ModelEndpointStatus.UPDATE_FAILED
    record.status_reason = expected_reason
    fake_model_endpoint_service.add_model_endpoint(model_endpoint_1)

    # Act: fetch the endpoint by id as a privileged user.
    privileged_user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
    get_by_id = GetModelEndpointByIdV1UseCase(model_endpoint_service=fake_model_endpoint_service)
    response = await get_by_id.execute(user=privileged_user, model_endpoint_id=record.id)

    # Assert: both the status and its reason reach the response unchanged.
    assert response.status == ModelEndpointStatus.UPDATE_FAILED
    assert response.status_reason == expected_reason