diff --git a/model-engine/model_engine_server/common/dtos/model_endpoints.py b/model-engine/model_engine_server/common/dtos/model_endpoints.py
index 0f5e76a07..f971cb56b 100644
--- a/model-engine/model_engine_server/common/dtos/model_endpoints.py
+++ b/model-engine/model_engine_server/common/dtos/model_endpoints.py
@@ -159,6 +159,10 @@ class GetModelEndpointV1Response(BaseModel):
     metadata: Optional[Dict[str, Any]] = Field(default=None)  # TODO: JSON type
     bundle_name: str
     status: ModelEndpointStatus
+    status_reason: Optional[str] = Field(
+        default=None,
+        description="Human-readable reason for the current status, e.g. the failure cause when status is UPDATE_FAILED.",
+    )
     post_inference_hooks: Optional[List[str]] = Field(default=None)
     default_callback_url: Optional[HttpUrlStr] = Field(default=None)
     default_callback_auth: Optional[CallbackAuth] = Field(default=None)
diff --git a/model-engine/model_engine_server/db/migrations/alembic/versions/2026_06_16_1200-c4d5e6f7a8b9_add_status_reason_column.py b/model-engine/model_engine_server/db/migrations/alembic/versions/2026_06_16_1200-c4d5e6f7a8b9_add_status_reason_column.py
new file mode 100644
index 000000000..d332cb4e2
--- /dev/null
+++ b/model-engine/model_engine_server/db/migrations/alembic/versions/2026_06_16_1200-c4d5e6f7a8b9_add_status_reason_column.py
@@ -0,0 +1,31 @@
+"""add status_reason column
+
+Revision ID: c4d5e6f7a8b9
+Revises: a1b2c3d4e5f6
+Create Date: 2026-06-16 12:00:00.000000
+
+"""
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = 'c4d5e6f7a8b9'
+down_revision = 'a1b2c3d4e5f6'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        'endpoints',
+        sa.Column('status_reason', sa.Text, nullable=True),
+        schema='hosted_model_inference',
+    )
+
+
+def downgrade() -> None:
+    op.drop_column(
+        'endpoints',
+        'status_reason',
+        schema='hosted_model_inference',
+    )
diff --git a/model-engine/model_engine_server/db/models/hosted_model_inference.py b/model-engine/model_engine_server/db/models/hosted_model_inference.py
index 01f52a328..c3397ab71 100644
--- a/model-engine/model_engine_server/db/models/hosted_model_inference.py
+++ b/model-engine/model_engine_server/db/models/hosted_model_inference.py
@@ -465,6 +465,9 @@ class Endpoint(Base):
     # Endpoints should eventually end up as READY barring any bugs.
     # EndpointStatus.ready.value
     endpoint_status = Column(Text, default="READY")
+    # Human-readable reason for the current status, set when a build fails (status
+    # UPDATE_FAILED) so the failure cause can be surfaced to API consumers.
+    status_reason = Column(Text, nullable=True)
     current_bundle = relationship("Bundle")
     owner = Column(String(SHORT_STRING))
     public_inference = Column(Boolean, default=False)
@@ -484,6 +487,7 @@ def __init__(
         endpoint_type: str = "async",
         destination: Optional[str] = None,
         endpoint_status: Optional[str] = "READY",  # EndpointStatus.ready.value
+        status_reason: Optional[str] = None,
         owner: Optional[str] = None,
         public_inference: Optional[bool] = False,
         task_expires_seconds: Optional[int] = None,
@@ -498,6 +502,7 @@ def __init__(
         self.endpoint_type = endpoint_type
         self.destination = destination
         self.endpoint_status = endpoint_status
+        self.status_reason = status_reason
         self.owner = owner
         self.public_inference = public_inference
         self.task_expires_seconds = task_expires_seconds
diff --git a/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py b/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py
index 7df8177ab..7de3bdb71 100644
--- a/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py
+++ b/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py
@@ -132,6 +132,7 @@ class ModelEndpointRecord(OwnedEntity):
     endpoint_type: ModelEndpointType
     destination: str
     status: ModelEndpointStatus
+    status_reason: Optional[str] = None
     current_model_bundle: ModelBundle
     owner: str
     public_inference: Optional[bool] = None
diff --git a/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py
index 357c58b87..bd4c4f0cc 100644
--- a/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py
@@ -78,6 +78,7 @@ def model_endpoint_entity_to_get_model_endpoint_response(
         metadata=model_endpoint.record.metadata,
         bundle_name=model_endpoint.record.current_model_bundle.name,
         status=model_endpoint.record.status,
+        status_reason=model_endpoint.record.status_reason,
         post_inference_hooks=post_inference_hooks,
         default_callback_url=default_callback_url,  # type: ignore
         default_callback_auth=default_callback_auth,
diff --git a/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py b/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py
index 1551e3fae..644658fca 100644
--- a/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py
+++ b/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py
@@ -51,6 +51,7 @@ def translate_model_endpoint_orm_to_model_endpoint_record(
         endpoint_type=model_endpoint_orm.endpoint_type,
         destination=model_endpoint_orm.destination,
         status=model_endpoint_orm.endpoint_status,
+        status_reason=model_endpoint_orm.status_reason,
         current_model_bundle=current_model_bundle,
         public_inference=model_endpoint_orm.public_inference,
         task_expires_seconds=model_endpoint_orm.task_expires_seconds,
@@ -121,6 +122,7 @@ async def create_model_endpoint_record(
         creation_task_id: str,
         status: str,
         owner: str,
+        status_reason: Optional[str] = None,
         public_inference: Optional[bool] = False,
         task_expires_seconds: Optional[int] = None,
         queue_message_timeout_seconds: Optional[int] = None,
@@ -134,6 +136,7 @@ async def create_model_endpoint_record(
             destination=destination,
             creation_task_id=creation_task_id,
             endpoint_status=status,
+            status_reason=status_reason,
             owner=owner,
             public_inference=public_inference,
             task_expires_seconds=task_expires_seconds,
@@ -310,6 +313,7 @@ async def update_model_endpoint_record(
         creation_task_id: Optional[str] = None,
         destination: Optional[str] = None,
         status: Optional[str] = None,
+        status_reason: Optional[str] = None,
         public_inference: Optional[bool] = None,
         task_expires_seconds: Optional[int] = None,
         queue_message_timeout_seconds: Optional[int] = None,
@@ -335,6 +339,11 @@ async def update_model_endpoint_record(
                 task_expires_seconds=task_expires_seconds,
                 queue_message_timeout_seconds=queue_message_timeout_seconds,
             )
+            # status_reason is handled separately from dict_not_none so an explicit
+            # clear (empty string -> NULL) is possible when an endpoint recovers to a
+            # healthy state, while omitting it (None) leaves any existing reason intact.
+            if status_reason is not None:
+                update_kwargs["status_reason"] = status_reason or None
             await OrmModelEndpoint.update_by_name_owner(
                 session=session,
                 name=model_endpoint_orm.name,
diff --git a/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py b/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py
index a0f7e1482..353a5e07b 100644
--- a/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py
+++ b/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py
@@ -49,6 +49,7 @@ async def create_model_endpoint_record(
         creation_task_id: str,
         status: str,
         owner: str,
+        status_reason: Optional[str] = None,
         public_inference: Optional[bool] = False,
         task_expires_seconds: Optional[int] = None,
         queue_message_timeout_seconds: Optional[int] = None,
@@ -67,6 +68,8 @@ async def create_model_endpoint_record(
             status: A status field on the endpoint, keeps track of endpoint state,
                 used to coordinate edit operations on the endpoint
             owner: Team who owns endpoint
+            status_reason: Human-readable reason for the current status (e.g. the failure
+                cause when status is UPDATE_FAILED)
             public_inference: Whether the endpoint is publicly accessible
             task_expires_seconds: For async endpoints, how long a task can wait in queue before expiring
             queue_message_timeout_seconds: For async endpoints, queue message visibility/lock timeout
@@ -85,6 +88,7 @@ async def update_model_endpoint_record(
         creation_task_id: Optional[str] = None,
         destination: Optional[str] = None,
         status: Optional[str] = None,
+        status_reason: Optional[str] = None,
         public_inference: Optional[bool] = None,
         task_expires_seconds: Optional[int] = None,
         queue_message_timeout_seconds: Optional[int] = None,
@@ -99,6 +103,9 @@ async def update_model_endpoint_record(
             creation_task_id: The task id corresponding to endpoint creation
             destination: The destination where async tasks should be sent.
             status: Status field on the endpoint, used to coordinate endpoint edit operations
+            status_reason: Human-readable reason for the current status (e.g. the failure
+                cause when status is UPDATE_FAILED). None leaves the stored reason
+                unchanged; an empty string clears it.
             public_inference: Whether the endpoint is publicly accessible
             task_expires_seconds: For async endpoints, how long a task can wait in queue before expiring
             queue_message_timeout_seconds: For async endpoints, queue message visibility/lock timeout
diff --git a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py
index 1a8b7e6f5..a58baa255 100644
--- a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py
+++ b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py
@@ -83,6 +83,16 @@ INITIAL_K8S_CACHE_TTL_SECONDS: int = 180
 
 MAX_IMAGE_TAG_LEN = 128
 
+# Cap the persisted failure reason so a verbose traceback/string can't blow up the column.
+MAX_STATUS_REASON_LEN = 500
+
+
+def _sanitize_status_reason(message: str) -> str:
+    """Collapse whitespace and cap length for a status_reason persisted on an endpoint."""
+    collapsed = " ".join(message.split())
+    if len(collapsed) > MAX_STATUS_REASON_LEN:
+        return collapsed[: MAX_STATUS_REASON_LEN - 1].rstrip() + "…"
+    return collapsed
 
 RESTRICTED_ENV_VARS_KEYS = {
     "BASE": [
@@ -342,15 +352,20 @@ async def build_endpoint(
                         model_endpoint_id=endpoint_id,
                         destination=create_or_update_response.destination,
                         status=ModelEndpointStatus.READY,
+                        # Clear any reason from a prior failed attempt now that we're healthy.
+                        status_reason="",
                     )
                 except Exception as error:  # noqa
                     log_error("Failed endpoint build process!")
 
-                    # Update status as failed endpoint creation on unhandled error
+                    # Update status as failed endpoint creation on unhandled error, recording
+                    # the cause so it can be surfaced to API consumers. Fall back to repr()
+                    # when the message is empty, because "" would clear the stored reason.
                     try:
                         await self.model_endpoint_record_repository.update_model_endpoint_record(
                             model_endpoint_id=endpoint_id,
                             status=ModelEndpointStatus.UPDATE_FAILED,
+                            status_reason=_sanitize_status_reason(str(error) or repr(error)),
                         )
                     except Exception as error_update:
                         log_error("Failed to update endpoint build status to FAILED")
@@ -717,6 +732,8 @@ async def _build_image(
                 await self.model_endpoint_record_repository.update_model_endpoint_record(
                     model_endpoint_id=build_endpoint_request.model_endpoint_record.id,
                     status=ModelEndpointStatus.UPDATE_FAILED,
+                    status_reason="Image build failed. Check that the bundle's image "
+                    "and dependencies are valid and accessible.",
                 )
 
                 if s3_logs_location is not None:
diff --git a/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py b/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py
index e329c761a..aa2fc92ec 100644
--- a/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py
+++ b/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py
@@ -410,6 +410,7 @@ async def delete_model_endpoint(self, model_endpoint_id: str) -> None:
                 await self.model_endpoint_record_repository.update_model_endpoint_record(
                     model_endpoint_id=model_endpoint_id,
                     status=ModelEndpointStatus.UPDATE_FAILED,
+                    status_reason="Failed to delete the endpoint's infrastructure.",
                 )
                 raise EndpointDeleteFailedException
 
diff --git a/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py b/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py
index 4ad6247ba..61fc366da 100644
--- a/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py
+++ b/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py
@@ -18,7 +18,11 @@
     STORAGE_LIMIT,
 )
 from model_engine_server.core.auth.authentication_repository import User
-from model_engine_server.domain.entities import ModelBundle, ModelEndpoint
+from model_engine_server.domain.entities import (
+    ModelBundle,
+    ModelEndpoint,
+    ModelEndpointStatus,
+)
 from model_engine_server.domain.exceptions import (
     EndpointBillingTagsMalformedException,
     EndpointLabelsException,
@@ -780,6 +784,25 @@ async def test_get_model_endpoint_use_case_success(
     assert response_2.resource_state.nodes_per_worker == 2
 
 
+@pytest.mark.asyncio
+async def test_get_model_endpoint_use_case_surfaces_status_reason(
+    test_api_key: str,
+    fake_model_endpoint_service,
+    model_endpoint_1: ModelEndpoint,
+):
+    # A failed endpoint's status_reason should flow through to the API response.
+    model_endpoint_1.record.status = ModelEndpointStatus.UPDATE_FAILED
+    model_endpoint_1.record.status_reason = "CUDA out of memory"
+    fake_model_endpoint_service.add_model_endpoint(model_endpoint_1)
+    use_case = GetModelEndpointByIdV1UseCase(model_endpoint_service=fake_model_endpoint_service)
+    user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
+
+    response = await use_case.execute(user=user, model_endpoint_id=model_endpoint_1.record.id)
+
+    assert response.status == ModelEndpointStatus.UPDATE_FAILED
+    assert response.status_reason == "CUDA out of memory"
+
+
 @pytest.mark.asyncio
 async def test_get_model_endpoint_use_case_same_team_finds_endpoint(
     test_api_key_user_on_other_team: str,