scaleapi · dm36 · Jun 17, 2026 · greptile-apps · Jun 19, 2026 · greptile-apps
diff --git a/model-engine/model_engine_server/common/dtos/model_endpoints.py b/model-engine/model_engine_server/common/dtos/model_endpoints.py
@@ -159,6 +159,10 @@ class GetModelEndpointV1Response(BaseModel):
     metadata: Optional[Dict[str, Any]] = Field(default=None)  # TODO: JSON type
     bundle_name: str
     status: ModelEndpointStatus
+    status_reason: Optional[str] = Field(
+        default=None,
+        description="Human-readable reason for the current status, e.g. the failure cause when status is UPDATE_FAILED.",
+    )
     post_inference_hooks: Optional[List[str]] = Field(default=None)
     default_callback_url: Optional[HttpUrlStr] = Field(default=None)
     default_callback_auth: Optional[CallbackAuth] = Field(default=None)

diff --git a/...r/db/migrations/alembic/versions/2026_06_16_1200-c4d5e6f7a8b9_add_status_reason_column.py b/...r/db/migrations/alembic/versions/2026_06_16_1200-c4d5e6f7a8b9_add_status_reason_column.py
@@ -0,0 +1,31 @@
+"""add status_reason column
+
+Revision ID: c4d5e6f7a8b9
+Revises: a1b2c3d4e5f6
+Create Date: 2026-06-16 12:00:00.000000
+
+"""
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = 'c4d5e6f7a8b9'  # ID of this migration (matches "Revision ID" in the module docstring)
+down_revision = 'a1b2c3d4e5f6'  # migration this one revises (matches "Revises" in the docstring)
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add the nullable ``status_reason`` text column to ``hosted_model_inference.endpoints``."""
+    status_reason_column = sa.Column('status_reason', sa.Text, nullable=True)
+    op.add_column(
+        'endpoints', status_reason_column, schema='hosted_model_inference'
+    )
+
+
+def downgrade() -> None:
+    """Drop the ``status_reason`` column from ``hosted_model_inference.endpoints``."""
+    table_name = 'endpoints'
+    column_name = 'status_reason'
+    schema_name = 'hosted_model_inference'
+    op.drop_column(table_name, column_name, schema=schema_name)
diff --git a/model-engine/model_engine_server/db/models/hosted_model_inference.py b/model-engine/model_engine_server/db/models/hosted_model_inference.py
@@ -465,6 +465,9 @@ class Endpoint(Base):
     # Endpoints should eventually end up as READY barring any bugs.
     # EndpointStatus.ready.value
     endpoint_status = Column(Text, default="READY")
+    # Human-readable reason for the current status, set when the endpoint is marked
+    # UPDATE_FAILED (e.g. a failed build or delete) so the cause can be surfaced to API consumers.
+    status_reason = Column(Text, nullable=True)
     current_bundle = relationship("Bundle")
     owner = Column(String(SHORT_STRING))
     public_inference = Column(Boolean, default=False)
@@ -484,6 +487,7 @@ def __init__(
         endpoint_type: str = "async",
         destination: Optional[str] = None,
         endpoint_status: Optional[str] = "READY",  # EndpointStatus.ready.value
+        status_reason: Optional[str] = None,
         owner: Optional[str] = None,
         public_inference: Optional[bool] = False,
         task_expires_seconds: Optional[int] = None,
@@ -498,6 +502,7 @@ def __init__(
         self.endpoint_type = endpoint_type
         self.destination = destination
         self.endpoint_status = endpoint_status
+        self.status_reason = status_reason
         self.owner = owner
         self.public_inference = public_inference
         self.task_expires_seconds = task_expires_seconds

diff --git a/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py b/model-engine/model_engine_server/domain/entities/model_endpoint_entity.py
@@ -132,6 +132,7 @@ class ModelEndpointRecord(OwnedEntity):
     endpoint_type: ModelEndpointType
     destination: str
     status: ModelEndpointStatus
+    status_reason: Optional[str] = None
     current_model_bundle: ModelBundle
     owner: str
     public_inference: Optional[bool] = None

diff --git a/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py
@@ -78,6 +78,7 @@ def model_endpoint_entity_to_get_model_endpoint_response(
         metadata=model_endpoint.record.metadata,
         bundle_name=model_endpoint.record.current_model_bundle.name,
         status=model_endpoint.record.status,
+        status_reason=model_endpoint.record.status_reason,
         post_inference_hooks=post_inference_hooks,
         default_callback_url=default_callback_url,  # type: ignore
         default_callback_auth=default_callback_auth,

diff --git a/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py b/model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py
@@ -51,6 +51,7 @@ def translate_model_endpoint_orm_to_model_endpoint_record(
         endpoint_type=model_endpoint_orm.endpoint_type,
         destination=model_endpoint_orm.destination,
         status=model_endpoint_orm.endpoint_status,
+        status_reason=model_endpoint_orm.status_reason,
         current_model_bundle=current_model_bundle,
         public_inference=model_endpoint_orm.public_inference,
         task_expires_seconds=model_endpoint_orm.task_expires_seconds,
@@ -121,6 +122,7 @@ async def create_model_endpoint_record(
         creation_task_id: str,
         status: str,
         owner: str,
+        status_reason: Optional[str] = None,
         public_inference: Optional[bool] = False,
         task_expires_seconds: Optional[int] = None,
         queue_message_timeout_seconds: Optional[int] = None,
@@ -134,6 +136,7 @@ async def create_model_endpoint_record(
             destination=destination,
             creation_task_id=creation_task_id,
             endpoint_status=status,
+            status_reason=status_reason,
             owner=owner,
             public_inference=public_inference,
             task_expires_seconds=task_expires_seconds,
@@ -310,6 +313,7 @@ async def update_model_endpoint_record(
         creation_task_id: Optional[str] = None,
         destination: Optional[str] = None,
         status: Optional[str] = None,
+        status_reason: Optional[str] = None,
         public_inference: Optional[bool] = None,
         task_expires_seconds: Optional[int] = None,
         queue_message_timeout_seconds: Optional[int] = None,
@@ -335,6 +339,11 @@ async def update_model_endpoint_record(
                 task_expires_seconds=task_expires_seconds,
                 queue_message_timeout_seconds=queue_message_timeout_seconds,
             )
+            # status_reason is handled separately from dict_not_none so an explicit
+            # clear (empty string -> NULL) is possible when an endpoint recovers to a
+            # healthy state, while omitting it (None) leaves any existing reason intact.
+            if status_reason is not None:
+                update_kwargs["status_reason"] = status_reason or None
             await OrmModelEndpoint.update_by_name_owner(
                 session=session,
                 name=model_endpoint_orm.name,

diff --git a/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py b/model-engine/model_engine_server/infra/repositories/model_endpoint_record_repository.py
@@ -49,6 +49,7 @@ async def create_model_endpoint_record(
         creation_task_id: str,
         status: str,
         owner: str,
+        status_reason: Optional[str] = None,
         public_inference: Optional[bool] = False,
         task_expires_seconds: Optional[int] = None,
         queue_message_timeout_seconds: Optional[int] = None,
@@ -67,6 +68,8 @@ async def create_model_endpoint_record(
             status: A status field on the endpoint, keeps track of endpoint state,
                 used to coordinate edit operations on the endpoint
             owner: Team who owns endpoint
+            status_reason: Human-readable reason for the current status (e.g. the failure
+                cause when status is UPDATE_FAILED)
             public_inference: Whether the endpoint is publicly accessible
             task_expires_seconds: For async endpoints, how long a task can wait in queue before expiring
             queue_message_timeout_seconds: For async endpoints, queue message visibility/lock timeout
@@ -85,6 +88,7 @@ async def update_model_endpoint_record(
         creation_task_id: Optional[str] = None,
         destination: Optional[str] = None,
         status: Optional[str] = None,
+        status_reason: Optional[str] = None,
         public_inference: Optional[bool] = None,
         task_expires_seconds: Optional[int] = None,
         queue_message_timeout_seconds: Optional[int] = None,
@@ -99,6 +103,8 @@ async def update_model_endpoint_record(
             creation_task_id: The task id corresponding to endpoint creation
             destination: The destination where async tasks should be sent.
             status: Status field on the endpoint, used to coordinate endpoint edit operations
+            status_reason: Human-readable reason for the current status (e.g. the failure
+                cause when UPDATE_FAILED). An empty string clears it; None leaves it unchanged.
             public_inference: Whether the endpoint is publicly accessible
             task_expires_seconds: For async endpoints, how long a task can wait in queue before expiring
             queue_message_timeout_seconds: For async endpoints, queue message visibility/lock timeout

diff --git a/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py b/model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py
@@ -83,6 +83,16 @@
 
 INITIAL_K8S_CACHE_TTL_SECONDS: int = 180
 MAX_IMAGE_TAG_LEN = 128
+MAX_STATUS_REASON_LEN = 500  # cap so a verbose traceback/string can't blow up the column
+
+
+def _sanitize_status_reason(message: str) -> str:
+    """Collapse whitespace and cap length for a status_reason persisted on an endpoint."""
+    # "" means "clear the reason" to the record repository, so blank messages get a placeholder.
+    collapsed = " ".join(message.split()) or "Unknown error (no message provided)"
+    if len(collapsed) > MAX_STATUS_REASON_LEN:
+        return collapsed[: MAX_STATUS_REASON_LEN - 1].rstrip() + "…"
+    return collapsed
 
 RESTRICTED_ENV_VARS_KEYS = {
     "BASE": [
@@ -342,15 +352,19 @@ async def build_endpoint(
                     model_endpoint_id=endpoint_id,
                     destination=create_or_update_response.destination,
                     status=ModelEndpointStatus.READY,
+                    # Clear any reason from a prior failed attempt now that we're healthy.
+                    status_reason="",
                 )
 
             except Exception as error:  # noqa
                 log_error("Failed endpoint build process!")
-                # Update status as failed endpoint creation on unhandled error
+                # Update status as failed endpoint creation on unhandled error, recording
+                # the cause so it can be surfaced to API consumers.
                 try:
                     await self.model_endpoint_record_repository.update_model_endpoint_record(
                         model_endpoint_id=endpoint_id,
                         status=ModelEndpointStatus.UPDATE_FAILED,
+                        status_reason=_sanitize_status_reason(str(error)),
                     )
                 except Exception as error_update:
                     log_error("Failed to update endpoint build status to FAILED")
@@ -717,6 +731,8 @@ async def _build_image(
                     await self.model_endpoint_record_repository.update_model_endpoint_record(
                         model_endpoint_id=build_endpoint_request.model_endpoint_record.id,
                         status=ModelEndpointStatus.UPDATE_FAILED,
+                        status_reason="Image build failed. Check that the bundle's image "
+                        "and dependencies are valid and accessible.",
                     )
 
                     if s3_logs_location is not None:

diff --git a/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py b/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py
@@ -410,6 +410,7 @@ async def delete_model_endpoint(self, model_endpoint_id: str) -> None:
                 await self.model_endpoint_record_repository.update_model_endpoint_record(
                     model_endpoint_id=model_endpoint_id,
                     status=ModelEndpointStatus.UPDATE_FAILED,
+                    status_reason="Failed to delete the endpoint's infrastructure.",
                 )
                 raise EndpointDeleteFailedException
 

diff --git a/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py b/model-engine/tests/unit/domain/test_model_endpoint_use_cases.py
@@ -18,7 +18,11 @@
     STORAGE_LIMIT,
 )
 from model_engine_server.core.auth.authentication_repository import User
-from model_engine_server.domain.entities import ModelBundle, ModelEndpoint
+from model_engine_server.domain.entities import (
+    ModelBundle,
+    ModelEndpoint,
+    ModelEndpointStatus,
+)
 from model_engine_server.domain.exceptions import (
     EndpointBillingTagsMalformedException,
     EndpointLabelsException,
@@ -780,6 +784,25 @@ async def test_get_model_endpoint_use_case_success(
     assert response_2.resource_state.nodes_per_worker == 2
 
 
+@pytest.mark.asyncio
+async def test_get_model_endpoint_use_case_surfaces_status_reason(
+    test_api_key: str,
+    fake_model_endpoint_service,
+    model_endpoint_1: ModelEndpoint,
+):
+    """A failed endpoint's status_reason must be exposed on the GET-by-id response."""
+    failed_record = model_endpoint_1.record
+    failed_record.status = ModelEndpointStatus.UPDATE_FAILED
+    failed_record.status_reason = "CUDA out of memory"
+    fake_model_endpoint_service.add_model_endpoint(model_endpoint_1)
+    getter = GetModelEndpointByIdV1UseCase(model_endpoint_service=fake_model_endpoint_service)
+    caller = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
+    result = await getter.execute(user=caller, model_endpoint_id=failed_record.id)
+
+    assert result.status == ModelEndpointStatus.UPDATE_FAILED
+    assert result.status_reason == "CUDA out of memory"
+
+
 @pytest.mark.asyncio
 async def test_get_model_endpoint_use_case_same_team_finds_endpoint(
     test_api_key_user_on_other_team: str,