Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,10 @@ class GetModelEndpointV1Response(BaseModel):
metadata: Optional[Dict[str, Any]] = Field(default=None) # TODO: JSON type
bundle_name: str
status: ModelEndpointStatus
status_reason: Optional[str] = Field(
default=None,
description="Human-readable reason for the current status, e.g. the failure cause when status is UPDATE_FAILED.",
)
post_inference_hooks: Optional[List[str]] = Field(default=None)
default_callback_url: Optional[HttpUrlStr] = Field(default=None)
default_callback_auth: Optional[CallbackAuth] = Field(default=None)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""add status_reason column

Revision ID: c4d5e6f7a8b9
Revises: a1b2c3d4e5f6
Create Date: 2026-06-16 12:00:00.000000

"""
import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
# `revision` is this migration's own id; `down_revision` is the parent
# migration it is applied on top of (the "Revises" id in the module docstring).
revision = 'c4d5e6f7a8b9'
down_revision = 'a1b2c3d4e5f6'
# No branch labels and no dependencies on other revisions are declared.
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add the nullable ``status_reason`` text column to ``hosted_model_inference.endpoints``."""
    # Nullable, so existing rows need no backfill.
    status_reason_column = sa.Column('status_reason', sa.Text, nullable=True)
    op.add_column('endpoints', status_reason_column, schema='hosted_model_inference')


def downgrade() -> None:
    """Drop the ``status_reason`` column from ``hosted_model_inference.endpoints``."""
    op.drop_column('endpoints', 'status_reason', schema='hosted_model_inference')
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,9 @@ class Endpoint(Base):
# Endpoints should eventually end up as READY barring any bugs.
# EndpointStatus.ready.value
endpoint_status = Column(Text, default="READY")
# Human-readable reason for the current status, set when a build fails (status
# UPDATE_FAILED) so the failure cause can be surfaced to API consumers.
status_reason = Column(Text, nullable=True)
current_bundle = relationship("Bundle")
owner = Column(String(SHORT_STRING))
public_inference = Column(Boolean, default=False)
Expand All @@ -484,6 +487,7 @@ def __init__(
endpoint_type: str = "async",
destination: Optional[str] = None,
endpoint_status: Optional[str] = "READY", # EndpointStatus.ready.value
status_reason: Optional[str] = None,
owner: Optional[str] = None,
public_inference: Optional[bool] = False,
task_expires_seconds: Optional[int] = None,
Expand All @@ -498,6 +502,7 @@ def __init__(
self.endpoint_type = endpoint_type
self.destination = destination
self.endpoint_status = endpoint_status
self.status_reason = status_reason
self.owner = owner
self.public_inference = public_inference
self.task_expires_seconds = task_expires_seconds
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class ModelEndpointRecord(OwnedEntity):
endpoint_type: ModelEndpointType
destination: str
status: ModelEndpointStatus
status_reason: Optional[str] = None
current_model_bundle: ModelBundle
owner: str
public_inference: Optional[bool] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def model_endpoint_entity_to_get_model_endpoint_response(
metadata=model_endpoint.record.metadata,
bundle_name=model_endpoint.record.current_model_bundle.name,
status=model_endpoint.record.status,
status_reason=model_endpoint.record.status_reason,
post_inference_hooks=post_inference_hooks,
default_callback_url=default_callback_url, # type: ignore
default_callback_auth=default_callback_auth,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def translate_model_endpoint_orm_to_model_endpoint_record(
endpoint_type=model_endpoint_orm.endpoint_type,
destination=model_endpoint_orm.destination,
status=model_endpoint_orm.endpoint_status,
status_reason=model_endpoint_orm.status_reason,
current_model_bundle=current_model_bundle,
public_inference=model_endpoint_orm.public_inference,
task_expires_seconds=model_endpoint_orm.task_expires_seconds,
Expand Down Expand Up @@ -121,6 +122,7 @@ async def create_model_endpoint_record(
creation_task_id: str,
status: str,
owner: str,
status_reason: Optional[str] = None,
public_inference: Optional[bool] = False,
task_expires_seconds: Optional[int] = None,
queue_message_timeout_seconds: Optional[int] = None,
Expand All @@ -134,6 +136,7 @@ async def create_model_endpoint_record(
destination=destination,
creation_task_id=creation_task_id,
endpoint_status=status,
status_reason=status_reason,
owner=owner,
public_inference=public_inference,
task_expires_seconds=task_expires_seconds,
Expand Down Expand Up @@ -310,6 +313,8 @@ async def update_model_endpoint_record(
creation_task_id: Optional[str] = None,
destination: Optional[str] = None,
status: Optional[str] = None,
status_reason: Optional[str] = None,
clear_status_reason: bool = False,
public_inference: Optional[bool] = None,
task_expires_seconds: Optional[int] = None,
queue_message_timeout_seconds: Optional[int] = None,
Expand All @@ -330,11 +335,16 @@ async def update_model_endpoint_record(
creation_task_id=creation_task_id,
destination=destination,
endpoint_status=status,
status_reason=status_reason,
last_updated_at=datetime.utcnow(),
public_inference=public_inference,
task_expires_seconds=task_expires_seconds,
queue_message_timeout_seconds=queue_message_timeout_seconds,
)
# `status_reason=None` leaves the value unchanged (like every other field);
# callers reset it to NULL via the explicit clear_status_reason flag.
if clear_status_reason:
update_kwargs["status_reason"] = None
await OrmModelEndpoint.update_by_name_owner(
session=session,
name=model_endpoint_orm.name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ async def create_model_endpoint_record(
creation_task_id: str,
status: str,
owner: str,
status_reason: Optional[str] = None,
public_inference: Optional[bool] = False,
task_expires_seconds: Optional[int] = None,
queue_message_timeout_seconds: Optional[int] = None,
Expand All @@ -67,6 +68,8 @@ async def create_model_endpoint_record(
status: A status field on the endpoint, keeps track of endpoint state,
used to coordinate edit operations on the endpoint
owner: Team who owns endpoint
status_reason: Human-readable reason for the current status (e.g. the failure
cause when status is UPDATE_FAILED)
public_inference: Whether the endpoint is publicly accessible
task_expires_seconds: For async endpoints, how long a task can wait in queue before expiring
queue_message_timeout_seconds: For async endpoints, queue message visibility/lock timeout
Expand All @@ -85,6 +88,8 @@ async def update_model_endpoint_record(
creation_task_id: Optional[str] = None,
destination: Optional[str] = None,
status: Optional[str] = None,
status_reason: Optional[str] = None,
clear_status_reason: bool = False,
public_inference: Optional[bool] = None,
task_expires_seconds: Optional[int] = None,
queue_message_timeout_seconds: Optional[int] = None,
Expand All @@ -99,6 +104,12 @@ async def update_model_endpoint_record(
creation_task_id: The task id corresponding to endpoint creation
destination: The destination where async tasks should be sent.
status: Status field on the endpoint, used to coordinate endpoint edit operations
status_reason: Human-readable reason for the current status (e.g. the failure
cause when status is UPDATE_FAILED). None leaves the existing value
unchanged, consistent with the other fields here; pass
clear_status_reason=True to reset it to NULL.
clear_status_reason: When True, reset status_reason to NULL (e.g. once an
endpoint recovers to READY).
public_inference: Whether the endpoint is publicly accessible
task_expires_seconds: For async endpoints, how long a task can wait in queue before expiring
queue_message_timeout_seconds: For async endpoints, queue message visibility/lock timeout
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
)
from model_engine_server.domain.exceptions import (
DockerBuildFailedException,
DomainException,
EndpointResourceInfraException,
)
from model_engine_server.domain.gateways import MonitoringMetricsGateway
Expand Down Expand Up @@ -83,6 +84,30 @@

INITIAL_K8S_CACHE_TTL_SECONDS: int = 180
MAX_IMAGE_TAG_LEN = 128
# Cap the persisted failure reason so a verbose traceback/string can't blow up the column.
MAX_STATUS_REASON_LEN = 500
# Generic reason used when the failure cause isn't a known user-safe exception, so we
# don't leak raw infra error text (k8s/AWS/DB internals) into the public API.
GENERIC_STATUS_REASON = "Endpoint deployment failed due to an internal error."


def _status_reason_from_error(error: BaseException) -> str:
    """Turn a build error into a status_reason string.

    Only the message of a ``DomainException`` is passed through; every other
    exception type, and a ``DomainException`` whose message is empty or
    whitespace-only, yields ``GENERIC_STATUS_REASON``.

    Args:
        error: The exception that caused the failure.

    Returns:
        The exception message with each run of whitespace collapsed to a single
        space, cut to at most ``MAX_STATUS_REASON_LEN`` characters (ending in
        "…" when cut), or ``GENERIC_STATUS_REASON``.
    """
    if not isinstance(error, DomainException):
        return GENERIC_STATUS_REASON

    # split()/join collapses newlines, tabs and repeated spaces and trims both ends.
    message = " ".join(str(error).split())
    if not message:
        return GENERIC_STATUS_REASON
    if len(message) <= MAX_STATUS_REASON_LEN:
        return message

    # Reserve one character for the ellipsis so the result never exceeds the cap.
    return message[: MAX_STATUS_REASON_LEN - 1].rstrip() + "…"

RESTRICTED_ENV_VARS_KEYS = {
"BASE": [
Expand Down Expand Up @@ -342,15 +367,19 @@ async def build_endpoint(
model_endpoint_id=endpoint_id,
destination=create_or_update_response.destination,
status=ModelEndpointStatus.READY,
# Clear any reason from a prior failed attempt now that we're healthy.
clear_status_reason=True,
)

except Exception as error: # noqa
log_error("Failed endpoint build process!")
# Update status as failed endpoint creation on unhandled error
# Update status as failed endpoint creation on unhandled error, recording
# the cause so it can be surfaced to API consumers.
try:
await self.model_endpoint_record_repository.update_model_endpoint_record(
model_endpoint_id=endpoint_id,
status=ModelEndpointStatus.UPDATE_FAILED,
status_reason=_status_reason_from_error(error),
)
except Exception as error_update:
log_error("Failed to update endpoint build status to FAILED")
Expand Down Expand Up @@ -714,10 +743,10 @@ async def _build_image(
f"Image build failed for endpoint {model_endpoint_name}, user {user_id}"
)

await self.model_endpoint_record_repository.update_model_endpoint_record(
model_endpoint_id=build_endpoint_request.model_endpoint_record.id,
status=ModelEndpointStatus.UPDATE_FAILED,
)
# Note: status (and status_reason) is set by the outer except handler
# from the DockerBuildFailedException raised below, so we don't write
# the record here — doing so would be overwritten. The exception message
# is kept user-safe (no internal ids) since it becomes the status_reason.

if s3_logs_location is not None:
help_url = self.filesystem_gateway.generate_signed_url(
Expand Down Expand Up @@ -753,7 +782,11 @@ async def _build_image(
users=[user_id],
)

raise DockerBuildFailedException(f"Image build failed ({endpoint_id=})")
logger_adapter.error(f"Image build failed ({endpoint_id=})")
raise DockerBuildFailedException(
"Image build failed. Check that the bundle's image and "
"dependencies are valid and accessible."
)

else:
self.monitoring_metrics_gateway.emit_image_build_cache_hit_metric(image_type)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ async def delete_model_endpoint(self, model_endpoint_id: str) -> None:
await self.model_endpoint_record_repository.update_model_endpoint_record(
model_endpoint_id=model_endpoint_id,
status=ModelEndpointStatus.UPDATE_FAILED,
status_reason="Failed to delete the endpoint's infrastructure.",
)
raise EndpointDeleteFailedException

Expand Down
25 changes: 24 additions & 1 deletion model-engine/tests/unit/domain/test_model_endpoint_use_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@
STORAGE_LIMIT,
)
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.domain.entities import ModelBundle, ModelEndpoint
from model_engine_server.domain.entities import (
ModelBundle,
ModelEndpoint,
ModelEndpointStatus,
)
from model_engine_server.domain.exceptions import (
EndpointBillingTagsMalformedException,
EndpointLabelsException,
Expand Down Expand Up @@ -780,6 +784,25 @@ async def test_get_model_endpoint_use_case_success(
assert response_2.resource_state.nodes_per_worker == 2


@pytest.mark.asyncio
async def test_get_model_endpoint_use_case_surfaces_status_reason(
    test_api_key: str,
    fake_model_endpoint_service,
    model_endpoint_1: ModelEndpoint,
):
    """The status_reason stored on a failed endpoint's record is returned by the get-by-id use case."""
    expected_reason = "CUDA out of memory"

    # Arrange: register an endpoint whose record is marked failed with a reason.
    model_endpoint_1.record.status = ModelEndpointStatus.UPDATE_FAILED
    model_endpoint_1.record.status_reason = expected_reason
    fake_model_endpoint_service.add_model_endpoint(model_endpoint_1)
    use_case = GetModelEndpointByIdV1UseCase(model_endpoint_service=fake_model_endpoint_service)
    requesting_user = User(
        user_id=test_api_key,
        team_id=test_api_key,
        is_privileged_user=True,
    )

    # Act
    response = await use_case.execute(
        user=requesting_user,
        model_endpoint_id=model_endpoint_1.record.id,
    )

    # Assert: both the status and its reason are present in the response.
    assert response.status == ModelEndpointStatus.UPDATE_FAILED
    assert response.status_reason == expected_reason


@pytest.mark.asyncio
async def test_get_model_endpoint_use_case_same_team_finds_endpoint(
test_api_key_user_on_other_team: str,
Expand Down