Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,10 @@ class GetModelEndpointV1Response(BaseModel):
metadata: Optional[Dict[str, Any]] = Field(default=None) # TODO: JSON type
bundle_name: str
status: ModelEndpointStatus
status_reason: Optional[str] = Field(
default=None,
description="Human-readable reason for the current status, e.g. the failure cause when status is UPDATE_FAILED.",
)
post_inference_hooks: Optional[List[str]] = Field(default=None)
default_callback_url: Optional[HttpUrlStr] = Field(default=None)
default_callback_auth: Optional[CallbackAuth] = Field(default=None)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""add status_reason column

Revision ID: c4d5e6f7a8b9
Revises: a1b2c3d4e5f6
Create Date: 2026-06-16 12:00:00.000000

"""
import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = 'c4d5e6f7a8b9'
down_revision = 'a1b2c3d4e5f6'
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add the nullable ``status_reason`` text column to ``hosted_model_inference.endpoints``."""
    # Nullable so the column can be added without supplying a value for existing rows.
    status_reason_column = sa.Column('status_reason', sa.Text, nullable=True)
    op.add_column('endpoints', status_reason_column, schema='hosted_model_inference')


def downgrade() -> None:
    """Drop the ``status_reason`` column from ``hosted_model_inference.endpoints``."""
    # Exact inverse of upgrade(): same table, same column, same schema.
    op.drop_column('endpoints', 'status_reason', schema='hosted_model_inference')
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,9 @@ class Endpoint(Base):
# Endpoints should eventually end up as READY barring any bugs.
# EndpointStatus.ready.value
endpoint_status = Column(Text, default="READY")
# Human-readable reason for the current status, set when a build fails (status
# UPDATE_FAILED) so the failure cause can be surfaced to API consumers.
status_reason = Column(Text, nullable=True)
current_bundle = relationship("Bundle")
owner = Column(String(SHORT_STRING))
public_inference = Column(Boolean, default=False)
Expand All @@ -484,6 +487,7 @@ def __init__(
endpoint_type: str = "async",
destination: Optional[str] = None,
endpoint_status: Optional[str] = "READY", # EndpointStatus.ready.value
status_reason: Optional[str] = None,
owner: Optional[str] = None,
public_inference: Optional[bool] = False,
task_expires_seconds: Optional[int] = None,
Expand All @@ -498,6 +502,7 @@ def __init__(
self.endpoint_type = endpoint_type
self.destination = destination
self.endpoint_status = endpoint_status
self.status_reason = status_reason
self.owner = owner
self.public_inference = public_inference
self.task_expires_seconds = task_expires_seconds
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class ModelEndpointRecord(OwnedEntity):
endpoint_type: ModelEndpointType
destination: str
status: ModelEndpointStatus
status_reason: Optional[str] = None
current_model_bundle: ModelBundle
owner: str
public_inference: Optional[bool] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def model_endpoint_entity_to_get_model_endpoint_response(
metadata=model_endpoint.record.metadata,
bundle_name=model_endpoint.record.current_model_bundle.name,
status=model_endpoint.record.status,
status_reason=model_endpoint.record.status_reason,
post_inference_hooks=post_inference_hooks,
default_callback_url=default_callback_url, # type: ignore
default_callback_auth=default_callback_auth,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def translate_model_endpoint_orm_to_model_endpoint_record(
endpoint_type=model_endpoint_orm.endpoint_type,
destination=model_endpoint_orm.destination,
status=model_endpoint_orm.endpoint_status,
status_reason=model_endpoint_orm.status_reason,
current_model_bundle=current_model_bundle,
public_inference=model_endpoint_orm.public_inference,
task_expires_seconds=model_endpoint_orm.task_expires_seconds,
Expand Down Expand Up @@ -121,6 +122,7 @@ async def create_model_endpoint_record(
creation_task_id: str,
status: str,
owner: str,
status_reason: Optional[str] = None,
public_inference: Optional[bool] = False,
task_expires_seconds: Optional[int] = None,
queue_message_timeout_seconds: Optional[int] = None,
Expand All @@ -134,6 +136,7 @@ async def create_model_endpoint_record(
destination=destination,
creation_task_id=creation_task_id,
endpoint_status=status,
status_reason=status_reason,
owner=owner,
public_inference=public_inference,
task_expires_seconds=task_expires_seconds,
Expand Down Expand Up @@ -310,6 +313,7 @@ async def update_model_endpoint_record(
creation_task_id: Optional[str] = None,
destination: Optional[str] = None,
status: Optional[str] = None,
status_reason: Optional[str] = None,
public_inference: Optional[bool] = None,
task_expires_seconds: Optional[int] = None,
queue_message_timeout_seconds: Optional[int] = None,
Expand All @@ -335,6 +339,11 @@ async def update_model_endpoint_record(
task_expires_seconds=task_expires_seconds,
queue_message_timeout_seconds=queue_message_timeout_seconds,
)
# status_reason is handled separately from dict_not_none so an explicit
# clear (empty string -> NULL) is possible when an endpoint recovers to a
# healthy state, while omitting it (None) leaves any existing reason intact.
if status_reason is not None:
update_kwargs["status_reason"] = status_reason or None
Comment on lines +342 to +346

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Empty-string-as-sentinel is a non-obvious API contract

The update method uses None to mean "leave the existing value unchanged" and "" to mean "clear to NULL". This is documented in a comment but differs from every other nullable parameter in the same method signature (e.g., status, destination, metadata) where None always means "do not update". A future caller who passes status_reason=None intending to clear the field will silently leave a stale failure reason. Consider using a dedicated sentinel (e.g., a module-level CLEAR = object()) or a separate clear_status_reason: bool = False parameter to make the intent explicit.

Prompt To Fix With AI
This is a comment left during a code review.
Path: model-engine/model_engine_server/infra/repositories/db_model_endpoint_record_repository.py
Line: 342-346

Comment:
**Empty-string-as-sentinel is a non-obvious API contract**

The update method uses `None` to mean "leave the existing value unchanged" and `""` to mean "clear to NULL". This is documented in a comment but differs from every other nullable parameter in the same method signature (e.g., `status`, `destination`, `metadata`) where `None` always means "do not update". A future caller who passes `status_reason=None` intending to clear the field will silently leave a stale failure reason. Consider using a dedicated sentinel (e.g., a module-level `CLEAR = object()`) or a separate `clear_status_reason: bool = False` parameter to make the intent explicit.

How can I resolve this? If you propose a fix, please make it concise.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

Fix in Cursor Fix in Claude Code Fix in Codex

await OrmModelEndpoint.update_by_name_owner(
session=session,
name=model_endpoint_orm.name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ async def create_model_endpoint_record(
creation_task_id: str,
status: str,
owner: str,
status_reason: Optional[str] = None,
public_inference: Optional[bool] = False,
task_expires_seconds: Optional[int] = None,
queue_message_timeout_seconds: Optional[int] = None,
Expand All @@ -67,6 +68,8 @@ async def create_model_endpoint_record(
status: A status field on the endpoint, keeps track of endpoint state,
used to coordinate edit operations on the endpoint
owner: Team who owns endpoint
status_reason: Human-readable reason for the current status (e.g. the failure
cause when status is UPDATE_FAILED)
public_inference: Whether the endpoint is publicly accessible
task_expires_seconds: For async endpoints, how long a task can wait in queue before expiring
queue_message_timeout_seconds: For async endpoints, queue message visibility/lock timeout
Expand All @@ -85,6 +88,7 @@ async def update_model_endpoint_record(
creation_task_id: Optional[str] = None,
destination: Optional[str] = None,
status: Optional[str] = None,
status_reason: Optional[str] = None,
public_inference: Optional[bool] = None,
task_expires_seconds: Optional[int] = None,
queue_message_timeout_seconds: Optional[int] = None,
Expand All @@ -99,6 +103,8 @@ async def update_model_endpoint_record(
creation_task_id: The task id corresponding to endpoint creation
destination: The destination where async tasks should be sent.
status: Status field on the endpoint, used to coordinate endpoint edit operations
status_reason: Human-readable reason for the current status (e.g. the failure
cause when status is UPDATE_FAILED)
public_inference: Whether the endpoint is publicly accessible
task_expires_seconds: For async endpoints, how long a task can wait in queue before expiring
queue_message_timeout_seconds: For async endpoints, queue message visibility/lock timeout
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,16 @@

INITIAL_K8S_CACHE_TTL_SECONDS: int = 180
MAX_IMAGE_TAG_LEN = 128
# Cap the persisted failure reason so a verbose traceback/string can't blow up the column.
MAX_STATUS_REASON_LEN = 500


def _sanitize_status_reason(message: str) -> str:
"""Collapse whitespace and cap length for a status_reason persisted on an endpoint."""
collapsed = " ".join(message.split())
if len(collapsed) > MAX_STATUS_REASON_LEN:
return collapsed[: MAX_STATUS_REASON_LEN - 1].rstrip() + "…"
return collapsed

RESTRICTED_ENV_VARS_KEYS = {
"BASE": [
Expand Down Expand Up @@ -342,15 +352,19 @@ async def build_endpoint(
model_endpoint_id=endpoint_id,
destination=create_or_update_response.destination,
status=ModelEndpointStatus.READY,
# Clear any reason from a prior failed attempt now that we're healthy.
status_reason="",
)

except Exception as error: # noqa
log_error("Failed endpoint build process!")
# Update status as failed endpoint creation on unhandled error, recording
# the cause so it can be surfaced to API consumers.
try:
await self.model_endpoint_record_repository.update_model_endpoint_record(
model_endpoint_id=endpoint_id,
status=ModelEndpointStatus.UPDATE_FAILED,
status_reason=_sanitize_status_reason(str(error)),
)
Comment on lines +367 to 368

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 security Unfiltered str(error) may leak sensitive infrastructure details

The broad except Exception block catches all errors from infrastructure operations (Kubernetes API calls, AWS SDK calls, database operations, etc.). Passing str(error) directly as status_reason can expose internal connection strings, internal IP addresses, resource names, or database schema details to API consumers via the public GET /model-endpoint response. The _sanitize_status_reason helper only collapses whitespace and caps length — it performs no content filtering. Consider catching specific known exception types and mapping them to safe, user-facing messages, or stripping exception types from a known-internal class hierarchy before persisting the message.

Prompt To Fix With AI
This is a comment left during a code review.
Path: model-engine/model_engine_server/infra/services/live_endpoint_builder_service.py
Line: 367-368

Comment:
**Unfiltered `str(error)` may leak sensitive infrastructure details**

The broad `except Exception` block catches all errors from infrastructure operations (Kubernetes API calls, AWS SDK calls, database operations, etc.). Passing `str(error)` directly as `status_reason` can expose internal connection strings, internal IP addresses, resource names, or database schema details to API consumers via the public `GET /model-endpoint` response. The `_sanitize_status_reason` helper only collapses whitespace and caps length — it performs no content filtering. Consider catching specific known exception types and mapping them to safe, user-facing messages, or stripping exception types from a known-internal class hierarchy before persisting the message.

How can I resolve this? If you propose a fix, please make it concise.

Fix in Cursor Fix in Claude Code Fix in Codex

except Exception as error_update:
log_error("Failed to update endpoint build status to FAILED")
Expand Down Expand Up @@ -717,6 +731,8 @@ async def _build_image(
await self.model_endpoint_record_repository.update_model_endpoint_record(
model_endpoint_id=build_endpoint_request.model_endpoint_record.id,
status=ModelEndpointStatus.UPDATE_FAILED,
status_reason="Image build failed. Check that the bundle's image "
"and dependencies are valid and accessible.",
)

if s3_logs_location is not None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ async def delete_model_endpoint(self, model_endpoint_id: str) -> None:
await self.model_endpoint_record_repository.update_model_endpoint_record(
model_endpoint_id=model_endpoint_id,
status=ModelEndpointStatus.UPDATE_FAILED,
status_reason="Failed to delete the endpoint's infrastructure.",
)
raise EndpointDeleteFailedException

Expand Down
25 changes: 24 additions & 1 deletion model-engine/tests/unit/domain/test_model_endpoint_use_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@
STORAGE_LIMIT,
)
from model_engine_server.core.auth.authentication_repository import User
from model_engine_server.domain.entities import ModelBundle, ModelEndpoint
from model_engine_server.domain.entities import (
ModelBundle,
ModelEndpoint,
ModelEndpointStatus,
)
from model_engine_server.domain.exceptions import (
EndpointBillingTagsMalformedException,
EndpointLabelsException,
Expand Down Expand Up @@ -780,6 +784,25 @@ async def test_get_model_endpoint_use_case_success(
assert response_2.resource_state.nodes_per_worker == 2


@pytest.mark.asyncio
async def test_get_model_endpoint_use_case_surfaces_status_reason(
    test_api_key: str,
    fake_model_endpoint_service,
    model_endpoint_1: ModelEndpoint,
):
    """The status_reason stored on a failed endpoint's record appears in the get-by-id response."""
    expected_reason = "CUDA out of memory"

    # Arrange: mark the endpoint record as failed with a reason, then register it.
    record = model_endpoint_1.record
    record.status = ModelEndpointStatus.UPDATE_FAILED
    record.status_reason = expected_reason
    fake_model_endpoint_service.add_model_endpoint(model_endpoint_1)

    privileged_user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
    get_use_case = GetModelEndpointByIdV1UseCase(model_endpoint_service=fake_model_endpoint_service)

    # Act: fetch the endpoint through the use case.
    response = await get_use_case.execute(user=privileged_user, model_endpoint_id=record.id)

    # Assert: both the failed status and its reason are returned unchanged.
    assert response.status == ModelEndpointStatus.UPDATE_FAILED
    assert response.status_reason == expected_reason


@pytest.mark.asyncio
async def test_get_model_endpoint_use_case_same_team_finds_endpoint(
test_api_key_user_on_other_team: str,
Expand Down