@@ -428,8 +428,16 @@ export async function runStreamLoop(
428428 // Real OTel span for Tempo/Grafana. Stamped aggregate-only so
429429 // there is no per-chunk OTel cost — one span per read loop with
430430 // integer counters, plus a bounded set of events.
431+ //
432+ // `context.streamComplete` is the caller-visible "this leg was
433+ // supposed to be final" signal (set by the complete/error/run-end
434+ // handlers). When it's true but we didn't see a terminal event on
435+ // the wire, that's the real "disappeared response" bug — as
436+ // opposed to a normal tool-pause leg which ends with
437+ // streamComplete=false and terminal_event_seen=false and is fine.
431438 stampSseReadLoopSpan ( bodyStart , counters , endedOn , fetchUrl , pathname , {
432439 idleGapEventThresholdMs : IDLE_GAP_EVENT_THRESHOLD_MS ,
440+ expectedTerminal : context . streamComplete ,
433441 } )
434442 }
435443}
@@ -475,7 +483,7 @@ function stampSseReadLoopSpan(
475483 closeReason : string ,
476484 fetchUrl : string ,
477485 pathname : string ,
478- opts : { idleGapEventThresholdMs : number }
486+ opts : { idleGapEventThresholdMs : number ; expectedTerminal : boolean }
479487) : void {
480488 // Translate performance.now() values into wall-clock Date values so
481489 // the span's timestamps land in real time (OTel accepts both, but we
@@ -484,6 +492,16 @@ function stampSseReadLoopSpan(
484492 const nowWall = Date . now ( )
485493 const startWall = nowWall - ( nowPerf - startPerfMs )
486494
495+ const terminalEventSeen = counters . eventsByType . complete > 0
496+ // `terminal_event_missing` is the single-attribute dashboard signal
497+ // for the "disappeared response" bug class: the caller considered
498+ // this leg to be the final one (`context.streamComplete === true`)
499+ // but no `complete` event arrived on the wire. Tool-pause legs have
500+ // expectedTerminal=false and never trip this, so dashboards can
501+ // filter on `{ .copilot.sse.terminal_event_missing = true }` without
502+ // false positives from tool-pause legs.
503+ const terminalEventMissing = opts . expectedTerminal && ! terminalEventSeen
504+
487505 const tracer = getCopilotTracer ( )
488506 const span = tracer . startSpan ( TraceSpan . CopilotSseReadLoop , {
489507 startTime : startWall ,
@@ -503,29 +521,43 @@ function stampSseReadLoopSpan(
503521 [ TraceAttr . CopilotSseEventsComplete ] : counters . eventsByType . complete ,
504522 [ TraceAttr . CopilotSseLongestIdleGapMs ] : Math . round ( counters . longestIdleGapMs ) ,
505523 [ TraceAttr . CopilotSseCloseReason ] : closeReason ,
506- [ TraceAttr . CopilotSseTerminalEventSeen ] : counters . eventsByType . complete > 0 ,
524+ [ TraceAttr . CopilotSseExpectedTerminal ] : opts . expectedTerminal ,
525+ [ TraceAttr . CopilotSseTerminalEventSeen ] : terminalEventSeen ,
526+ [ TraceAttr . CopilotSseTerminalEventMissing ] : terminalEventMissing ,
507527 } ,
508528 } )
509529
510530 if ( counters . firstEventMs !== undefined ) {
511531 span . setAttribute ( TraceAttr . CopilotSseFirstEventMs , counters . firstEventMs )
512- span . addEvent ( TraceEvent . CopilotSseFirstEvent , {
513- [ TraceAttr . CopilotSseFirstEventMs ] : counters . firstEventMs ,
514- } )
532+ // Anchor the event to the moment the first SSE event was actually
533+ // received (startWall + firstEventMs), not `now`, so a trace
534+ // waterfall shows the diamond at the TTFT point — not at span end.
535+ span . addEvent (
536+ TraceEvent . CopilotSseFirstEvent ,
537+ { [ TraceAttr . CopilotSseFirstEventMs ] : counters . firstEventMs } ,
538+ startWall + counters . firstEventMs
539+ )
515540 }
516541 if ( counters . longestIdleGapMs >= opts . idleGapEventThresholdMs ) {
517542 span . addEvent ( TraceEvent . CopilotSseIdleGapExceeded , {
518543 [ TraceAttr . CopilotSseLongestIdleGapMs ] : Math . round ( counters . longestIdleGapMs ) ,
519544 } )
520545 }
521- if ( counters . eventsByType . complete > 0 ) {
546+ if ( terminalEventSeen ) {
522547 span . addEvent ( TraceEvent . CopilotSseTerminalEventReceived )
523548 }
524549
525550 // Span status: only mark ERROR for real failures. User aborts and
526551 // clean terminals stay UNSET so dashboards filtering `status=error`
527- // don't light up for normal cancellations.
528- if (
552+ // don't light up for normal cancellations. Tool-pause legs (caller
553+ // didn't set streamComplete) are NOT errors even though they have
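554+ // no complete event. A terminal event the caller expected but never saw is checked first and is always ERROR, whatever the closeReason.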
555+ if ( terminalEventMissing ) {
556+ span . setStatus ( {
557+ code : SpanStatusCode . ERROR ,
558+ message : 'SSE read loop finished without terminal event (caller expected one)' ,
559+ } )
560+ } else if (
529561 closeReason !== CopilotSseCloseReason . Terminal &&
530562 closeReason !== CopilotSseCloseReason . Aborted
531563 ) {
0 commit comments