Skip to content

Commit 41b1a32

Browse files
declan-scale and claude committed
feat(harness): mark derived tool spans as errored from ToolResponseContent.is_error
Now that the SDK carries ToolResponseContent.is_error (AGX1-371), thread it through the harness surface: CloseSpan gains an is_error field, the span deriver populates it when closing a tool span on a ToolResponseContent, and the tracer records the status on span.data (Span has no dedicated error field; None means the harness reported no status, so data is left untouched).

This preserves tool-error tracing fidelity for harnesses that report tool failures (e.g. golden agent's ToolCompleted.is_error) once they move onto the unified surface, instead of silently dropping the status.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent cf01e70 commit 41b1a32

5 files changed

Lines changed: 67 additions & 1 deletion

File tree

src/agentex/lib/core/harness/span_derivation.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,14 @@ def _on_full(self, event: StreamTaskMessageFull) -> list[SpanSignal]:
124124
tcid = content.tool_call_id
125125
if tcid in self._open_tool_ids:
126126
self._open_tool_ids.pop(tcid, None)
127-
return [CloseSpan(key=tcid, output=content.content, is_complete=True)]
127+
return [
128+
CloseSpan(
129+
key=tcid,
130+
output=content.content,
131+
is_complete=True,
132+
is_error=content.is_error,
133+
)
134+
]
128135
return []
129136

130137
def _on_done(self, event: StreamTaskMessageDone) -> list[SpanSignal]:

src/agentex/lib/core/harness/tracer.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,12 @@ async def handle(self, signal: SpanSignal) -> None:
7474
# end_span(trace_id, span, start_to_close_timeout, heartbeat_timeout, retry_policy)
7575
# It does not accept an output= kwarg.
7676
span.output = signal.output
77+
# Tool failure status (ToolResponseContent.is_error) is recorded
78+
# on span.data when the harness reports one; Span has no dedicated
79+
# error field. None means no status was reported, so leave data alone.
80+
if signal.is_error is not None:
81+
data = span.data if isinstance(span.data, dict) else {}
82+
span.data = {**data, "is_error": signal.is_error}
7783
await self._tracing.end_span(
7884
trace_id=self.trace_id,
7985
span=span,

src/agentex/lib/core/harness/types.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ class CloseSpan:
4242
key: str
4343
output: Any = None
4444
is_complete: bool = True # False when closed by flush() without a result
45+
is_error: bool | None = None # tool failure status; None when the harness reports no status
4546

4647

4748
SpanSignal = Union[OpenSpan, CloseSpan]

tests/lib/core/harness/test_span_derivation.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,35 @@ def test_single_tool_opens_on_done_closes_on_response():
5757
OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}),
5858
CloseSpan(key="call_1", output="files", is_complete=True),
5959
]
60+
# No status reported -> CloseSpan carries is_error=None.
61+
assert sigs[1].is_error is None
62+
63+
64+
def test_tool_response_is_error_propagates_to_close_span():
65+
"""ToolResponseContent.is_error flows onto the CloseSpan so a derived tool
66+
span can be marked as a failure (AGX1-371)."""
67+
d = SpanDeriver()
68+
events = [
69+
_tool_req(0, "call_err", "Bash", {"cmd": "false"}),
70+
StreamTaskMessageDone(type="done", index=0),
71+
StreamTaskMessageFull(
72+
type="full",
73+
index=1,
74+
content=ToolResponseContent(
75+
type="tool_response",
76+
author="agent",
77+
tool_call_id="call_err",
78+
name="Bash",
79+
content="boom",
80+
is_error=True,
81+
),
82+
),
83+
]
84+
sigs = _signals(d, events)
85+
assert sigs == [
86+
OpenSpan(key="call_err", kind="tool", name="Bash", input={"cmd": "false"}),
87+
CloseSpan(key="call_err", output="boom", is_complete=True, is_error=True),
88+
]
6089

6190

6291
def test_reasoning_opens_on_start_closes_on_done():

tests/lib/core/harness/test_tracer.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,19 +10,22 @@ class _FakeSpan:
1010
def __init__(self, name):
1111
self.name = name
1212
self.output = None
13+
self.data = None
1314

1415

1516
class _FakeTracing:
1617
def __init__(self):
1718
self.started = []
1819
self.ended = []
20+
self.ended_spans = []
1921

2022
async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None):
2123
self.started.append((name, parent_id, input))
2224
return _FakeSpan(name)
2325

2426
async def end_span(self, *, trace_id, span):
2527
self.ended.append((span.name, span.output))
28+
self.ended_spans.append(span)
2629

2730

2831
@pytest.mark.asyncio
@@ -35,6 +38,26 @@ async def test_open_then_close_starts_and_ends_span():
3538
assert fake.ended == [("Bash", "files")]
3639

3740

41+
@pytest.mark.asyncio
42+
async def test_close_records_is_error_on_span_data():
43+
"""A CloseSpan carrying is_error records the status on span.data (AGX1-371)."""
44+
fake = _FakeTracing()
45+
tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake)
46+
await tracer.handle(OpenSpan(key="call_err", kind="tool", name="Bash", input={}))
47+
await tracer.handle(CloseSpan(key="call_err", output="boom", is_complete=True, is_error=True))
48+
assert fake.ended_spans[0].data == {"is_error": True}
49+
50+
51+
@pytest.mark.asyncio
52+
async def test_close_without_status_leaves_span_data_untouched():
53+
"""is_error=None (no status reported) must not write to span.data."""
54+
fake = _FakeTracing()
55+
tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake)
56+
await tracer.handle(OpenSpan(key="call_1", kind="tool", name="Bash", input={}))
57+
await tracer.handle(CloseSpan(key="call_1", output="files", is_complete=True))
58+
assert fake.ended_spans[0].data is None
59+
60+
3861
@pytest.mark.asyncio
3962
async def test_no_trace_id_is_noop():
4063
fake = _FakeTracing()

0 commit comments

Comments
 (0)