Skip to content

Commit 7f6d70a

Browse files
authored
test(harness): shared test fakes + conformance determinism fix (#427)
1 parent 10d22a2 commit 7f6d70a

13 files changed

Lines changed: 199 additions & 441 deletions

docs/superpowers/plans/2026-06-18-unified-harness-surface-pr4-pydantic-ai.md

Lines changed: 0 additions & 246 deletions
This file was deleted.

tests/lib/core/harness/_fakes.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""Shared test doubles for the unified harness test suites.
2+
3+
A single superset implementation of the in-memory tracing backend used across
4+
the harness tests. Three recording shapes were previously duplicated:
5+
6+
- Shape-1 (richest): ``started`` = ``(name, parent_id, input)`` 3-tuples,
7+
``ended`` = ``(name, output)`` 2-tuples, plus an ``ended_spans`` list of the
8+
closed ``FakeSpan`` objects (which carry ``.name``, ``.output``, ``.data``).
9+
- Shape-2: ``started`` = ``(name, parent_id)`` 2-tuples, ``ended`` =
10+
``(name, output)``.
11+
- Shape-3: ``started`` = bare names, ``ended`` = bare outputs.
12+
13+
``FakeTracing`` records the richest (shape-1) form and exposes read-only
14+
convenience properties (``started_names``, ``started_pairs``,
15+
``ended_outputs``) so shape-2 and shape-3 assertions stay clean.
16+
"""
17+
18+
from __future__ import annotations
19+
20+
from typing import Any
21+
22+
23+
class FakeSpan:
    """Minimal in-memory span object used by the harness test doubles.

    Carries three plain, mutable attributes: ``name`` (set at construction)
    and ``output`` / ``data`` (both initialised to ``None``). This class never
    assigns ``output`` or ``data`` itself; whatever holds the span is expected
    to set them before the span is inspected.
    """

    def __init__(self, name: str) -> None:
        self.name = name
        # Unset until some other code assigns them on the instance.
        self.output: Any = None
        self.data: Any = None

    def __repr__(self) -> str:
        # A readable repr keeps assertion diffs over lists of spans legible
        # instead of showing opaque ``<FakeSpan object at 0x...>`` entries.
        return (
            f"{type(self).__name__}(name={self.name!r}, "
            f"output={self.output!r}, data={self.data!r})"
        )
28+
29+
30+
class FakeTracing:
    """In-memory tracing backend that records every span start and end.

    Recorded state:

    - ``started``: one ``(name, parent_id, input)`` tuple per ``start_span``.
    - ``ended``: one ``(name, output)`` tuple per ``end_span``.
    - ``ended_spans``: the span objects passed to ``end_span``, in call order.

    The ``started_names``, ``started_pairs`` and ``ended_outputs`` properties
    project those records onto narrower shapes for simpler assertions.
    """

    def __init__(self) -> None:
        self.started: list[tuple[str, Any, Any]] = []
        self.ended: list[tuple[str, Any]] = []
        self.ended_spans: list[FakeSpan] = []

    async def start_span(
        self,
        *,
        trace_id: str,
        name: str,
        input: Any = None,
        parent_id: Any = None,
        data: Any = None,
        task_id: Any = None,
    ) -> FakeSpan:
        """Record a span start and return a fresh ``FakeSpan`` named ``name``.

        Only ``name``, ``parent_id`` and ``input`` are recorded. ``trace_id``,
        ``data`` and ``task_id`` are accepted but neither stored nor copied
        onto the returned span.
        """
        self.started.append((name, parent_id, input))
        return FakeSpan(name)

    async def end_span(self, *, trace_id: str, span: FakeSpan) -> None:
        """Record a span end.

        ``span.name`` and ``span.output`` are snapshotted into ``ended`` at
        call time; the span object itself is kept in ``ended_spans``.
        ``trace_id`` is accepted but not recorded.
        """
        self.ended.append((span.name, span.output))
        self.ended_spans.append(span)

    @property
    def started_names(self) -> list[str]:
        """Names of all started spans, in start order."""
        return [name for (name, _parent, _input) in self.started]

    @property
    def started_pairs(self) -> list[tuple[str, Any]]:
        """``(name, parent_id)`` for every started span, in start order."""
        return [(name, parent) for (name, parent, _input) in self.started]

    @property
    def ended_outputs(self) -> list[Any]:
        """Outputs of all ended spans, in end order."""
        return [output for (_name, output) in self.ended]
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""Conformance-suite test setup.
2+
3+
Eagerly import every per-harness conformance module so each one's module-level
4+
``register(...)`` calls run before any test executes. This makes
5+
``all_fixtures()`` complete and independent of pytest's collection/import order
6+
(the runner documents that cross-module registration order is not guaranteed),
7+
so the cross-harness ``test_span_derivation_is_deterministic`` guard in
8+
``test_conformance.py`` covers the full fixture set even when this directory is
9+
run in isolation.
10+
"""
11+
12+
from __future__ import annotations
13+
14+
# Importing these for their registration side effects only.
15+
from . import (
16+
test_codex_conformance, # noqa: F401
17+
test_openai_conformance, # noqa: F401
18+
test_langgraph_conformance, # noqa: F401
19+
test_claude_code_conformance, # noqa: F401
20+
test_pydantic_ai_conformance, # noqa: F401
21+
)

tests/lib/core/harness/conformance/runner.py

Lines changed: 33 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@
4343
because:
4444
- StreamingTaskMessageContext.close() persists initial_content when no deltas
4545
have been streamed, so the message IS correctly persisted.
46-
- It mirrors the pattern already used by the real _langgraph_async.py harness,
47-
keeping behavioural parity.
46+
- It mirrors the pattern already used by the real langgraph streaming helper
47+
(now in _langgraph_turn.py), keeping behavioural parity.
4848
- Switching to adk.messages.create would require an additional injectable
4949
dependency, adding surface area for no observable benefit.
5050
The conformance test treats this as an ACCEPTABLE envelope difference: at the
@@ -53,18 +53,14 @@
5353
identical because both adapters drive the same SpanDeriver.observe() call
5454
sequence and forward every signal to their tracer.
5555
56-
AGX1-377 fix: auto_send now DELIVERS streamed tool-request messages (Start+Done)
57-
instead of dropping them. The conformance normaliser previously suppressed the
58-
delivery for Start(tool_request)+Done on the yield channel to match auto_send's
59-
old drop behaviour. That suppression is now removed: both channels produce a
60-
LogicalDelivery for a streamed tool_request, and the cross-channel assertion
61-
verifies it is delivered on both.
56+
auto_send DELIVERS streamed tool-request messages (Start+Done): both channels
57+
produce a LogicalDelivery for a streamed tool_request, and the cross-channel
58+
assertion verifies it is delivered on both.
6259
"""
6360

6461
from __future__ import annotations
6562

6663
import json
67-
import types as _types
6864
from typing import Any, NamedTuple, override
6965
from dataclasses import dataclass
7066

@@ -81,6 +77,8 @@
8177
from agentex.types.reasoning_content_delta import ReasoningContentDelta
8278
from agentex.lib.core.harness.span_derivation import SpanDeriver
8379

80+
from .._fakes import FakeTracing
81+
8482

8583
@dataclass
8684
class Fixture:
@@ -99,6 +97,25 @@ def all_fixtures() -> list[Fixture]:
9997
return list(_REGISTRY)
10098

10199

100+
def run_pure_async(coro: Any) -> Any:
    """Drive a *pure* (I/O-free) coroutine to completion without an event loop.

    Conformance fixtures are built at import time so they can parametrize the
    tests below. The fixture-building coroutines only iterate in-memory events
    and never suspend on a real future, so we step them by hand instead of
    ``asyncio.run()``. ``asyncio.run()`` at import raises ``RuntimeError`` when a
    loop is already running (programmatic pytest, a Jupyter kernel, or a
    session-scoped asyncio loop); this driver is unaffected by ambient loop
    state.

    Args:
        coro: A coroutine object expected to finish on its first step.

    Returns:
        The coroutine's return value.

    Raises:
        RuntimeError: If the coroutine suspends instead of finishing. The
            message names the object it suspended on. Exceptions raised
            inside the coroutine itself propagate unchanged.
    """
    try:
        # A single step is enough: a pure coroutine runs straight through and
        # signals completion by raising StopIteration carrying its result.
        pending = coro.send(None)
    except StopIteration as stop:
        return stop.value
    # The coroutine yielded, i.e. it is waiting on something. Close it so it is
    # not left half-run, then fail loudly with what it was waiting on.
    coro.close()
    raise RuntimeError(
        "conformance fixture build unexpectedly suspended on real I/O"
        f" (awaiting {pending!r})"
    )
117+
118+
102119
def derive_all(events: list[StreamTaskMessage]) -> list[SpanSignal]:
103120
d = SpanDeriver()
104121
out: list[SpanSignal] = []
@@ -145,8 +162,8 @@ def _yield_logical_deliveries(events: list[StreamTaskMessage]) -> list[LogicalDe
145162
- reasoning: initial_content.summary joined (from Start) prepended to
146163
accumulated reasoning-content deltas (this catches a channel that drops
147164
the summary)
148-
- tool_request: JSON-sorted arguments from the Start content (AGX1-377: now
149-
delivered on both channels, no longer suppressed)
165+
- tool_request: JSON-sorted arguments from the Start content (delivered on
166+
both channels)
150167
- tool_response: str(content) from Full event
151168
"""
152169
from agentex.types.text_content import TextContent
@@ -191,9 +208,9 @@ def _yield_logical_deliveries(events: list[StreamTaskMessage]) -> list[LogicalDe
191208
)
192209
)
193210
elif ctype == "tool_request" and isinstance(content, ToolRequestContent):
194-
# AGX1-377 fix: auto_send now delivers streamed tool-request
195-
# messages. Emit a delivery here so the cross-channel
196-
# assertion verifies it is present on both channels.
211+
# auto_send delivers streamed tool-request messages. Emit a
212+
# delivery here so the cross-channel assertion verifies it is
213+
# present on both channels.
197214
deliveries.append(
198215
LogicalDelivery(
199216
content_type=ctype,
@@ -296,30 +313,6 @@ def streaming_task_message_context(
296313
return _FakeCtx(self.sink, ctype, initial_content)
297314

298315

299-
class _FakeTracing:
300-
"""Minimal tracing backend: records started/ended span names + outputs."""
301-
302-
def __init__(self) -> None:
303-
self.started: list[str] = []
304-
self.ended: list[Any] = []
305-
306-
async def start_span(
307-
self,
308-
*,
309-
trace_id: str,
310-
name: str,
311-
input: Any = None,
312-
parent_id: Any = None,
313-
data: Any = None,
314-
task_id: Any = None,
315-
) -> Any:
316-
self.started.append(name)
317-
return _types.SimpleNamespace()
318-
319-
async def end_span(self, *, trace_id: str, span: Any) -> None:
320-
self.ended.append(getattr(span, "output", None))
321-
322-
323316
class _RecordingTracer(SpanTracer):
324317
"""SpanTracer that records every SpanSignal it actually receives.
325318
@@ -486,7 +479,7 @@ async def run_cross_channel_conformance(
486479
from agentex.lib.core.harness.yield_delivery import yield_events
487480

488481
# --- yield channel ---
489-
tracer_yield = _RecordingTracer(tracing=_FakeTracing())
482+
tracer_yield = _RecordingTracer(tracing=FakeTracing())
490483
yield_out = [e async for e in yield_events(_gen(fixture.events), tracer=tracer_yield)]
491484

492485
# Span signals the yield channel actually emitted to its tracer
@@ -496,7 +489,7 @@ async def run_cross_channel_conformance(
496489
yield_deliveries = _yield_text_reasoning_seq(_yield_logical_deliveries(yield_out))
497490

498491
# --- auto_send channel ---
499-
tracer_auto = _RecordingTracer(tracing=_FakeTracing())
492+
tracer_auto = _RecordingTracer(tracing=FakeTracing())
500493
fake_streaming = _FakeStreaming()
501494
await auto_send(
502495
_gen(fixture.events),

tests/lib/core/harness/conformance/test_claude_code_conformance.py

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,14 @@
2929

3030
from __future__ import annotations
3131

32-
from typing import Any
33-
3432
import pytest
3533

3634
from agentex.lib.adk._modules._claude_code_sync import convert_claude_code_to_agentex_events
3735

3836
from .runner import (
3937
Fixture,
4038
register,
39+
run_pure_async,
4140
run_cross_channel_conformance,
4241
)
4342

@@ -155,25 +154,16 @@ async def _build_fixture(name: str, envelopes: list[dict]) -> Fixture:
155154

156155
# Fixtures must exist before pytest collects (they parametrize the test below),
157156
# so they are built at import time. The conversion only iterates in-memory
158-
# envelopes — it never suspends on a real future — so we drive the coroutine to
159-
# completion by hand instead of asyncio.run(). asyncio.run() at import raises
160-
# RuntimeError when an event loop is already running (programmatic pytest, a
161-
# Jupyter kernel, or session-scoped asyncio loops); the loop-free driver below
162-
# is unaffected by the ambient loop state.
163-
def _run_pure_async(coro: Any) -> Any:
164-
try:
165-
coro.send(None)
166-
except StopIteration as stop:
167-
return stop.value
168-
coro.close()
169-
raise RuntimeError("conformance fixture build unexpectedly suspended on real I/O")
170-
171-
157+
# envelopes — it never suspends on a real future — so we drive the coroutines to
158+
# completion with the shared loop-free ``run_pure_async`` driver instead of
159+
# asyncio.run(), which raises RuntimeError at import when an event loop is
160+
# already running (programmatic pytest, a Jupyter kernel, or session-scoped
161+
# asyncio loops).
172162
_FIXTURES: list[Fixture] = [
173-
_run_pure_async(_build_fixture("claude-code-text-only", _TEXT_ENVELOPES)),
174-
_run_pure_async(_build_fixture("claude-code-tool-call-result", _TOOL_ENVELOPES)),
175-
_run_pure_async(_build_fixture("claude-code-thinking-block", _THINKING_ENVELOPES)),
176-
_run_pure_async(_build_fixture("claude-code-multi-step", _MULTI_STEP_ENVELOPES)),
163+
run_pure_async(_build_fixture("claude-code-text-only", _TEXT_ENVELOPES)),
164+
run_pure_async(_build_fixture("claude-code-tool-call-result", _TOOL_ENVELOPES)),
165+
run_pure_async(_build_fixture("claude-code-thinking-block", _THINKING_ENVELOPES)),
166+
run_pure_async(_build_fixture("claude-code-multi-step", _MULTI_STEP_ENVELOPES)),
177167
]
178168

179169
# Register into the shared registry so all_fixtures() can enumerate them

tests/lib/core/harness/conformance/test_codex_conformance.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,14 @@
1111

1212
from __future__ import annotations
1313

14-
import asyncio
1514
from typing import Any, AsyncIterator
1615

1716
import pytest
1817

1918
from agentex.lib.core.harness.types import StreamTaskMessage
2019
from agentex.lib.adk._modules._codex_sync import convert_codex_to_agentex_events
2120

22-
from .runner import Fixture, register, derive_all
21+
from .runner import Fixture, register, run_pure_async
2322

2423

2524
async def _aiter(items: list[Any]) -> AsyncIterator[Any]:
@@ -32,7 +31,9 @@ async def _collect(events: list[Any]) -> list[StreamTaskMessage]:
3231

3332

3433
def _build(events: list[Any]) -> list[StreamTaskMessage]:
35-
return asyncio.run(_collect(events))
34+
# Loop-free driver: this runs at import time, where asyncio.run() would raise
35+
# under an already-running loop (programmatic pytest, notebooks).
36+
return run_pure_async(_collect(events))
3637

3738

3839
# ---------------------------------------------------------------------------
@@ -208,17 +209,6 @@ def _build(events: list[Any]) -> list[StreamTaskMessage]:
208209
_LOCAL_FIXTURES = [_CODEX_TEXT, _CODEX_TOOL, _CODEX_REASONING, _CODEX_MULTI]
209210

210211

211-
@pytest.mark.parametrize("fixture", _LOCAL_FIXTURES, ids=lambda f: f.name)
212-
def test_codex_span_derivation_is_deterministic(fixture: Fixture) -> None:
213-
"""Span derivation over codex events is deterministic (cross-channel guarantee).
214-
215-
Deriving twice over the same events yields identical signals. This is the
216-
invariant that makes ``yield`` and ``auto_send`` delivery equivalent: both
217-
observe the same event stream, so their tracing side effects are identical.
218-
"""
219-
assert derive_all(fixture.events) == derive_all(fixture.events)
220-
221-
222212
@pytest.mark.parametrize("fixture", _LOCAL_FIXTURES, ids=lambda f: f.name)
223213
def test_codex_events_are_non_empty(fixture: Fixture) -> None:
224214
"""Every codex fixture yields at least one StreamTaskMessage*."""

tests/lib/core/harness/conformance/test_conformance.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,9 @@
2424
Full vs Start+Done envelope difference is a documented, acceptable choice in
2525
auto_send — see runner.py for the rationale).
2626
27-
AGX1-377 fix: auto_send now delivers streamed tool-request messages. The
28-
suppression that previously prevented the yield normaliser from emitting a
29-
LogicalDelivery for Start(tool_request)+Done is removed. Both channels now
30-
produce a delivery for streamed tool_request, verified by the
31-
"streamed-tool-request" fixture.
27+
auto_send delivers streamed tool-request messages: both channels produce a
28+
delivery for streamed tool_request, verified by the "streamed-tool-request"
29+
fixture.
3230
"""
3331

3432
from __future__ import annotations
@@ -134,9 +132,8 @@
134132
StreamTaskMessageDone(type="done", index=0),
135133
],
136134
),
137-
# fixture 4: streamed tool_request (AGX1-377 fix) — tool_request delivered
138-
# via Start+Done (no Full). auto_send now delivers this instead of dropping
139-
# it. Both channels must produce a LogicalDelivery for this fixture.
135+
# fixture 4: streamed tool_request — tool_request delivered via Start+Done
136+
# (no Full). Both channels must produce a LogicalDelivery for this fixture.
140137
Fixture(
141138
name="streamed-tool-request",
142139
events=[
@@ -275,11 +272,28 @@ async def test_cross_channel_equivalence(fixture: Fixture) -> None:
275272
# ---------------------------------------------------------------------------
276273

277274

278-
@pytest.mark.parametrize("fixture", all_fixtures(), ids=lambda f: f.name)
279-
def test_span_derivation_is_deterministic(fixture: Fixture) -> None:
280-
"""Span derivation over the same event list is idempotent.
275+
def test_span_derivation_is_deterministic() -> None:
276+
"""Span derivation over the same event list is idempotent, for EVERY
277+
registered fixture across all harnesses.
278+
279+
``all_fixtures()`` is read at run time (not at collection/parametrize time)
280+
so it sees fixtures registered by every conformance module, regardless of
281+
import/collection order. The per-harness conformance modules are imported
282+
eagerly via ``conftest.py`` in this directory, so this test covers the full
283+
cross-harness fixture set even when run in isolation. (Parametrizing on
284+
``all_fixtures()`` at import time would freeze the set to whatever happened
285+
to be registered before this module was collected.)
281286
282287
Retained as a lightweight regression guard. The primary cross-channel
283288
guarantee is asserted in test_cross_channel_equivalence above.
284289
"""
285-
assert derive_all(fixture.events) == derive_all(fixture.events)
290+
fixtures = all_fixtures()
291+
assert len(fixtures) > len(_FIXTURES), (
292+
"expected per-harness fixtures to be registered in addition to the "
293+
f"{len(_FIXTURES)} generic ones; got {len(fixtures)} total — a conformance "
294+
"module's fixtures are not being registered (check conftest imports)"
295+
)
296+
for fixture in fixtures:
297+
assert derive_all(fixture.events) == derive_all(fixture.events), (
298+
f"[{fixture.name}] span derivation is not deterministic"
299+
)

0 commit comments

Comments
 (0)