test(openai-conformance): adapt to cross-channel runner on #414 foundation

declan-scale · claude · declan-scale · commit 84f99f429ea2 · 2026-06-22T11:51:37.000-04:00
Replace the old determinism-only test (built on derive_all) with the full
cross-channel assertion pattern: register fixtures in the per-module
_OPENAI_FIXTURES list, call run_cross_channel_conformance, and assert
logical-delivery and span-signal equivalence across yield_events and
auto_send — matching the pattern in test_conformance.py.

Swap ReasoningSummaryDelta for ReasoningContentDelta so the runner's
payload accumulator recognises the delta type and the payload comparison
exercises the reasoning seeding path. Remove derive_all import.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/tests/lib/core/harness/conformance/test_openai_conformance.py b/tests/lib/core/harness/conformance/test_openai_conformance.py
@@ -1,25 +1,26 @@
 """OpenAI conformance fixtures for the shared harness span-derivation engine.
 
-The cross-channel guarantee is that yield-delivery and auto-send observe the
-SAME canonical ``StreamTaskMessage*`` stream, so span derivation over that
-stream must be deterministic and idempotent regardless of channel. These
+The cross-channel guarantee is that yield_events and auto_send observe the
+SAME canonical StreamTaskMessage* stream, so span derivation and logical
+delivery over that stream must be equivalent regardless of channel. These
 fixtures express the canonical sequences an OpenAI turn produces (text,
-tool-call, reasoning, and a combined multi-step turn) and assert that property.
+tool-call, reasoning, and a combined multi-step turn) and assert that property
+via run_cross_channel_conformance.
 
-Registry hazard (see conformance/runner.py): ``_REGISTRY`` is process-global and
+Registry hazard (see conformance/runner.py): _REGISTRY is process-global and
 collection order across modules is not guaranteed. To stay deterministic this
 module keeps its OWN fixture list and parametrizes over THAT list, rather than
-over ``all_fixtures()``. It still calls ``register()`` so the cross-module
-conformance suite can see these fixtures too.
+over all_fixtures(). It still calls register() so the cross-module conformance
+suite can see these fixtures too.
 """
 
 from __future__ import annotations
 
 import pytest
 
+from agentex.types.text_delta import TextDelta
 from agentex.types.text_content import TextContent
 from agentex.types.reasoning_content import ReasoningContent
-from agentex.types.task_message_delta import TextDelta, ReasoningSummaryDelta
 from agentex.types.task_message_update import (
     StreamTaskMessageDone,
     StreamTaskMessageFull,
@@ -28,8 +29,9 @@
 )
 from agentex.types.tool_request_content import ToolRequestContent
 from agentex.types.tool_response_content import ToolResponseContent
+from agentex.types.reasoning_content_delta import ReasoningContentDelta
 
-from .runner import Fixture, register, derive_all
+from .runner import Fixture, register, run_cross_channel_conformance
 
 _OPENAI_FIXTURES: list[Fixture] = []
 
@@ -40,15 +42,17 @@ def _add(fixture: Fixture) -> None:
     register(fixture)
 
 
-# Text-only turn: start -> deltas -> done. No spans are derived from plain text.
+# Text-only turn: start -> deltas -> done.
+# Uses a non-empty Start content so the payload comparison catches a channel
+# that drops StreamTaskMessageStart.content.
 _add(
     Fixture(
         name="openai-text-only",
         events=[
             StreamTaskMessageStart(
                 type="start",
                 index=0,
-                content=TextContent(type="text", author="agent", content=""),
+                content=TextContent(type="text", author="agent", content="Init"),
             ),
             StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hel")),
             StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="lo")),
@@ -57,9 +61,8 @@ def _add(fixture: Fixture) -> None:
     )
 )
 
-# Tool-call turn: the OpenAI converter emits a single Full(ToolRequestContent)
-# for the call and a Full(ToolResponseContent) for the result, matched by
-# tool_call_id. Mirrors convert_openai_to_agentex_events' tool path.
+# Tool-call turn: Full(ToolRequestContent) for the call + Full(ToolResponseContent)
+# for the result, matched by tool_call_id. Mirrors the OpenAI converter's tool path.
 _add(
     Fixture(
         name="openai-tool-call",
@@ -90,21 +93,30 @@ def _add(fixture: Fixture) -> None:
     )
 )
 
-# Reasoning turn: start(ReasoningContent) -> summary deltas -> done. Span
-# derivation opens a reasoning span on Start and closes it on the index's Done.
+# Reasoning turn: start(ReasoningContent) -> content deltas -> done.
+# ReasoningContent.summary is seeded in the payload so a channel that drops the
+# summary fails the cross-channel comparison.
 _add(
     Fixture(
         name="openai-reasoning",
         events=[
             StreamTaskMessageStart(
                 type="start",
                 index=0,
-                content=ReasoningContent(type="reasoning", author="agent", summary=[], content=[], style="active"),
+                content=ReasoningContent(
+                    type="reasoning",
+                    author="agent",
+                    summary=["Thinking..."],
+                ),
             ),
             StreamTaskMessageDelta(
                 type="delta",
                 index=0,
-                delta=ReasoningSummaryDelta(type="reasoning_summary", summary_index=0, summary_delta="thinking"),
+                delta=ReasoningContentDelta(
+                    type="reasoning_content",
+                    content_index=0,
+                    content_delta="step 1",
+                ),
             ),
             StreamTaskMessageDone(type="done", index=0),
         ],
@@ -119,12 +131,20 @@ def _add(fixture: Fixture) -> None:
             StreamTaskMessageStart(
                 type="start",
                 index=0,
-                content=ReasoningContent(type="reasoning", author="agent", summary=[], content=[], style="active"),
+                content=ReasoningContent(
+                    type="reasoning",
+                    author="agent",
+                    summary=["plan"],
+                ),
             ),
             StreamTaskMessageDelta(
                 type="delta",
                 index=0,
-                delta=ReasoningSummaryDelta(type="reasoning_summary", summary_index=0, summary_delta="plan"),
+                delta=ReasoningContentDelta(
+                    type="reasoning_content",
+                    content_index=0,
+                    content_delta="elaboration",
+                ),
             ),
             StreamTaskMessageDone(type="done", index=0),
             StreamTaskMessageFull(
@@ -162,8 +182,25 @@ def _add(fixture: Fixture) -> None:
 
 
 @pytest.mark.parametrize("fixture", _OPENAI_FIXTURES, ids=lambda f: f.name)
-def test_openai_span_derivation_is_deterministic(fixture):
-    """Deriving twice over the same canonical events yields identical signals,
-    which is exactly what makes yield-delivery and auto-send equivalent (both
-    observe the same stream)."""
-    assert derive_all(fixture.events) == derive_all(fixture.events)
+@pytest.mark.asyncio
+async def test_openai_cross_channel_equivalence(fixture: Fixture) -> None:
+    """Assert that yield_events and auto_send produce equivalent logical
+    deliveries and identical span signals for every OpenAI fixture.
+
+    This is the cross-channel guarantee: the two delivery adapters agree on
+    WHAT was delivered (logical content) and HOW spans were derived, even
+    though their streaming-envelope shapes differ (Full vs Start+Done for tool
+    messages).
+
+    The span signals are the ones each channel's tracer ACTUALLY recorded while
+    delivering, not a re-derivation, so a regression where one channel skips
+    deriver.observe() for some event type is caught here.
+    """
+    yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture)
+
+    assert yield_deliveries == auto_deliveries, (
+        f"[{fixture.name}] logical deliveries differ:\n  yield:     {yield_deliveries}\n  auto_send: {auto_deliveries}"
+    )
+    assert yield_spans == auto_spans, (
+        f"[{fixture.name}] span signals differ:\n  yield:     {yield_spans}\n  auto_send: {auto_spans}"
+    )