diff --git a/temporalio/bridge/Cargo.lock b/temporalio/bridge/Cargo.lock index 85793c0f3..b86ad6b16 100644 --- a/temporalio/bridge/Cargo.lock +++ b/temporalio/bridge/Cargo.lock @@ -473,7 +473,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.60.2", ] [[package]] @@ -1914,7 +1914,7 @@ dependencies = [ "once_cell", "socket2 0.5.10", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2138,7 +2138,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.60.2", ] [[package]] @@ -2468,7 +2468,7 @@ dependencies = [ "getrandom 0.3.3", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/temporalio/contrib/google_adk_agents/_model.py b/temporalio/contrib/google_adk_agents/_model.py index 80079433c..d5752d9a0 100644 --- a/temporalio/contrib/google_adk_agents/_model.py +++ b/temporalio/contrib/google_adk_agents/_model.py @@ -1,13 +1,30 @@ +import json +import logging from collections.abc import AsyncGenerator -from datetime import timedelta +from datetime import datetime, timedelta, timezone from google.adk.models import BaseLlm, LLMRegistry from google.adk.models.llm_request import LlmRequest from google.adk.models.llm_response import LlmResponse from temporalio import activity, workflow +from temporalio.contrib.pubsub import PubSubClient from temporalio.workflow import ActivityConfig +logger = logging.getLogger(__name__) + +EVENTS_TOPIC = "events" + + +def _make_event(event_type: str, **data: object) -> bytes: + return json.dumps( + { + "type": event_type, + "timestamp": datetime.now(timezone.utc).isoformat(), + "data": data, + } + ).encode() + @activity.defn async def invoke_model(llm_request: LlmRequest) -> list[LlmResponse]: @@ -35,20 +52,93 @@ async def invoke_model(llm_request: LlmRequest) -> 
list[LlmResponse]: ] +@activity.defn +async def invoke_model_streaming(llm_request: LlmRequest) -> list[LlmResponse]: + """Streaming-aware model activity. + + Calls the LLM with stream=True, publishes TEXT_DELTA events via + PubSubClient as tokens arrive, and returns the collected responses. + + The PubSubClient auto-detects the activity context to find the parent + workflow for publishing. + + Args: + llm_request: The LLM request containing model name and parameters. + + Returns: + List of LLM responses from the model. + """ + if llm_request.model is None: + raise ValueError("No model name provided, could not create LLM.") + + llm = LLMRegistry.new_llm(llm_request.model) + if not llm: + raise ValueError(f"Failed to create LLM for model: {llm_request.model}") + + pubsub = PubSubClient.create(batch_interval=0.1) + responses: list[LlmResponse] = [] + text_buffer = "" + + async with pubsub: + pubsub.publish(EVENTS_TOPIC, _make_event("LLM_CALL_START"), priority=True) + + async for response in llm.generate_content_async( + llm_request=llm_request, stream=True + ): + activity.heartbeat() + responses.append(response) + + if response.content and response.content.parts: + for part in response.content.parts: + if part.text: + text_buffer += part.text + pubsub.publish( + EVENTS_TOPIC, + _make_event("TEXT_DELTA", delta=part.text), + ) + if part.function_call: + pubsub.publish( + EVENTS_TOPIC, + _make_event( + "TOOL_CALL_START", + tool_name=part.function_call.name, + ), + ) + + if text_buffer: + pubsub.publish( + EVENTS_TOPIC, + _make_event("TEXT_COMPLETE", text=text_buffer), + priority=True, + ) + pubsub.publish( + EVENTS_TOPIC, _make_event("LLM_CALL_COMPLETE"), priority=True + ) + + return responses + + class TemporalModel(BaseLlm): """A Temporal-based LLM model that executes model invocations as activities.""" def __init__( - self, model_name: str, activity_config: ActivityConfig | None = None + self, + model_name: str, + activity_config: ActivityConfig | None = None, + 
streaming: bool = False, ) -> None: """Initialize the TemporalModel. Args: model_name: The name of the model to use. activity_config: Configuration options for the activity execution. + streaming: When True, the model activity uses the streaming LLM + endpoint and publishes token events via PubSubClient. The + workflow is unaffected -- it still receives complete responses. """ super().__init__(model=model_name) self._model_name = model_name + self._streaming = streaming self._activity_config = ActivityConfig( start_to_close_timeout=timedelta(seconds=60) ) @@ -62,15 +152,23 @@ async def generate_content_async( Args: llm_request: The LLM request containing model parameters and content. - stream: Whether to stream the response (currently ignored). + stream: Whether to stream the response (currently ignored; use the + ``streaming`` constructor parameter instead). Yields: The responses from the model. """ - responses = await workflow.execute_activity( - invoke_model, - args=[llm_request], - **self._activity_config, - ) + if self._streaming: + responses = await workflow.execute_activity( + invoke_model_streaming, + args=[llm_request], + **self._activity_config, + ) + else: + responses = await workflow.execute_activity( + invoke_model, + args=[llm_request], + **self._activity_config, + ) for response in responses: yield response diff --git a/temporalio/contrib/google_adk_agents/_plugin.py b/temporalio/contrib/google_adk_agents/_plugin.py index 03cb78998..52504e78f 100644 --- a/temporalio/contrib/google_adk_agents/_plugin.py +++ b/temporalio/contrib/google_adk_agents/_plugin.py @@ -8,7 +8,10 @@ from temporalio import workflow from temporalio.contrib.google_adk_agents._mcp import TemporalMcpToolSetProvider -from temporalio.contrib.google_adk_agents._model import invoke_model +from temporalio.contrib.google_adk_agents._model import ( + invoke_model, + invoke_model_streaming, +) from temporalio.contrib.pydantic import ( PydanticPayloadConverter as 
_DefaultPydanticPayloadConverter, ) @@ -94,7 +97,7 @@ def workflow_runner(runner: WorkflowRunner | None) -> WorkflowRunner: ) return runner - new_activities = [invoke_model] + new_activities = [invoke_model, invoke_model_streaming] if toolset_providers is not None: for toolset_provider in toolset_providers: new_activities.extend(toolset_provider._get_activities()) diff --git a/temporalio/contrib/openai_agents/_invoke_model_activity.py b/temporalio/contrib/openai_agents/_invoke_model_activity.py index 945a05ec6..c29ef2dc9 100644 --- a/temporalio/contrib/openai_agents/_invoke_model_activity.py +++ b/temporalio/contrib/openai_agents/_invoke_model_activity.py @@ -4,8 +4,10 @@ """ import enum +import json +import logging from dataclasses import dataclass -from datetime import timedelta +from datetime import datetime, timedelta, timezone from typing import Any from agents import ( @@ -24,6 +26,7 @@ RunContextWrapper, Tool, TResponseInputItem, + Usage, UserError, WebSearchTool, ) @@ -31,13 +34,29 @@ APIStatusError, AsyncOpenAI, ) +from openai.types.responses import ResponseCompletedEvent from openai.types.responses.tool_param import Mcp from typing_extensions import Required, TypedDict from temporalio import activity from temporalio.contrib.openai_agents._heartbeat_decorator import _auto_heartbeater +from temporalio.contrib.pubsub import PubSubClient from temporalio.exceptions import ApplicationError +logger = logging.getLogger(__name__) + +EVENTS_TOPIC = "events" + + +def _make_event(event_type: str, **data: object) -> bytes: + return json.dumps( + { + "type": event_type, + "timestamp": datetime.now(timezone.utc).isoformat(), + "data": data, + } + ).encode() + @dataclass class HandoffInput: @@ -263,3 +282,201 @@ def make_tool(tool: ToolInput) -> Tool: non_retryable=True, next_retry_delay=retry_after, ) from e + + @activity.defn + @_auto_heartbeater + async def invoke_model_activity_streaming( + self, input: ActivityModelInput + ) -> ModelResponse: + """Streaming-aware 
model activity. + + Calls model.stream_response(), publishes token events via PubSubClient, + and returns the complete ModelResponse constructed from the + ResponseCompletedEvent at the end of the stream. + """ + model = self._model_provider.get_model(input.get("model_name")) + + async def empty_on_invoke_tool( + _ctx: RunContextWrapper[Any], _input: str + ) -> str: + return "" + + async def empty_on_invoke_handoff( + _ctx: RunContextWrapper[Any], _input: str + ) -> Any: + return None + + def make_tool(tool: ToolInput) -> Tool: + if isinstance( + tool, + ( + FileSearchTool, + WebSearchTool, + ImageGenerationTool, + CodeInterpreterTool, + ), + ): + return tool + elif isinstance(tool, HostedMCPToolInput): + return HostedMCPTool(tool_config=tool.tool_config) + elif isinstance(tool, FunctionToolInput): + return FunctionTool( + name=tool.name, + description=tool.description, + params_json_schema=tool.params_json_schema, + on_invoke_tool=empty_on_invoke_tool, + strict_json_schema=tool.strict_json_schema, + ) + else: + raise UserError(f"Unknown tool type: {tool.name}") # type:ignore[reportUnreachable] + + tools = [make_tool(x) for x in input.get("tools", [])] + handoffs: list[Handoff[Any, Any]] = [ + Handoff( + tool_name=x.tool_name, + tool_description=x.tool_description, + input_json_schema=x.input_json_schema, + agent_name=x.agent_name, + strict_json_schema=x.strict_json_schema, + on_invoke_handoff=empty_on_invoke_handoff, + ) + for x in input.get("handoffs", []) + ] + + pubsub = PubSubClient.create(batch_interval=0.1) + final_response = None + text_buffer = "" + thinking_buffer = "" + thinking_active = False + + try: + async with pubsub: + pubsub.publish( + EVENTS_TOPIC, _make_event("LLM_CALL_START"), priority=True + ) + + async for event in model.stream_response( + system_instructions=input.get("system_instructions"), + input=input["input"], + model_settings=input["model_settings"], + tools=tools, + output_schema=input.get("output_schema"), + handoffs=handoffs, + 
tracing=ModelTracing(input["tracing"]), + previous_response_id=input.get("previous_response_id"), + conversation_id=input.get("conversation_id"), + prompt=input.get("prompt"), + ): + activity.heartbeat() + etype = getattr(event, "type", None) + + if etype == "response.output_text.delta": + text_buffer += event.delta + pubsub.publish( + EVENTS_TOPIC, + _make_event("TEXT_DELTA", delta=event.delta), + ) + elif etype == "response.reasoning_summary_text.delta": + if not thinking_active: + thinking_active = True + pubsub.publish( + EVENTS_TOPIC, _make_event("THINKING_START") + ) + thinking_buffer += event.delta + pubsub.publish( + EVENTS_TOPIC, + _make_event("THINKING_DELTA", delta=event.delta), + ) + elif etype == "response.reasoning_summary_text.done": + if thinking_active: + pubsub.publish( + EVENTS_TOPIC, + _make_event( + "THINKING_COMPLETE", + content=thinking_buffer, + ), + priority=True, + ) + thinking_buffer = "" + thinking_active = False + elif etype == "response.output_item.added": + item = event.item + if getattr(item, "type", None) == "function_call": + pubsub.publish( + EVENTS_TOPIC, + _make_event( + "TOOL_CALL_START", tool_name=item.name + ), + ) + elif isinstance(event, ResponseCompletedEvent): + final_response = event.response + + if text_buffer: + pubsub.publish( + EVENTS_TOPIC, + _make_event("TEXT_COMPLETE", text=text_buffer), + priority=True, + ) + pubsub.publish( + EVENTS_TOPIC, + _make_event("LLM_CALL_COMPLETE"), + priority=True, + ) + + except APIStatusError as e: + retry_after = None + retry_after_ms_header = e.response.headers.get("retry-after-ms") + if retry_after_ms_header is not None: + retry_after = timedelta(milliseconds=float(retry_after_ms_header)) + + if retry_after is None: + retry_after_header = e.response.headers.get("retry-after") + if retry_after_header is not None: + retry_after = timedelta(seconds=float(retry_after_header)) + + should_retry_header = e.response.headers.get("x-should-retry") + if should_retry_header == "true": + raise 
e + if should_retry_header == "false": + raise ApplicationError( + "Non retryable OpenAI error", + non_retryable=True, + next_retry_delay=retry_after, + ) from e + + if ( + e.response.status_code in [408, 409, 429] + or e.response.status_code >= 500 + ): + raise ApplicationError( + f"Retryable OpenAI status code: {e.response.status_code}", + non_retryable=False, + next_retry_delay=retry_after, + ) from e + + raise ApplicationError( + f"Non retryable OpenAI status code: {e.response.status_code}", + non_retryable=True, + next_retry_delay=retry_after, + ) from e + + if final_response is None: + raise ApplicationError( + "Stream ended without ResponseCompletedEvent", + non_retryable=True, + ) + + usage = Usage( + requests=1, + input_tokens=final_response.usage.input_tokens + if final_response.usage + else 0, + output_tokens=final_response.usage.output_tokens + if final_response.usage + else 0, + ) + return ModelResponse( + output=final_response.output, + usage=usage, + response_id=final_response.id, + ) diff --git a/temporalio/contrib/openai_agents/_model_parameters.py b/temporalio/contrib/openai_agents/_model_parameters.py index 55827e0d5..d5b757a4e 100644 --- a/temporalio/contrib/openai_agents/_model_parameters.py +++ b/temporalio/contrib/openai_agents/_model_parameters.py @@ -68,3 +68,9 @@ class ModelActivityParameters: use_local_activity: bool = False """Whether to use a local activity. If changed during a workflow execution, that would break determinism.""" + + enable_streaming: bool = False + """When True, the model activity uses the streaming LLM endpoint and + publishes token events via PubSubClient. The workflow is unaffected -- + it still receives a complete ModelResponse. 
Incompatible with + use_local_activity (local activities do not support heartbeats).""" diff --git a/temporalio/contrib/openai_agents/_temporal_model_stub.py b/temporalio/contrib/openai_agents/_temporal_model_stub.py index f55821309..adacd9ecb 100644 --- a/temporalio/contrib/openai_agents/_temporal_model_stub.py +++ b/temporalio/contrib/openai_agents/_temporal_model_stub.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from datetime import timedelta from temporalio import workflow from temporalio.contrib.openai_agents._model_parameters import ModelActivityParameters @@ -154,7 +155,28 @@ def make_tool_info(tool: Tool) -> ToolInput: else: summary = None - if self.model_params.use_local_activity: + if self.model_params.enable_streaming: + if self.model_params.use_local_activity: + raise ValueError( + "Streaming is incompatible with local activities " + "(local activities do not support heartbeats)." + ) + return await workflow.execute_activity_method( + ModelActivity.invoke_model_activity_streaming, + activity_input, + summary=summary, + task_queue=self.model_params.task_queue, + schedule_to_close_timeout=self.model_params.schedule_to_close_timeout, + schedule_to_start_timeout=self.model_params.schedule_to_start_timeout, + start_to_close_timeout=self.model_params.start_to_close_timeout, + heartbeat_timeout=self.model_params.heartbeat_timeout + or timedelta(seconds=30), + retry_policy=self.model_params.retry_policy, + cancellation_type=self.model_params.cancellation_type, + versioning_intent=self.model_params.versioning_intent, + priority=self.model_params.priority, + ) + elif self.model_params.use_local_activity: return await workflow.execute_local_activity_method( ModelActivity.invoke_model_activity, activity_input, diff --git a/temporalio/contrib/openai_agents/_temporal_openai_agents.py b/temporalio/contrib/openai_agents/_temporal_openai_agents.py index 39168d0fd..b35853781 100644 --- a/temporalio/contrib/openai_agents/_temporal_openai_agents.py 
+++ b/temporalio/contrib/openai_agents/_temporal_openai_agents.py @@ -195,7 +195,11 @@ def add_activities( if not register_activities: return activities or [] - new_activities = [ModelActivity(model_provider).invoke_model_activity] + model_activity = ModelActivity(model_provider) + new_activities = [ + model_activity.invoke_model_activity, + model_activity.invoke_model_activity_streaming, + ] server_names = [server.name for server in mcp_server_providers] if len(server_names) != len(set(server_names)): diff --git a/temporalio/contrib/pubsub/DESIGN-ADDENDUM-CAN.md b/temporalio/contrib/pubsub/DESIGN-ADDENDUM-CAN.md new file mode 100644 index 000000000..55650db56 --- /dev/null +++ b/temporalio/contrib/pubsub/DESIGN-ADDENDUM-CAN.md @@ -0,0 +1,272 @@ +# Continue-As-New Addendum + +Addendum to [DESIGN.md](./DESIGN.md). Addresses the continue-as-new (CAN) gap +identified in section 10 ("Event retention"). + +## Problem + +The pub/sub mixin accumulates workflow history through two channels: + +1. **Signals** — each `__pubsub_publish` signal adds a `WorkflowSignaled` event + plus the serialized `PublishInput` payload. +2. **Updates** — each `__pubsub_poll` response serializes the returned + `PollResult` (including all matched items) into the history as an update + completion event. + +Over a streaming agent session, a subscriber polling every few seconds +accumulates many update-completion events, each containing a slice of the log. +These are redundant copies of data already held in `_pubsub_log`. The history +grows toward the ~50K event warning threshold, at which point Temporal forces +termination. + +Continue-as-new resets the history. By serializing the full log into the CAN +input, we carry a single canonical copy forward and discard all the redundant +history entries from prior signals, updates, and queries. 
+ +## Design + +### `PubSubState` type + +New dataclass in `_types.py`: + +```python +@dataclass +class PubSubState: + """Serializable snapshot of pub/sub state for continue-as-new.""" + log: list[PubSubItem] = field(default_factory=list) +``` + +The offset counter is not stored — it is derived as `len(log)`. This avoids +any possibility of the counter and log diverging. + +Exported from `__init__.py`. + +### Mixin changes + +New and modified methods on `PubSubMixin`: + +```python +def init_pubsub(self, prior_state: PubSubState | None = None) -> None: + """Initialize pub/sub state. + + Args: + prior_state: State from a previous run (via get_pubsub_state()). + Pass None on the first run. + """ + if prior_state is not None: + self._pubsub_log = list(prior_state.log) + else: + self._pubsub_log = [] + self._pubsub_draining = False + +def get_pubsub_state(self) -> PubSubState: + """Return a serializable snapshot of pub/sub state. + + Call this when building your continue-as-new arguments. + """ + return PubSubState(log=list(self._pubsub_log)) +``` + +The mixin does **not** trigger CAN itself. The parent workflow decides when to +continue-as-new (typically by checking `workflow.info().is_continue_as_new_suggested()` +at a safe point in its main loop). + +### Draining: `drain_pubsub()` + update validator + +A long-poll `__pubsub_poll` handler can block for up to 300 seconds waiting for +new items. We cannot let that block continue-as-new indefinitely. Conversely, a +naive drain that unblocks waiting polls but doesn't reject new ones creates a +race: the client receives an empty result, immediately sends a new poll, the new +poll is accepted, and `all_handlers_finished()` never stabilizes. This is +because `await workflow.wait_condition(workflow.all_handlers_finished)` yields, +allowing the SDK to process new events — including new update acceptances — in +the same or subsequent workflow tasks. + +The solution is two mechanisms working together: + +1. 
**A drain flag** that unblocks all waiting poll handlers. +2. **An update validator** that rejects new polls once draining is set. + +```python +def drain_pubsub(self) -> None: + """Unblock all waiting poll handlers and reject new polls. + + Call this before waiting for all_handlers_finished() and + continue_as_new(). + """ + self._pubsub_draining = True + +@workflow.update(name="__pubsub_poll") +async def _pubsub_poll(self, input: PollInput) -> PollResult: + await workflow.wait_condition( + lambda: len(self._pubsub_log) > input.from_offset + or self._pubsub_draining, + timeout=input.timeout, + ) + # Return whatever items are available (possibly empty if drain-only) + all_new = self._pubsub_log[input.from_offset:] + next_offset = len(self._pubsub_log) + if input.topics: + topic_set = set(input.topics) + filtered = [item for item in all_new if item.topic in topic_set] + else: + filtered = list(all_new) + return PollResult(items=filtered, next_offset=next_offset) + +@_pubsub_poll.validator +def _validate_pubsub_poll(self, input: PollInput) -> None: + if self._pubsub_draining: + raise RuntimeError("Workflow is draining for continue-as-new") +``` + +The validator is read-only (checks a flag, raises to reject) — this satisfies +the Temporal constraint that validators must not mutate state or block. + +**CAN sequence in the parent workflow:** + +```python +self.drain_pubsub() +await workflow.wait_condition(workflow.all_handlers_finished) +workflow.continue_as_new(args=[...]) +``` + +What happens: + +1. `drain_pubsub()` sets `_pubsub_draining = True`. +2. All blocked `__pubsub_poll` handlers unblock (the `or self._pubsub_draining` + clause becomes true) and return their current items. +3. The validator rejects any new `__pubsub_poll` updates — they are never + accepted, so no new handlers start. +4. `all_handlers_finished()` becomes true and **stays** true. +5. `continue_as_new()` proceeds. + +On the client side, the rejected poll surfaces as an error. 
The subscriber +detects CAN via `describe()`, follows the chain, and resumes from the same +offset against the new run. + +### Client-side CAN resilience + +The current `subscribe()` method catches `CancelledError` and +`WorkflowUpdateRPCTimeoutOrCancelledError`, then stops iteration. It has no +CAN awareness. + +#### New behavior + +`subscribe()` gains a `follow_continues` parameter (default `True`): + +```python +async def subscribe( + self, + topics: list[str] | None = None, + from_offset: int = 0, + *, + follow_continues: bool = True, +) -> AsyncIterator[PubSubItem]: +``` + +When an `execute_update` call fails and `follow_continues` is `True`, the +client: + +1. Calls `describe()` on the current handle to check execution status. +2. If the status is `CONTINUED_AS_NEW`, replaces `self._handle` with a fresh + handle for the same workflow ID (no pinned `run_id`), then retries the poll + from the same offset. +3. If the status is anything else, re-raises the original error. + +```python +async def _follow_continue_as_new(self) -> bool: + """Check if the workflow continued-as-new and update the handle. + + Returns True if the handle was updated (caller should retry). + """ + try: + desc = await self._handle.describe() + except Exception: + return False + if desc.status == WorkflowExecutionStatus.CONTINUED_AS_NEW: + self._handle = self._handle._client.get_workflow_handle( + self._handle.id + ) + return True + return False +``` + +The retry succeeds because the new run's log contains all items from the +previous run. Polling from the same offset returns the expected items. + +#### Why this works with `activity_pubsub_client()` + +`activity_pubsub_client()` creates handles via +`activity.client().get_workflow_handle(workflow_id)` — no `run_id` pinned. +Signals and updates already route to the current run, so activity-side +publishing is CAN-friendly without changes. 
+ +## Offset Continuity + +Since the full log is carried forward: + +- Pre-CAN: offsets `0..N-1`, `len(log) == N`. +- Post-CAN: `init_pubsub(prior_state)` restores the same N items. New appends + start at offset N. +- A subscriber at offset K (where K < N) polls the new run and gets items + `K..N-1` from the carried-forward log, then continues with new items. + +No offset remapping. No sentinel values. No coordination protocol. + +## Usage Example + +```python +@dataclass +class WorkflowInput: + # ... application fields ... + pubsub_state: PubSubState | None = None + +@workflow.defn +class AgentWorkflow(PubSubMixin): + @workflow.run + async def run(self, input: WorkflowInput) -> None: + self.init_pubsub(prior_state=input.pubsub_state) + + while True: + await workflow.wait_condition( + lambda: self._pending_message or self._closed + ) + if self._closed: + return + + await self._run_turn(self._pending_message) + + if workflow.info().is_continue_as_new_suggested(): + self.drain_pubsub() + await workflow.wait_condition(workflow.all_handlers_finished) + workflow.continue_as_new(args=[WorkflowInput( + # ... application fields ... + pubsub_state=self.get_pubsub_state(), + )]) +``` + +## Edge Cases + +### Payload size limit + +The full log serialized into CAN input could approach Temporal's default 2 MB +payload limit for very long sessions with large payloads. This is an inherent +constraint of the full-history approach. + +Mitigation: the snapshot + truncate extension described in DESIGN.md section 10 +addresses this by discarding consumed entries before CAN. That extension becomes +the natural next step if payload size becomes a problem in practice. + +### Signal delivery during CAN + +A `PubSubClient` in publish mode sending signals mid-CAN may get errors if +its handle is pinned to the old run. The publishing side does **not** +auto-follow CAN — the parent workflow should ensure activities complete (and +therefore stop publishing) before triggering CAN. 
+ +### Concurrent subscribers + +Multiple subscribers independently follow the CAN chain. Each maintains its +own offset. Sharing a `PubSubClient` instance across concurrent `subscribe()` +calls is safe — they all want to target the latest run, and the handle is +effectively just a workflow ID reference. diff --git a/temporalio/contrib/pubsub/DESIGN-ADDENDUM-DEDUP.md b/temporalio/contrib/pubsub/DESIGN-ADDENDUM-DEDUP.md new file mode 100644 index 000000000..7c838f9b3 --- /dev/null +++ b/temporalio/contrib/pubsub/DESIGN-ADDENDUM-DEDUP.md @@ -0,0 +1,224 @@ +# Exactly-Once Publish Delivery — Addendum + +Addendum to [DESIGN.md](./DESIGN.md). Addresses the signal delivery gap: the +original design has no deduplication, so a retry after a failed signal can +produce duplicate entries in the log. + +## Problem + +The `PubSubClient.flush()` method sends buffered items to the workflow via a +Temporal signal. If the signal call raises an exception (e.g., network timeout +on the response after the server accepted the signal), the client cannot +distinguish "signal was delivered" from "signal was not delivered." Without +deduplication, the client must choose: + +- **Clear buffer before sending (swap pattern).** Items are lost if the signal + truly fails. At-most-once. +- **Clear buffer after sending.** Items are re-sent on the next flush if the + signal was delivered but the response failed. At-least-once with silent + duplication. + +Neither is acceptable for a pub/sub log where subscribers expect exactly-once +delivery and stable offsets. + +## Options Considered + +### Option 1: Batch UUID + +Each flush assigns a `uuid4` to the batch. The workflow maintains a set of seen +batch IDs and skips duplicates. + +- **Pro:** Simple to implement. +- **Con:** The seen-IDs set grows without bound. Must be carried through + continue-as-new or periodically pruned. Pruning requires knowing which IDs + can never be retried — which is unknowable without additional protocol. 
+ +### Option 2: Offset-based dedup + +The publisher includes the expected log offset in the signal. The workflow +rejects if items at that offset already exist. + +- **Pro:** No additional state — dedup is implicit in the log structure. +- **Con:** The publisher does not know the current log offset. It would need to + query first, introducing a read-before-write round-trip and a race between + the query and the signal. Multiple concurrent publishers would conflict. + +### Option 3: Publisher ID + sequence number + +Each `PubSubClient` generates a UUID on creation (the publisher ID). Each flush +increments a monotonic sequence counter. The signal payload includes +`(publisher_id, sequence)`. The workflow tracks the highest seen sequence per +publisher and rejects any signal with a sequence ≤ the recorded value. + +- **Pro:** Dedup state is `dict[str, int]` — bounded by the number of + publishers (typically 1–2), not the number of flushes. The workflow can + detect gaps (missing sequence numbers) as a diagnostic signal. Naturally + survives continue-as-new if carried in state. No unbounded set. No + read-before-write round-trip. +- **Con:** Requires the publisher to maintain a sequence counter (trivial) and + the workflow to carry `publisher_sequences` through CAN (small dict). + +### Option 4: Temporal idempotency keys + +Temporal does not currently provide built-in signal deduplication or idempotency +keys for signals. This option is not available. + +## Design Decision: Publisher ID + sequence number (Option 3) + +Option 3 is adopted. The dedup state is minimal, bounded, and self-cleaning +(old publishers' entries can be removed after a timeout or on CAN). It aligns +with how Kafka producers achieve exactly-once: each producer has an ID and a +monotonic sequence, and the broker deduplicates on the pair. 
+ +## Wire Changes + +### `PublishInput` + +```python +@dataclass +class PublishInput: + items: list[PublishEntry] = field(default_factory=list) + publisher_id: str = "" + sequence: int = 0 +``` + +Both fields default to empty/zero for backward compatibility. If `publisher_id` +is empty, the workflow skips deduplication (legacy behavior). + +### `PubSubClient` changes + +```python +class PubSubClient: + def __init__(self, handle, ...): + ... + self._publisher_id: str = uuid.uuid4().hex + self._sequence: int = 0 + + async def flush(self) -> None: + async with self._flush_lock: + if self._buffer: + self._sequence += 1 + batch = self._buffer + self._buffer = [] + try: + await self._handle.signal( + "__pubsub_publish", + PublishInput( + items=batch, + publisher_id=self._publisher_id, + sequence=self._sequence, + ), + ) + except Exception: + # Restore items for retry. Sequence number is already + # incremented — the next attempt uses the same sequence, + # so the workflow deduplicates if the first signal was + # actually delivered. + self._sequence -= 1 + self._buffer = batch + self._buffer + raise +``` + +Key behaviors: + +- **Buffer swap before send.** Items are moved out of the buffer before the + signal await. New `publish()` calls during the await write to the fresh + buffer and are not affected by a retry. +- **Sequence advances on failure.** If the signal raises, the sequence counter + is NOT decremented. The failed batch is restored to the buffer, but the next + flush uses a new sequence number. This prevents data loss: if the original + signal was delivered but the client saw an error, items published during the + failed await would be merged into the retry batch. With the old sequence, + the workflow would deduplicate the entire merged batch, silently dropping + the newly-published items. With a new sequence, the retry is treated as a + fresh batch. 
The tradeoff is that the original items may be delivered twice + (at-least-once), but the workflow-side dedup catches the common case where + the batch is retried unchanged. +- **Lock for coalescing.** An `asyncio.Lock` serializes flushes. Multiple + concurrent `flush()` callers queue on the lock; by the time each enters, + later items have accumulated. This naturally coalesces N flush calls into + fewer signals. + +## Workflow Changes + +### Signal handler + +```python +@workflow.signal(name="__pubsub_publish") +def _pubsub_publish(self, input: PublishInput) -> None: + self._check_initialized() + if input.publisher_id: + last_seq = self._publisher_sequences.get(input.publisher_id, 0) + if input.sequence <= last_seq: + return # duplicate — skip + self._publisher_sequences[input.publisher_id] = input.sequence + for entry in input.items: + self._pubsub_log.append(PubSubItem(topic=entry.topic, data=entry.data)) +``` + +If `publisher_id` is empty (legacy or workflow-internal publish), dedup is +skipped. Otherwise, the workflow compares the incoming sequence against the +highest seen for that publisher. If it's ≤, the entire batch is dropped as a +duplicate. + +### Internal state + +```python +self._publisher_sequences: dict[str, int] = {} +``` + +Initialized in `init_pubsub()` from `PubSubState.publisher_sequences`. + +## Continue-as-New State + +`PubSubState` gains a `publisher_sequences` field: + +```python +@dataclass +class PubSubState: + log: list[PubSubItem] = field(default_factory=list) + base_offset: int = 0 + publisher_sequences: dict[str, int] = field(default_factory=dict) +``` + +This is carried through CAN so that dedup survives across runs. The dict is +small — one entry per publisher that has ever sent to this workflow, typically +1–2 entries. + +### Cleanup on CAN + +Stale publisher entries (from publishers that are no longer active) accumulate +but are harmless — they're just `str: int` pairs. 
If cleanup is desired, the +workflow can remove entries for publishers that haven't sent in N runs, but this +is not required for correctness. + +## Sequence Gap Detection + +If the workflow receives sequence N+2 without seeing N+1, it indicates a lost +signal. The current design does **not** act on this — it processes the batch +normally and records the new high-water mark. Gaps are expected to be rare +(they require a signal to be truly lost, not just slow), and the publisher will +retry with the same sequence if it didn't get an ack. + +A future extension could log a warning on gap detection for observability. + +## Properties + +- **Exactly-once delivery.** Each `(publisher_id, sequence)` pair is processed + at most once. Combined with at-least-once retry on the client, this achieves + exactly-once. +- **Bounded dedup state.** One `int` per publisher. Does not grow with the + number of flushes. +- **No read-before-write.** The publisher does not need to query the workflow + before sending. +- **Backward compatible.** Empty `publisher_id` disables dedup. Existing code + without the field works as before. +- **CAN-safe.** Publisher sequences survive continue-as-new in `PubSubState`. + +## Relationship to Other Addenda + +- [Continue-as-new addendum](./DESIGN-ADDENDUM-CAN.md): `PubSubState` shape + updated with `publisher_sequences`. Drain/validator mechanics unaffected. +- [Topic offsets addendum](./DESIGN-ADDENDUM-TOPICS.md): Unaffected. Dedup + operates on the publish path; offsets and cursors operate on the subscribe + path. diff --git a/temporalio/contrib/pubsub/DESIGN-ADDENDUM-ITEM-OFFSET.md b/temporalio/contrib/pubsub/DESIGN-ADDENDUM-ITEM-OFFSET.md new file mode 100644 index 000000000..5cb992cea --- /dev/null +++ b/temporalio/contrib/pubsub/DESIGN-ADDENDUM-ITEM-OFFSET.md @@ -0,0 +1,175 @@ +# Per-Item Offsets — Addendum + +Addendum to [DESIGN-ADDENDUM-TOPICS.md](./DESIGN-ADDENDUM-TOPICS.md). 
Revisits +the decision that `PubSubItem` does not carry an offset, based on experience +with the voice-terminal agent where the subscriber needs to track consumption +progress at item granularity. + +## Problem + +The voice-terminal agent streams TTS audio chunks through the pub/sub log. +Audio chunks are large (~50-100KB base64 each) and must not be truncated +from the workflow log until they have been **played** by the client, not merely +**received**. + +The current API exposes offsets only at poll-batch granularity via +`PollResult.next_offset`. The subscriber cannot determine which global offset +corresponds to a specific item within the batch. This makes it impossible to +report fine-grained consumption progress back to the workflow for truncation. + +### Why batch-level offsets are insufficient + +The subscriber's consumption model has two stages: + +1. **Receive**: items are yielded by `subscribe()` and buffered locally + (e.g., audio enqueued into a playback buffer). +2. **Consume**: the local consumer finishes processing the item (e.g., the + speaker finishes playing the audio). + +The subscriber needs to signal the workflow: "I have consumed through offset N, +you may truncate up to N." This requires knowing the offset of each item, not +just the offset at the end of a poll batch. + +Without per-item offsets, the subscriber can only report the batch boundary. +If the subscriber crashes after receiving a batch but before consuming all +items, truncation based on the batch boundary discards unconsumed items. + +### Why this matters for continue-as-new + +Before continue-as-new, the workflow must serialize the pub/sub log into the +workflow input. Audio chunks make the log large (observed 3.6MB, exceeding +Temporal's payload size limit). The workflow needs to truncate consumed items +before serialization, but can only safely truncate items the subscriber has +actually consumed — which requires per-item offset tracking. 
+ +### Workaround: count items from `from_offset` + +When the subscriber requests all topics (no filtering), items map 1:1 to +consecutive global offsets. The subscriber can compute `from_offset + i` for +each item. This works for the voice-terminal (which subscribes to all topics) +but is fragile — it breaks silently if topic filtering is introduced or if a +third topic is added to the workflow without updating the subscription. + +## Proposed Change + +Add an `offset` field to `PubSubItem` and `_WireItem`, populated by the poll +handler from the item's position in the log. No new storage in the workflow — +the offset is computed at poll time. + +### Wire types (revised) + +```python +@dataclass +class PubSubItem: + topic: str + data: bytes + offset: int = 0 + +@dataclass +class _WireItem: + topic: str + data: str # base64-encoded bytes + offset: int = 0 +``` + +### Poll handler change + +The poll handler already iterates the log slice. It annotates each item with +its global offset before returning: + +```python +all_new = self._pubsub_log[log_offset:] +next_offset = self._pubsub_base_offset + len(self._pubsub_log) +if input.topics: + topic_set = set(input.topics) + filtered = [ + (self._pubsub_base_offset + log_offset + i, item) + for i, item in enumerate(all_new) + if item.topic in topic_set + ] +else: + filtered = [ + (self._pubsub_base_offset + log_offset + i, item) + for i, item in enumerate(all_new) + ] +return PollResult( + items=[ + _WireItem(topic=item.topic, data=encode_data(item.data), offset=off) + for off, item in filtered + ], + next_offset=next_offset, +) +``` + +### `subscribe()` change + +The client passes the offset through to the yielded `PubSubItem`: + +```python +for wire_item in result.items: + yield PubSubItem( + topic=wire_item.topic, + data=decode_data(wire_item.data), + offset=wire_item.offset, + ) +``` + +### Backward compatibility + +The `offset` field defaults to `0` on both `PubSubItem` and `_WireItem`. 
+Existing subscribers that don't use the field are unaffected. Workflows
+running old code that doesn't populate the field will return `0` for all
+items — subscribers must treat `offset=0` as "unknown" if they depend on it.
The global offset is no longer purely +internal — it is exposed per-item to enable consumption tracking. The offset +model (global, monotonic, single log) is unchanged. The BFF containment +strategy for end-client leakage is also unchanged — the BFF still assigns its +own SSE event IDs. diff --git a/temporalio/contrib/pubsub/DESIGN-ADDENDUM-TOPICS.md b/temporalio/contrib/pubsub/DESIGN-ADDENDUM-TOPICS.md new file mode 100644 index 000000000..a99bf91d4 --- /dev/null +++ b/temporalio/contrib/pubsub/DESIGN-ADDENDUM-TOPICS.md @@ -0,0 +1,272 @@ +# Topic Offsets and Cursor Design — Addendum + +Addendum to [DESIGN.md](./DESIGN.md). Revises section 3 ("Global monotonic +offsets, not per-topic") after evaluating per-topic offset models. Concludes +that global offsets are the right choice for workflow-scoped pub/sub, with +information leakage addressed at the BFF layer rather than the pub/sub API. + +## Problem + +The original design assigns every log entry a global monotonic offset regardless +of topic. A single-topic subscriber sees gaps in offset numbers — e.g., offsets +0, 3, 7, 12. These gaps leak information about activity on other topics. A +subscriber to `"events"` can infer the volume of traffic on `"thinking"` or +`"status"` from the size of the gaps, even though it has no direct access to +those topics. + +This is an information leakage concern, not a correctness bug. + +## Industry Survey + +We surveyed offset/cursor models across major pub/sub and streaming systems to +inform the design. + +| System | Cursor Scope | Unified Multi-Topic Cursor? 
| +|---|---|---| +| Kafka | Per-partition offset (int64) | No — separate offset per partition per topic | +| Redis Streams | Per-stream entry ID (timestamp-seq) | No — separate ID per stream | +| NATS JetStream | Per-stream sequence (uint64) | Yes — one stream captures multiple subjects | +| PubNub | Per-channel timetoken (nanosecond timestamp) | Yes — single timestamp spans channels | +| Google Pub/Sub | Per-subscription ack set | No | +| RabbitMQ Streams | Per-stream offset (uint64) | No | +| Amazon SQS/SNS | Ack-and-delete (no offset) | No | + +**Key finding:** No major system provides a true global offset across +independent topics. The two that offer unified multi-topic cursors do it +differently: + +- **NATS JetStream** defines a single stream that captures messages from + multiple subjects (via wildcards). The stream has one sequence counter. + Interleaving happens at write time. This is closest to our design. + +- **PubNub** uses a wall-clock nanosecond timestamp as the cursor, so a single + timetoken naturally spans channels. The tradeoff is timestamp-based ordering + rather than sequence-based. + +Every other system requires the consumer to maintain independent cursors per +topic/partition/stream. + +## Options Considered + +### Option A: Per-topic item count as cursor + +The subscriber's cursor represents "I've seen N items matching my filter." The +workflow translates that back to a global log position internally. + +- **Pro:** Zero information leakage. Total ordering preserved internally. +- **Con:** Resume requires translating per-topic offset → global log position. + Either O(n) scan on every poll, or a per-topic index that adds state to + manage through continue-as-new. Also, the cursor is coupled to the topic + filter — a cursor from `subscribe(["events"])` is meaningless if you later + call `subscribe(["events", "status"])`. + +### Option B: Opaque cursor wrapping the global offset + +Cursor is typed as `str`, documented as opaque. 
Internally contains the global +offset. + +- **Pro:** Zero internal complexity. O(1) resume. Cursor works regardless of + topic filter changes. +- **Con:** Information leakage remains observable to anyone who inspects cursor + values across polls. "Opaque" is a social contract, not a technical one. + Gaps in the underlying numbers are still visible. + +### Option C: Encrypted/HMAC'd global offset + +Same as B but cryptographically opaque. + +- **Pro:** Leakage is technically unobservable. +- **Con:** Requires a stable key across continue-as-new. Introduces crypto into + workflow code (determinism concerns). Complexity disproportionate to the + threat model — the subscriber already has access to its own data. + +### Option D: Per-topic offsets everywhere + +Separate log per topic. Each topic has its own 0-based sequence. + +- **Pro:** No leakage by construction. Simplest mental model per topic. +- **Con:** Loses total cross-topic ordering. Multi-topic subscription requires + merging N streams with no defined interleaving. More internal state. More + complex continue-as-new serialization. + +### Option E: Accept the leakage + +Keep global offsets exposed as-is (original design). + +- **Pro:** Simplest implementation. Offset = list index. +- **Con:** The information leakage identified above. + +### Option F: Per-topic offsets with cursor hints + +Per-topic offsets on the wire, single global log internally, opaque cursors +carrying a global position hint for efficient resume. + +- **Pro:** Zero information leakage. Global insertion order preserved. Efficient + resume via hints. Graceful degradation if hints are stale. +- **Con:** Cursor parsing/formatting logic. `topic_counts` dict that survives + continue-as-new. Multi-cursor alignment algorithm. Cursors are per-topic, + not portable across filter changes. Complexity unjustified for expected log + sizes (thousands of items where a filtered slice is microseconds). 
+ +### Summary + +| | Leakage | Ordering | Resume cost | Complexity | Cursor portability | +|---|---|---|---|---|---| +| A. Per-topic count | None | Preserved | O(n) or extra state | Medium | Coupled to filter | +| B. Opaque global | Observable | Preserved | O(1) | Minimal | Filter-independent | +| C. Encrypted global | None | Preserved | O(1) | High | Filter-independent | +| D. Per-topic lists | None | **Lost** | O(1) | High | N/A | +| E. Accept it | Yes | Preserved | O(1) | None | Filter-independent | +| F. Per-topic + hints | None | Preserved | O(new items) | Medium-High | Per-topic only | + +## Design Decision: Global offsets with BFF-layer containment + +We evaluated per-topic offset models (Options A, D, F) and concluded that the +complexity is not justified. The information leakage concern is real but is +better addressed at the trust boundary (the BFF) than in the pub/sub API itself. + +### Why not per-topic offsets? + +The subscriber in our architecture is the BFF — trusted server-side code that +could just as easily subscribe to all topics. The threat model for information +leakage assumes untrusted multi-tenant subscribers (Kafka's world: separate +consumers for separate services). That does not apply to workflow-scoped +pub/sub, where one workflow serves one subscriber through a server-side proxy. + +Per-topic cursors (Option F) also sacrifice cursor portability. A global offset +is a stream position that works regardless of which topics you filter on. +Changing your topic filter does not invalidate your cursor. Per-topic cursors +are coupled to the filter — you need a separate cursor per topic, and adding a +topic to your subscription requires starting that topic from the beginning. + +### Why not just accept the leakage (Option E)? + +We accept the leakage **within the pub/sub API** (between workflow and BFF) but +contain it there. The global offset must not leak to the end client (browser). 
+The BFF is the trust boundary: it consumes global offsets from the workflow and +presents a clean, opaque interface to the browser. + +### The NATS JetStream model + +Our design follows the NATS JetStream model: one stream, multiple subjects, one +sequence counter. The industry survey identified this as the closest analogue, +and we adopt it directly. Topics are labels for server-side filtering, not +independent streams with independent cursors. + +### Information leakage containment at the BFF + +The BFF assigns its own gapless sequence numbers to SSE events using the +standard SSE `id` field. The browser sees `id: 1`, `id: 2`, `id: 3` — no gaps, +no global offsets, no information about other topics. + +On reconnect, the browser sends `Last-Event-ID` (built into the SSE spec). The +BFF maps that back to a global offset internally and resumes the subscription. + +This keeps: +- The **workflow API** simple (global offsets, single integer cursor) +- The **browser API** clean (SSE event IDs, no workflow internals) +- The **mapping** where it belongs (the BFF, which is the trust boundary) + +### Final design + +**Global offsets internally and on the pub/sub wire. Single append-only log. +BFF contains the leakage by assigning SSE event IDs at the trust boundary.** + +### Wire types + +```python +@dataclass +class PubSubItem: + topic: str + data: bytes + +@dataclass +class PollInput: + topics: list[str] = field(default_factory=list) + from_offset: int = 0 + timeout: float = 300.0 + +@dataclass +class PollResult: + items: list[PubSubItem] + next_offset: int = 0 +``` + +`PubSubItem` does not carry an offset. The global offset is an internal detail +exposed only through `PollResult.next_offset` and the `get_offset()` query. + +### `get_offset()` remains public + +The `__pubsub_offset` query returns the current log length (next offset). 
This +is essential for the "snapshot the watermark, then subscribe from there" pattern +used by the BFF: + +```python +start_offset = await pubsub.get_offset() # capture position before starting work +# ... start the agent turn ... +async for item in pubsub.subscribe(topics=["events"], from_offset=start_offset): + yield sse_event(item) +``` + +### Internal state + +```python +self._pubsub_log: list[PubSubItem] # single ordered log, all topics +self._base_offset: int = 0 # global offset of log[0] +``` + +The `base_offset` is 0 today. It exists to support future log truncation: when +a prefix of the log is discarded (e.g., after continue-as-new compaction), the +base offset advances so that global offsets remain monotonic across the +workflow's lifetime. All log access uses `self._pubsub_log[offset - self._base_offset]`. +If `offset < self._base_offset`, the subscriber has fallen behind the +truncation point — this is an error. + +Log truncation and compaction are deferred to a future design iteration. Until +then, the log grows without bound and `base_offset` remains 0. + +### Poll algorithm + +Given `from_offset = 4702`: + +1. Compute log index: `start = from_offset - self._base_offset`. +2. If `start < 0`, the subscriber fell behind truncation — raise error. +3. Slice: `self._pubsub_log[start:]`. +4. Filter to requested topics (if any). +5. Return filtered items plus `next_offset = self._base_offset + len(self._pubsub_log)`. + +**Efficiency:** O(new items since last poll). The global offset points directly +to where the last poll left off. No scanning, no alignment, no cursor parsing. + +### Continue-as-new state + +```python +@dataclass +class PubSubState: + log: list[PubSubItem] = field(default_factory=list) + base_offset: int = 0 +``` + +The full log is carried through continue-as-new. Truncation (discarding a +prefix and advancing `base_offset`) is deferred to a future iteration. 
+ +### Properties + +- **No leakage to end clients.** Global offsets stay between workflow and BFF. + The browser sees SSE event IDs assigned by the BFF. +- **Global insertion order preserved.** Poll responses return items in the order + they were published, across all requested topics. +- **Efficient resume.** O(new items) — the offset points directly to the + resume position. +- **Cursor portability.** The global offset works regardless of topic filter. + Change your topic filter without invalidating your cursor. +- **Simple internal state.** One list, one integer. No auxiliary data structures, + no per-topic indices, no cursor parsing. +- **Truncation-ready.** `base_offset` supports future log prefix removal + without changing the offset model or the external API. + +## Relationship to Other Addenda + +The [continue-as-new addendum](./DESIGN-ADDENDUM-CAN.md) remains valid. The +CAN state shape is `PubSubState` with `log` and `base_offset`. The +drain/validator/follow-CAN-chain mechanisms are unaffected. diff --git a/temporalio/contrib/pubsub/DESIGN-v2.md b/temporalio/contrib/pubsub/DESIGN-v2.md new file mode 100644 index 000000000..0a5739d01 --- /dev/null +++ b/temporalio/contrib/pubsub/DESIGN-v2.md @@ -0,0 +1,634 @@ +# Temporal Workflow Pub/Sub — Design Document v2 + +Consolidated design document reflecting the current implementation. +Supersedes [DESIGN.md](./DESIGN.md) and its addenda +([CAN](./DESIGN-ADDENDUM-CAN.md), [Topics](./DESIGN-ADDENDUM-TOPICS.md), +[Dedup](./DESIGN-ADDENDUM-DEDUP.md)), which are preserved as historical +records of the design exploration. + +## Overview + +A reusable pub/sub module for Temporal workflows. The workflow acts as the +message broker — it holds an append-only log of `(topic, data)` entries. +External clients (activities, starters, other services) publish and subscribe +through the workflow handle using Temporal primitives (signals, updates, +queries). 
+ +The module ships as `temporalio.contrib.pubsub` in the Python SDK and is +designed to be cross-language compatible. Payloads are opaque byte strings — +the workflow does not interpret them. + +## Architecture + +``` + ┌──────────────────────────────────┐ + │ Temporal Workflow │ + │ (PubSubMixin) │ + │ │ + │ ┌─────────────────────────────┐ │ + │ │ Append-only log │ │ + │ │ [(topic, data), ...] │ │ + │ │ base_offset: int │ │ + │ │ publisher_sequences: {} │ │ + │ └─────────────────────────────┘ │ + │ │ + signal ──────────►│ __pubsub_publish (with dedup) │ + update ──────────►│ __pubsub_poll (long-poll) │◄── subscribe() + query ──────────►│ __pubsub_offset │ + │ │ + │ publish() ── workflow-side │ + └──────────────────────────────────┘ + │ + │ continue-as-new + ▼ + ┌──────────────────────────────────┐ + │ PubSubState carries: │ + │ log, base_offset, │ + │ publisher_sequences │ + └──────────────────────────────────┘ +``` + +## API Surface + +### Workflow side — `PubSubMixin` + +A mixin class that adds signal, update, and query handlers to any workflow. + +```python +from temporalio import workflow +from temporalio.contrib.pubsub import PubSubMixin + +@workflow.defn +class MyWorkflow(PubSubMixin): + @workflow.init + def __init__(self, input: MyInput) -> None: + self.init_pubsub() + + @workflow.run + async def run(self, input: MyInput) -> None: + self.publish("status", b"started") + await do_work() + self.publish("status", b"done") +``` + +Call `init_pubsub()` in `__init__` for fresh workflows. When accepting +continue-as-new state, call it in `run()` with the `prior_state` argument +(see [Continue-as-New](#continue-as-new)). + +| Method / Handler | Kind | Description | +|---|---|---| +| `init_pubsub(prior_state=None)` | instance method | Initialize internal state. Must be called before use. | +| `publish(topic, data)` | instance method | Append to the log from workflow code. | +| `get_pubsub_state(publisher_ttl=900)` | instance method | Snapshot for CAN. 
Prunes dedup entries older than TTL. | +| `drain_pubsub()` | instance method | Unblock polls and reject new ones for CAN. | +| `truncate_pubsub(up_to_offset)` | instance method | Discard log entries before offset. | +| `__pubsub_publish` | `@workflow.signal` | Receives publications from external clients (with dedup). | +| `__pubsub_poll` | `@workflow.update` | Long-poll subscription: blocks until new items or drain. | +| `__pubsub_offset` | `@workflow.query` | Returns the current global offset. | + +### Client side — `PubSubClient` + +Used by activities, starters, and any code with a workflow handle. + +```python +from temporalio.contrib.pubsub import PubSubClient + +# Preferred: factory method (enables CAN following + activity auto-detect) +client = PubSubClient.create(temporal_client, workflow_id) + +# --- Publishing (with batching) --- +async with client: + client.publish("events", b'{"type":"TEXT_DELTA","delta":"hello"}') + client.publish("events", b'{"type":"TEXT_DELTA","delta":" world"}') + client.publish("events", b'{"type":"TEXT_COMPLETE"}', priority=True) + +# --- Subscribing --- +async for item in client.subscribe(["events"], from_offset=0): + print(item.topic, item.data) + if is_done(item): + break +``` + +| Method | Description | +|---|---| +| `PubSubClient.create(client?, wf_id?)` | Factory (preferred). Auto-detects activity context if args omitted. | +| `PubSubClient(handle)` | From handle directly (no CAN following). | +| `publish(topic, data, priority=False)` | Buffer a message. Priority triggers immediate flush (fire-and-forget). | +| `subscribe(topics, from_offset, poll_cooldown=0.1)` | Async iterator. Always follows CAN chains when created via `create`. | +| `get_offset()` | Query current global offset. | + +Use as `async with` for batched publishing with automatic flush on exit. +There is no public `flush()` method — use `priority=True` on `publish()` +for immediate delivery, or rely on the background flusher and context +manager exit flush. 
+ +#### Activity convenience + +When called from within an activity, `client` and `workflow_id` can be +omitted from `create()` — they are inferred from the activity context: + +```python +@activity.defn +async def stream_events() -> None: + client = PubSubClient.create(batch_interval=2.0) + async with client: + for chunk in generate_chunks(): + client.publish("events", chunk) + activity.heartbeat() +``` + +## Data Types + +```python +@dataclass +class PubSubItem: + topic: str # Topic string + data: bytes # Opaque payload + +@dataclass +class PublishEntry: + topic: str + data: bytes + +@dataclass +class PublishInput: + items: list[PublishEntry] + publisher_id: str = "" # For exactly-once dedup + sequence: int = 0 # Monotonic per publisher + +@dataclass +class PollInput: + topics: list[str] # Filter (empty = all) + from_offset: int = 0 # Global offset to resume from + +@dataclass +class PollResult: + items: list[PubSubItem] + next_offset: int = 0 # Offset for next poll + +@dataclass +class PubSubState: + log: list[PubSubItem] = field(default_factory=list) + base_offset: int = 0 + publisher_sequences: dict[str, int] = field(default_factory=dict) + publisher_last_seen: dict[str, float] = field(default_factory=dict) # For TTL pruning +``` + +`PubSubItem` does not carry an offset field. The global offset is derived +from the item's position in the log plus `base_offset`. It is exposed only +through `PollResult.next_offset` and the `__pubsub_offset` query. + +The containing workflow input must type the field as `PubSubState | None`, +not `Any` — `Any`-typed fields deserialize as plain dicts, losing the type. + +## Design Decisions + +### 1. Topics are plain strings, no hierarchy + +Topics are exact-match strings. No prefix matching, no wildcards. A subscriber +provides a list of topic strings to filter on; an empty list means "all topics." + +### 2. Items are opaque byte strings + +The workflow does not interpret payloads. This enables cross-language +compatibility. 
The pub/sub layer is transport; application semantics belong +in the application. + +### 3. Global offsets, NATS JetStream model + +Every entry gets a global offset from a single counter. Subscribers filter by +topic but advance through the global offset space. + +We surveyed offset models across Kafka, Redis Streams, NATS JetStream, PubNub, +Google Pub/Sub, RabbitMQ Streams, and Amazon SQS/SNS. No major system provides +a true global offset across independent topics. The two closest: + +- **NATS JetStream**: one stream captures multiple subjects via wildcards, with + a single sequence counter. This is our model. +- **PubNub**: wall-clock nanosecond timestamp as cursor across channels. + +We evaluated six alternatives for handling the information leakage that global +offsets create (a single-topic subscriber can infer other-topic activity from +gaps): per-topic counts, opaque cursors, encrypted cursors, per-topic lists, +per-topic offsets with cursor hints, and accepting the leakage. See +[DESIGN-ADDENDUM-TOPICS.md](./DESIGN-ADDENDUM-TOPICS.md) for the full +analysis. + +**Decision:** Global offsets are the right choice for workflow-scoped pub/sub. + +**Why not per-topic offsets?** The most sophisticated alternative — per-topic +offsets with opaque cursors carrying global position hints (Option F in the +addendum) — was rejected for three reasons: + +1. **The threat model doesn't apply.** Information leakage assumes untrusted + multi-tenant subscribers who shouldn't learn about each other's traffic + volumes. That's Kafka's world — separate consumers for separate services. + In workflow-scoped pub/sub, the subscriber is the BFF: trusted server-side + code that could just as easily subscribe to all topics. + +2. **Cursor portability.** A global offset is a stream position that works + regardless of which topics you filter on. You can subscribe to `["events"]`, + then later subscribe to `["events", "thinking"]` with the same offset. 
+ Per-topic cursors are coupled to the filter — you need a separate cursor per + topic, and adding a topic to your subscription requires starting it from the + beginning. + +3. **Unjustified complexity.** Per-topic cursors require cursor + parsing/formatting, a `topic_counts` dict that survives continue-as-new, a + multi-cursor alignment algorithm, and stale-hint fallback paths. For log + sizes of thousands of items where a filtered slice is microseconds, this + machinery adds cost without measurable benefit. + +**Leakage is contained at the BFF trust boundary.** The global offset stays +between workflow and BFF. The BFF assigns its own gapless SSE event IDs to the +browser. The global offset never reaches the end client. See +[Information Leakage and the BFF](#information-leakage-and-the-bff) for the +full mechanism. + +### 4. No topic creation + +Topics are implicit. Publishing to a topic creates it. Subscribing to a +nonexistent topic returns no items and waits for new ones. + +### 5. Priority forces flush, does not reorder + +`priority=True` causes the client to immediately flush its buffer. It does NOT +reorder items — the priority item appears in its natural position after any +previously-buffered items. The purpose is latency-sensitive delivery, not +importance ranking. + +### 6. Session ordering + +Publications from a single client are ordered. This relies on two Temporal +guarantees: (1) signals sent sequentially from the same client appear in +workflow history in send order, and (2) signal handlers are invoked in +history order. The `PubSubClient` flush lock ensures signals are never in +flight concurrently, so both guarantees apply. + +Concurrent publishers get a total order in the log (the workflow serializes +all signal processing), but the interleaving is nondeterministic — it depends +on arrival order at the server. Per-publisher ordering is preserved. This is +formally verified as `OrderPreservedPerPublisher` in `PubSubDedupTTL.tla`. 
+ +Once items are in the log, their order is stable — reads are repeatable. + +### 7. Batching is built into the client + +`PubSubClient` includes a Nagle-like batcher (buffer + timer). The async +context manager starts a background flush task; exiting cancels it and does a +final flush. Batching amortizes Temporal signal overhead. + +Parameters: +- `batch_interval` (default 2.0s): timer between automatic flushes. +- `max_batch_size` (optional): auto-flush when buffer reaches this size. + +### 8. Subscription is poll-based, exposed as async iterator + +The primitive is `__pubsub_poll` (a Temporal update with `wait_condition`). +`subscribe()` wraps this in an `AsyncIterator` with a configurable +`poll_interval` (default 0.1s) to rate-limit polls. + +Temporal has no server-push to external clients. Updates with `wait_condition` +are the closest thing — the workflow blocks until data is available. + +**Poll efficiency.** The poll slices `self._pubsub_log[from_offset - base_offset:]` +and filters by topic. The common case — single topic, continuing from last +poll — is O(new items since last poll). The global offset points directly to +the resume position with no scanning or cursor alignment. Multi-topic polls +are the same cost: one slice, one filter pass. The worst case is a poll from +offset 0 (full log scan), which only happens on first connection or after the +subscriber falls behind. + +### 9. Workflow can publish but should not subscribe + +Workflow code can call `self.publish()` directly — this is deterministic. +Reading from the log within workflow code is possible but breaks the +failure-free abstraction because external publishers send data via signals +(non-deterministic inputs), and branching on signal content creates +replay-sensitive code paths. + +### 10. `base_offset` for future truncation + +The log carries a `base_offset` (0 today). All offset arithmetic uses +`offset - base_offset` to index into the log. 
This supports future log +truncation: discard a prefix of consumed entries, advance `base_offset`, +and global offsets remain monotonic. If `offset < base_offset`, the +subscriber has fallen behind truncation — the poll raises an error. + +Truncation is deferred to a future iteration. Until then, the log grows +without bound within a run and is compacted only through continue-as-new. + +### 11. No timeout on long-poll + +`wait_condition` in the poll handler has no timeout. The poll blocks +indefinitely until one of three things happens: + +1. **New data arrives** — the `len(log) > offset` condition fires. +2. **Draining for continue-as-new** — `drain_pubsub()` sets the flag. +3. **Client disconnects** — the BFF drops the SSE connection, cancels the + update RPC, and the handler becomes an inert coroutine cleaned up at + the next drain cycle. + +A previous design used a 5-minute timeout as a defensive "don't block +forever" mechanism. This was removed because: + +- **It adds unnecessary history events.** Every poll creates a `TimerStarted` + event. For a streaming session doing hundreds of polls, this doubles the + history event count and accelerates approach to the ~50K event CAN threshold. +- **The drain mechanism already handles cleanup.** `drain_pubsub()` unblocks + all waiting polls, and the update validator rejects new polls, so + `all_handlers_finished()` converges without timers. +- **Zombie polls are harmless.** If a client crashes without cancelling, its + poll handler is just an in-memory coroutine waiting on a condition. It + consumes no Temporal actions and is cleaned up at the next CAN cycle. + +## Exactly-Once Publish Delivery + +External publishers get exactly-once delivery through publisher ID + sequence +number deduplication, following the Kafka producer model. + +### Problem + +`flush()` sends items via a Temporal signal. 
If the signal call raises after +the server accepted it (e.g., network timeout on the response), the client +cannot distinguish delivered from not-delivered. Without dedup, the client +must choose between at-most-once (data loss) and at-least-once (silent +duplication). + +### Solution + +Each `PubSubClient` instance generates a UUID (`publisher_id`) on creation. +Each `flush()` increments a monotonic `sequence` counter. The signal payload +includes both. The workflow tracks the highest seen sequence per publisher in +`_publisher_sequences: dict[str, int]` and rejects any signal with +`sequence <= last_seen`. + +``` +Client Workflow + │ │ + │ signal(publisher_id, seq=1) │ + │───────────────────────────────────►│ seq 1 > 0 → accept, record seq=1 + │ │ + │ signal(publisher_id, seq=1) │ (retry after timeout) + │───────────────────────────────────►│ seq 1 <= 1 → reject (duplicate) + │ │ + │ signal(publisher_id, seq=2) │ + │───────────────────────────────────►│ seq 2 > 1 → accept, record seq=2 +``` + +### Client-side flush (TLA+-verified algorithm) + +The flush algorithm has been formally verified using TLA+ model checking. +See `verification/PROOF.md` for the full correctness proof and +`verification/PubSubDedup.tla` for the spec. 
+
+```python
+async def _flush(self) -> None:
+    async with self._flush_lock:
+        if self._pending is not None:
+            # Retry failed batch with same sequence
+            batch = self._pending
+            seq = self._pending_seq
+        elif self._buffer:
+            # New batch
+            seq = self._sequence + 1
+            batch = self._buffer
+            self._buffer = []
+            self._pending = batch
+            self._pending_seq = seq
+        else:
+            return
+        try:
+            await self._handle.signal(
+                "__pubsub_publish",
+                PublishInput(items=batch, publisher_id=self._publisher_id,
+                    sequence=seq),
+            )
+            self._sequence = seq  # advance confirmed sequence
+            self._pending = None  # clear pending
+        except Exception:
+            # pending stays for retry
+            raise
+```
+
+- **Separate pending from buffer**: failed batches stay in `_pending`, not
+  restored to `_buffer`. New `publish()` calls during retry go to the fresh
+  buffer. This prevents the data-loss bug where items would be merged into a
+  retry batch under a different sequence number.
+- **Retry with same sequence**: on failure, the next `_flush()` retries the
+  same `_pending` with the same `_pending_seq`. If the signal was delivered
+  but the client saw an error, the workflow deduplicates the retry.
+- **Sequence advances only on success**: `_sequence` (confirmed) is updated
+  only after the signal call returns without error.
+- **Lock for coalescing**: concurrent `_flush()` callers queue on the lock.
+- **max_retry_duration**: if set, the client gives up retrying after this
+  duration and raises `TimeoutError`. Must be less than the workflow's
+  `publisher_ttl` to preserve exactly-once guarantees.
+
+### Dedup state and TTL pruning
+
+`publisher_sequences` is `dict[str, int]` — bounded by number of publishers
+(typically 1-2), not number of flushes. Carried through continue-as-new in
+`PubSubState`. If `publisher_id` is empty (workflow-internal publish or legacy
+client), dedup is skipped.
+
+`publisher_last_seen` tracks the last `workflow.time()` each publisher was
+seen. 
During `get_pubsub_state(publisher_ttl=900)`, entries older than TTL +are pruned to bound memory across long-lived workflow chains. + +**Safety constraint**: `publisher_ttl` must exceed the client's +`max_retry_duration`. If a publisher's dedup entry is pruned while it still +has a pending retry, the retry could be accepted as new, creating duplicates. +This is formally verified in `verification/PubSubDedupTTL.tla` — TLC finds +the counterexample for unsafe pruning and confirms safe pruning preserves +NoDuplicates. + +## Continue-as-New + +### Problem + +The pub/sub mixin accumulates workflow history through signals (each +`__pubsub_publish`) and updates (each `__pubsub_poll` response). Over a +streaming session, history grows toward the ~50K event threshold. CAN resets +the history while carrying the canonical log copy forward. + +### State + +```python +@dataclass +class PubSubState: + log: list[PubSubItem] = field(default_factory=list) + base_offset: int = 0 + publisher_sequences: dict[str, int] = field(default_factory=dict) + publisher_last_seen: dict[str, float] = field(default_factory=dict) +``` + +`init_pubsub(prior_state)` restores all four fields. `get_pubsub_state()` +snapshots them. + +### Draining + +A long-poll `__pubsub_poll` blocks indefinitely until new data arrives. To +allow CAN to proceed, draining uses two mechanisms: + +1. **`drain_pubsub()`** sets a flag that unblocks all waiting poll handlers + (the `or self._pubsub_draining` clause in `wait_condition`). +2. **Update validator** rejects new polls when draining, so no new handlers + start and `all_handlers_finished()` stabilizes. + +```python +# CAN sequence in the parent workflow: +self.drain_pubsub() +await workflow.wait_condition(workflow.all_handlers_finished) +workflow.continue_as_new(args=[WorkflowInput( + pubsub_state=self.get_pubsub_state(), +)]) +``` + +### Client-side CAN following + +`subscribe()` always follows CAN chains when the client was created via +`for_workflow()`. 
When a poll fails with +`WorkflowUpdateRPCTimeoutOrCancelledError`, the client calls `describe()` on +the handle. If the status is `CONTINUED_AS_NEW`, it gets a fresh handle for +the same workflow ID (targeting the latest run) and retries the poll from the +same offset. + +```python +async def _follow_continue_as_new(self) -> bool: + if self._client is None: + return False + try: + desc = await self._handle.describe() + except Exception: + return False + if desc.status == WorkflowExecutionStatus.CONTINUED_AS_NEW: + self._handle = self._client.get_workflow_handle(self._workflow_id) + return True + return False +``` + +The `describe()` check prevents infinite loops: if the workflow completed or +failed (not CAN), the subscriber stops instead of retrying. + +### Offset continuity + +Since the full log is carried forward: + +- Pre-CAN: offsets `0..N-1`, log length N. +- Post-CAN: `init_pubsub(prior_state)` restores N items. New appends start + at offset N. +- A subscriber at offset K resumes seamlessly against the new run. + +### Edge cases + +**Payload size limit.** The full log in CAN input could approach Temporal's +2 MB limit for very long sessions. Mitigation: truncation (discarding consumed +entries before CAN) is the natural extension, supported by `base_offset`. + +**Signal delivery during CAN.** A publisher sending mid-CAN may get errors if +its handle is pinned to the old run. The workflow should ensure activities +complete before triggering CAN. + +**Concurrent subscribers.** Each maintains its own offset. Sharing a +`PubSubClient` across concurrent `subscribe()` calls is safe. + +## Information Leakage and the BFF + +Global offsets leak cross-topic activity (a single-topic subscriber sees gaps). +This is acceptable within the pub/sub API because the subscriber is the BFF — +trusted server-side code. The leakage must not reach the end client (browser). 
+ +### The problem + +If the BFF forwarded `PollResult.next_offset` to the browser (e.g., as an SSE +reconnection cursor), the browser could observe gaps and infer activity on +topics it is not subscribed to. Even if the offset is "opaque," a monotonic +integer with gaps is trivially inspectable. + +### Options considered + +We evaluated four approaches for browser-side reconnection: + +1. **BFF tracks the cursor server-side.** The BFF maintains a per-session + `session_id → last_offset` mapping. The browser reconnects with just the + session ID. On BFF restart, cursors are lost — fall back to replaying from + turn start. + +2. **Opaque token from the BFF.** The BFF wraps the global offset in an + encoded or encrypted token. The browser passes it back on reconnect. + `base64(offset)` is trivially reversible (security theater); real encryption + needs a key and adds a layer for marginal benefit over option 1. + +3. **BFF assigns SSE event IDs with `Last-Event-ID`.** The BFF emits SSE + events with `id: 1`, `id: 2`, `id: 3` (a BFF-local counter per stream). + On reconnect, the browser sends `Last-Event-ID` (built into the SSE spec). + The BFF maps that back to a global offset internally. + +4. **No mid-stream resume.** Browser reconnects, BFF replays from start of + the current turn. Frontend deduplicates. Simplest, but replays more data + than necessary. + +### Decision: SSE event IDs (option 3) + +The BFF assigns gapless integer IDs to SSE events and maintains a small +mapping from SSE event index to global offset. The browser never sees the +workflow's offset — it sees the BFF's event numbering. + +```python +sse_id = 0 +sse_id_to_offset: dict[int, int] = {} + +start_offset = await pubsub.get_offset() +async for item in pubsub.subscribe(topics=["events"], from_offset=start_offset): + sse_id += 1 + sse_id_to_offset[sse_id] = item_global_offset + yield f"id: {sse_id}\ndata: {item.data}\n\n" +``` + +On reconnect, the browser sends `Last-Event-ID: 47`. 
The BFF looks up the +corresponding global offset and resumes the subscription from there. + +The BFF is already per-session and stateful (it holds the SSE connection). +The `sse_id → global_offset` mapping is negligible additional state. On BFF +restart, the mapping is lost — fall back to replaying from turn start (option +4), which is acceptable because agent turns produce modest event volumes and +the frontend reducer is idempotent. + +This uses the SSE spec as designed: `Last-Event-ID` exists for exactly this +reconnection pattern. + +## Cross-Language Protocol + +Any Temporal client in any language can interact with a pub/sub workflow by: + +1. **Publishing**: Signal `__pubsub_publish` with `PublishInput` payload +2. **Subscribing**: Execute update `__pubsub_poll` with `PollInput`, loop +3. **Checking offset**: Query `__pubsub_offset` + +Double-underscore prefix on handler names avoids collisions with application +signals/updates. The payload types are simple composites of strings, bytes, +and ints — representable in every Temporal SDK's default data converter. 
+ +## File Layout + +``` +temporalio/contrib/pubsub/ +├── __init__.py # Public API exports +├── _mixin.py # PubSubMixin (workflow-side) +├── _client.py # PubSubClient (external-side) +├── _types.py # Shared data types +├── README.md # Usage documentation +├── DESIGN-v2.md # This document +├── DESIGN.md # Historical: original design +├── DESIGN-ADDENDUM-CAN.md # Historical: CAN exploration +├── DESIGN-ADDENDUM-TOPICS.md # Historical: offset model exploration +├── DESIGN-ADDENDUM-DEDUP.md # Historical: dedup exploration +└── verification/ # TLA+ formal verification + ├── README.md # Overview and running instructions + ├── PROOF.md # Full correctness proof + ├── PubSubDedup.tla # Correct single-publisher protocol + ├── PubSubDedupInductive.tla # Inductive invariant (unbounded proof) + ├── PubSubDedupTTL.tla # Multi-publisher + TTL pruning + └── PubSubDedupBroken.tla # Old (broken) algorithm — counterexample +``` diff --git a/temporalio/contrib/pubsub/DESIGN.md b/temporalio/contrib/pubsub/DESIGN.md new file mode 100644 index 000000000..da5914664 --- /dev/null +++ b/temporalio/contrib/pubsub/DESIGN.md @@ -0,0 +1,299 @@ +# Temporal Workflow Pub/Sub — Design Document + +## Overview + +A reusable pub/sub module for Temporal workflows. The workflow acts as the message +broker — it holds an append-only log of `(offset, topic, data)` entries. External +clients (activities, starters, other services) publish and subscribe through the +workflow handle using Temporal primitives (signals, updates, queries). + +The module ships as `temporalio.contrib.pubsub` in the Python SDK and is designed +to be cross-language compatible. Payloads are opaque byte strings — the workflow +does not interpret them. + +## API Surface + +### Workflow side — `PubSubMixin` + +A mixin class that adds signal, update, and query handlers to any workflow. 
+ +```python +from temporalio.contrib.pubsub import PubSubMixin + +@workflow.defn +class MyWorkflow(PubSubMixin): + @workflow.run + async def run(self, input: MyInput) -> MyOutput: + self.init_pubsub() + # The workflow is now a pub/sub broker. + # It can also publish directly: + self.publish("status", b"started") + await do_work() + self.publish("status", b"done") +``` + +`PubSubMixin` provides: + +| Method / Handler | Kind | Description | +|---|---|---| +| `init_pubsub()` | instance method | Initialize internal state. Must be called before use. | +| `publish(topic, data, priority=False)` | instance method | Append to the log from workflow code. | +| `__pubsub_publish` | `@workflow.signal` | Receives publications from external clients. | +| `__pubsub_poll` | `@workflow.update` | Long-poll subscription: blocks until new items or completion. | +| `__pubsub_offset` | `@workflow.query` | Returns the current log length (next offset). | + +Double-underscore prefix on handler names avoids collisions with application signals/updates. + +### Client side — `PubSubClient` + +Used by activities, starters, and any code with a workflow handle. + +```python +from temporalio.contrib.pubsub import PubSubClient + +client = PubSubClient(workflow_handle, batch_interval=2.0) + +# --- Publishing --- +async with client: + client.publish("events", b'{"type":"TEXT_DELTA","delta":"hello"}') + client.publish("events", b'{"type":"TEXT_DELTA","delta":" world"}') + client.publish("events", b'{"type":"TEXT_COMPLETE"}', priority=True) + # priority=True forces an immediate flush + # context manager exit flushes remaining buffer + +# --- Subscribing --- +async for item in client.subscribe(["events"], from_offset=0): + print(item.offset, item.topic, item.data) + if is_done(item): + break +``` + +### `PubSubClient` details + +| Method | Description | +|---|---| +| `publish(topic, data, priority=False)` | Buffer a message. If `priority=True`, flush immediately. 
| +| `flush()` | Send all buffered messages to the workflow via signal. | +| `subscribe(topics, from_offset=0)` | Returns an `AsyncIterator[PubSubItem]`. Internally polls via the `__pubsub_poll` update. | +| `get_offset()` | Query the current log offset. | + +Constructor parameters: + +| Parameter | Default | Description | +|---|---|---| +| `handle` | required | `WorkflowHandle` to the broker workflow. | +| `batch_interval` | `2.0` | Seconds between automatic flushes. | + +The client implements `AsyncContextManager`. Entering starts the background flush +timer; exiting cancels it and does a final flush. + +### Activity convenience + +```python +from temporalio.contrib.pubsub import PubSubClient +from temporalio import activity + +async def get_pubsub_client(**kwargs) -> PubSubClient: + """Create a PubSubClient for the current activity's parent workflow.""" + info = activity.info() + handle = activity.client().get_workflow_handle(info.workflow_id) + return PubSubClient(handle, **kwargs) +``` + +## Data Types + +All types use standard Temporal serialization (default data converter) for +cross-language compatibility. + +```python +@dataclass +class PubSubItem: + offset: int # Global monotonic offset + topic: str # Topic string + data: bytes # Opaque payload + +@dataclass +class PublishInput: + items: list[PublishEntry] + +@dataclass +class PublishEntry: + topic: str + data: bytes + priority: bool = False + +@dataclass +class PollInput: + topics: list[str] # Filter to these topics (empty = all) + from_offset: int # Start reading from this global offset + timeout: float = 300.0 # Server-side wait timeout + +@dataclass +class PollResult: + items: list[PubSubItem] + next_offset: int # Offset for next poll call +``` + +## Design Decisions + +### 1. Topics are plain strings, no hierarchy + +Topics are exact-match strings. No prefix matching, no wildcards. A subscriber +provides a list of topic strings to filter on; an empty list means "all topics." 
+ +**Rationale**: Simplicity. Prefix matching adds implementation complexity and is +rarely needed for the streaming use cases this targets. + +### 2. Items are opaque byte strings + +The workflow does not interpret payloads. This enables cross-language +compatibility — each SDK's client serializes/deserializes in its own language. + +**Rationale**: The pub/sub layer is transport. Application semantics belong in the +application. + +### 3. Global monotonic offsets, not per-topic + +Every entry gets a global offset from a single counter. Subscribers filter by topic +but advance through the global offset space. + +**Rationale**: Simpler implementation. Global ordering means a subscriber to +multiple topics sees a consistent interleaving. The tradeoff is that a +single-topic subscriber may see gaps in offset numbers — but `next_offset` in +`PollResult` handles continuation cleanly. + +### 4. No topic creation + +Topics are implicit. Publishing to a topic creates it. Subscribing to a +nonexistent topic returns no items (and waits for new ones). + +**Rationale**: Eliminates a management API and lifecycle concerns. Matches the +lightweight "just strings" philosophy. + +### 5. Priority forces flush, does not reorder + +Setting `priority=True` on a publish causes the client to immediately flush its +buffer. It does NOT reorder items in the log — the priority item appears in its +natural position after any previously-buffered items. + +**Rationale**: Reordering would break the append-only log invariant and complicate +offset semantics. The purpose of priority is latency-sensitive delivery (e.g., +"thinking complete" events), not importance ranking. + +### 6. Session ordering + +Publications from a single client are ordered. The workflow serializes all signal +processing, so concurrent publishers get a total order (though the interleaving is +nondeterministic). Once items are in the log, their order is stable — reads are +repeatable. + +### 7. 
Batching is built into the client + +The `PubSubClient` includes a Nagle-like batcher (buffer + timer). This is the +same pattern as the existing `EventBatcher` but generalized. Batching amortizes +Temporal signal overhead — instead of one signal per token, a 2-second window +batches hundreds of tokens into a single signal. + +### 8. Subscription is poll-based, exposed as async iterator + +The primitive is `__pubsub_poll` (a Temporal update with `wait_condition`). The +`subscribe()` method wraps this in an `AsyncIterator` that handles polling, +reconnection, and yielding items one at a time. + +**Why poll, not push**: Temporal has no server-push to external clients. Updates +with `wait_condition` are the closest thing — the workflow blocks until data is +available, so the client doesn't busy-wait. + +**Why async iterator**: Idiomatic Python. Matches what users expect from +Kafka consumers, Redis XREAD, NATS subscriptions, etc. + +### 9. Workflow can publish but should not subscribe + +Workflow code can call `self.publish()` directly — this is deterministic (appends +to a list). Reading from the log within workflow code is also possible via +`self._pubsub_log` but breaks the failure-free abstraction because: + +- External publishers send data via signals, which are non-deterministic inputs +- Branching on signal content creates replay-sensitive code paths + +If a workflow needs to react to published data, it should do so in signal handlers, +not by polling its own log. + +### 10. Event retention: full log for workflow lifetime (future: snapshot + truncate) + +For now, the log grows unbounded for the workflow's lifetime. This is acceptable +for the target use cases (streaming agent sessions lasting minutes to hours). + +**Future extension — snapshot + truncate**: + +1. `snapshot(topic)` → serialize current subscriber state as a special log entry +2. `truncate(before_offset)` → discard entries before the offset +3. Offsets remain monotonic (never reset) +4. 
New subscribers start from the snapshot entry +5. Natural integration with `continue_as_new()` — carry the snapshot forward + +This follows the event sourcing pattern (snapshot + event replay) and is analogous +to Kafka's log compaction. We note it here as a planned extension but do not +implement it in v1. + +## Signal / Update / Query Names + +For cross-language interop, the handler names are fixed strings: + +| Handler | Temporal name | Kind | +|---|---|---| +| `__pubsub_publish` | `__pubsub_publish` | signal | +| `__pubsub_poll` | `__pubsub_poll` | update | +| `__pubsub_offset` | `__pubsub_offset` | query | + +Other language SDKs implementing the same protocol must use these exact names. + +## Cross-Language Protocol + +Any Temporal client in any language can interact with a pub/sub workflow by: + +1. **Publishing**: Send signal `__pubsub_publish` with `PublishInput` payload +2. **Subscribing**: Execute update `__pubsub_poll` with `PollInput`, loop +3. **Checking offset**: Query `__pubsub_offset` + +The payload types are simple composites of strings, bytes, ints, and bools — all +representable in every Temporal SDK's default data converter. + +## File Layout + +``` +temporalio/contrib/pubsub/ +├── __init__.py # Public API exports +├── _mixin.py # PubSubMixin (workflow-side) +├── _client.py # PubSubClient (external-side, includes batcher) +├── _types.py # Shared data types +└── README.md # Usage documentation +``` + +## Local Development + +To use the local sdk-python with temporal-streaming-agents-samples: + +```toml +# In temporal-streaming-agents-samples/backend-temporal/pyproject.toml +[tool.uv.sources] +temporalio = { path = "../../../sdk-python", editable = true } +``` + +This requires `maturin develop` to have been run at least once (for the Rust +bridge), but subsequent Python-only changes are reflected immediately. 
+ +## Migration Plan (temporal-streaming-agents-samples) + +The existing streaming code maps directly to the new contrib: + +| Current code | Replaces with | +|---|---| +| `EventBatcher` | `PubSubClient` (with batching) | +| `receive_events` signal | `__pubsub_publish` signal (from mixin) | +| `poll_events` update | `__pubsub_poll` update (from mixin) | +| `get_event_count` query | `__pubsub_offset` query (from mixin) | +| `_event_list` state | `PubSubMixin._pubsub_log` | +| `_get_batcher()` helper | `get_pubsub_client()` or `PubSubClient(handle)` | +| `ActivityEventsInput` | `PublishInput` | +| `PollEventsInput/Result` | `PollInput/PollResult` | diff --git a/temporalio/contrib/pubsub/README.md b/temporalio/contrib/pubsub/README.md new file mode 100644 index 000000000..a18e2024b --- /dev/null +++ b/temporalio/contrib/pubsub/README.md @@ -0,0 +1,165 @@ +# Temporal Workflow Pub/Sub + +Reusable pub/sub for Temporal workflows. The workflow acts as a message broker +with an append-only log. External clients (activities, starters, other services) +publish and subscribe through the workflow handle using Temporal primitives. + +Payloads are base64-encoded byte strings for cross-language compatibility. + +## Quick Start + +### Workflow side + +Add `PubSubMixin` to your workflow and call `init_pubsub()`: + +```python +from temporalio import workflow +from temporalio.contrib.pubsub import PubSubMixin + +@workflow.defn +class MyWorkflow(PubSubMixin): + @workflow.init + def __init__(self, input: MyInput) -> None: + self.init_pubsub() + + @workflow.run + async def run(self, input: MyInput) -> None: + self.publish("status", b"started") + await do_work() + self.publish("status", b"done") +``` + +### Activity side (publishing) + +Use `PubSubClient.for_workflow()` with the async context manager for batched +publishing. 
When called from within an activity, the client and workflow ID
+are inferred automatically:
+
+```python
+from temporalio import activity
+from temporalio.contrib.pubsub import PubSubClient
+
+@activity.defn
+async def stream_events() -> None:
+    client = PubSubClient.create(batch_interval=2.0)
+    async with client:
+        for chunk in generate_chunks():
+            client.publish("events", chunk)
+            activity.heartbeat()
+    # Buffer is flushed automatically on context manager exit
+```
+
+Use `priority=True` to flush immediately for latency-sensitive events:
+
+```python
+client.publish("events", data, priority=True)
+```
+
+### Subscribing
+
+Use `PubSubClient.create()` and the `subscribe()` async iterator:
+
+```python
+from temporalio.contrib.pubsub import PubSubClient
+
+client = PubSubClient.create(temporal_client, workflow_id)
+async for item in client.subscribe(["events"], from_offset=0):
+    print(item.topic, item.data)
+    if is_done(item):
+        break
+```
+
+## Topics
+
+Topics are plain strings with exact matching. No hierarchy or wildcards.
+
+- Publish to one topic at a time
+- Subscribe to a list of topics (empty list = all topics)
+- Publishing to a topic implicitly creates it
+
+## Continue-as-new
+
+Carry pub/sub state across continue-as-new boundaries:
+
+```python
+from dataclasses import dataclass
+from temporalio import workflow
+from temporalio.contrib.pubsub import PubSubMixin, PubSubState
+
+@dataclass
+class WorkflowInput:
+    pubsub_state: PubSubState | None = None
+
+@workflow.defn
+class MyWorkflow(PubSubMixin):
+    @workflow.init
+    def __init__(self, input: WorkflowInput) -> None:
+        self.init_pubsub(prior_state=input.pubsub_state)
+
+    @workflow.run
+    async def run(self, input: WorkflowInput) -> None:
+        # ... do work ... 
+
+        if workflow.info().is_continue_as_new_suggested():
+            self.drain_pubsub()
+            await workflow.wait_condition(workflow.all_handlers_finished)
+            workflow.continue_as_new(args=[WorkflowInput(
+                pubsub_state=self.get_pubsub_state(),
+            )])
+```
+
+`drain_pubsub()` unblocks waiting subscribers and rejects new polls so
+`all_handlers_finished` can stabilize. Subscribers created via
+`PubSubClient.create()` automatically follow continue-as-new chains.
+
+**Important:** Type the pubsub_state field as `PubSubState | None`, not `Any`.
+`Any`-typed fields deserialize as plain dicts, which breaks `init_pubsub()`.
+
+## Exactly-Once Delivery
+
+External publishers (via `PubSubClient`) get exactly-once delivery through
+publisher ID + sequence number deduplication. Each client instance generates
+a unique publisher ID and increments a monotonic sequence on each flush.
+The workflow tracks the highest seen sequence per publisher and rejects
+duplicates. See `DESIGN-ADDENDUM-DEDUP.md` for details.
+
+## API Reference
+
+### PubSubMixin
+
+| Method | Description |
+|---|---|
+| `init_pubsub(prior_state=None)` | Initialize state. Call in `__init__` for fresh workflows, or in `run()` when accepting CAN state. |
+| `publish(topic, data)` | Append to the log from workflow code. |
+| `get_pubsub_state()` | Snapshot for continue-as-new. |
+| `drain_pubsub()` | Unblock polls and reject new ones. |
+
+Handlers added automatically:
+
+| Handler | Kind | Name |
+|---|---|---|
+| Signal | `__pubsub_publish` | Receive external publications (with dedup) |
+| Update | `__pubsub_poll` | Long-poll subscription |
+| Query | `__pubsub_offset` | Current global offset |
+
+### PubSubClient
+
+| Method | Description |
+|---|---|
+| `PubSubClient.create(client, wf_id)` | Factory (preferred). Auto-detects activity context if args omitted. |
+| `PubSubClient(handle)` | From handle (no CAN follow). |
+| `publish(topic, data, priority=False)` | Buffer a message. 
| +| `flush()` | Send buffered messages (with dedup). | +| `subscribe(topics, from_offset, poll_interval=0.1)` | Async iterator. Always follows CAN chains when created via `for_workflow`. | +| `get_offset()` | Query current global offset. | + +Use as `async with` for batched publishing with automatic flush. + +## Cross-Language Protocol + +Any Temporal client can interact with a pub/sub workflow using these +fixed handler names: + +1. **Publish:** Signal `__pubsub_publish` with `PublishInput` +2. **Subscribe:** Update `__pubsub_poll` with `PollInput` -> `PollResult` +3. **Offset:** Query `__pubsub_offset` -> `int` diff --git a/temporalio/contrib/pubsub/__init__.py b/temporalio/contrib/pubsub/__init__.py new file mode 100644 index 000000000..b9978f94a --- /dev/null +++ b/temporalio/contrib/pubsub/__init__.py @@ -0,0 +1,31 @@ +"""Pub/sub support for Temporal workflows. + +This module provides a reusable pub/sub pattern where a workflow acts as a +message broker. External clients (activities, starters, other services) publish +and subscribe through the workflow handle using Temporal primitives. + +Payloads are opaque bytes. Base64 encoding is used on the wire for +cross-language compatibility, but users work with native byte types. +""" + +from temporalio.contrib.pubsub._client import PubSubClient +from temporalio.contrib.pubsub._mixin import PubSubMixin +from temporalio.contrib.pubsub._types import ( + PollInput, + PollResult, + PubSubItem, + PubSubState, + PublishEntry, + PublishInput, +) + +__all__ = [ + "PollInput", + "PollResult", + "PubSubClient", + "PubSubItem", + "PubSubMixin", + "PubSubState", + "PublishEntry", + "PublishInput", +] diff --git a/temporalio/contrib/pubsub/_client.py b/temporalio/contrib/pubsub/_client.py new file mode 100644 index 000000000..c316c005a --- /dev/null +++ b/temporalio/contrib/pubsub/_client.py @@ -0,0 +1,312 @@ +"""External-side pub/sub client. 
+ +Used by activities, starters, and any code with a workflow handle to publish +messages and subscribe to topics on a pub/sub workflow. +""" + +from __future__ import annotations + +import asyncio +import time +import uuid +from collections.abc import AsyncIterator +from typing import Self + +from temporalio import activity +from temporalio.client import ( + Client, + WorkflowExecutionStatus, + WorkflowHandle, + WorkflowUpdateFailedError, + WorkflowUpdateRPCTimeoutOrCancelledError, +) + +from ._types import ( + PollInput, + PollResult, + PubSubItem, + PublishEntry, + PublishInput, + decode_data, + encode_data, +) + + +class PubSubClient: + """Client for publishing to and subscribing from a pub/sub workflow. + + Create via :py:meth:`create` (preferred) or by passing a handle + directly to the constructor. + + For publishing, use as an async context manager to get automatic batching:: + + client = PubSubClient.create(temporal_client, workflow_id) + async with client: + client.publish("events", b"hello") + client.publish("events", b"world", priority=True) + + For subscribing:: + + client = PubSubClient.create(temporal_client, workflow_id) + async for item in client.subscribe(["events"], from_offset=0): + process(item) + """ + + def __init__( + self, + handle: WorkflowHandle, + *, + batch_interval: float = 2.0, + max_batch_size: int | None = None, + max_retry_duration: float = 600.0, + ) -> None: + """Create a pub/sub client from a workflow handle. + + Prefer :py:meth:`create` when you need continue-as-new + following in ``subscribe()``. + + Args: + handle: Workflow handle to the pub/sub workflow. + batch_interval: Seconds between automatic flushes. + max_batch_size: Auto-flush when buffer reaches this size. + max_retry_duration: Maximum seconds to retry a failed flush + before raising TimeoutError. Must be less than the + workflow's ``publisher_ttl`` (default 900s) to preserve + exactly-once delivery. Default: 600s. 
+ """ + self._handle = handle + self._client: Client | None = None + self._workflow_id = handle.id + self._batch_interval = batch_interval + self._max_batch_size = max_batch_size + self._max_retry_duration = max_retry_duration + self._buffer: list[PublishEntry] = [] + self._flush_event = asyncio.Event() + self._flush_task: asyncio.Task[None] | None = None + self._flush_lock = asyncio.Lock() + self._publisher_id: str = uuid.uuid4().hex[:16] + self._sequence: int = 0 + self._pending: list[PublishEntry] | None = None + self._pending_seq: int = 0 + self._pending_since: float | None = None + + @classmethod + def create( + cls, + client: Client | None = None, + workflow_id: str | None = None, + *, + batch_interval: float = 2.0, + max_batch_size: int | None = None, + max_retry_duration: float = 600.0, + ) -> PubSubClient: + """Create a pub/sub client from a Temporal client and workflow ID. + + This is the preferred constructor. It enables continue-as-new + following in ``subscribe()``. + + If called from within an activity, ``client`` and ``workflow_id`` + can be omitted — they are inferred from the activity context. + + Args: + client: Temporal client. If None and in an activity, uses + ``activity.client()``. + workflow_id: ID of the pub/sub workflow. If None and in an + activity, uses the activity's parent workflow ID. + batch_interval: Seconds between automatic flushes. + max_batch_size: Auto-flush when buffer reaches this size. + max_retry_duration: Maximum seconds to retry a failed flush + before raising TimeoutError. Default: 600s. 
+ """ + if client is None or workflow_id is None: + info = activity.info() + if client is None: + client = activity.client() + if workflow_id is None: + wf_id = info.workflow_id + assert wf_id is not None, ( + "activity must be called from within a workflow" + ) + workflow_id = wf_id + handle = client.get_workflow_handle(workflow_id) + instance = cls( + handle, + batch_interval=batch_interval, + max_batch_size=max_batch_size, + max_retry_duration=max_retry_duration, + ) + instance._client = client + return instance + + async def __aenter__(self) -> Self: + self._flush_task = asyncio.create_task(self._run_flusher()) + return self + + async def __aexit__(self, *_exc: object) -> None: + if self._flush_task: + self._flush_task.cancel() + try: + await self._flush_task + except asyncio.CancelledError: + pass + self._flush_task = None + await self._flush() + + def publish(self, topic: str, data: bytes, priority: bool = False) -> None: + """Buffer a message for publishing. + + Args: + topic: Topic string. + data: Opaque byte payload. + priority: If True, wake the flusher to send immediately + (fire-and-forget — does not block the caller). + """ + self._buffer.append(PublishEntry(topic=topic, data=encode_data(data))) + if priority or ( + self._max_batch_size is not None + and len(self._buffer) >= self._max_batch_size + ): + self._flush_event.set() + + async def _flush(self) -> None: + """Send buffered or pending messages to the workflow via signal. + + On failure, the pending batch and sequence are kept for retry. + Only advances the confirmed sequence on success. + """ + async with self._flush_lock: + if self._pending is not None: + # Retry path: check max_retry_duration + if ( + self._pending_since is not None + and time.monotonic() - self._pending_since + > self._max_retry_duration + ): + self._pending = None + self._pending_seq = 0 + self._pending_since = None + raise TimeoutError( + f"Flush retry exceeded max_retry_duration " + f"({self._max_retry_duration}s). 
Pending batch dropped. " + f"If the signal was delivered, items are in the log. " + f"If not, they are lost." + ) + batch = self._pending + seq = self._pending_seq + elif self._buffer: + # New batch path + seq = self._sequence + 1 + batch = self._buffer + self._buffer = [] + self._pending = batch + self._pending_seq = seq + self._pending_since = time.monotonic() + else: + return + + try: + await self._handle.signal( + "__pubsub_publish", + PublishInput( + items=batch, + publisher_id=self._publisher_id, + sequence=seq, + ), + ) + # Success: advance confirmed sequence, clear pending + self._sequence = seq + self._pending = None + self._pending_seq = 0 + self._pending_since = None + except Exception: + # Pending stays set for retry on the next _flush() call + raise + + async def _run_flusher(self) -> None: + """Background task: wait for timer OR priority wakeup, then flush.""" + while True: + try: + await asyncio.wait_for( + self._flush_event.wait(), timeout=self._batch_interval + ) + except asyncio.TimeoutError: + pass + self._flush_event.clear() + await self._flush() + + async def subscribe( + self, + topics: list[str] | None = None, + from_offset: int = 0, + *, + poll_cooldown: float = 0.1, + ) -> AsyncIterator[PubSubItem]: + """Async iterator that polls for new items. + + Automatically follows continue-as-new chains when the client + was created via :py:meth:`create`. + + Args: + topics: Topic filter. None or empty list means all topics. + from_offset: Global offset to start reading from. + poll_cooldown: Minimum seconds between polls to avoid + overwhelming the workflow when items arrive faster than + the poll round-trip. Defaults to 0.1. + + Yields: + PubSubItem for each matching item. 
+ """ + offset = from_offset + while True: + try: + result: PollResult = await self._handle.execute_update( + "__pubsub_poll", + PollInput(topics=topics or [], from_offset=offset), + result_type=PollResult, + ) + except asyncio.CancelledError: + return + except WorkflowUpdateFailedError as e: + if ( + e.cause + and getattr(e.cause, "type", None) == "TruncatedOffset" + ): + # Subscriber fell behind truncation. Retry from offset 0 + # which the mixin treats as "from the beginning of + # whatever exists" (i.e., from base_offset). + offset = 0 + continue + raise + except WorkflowUpdateRPCTimeoutOrCancelledError: + if await self._follow_continue_as_new(): + continue + return + for wire_item in result.items: + yield PubSubItem( + topic=wire_item.topic, + data=decode_data(wire_item.data), + offset=wire_item.offset, + ) + offset = result.next_offset + if poll_cooldown > 0: + await asyncio.sleep(poll_cooldown) + + async def _follow_continue_as_new(self) -> bool: + """Check if the workflow continued-as-new and re-target the handle. + + Returns True if the handle was updated (caller should retry). + """ + if self._client is None: + return False + try: + desc = await self._handle.describe() + except Exception: + return False + if desc.status == WorkflowExecutionStatus.CONTINUED_AS_NEW: + self._handle = self._client.get_workflow_handle(self._workflow_id) + return True + return False + + async def get_offset(self) -> int: + """Query the current global offset (base_offset + log length).""" + return await self._handle.query("__pubsub_offset", result_type=int) diff --git a/temporalio/contrib/pubsub/_mixin.py b/temporalio/contrib/pubsub/_mixin.py new file mode 100644 index 000000000..35f683863 --- /dev/null +++ b/temporalio/contrib/pubsub/_mixin.py @@ -0,0 +1,241 @@ +"""Workflow-side pub/sub mixin. + +Add PubSubMixin as a base class to any workflow to get pub/sub signal, update, +and query handlers. 
+ +Call ``init_pubsub()`` in ``__init__`` for fresh workflows, or in ``run()`` +when accepting ``prior_state`` from continue-as-new arguments. +""" + +from __future__ import annotations + +from temporalio import workflow +from temporalio.exceptions import ApplicationError + +from ._types import ( + PollInput, + PollResult, + PubSubItem, + PubSubState, + PublishInput, + _WireItem, + decode_data, + encode_data, +) + + +class PubSubMixin: + """Mixin that turns a workflow into a pub/sub broker. + + Provides: + - ``publish(topic, data)`` for workflow-side publishing + - ``__pubsub_publish`` signal for external publishing (with dedup) + - ``__pubsub_poll`` update for long-poll subscription + - ``__pubsub_offset`` query for current log length + - ``drain_pubsub()`` / ``get_pubsub_state()`` for continue-as-new + - ``truncate_pubsub(offset)`` for log prefix truncation + """ + + _pubsub_log: list[PubSubItem] + _pubsub_base_offset: int + _pubsub_publisher_sequences: dict[str, int] + _pubsub_publisher_last_seen: dict[str, float] + _pubsub_draining: bool + + def init_pubsub(self, prior_state: PubSubState | None = None) -> None: + """Initialize pub/sub state. + + Args: + prior_state: State carried from a previous run via + ``get_pubsub_state()`` through continue-as-new. Pass None + on the first run. 
+ """ + if prior_state is not None: + self._pubsub_log = [ + PubSubItem(topic=item.topic, data=decode_data(item.data)) + for item in prior_state.log + ] + self._pubsub_base_offset = prior_state.base_offset + self._pubsub_publisher_sequences = dict( + prior_state.publisher_sequences + ) + self._pubsub_publisher_last_seen = dict( + prior_state.publisher_last_seen + ) + else: + self._pubsub_log = [] + self._pubsub_base_offset = 0 + self._pubsub_publisher_sequences = {} + self._pubsub_publisher_last_seen = {} + self._pubsub_draining = False + + def get_pubsub_state( + self, *, publisher_ttl: float = 900.0 + ) -> PubSubState: + """Return a serializable snapshot of pub/sub state for continue-as-new. + + Prunes publisher dedup entries older than ``publisher_ttl`` seconds. + The TTL must exceed the ``max_retry_duration`` of any client that + may still be retrying a failed flush. + + Args: + publisher_ttl: Seconds after which a publisher's dedup entry + is pruned. Default 900 (15 minutes). + """ + self._check_initialized() + now = workflow.time() + + # Determine which publishers to retain. Publishers with timestamps + # are pruned by TTL. Publishers without timestamps (legacy state + # from before publisher_last_seen was added) are always retained + # to avoid silently dropping dedup entries on upgrade. + active_sequences: dict[str, int] = {} + active_last_seen: dict[str, float] = {} + for pid, seq in self._pubsub_publisher_sequences.items(): + ts = self._pubsub_publisher_last_seen.get(pid) + if ts is None or now - ts < publisher_ttl: + active_sequences[pid] = seq + if ts is not None: + active_last_seen[pid] = ts + + return PubSubState( + log=[ + _WireItem(topic=item.topic, data=encode_data(item.data)) + for item in self._pubsub_log + ], + base_offset=self._pubsub_base_offset, + publisher_sequences=active_sequences, + publisher_last_seen=active_last_seen, + ) + + def drain_pubsub(self) -> None: + """Unblock all waiting poll handlers and reject new polls. 
+
+        Call this before ``await workflow.wait_condition(workflow.all_handlers_finished)``
+        and ``workflow.continue_as_new()``.
+        """
+        self._check_initialized()
+        self._pubsub_draining = True
+
+    def truncate_pubsub(self, up_to_offset: int) -> None:
+        """Discard log entries before ``up_to_offset``.
+
+        After truncation, polls requesting an offset before the new
+        base fail with an ``ApplicationError`` of type
+        ``TruncatedOffset``. All global offsets remain monotonic.
+
+        Args:
+            up_to_offset: The global offset to truncate up to (exclusive).
+                Entries at offsets ``[base_offset, up_to_offset)`` are
+                discarded.
+        """
+        self._check_initialized()
+        log_index = up_to_offset - self._pubsub_base_offset
+        if log_index <= 0:
+            return
+        if log_index > len(self._pubsub_log):
+            raise ValueError(
+                f"Cannot truncate to offset {up_to_offset}: "
+                f"only {self._pubsub_base_offset + len(self._pubsub_log)} "
+                f"items exist"
+            )
+        self._pubsub_log = self._pubsub_log[log_index:]
+        self._pubsub_base_offset = up_to_offset
+
+    def _check_initialized(self) -> None:
+        if not hasattr(self, "_pubsub_log"):
+            raise RuntimeError(
+                "PubSubMixin not initialized. Call self.init_pubsub() in "
+                "your workflow's __init__ or at the start of run()."
+            )
+
+    def publish(self, topic: str, data: bytes) -> None:
+        """Publish an item from within workflow code. Deterministic — just appends."""
+        self._check_initialized()
+        self._pubsub_log.append(PubSubItem(topic=topic, data=data))
+
+    @workflow.signal(name="__pubsub_publish")
+    def _pubsub_publish(self, input: PublishInput) -> None:
+        """Receive publications from external clients (activities, starters).
+
+        Deduplicates using (publisher_id, sequence). If publisher_id is set
+        and the sequence is <= the last seen sequence for that publisher,
+        the entire batch is dropped as a duplicate. Batches are atomic:
+        the dedup decision applies to the whole batch, not individual items.
+ """ + self._check_initialized() + if input.publisher_id: + last_seq = self._pubsub_publisher_sequences.get( + input.publisher_id, 0 + ) + if input.sequence <= last_seq: + return + self._pubsub_publisher_sequences[input.publisher_id] = ( + input.sequence + ) + self._pubsub_publisher_last_seen[input.publisher_id] = ( + workflow.time() + ) + for entry in input.items: + self._pubsub_log.append( + PubSubItem(topic=entry.topic, data=decode_data(entry.data)) + ) + + @workflow.update(name="__pubsub_poll") + async def _pubsub_poll(self, input: PollInput) -> PollResult: + """Long-poll: block until new items available or draining, then return.""" + self._check_initialized() + log_offset = input.from_offset - self._pubsub_base_offset + if log_offset < 0: + if input.from_offset == 0: + # "From the beginning" — start at whatever is available. + log_offset = 0 + else: + # Subscriber had a specific position that's been truncated. + # ApplicationError fails this update (client gets the error) + # without crashing the workflow task — avoids a poison pill + # during replay. + raise ApplicationError( + f"Requested offset {input.from_offset} has been truncated. 
" + f"Current base offset is {self._pubsub_base_offset}.", + type="TruncatedOffset", + non_retryable=True, + ) + await workflow.wait_condition( + lambda: len(self._pubsub_log) > log_offset + or self._pubsub_draining, + ) + all_new = self._pubsub_log[log_offset:] + next_offset = self._pubsub_base_offset + len(self._pubsub_log) + if input.topics: + topic_set = set(input.topics) + filtered = [ + (self._pubsub_base_offset + log_offset + i, item) + for i, item in enumerate(all_new) + if item.topic in topic_set + ] + else: + filtered = [ + (self._pubsub_base_offset + log_offset + i, item) + for i, item in enumerate(all_new) + ] + return PollResult( + items=[ + _WireItem(topic=item.topic, data=encode_data(item.data), offset=off) + for off, item in filtered + ], + next_offset=next_offset, + ) + + @_pubsub_poll.validator + def _validate_pubsub_poll(self, input: PollInput) -> None: # noqa: A002 + """Reject new polls when draining for continue-as-new.""" + self._check_initialized() + if self._pubsub_draining: + raise RuntimeError("Workflow is draining for continue-as-new") + + @workflow.query(name="__pubsub_offset") + def _pubsub_offset(self) -> int: + """Return the current global offset (base_offset + log length).""" + self._check_initialized() + return self._pubsub_base_offset + len(self._pubsub_log) diff --git a/temporalio/contrib/pubsub/_types.py b/temporalio/contrib/pubsub/_types.py new file mode 100644 index 000000000..69cc5f431 --- /dev/null +++ b/temporalio/contrib/pubsub/_types.py @@ -0,0 +1,100 @@ +"""Shared data types for the pub/sub contrib module.""" + +from __future__ import annotations + +import base64 +from dataclasses import dataclass, field + + +def encode_data(data: bytes) -> str: + """Encode bytes to base64 string for wire format.""" + return base64.b64encode(data).decode("ascii") + + +def decode_data(data: str) -> bytes: + """Decode base64 string from wire format to bytes.""" + return base64.b64decode(data) + + +@dataclass +class PubSubItem: + """A single 
item in the pub/sub log. + + The ``offset`` field is populated at poll time from the item's position + in the global log. It defaults to 0 ("unknown") for backward compatibility. + See DESIGN-ADDENDUM-ITEM-OFFSET.md. + """ + + topic: str + data: bytes + offset: int = 0 + + +@dataclass +class PublishEntry: + """A single entry to publish via signal (wire type). + + The ``data`` field is a base64-encoded string for cross-language + compatibility over Temporal's JSON payload converter. + """ + + topic: str + data: str # base64-encoded bytes + + +@dataclass +class PublishInput: + """Signal payload: batch of entries to publish. + + Includes publisher_id and sequence for exactly-once deduplication. + See DESIGN-ADDENDUM-DEDUP.md. + """ + + items: list[PublishEntry] = field(default_factory=list) + publisher_id: str = "" + sequence: int = 0 + + +@dataclass +class PollInput: + """Update payload: request to poll for new items.""" + + topics: list[str] = field(default_factory=list) + from_offset: int = 0 + + +@dataclass +class _WireItem: + """Wire representation of a PubSubItem (base64 data).""" + + topic: str + data: str # base64-encoded bytes + offset: int = 0 + + +@dataclass +class PollResult: + """Update response: items matching the poll request. + + Items use base64-encoded data for cross-language wire compatibility. + """ + + items: list[_WireItem] = field(default_factory=list) + next_offset: int = 0 + + +@dataclass +class PubSubState: + """Serializable snapshot of pub/sub state for continue-as-new. + + The containing workflow input must type the field as + ``PubSubState | None``, not ``Any``, so that the default data converter + can reconstruct the dataclass from JSON. + + The log items use base64-encoded data for serialization stability. 
+ """ + + log: list[_WireItem] = field(default_factory=list) + base_offset: int = 0 + publisher_sequences: dict[str, int] = field(default_factory=dict) + publisher_last_seen: dict[str, float] = field(default_factory=dict) diff --git a/temporalio/contrib/pubsub/docs/end-to-end-dedup-analysis.md b/temporalio/contrib/pubsub/docs/end-to-end-dedup-analysis.md new file mode 100644 index 000000000..a6de76028 --- /dev/null +++ b/temporalio/contrib/pubsub/docs/end-to-end-dedup-analysis.md @@ -0,0 +1,190 @@ +# Analysis: End-to-End Principle Applied to Deduplication + +Should pub/sub dedup live in the workflow (middle layer), or should +consumers handle it at the edges? This analysis applies the end-to-end +argument to the different types of duplicates in the system. + +## The End-to-End Argument + +Saltzer, Reed, and Clark (1984): a function can be correctly and +completely implemented only with the knowledge and help of the +application standing at the endpoints. Putting it in the middle layer +may improve performance but cannot guarantee correctness — the endpoints +must still handle the failure cases themselves. + +Applied here: if the consumer must handle duplicates anyway (because some +duplicates originate above or below the transport layer), then dedup in +the pub/sub workflow is redundant complexity. + +## The Pipeline + +``` +LLM API --> Activity --> PubSubClient --> Workflow Log --> BFF/SSE --> Browser + (1) (2) (3) (4) (5) (6) +``` + +Duplicates can arise at stages 1, 3, and 5. Each has different +characteristics. + +## Types of Duplicates + +### Type A: Duplicate LLM Responses (Stage 1) + +**Cause**: Activity retries. If an activity calling an LLM times out but +the LLM actually completed, the retry produces a second, semantically +equivalent but textually different response. + +**Nature**: The two responses have *different content*. They are not +byte-identical duplicates — they are duplicate *requests* that produce +duplicate *work*. 
+ +**Why this doesn't belong in pub/sub**: Not because pub/sub can't detect +it — in principle, you could fingerprint content or track LLM request +IDs in the workflow. The real reason is that **data escapes to the +application before you know whether dedup will be needed.** The activity +streams the first LLM response through the pub/sub log as tokens arrive. +The subscriber consumes them. The BFF forwards them to the browser. The +user sees them rendered. All of this happens during the first LLM call, +before any retry occurs. + +By the time the activity fails and retries, the first response's tokens +are already consumed, rendered, and acted upon. The duplicate LLM +response hasn't been produced yet — it doesn't exist until the retry +completes. So there is no point during the first call where the pub/sub +layer could suppress it, because at that point there is nothing to +suppress. + +When the retry does produce a second response, the application must +decide what to do: discard it, replace the first, merge them, show both. +That decision depends on application semantics that the pub/sub layer +has no knowledge of. The correct place for this dedup is the activity +(don't retry completed LLM calls), the orchestrating workflow (use +activity idempotency keys), or the application's own recovery logic. + +**End-to-end verdict**: Type A dedup belongs at the application layer, +not because pub/sub lacks the capability, but because the data has +already escaped before the duplicate exists. + +### Type B: Duplicate Signal Batches (Stage 3) + +**Cause**: `PubSubClient._flush()` sends a signal. The server accepts it +but the client sees a network error. The client retries, sending the +same batch again. The workflow receives both signals. + +**Nature**: Byte-identical duplicate batches with the same +`(publisher_id, sequence)`. + +**Why this belongs in pub/sub**: Two reasons. 
+ +First, **encapsulation**: the fact that publishing goes through batched +signals is an implementation detail of the pub/sub transport. The +consumer shouldn't need to know about `(publisher_id, sequence)`, batch +boundaries, or signal retry semantics. Leaking batch-level dedup to the +consumer would couple it to the transport mechanism. If we later switch +to updates, change the batching strategy, or introduce a different +transport, the consumer's dedup logic would break. + +Second, **the consumer cannot do it correctly**. The subscriber sees +`PubSubItem(topic, data)` — items have no unique ID. If the workflow +accepts a duplicate batch, it assigns *new* offsets to the duplicate +items, making them indistinguishable from originals. Content-based dedup +has false positives (an LLM legitimately produces the same token twice; +a status event like `{"type":"THINKING_START"}` is repeated across +turns). The consumer would need to implement a fragile, heuristic dedup +that still misses edge cases. + +The pub/sub layer, by contrast, can detect these duplicates cheaply and +precisely: `sequence <= last_seen` is a single integer comparison per +batch. The sequence number is generated and validated within the same +control boundary (publisher client + workflow handler). This is not a +"middle layer redundantly implementing endpoint functionality" — it is +the only layer with sufficient context to do it correctly. + +**End-to-end verdict**: Type B dedup is properly placed in the workflow. +It preserves transport encapsulation and is the only correct +implementation. + +### Type C: Duplicate SSE Delivery (Stage 5) + +**Cause**: Browser reconnection. The SSE connection drops, the browser +reconnects with `Last-Event-ID`, and the BFF replays from that offset. +If the BFF replays too far back, the browser sees duplicate events. + +**Nature**: Exact replay of previously-delivered events. + +**Where dedup must live**: The **BFF** (stage 5) and/or the **browser** +(stage 6). 
The BFF must track SSE event IDs and resume from the correct +point. The browser/frontend reducer should be idempotent — applying the +same event twice should not corrupt state (e.g., append a text delta +twice). + +**End-to-end verdict**: Pub/sub dedup is irrelevant for Type C. This +duplicate exists below the pub/sub layer, in the SSE transport. + +## Summary Table + +| Type | Cause | Why not in pub/sub? | Where dedup belongs | +|---|---|---|---| +| A: Duplicate LLM work | Activity retry | Data escapes before duplicate exists | Activity / workflow orchestration | +| B: Duplicate batches | Signal retry | *Does* belong in pub/sub | Workflow (pub/sub layer) | +| C: Duplicate SSE events | Browser reconnect | Below the pub/sub layer | BFF / browser | + +## Proper Layering + +Each layer handles the duplicates it introduces: + +``` +┌─────────────────────────────────────────────────────────┐ +│ Application layer (activity / workflow orchestration) │ +│ Handles: Type A — duplicate LLM work │ +│ Mechanism: activity idempotency keys, don't retry │ +│ completed LLM calls, application recovery logic │ +├─────────────────────────────────────────────────────────┤ +│ Transport layer (pub/sub workflow) │ +│ Handles: Type B — duplicate signal batches │ +│ Mechanism: (publisher_id, sequence) dedup │ +│ Encapsulates: batching, signals, retry semantics │ +├─────────────────────────────────────────────────────────┤ +│ Delivery layer (BFF / SSE / browser) │ +│ Handles: Type C — duplicate SSE events │ +│ Mechanism: Last-Event-ID, idempotent reducers │ +└─────────────────────────────────────────────────────────┘ +``` + +Each layer is self-contained. The application doesn't know about signal +batches. The pub/sub layer doesn't know about LLM semantics. The SSE +layer doesn't know about either. Duplicates are resolved at the layer +that introduces them, with the context needed to resolve them correctly. + +## Does the Consumer Need Type B Dedup Anyway? 
+ +The end-to-end argument would apply if consumers needed Type B dedup +regardless of what the workflow does. They don't: + +1. **Consumers cannot detect Type B duplicates.** Items have no unique + ID. Offsets are assigned by the workflow — if it accepts a duplicate + batch, the duplicates get fresh offsets and are indistinguishable. + +2. **Consumers already handle Type C independently.** SSE reconnection + and idempotent reducers are standard patterns that exist regardless + of what the pub/sub layer does. + +3. **Type A is handled above.** The activity/workflow prevents duplicate + work from being published in the first place. + +The consumer does *not* need Type B dedup. The layers are clean. + +## Conclusion + +The `(publisher_id, sequence)` dedup protocol is correctly placed in the +pub/sub workflow. It handles the one type of duplicate that originates +within the transport layer, using context that only the transport layer +has, without leaking transport implementation details to the consumer. + +What the pub/sub layer should *not* attempt: +- Type A dedup (duplicate LLM work) — data has already escaped to the + application before the duplicate exists; resolution requires + application semantics +- Type C dedup (SSE reconnection) — below the pub/sub layer +- General-purpose content dedup — false positive risk, wrong abstraction + level diff --git a/temporalio/contrib/pubsub/docs/signal-vs-update-dedup-analysis.md b/temporalio/contrib/pubsub/docs/signal-vs-update-dedup-analysis.md new file mode 100644 index 000000000..de17e0eb3 --- /dev/null +++ b/temporalio/contrib/pubsub/docs/signal-vs-update-dedup-analysis.md @@ -0,0 +1,198 @@ +# Analysis: Signal vs Update for Publishing — Deduplication Tradeoffs + +Should pub/sub publishing use signals (current) or updates? This analysis +examines what Temporal provides natively for deduplication and whether +application-level dedup can be eliminated. 
+ +## What Temporal Provides + +### Signals + +- **Delivery guarantee**: at-least-once. +- **Request-level dedup**: the gRPC layer attaches a random `request_id` to + each RPC. If the SDK's internal retry resends the *same* RPC (e.g., due to + a transient gRPC error), the server deduplicates it. This is transparent + and not controllable by the application. +- **No application-level dedup key**: there is no way to attach an + idempotency key to a signal. If the client makes a *new* signal call with + the same logical content (a retry after a timeout where the outcome is + unknown), Temporal treats it as a distinct signal and delivers it. +- **Official guidance**: "For Signals, you should use a custom idempotency + key that you send as part of your own signal inputs, implementing the + deduplication in your Workflow code." + ([docs](https://docs.temporal.io/handling-messages#exactly-once-message-processing)) + +### Updates + +- **Delivery guarantee**: exactly-once *per workflow run*, via Update ID. +- **Update ID**: defaults to a random UUID but can be set by the caller. The + server deduplicates accepted updates by Update ID within a single workflow + execution. +- **Cross-CAN boundary**: Update ID dedup state does *not* persist across + continue-as-new. A retry that lands on a new run is treated as a new + update. +- **Known bug (temporal/temporal#6375)**: `CompleteUpdate` is sometimes not + honored when in the same WFT completion as CAN. The frontend retries and + the update can be delivered to the post-CAN run as a distinct update. + This makes cross-CAN dedup unreliable even for updates. +- **Official guidance**: "If you are using Updates with Continue-As-New you + should implement the deduplication in your Workflow code, since Update ID + deduplication by the server is per Workflow run." 
+ +### Summary + +| | Signals (current) | Updates | +|---|---|---| +| Per-run dedup | None (app must provide) | Built-in via Update ID | +| Cross-CAN dedup | None (app must provide) | None (app must provide) | +| App-level dedup needed? | **Yes** | **Yes** (for CAN workflows) | + +Since pub/sub workflows use continue-as-new, **application-level dedup is +required regardless of whether we use signals or updates for publishing.** + +**Pragmatic view**: The cross-CAN update dedup gap (temporal/temporal#6375) +is a known issue that Temporal will likely fix. If we used updates for +publishing and accepted this edge case as a temporary platform limitation, +we could eventually drop application-level dedup entirely once the fix +ships. With signals, application-level dedup is a permanent requirement — +there are no plans to add signal idempotency keys to the platform. + +## Tradeoffs Beyond Dedup + +### Latency and blocking + +| | Signals | Updates | +|---|---|---| +| Client blocks? | No — fire-and-forget | Yes — until workflow processes it | +| Flush latency | ~0 (signal enqueued at server) | Round-trip to worker + processing | +| Caller impact | `publish()` never blocks | Flush blocks for ~10-50ms | + +With signals, the flush is non-blocking. The client can immediately continue +buffering new items. With updates, the flush would block until the workflow +worker processes the batch and returns a result. + +For high-throughput publishing from activities (e.g., streaming LLM tokens), +the non-blocking property matters. The activity can buffer tokens at whatever +rate they arrive without being throttled by the workflow's processing speed. + +### Backpressure + +| | Signals | Updates | +|---|---|---| +| Natural backpressure | No | Yes | +| Overflow risk | Workflow history grows unbounded | Client slows to workflow speed | + +Updates provide natural backpressure: a fast publisher automatically slows +down because each flush blocks. 
With signals, a fast publisher can +overwhelm the workflow's event history (each signal adds events). The +current mitigation is batching (amortizes signal count) and relying on the +workflow to CAN before history gets too large. + +### Batching + +Batching works identically with either approach. The client-side buffer/swap/ +flush logic is unchanged — only the flush transport differs: + +```python +# Signal (current) +await self._handle.signal("__pubsub_publish", PublishInput(...)) + +# Update (alternative) +await self._handle.execute_update("__pubsub_publish", PublishInput(...)) +``` + +My earlier claim that batching would be "awkward" with updates was wrong. + +### Return value + +Updates can return a result. A publish-via-update could return the assigned +offsets, confirmation of delivery, or the current log length. With signals, +the client has no way to learn the outcome without a separate query. + +### Event history cost + +Each signal adds `WorkflowSignalReceived` to history (1 event). Each update +adds `WorkflowExecutionUpdateAccepted` + `WorkflowExecutionUpdateCompleted` +(2 events). Updates consume history faster, bringing CAN sooner. + +### Concurrency limits + +Temporal Cloud has [per-workflow update limits](https://docs.temporal.io/cloud/limits#per-workflow-execution-update-limits). +Signals have no equivalent limit. For very high-throughput scenarios, signals +may be the only option. + +## Recommendation + +**Keep signals for publishing.** The non-blocking property is the decisive +factor for the streaming use case. The application-level dedup +(`publisher_id` + `sequence`) is a permanent requirement for signals and +is already implemented with TLA+ verification. + +**Alternative worth revisiting**: If the non-blocking property were less +important (e.g., lower-throughput use case), updates would be attractive. 
+Once temporalio/temporal#6375 is fixed, update-based publishing with CAN
+would get platform-native exactly-once with no application dedup needed.
+The tradeoff is blocking flush + 2x history events per batch.
+
+For the current streaming use case, signals remain the right choice.
+
+**Keep updates for polling.** The `__pubsub_poll` update is the correct
+choice for subscription: the caller needs a result (the items), and blocking
+is the desired behavior (long-poll semantics).
+
+## What Would Change If We Switched
+
+For completeness, here's what a switch to update-based publishing would
+require:
+
+1. Replace signal handler `__pubsub_publish` with an update handler
+2. The publish handler becomes synchronous (just appends to log) — fast
+3. Client flush changes from `handle.signal(...)` to
+   `handle.execute_update(...)`
+4. Background flusher blocks on the update call instead of fire-and-forget
+5. Application-level dedup stays (CAN requirement)
+6. Update validator could reject publishes during drain (already done for
+   polls)
+7. Return type could include assigned offsets
+
+The dedup protocol, TLA+ specs, and mixin-side handler logic would be
+essentially unchanged. The change is mechanical, not architectural.
+
+## Signal Ordering Guarantee
+
+Temporal guarantees that signals from a single client, sent sequentially
+(each signal call completes before the next is sent), are delivered in order:
+
+> "Signals are delivered in the order they are received by the Cluster and
+> written to History."
+> ([docs](https://docs.temporal.io/workflows#signal))
+
+The guarantee breaks down only for *concurrent* signals — if two signal RPCs
+are in flight simultaneously, their order in history is nondeterministic.
+
+The pub/sub client's `_flush_lock` ensures signals are never sent
+concurrently from a single `PubSubClient` instance. The sequence is:
+
+1. Acquire lock
+2. `await handle.signal(...)` — blocks until server writes to history
+3. 
Release lock
+
+This means batches from a single publisher are ordered in the workflow log.
+Combined with the workflow's single-threaded signal processing (the
+`_pubsub_publish` handler is synchronous — no `await`), items within and
+across batches preserve their publish order.
+
+**Cross-publisher ordering** is nondeterministic. If publisher A and
+publisher B send signals concurrently, the interleaving in history depends
+on arrival order at the server. Within each publisher's stream, ordering is
+preserved. This matches the `OrderPreservedPerPublisher` invariant verified
+in `PubSubDedupTTL.tla`.
+
+## Sources
+
+- [Temporal docs: Message handler patterns — exactly-once processing](https://docs.temporal.io/handling-messages#exactly-once-message-processing)
+- [Temporal docs: Signals vs Updates decision table](https://docs.temporal.io/encyclopedia/workflow-message-passing)
+- [temporalio/temporal#6375: CompleteUpdate not honored during CAN](https://github.com/temporalio/temporal/issues/6375)
+- [Community: Deduping workflow signals](https://community.temporal.io/t/deduping-workflow-signals/5547)
+- [Community: Idempotent signals investigation](https://community.temporal.io/t/preliminary-investigation-into-idempotent-signals/13694)
+- [Slack: request_id is for client call dedup, not application dedup](https://temporalio.slack.com/archives/C012SHMPDDZ/p1729554260821239)
diff --git a/temporalio/contrib/pubsub/verification/PROOF.md b/temporalio/contrib/pubsub/verification/PROOF.md
new file mode 100644
index 000000000..9562822ed
--- /dev/null
+++ b/temporalio/contrib/pubsub/verification/PROOF.md
@@ -0,0 +1,322 @@
+# Proof of Exactly-Once Delivery
+
+Formal verification that the pub/sub dedup protocol guarantees no duplicates
+and no data loss, for any number of published items.
+
+## Protocol
+
+A client flushes batches of items to a workflow via Temporal signals:
+
+1. **Buffer swap**: `pending = buffer; buffer = []`
+2. 
**Assign sequence**: `pending_seq = confirmed_seq + 1` +3. **Send signal** with `(publisher_id, pending_seq, pending)` +4. **On success**: `confirmed_seq = pending_seq; pending = None` +5. **On failure**: keep `pending` and `pending_seq` for retry + +The workflow deduplicates: reject if `sequence <= last_seen_seq[publisher_id]`. + +The network is non-deterministic: a signal may be delivered to the workflow +but the client may see a failure (e.g., network timeout on the response). + +## Properties + +- **NoDuplicates** (safety): each item appears at most once in the workflow log. +- **OrderPreserved** (safety): items appear in the log in the order they were + published. This is stronger than within-batch ordering — it covers + cross-batch ordering too. +- **AllItemsDelivered** (liveness): under fairness, every published item + eventually reaches the log. Note: the TLA+ spec models a protocol without + `max_retry_duration`. The implementation intentionally sacrifices this + liveness property by dropping pending batches after a timeout to bound + resource usage. This is a design choice — when a batch is dropped, items + may be lost if the signal was not delivered. + +## Bounded Model Checking + +`PubSubDedup.tla` models the protocol with TLC model checking: + +| MaxItems | States Generated | Distinct States | Depth | Result | +|----------|-----------------|-----------------|-------|--------| +| 4 | 320 | 175 | 19 | Pass | +| 6 | 1,202 | 609 | 27 | Pass | + +NoDuplicates, OrderPreserved (invariants) and AllItemsDelivered (liveness +under weak fairness) all pass. + +## Inductive Invariant (Unbounded Argument) + +Bounded model checking proves correctness for specific MaxItems values. +To extend to all N, we define a strengthened invariant `IndInv` in +`PubSubDedupInductive.tla` and verify that it holds for all reachable +states under the standard specification. 
+ +Note: TLC checks `IndInv` as a reachable-state invariant of `Spec` +(i.e., `Init => IndInv` and preservation along all reachable behaviors), +not as a true inductive invariant from arbitrary `IndInv` states. +The per-action proof sketch below argues inductiveness informally. +Since the invariant's clauses are structural relationships independent +of N, verification at MaxItems=6 gives high confidence in the general +case. + +### Definition + +`IndInv` has 13 clauses organized into 5 groups: + +**Uniqueness (C1-C3):** Items are unique within each container. +- C1: `Unique(wf_log)` — no duplicates in the log +- C2: `Unique(buffer)` — no duplicates in the buffer +- C3: `Unique(pending)` — no duplicates in the pending batch + +**Disjointness (C4-C5):** Buffer items are always fresh. +- C4: `Disjoint(buffer, pending)` +- C5: `Disjoint(buffer, wf_log)` + +**Dedup relationship (C6-C7):** The critical property linking pending to the log. +- C6: If `pending_seq > wf_last_seq` (not yet delivered), then `Disjoint(pending, wf_log)` +- C7: If `pending_seq <= wf_last_seq` (already delivered), then `IsSubseq(pending, wf_log)` + +**Sequence consistency (C8-C11):** Sequence numbers track delivery correctly. +- C8: `confirmed_seq <= wf_last_seq` +- C9: `pending = <<>> => confirmed_seq = wf_last_seq` +- C10: `pending = <<>> <=> pending_seq = 0` +- C11: `pending /= <<>> => pending_seq = confirmed_seq + 1` + +**Bounds (C12-C13):** All item IDs are in `1..item_counter`. + +### IndInv implies NoDuplicates + +Trivially: NoDuplicates is clause C1. + +### Init implies IndInv + +All containers are empty, all counters are 0. Every clause is vacuously true +or directly satisfied. + +### IndInv is preserved by every action + +**Publish:** Adds `item_counter + 1` to buffer. This ID is fresh — not in +any container (by C12, all existing IDs are in `1..item_counter`). Uniqueness +and disjointness are preserved. `item_counter` increments, so C12 holds for +the new ID. 
+ +**StartFlush (retry):** No changes to buffer, pending, or wf_log. Only +`flushing` and `delivered` change. All structural properties preserved. + +**StartFlush (new):** Requires `pending = <<>>`. By C9, `confirmed_seq = wf_last_seq`. +So `pending_seq' = confirmed_seq + 1 = wf_last_seq + 1 > wf_last_seq`. +Buffer moves to pending: C2 (buffer unique) transfers to C3 (pending unique). +C5 (buffer disjoint from log) transfers to C6 (pending disjoint from log, +since `pending_seq' > wf_last_seq`). New buffer is `<<>>`, satisfying C4-C5 +vacuously. + +**Deliver (accepted, `pending_seq > wf_last_seq`):** Appends pending to wf_log. +By C6, pending is disjoint from wf_log. Combined with C1 (log unique) and +C3 (pending unique), the extended log has no duplicates → C1 preserved. +Sets `wf_last_seq' = pending_seq`, so now `pending_seq <= wf_last_seq'`. +Pending items are in the new log → C7 satisfied. C5 preserved: buffer was +disjoint from both pending and old log, so disjoint from new log. + +**Deliver (rejected, `pending_seq <= wf_last_seq`):** wf_log unchanged. +Sets `delivered = TRUE`. All properties trivially preserved. + +**FlushSuccess:** Requires `delivered = TRUE` (so Deliver has fired). Sets +`confirmed_seq' = pending_seq`, `pending' = <<>>`. By C11, +`pending_seq = confirmed_seq + 1`. The Deliver action that set +`delivered = TRUE` either accepted (setting `wf_last_seq = pending_seq`) +or rejected (leaving `wf_last_seq` unchanged, which means +`pending_seq <= wf_last_seq` was already true — but since +`pending_seq = confirmed_seq + 1` and `confirmed_seq <= wf_last_seq` (C8), +we need `wf_last_seq >= confirmed_seq + 1 = pending_seq`). In both cases, +`wf_last_seq >= pending_seq` after Deliver. FlushSuccess requires +`delivered = TRUE`, meaning Deliver fired. If Deliver accepted, +`wf_last_seq = pending_seq`. If Deliver rejected, `pending_seq <= wf_last_seq` +was already true. 
So `confirmed_seq' = pending_seq <= wf_last_seq`, and +since `confirmed_seq <= wf_last_seq` is C8 (not strict equality), C8 is +preserved. C9 requires `pending = <<>> => confirmed_seq = wf_last_seq`. +After FlushSuccess, `pending' = <<>>` and `confirmed_seq' = pending_seq`. +If Deliver accepted: `wf_last_seq = pending_seq = confirmed_seq'` → C9 holds. +If Deliver rejected: `pending_seq <= wf_last_seq`, so `confirmed_seq' <= wf_last_seq`. +But can `confirmed_seq' < wf_last_seq`? Only if another delivery advanced +`wf_last_seq` past `pending_seq` — but there is only one publisher, so no. +In the single-publisher model, `wf_last_seq` is only set by Deliver for +this publisher's `pending_seq`, so after acceptance `wf_last_seq = pending_seq`. +If rejected, `wf_last_seq` was already `>= pending_seq`, but since only +this publisher writes to `wf_last_seq`, and the last accepted sequence was +`confirmed_seq` (by C9 before StartFlush), and `pending_seq = confirmed_seq + 1`, +we have `wf_last_seq >= confirmed_seq + 1 = pending_seq`. If Deliver rejected, +it means `wf_last_seq >= pending_seq` already, but the only way `wf_last_seq` +could exceed `confirmed_seq` is from a previous delivered-but-not-confirmed +flush — which is exactly `pending_seq`. So `wf_last_seq = pending_seq`, +and C9 holds. Clearing pending makes C3, C4, C6, C7 vacuously true. + +**FlushFail:** Sets `flushing' = FALSE`. No changes to buffer, pending, +wf_log, or sequences. All properties preserved. + +### Why this generalizes beyond MaxItems + +The 13 clauses of IndInv are structural relationships between containers +(uniqueness, disjointness, subset, sequence ordering). None depends on the +value of MaxItems or the total number of items published. The per-action +preservation arguments above use only these structural properties, not any +bound on N. + +TLC verifies IndInv for all 609 reachable states at MaxItems=6. 
The +proof sketch above argues inductiveness informally — since the clauses +are structural relationships independent of N, this gives high +confidence in the general case. + +## Order Preservation + +`OrderPreserved` states that items appear in the log in ascending order of +their IDs. This is verified as an invariant alongside NoDuplicates. + +The property follows from the protocol structure: + +1. `Publish` assigns monotonically increasing IDs (`item_counter + 1`) +2. `StartFlush` moves the entire buffer to pending, preserving order +3. `Deliver` appends the entire pending sequence to the log, preserving order +4. Retries re-send the same pending with the same order; dedup ensures only + one copy appears in the log +5. The flush lock serializes batches, so all items in batch N have lower IDs + than all items in batch N+1 + +For multi-publisher scenarios (`PubSubDedupTTL.tla`), ordering is preserved +**per publisher** but not globally across publishers, since concurrent +publishers interleave non-deterministically. The `OrderPreservedPerPublisher` +invariant verifies this. + +## TTL-Based Pruning of Dedup Entries + +### Problem + +`publisher_sequences` grows with each distinct publisher. During +continue-as-new, stale entries (from publishers that are no longer active) +waste space. TTL-based pruning removes entries that haven't been updated +within a time window. + +### Safety Constraint + +`PubSubDedupTTL.tla` models two publishers with a `Prune` action that +resets a publisher's `wf_last` to 0 (forgetting its dedup history). + +**Unsafe pruning** (prune any publisher at any time) violates NoDuplicates. +TLC finds the counterexample in 9 states: + +``` +1. Publisher A sends batch [1,3] with seq=1 +2. Delivered to workflow (log=[1,3], wf_last[A]=1) +3. Client sees failure, keeps pending for retry +4. Retry starts (same pending, same seq=1) +5. PruneUnsafe: wf_last[A] reset to 0 (TTL expired!) +6. 
Deliver: seq=1 > 0 → accepted → log=[1,3,1,3] — DUPLICATE +``` + +The root cause: the publisher still has an in-flight retry, but the workflow +has forgotten its dedup entry. + +**Safe pruning** (prune only when the publisher has no pending batch and is +not flushing) preserves NoDuplicates. TLC verifies this across 7,635 states +with 2 publishers and MaxItemsPerPub=2. + +### Implementation Constraint + +The TLA+ safety condition `pend[p] = <<>> /\ ~flush_active[p]` translates +to a real-world constraint: **TTL must exceed the maximum time a publisher +might retry a failed flush.** In practice: + +- `PubSubClient` instances are ephemeral (activity-scoped or request-scoped) +- When the activity completes, the client is gone — no more retries +- A 15-minute TTL exceeds any reasonable activity execution time +- During CAN, `get_pubsub_state()` prunes entries older than TTL +- The workflow should wait for activities to complete before triggering CAN + +### Multi-Publisher Protocol + +The base multi-publisher protocol (without pruning) also passes all +properties: NoDuplicates, OrderPreservedPerPublisher, and AllItemsDelivered. +5,143 states explored with 2 publishers and MaxItemsPerPub=2. + +## Scope and Limitations + +The TLA+ specs model the core dedup protocol. The following implementation +paths are not modeled: + +- **`max_retry_duration` timeout**: The implementation drops pending batches + after a timeout. This sacrifices `AllItemsDelivered` (liveness) for bounded + resource usage. `NoDuplicates` (safety) is not affected — dropping a batch + cannot create duplicates. + +- **Late delivery after client failure**: The model only allows `Deliver` + while `flushing = TRUE`. In practice, a signal could be delivered after the + client observes failure and stops flushing. This cannot cause duplicates: + if the signal is delivered between FlushFail and the next retry StartFlush, + `wf_last_seq` advances to `pending_seq`. 
When the retry fires, Deliver + sees `pending_seq <= wf_last_seq` and rejects (dedup). If the signal was + already delivered before FlushFail, the retry is also rejected. + +- **Legacy `publisher_id = ""` (dedup bypass)**: When `publisher_id` is empty, + the workflow skips dedup entirely. This path is not modeled — it's + intentionally at-least-once for backward compatibility. + +- **Workflow-internal `publish()`**: Deterministic, no signal involved, no + dedup needed. Not modeled because there's no concurrency to verify. + +- **TTL pruning is assumption-dependent**: `PruneSafe` in the TLA+ spec + requires `pend[p] = <<>> /\ ~flush_active[p]`. The implementation + approximates this via timestamps (`publisher_ttl > max_retry_duration`). + Safety depends on the user aligning these two settings. + +- **Publisher ID uniqueness**: The TLA+ model uses fixed publisher identities + (`{"A", "B"}`). The implementation uses random 64-bit UUIDs + (`uuid.uuid4().hex[:16]`). If two client instances received the same + publisher ID and the first's dedup entry was pruned, the second could + have its sequence 1 accepted even though the first's sequence 1 was + already delivered. Collision probability is ~2^-64, making this + practically impossible, but the safety argument implicitly relies on + publisher ID uniqueness across the TTL window. + +## Counterexample: Broken Algorithm + +`PubSubDedupBroken.tla` models the old algorithm where on failure the client: +- Restores items to the main buffer +- Advances the sequence number + +TLC finds a NoDuplicates violation in 10 states: + +``` +State 1: Initial (empty) +State 2: Publish item 1 +State 3: StartFlush: in_flight=[1], seq=1, buffer=[] +State 4-6: Publish items 2,3,4 (arrive during flush) +State 7: Deliver: wf_log=[1], wf_last_seq=1 (signal delivered) +State 8: FlushFail: buffer=[1,2,3,4], confirmed_seq=1 (BUG: item 1 restored) +State 9: StartFlush: in_flight=[1,2,3,4], seq=2 +State 10: Deliver: wf_log=[1,1,2,3,4] — DUPLICATE! 
+``` + +The root cause: item 1 was delivered (in the log) but also restored to the +buffer under a new sequence number, bypassing the workflow's dedup check. + +The correct algorithm prevents this by keeping the failed batch **separate** +(`pending`) and retrying with the **same** sequence number. If the signal was +already delivered, the retry is deduplicated (same sequence). If it wasn't, +the retry delivers it. + +## Correspondence to Implementation + +| TLA+ Variable | Python Implementation | +|---|---| +| `buffer` | `PubSubClient._buffer` | +| `pending` | `PubSubClient._pending` | +| `pending_seq` | `PubSubClient._pending_seq` | +| `confirmed_seq` | `PubSubClient._sequence` | +| `wf_last_seq` | `PubSubMixin._pubsub_publisher_sequences[publisher_id]` | + +| TLA+ Action | Python Code | +|---|---| +| `Publish` | `PubSubClient.publish()` appends to `_buffer` | +| `StartFlush` (retry) | `_flush()` detects `_pending is not None` | +| `StartFlush` (new) | `_flush()` swaps: `batch = _buffer; _buffer = []` | +| `Deliver` | Temporal signal delivery + `_pubsub_publish` handler | +| `FlushSuccess` | Signal call returns without exception | +| `FlushFail` | Signal call raises; `_pending` retained for retry | diff --git a/temporalio/contrib/pubsub/verification/PubSubDedup.cfg b/temporalio/contrib/pubsub/verification/PubSubDedup.cfg new file mode 100644 index 000000000..859346ed3 --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedup.cfg @@ -0,0 +1,14 @@ +SPECIFICATION FairSpec + +CONSTANTS + MaxItems = 4 + +INVARIANTS + NoDuplicates + OrderPreserved + +PROPERTIES + AllItemsDelivered + +CHECK_DEADLOCK + FALSE diff --git a/temporalio/contrib/pubsub/verification/PubSubDedup.tla b/temporalio/contrib/pubsub/verification/PubSubDedup.tla new file mode 100644 index 000000000..ba939f4e6 --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedup.tla @@ -0,0 +1,205 @@ +--------------------------- MODULE PubSubDedup ---------------------------- +(* + * Formal 
verification of the pub/sub exactly-once delivery protocol.
+ *
+ * Models a single publisher flushing batches to a workflow via Temporal
+ * signals, with non-deterministic network behavior (signals may be
+ * delivered but the client sees a failure).
+ *
+ * The protocol:
+ * - Client swaps buffer → pending batch, assigns sequence = confirmed + 1
+ * - Client sends signal with (publisher_id, sequence, batch)
+ * - On confirmed success: advance confirmed_seq, clear pending
+ * - On failure: keep pending batch + sequence for retry (DO NOT advance)
+ * - Workflow deduplicates: reject if sequence <= last_seen_seq
+ *
+ * Verified properties:
+ * - NoDuplicates: each item appears at most once in the workflow log
+ * - AllItemsDelivered: every published item eventually reaches the log
+ *   (liveness, under fairness)
+ * - OrderPreserved: items appear in the log in publish order, within
+ *   and across batches
+ *)
+EXTENDS Integers, Sequences, FiniteSets
+
+CONSTANTS
+    MaxItems \* Upper bound on items published (for finite model checking)
+
+VARIABLES
+    (* === Client state === *)
+    buffer, \* Seq of item IDs waiting to be flushed
+    pending, \* Seq of item IDs in the current pending batch (<<>> if none)
+    pending_seq, \* Sequence number assigned to the pending batch
+    confirmed_seq, \* Last sequence number confirmed delivered
+    flushing, \* TRUE when a signal send is in-flight
+
+    (* === Network state === *)
+    delivered, \* TRUE if the current in-flight signal reached the workflow
+
+    (* === Workflow state === *)
+    wf_log, \* Append-only log of item IDs
+    wf_last_seq, \* Highest accepted sequence for this publisher
+
+    (* === Bookkeeping === *)
+    item_counter \* Monotonic counter for generating unique item IDs
+
+vars == <<buffer, pending, pending_seq, confirmed_seq, flushing,
+          delivered, wf_log, wf_last_seq, item_counter>>
+
+------------------------------------------------------------------------
+(* Initial state *)
+
+Init ==
+    /\ buffer = <<>>
+    /\ pending = <<>>
+    /\ pending_seq = 0
+    /\ confirmed_seq = 0
+    /\ flushing = FALSE
+    /\ delivered = FALSE
+    /\ wf_log = <<>>
+    /\ wf_last_seq = 0
+    /\ item_counter = 0
+
+------------------------------------------------------------------------ +(* Client actions *) + +\* Publish a new item into the buffer. +\* Can happen at any time, including while a flush is in-flight. +\* This models the buffer swap: new items go to the fresh buffer, +\* not the pending batch. +Publish == + /\ item_counter < MaxItems + /\ item_counter' = item_counter + 1 + /\ buffer' = Append(buffer, item_counter + 1) + /\ UNCHANGED <> + +\* Start a flush attempt. +\* - If there is a pending batch (from a prior failure), retry it. +\* - Otherwise, swap buffer into pending with a new sequence number. +\* - If nothing to send, this action is not enabled. +StartFlush == + /\ ~flushing + /\ \/ (* Case 1: retry a failed batch *) + /\ pending /= <<>> + /\ flushing' = TRUE + /\ delivered' = FALSE + /\ UNCHANGED <> + \/ (* Case 2: new batch from buffer *) + /\ pending = <<>> + /\ buffer /= <<>> + /\ pending' = buffer + /\ buffer' = <<>> + /\ pending_seq' = confirmed_seq + 1 + /\ flushing' = TRUE + /\ delivered' = FALSE + /\ UNCHANGED <> + +------------------------------------------------------------------------ +(* Network / Workflow actions *) + +\* The signal reaches the workflow. The workflow applies dedup logic: +\* - If pending_seq > wf_last_seq: accept (append items, update last_seq) +\* - Otherwise: reject (duplicate) +\* +\* This may or may not happen before the client observes a result. +\* Non-determinism is captured by allowing Deliver to fire or not. +Deliver == + /\ flushing + /\ ~delivered + /\ IF pending_seq > wf_last_seq + THEN /\ wf_log' = wf_log \o pending + /\ wf_last_seq' = pending_seq + ELSE /\ UNCHANGED <> + /\ delivered' = TRUE + /\ UNCHANGED <> + +------------------------------------------------------------------------ +(* Client observes result *) + +\* Client sees success. This can only happen if the signal was delivered +\* (you cannot get a success response for an undelivered signal). 
+FlushSuccess == + /\ flushing + /\ delivered + /\ flushing' = FALSE + /\ confirmed_seq' = pending_seq + /\ pending' = <<>> + /\ pending_seq' = 0 + /\ UNCHANGED <> + +\* Client sees failure. The signal may or may not have been delivered. +\* Pending batch and sequence are kept for retry. +FlushFail == + /\ flushing + /\ flushing' = FALSE + /\ UNCHANGED <> + +------------------------------------------------------------------------ +(* State machine *) + +Next == + \/ Publish + \/ StartFlush + \/ Deliver + \/ FlushSuccess + \/ FlushFail + +Spec == Init /\ [][Next]_vars + +\* Fairness: under weak fairness, every continuously enabled action +\* eventually executes. This ensures the system makes progress. +Fairness == + /\ WF_vars(StartFlush) + /\ WF_vars(Deliver) + /\ WF_vars(FlushSuccess) + /\ WF_vars(FlushFail) + +FairSpec == Spec /\ Fairness + +------------------------------------------------------------------------ +(* Safety properties *) + +\* Every item ID in wf_log is unique — no duplicates. +NoDuplicates == + \A i, j \in 1..Len(wf_log) : + (i /= j) => (wf_log[i] /= wf_log[j]) + +\* Global ordering: items appear in the log in the order they were +\* published (ascending item IDs). This is stronger than within-batch +\* ordering — it covers cross-batch ordering too. +\* +\* This holds because: +\* 1. Publish appends item_counter+1 (monotonically increasing) +\* 2. StartFlush moves the entire buffer to pending (preserving order) +\* 3. Deliver appends the entire pending sequence (preserving order) +\* 4. Retries re-send the same pending (same order), and dedup +\* means the log only contains one copy +\* 5. The flush lock serializes batches, so batch N's items all +\* have lower IDs than batch N+1's items +OrderPreserved == + \A i, j \in 1..Len(wf_log) : + (i < j) => (wf_log[i] < wf_log[j]) + +------------------------------------------------------------------------ +(* Liveness properties *) + +\* Every published item eventually appears in the workflow log. 
+\* This requires fairness (otherwise the system can stutter forever). +\* +\* Stated as: it is always the case that eventually all published items +\* are in the log (assuming the system keeps running). +AllItemsDelivered == + <>(\A id \in 1..item_counter : + \E i \in 1..Len(wf_log) : wf_log[i] = id) + +\* The system does not deadlock: some action is always enabled. +\* (Not strictly a liveness property but useful to check.) +NoDeadlock == + \/ item_counter < MaxItems \* Can still publish + \/ buffer /= <<>> \* Can flush + \/ pending /= <<>> \* Can retry + \/ flushing \* Waiting for network result + +======================================================================== diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupBroken.cfg b/temporalio/contrib/pubsub/verification/PubSubDedupBroken.cfg new file mode 100644 index 000000000..7a376151d --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupBroken.cfg @@ -0,0 +1,10 @@ +SPECIFICATION FairSpec + +CONSTANTS + MaxItems = 4 + +INVARIANTS + NoDuplicates + +CHECK_DEADLOCK + FALSE diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupBroken.tla b/temporalio/contrib/pubsub/verification/PubSubDedupBroken.tla new file mode 100644 index 000000000..43475b417 --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupBroken.tla @@ -0,0 +1,120 @@ +------------------------ MODULE PubSubDedupBroken ------------------------- +(* + * BROKEN version of the dedup protocol: advances sequence on failure + * and restores items to the main buffer. + * + * This models the OLD algorithm. TLC should find a NoDuplicates or + * data loss violation, confirming the bug that motivated the redesign. 
+ * + * The broken behavior: + * - On failure: restore items to buffer, advance sequence anyway + * - Next flush merges restored + new items under a new sequence + * - If the original signal WAS delivered, the merged batch creates + * duplicates (original items appear twice in the log) + *) +EXTENDS Integers, Sequences, FiniteSets + +CONSTANTS + MaxItems + +VARIABLES + buffer, + confirmed_seq, + flushing, + in_flight_batch, \* The batch currently being sent + in_flight_seq, \* Its sequence number + delivered, + wf_log, + wf_last_seq, + item_counter + +vars == <> + +Init == + /\ buffer = <<>> + /\ confirmed_seq = 0 + /\ flushing = FALSE + /\ in_flight_batch = <<>> + /\ in_flight_seq = 0 + /\ delivered = FALSE + /\ wf_log = <<>> + /\ wf_last_seq = 0 + /\ item_counter = 0 + +Publish == + /\ item_counter < MaxItems + /\ item_counter' = item_counter + 1 + /\ buffer' = Append(buffer, item_counter + 1) + /\ UNCHANGED <> + +\* BROKEN: always takes from buffer (no separate pending/retry) +StartFlush == + /\ ~flushing + /\ buffer /= <<>> + /\ in_flight_seq' = confirmed_seq + 1 + /\ in_flight_batch' = buffer + /\ buffer' = <<>> + /\ flushing' = TRUE + /\ delivered' = FALSE + /\ UNCHANGED <> + +Deliver == + /\ flushing + /\ ~delivered + /\ IF in_flight_seq > wf_last_seq + THEN /\ wf_log' = wf_log \o in_flight_batch + /\ wf_last_seq' = in_flight_seq + ELSE /\ UNCHANGED <> + /\ delivered' = TRUE + /\ UNCHANGED <> + +FlushSuccess == + /\ flushing + /\ delivered + /\ flushing' = FALSE + /\ confirmed_seq' = in_flight_seq + /\ in_flight_batch' = <<>> + /\ in_flight_seq' = 0 + /\ UNCHANGED <> + +\* BROKEN: On failure, restore items to front of buffer AND advance sequence. +\* This is the bug: if the signal was delivered, the next flush will +\* re-send these items under a new sequence, creating duplicates. 
+FlushFail == + /\ flushing + /\ flushing' = FALSE + /\ confirmed_seq' = in_flight_seq \* <-- BUG: advance anyway + /\ buffer' = in_flight_batch \o buffer \* <-- BUG: restore to buffer + /\ in_flight_batch' = <<>> + /\ in_flight_seq' = 0 + /\ UNCHANGED <> + +Next == + \/ Publish + \/ StartFlush + \/ Deliver + \/ FlushSuccess + \/ FlushFail + +Spec == Init /\ [][Next]_vars + +Fairness == + /\ WF_vars(StartFlush) + /\ WF_vars(Deliver) + /\ WF_vars(FlushSuccess) + /\ WF_vars(FlushFail) + +FairSpec == Spec /\ Fairness + +NoDuplicates == + \A i, j \in 1..Len(wf_log) : + (i /= j) => (wf_log[i] /= wf_log[j]) + +AllItemsDelivered == + <>(\A id \in 1..item_counter : + \E i \in 1..Len(wf_log) : wf_log[i] = id) + +======================================================================== diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupBroken_TTrace_1775536423.bin b/temporalio/contrib/pubsub/verification/PubSubDedupBroken_TTrace_1775536423.bin new file mode 100644 index 000000000..0d1676142 Binary files /dev/null and b/temporalio/contrib/pubsub/verification/PubSubDedupBroken_TTrace_1775536423.bin differ diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupBroken_TTrace_1775536423.tla b/temporalio/contrib/pubsub/verification/PubSubDedupBroken_TTrace_1775536423.tla new file mode 100644 index 000000000..e130026cb --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupBroken_TTrace_1775536423.tla @@ -0,0 +1,187 @@ +---- MODULE PubSubDedupBroken_TTrace_1775536423 ---- +EXTENDS Sequences, TLCExt, Toolbox, Naturals, TLC, PubSubDedupBroken + +_expression == + LET PubSubDedupBroken_TEExpression == INSTANCE PubSubDedupBroken_TEExpression + IN PubSubDedupBroken_TEExpression!expression +---- + +_trace == + LET PubSubDedupBroken_TETrace == INSTANCE PubSubDedupBroken_TETrace + IN PubSubDedupBroken_TETrace!trace +---- + +_inv == + ~( + TLCGet("level") = Len(_TETrace) + /\ + item_counter = (4) + /\ + in_flight_batch = (<<1, 2, 3, 4>>) + /\ + wf_last_seq = 
(2) + /\ + delivered = (TRUE) + /\ + flushing = (TRUE) + /\ + buffer = (<<>>) + /\ + in_flight_seq = (2) + /\ + wf_log = (<<1, 1, 2, 3, 4>>) + /\ + confirmed_seq = (1) + ) +---- + +_init == + /\ wf_log = _TETrace[1].wf_log + /\ flushing = _TETrace[1].flushing + /\ in_flight_batch = _TETrace[1].in_flight_batch + /\ in_flight_seq = _TETrace[1].in_flight_seq + /\ buffer = _TETrace[1].buffer + /\ item_counter = _TETrace[1].item_counter + /\ confirmed_seq = _TETrace[1].confirmed_seq + /\ wf_last_seq = _TETrace[1].wf_last_seq + /\ delivered = _TETrace[1].delivered +---- + +_next == + /\ \E i,j \in DOMAIN _TETrace: + /\ \/ /\ j = i + 1 + /\ i = TLCGet("level") + /\ wf_log = _TETrace[i].wf_log + /\ wf_log' = _TETrace[j].wf_log + /\ flushing = _TETrace[i].flushing + /\ flushing' = _TETrace[j].flushing + /\ in_flight_batch = _TETrace[i].in_flight_batch + /\ in_flight_batch' = _TETrace[j].in_flight_batch + /\ in_flight_seq = _TETrace[i].in_flight_seq + /\ in_flight_seq' = _TETrace[j].in_flight_seq + /\ buffer = _TETrace[i].buffer + /\ buffer' = _TETrace[j].buffer + /\ item_counter = _TETrace[i].item_counter + /\ item_counter' = _TETrace[j].item_counter + /\ confirmed_seq = _TETrace[i].confirmed_seq + /\ confirmed_seq' = _TETrace[j].confirmed_seq + /\ wf_last_seq = _TETrace[i].wf_last_seq + /\ wf_last_seq' = _TETrace[j].wf_last_seq + /\ delivered = _TETrace[i].delivered + /\ delivered' = _TETrace[j].delivered + +\* Uncomment the ASSUME below to write the states of the error trace +\* to the given file in Json format. Note that you can pass any tuple +\* to `JsonSerialize`. For example, a sub-sequence of _TETrace. 
+ \* ASSUME + \* LET J == INSTANCE Json + \* IN J!JsonSerialize("PubSubDedupBroken_TTrace_1775536423.json", _TETrace) + +============================================================================= + + Note that you can extract this module `PubSubDedupBroken_TEExpression` + to a dedicated file to reuse `expression` (the module in the + dedicated `PubSubDedupBroken_TEExpression.tla` file takes precedence + over the module `PubSubDedupBroken_TEExpression` below). + +---- MODULE PubSubDedupBroken_TEExpression ---- +EXTENDS Sequences, TLCExt, Toolbox, Naturals, TLC, PubSubDedupBroken + +expression == + [ + \* To hide variables of the `PubSubDedupBroken` spec from the error trace, + \* remove the variables below. The trace will be written in the order + \* of the fields of this record. + wf_log |-> wf_log + ,flushing |-> flushing + ,in_flight_batch |-> in_flight_batch + ,in_flight_seq |-> in_flight_seq + ,buffer |-> buffer + ,item_counter |-> item_counter + ,confirmed_seq |-> confirmed_seq + ,wf_last_seq |-> wf_last_seq + ,delivered |-> delivered + + \* Put additional constant-, state-, and action-level expressions here: + \* ,_stateNumber |-> _TEPosition + \* ,_wf_logUnchanged |-> wf_log = wf_log' + + \* Format the `wf_log` variable as Json value. + \* ,_wf_logJson |-> + \* LET J == INSTANCE Json + \* IN J!ToJson(wf_log) + + \* Lastly, you may build expressions over arbitrary sets of states by + \* leveraging the _TETrace operator. For example, this is how to + \* count the number of times a spec variable changed up to the current + \* state in the trace. + \* ,_wf_logModCount |-> + \* LET F[s \in DOMAIN _TETrace] == + \* IF s = 1 THEN 0 + \* ELSE IF _TETrace[s].wf_log # _TETrace[s-1].wf_log + \* THEN 1 + F[s-1] ELSE F[s-1] + \* IN F[_TEPosition - 1] + ] + +============================================================================= + + + +Parsing and semantic processing can take forever if the trace below is long. 
+ In this case, it is advised to uncomment the module below to deserialize the + trace from a generated binary file. + +\* +\*---- MODULE PubSubDedupBroken_TETrace ---- +\*EXTENDS IOUtils, TLC, PubSubDedupBroken +\* +\*trace == IODeserialize("PubSubDedupBroken_TTrace_1775536423.bin", TRUE) +\* +\*============================================================================= +\* + +---- MODULE PubSubDedupBroken_TETrace ---- +EXTENDS TLC, PubSubDedupBroken + +trace == + << + ([item_counter |-> 0,in_flight_batch |-> <<>>,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> FALSE,buffer |-> <<>>,in_flight_seq |-> 0,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 1,in_flight_batch |-> <<>>,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> FALSE,buffer |-> <<1>>,in_flight_seq |-> 0,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 1,in_flight_batch |-> <<1>>,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> TRUE,buffer |-> <<>>,in_flight_seq |-> 1,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 2,in_flight_batch |-> <<1>>,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> TRUE,buffer |-> <<2>>,in_flight_seq |-> 1,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 3,in_flight_batch |-> <<1>>,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> TRUE,buffer |-> <<2, 3>>,in_flight_seq |-> 1,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 4,in_flight_batch |-> <<1>>,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> TRUE,buffer |-> <<2, 3, 4>>,in_flight_seq |-> 1,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 4,in_flight_batch |-> <<1>>,wf_last_seq |-> 1,delivered |-> TRUE,flushing |-> TRUE,buffer |-> <<2, 3, 4>>,in_flight_seq |-> 1,wf_log |-> <<1>>,confirmed_seq |-> 0]), + ([item_counter |-> 4,in_flight_batch |-> <<>>,wf_last_seq |-> 1,delivered |-> TRUE,flushing |-> FALSE,buffer |-> <<1, 2, 3, 4>>,in_flight_seq |-> 0,wf_log |-> <<1>>,confirmed_seq |-> 1]), + ([item_counter |-> 4,in_flight_batch |-> <<1, 2, 3, 
4>>,wf_last_seq |-> 1,delivered |-> FALSE,flushing |-> TRUE,buffer |-> <<>>,in_flight_seq |-> 2,wf_log |-> <<1>>,confirmed_seq |-> 1]), + ([item_counter |-> 4,in_flight_batch |-> <<1, 2, 3, 4>>,wf_last_seq |-> 2,delivered |-> TRUE,flushing |-> TRUE,buffer |-> <<>>,in_flight_seq |-> 2,wf_log |-> <<1, 1, 2, 3, 4>>,confirmed_seq |-> 1]) + >> +---- + + +============================================================================= + +---- CONFIG PubSubDedupBroken_TTrace_1775536423 ---- +CONSTANTS + MaxItems = 4 + +INVARIANT + _inv + +CHECK_DEADLOCK + \* CHECK_DEADLOCK off because of PROPERTY or INVARIANT above. + FALSE + +INIT + _init + +NEXT + _next + +CONSTANT + _TETrace <- _trace + +ALIAS + _expression +============================================================================= +\* Generated on Mon Apr 06 21:33:43 PDT 2026 \ No newline at end of file diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupInductive.cfg b/temporalio/contrib/pubsub/verification/PubSubDedupInductive.cfg new file mode 100644 index 000000000..789d9e80d --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupInductive.cfg @@ -0,0 +1,25 @@ +\* Verify IndInv holds for all reachable states of the standard spec. +\* +\* This checks: +\* 1. Init => IndInv +\* 2. IndInv is preserved along all reachable behaviors +\* +\* This is reachable-state invariant checking, not full inductiveness +\* checking (which would require IndSpec with all IndInv states as +\* initial states — not feasible with TLC for sequence-valued state). +\* The per-action proof sketch in the .tla file argues inductiveness +\* informally. Since the invariant's clauses are structural relationships +\* between containers — not functions of MaxItems — verification at +\* small N gives high confidence in the general case. 
+ +SPECIFICATION Spec + +CONSTANTS + MaxItems = 6 + +INVARIANTS + IndInv + OrderPreserved + +CHECK_DEADLOCK + FALSE diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupInductive.tla b/temporalio/contrib/pubsub/verification/PubSubDedupInductive.tla new file mode 100644 index 000000000..ddf5787c6 --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupInductive.tla @@ -0,0 +1,244 @@ +---------------------- MODULE PubSubDedupInductive ------------------------- +(* + * Inductive invariant for the pub/sub dedup protocol. + * + * A strengthened invariant that implies NoDuplicates. If IndInv is + * preserved by every action (i.e., it is inductive), then NoDuplicates + * holds for ALL reachable states regardless of MaxItems. + * + * TLC checks IndInv as a reachable-state invariant of the standard + * Spec (Init /\ [][Next]_vars). This verifies Init => IndInv and + * preservation along all reachable behaviors, but does not check + * inductiveness from arbitrary IndInv states (which would require + * enumerating all sequence-valued states satisfying IndInv — not + * feasible with TLC). The per-action proof sketch below argues + * inductiveness informally. + * + * Proof sketch for each action preserving IndInv: + * + * Publish: Adds item_counter+1 (fresh, not in any container). + * All uniqueness/disjointness clauses preserved since the new + * item is unique. item_counter increments, keeping Bounded. + * + * StartFlush (retry): pending/buffer/wf_log unchanged. + * Only flushing and delivered change. All structural properties + * preserved trivially. + * + * StartFlush (new): Moves buffer -> pending, buffer becomes <<>>. + * pending_seq = confirmed_seq + 1. By SeqConsistency, + * pending = <<>> before this step implies confirmed_seq = wf_last_seq, + * so pending_seq = wf_last_seq + 1 > wf_last_seq. Since buffer was + * Disjoint from wf_log (by BufferDisjointLog), pending is now + * Disjoint from wf_log. Buffer uniqueness transfers to pending. 
+ *
+ * Deliver (accepted, pending_seq > wf_last_seq): Appends pending
+ * to wf_log. By PendingLogRelation, pending is Disjoint from
+ * wf_log. Combined with NoDuplicates and PendingUnique, the
+ * extended log has no duplicates. Sets wf_last_seq = pending_seq,
+ * so now pending_seq <= wf_last_seq, and SubsetWhenDelivered
+ * is satisfied (pending items are in the new wf_log).
+ *
+ * Deliver (rejected, pending_seq <= wf_last_seq): wf_log unchanged.
+ * All properties trivially preserved.
+ *
+ * FlushSuccess: Sets pending = <<>>, confirmed_seq = pending_seq.
+ * Since Deliver already set wf_last_seq = pending_seq, we get
+ * confirmed_seq = wf_last_seq, satisfying SeqConsistency.
+ * Clearing pending satisfies all pending-related clauses vacuously.
+ *
+ * FlushFail: Only sets flushing = FALSE. All structural state
+ * (buffer, pending, wf_log, sequences) unchanged.
+ *)
+EXTENDS Integers, Sequences, FiniteSets
+
+CONSTANTS
+    MaxItems
+
+VARIABLES
+    buffer, pending, pending_seq, confirmed_seq, flushing,
+    delivered, wf_log, wf_last_seq, item_counter
+
+vars == <<buffer, pending, pending_seq, confirmed_seq, flushing,
+          delivered, wf_log, wf_last_seq, item_counter>>
+
+------------------------------------------------------------------------
+(* Import the protocol definition *)
+
+Init ==
+    /\ buffer = <<>>
+    /\ pending = <<>>
+    /\ pending_seq = 0
+    /\ confirmed_seq = 0
+    /\ flushing = FALSE
+    /\ delivered = FALSE
+    /\ wf_log = <<>>
+    /\ wf_last_seq = 0
+    /\ item_counter = 0
+
+Publish ==
+    /\ item_counter < MaxItems
+    /\ item_counter' = item_counter + 1
+    /\ buffer' = Append(buffer, item_counter + 1)
+    /\ UNCHANGED <<pending, pending_seq, confirmed_seq, flushing,
+                   delivered, wf_log, wf_last_seq>>
+
+StartFlush ==
+    /\ ~flushing
+    /\ \/ /\ pending /= <<>>
+          /\ flushing' = TRUE
+          /\ delivered' = FALSE
+          /\ UNCHANGED <<buffer, pending, pending_seq, confirmed_seq,
+                         wf_log, wf_last_seq, item_counter>>
+       \/ /\ pending = <<>>
+          /\ buffer /= <<>>
+          /\ pending' = buffer
+          /\ buffer' = <<>>
+          /\ pending_seq' = confirmed_seq + 1
+          /\ flushing' = TRUE
+          /\ delivered' = FALSE
+          /\ UNCHANGED <<confirmed_seq, wf_log, wf_last_seq, item_counter>>
+
+Deliver ==
+    /\ flushing
+    /\ ~delivered
+    /\ IF pending_seq > wf_last_seq
+       THEN /\ wf_log' = wf_log \o pending
+            /\ wf_last_seq' = 
pending_seq
+       ELSE /\ UNCHANGED <<wf_log, wf_last_seq>>
+    /\ delivered' = TRUE
+    /\ UNCHANGED <<buffer, pending, pending_seq, confirmed_seq,
+                   flushing, item_counter>>
+
+FlushSuccess ==
+    /\ flushing
+    /\ delivered
+    /\ flushing' = FALSE
+    /\ confirmed_seq' = pending_seq
+    /\ pending' = <<>>
+    /\ pending_seq' = 0
+    /\ UNCHANGED <<buffer, delivered, wf_log, wf_last_seq, item_counter>>
+
+FlushFail ==
+    /\ flushing
+    /\ flushing' = FALSE
+    /\ UNCHANGED <<buffer, pending, pending_seq, confirmed_seq,
+                   delivered, wf_log, wf_last_seq, item_counter>>
+
+Next ==
+    \/ Publish
+    \/ StartFlush
+    \/ Deliver
+    \/ FlushSuccess
+    \/ FlushFail
+
+------------------------------------------------------------------------
+(* Helper operators *)
+
+\* Set of elements in a sequence
+SeqToSet(s) == {s[i] : i \in 1..Len(s)}
+
+\* All elements of a sequence are distinct
+Unique(s) ==
+    \A i, j \in 1..Len(s) : (i /= j) => (s[i] /= s[j])
+
+\* Two sequences share no elements
+Disjoint(s1, s2) ==
+    SeqToSet(s1) \cap SeqToSet(s2) = {}
+
+\* All elements of s1 appear in s2
+IsSubseq(s1, s2) ==
+    SeqToSet(s1) \subseteq SeqToSet(s2)
+
+------------------------------------------------------------------------
+(* The inductive invariant *)
+
+IndInv ==
+    (* --- Uniqueness within each container --- *)
+    \* C1: No duplicates in the workflow log
+    /\ Unique(wf_log)
+    \* C2: No duplicates in the buffer
+    /\ Unique(buffer)
+    \* C3: No duplicates in the pending batch
+    /\ Unique(pending)
+
+    (* --- Disjointness between containers --- *)
+    \* C4: Buffer items are not in the pending batch
+    /\ Disjoint(buffer, pending)
+    \* C5: Buffer items are not in the log
+    /\ Disjoint(buffer, wf_log)
+
+    (* --- Pending-log relationship (key dedup property) --- *)
+    \* C6: If pending hasn't been delivered yet, its items are not in the log
+    /\ (pending /= <<>> /\ pending_seq > wf_last_seq)
+        => Disjoint(pending, wf_log)
+    \* C7: If pending WAS already delivered, its items are in the log
+    \* (so a re-delivery would be a no-op)
+    /\ (pending /= <<>> /\ pending_seq <= wf_last_seq)
+        => IsSubseq(pending, wf_log)
+
+    (* --- Sequence consistency --- *)
+    \* C8: confirmed_seq never exceeds wf_last_seq
+    /\ confirmed_seq <= wf_last_seq
+    \* C9: When no 
pending batch, confirmed and wf sequences are in sync. + \* This ensures StartFlush (new) always produces pending_seq > wf_last_seq. + /\ (pending = <<>>) => (confirmed_seq = wf_last_seq) + \* C10: pending_seq is 0 iff pending is empty + /\ (pending = <<>>) <=> (pending_seq = 0) + \* C11: pending_seq is bounded by confirmed_seq + 1 + /\ (pending /= <<>>) => (pending_seq = confirmed_seq + 1) + + (* --- Item ID bounds --- *) + \* C12: All item IDs are in 1..item_counter + /\ \A i \in 1..Len(wf_log) : wf_log[i] \in 1..item_counter + /\ \A i \in 1..Len(buffer) : buffer[i] \in 1..item_counter + /\ \A i \in 1..Len(pending) : pending[i] \in 1..item_counter + + (* --- Non-negative sequences --- *) + /\ confirmed_seq >= 0 + /\ wf_last_seq >= 0 + /\ item_counter >= 0 + +------------------------------------------------------------------------ +(* Safety properties implied by IndInv *) + +NoDuplicates == Unique(wf_log) +THEOREM IndInv => NoDuplicates \* Trivially: NoDuplicates is conjunct C1 + +\* Global ordering: items appear in ascending order of their IDs. +\* This follows from C12 (bounded IDs), C1 (unique), and the fact that +\* Publish assigns monotonically increasing IDs, StartFlush preserves +\* buffer order, and Deliver appends in order. +OrderPreserved == + \A i, j \in 1..Len(wf_log) : + (i < j) => (wf_log[i] < wf_log[j]) + +------------------------------------------------------------------------ +(* Specification for checking inductiveness: + * Initial states = ALL states satisfying IndInv (within type bounds). + * If IndInv is an invariant of this spec, then IndInv is inductive. 
*) + +\* Type constraint to bound the state space for TLC +TypeOK == + /\ item_counter \in 0..MaxItems + /\ confirmed_seq \in 0..MaxItems + /\ wf_last_seq \in 0..MaxItems + /\ pending_seq \in 0..MaxItems + /\ flushing \in BOOLEAN + /\ delivered \in BOOLEAN + /\ Len(buffer) <= MaxItems + /\ Len(pending) <= MaxItems + /\ Len(wf_log) <= MaxItems \* Conservative bound for TLC state enumeration + /\ \A i \in 1..Len(buffer) : buffer[i] \in 1..MaxItems + /\ \A i \in 1..Len(pending) : pending[i] \in 1..MaxItems + /\ \A i \in 1..Len(wf_log) : wf_log[i] \in 1..MaxItems + +\* For inductiveness checking: all IndInv states as initial states +IndInit == TypeOK /\ IndInv + +\* The inductiveness-checking specification +IndSpec == IndInit /\ [][Next]_vars + +\* The standard specification (for reference) +Spec == Init /\ [][Next]_vars + +======================================================================== diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupTTL.tla b/temporalio/contrib/pubsub/verification/PubSubDedupTTL.tla new file mode 100644 index 000000000..d105cc391 --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupTTL.tla @@ -0,0 +1,203 @@ +--------------------------- MODULE PubSubDedupTTL -------------------------- +(* + * Verification of TTL-based pruning of publisher dedup entries. + * + * When a workflow continues-as-new, it can prune stale publisher_sequences + * entries to bound memory. This spec verifies: + * + * 1. UNSAFE pruning (prune any publisher at any time) allows duplicates. + * TLC finds the counterexample. + * + * 2. SAFE pruning (prune only publishers with no pending batch) preserves + * NoDuplicates. This models the real constraint: TTL must exceed the + * maximum time a publisher might retry a failed flush. + * + * The spec models two publishers (A and B) sharing a single workflow log. + * Each publisher has independent buffer/pending/sequence state. The workflow + * tracks per-publisher last_seq in a function. 
+ *
+ * The pruning action models what happens during continue-as-new when a
+ * publisher's TTL has expired: the workflow "forgets" that publisher's
+ * last_seq, resetting it to 0.
+ *)
+EXTENDS Integers, Sequences, FiniteSets
+
+CONSTANTS
+    MaxItemsPerPub   \* Max items each publisher can create
+
+Publishers == {"A", "B"}
+
+VARIABLES
+    (* === Per-publisher client state === *)
+    buf,             \* buf[p]: buffer for publisher p
+    pend,            \* pend[p]: pending batch for publisher p
+    pend_seq,        \* pend_seq[p]: sequence of pending batch
+    conf_seq,        \* conf_seq[p]: last confirmed sequence
+    flush_active,    \* flush_active[p]: TRUE when flush in-flight
+    delivered_flag,  \* delivered_flag[p]: TRUE if current signal delivered
+
+    (* === Workflow state === *)
+    wf_log,          \* Shared append-only log
+    wf_last,         \* wf_last[p]: last accepted seq for publisher p
+
+    (* === Bookkeeping === *)
+    ctr              \* ctr[p]: item counter per publisher
+
+vars == <<buf, pend, pend_seq, conf_seq, flush_active, delivered_flag,
+          wf_log, wf_last, ctr>>
+
+------------------------------------------------------------------------
+(* Initial state *)
+
+Init ==
+    /\ buf = [p \in Publishers |-> <<>>]
+    /\ pend = [p \in Publishers |-> <<>>]
+    /\ pend_seq = [p \in Publishers |-> 0]
+    /\ conf_seq = [p \in Publishers |-> 0]
+    /\ flush_active = [p \in Publishers |-> FALSE]
+    /\ delivered_flag = [p \in Publishers |-> FALSE]
+    /\ wf_log = <<>>
+    /\ wf_last = [p \in Publishers |-> 0]
+    /\ ctr = [p \in Publishers |-> 0]
+
+------------------------------------------------------------------------
+(* Per-publisher actions, parameterized by publisher p *)
+
+\* Unique item IDs: publisher A gets odd numbers, B gets even numbers.
+\* This ensures global uniqueness without a shared counter.
+ItemId(p, n) ==
+    IF p = "A" THEN 2 * n - 1 ELSE 2 * n
+
+Publish(p) ==
+    /\ ctr[p] < MaxItemsPerPub
+    /\ ctr' = [ctr EXCEPT ![p] = @ + 1]
+    /\ buf' = [buf EXCEPT ![p] = Append(@, ItemId(p, ctr[p] + 1))]
+    /\ UNCHANGED <<pend, pend_seq, conf_seq, flush_active, delivered_flag,
+                   wf_log, wf_last>>
+
+StartFlush(p) ==
+    /\ ~flush_active[p]
+    /\ \/ (* Retry *)
+          /\ pend[p] /= <<>>
+          /\ flush_active' = [flush_active EXCEPT ![p] = TRUE]
+          /\ delivered_flag' = [delivered_flag EXCEPT ![p] = FALSE]
+          /\ UNCHANGED <<buf, pend, pend_seq, conf_seq, wf_log, wf_last, ctr>>
+       \/ (* New batch *)
+          /\ pend[p] = <<>>
+          /\ buf[p] /= <<>>
+          /\ pend' = [pend EXCEPT ![p] = buf[p]]
+          /\ buf' = [buf EXCEPT ![p] = <<>>]
+          /\ pend_seq' = [pend_seq EXCEPT ![p] = conf_seq[p] + 1]
+          /\ flush_active' = [flush_active EXCEPT ![p] = TRUE]
+          /\ delivered_flag' = [delivered_flag EXCEPT ![p] = FALSE]
+          /\ UNCHANGED <<conf_seq, wf_log, wf_last, ctr>>
+
+Deliver(p) ==
+    /\ flush_active[p]
+    /\ ~delivered_flag[p]
+    /\ IF pend_seq[p] > wf_last[p]
+       THEN /\ wf_log' = wf_log \o pend[p]
+            /\ wf_last' = [wf_last EXCEPT ![p] = pend_seq[p]]
+       ELSE /\ UNCHANGED <<wf_log, wf_last>>
+    /\ delivered_flag' = [delivered_flag EXCEPT ![p] = TRUE]
+    /\ UNCHANGED <<buf, pend, pend_seq, conf_seq, flush_active, ctr>>
+
+FlushSuccess(p) ==
+    /\ flush_active[p]
+    /\ delivered_flag[p]
+    /\ flush_active' = [flush_active EXCEPT ![p] = FALSE]
+    /\ conf_seq' = [conf_seq EXCEPT ![p] = pend_seq[p]]
+    /\ pend' = [pend EXCEPT ![p] = <<>>]
+    /\ pend_seq' = [pend_seq EXCEPT ![p] = 0]
+    /\ UNCHANGED <<buf, delivered_flag, wf_log, wf_last, ctr>>
+
+FlushFail(p) ==
+    /\ flush_active[p]
+    /\ flush_active' = [flush_active EXCEPT ![p] = FALSE]
+    /\ UNCHANGED <<buf, pend, pend_seq, conf_seq, delivered_flag,
+                   wf_log, wf_last, ctr>>
+
+------------------------------------------------------------------------
+(* TTL Pruning actions *)
+
+\* UNSAFE: Prune any publisher's dedup entry at any time.
+\* This models setting TTL too short — the publisher might still retry.
+PruneUnsafe(p) ==
+    /\ wf_last[p] > 0   \* Has a dedup entry to prune
+    /\ wf_last' = [wf_last EXCEPT ![p] = 0]
+    /\ UNCHANGED <<buf, pend, pend_seq, conf_seq, flush_active,
+                   delivered_flag, wf_log, ctr>>
+
+\* SAFE: Prune only when the publisher has no pending batch.
+\* This models the correct TTL constraint: the publisher has finished
+\* all retries before the entry is pruned. 
In practice, this means
+\* TTL > max activity/client lifetime.
+PruneSafe(p) ==
+    /\ wf_last[p] > 0      \* Has a dedup entry to prune
+    /\ pend[p] = <<>>      \* Publisher has no in-flight batch
+    /\ ~flush_active[p]    \* Not currently flushing
+    /\ wf_last' = [wf_last EXCEPT ![p] = 0]
+    /\ UNCHANGED <<buf, pend, pend_seq, conf_seq, flush_active,
+                   delivered_flag, wf_log, ctr>>
+
+------------------------------------------------------------------------
+(* Specifications *)
+
+\* Base actions (no pruning) — for verifying the multi-publisher protocol
+BaseNext ==
+    \E p \in Publishers :
+        \/ Publish(p)
+        \/ StartFlush(p)
+        \/ Deliver(p)
+        \/ FlushSuccess(p)
+        \/ FlushFail(p)
+
+\* With unsafe pruning — should FAIL NoDuplicates
+UnsafeNext ==
+    \/ BaseNext
+    \/ \E p \in Publishers : PruneUnsafe(p)
+
+\* With safe pruning — should PASS NoDuplicates
+SafeNext ==
+    \/ BaseNext
+    \/ \E p \in Publishers : PruneSafe(p)
+
+BaseSpec == Init /\ [][BaseNext]_vars
+UnsafeSpec == Init /\ [][UnsafeNext]_vars
+SafeSpec == Init /\ [][SafeNext]_vars
+
+\* Fairness for liveness checking
+BaseFairness ==
+    \A p \in Publishers :
+        /\ WF_vars(StartFlush(p))
+        /\ WF_vars(Deliver(p))
+        /\ WF_vars(FlushSuccess(p))
+        /\ WF_vars(FlushFail(p))
+
+BaseFairSpec == BaseSpec /\ BaseFairness
+SafeFairSpec == SafeSpec /\ BaseFairness
+
+------------------------------------------------------------------------
+(* Properties *)
+
+NoDuplicates ==
+    \A i, j \in 1..Len(wf_log) :
+        (i /= j) => (wf_log[i] /= wf_log[j])
+
+OrderPreservedPerPublisher ==
+    \* Within each publisher's items, order is preserved.
+    \* (Global order across publishers is non-deterministic.)
+ \A p \in Publishers : + \A i, j \in 1..Len(wf_log) : + /\ wf_log[i] \in {ItemId(p, n) : n \in 1..MaxItemsPerPub} + /\ wf_log[j] \in {ItemId(p, n) : n \in 1..MaxItemsPerPub} + /\ i < j + => wf_log[i] < wf_log[j] + +\* All published items eventually appear in the log (under fairness) +AllItemsDelivered == + <>(\A p \in Publishers : + \A n \in 1..ctr[p] : + \E i \in 1..Len(wf_log) : wf_log[i] = ItemId(p, n)) + +======================================================================== diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupTTL_Base.cfg b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_Base.cfg new file mode 100644 index 000000000..55b378e2e --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_Base.cfg @@ -0,0 +1,17 @@ +\* Multi-publisher protocol without pruning. +\* Verifies NoDuplicates and OrderPreservedPerPublisher. + +SPECIFICATION BaseFairSpec + +CONSTANTS + MaxItemsPerPub = 2 + +INVARIANTS + NoDuplicates + OrderPreservedPerPublisher + +PROPERTIES + AllItemsDelivered + +CHECK_DEADLOCK + FALSE diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupTTL_Safe.cfg b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_Safe.cfg new file mode 100644 index 000000000..04dd20c9c --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_Safe.cfg @@ -0,0 +1,17 @@ +\* Safe pruning: prune only when publisher has no pending batch and is not flushing. +\* Should PASS NoDuplicates — confirms the TTL safety constraint. 
+ +SPECIFICATION SafeFairSpec + +CONSTANTS + MaxItemsPerPub = 2 + +INVARIANTS + NoDuplicates + OrderPreservedPerPublisher + +PROPERTIES + AllItemsDelivered + +CHECK_DEADLOCK + FALSE diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupTTL_TTrace_1775536996.bin b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_TTrace_1775536996.bin new file mode 100644 index 000000000..4f2c39ea0 Binary files /dev/null and b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_TTrace_1775536996.bin differ diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupTTL_TTrace_1775536996.tla b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_TTrace_1775536996.tla new file mode 100644 index 000000000..ee25c0a00 --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_TTrace_1775536996.tla @@ -0,0 +1,186 @@ +---- MODULE PubSubDedupTTL_TTrace_1775536996 ---- +EXTENDS Sequences, TLCExt, Toolbox, Naturals, TLC, PubSubDedupTTL + +_expression == + LET PubSubDedupTTL_TEExpression == INSTANCE PubSubDedupTTL_TEExpression + IN PubSubDedupTTL_TEExpression!expression +---- + +_trace == + LET PubSubDedupTTL_TETrace == INSTANCE PubSubDedupTTL_TETrace + IN PubSubDedupTTL_TETrace!trace +---- + +_inv == + ~( + TLCGet("level") = Len(_TETrace) + /\ + ctr = ([A |-> 2, B |-> 0]) + /\ + buf = ([A |-> <<>>, B |-> <<>>]) + /\ + conf_seq = ([A |-> 0, B |-> 0]) + /\ + pend_seq = ([A |-> 1, B |-> 0]) + /\ + wf_last = ([A |-> 1, B |-> 0]) + /\ + flush_active = ([A |-> TRUE, B |-> FALSE]) + /\ + wf_log = (<<1, 3, 1, 3>>) + /\ + delivered_flag = ([A |-> TRUE, B |-> FALSE]) + /\ + pend = ([A |-> <<1, 3>>, B |-> <<>>]) + ) +---- + +_init == + /\ delivered_flag = _TETrace[1].delivered_flag + /\ flush_active = _TETrace[1].flush_active + /\ wf_log = _TETrace[1].wf_log + /\ ctr = _TETrace[1].ctr + /\ pend_seq = _TETrace[1].pend_seq + /\ buf = _TETrace[1].buf + /\ pend = _TETrace[1].pend + /\ wf_last = _TETrace[1].wf_last + /\ conf_seq = _TETrace[1].conf_seq +---- + +_next == + /\ 
\E i,j \in DOMAIN _TETrace: + /\ \/ /\ j = i + 1 + /\ i = TLCGet("level") + /\ delivered_flag = _TETrace[i].delivered_flag + /\ delivered_flag' = _TETrace[j].delivered_flag + /\ flush_active = _TETrace[i].flush_active + /\ flush_active' = _TETrace[j].flush_active + /\ wf_log = _TETrace[i].wf_log + /\ wf_log' = _TETrace[j].wf_log + /\ ctr = _TETrace[i].ctr + /\ ctr' = _TETrace[j].ctr + /\ pend_seq = _TETrace[i].pend_seq + /\ pend_seq' = _TETrace[j].pend_seq + /\ buf = _TETrace[i].buf + /\ buf' = _TETrace[j].buf + /\ pend = _TETrace[i].pend + /\ pend' = _TETrace[j].pend + /\ wf_last = _TETrace[i].wf_last + /\ wf_last' = _TETrace[j].wf_last + /\ conf_seq = _TETrace[i].conf_seq + /\ conf_seq' = _TETrace[j].conf_seq + +\* Uncomment the ASSUME below to write the states of the error trace +\* to the given file in Json format. Note that you can pass any tuple +\* to `JsonSerialize`. For example, a sub-sequence of _TETrace. + \* ASSUME + \* LET J == INSTANCE Json + \* IN J!JsonSerialize("PubSubDedupTTL_TTrace_1775536996.json", _TETrace) + +============================================================================= + + Note that you can extract this module `PubSubDedupTTL_TEExpression` + to a dedicated file to reuse `expression` (the module in the + dedicated `PubSubDedupTTL_TEExpression.tla` file takes precedence + over the module `PubSubDedupTTL_TEExpression` below). + +---- MODULE PubSubDedupTTL_TEExpression ---- +EXTENDS Sequences, TLCExt, Toolbox, Naturals, TLC, PubSubDedupTTL + +expression == + [ + \* To hide variables of the `PubSubDedupTTL` spec from the error trace, + \* remove the variables below. The trace will be written in the order + \* of the fields of this record. 
+ delivered_flag |-> delivered_flag + ,flush_active |-> flush_active + ,wf_log |-> wf_log + ,ctr |-> ctr + ,pend_seq |-> pend_seq + ,buf |-> buf + ,pend |-> pend + ,wf_last |-> wf_last + ,conf_seq |-> conf_seq + + \* Put additional constant-, state-, and action-level expressions here: + \* ,_stateNumber |-> _TEPosition + \* ,_delivered_flagUnchanged |-> delivered_flag = delivered_flag' + + \* Format the `delivered_flag` variable as Json value. + \* ,_delivered_flagJson |-> + \* LET J == INSTANCE Json + \* IN J!ToJson(delivered_flag) + + \* Lastly, you may build expressions over arbitrary sets of states by + \* leveraging the _TETrace operator. For example, this is how to + \* count the number of times a spec variable changed up to the current + \* state in the trace. + \* ,_delivered_flagModCount |-> + \* LET F[s \in DOMAIN _TETrace] == + \* IF s = 1 THEN 0 + \* ELSE IF _TETrace[s].delivered_flag # _TETrace[s-1].delivered_flag + \* THEN 1 + F[s-1] ELSE F[s-1] + \* IN F[_TEPosition - 1] + ] + +============================================================================= + + + +Parsing and semantic processing can take forever if the trace below is long. + In this case, it is advised to uncomment the module below to deserialize the + trace from a generated binary file. 
+ +\* +\*---- MODULE PubSubDedupTTL_TETrace ---- +\*EXTENDS IOUtils, TLC, PubSubDedupTTL +\* +\*trace == IODeserialize("PubSubDedupTTL_TTrace_1775536996.bin", TRUE) +\* +\*============================================================================= +\* + +---- MODULE PubSubDedupTTL_TETrace ---- +EXTENDS TLC, PubSubDedupTTL + +trace == + << + ([ctr |-> [A |-> 0, B |-> 0],buf |-> [A |-> <<>>, B |-> <<>>],conf_seq |-> [A |-> 0, B |-> 0],pend_seq |-> [A |-> 0, B |-> 0],wf_last |-> [A |-> 0, B |-> 0],flush_active |-> [A |-> FALSE, B |-> FALSE],wf_log |-> <<>>,delivered_flag |-> [A |-> FALSE, B |-> FALSE],pend |-> [A |-> <<>>, B |-> <<>>]]), + ([ctr |-> [A |-> 1, B |-> 0],buf |-> [A |-> <<1>>, B |-> <<>>],conf_seq |-> [A |-> 0, B |-> 0],pend_seq |-> [A |-> 0, B |-> 0],wf_last |-> [A |-> 0, B |-> 0],flush_active |-> [A |-> FALSE, B |-> FALSE],wf_log |-> <<>>,delivered_flag |-> [A |-> FALSE, B |-> FALSE],pend |-> [A |-> <<>>, B |-> <<>>]]), + ([ctr |-> [A |-> 2, B |-> 0],buf |-> [A |-> <<1, 3>>, B |-> <<>>],conf_seq |-> [A |-> 0, B |-> 0],pend_seq |-> [A |-> 0, B |-> 0],wf_last |-> [A |-> 0, B |-> 0],flush_active |-> [A |-> FALSE, B |-> FALSE],wf_log |-> <<>>,delivered_flag |-> [A |-> FALSE, B |-> FALSE],pend |-> [A |-> <<>>, B |-> <<>>]]), + ([ctr |-> [A |-> 2, B |-> 0],buf |-> [A |-> <<>>, B |-> <<>>],conf_seq |-> [A |-> 0, B |-> 0],pend_seq |-> [A |-> 1, B |-> 0],wf_last |-> [A |-> 0, B |-> 0],flush_active |-> [A |-> TRUE, B |-> FALSE],wf_log |-> <<>>,delivered_flag |-> [A |-> FALSE, B |-> FALSE],pend |-> [A |-> <<1, 3>>, B |-> <<>>]]), + ([ctr |-> [A |-> 2, B |-> 0],buf |-> [A |-> <<>>, B |-> <<>>],conf_seq |-> [A |-> 0, B |-> 0],pend_seq |-> [A |-> 1, B |-> 0],wf_last |-> [A |-> 1, B |-> 0],flush_active |-> [A |-> TRUE, B |-> FALSE],wf_log |-> <<1, 3>>,delivered_flag |-> [A |-> TRUE, B |-> FALSE],pend |-> [A |-> <<1, 3>>, B |-> <<>>]]), + ([ctr |-> [A |-> 2, B |-> 0],buf |-> [A |-> <<>>, B |-> <<>>],conf_seq |-> [A |-> 0, B |-> 0],pend_seq |-> [A |-> 1, B |-> 
0],wf_last |-> [A |-> 1, B |-> 0],flush_active |-> [A |-> FALSE, B |-> FALSE],wf_log |-> <<1, 3>>,delivered_flag |-> [A |-> TRUE, B |-> FALSE],pend |-> [A |-> <<1, 3>>, B |-> <<>>]]), + ([ctr |-> [A |-> 2, B |-> 0],buf |-> [A |-> <<>>, B |-> <<>>],conf_seq |-> [A |-> 0, B |-> 0],pend_seq |-> [A |-> 1, B |-> 0],wf_last |-> [A |-> 1, B |-> 0],flush_active |-> [A |-> TRUE, B |-> FALSE],wf_log |-> <<1, 3>>,delivered_flag |-> [A |-> FALSE, B |-> FALSE],pend |-> [A |-> <<1, 3>>, B |-> <<>>]]), + ([ctr |-> [A |-> 2, B |-> 0],buf |-> [A |-> <<>>, B |-> <<>>],conf_seq |-> [A |-> 0, B |-> 0],pend_seq |-> [A |-> 1, B |-> 0],wf_last |-> [A |-> 0, B |-> 0],flush_active |-> [A |-> TRUE, B |-> FALSE],wf_log |-> <<1, 3>>,delivered_flag |-> [A |-> FALSE, B |-> FALSE],pend |-> [A |-> <<1, 3>>, B |-> <<>>]]), + ([ctr |-> [A |-> 2, B |-> 0],buf |-> [A |-> <<>>, B |-> <<>>],conf_seq |-> [A |-> 0, B |-> 0],pend_seq |-> [A |-> 1, B |-> 0],wf_last |-> [A |-> 1, B |-> 0],flush_active |-> [A |-> TRUE, B |-> FALSE],wf_log |-> <<1, 3, 1, 3>>,delivered_flag |-> [A |-> TRUE, B |-> FALSE],pend |-> [A |-> <<1, 3>>, B |-> <<>>]]) + >> +---- + + +============================================================================= + +---- CONFIG PubSubDedupTTL_TTrace_1775536996 ---- +CONSTANTS + MaxItemsPerPub = 2 + +INVARIANT + _inv + +CHECK_DEADLOCK + \* CHECK_DEADLOCK off because of PROPERTY or INVARIANT above. 
+ FALSE + +INIT + _init + +NEXT + _next + +CONSTANT + _TETrace <- _trace + +ALIAS + _expression +============================================================================= +\* Generated on Mon Apr 06 21:43:16 PDT 2026 \ No newline at end of file diff --git a/temporalio/contrib/pubsub/verification/PubSubDedupTTL_Unsafe.cfg b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_Unsafe.cfg new file mode 100644 index 000000000..4420da7ef --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedupTTL_Unsafe.cfg @@ -0,0 +1,13 @@ +\* Unsafe pruning: prune any publisher's dedup entry at any time. +\* Should FAIL NoDuplicates — confirms that unbounded pruning is dangerous. + +SPECIFICATION UnsafeSpec + +CONSTANTS + MaxItemsPerPub = 2 + +INVARIANTS + NoDuplicates + +CHECK_DEADLOCK + FALSE diff --git a/temporalio/contrib/pubsub/verification/PubSubDedup_TTrace_1775536362.bin b/temporalio/contrib/pubsub/verification/PubSubDedup_TTrace_1775536362.bin new file mode 100644 index 000000000..e7461f615 Binary files /dev/null and b/temporalio/contrib/pubsub/verification/PubSubDedup_TTrace_1775536362.bin differ diff --git a/temporalio/contrib/pubsub/verification/PubSubDedup_TTrace_1775536362.tla b/temporalio/contrib/pubsub/verification/PubSubDedup_TTrace_1775536362.tla new file mode 100644 index 000000000..8fd999a5b --- /dev/null +++ b/temporalio/contrib/pubsub/verification/PubSubDedup_TTrace_1775536362.tla @@ -0,0 +1,185 @@ +---- MODULE PubSubDedup_TTrace_1775536362 ---- +EXTENDS Sequences, TLCExt, PubSubDedup, Toolbox, Naturals, TLC + +_expression == + LET PubSubDedup_TEExpression == INSTANCE PubSubDedup_TEExpression + IN PubSubDedup_TEExpression!expression +---- + +_trace == + LET PubSubDedup_TETrace == INSTANCE PubSubDedup_TETrace + IN PubSubDedup_TETrace!trace +---- + +_inv == + ~( + TLCGet("level") = Len(_TETrace) + /\ + item_counter = (4) + /\ + pending = (<<>>) + /\ + pending_seq = (0) + /\ + wf_last_seq = (1) + /\ + delivered = (TRUE) + /\ + flushing = (FALSE) + 
/\ + buffer = (<<>>) + /\ + wf_log = (<<1, 2, 3, 4>>) + /\ + confirmed_seq = (1) + ) +---- + +_init == + /\ pending = _TETrace[1].pending + /\ wf_log = _TETrace[1].wf_log + /\ flushing = _TETrace[1].flushing + /\ pending_seq = _TETrace[1].pending_seq + /\ buffer = _TETrace[1].buffer + /\ item_counter = _TETrace[1].item_counter + /\ confirmed_seq = _TETrace[1].confirmed_seq + /\ wf_last_seq = _TETrace[1].wf_last_seq + /\ delivered = _TETrace[1].delivered +---- + +_next == + /\ \E i,j \in DOMAIN _TETrace: + /\ \/ /\ j = i + 1 + /\ i = TLCGet("level") + /\ pending = _TETrace[i].pending + /\ pending' = _TETrace[j].pending + /\ wf_log = _TETrace[i].wf_log + /\ wf_log' = _TETrace[j].wf_log + /\ flushing = _TETrace[i].flushing + /\ flushing' = _TETrace[j].flushing + /\ pending_seq = _TETrace[i].pending_seq + /\ pending_seq' = _TETrace[j].pending_seq + /\ buffer = _TETrace[i].buffer + /\ buffer' = _TETrace[j].buffer + /\ item_counter = _TETrace[i].item_counter + /\ item_counter' = _TETrace[j].item_counter + /\ confirmed_seq = _TETrace[i].confirmed_seq + /\ confirmed_seq' = _TETrace[j].confirmed_seq + /\ wf_last_seq = _TETrace[i].wf_last_seq + /\ wf_last_seq' = _TETrace[j].wf_last_seq + /\ delivered = _TETrace[i].delivered + /\ delivered' = _TETrace[j].delivered + +\* Uncomment the ASSUME below to write the states of the error trace +\* to the given file in Json format. Note that you can pass any tuple +\* to `JsonSerialize`. For example, a sub-sequence of _TETrace. + \* ASSUME + \* LET J == INSTANCE Json + \* IN J!JsonSerialize("PubSubDedup_TTrace_1775536362.json", _TETrace) + +============================================================================= + + Note that you can extract this module `PubSubDedup_TEExpression` + to a dedicated file to reuse `expression` (the module in the + dedicated `PubSubDedup_TEExpression.tla` file takes precedence + over the module `PubSubDedup_TEExpression` below). 
+ +---- MODULE PubSubDedup_TEExpression ---- +EXTENDS Sequences, TLCExt, PubSubDedup, Toolbox, Naturals, TLC + +expression == + [ + \* To hide variables of the `PubSubDedup` spec from the error trace, + \* remove the variables below. The trace will be written in the order + \* of the fields of this record. + pending |-> pending + ,wf_log |-> wf_log + ,flushing |-> flushing + ,pending_seq |-> pending_seq + ,buffer |-> buffer + ,item_counter |-> item_counter + ,confirmed_seq |-> confirmed_seq + ,wf_last_seq |-> wf_last_seq + ,delivered |-> delivered + + \* Put additional constant-, state-, and action-level expressions here: + \* ,_stateNumber |-> _TEPosition + \* ,_pendingUnchanged |-> pending = pending' + + \* Format the `pending` variable as Json value. + \* ,_pendingJson |-> + \* LET J == INSTANCE Json + \* IN J!ToJson(pending) + + \* Lastly, you may build expressions over arbitrary sets of states by + \* leveraging the _TETrace operator. For example, this is how to + \* count the number of times a spec variable changed up to the current + \* state in the trace. + \* ,_pendingModCount |-> + \* LET F[s \in DOMAIN _TETrace] == + \* IF s = 1 THEN 0 + \* ELSE IF _TETrace[s].pending # _TETrace[s-1].pending + \* THEN 1 + F[s-1] ELSE F[s-1] + \* IN F[_TEPosition - 1] + ] + +============================================================================= + + + +Parsing and semantic processing can take forever if the trace below is long. + In this case, it is advised to uncomment the module below to deserialize the + trace from a generated binary file. 
+ +\* +\*---- MODULE PubSubDedup_TETrace ---- +\*EXTENDS IOUtils, PubSubDedup, TLC +\* +\*trace == IODeserialize("PubSubDedup_TTrace_1775536362.bin", TRUE) +\* +\*============================================================================= +\* + +---- MODULE PubSubDedup_TETrace ---- +EXTENDS PubSubDedup, TLC + +trace == + << + ([item_counter |-> 0,pending |-> <<>>,pending_seq |-> 0,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> FALSE,buffer |-> <<>>,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 1,pending |-> <<>>,pending_seq |-> 0,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> FALSE,buffer |-> <<1>>,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 2,pending |-> <<>>,pending_seq |-> 0,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> FALSE,buffer |-> <<1, 2>>,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 3,pending |-> <<>>,pending_seq |-> 0,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> FALSE,buffer |-> <<1, 2, 3>>,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 4,pending |-> <<>>,pending_seq |-> 0,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> FALSE,buffer |-> <<1, 2, 3, 4>>,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 4,pending |-> <<1, 2, 3, 4>>,pending_seq |-> 1,wf_last_seq |-> 0,delivered |-> FALSE,flushing |-> TRUE,buffer |-> <<>>,wf_log |-> <<>>,confirmed_seq |-> 0]), + ([item_counter |-> 4,pending |-> <<1, 2, 3, 4>>,pending_seq |-> 1,wf_last_seq |-> 1,delivered |-> TRUE,flushing |-> TRUE,buffer |-> <<>>,wf_log |-> <<1, 2, 3, 4>>,confirmed_seq |-> 0]), + ([item_counter |-> 4,pending |-> <<>>,pending_seq |-> 0,wf_last_seq |-> 1,delivered |-> TRUE,flushing |-> FALSE,buffer |-> <<>>,wf_log |-> <<1, 2, 3, 4>>,confirmed_seq |-> 1]) + >> +---- + + +============================================================================= + +---- CONFIG PubSubDedup_TTrace_1775536362 ---- +CONSTANTS + MaxItems = 4 + +INVARIANT + _inv + +CHECK_DEADLOCK + \* CHECK_DEADLOCK off because of PROPERTY or 
INVARIANT above. + FALSE + +INIT + _init + +NEXT + _next + +CONSTANT + _TETrace <- _trace + +ALIAS + _expression +============================================================================= +\* Generated on Mon Apr 06 21:32:43 PDT 2026 \ No newline at end of file diff --git a/temporalio/contrib/pubsub/verification/README.md b/temporalio/contrib/pubsub/verification/README.md new file mode 100644 index 000000000..0a6a3d50c --- /dev/null +++ b/temporalio/contrib/pubsub/verification/README.md @@ -0,0 +1,52 @@ +# Pub/Sub Dedup Verification + +TLA+ specifications for the exactly-once delivery protocol. +See [PROOF.md](./PROOF.md) for the full correctness argument. + +## Files + +| File | Purpose | +|---|---| +| `PubSubDedup.tla` | Correct algorithm — bounded model checking (safety + liveness) | +| `PubSubDedupInductive.tla` | Strengthened invariant — reachable-state verification + informal induction argument | +| `PubSubDedupTTL.tla` | Multi-publisher + TTL pruning (safe vs unsafe) | +| `PubSubDedupBroken.tla` | Old (broken) algorithm — TLC finds the duplicate bug | +| `PROOF.md` | Full proof: invariant, order preservation, TTL safety, counterexamples | + +## Verified Properties + +| Property | Type | Spec | +|---|---|---| +| NoDuplicates | safety | all specs | +| OrderPreserved | safety | single-publisher | +| OrderPreservedPerPublisher | safety | multi-publisher | +| AllItemsDelivered | liveness | all specs (under fairness) | +| TTL safe pruning | safety | PubSubDedupTTL | + +## Running + +```bash +curl -sL -o /tmp/tla2tools.jar \ + https://github.com/tlaplus/tlaplus/releases/download/v1.8.0/tla2tools.jar + +# Single-publisher bounded model checking +java -cp /tmp/tla2tools.jar tlc2.TLC PubSubDedup -workers auto + +# Inductive invariant (unbounded) +java -cp /tmp/tla2tools.jar tlc2.TLC PubSubDedupInductive -workers auto + +# Multi-publisher base protocol +java -cp /tmp/tla2tools.jar tlc2.TLC PubSubDedupTTL \ + -config PubSubDedupTTL_Base.cfg -workers auto + +# TTL 
unsafe pruning (should FAIL) +java -cp /tmp/tla2tools.jar tlc2.TLC PubSubDedupTTL \ + -config PubSubDedupTTL_Unsafe.cfg -workers auto + +# TTL safe pruning (should PASS) +java -cp /tmp/tla2tools.jar tlc2.TLC PubSubDedupTTL \ + -config PubSubDedupTTL_Safe.cfg -workers auto + +# Broken algorithm (should FAIL) +java -cp /tmp/tla2tools.jar tlc2.TLC PubSubDedupBroken -workers auto +``` diff --git a/tests/contrib/google_adk_agents/test_adk_streaming.py b/tests/contrib/google_adk_agents/test_adk_streaming.py new file mode 100644 index 000000000..a6c964544 --- /dev/null +++ b/tests/contrib/google_adk_agents/test_adk_streaming.py @@ -0,0 +1,198 @@ +"""Integration tests for ADK streaming support. + +Verifies that the streaming model activity publishes TEXT_DELTA events via +PubSubMixin and that non-streaming mode remains backward-compatible. +""" + +import asyncio +import json +import logging +import uuid +from collections.abc import AsyncGenerator +from datetime import timedelta + +import pytest +from google.adk import Agent +from google.adk.models import BaseLlm, LLMRegistry +from google.adk.models.llm_request import LlmRequest +from google.adk.models.llm_response import LlmResponse +from google.adk.runners import InMemoryRunner +from google.genai.types import Content, Part + +from temporalio import workflow +from temporalio.client import Client +from temporalio.contrib.google_adk_agents import GoogleAdkPlugin, TemporalModel +from temporalio.contrib.pubsub import PubSubClient, PubSubMixin +from temporalio.worker import Worker + +logger = logging.getLogger(__name__) + + +class StreamingTestModel(BaseLlm): + """Test model that yields multiple partial responses to simulate streaming.""" + + @classmethod + def supported_models(cls) -> list[str]: + return ["streaming_test_model"] + + async def generate_content_async( + self, llm_request: LlmRequest, stream: bool = False + ) -> AsyncGenerator[LlmResponse, None]: + yield LlmResponse( + content=Content(role="model", 
parts=[Part(text="Hello ")]) + ) + yield LlmResponse( + content=Content(role="model", parts=[Part(text="world!")]) + ) + + +@workflow.defn +class StreamingAdkWorkflow(PubSubMixin): + """Test workflow that uses streaming TemporalModel with PubSubMixin.""" + + @workflow.init + def __init__(self, prompt: str) -> None: + self.init_pubsub() + + @workflow.run + async def run(self, prompt: str) -> str: + model = TemporalModel("streaming_test_model", streaming=True) + agent = Agent( + name="test_agent", + model=model, + instruction="You are a test agent.", + ) + + runner = InMemoryRunner(agent=agent, app_name="test-app") + session = await runner.session_service.create_session( + app_name="test-app", user_id="test" + ) + + final_text = "" + async for event in runner.run_async( + user_id="test", + session_id=session.id, + new_message=Content(role="user", parts=[Part(text=prompt)]), + ): + if event.content and event.content.parts: + for part in event.content.parts: + if part.text: + final_text = part.text + + return final_text + + +@workflow.defn +class NonStreamingAdkWorkflow: + """Test workflow without streaming -- verifies backward compatibility.""" + + @workflow.run + async def run(self, prompt: str) -> str: + model = TemporalModel("streaming_test_model", streaming=False) + agent = Agent( + name="test_agent", + model=model, + instruction="You are a test agent.", + ) + + runner = InMemoryRunner(agent=agent, app_name="test-app") + session = await runner.session_service.create_session( + app_name="test-app", user_id="test" + ) + + final_text = "" + async for event in runner.run_async( + user_id="test", + session_id=session.id, + new_message=Content(role="user", parts=[Part(text=prompt)]), + ): + if event.content and event.content.parts: + for part in event.content.parts: + if part.text: + final_text = part.text + + return final_text + + +@pytest.mark.asyncio +async def test_streaming_publishes_events(client: Client): + """Verify that streaming activity publishes TEXT_DELTA 
events via pubsub.""" + LLMRegistry.register(StreamingTestModel) + + new_config = client.config() + new_config["plugins"] = [GoogleAdkPlugin()] + client = Client(**new_config) + + workflow_id = f"adk-streaming-test-{uuid.uuid4()}" + + async with Worker( + client, + task_queue="adk-streaming-test", + workflows=[StreamingAdkWorkflow], + max_cached_workflows=0, + ): + handle = await client.start_workflow( + StreamingAdkWorkflow.run, + "Hello", + id=workflow_id, + task_queue="adk-streaming-test", + execution_timeout=timedelta(seconds=30), + ) + + # Subscribe concurrently while the workflow is running + pubsub = PubSubClient.create(client, workflow_id) + events: list[dict] = [] + + async def collect_events() -> None: + async for item in pubsub.subscribe( + ["events"], from_offset=0, poll_cooldown=0.05 + ): + event = json.loads(item.data) + events.append(event) + if event["type"] == "LLM_CALL_COMPLETE": + break + + collect_task = asyncio.create_task(collect_events()) + result = await handle.result() + + # Wait for event collection with a timeout + await asyncio.wait_for(collect_task, timeout=10.0) + + assert result is not None + + event_types = [e["type"] for e in events] + assert "LLM_CALL_START" in event_types, f"Expected LLM_CALL_START, got: {event_types}" + assert "TEXT_DELTA" in event_types, f"Expected TEXT_DELTA, got: {event_types}" + assert "LLM_CALL_COMPLETE" in event_types, ( + f"Expected LLM_CALL_COMPLETE, got: {event_types}" + ) + + text_deltas = [e["data"]["delta"] for e in events if e["type"] == "TEXT_DELTA"] + assert len(text_deltas) >= 1, f"Expected at least 1 TEXT_DELTA, got: {text_deltas}" + + +@pytest.mark.asyncio +async def test_non_streaming_backward_compatible(client: Client): + """Verify non-streaming mode still works (backward compatibility).""" + LLMRegistry.register(StreamingTestModel) + + new_config = client.config() + new_config["plugins"] = [GoogleAdkPlugin()] + client = Client(**new_config) + + async with Worker( + client, + 
task_queue="adk-non-streaming-test", + workflows=[NonStreamingAdkWorkflow], + max_cached_workflows=0, + ): + handle = await client.start_workflow( + NonStreamingAdkWorkflow.run, + "Hello", + id=f"adk-non-streaming-test-{uuid.uuid4()}", + task_queue="adk-non-streaming-test", + execution_timeout=timedelta(seconds=30), + ) + result = await handle.result() + + assert result is not None diff --git a/tests/contrib/openai_agents/test_openai_streaming.py b/tests/contrib/openai_agents/test_openai_streaming.py new file mode 100644 index 000000000..ca90eb3f3 --- /dev/null +++ b/tests/contrib/openai_agents/test_openai_streaming.py @@ -0,0 +1,287 @@ +"""Integration tests for OpenAI Agents streaming support. + +Verifies that the streaming model activity publishes TEXT_DELTA events via +PubSubMixin and that the workflow returns the correct final result. +""" + +import asyncio +import json +import logging +import uuid +from collections.abc import AsyncIterator +from datetime import timedelta +from typing import Any + +import pytest +from agents import ( + Agent, + AgentOutputSchemaBase, + Handoff, + Model, + ModelResponse, + ModelSettings, + ModelTracing, + Runner, + Tool, + TResponseInputItem, + Usage, +) +from agents.items import TResponseStreamEvent +from openai.types.responses import ( + Response, + ResponseCompletedEvent, + ResponseOutputMessage, + ResponseOutputText, + ResponseTextDeltaEvent, +) + +from temporalio import workflow +from temporalio.client import Client +from temporalio.contrib.openai_agents import ModelActivityParameters +from temporalio.contrib.openai_agents.testing import AgentEnvironment +from temporalio.contrib.pubsub import PubSubClient, PubSubMixin +from tests.helpers import new_worker + +logger = logging.getLogger(__name__) + + +class StreamingTestModel(Model): + """Test model that yields text deltas followed by a ResponseCompletedEvent.""" + + __test__ = False + + async def get_response( + self, + system_instructions: str | None, + input: str | 
list[TResponseInputItem], + model_settings: ModelSettings, + tools: list[Tool], + output_schema: AgentOutputSchemaBase | None, + handoffs: list[Handoff], + tracing: ModelTracing, + **kwargs: Any, + ) -> ModelResponse: + return ModelResponse( + output=[ + ResponseOutputMessage( + id="msg_test", + content=[ + ResponseOutputText( + text="Hello world!", + annotations=[], + type="output_text", + logprobs=[], + ) + ], + role="assistant", + status="completed", + type="message", + ) + ], + usage=Usage(), + response_id=None, + ) + + async def stream_response( + self, + system_instructions: str | None, + input: str | list[TResponseInputItem], + model_settings: ModelSettings, + tools: list[Tool], + output_schema: AgentOutputSchemaBase | None, + handoffs: list[Handoff], + tracing: ModelTracing, + **kwargs: Any, + ) -> AsyncIterator[TResponseStreamEvent]: + # Yield text deltas + yield ResponseTextDeltaEvent( + content_index=0, + delta="Hello ", + item_id="item1", + output_index=0, + sequence_number=0, + type="response.output_text.delta", + logprobs=[], + ) + yield ResponseTextDeltaEvent( + content_index=0, + delta="world!", + item_id="item1", + output_index=0, + sequence_number=1, + type="response.output_text.delta", + logprobs=[], + ) + + # Yield the final completed event + response = Response( + id="resp_test", + created_at=0, + error=None, + incomplete_details=None, + instructions=None, + metadata={}, + model="test", + object="response", + output=[ + ResponseOutputMessage( + id="msg_test", + content=[ + ResponseOutputText( + text="Hello world!", + annotations=[], + type="output_text", + logprobs=[], + ) + ], + role="assistant", + status="completed", + type="message", + ) + ], + parallel_tool_calls=True, + temperature=1.0, + tool_choice="auto", + tools=[], + top_p=1.0, + status="completed", + text={"format": {"type": "text"}}, + truncation="disabled", + usage={ + "input_tokens": 10, + "output_tokens": 5, + "total_tokens": 15, + "input_tokens_details": {"cached_tokens": 0}, + 
"output_tokens_details": {"reasoning_tokens": 0}, + }, + ) + yield ResponseCompletedEvent( + response=response, sequence_number=2, type="response.completed" + ) + + +@workflow.defn +class StreamingOpenAIWorkflow(PubSubMixin): + """Test workflow that uses streaming model activity with PubSubMixin.""" + + @workflow.init + def __init__(self, prompt: str) -> None: + self.init_pubsub() + + @workflow.run + async def run(self, prompt: str) -> str: + agent = Agent[None]( + name="Assistant", + instructions="You are a test agent.", + ) + result = await Runner.run(starting_agent=agent, input=prompt) + return result.final_output + + +@workflow.defn +class NonStreamingOpenAIWorkflow: + """Test workflow without streaming -- verifies backward compatibility.""" + + @workflow.run + async def run(self, prompt: str) -> str: + agent = Agent[None]( + name="Assistant", + instructions="You are a test agent.", + ) + result = await Runner.run(starting_agent=agent, input=prompt) + return result.final_output + + +@pytest.mark.asyncio +async def test_streaming_publishes_events(client: Client): + """Verify that streaming activity publishes TEXT_DELTA events via pubsub.""" + model = StreamingTestModel() + async with AgentEnvironment( + model=model, + model_params=ModelActivityParameters( + start_to_close_timeout=timedelta(seconds=30), + enable_streaming=True, + ), + ) as env: + client = env.applied_on_client(client) + + workflow_id = f"openai-streaming-test-{uuid.uuid4()}" + + async with new_worker( + client, + StreamingOpenAIWorkflow, + max_cached_workflows=0, + ) as worker: + handle = await client.start_workflow( + StreamingOpenAIWorkflow.run, + "Hello", + id=workflow_id, + task_queue=worker.task_queue, + execution_timeout=timedelta(seconds=30), + ) + + # Subscribe concurrently while the workflow is running + pubsub = PubSubClient.create(client, workflow_id) + events: list[dict] = [] + + async def collect_events() -> None: + async for item in pubsub.subscribe( + ["events"], from_offset=0, 
poll_cooldown=0.05 + ): + event = json.loads(item.data) + events.append(event) + if event["type"] == "LLM_CALL_COMPLETE": + break + + collect_task = asyncio.create_task(collect_events()) + result = await handle.result() + + # Wait for event collection with a timeout + await asyncio.wait_for(collect_task, timeout=10.0) + + assert result is not None + + event_types = [e["type"] for e in events] + assert "LLM_CALL_START" in event_types, ( + f"Expected LLM_CALL_START, got: {event_types}" + ) + assert "TEXT_DELTA" in event_types, ( + f"Expected TEXT_DELTA, got: {event_types}" + ) + assert "LLM_CALL_COMPLETE" in event_types, ( + f"Expected LLM_CALL_COMPLETE, got: {event_types}" + ) + + text_deltas = [e["data"]["delta"] for e in events if e["type"] == "TEXT_DELTA"] + assert len(text_deltas) >= 1, f"Expected at least 1 TEXT_DELTA, got: {text_deltas}" + assert "Hello " in text_deltas + assert "world!" in text_deltas + + +@pytest.mark.asyncio +async def test_non_streaming_backward_compatible(client: Client): + """Verify non-streaming mode still works (backward compatibility).""" + model = StreamingTestModel() + async with AgentEnvironment( + model=model, + model_params=ModelActivityParameters( + start_to_close_timeout=timedelta(seconds=30), + enable_streaming=False, + ), + ) as env: + client = env.applied_on_client(client) + + async with new_worker( + client, + NonStreamingOpenAIWorkflow, + max_cached_workflows=0, + ) as worker: + result = await client.execute_workflow( + NonStreamingOpenAIWorkflow.run, + "Hello", + id=f"openai-non-streaming-test-{uuid.uuid4()}", + task_queue=worker.task_queue, + execution_timeout=timedelta(seconds=30), + ) + + assert result == "Hello world!" 
diff --git a/tests/contrib/pubsub/__init__.py b/tests/contrib/pubsub/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/contrib/pubsub/test_pubsub.py b/tests/contrib/pubsub/test_pubsub.py new file mode 100644 index 000000000..e0154035e --- /dev/null +++ b/tests/contrib/pubsub/test_pubsub.py @@ -0,0 +1,1333 @@ +"""E2E integration tests for temporalio.contrib.pubsub.""" + +from __future__ import annotations + +import asyncio +import uuid +from datetime import timedelta + +import pytest + +from typing import Any + +from dataclasses import dataclass + +from temporalio import activity, workflow +from temporalio.client import Client +from temporalio.contrib.pubsub import ( + PollInput, + PollResult, + PubSubClient, + PubSubItem, + PubSubMixin, + PubSubState, + PublishEntry, + PublishInput, +) +from temporalio.contrib.pubsub._types import encode_data +from tests.helpers import assert_eq_eventually, new_worker + + +# --------------------------------------------------------------------------- +# Test workflows (must be module-level, not local classes) +# --------------------------------------------------------------------------- + + +@workflow.defn +class BasicPubSubWorkflow(PubSubMixin): + @workflow.init + def __init__(self) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.run + async def run(self) -> None: + await workflow.wait_condition(lambda: self._closed) + + +@workflow.defn +class ActivityPublishWorkflow(PubSubMixin): + @workflow.init + def __init__(self, count: int) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.run + async def run(self, count: int) -> None: + await workflow.execute_activity( + "publish_items", + count, + start_to_close_timeout=timedelta(seconds=30), + heartbeat_timeout=timedelta(seconds=10), + ) + self.publish("status", b"activity_done") + await 
workflow.wait_condition(lambda: self._closed) + + +@workflow.defn +class WorkflowSidePublishWorkflow(PubSubMixin): + @workflow.init + def __init__(self, count: int) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.run + async def run(self, count: int) -> None: + for i in range(count): + self.publish("events", f"item-{i}".encode()) + await workflow.wait_condition(lambda: self._closed) + + +@workflow.defn +class MultiTopicWorkflow(PubSubMixin): + @workflow.init + def __init__(self, count: int) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.run + async def run(self, count: int) -> None: + await workflow.execute_activity( + "publish_multi_topic", + count, + start_to_close_timeout=timedelta(seconds=30), + heartbeat_timeout=timedelta(seconds=10), + ) + await workflow.wait_condition(lambda: self._closed) + + +@workflow.defn +class InterleavedWorkflow(PubSubMixin): + @workflow.init + def __init__(self, count: int) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.run + async def run(self, count: int) -> None: + self.publish("status", b"started") + await workflow.execute_activity( + "publish_items", + count, + start_to_close_timeout=timedelta(seconds=30), + heartbeat_timeout=timedelta(seconds=10), + ) + self.publish("status", b"done") + await workflow.wait_condition(lambda: self._closed) + + +@workflow.defn +class PriorityWorkflow(PubSubMixin): + @workflow.init + def __init__(self) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.run + async def run(self) -> None: + await workflow.execute_activity( + "publish_with_priority", + start_to_close_timeout=timedelta(seconds=30), + heartbeat_timeout=timedelta(seconds=10), + ) + await 
workflow.wait_condition(lambda: self._closed) + + +@workflow.defn +class FlushOnExitWorkflow(PubSubMixin): + @workflow.init + def __init__(self, count: int) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.run + async def run(self, count: int) -> None: + await workflow.execute_activity( + "publish_batch_test", + count, + start_to_close_timeout=timedelta(seconds=30), + heartbeat_timeout=timedelta(seconds=10), + ) + await workflow.wait_condition(lambda: self._closed) + + +@workflow.defn +class MaxBatchWorkflow(PubSubMixin): + @workflow.init + def __init__(self, count: int) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.run + async def run(self, count: int) -> None: + await workflow.execute_activity( + "publish_with_max_batch", + count, + start_to_close_timeout=timedelta(seconds=30), + heartbeat_timeout=timedelta(seconds=10), + ) + self.publish("status", b"activity_done") + await workflow.wait_condition(lambda: self._closed) + + +@workflow.defn +class MixinCoexistenceWorkflow(PubSubMixin): + @workflow.init + def __init__(self) -> None: + self.init_pubsub() + self._app_data: list[str] = [] + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.signal + def app_signal(self, value: str) -> None: + self._app_data.append(value) + + @workflow.query + def app_query(self) -> list[str]: + return self._app_data + + @workflow.run + async def run(self) -> None: + await workflow.wait_condition(lambda: self._closed) + + +# --------------------------------------------------------------------------- +# Activities +# --------------------------------------------------------------------------- + + +@activity.defn(name="publish_items") +async def publish_items(count: int) -> None: + client = PubSubClient.create(batch_interval=0.5) + async with client: + for i in 
range(count): + activity.heartbeat() + client.publish("events", f"item-{i}".encode()) + + +@activity.defn(name="publish_multi_topic") +async def publish_multi_topic(count: int) -> None: + topics = ["a", "b", "c"] + client = PubSubClient.create(batch_interval=0.5) + async with client: + for i in range(count): + activity.heartbeat() + topic = topics[i % len(topics)] + client.publish(topic, f"{topic}-{i}".encode()) + + +@activity.defn(name="publish_with_priority") +async def publish_with_priority() -> None: + client = PubSubClient.create(batch_interval=60.0) + async with client: + client.publish("events", b"normal-0") + client.publish("events", b"normal-1") + client.publish("events", b"priority", priority=True) + # Give the flusher time to wake and flush + await asyncio.sleep(0.5) + + +@activity.defn(name="publish_batch_test") +async def publish_batch_test(count: int) -> None: + client = PubSubClient.create(batch_interval=60.0) + async with client: + for i in range(count): + activity.heartbeat() + client.publish("events", f"item-{i}".encode()) + + +@activity.defn(name="publish_with_max_batch") +async def publish_with_max_batch(count: int) -> None: + client = PubSubClient.create(batch_interval=60.0, max_batch_size=3) + async with client: + for i in range(count): + activity.heartbeat() + client.publish("events", f"item-{i}".encode()) + # Long batch_interval ensures only max_batch_size triggers flushes + # Context manager exit flushes any remainder + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _is_different_run(old_handle, new_handle) -> bool: + """Check if new_handle points to a different run than old_handle.""" + try: + desc = await new_handle.describe() + return desc.run_id != old_handle.result_run_id + except Exception: + return False + + +async def collect_items( + handle, + topics: list[str] | None, + from_offset: int, + 
expected_count: int, + timeout: float = 15.0, +) -> list[PubSubItem]: + """Subscribe and collect exactly expected_count items, with timeout.""" + client = PubSubClient(handle) + items: list[PubSubItem] = [] + try: + async with asyncio.timeout(timeout): + async for item in client.subscribe( + topics=topics, from_offset=from_offset, poll_cooldown=0 + ): + items.append(item) + if len(items) >= expected_count: + break + except asyncio.TimeoutError: + pass + return items + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_activity_publish_and_subscribe(client: Client) -> None: + """Activity publishes items, external client subscribes and receives them.""" + count = 10 + async with new_worker( + client, + ActivityPublishWorkflow, + activities=[publish_items], + ) as worker: + handle = await client.start_workflow( + ActivityPublishWorkflow.run, + count, + id=f"pubsub-basic-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + # Collect activity items + the "activity_done" status item + items = await collect_items(handle, None, 0, count + 1) + assert len(items) == count + 1 + + # Check activity items + for i in range(count): + assert items[i].topic == "events" + assert items[i].data == f"item-{i}".encode() + + # Check workflow-side status item + assert items[count].topic == "status" + assert items[count].data == b"activity_done" + + await handle.signal(ActivityPublishWorkflow.close) + + +@pytest.mark.asyncio +async def test_topic_filtering(client: Client) -> None: + """Publish to multiple topics, subscribe with filter.""" + count = 9 # 3 per topic + async with new_worker( + client, + MultiTopicWorkflow, + activities=[publish_multi_topic], + ) as worker: + handle = await client.start_workflow( + MultiTopicWorkflow.run, + count, + id=f"pubsub-filter-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # 
Subscribe to topic "a" only — should get 3 items + a_items = await collect_items(handle, ["a"], 0, 3) + assert len(a_items) == 3 + assert all(item.topic == "a" for item in a_items) + + # Subscribe to ["a", "c"] — should get 6 items + ac_items = await collect_items(handle, ["a", "c"], 0, 6) + assert len(ac_items) == 6 + assert all(item.topic in ("a", "c") for item in ac_items) + + # Subscribe to all (None) — should get all 9 + all_items = await collect_items(handle, None, 0, 9) + assert len(all_items) == 9 + + await handle.signal(MultiTopicWorkflow.close) + + +@pytest.mark.asyncio +async def test_subscribe_from_offset(client: Client) -> None: + """Subscribe from a non-zero offset.""" + count = 5 + async with new_worker( + client, + WorkflowSidePublishWorkflow, + ) as worker: + handle = await client.start_workflow( + WorkflowSidePublishWorkflow.run, + count, + id=f"pubsub-offset-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Subscribe from offset 3 — should get items 3, 4 + items = await collect_items(handle, None, 3, 2) + assert len(items) == 2 + assert items[0].data == b"item-3" + assert items[1].data == b"item-4" + + # Subscribe from offset 0 — should get all 5 + all_items = await collect_items(handle, None, 0, 5) + assert len(all_items) == 5 + + await handle.signal(WorkflowSidePublishWorkflow.close) + + +@pytest.mark.asyncio +async def test_per_item_offsets(client: Client) -> None: + """Each yielded PubSubItem carries its correct global offset.""" + count = 5 + async with new_worker( + client, + WorkflowSidePublishWorkflow, + ) as worker: + handle = await client.start_workflow( + WorkflowSidePublishWorkflow.run, + count, + id=f"pubsub-item-offset-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + items = await collect_items(handle, None, 0, count) + assert len(items) == count + for i, item in enumerate(items): + assert item.offset == i, f"item {i} has offset {item.offset}" + + # Subscribe from offset 3 — offsets should be 3, 4 + later_items = 
await collect_items(handle, None, 3, 2) + assert len(later_items) == 2 + assert later_items[0].offset == 3 + assert later_items[1].offset == 4 + + await handle.signal(WorkflowSidePublishWorkflow.close) + + +@pytest.mark.asyncio +async def test_per_item_offsets_with_topic_filter(client: Client) -> None: + """Per-item offsets are global (not per-topic) even when filtering.""" + count = 9 # 3 per topic (a, b, c round-robin) + async with new_worker( + client, + MultiTopicWorkflow, + activities=[publish_multi_topic], + ) as worker: + handle = await client.start_workflow( + MultiTopicWorkflow.run, + count, + id=f"pubsub-item-offset-filter-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Subscribe to topic "a" only — items are at global offsets 0, 3, 6 + a_items = await collect_items(handle, ["a"], 0, 3) + assert len(a_items) == 3 + assert a_items[0].offset == 0 + assert a_items[1].offset == 3 + assert a_items[2].offset == 6 + + # Subscribe to topic "b" — items are at global offsets 1, 4, 7 + b_items = await collect_items(handle, ["b"], 0, 3) + assert len(b_items) == 3 + assert b_items[0].offset == 1 + assert b_items[1].offset == 4 + assert b_items[2].offset == 7 + + await handle.signal(MultiTopicWorkflow.close) + + +@pytest.mark.asyncio +async def test_per_item_offsets_after_truncation(client: Client) -> None: + """Per-item offsets remain correct after log truncation.""" + async with new_worker( + client, + TruncateSignalWorkflow, + ) as worker: + handle = await client.start_workflow( + TruncateSignalWorkflow.run, + id=f"pubsub-item-offset-trunc-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Publish 5 items + await handle.signal( + "__pubsub_publish", + PublishInput(items=[ + PublishEntry(topic="events", data=encode_data(f"item-{i}".encode())) + for i in range(5) + ]), + ) + await asyncio.sleep(0.5) + + # Truncate up to offset 3 + await handle.signal("truncate", 3) + await asyncio.sleep(0.3) + + # Items 3, 4 should have offsets 3, 4 + items = await 
collect_items(handle, None, 3, 2) + assert len(items) == 2 + assert items[0].offset == 3 + assert items[1].offset == 4 + + await handle.signal("close") + + +@pytest.mark.asyncio +async def test_poll_truncated_offset_returns_application_error(client: Client) -> None: + """Polling a truncated offset raises ApplicationError (not ValueError) + and does not crash the workflow task.""" + async with new_worker( + client, + TruncateSignalWorkflow, + ) as worker: + handle = await client.start_workflow( + TruncateSignalWorkflow.run, + id=f"pubsub-trunc-error-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Publish 5 items + await handle.signal( + "__pubsub_publish", + PublishInput(items=[ + PublishEntry(topic="events", data=encode_data(f"item-{i}".encode())) + for i in range(5) + ]), + ) + await asyncio.sleep(0.5) + + # Truncate up to offset 3 + await handle.signal("truncate", 3) + await asyncio.sleep(0.3) + + # Poll from offset 1 (truncated) — should get ApplicationError, + # NOT crash the workflow task. 
+ from temporalio.client import WorkflowUpdateFailedError + with pytest.raises(WorkflowUpdateFailedError): + await handle.execute_update( + "__pubsub_poll", + PollInput(topics=[], from_offset=1), + result_type=PollResult, + ) + + # Workflow should still be usable — poll from valid offset 3 + items = await collect_items(handle, None, 3, 2) + assert len(items) == 2 + assert items[0].offset == 3 + + await handle.signal("close") + + +@pytest.mark.asyncio +async def test_poll_offset_zero_after_truncation(client: Client) -> None: + """Polling from offset 0 after truncation returns items from base_offset.""" + async with new_worker( + client, + TruncateSignalWorkflow, + ) as worker: + handle = await client.start_workflow( + TruncateSignalWorkflow.run, + id=f"pubsub-trunc-zero-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Publish 5 items, truncate first 3 + await handle.signal( + "__pubsub_publish", + PublishInput(items=[ + PublishEntry(topic="events", data=encode_data(f"item-{i}".encode())) + for i in range(5) + ]), + ) + await asyncio.sleep(0.5) + await handle.signal("truncate", 3) + await asyncio.sleep(0.3) + + # Poll from offset 0 — should get items starting from base_offset (3) + items = await collect_items(handle, None, 0, 2) + assert len(items) == 2 + assert items[0].offset == 3 + assert items[1].offset == 4 + + await handle.signal("close") + + +@pytest.mark.asyncio +async def test_subscribe_recovers_from_truncation(client: Client) -> None: + """subscribe() auto-recovers when offset falls behind truncation.""" + async with new_worker( + client, + TruncateSignalWorkflow, + ) as worker: + handle = await client.start_workflow( + TruncateSignalWorkflow.run, + id=f"pubsub-trunc-recover-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Publish 5 items + await handle.signal( + "__pubsub_publish", + PublishInput(items=[ + PublishEntry(topic="events", data=encode_data(f"item-{i}".encode())) + for i in range(5) + ]), + ) + await asyncio.sleep(0.5) + + # 
Truncate first 3 + await handle.signal("truncate", 3) + await asyncio.sleep(0.3) + + # subscribe from offset 1 (truncated) — should auto-recover + # and deliver items from base_offset (3) + pubsub = PubSubClient(handle) + items: list[PubSubItem] = [] + try: + async with asyncio.timeout(5): + async for item in pubsub.subscribe( + from_offset=1, poll_cooldown=0 + ): + items.append(item) + if len(items) >= 2: + break + except asyncio.TimeoutError: + pass + assert len(items) == 2 + assert items[0].offset == 3 + + await handle.signal("close") + + +@pytest.mark.asyncio +async def test_workflow_and_activity_publish_interleaved(client: Client) -> None: + """Workflow publishes status events around activity publishing.""" + count = 5 + async with new_worker( + client, + InterleavedWorkflow, + activities=[publish_items], + ) as worker: + handle = await client.start_workflow( + InterleavedWorkflow.run, + count, + id=f"pubsub-interleave-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Total: 1 (started) + count (activity) + 1 (done) = count + 2 + items = await collect_items(handle, None, 0, count + 2) + assert len(items) == count + 2 + + # First item is workflow-side "started" + assert items[0].topic == "status" + assert items[0].data == b"started" + + # Middle items are from activity + for i in range(count): + assert items[i + 1].topic == "events" + assert items[i + 1].data == f"item-{i}".encode() + + # Last item is workflow-side "done" + assert items[count + 1].topic == "status" + assert items[count + 1].data == b"done" + + await handle.signal(InterleavedWorkflow.close) + + +@pytest.mark.asyncio +async def test_priority_flush(client: Client) -> None: + """Priority publish triggers immediate flush without waiting for timer.""" + async with new_worker( + client, + PriorityWorkflow, + activities=[publish_with_priority], + ) as worker: + handle = await client.start_workflow( + PriorityWorkflow.run, + id=f"pubsub-priority-{uuid.uuid4()}", + task_queue=worker.task_queue, 
+ ) + + # If priority works, we get all 3 items quickly despite 60s batch interval + items = await collect_items(handle, None, 0, 3, timeout=10.0) + assert len(items) == 3 + assert items[2].data == b"priority" + + await handle.signal(PriorityWorkflow.close) + + +@pytest.mark.asyncio +async def test_iterator_cancellation(client: Client) -> None: + """Cancelling a subscription iterator completes cleanly.""" + async with new_worker( + client, + BasicPubSubWorkflow, + ) as worker: + handle = await client.start_workflow( + BasicPubSubWorkflow.run, + id=f"pubsub-cancel-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + pubsub_client = PubSubClient(handle) + + async def subscribe_and_collect(): + items = [] + async for item in pubsub_client.subscribe( + from_offset=0, poll_cooldown=0 + ): + items.append(item) + return items + + task = asyncio.create_task(subscribe_and_collect()) + await asyncio.sleep(0.5) + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + await handle.signal(BasicPubSubWorkflow.close) + + +@pytest.mark.asyncio +async def test_context_manager_flushes_on_exit(client: Client) -> None: + """Context manager exit flushes all buffered items.""" + count = 5 + async with new_worker( + client, + FlushOnExitWorkflow, + activities=[publish_batch_test], + ) as worker: + handle = await client.start_workflow( + FlushOnExitWorkflow.run, + count, + id=f"pubsub-flush-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Despite 60s batch interval, all items arrive because __aexit__ flushes + items = await collect_items(handle, None, 0, count, timeout=15.0) + assert len(items) == count + for i in range(count): + assert items[i].data == f"item-{i}".encode() + + await handle.signal(FlushOnExitWorkflow.close) + + +@pytest.mark.asyncio +async def test_concurrent_subscribers(client: Client) -> None: + """Two subscribers on different topics receive correct items concurrently.""" + count = 6 # 2 per topic + async with new_worker( + client, + 
MultiTopicWorkflow, + activities=[publish_multi_topic], + ) as worker: + handle = await client.start_workflow( + MultiTopicWorkflow.run, + count, + id=f"pubsub-concurrent-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + a_task = asyncio.create_task(collect_items(handle, ["a"], 0, 2)) + b_task = asyncio.create_task(collect_items(handle, ["b"], 0, 2)) + + a_items, b_items = await asyncio.gather(a_task, b_task) + + assert len(a_items) == 2 + assert all(item.topic == "a" for item in a_items) + assert len(b_items) == 2 + assert all(item.topic == "b" for item in b_items) + + await handle.signal(MultiTopicWorkflow.close) + + +@pytest.mark.asyncio +async def test_mixin_coexistence(client: Client) -> None: + """PubSubMixin works alongside application signals and queries.""" + async with new_worker( + client, + MixinCoexistenceWorkflow, + ) as worker: + handle = await client.start_workflow( + MixinCoexistenceWorkflow.run, + id=f"pubsub-coexist-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Use application signal + await handle.signal(MixinCoexistenceWorkflow.app_signal, "hello") + await handle.signal(MixinCoexistenceWorkflow.app_signal, "world") + + # Use pub/sub signal + await handle.signal( + "__pubsub_publish", + PublishInput(items=[PublishEntry(topic="events", data=encode_data(b"test-item"))]), + ) + + # Give signals time to be processed + await asyncio.sleep(0.5) + + # Query application state + app_data = await handle.query(MixinCoexistenceWorkflow.app_query) + assert app_data == ["hello", "world"] + + # Query pub/sub offset + pubsub_client = PubSubClient(handle) + offset = await pubsub_client.get_offset() + assert offset == 1 + + # Subscribe to pub/sub + items = await collect_items(handle, None, 0, 1) + assert len(items) == 1 + assert items[0].topic == "events" + + await handle.signal(MixinCoexistenceWorkflow.close) + + +@pytest.mark.asyncio +async def test_max_batch_size(client: Client) -> None: + """max_batch_size triggers auto-flush without 
waiting for timer.""" + count = 7 # with max_batch_size=3: flushes at 3, 6, then remainder 1 on exit + async with new_worker( + client, + MaxBatchWorkflow, + activities=[publish_with_max_batch], + max_cached_workflows=0, + ) as worker: + handle = await client.start_workflow( + MaxBatchWorkflow.run, + count, + id=f"pubsub-maxbatch-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + # count items from activity + 1 "activity_done" from workflow + items = await collect_items(handle, None, 0, count + 1, timeout=15.0) + assert len(items) == count + 1 + for i in range(count): + assert items[i].data == f"item-{i}".encode() + await handle.signal(MaxBatchWorkflow.close) + + +@pytest.mark.asyncio +async def test_replay_safety(client: Client) -> None: + """Pub/sub mixin survives workflow replay (max_cached_workflows=0).""" + async with new_worker( + client, + InterleavedWorkflow, + activities=[publish_items], + max_cached_workflows=0, + ) as worker: + handle = await client.start_workflow( + InterleavedWorkflow.run, + 5, + id=f"pubsub-replay-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + # 1 (started) + 5 (activity) + 1 (done) = 7 + items = await collect_items(handle, None, 0, 7) + assert len(items) == 7 + assert items[0].data == b"started" + assert items[6].data == b"done" + await handle.signal(InterleavedWorkflow.close) + + +@pytest.mark.asyncio +async def test_flush_keeps_pending_on_signal_failure(client: Client) -> None: + """If flush signal fails, items stay in _pending for retry with same sequence. + + This matches the TLA+-verified algorithm (PubSubDedup.tla): on failure, + the pending batch and sequence are kept so the next _flush() retries with + the SAME sequence number. The confirmed sequence (_sequence) does NOT + advance until delivery is confirmed. 
+ """ + bogus_handle = client.get_workflow_handle("nonexistent-workflow-id") + pubsub = PubSubClient(bogus_handle) + + pubsub.publish("events", b"item-0") + pubsub.publish("events", b"item-1") + assert len(pubsub._buffer) == 2 + + # flush should fail (workflow doesn't exist) + with pytest.raises(Exception): + await pubsub._flush() + + # Items moved to _pending (not restored to _buffer) + assert len(pubsub._buffer) == 0 + assert pubsub._pending is not None + assert len(pubsub._pending) == 2 + assert pubsub._pending[0].data == encode_data(b"item-0") + assert pubsub._pending[1].data == encode_data(b"item-1") + # Pending sequence is set, confirmed sequence is NOT advanced + assert pubsub._pending_seq == 1 + assert pubsub._sequence == 0 + + # New items published during failure go to _buffer (not _pending) + pubsub.publish("events", b"item-2") + assert len(pubsub._buffer) == 1 + assert pubsub._pending is not None # Still set for retry + + # Next flush retries the pending batch with the same sequence + with pytest.raises(Exception): + await pubsub._flush() + assert pubsub._pending_seq == 1 # Same sequence on retry + assert pubsub._sequence == 0 # Still not advanced + + +@pytest.mark.asyncio +async def test_max_retry_duration_expiry(client: Client) -> None: + """Flush raises TimeoutError when max_retry_duration is exceeded.""" + bogus_handle = client.get_workflow_handle("nonexistent-workflow-id") + pubsub = PubSubClient(bogus_handle, max_retry_duration=0.1) + + pubsub.publish("events", b"item-0") + + # First flush fails, sets pending + with pytest.raises(Exception, match="not found"): + await pubsub._flush() + assert pubsub._pending is not None + + # Wait for retry duration to expire + await asyncio.sleep(0.2) + + # Next flush should raise TimeoutError and clear pending + with pytest.raises(TimeoutError, match="max_retry_duration"): + await pubsub._flush() + assert pubsub._pending is None + assert pubsub._sequence == 0 + + +@pytest.mark.asyncio +async def 
test_dedup_rejects_duplicate_signal(client: Client) -> None: + """Workflow deduplicates signals with the same publisher_id + sequence.""" + async with new_worker( + client, + BasicPubSubWorkflow, + ) as worker: + handle = await client.start_workflow( + BasicPubSubWorkflow.run, + id=f"pubsub-dedup-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Send a batch with publisher_id and sequence + await handle.signal( + "__pubsub_publish", + PublishInput( + items=[PublishEntry(topic="events", data=encode_data(b"item-0"))], + publisher_id="test-pub", + sequence=1, + ), + ) + + # Send the same sequence again — should be deduped + await handle.signal( + "__pubsub_publish", + PublishInput( + items=[PublishEntry(topic="events", data=encode_data(b"duplicate"))], + publisher_id="test-pub", + sequence=1, + ), + ) + + # Send a new sequence — should go through + await handle.signal( + "__pubsub_publish", + PublishInput( + items=[PublishEntry(topic="events", data=encode_data(b"item-1"))], + publisher_id="test-pub", + sequence=2, + ), + ) + + await asyncio.sleep(0.5) + + # Should have 2 items, not 3 + items = await collect_items(handle, None, 0, 2) + assert len(items) == 2 + assert items[0].data == b"item-0" + assert items[1].data == b"item-1" + + # Verify offset is 2 (not 3) + pubsub_client = PubSubClient(handle) + offset = await pubsub_client.get_offset() + assert offset == 2 + + await handle.signal(BasicPubSubWorkflow.close) + + +@pytest.mark.asyncio +async def test_truncate_pubsub(client: Client) -> None: + """truncate_pubsub discards prefix and adjusts base_offset.""" + async with new_worker( + client, + TruncateSignalWorkflow, + ) as worker: + handle = await client.start_workflow( + TruncateSignalWorkflow.run, + id=f"pubsub-truncate-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Publish 5 items via signal + await handle.signal( + "__pubsub_publish", + PublishInput(items=[ + PublishEntry(topic="events", data=encode_data(f"item-{i}".encode())) + for i in 
range(5) + ]), + ) + await asyncio.sleep(0.5) + + # Verify all 5 items + items = await collect_items(handle, None, 0, 5) + assert len(items) == 5 + + # Truncate up to offset 3 (discard items 0, 1, 2) + await handle.signal("truncate", 3) + await asyncio.sleep(0.3) + + # Offset should still be 5 + pubsub_client = PubSubClient(handle) + offset = await pubsub_client.get_offset() + assert offset == 5 + + # Reading from offset 3 should work (items 3, 4) + items_after = await collect_items(handle, None, 3, 2) + assert len(items_after) == 2 + assert items_after[0].data == b"item-3" + assert items_after[1].data == b"item-4" + + await handle.signal("close") + + +@pytest.mark.asyncio +async def test_ttl_pruning_in_get_pubsub_state(client: Client) -> None: + """get_pubsub_state prunes stale publisher entries based on TTL.""" + async with new_worker( + client, + TTLTestWorkflow, + ) as worker: + handle = await client.start_workflow( + TTLTestWorkflow.run, + id=f"pubsub-ttl-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Publish from two different publishers + await handle.signal( + "__pubsub_publish", + PublishInput( + items=[PublishEntry(topic="events", data=encode_data(b"from-a"))], + publisher_id="pub-a", + sequence=1, + ), + ) + await handle.signal( + "__pubsub_publish", + PublishInput( + items=[PublishEntry(topic="events", data=encode_data(b"from-b"))], + publisher_id="pub-b", + sequence=1, + ), + ) + await asyncio.sleep(0.5) + + # Query state with a very long TTL — both publishers retained + state = await handle.query(TTLTestWorkflow.get_state_with_ttl, 9999.0) + assert "pub-a" in state.publisher_sequences + assert "pub-b" in state.publisher_sequences + + # Query state with TTL=0 — both publishers pruned + state_pruned = await handle.query(TTLTestWorkflow.get_state_with_ttl, 0.0) + assert "pub-a" not in state_pruned.publisher_sequences + assert "pub-b" not in state_pruned.publisher_sequences + + # Items are still in the log regardless of pruning + assert 
len(state_pruned.log) == 2 + + await handle.signal("close") + + +# --------------------------------------------------------------------------- +# Truncate and TTL test workflows +# --------------------------------------------------------------------------- + + +@workflow.defn +class TruncateSignalWorkflow(PubSubMixin): + """Workflow that accepts a truncate signal for testing.""" + + @workflow.init + def __init__(self) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.signal + def truncate(self, up_to_offset: int) -> None: + self.truncate_pubsub(up_to_offset) + + @workflow.run + async def run(self) -> None: + await workflow.wait_condition(lambda: self._closed) + + +@workflow.defn +class TTLTestWorkflow(PubSubMixin): + """Workflow that exposes get_pubsub_state via query for TTL testing.""" + + @workflow.init + def __init__(self) -> None: + self.init_pubsub() + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.query + def get_state_with_ttl(self, ttl: float) -> PubSubState: + return self.get_pubsub_state(publisher_ttl=ttl) + + @workflow.run + async def run(self) -> None: + await workflow.wait_condition(lambda: self._closed) + + +# --------------------------------------------------------------------------- +# Continue-as-new workflow and test +# --------------------------------------------------------------------------- + + +@dataclass +class CANWorkflowInputAny: + """Uses Any typing — reproduces the pitfall.""" + pubsub_state: Any = None + + +@dataclass +class CANWorkflowInputTyped: + """Uses proper typing.""" + pubsub_state: PubSubState | None = None + + +@workflow.defn +class ContinueAsNewAnyWorkflow(PubSubMixin): + """CAN workflow using Any-typed pubsub_state (reproduces samples pattern).""" + + @workflow.init + def __init__(self, input: CANWorkflowInputAny) -> None: + self.init_pubsub(prior_state=input.pubsub_state) + 
self._should_continue = False + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.signal + def trigger_continue(self) -> None: + self._should_continue = True + + @workflow.run + async def run(self, input: CANWorkflowInputAny) -> None: + while True: + await workflow.wait_condition( + lambda: self._should_continue or self._closed + ) + if self._closed: + return + if self._should_continue: + self._should_continue = False + self.drain_pubsub() + await workflow.wait_condition(workflow.all_handlers_finished) + workflow.continue_as_new(args=[CANWorkflowInputAny( + pubsub_state=self.get_pubsub_state(), + )]) + + +@workflow.defn +class ContinueAsNewTypedWorkflow(PubSubMixin): + """CAN workflow using properly-typed pubsub_state.""" + + @workflow.init + def __init__(self, input: CANWorkflowInputTyped) -> None: + self.init_pubsub(prior_state=input.pubsub_state) + self._should_continue = False + self._closed = False + + @workflow.signal + def close(self) -> None: + self._closed = True + + @workflow.signal + def trigger_continue(self) -> None: + self._should_continue = True + + @workflow.run + async def run(self, input: CANWorkflowInputTyped) -> None: + while True: + await workflow.wait_condition( + lambda: self._should_continue or self._closed + ) + if self._closed: + return + if self._should_continue: + self._should_continue = False + self.drain_pubsub() + await workflow.wait_condition(workflow.all_handlers_finished) + workflow.continue_as_new(args=[CANWorkflowInputTyped( + pubsub_state=self.get_pubsub_state(), + )]) + + +async def _run_can_test(can_client: Client, workflow_cls, input_cls) -> None: + """Shared CAN test logic: publish, CAN, verify items survive.""" + async with new_worker( + can_client, + workflow_cls, + ) as worker: + handle = await can_client.start_workflow( + workflow_cls.run, + input_cls(), + id=f"pubsub-can-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + # Publish 3 items via signal + await 
handle.signal( + "__pubsub_publish", + PublishInput(items=[ + PublishEntry(topic="events", data=encode_data(b"item-0")), + PublishEntry(topic="events", data=encode_data(b"item-1")), + PublishEntry(topic="events", data=encode_data(b"item-2")), + ]), + ) + + # Verify items are there + items_before = await collect_items(handle, None, 0, 3) + assert len(items_before) == 3 + + # Trigger continue-as-new + await handle.signal(workflow_cls.trigger_continue) + + # Wait for new run to start (poll, don't sleep) + new_handle = can_client.get_workflow_handle(handle.id) + await assert_eq_eventually( + True, + lambda: _is_different_run(handle, new_handle), + ) + + # The 3 items from before CAN should still be readable + items_after = await collect_items(new_handle, None, 0, 3) + assert len(items_after) == 3 + assert items_after[0].data == b"item-0" + assert items_after[1].data == b"item-1" + assert items_after[2].data == b"item-2" + + # New items should get offset 3+ + await new_handle.signal( + "__pubsub_publish", + PublishInput(items=[PublishEntry(topic="events", data=encode_data(b"item-3"))]), + ) + items_all = await collect_items(new_handle, None, 0, 4) + assert len(items_all) == 4 + assert items_all[3].data == b"item-3" + + await new_handle.signal(workflow_cls.close) + + +@pytest.mark.asyncio +async def test_continue_as_new_any_typed_fails(client: Client) -> None: + """Any-typed pubsub_state does NOT survive CAN — documents the pitfall. + + The default data converter deserializes Any fields as plain dicts, losing + the PubSubState type. Use ``PubSubState | None`` instead. 
+ """ + async with new_worker( + client, + ContinueAsNewAnyWorkflow, + ) as worker: + handle = await client.start_workflow( + ContinueAsNewAnyWorkflow.run, + CANWorkflowInputAny(), + id=f"pubsub-can-any-{uuid.uuid4()}", + task_queue=worker.task_queue, + ) + + await handle.signal( + "__pubsub_publish", + PublishInput(items=[PublishEntry(topic="events", data=encode_data(b"item-0"))]), + ) + items = await collect_items(handle, None, 0, 1) + assert len(items) == 1 + + # Trigger CAN — the new run will fail to deserialize pubsub_state + await handle.signal(ContinueAsNewAnyWorkflow.trigger_continue) + + # Wait for CAN to happen + new_handle = client.get_workflow_handle(handle.id) + await assert_eq_eventually( + True, + lambda: _is_different_run(handle, new_handle), + ) + + # The new run should be broken — items are NOT accessible + items_after = await collect_items(new_handle, None, 0, 1, timeout=3.0) + assert len(items_after) == 0 # fails because workflow can't start + + +@pytest.mark.asyncio +async def test_continue_as_new_properly_typed(client: Client) -> None: + """CAN with PubSubState-typed pubsub_state field.""" + await _run_can_test(client, ContinueAsNewTypedWorkflow, CANWorkflowInputTyped) diff --git a/uv.lock b/uv.lock index c63faefad..df900573a 100644 --- a/uv.lock +++ b/uv.lock @@ -8,6 +8,10 @@ resolution-markers = [ "python_full_version < '3.11'", ] +[options] +exclude-newer = "2026-03-30T03:37:56.787253Z" +exclude-newer-span = "P7D" + [[package]] name = "aioboto3" version = "15.5.0" @@ -1768,7 +1772,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/3f/9859f655d11901e7b2996c6e3d33e0caa9a1d4572c3bc61ed0faa64b2f4c/greenlet-3.3.2-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:9bc885b89709d901859cf95179ec9f6bb67a3d2bb1f0e88456461bd4b7f8fd0d", size = 277747, upload-time = "2026-02-20T20:16:21.325Z" }, { url = 
"https://files.pythonhosted.org/packages/fb/07/cb284a8b5c6498dbd7cba35d31380bb123d7dceaa7907f606c8ff5993cbf/greenlet-3.3.2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b568183cf65b94919be4438dc28416b234b678c608cafac8874dfeeb2a9bbe13", size = 579202, upload-time = "2026-02-20T20:47:28.955Z" }, { url = "https://files.pythonhosted.org/packages/ed/45/67922992b3a152f726163b19f890a85129a992f39607a2a53155de3448b8/greenlet-3.3.2-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:527fec58dc9f90efd594b9b700662ed3fb2493c2122067ac9c740d98080a620e", size = 590620, upload-time = "2026-02-20T20:55:55.581Z" }, - { url = "https://files.pythonhosted.org/packages/03/5f/6e2a7d80c353587751ef3d44bb947f0565ec008a2e0927821c007e96d3a7/greenlet-3.3.2-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:508c7f01f1791fbc8e011bd508f6794cb95397fdb198a46cb6635eb5b78d85a7", size = 602132, upload-time = "2026-02-20T21:02:43.261Z" }, { url = "https://files.pythonhosted.org/packages/ad/55/9f1ebb5a825215fadcc0f7d5073f6e79e3007e3282b14b22d6aba7ca6cb8/greenlet-3.3.2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ad0c8917dd42a819fe77e6bdfcb84e3379c0de956469301d9fd36427a1ca501f", size = 591729, upload-time = "2026-02-20T20:20:58.395Z" }, { url = "https://files.pythonhosted.org/packages/24/b4/21f5455773d37f94b866eb3cf5caed88d6cea6dd2c6e1f9c34f463cba3ec/greenlet-3.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:97245cc10e5515dbc8c3104b2928f7f02b6813002770cfaffaf9a6e0fc2b94ef", size = 1551946, upload-time = "2026-02-20T20:49:31.102Z" }, { url = "https://files.pythonhosted.org/packages/00/68/91f061a926abead128fe1a87f0b453ccf07368666bd59ffa46016627a930/greenlet-3.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8c1fdd7d1b309ff0da81d60a9688a8bd044ac4e18b250320a96fc68d31c209ca", size = 1618494, upload-time = "2026-02-20T20:21:06.541Z" }, @@ -1776,7 +1779,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/f3/47/16400cb42d18d7a6bb46f0626852c1718612e35dcb0dffa16bbaffdf5dd2/greenlet-3.3.2-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:c56692189a7d1c7606cb794be0a8381470d95c57ce5be03fb3d0ef57c7853b86", size = 278890, upload-time = "2026-02-20T20:19:39.263Z" }, { url = "https://files.pythonhosted.org/packages/a3/90/42762b77a5b6aa96cd8c0e80612663d39211e8ae8a6cd47c7f1249a66262/greenlet-3.3.2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ebd458fa8285960f382841da585e02201b53a5ec2bac6b156fc623b5ce4499f", size = 581120, upload-time = "2026-02-20T20:47:30.161Z" }, { url = "https://files.pythonhosted.org/packages/bf/6f/f3d64f4fa0a9c7b5c5b3c810ff1df614540d5aa7d519261b53fba55d4df9/greenlet-3.3.2-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a443358b33c4ec7b05b79a7c8b466f5d275025e750298be7340f8fc63dff2a55", size = 594363, upload-time = "2026-02-20T20:55:56.965Z" }, - { url = "https://files.pythonhosted.org/packages/9c/8b/1430a04657735a3f23116c2e0d5eb10220928846e4537a938a41b350bed6/greenlet-3.3.2-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4375a58e49522698d3e70cc0b801c19433021b5c37686f7ce9c65b0d5c8677d2", size = 605046, upload-time = "2026-02-20T21:02:45.234Z" }, { url = "https://files.pythonhosted.org/packages/72/83/3e06a52aca8128bdd4dcd67e932b809e76a96ab8c232a8b025b2850264c5/greenlet-3.3.2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e2cd90d413acbf5e77ae41e5d3c9b3ac1d011a756d7284d7f3f2b806bbd6358", size = 594156, upload-time = "2026-02-20T20:20:59.955Z" }, { url = "https://files.pythonhosted.org/packages/70/79/0de5e62b873e08fe3cef7dbe84e5c4bc0e8ed0c7ff131bccb8405cd107c8/greenlet-3.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:442b6057453c8cb29b4fb36a2ac689382fc71112273726e2423f7f17dc73bf99", size = 1554649, upload-time = "2026-02-20T20:49:32.293Z" }, { url = 
"https://files.pythonhosted.org/packages/5a/00/32d30dee8389dc36d42170a9c66217757289e2afb0de59a3565260f38373/greenlet-3.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:45abe8eb6339518180d5a7fa47fa01945414d7cca5ecb745346fc6a87d2750be", size = 1619472, upload-time = "2026-02-20T20:21:07.966Z" }, @@ -1785,7 +1787,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ea/ab/1608e5a7578e62113506740b88066bf09888322a311cff602105e619bd87/greenlet-3.3.2-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:ac8d61d4343b799d1e526db579833d72f23759c71e07181c2d2944e429eb09cd", size = 280358, upload-time = "2026-02-20T20:17:43.971Z" }, { url = "https://files.pythonhosted.org/packages/a5/23/0eae412a4ade4e6623ff7626e38998cb9b11e9ff1ebacaa021e4e108ec15/greenlet-3.3.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ceec72030dae6ac0c8ed7591b96b70410a8be370b6a477b1dbc072856ad02bd", size = 601217, upload-time = "2026-02-20T20:47:31.462Z" }, { url = "https://files.pythonhosted.org/packages/f8/16/5b1678a9c07098ecb9ab2dd159fafaf12e963293e61ee8d10ecb55273e5e/greenlet-3.3.2-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a2a5be83a45ce6188c045bcc44b0ee037d6a518978de9a5d97438548b953a1ac", size = 611792, upload-time = "2026-02-20T20:55:58.423Z" }, - { url = "https://files.pythonhosted.org/packages/5c/c5/cc09412a29e43406eba18d61c70baa936e299bc27e074e2be3806ed29098/greenlet-3.3.2-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ae9e21c84035c490506c17002f5c8ab25f980205c3e61ddb3a2a2a2e6c411fcb", size = 626250, upload-time = "2026-02-20T21:02:46.596Z" }, { url = "https://files.pythonhosted.org/packages/50/1f/5155f55bd71cabd03765a4aac9ac446be129895271f73872c36ebd4b04b6/greenlet-3.3.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43e99d1749147ac21dde49b99c9abffcbc1e2d55c67501465ef0930d6e78e070", size = 613875, upload-time = "2026-02-20T20:21:01.102Z" }, { url = 
"https://files.pythonhosted.org/packages/fc/dd/845f249c3fcd69e32df80cdab059b4be8b766ef5830a3d0aa9d6cad55beb/greenlet-3.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4c956a19350e2c37f2c48b336a3afb4bff120b36076d9d7fb68cb44e05d95b79", size = 1571467, upload-time = "2026-02-20T20:49:33.495Z" }, { url = "https://files.pythonhosted.org/packages/2a/50/2649fe21fcc2b56659a452868e695634722a6655ba245d9f77f5656010bf/greenlet-3.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6c6f8ba97d17a1e7d664151284cb3315fc5f8353e75221ed4324f84eb162b395", size = 1640001, upload-time = "2026-02-20T20:21:09.154Z" }, @@ -1794,7 +1795,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/48/f8b875fa7dea7dd9b33245e37f065af59df6a25af2f9561efa8d822fde51/greenlet-3.3.2-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:aa6ac98bdfd716a749b84d4034486863fd81c3abde9aa3cf8eff9127981a4ae4", size = 279120, upload-time = "2026-02-20T20:19:01.9Z" }, { url = "https://files.pythonhosted.org/packages/49/8d/9771d03e7a8b1ee456511961e1b97a6d77ae1dea4a34a5b98eee706689d3/greenlet-3.3.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ab0c7e7901a00bc0a7284907273dc165b32e0d109a6713babd04471327ff7986", size = 603238, upload-time = "2026-02-20T20:47:32.873Z" }, { url = "https://files.pythonhosted.org/packages/59/0e/4223c2bbb63cd5c97f28ffb2a8aee71bdfb30b323c35d409450f51b91e3e/greenlet-3.3.2-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d248d8c23c67d2291ffd47af766e2a3aa9fa1c6703155c099feb11f526c63a92", size = 614219, upload-time = "2026-02-20T20:55:59.817Z" }, - { url = "https://files.pythonhosted.org/packages/94/2b/4d012a69759ac9d77210b8bfb128bc621125f5b20fc398bce3940d036b1c/greenlet-3.3.2-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ccd21bb86944ca9be6d967cf7691e658e43417782bce90b5d2faeda0ff78a7dd", size = 628268, upload-time = "2026-02-20T21:02:48.024Z" }, { url = 
"https://files.pythonhosted.org/packages/7a/34/259b28ea7a2a0c904b11cd36c79b8cef8019b26ee5dbe24e73b469dea347/greenlet-3.3.2-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b6997d360a4e6a4e936c0f9625b1c20416b8a0ea18a8e19cabbefc712e7397ab", size = 616774, upload-time = "2026-02-20T20:21:02.454Z" }, { url = "https://files.pythonhosted.org/packages/0a/03/996c2d1689d486a6e199cb0f1cf9e4aa940c500e01bdf201299d7d61fa69/greenlet-3.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:64970c33a50551c7c50491671265d8954046cb6e8e2999aacdd60e439b70418a", size = 1571277, upload-time = "2026-02-20T20:49:34.795Z" }, { url = "https://files.pythonhosted.org/packages/d9/c4/2570fc07f34a39f2caf0bf9f24b0a1a0a47bc2e8e465b2c2424821389dfc/greenlet-3.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1a9172f5bf6bd88e6ba5a84e0a68afeac9dc7b6b412b245dd64f52d83c81e55b", size = 1640455, upload-time = "2026-02-20T20:21:10.261Z" }, @@ -1803,7 +1803,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/ae/8bffcbd373b57a5992cd077cbe8858fff39110480a9d50697091faea6f39/greenlet-3.3.2-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:8d1658d7291f9859beed69a776c10822a0a799bc4bfe1bd4272bb60e62507dab", size = 279650, upload-time = "2026-02-20T20:18:00.783Z" }, { url = "https://files.pythonhosted.org/packages/d1/c0/45f93f348fa49abf32ac8439938726c480bd96b2a3c6f4d949ec0124b69f/greenlet-3.3.2-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18cb1b7337bca281915b3c5d5ae19f4e76d35e1df80f4ad3c1a7be91fadf1082", size = 650295, upload-time = "2026-02-20T20:47:34.036Z" }, { url = "https://files.pythonhosted.org/packages/b3/de/dd7589b3f2b8372069ab3e4763ea5329940fc7ad9dcd3e272a37516d7c9b/greenlet-3.3.2-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2e47408e8ce1c6f1ceea0dffcdf6ebb85cc09e55c7af407c99f1112016e45e9", size = 662163, upload-time = "2026-02-20T20:56:01.295Z" }, - { url = 
"https://files.pythonhosted.org/packages/cd/ac/85804f74f1ccea31ba518dcc8ee6f14c79f73fe36fa1beba38930806df09/greenlet-3.3.2-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e3cb43ce200f59483eb82949bf1835a99cf43d7571e900d7c8d5c62cdf25d2f9", size = 675371, upload-time = "2026-02-20T21:02:49.664Z" }, { url = "https://files.pythonhosted.org/packages/d2/d8/09bfa816572a4d83bccd6750df1926f79158b1c36c5f73786e26dbe4ee38/greenlet-3.3.2-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63d10328839d1973e5ba35e98cccbca71b232b14051fd957b6f8b6e8e80d0506", size = 664160, upload-time = "2026-02-20T20:21:04.015Z" }, { url = "https://files.pythonhosted.org/packages/48/cf/56832f0c8255d27f6c35d41b5ec91168d74ec721d85f01a12131eec6b93c/greenlet-3.3.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8e4ab3cfb02993c8cc248ea73d7dae6cec0253e9afa311c9b37e603ca9fad2ce", size = 1619181, upload-time = "2026-02-20T20:49:36.052Z" }, { url = "https://files.pythonhosted.org/packages/0a/23/b90b60a4aabb4cec0796e55f25ffbfb579a907c3898cd2905c8918acaa16/greenlet-3.3.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:94ad81f0fd3c0c0681a018a976e5c2bd2ca2d9d94895f23e7bb1af4e8af4e2d5", size = 1687713, upload-time = "2026-02-20T20:21:11.684Z" }, @@ -1812,7 +1811,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/6d/8f2ef704e614bcf58ed43cfb8d87afa1c285e98194ab2cfad351bf04f81e/greenlet-3.3.2-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:e26e72bec7ab387ac80caa7496e0f908ff954f31065b0ffc1f8ecb1338b11b54", size = 286617, upload-time = "2026-02-20T20:19:29.856Z" }, { url = "https://files.pythonhosted.org/packages/5e/0d/93894161d307c6ea237a43988f27eba0947b360b99ac5239ad3fe09f0b47/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b466dff7a4ffda6ca975979bab80bdadde979e29fc947ac3be4451428d8b0e4", size = 655189, upload-time = "2026-02-20T20:47:35.742Z" }, { url = 
"https://files.pythonhosted.org/packages/f5/2c/d2d506ebd8abcb57386ec4f7ba20f4030cbe56eae541bc6fd6ef399c0b41/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b8bddc5b73c9720bea487b3bffdb1840fe4e3656fba3bd40aa1489e9f37877ff", size = 658225, upload-time = "2026-02-20T20:56:02.527Z" }, - { url = "https://files.pythonhosted.org/packages/d1/67/8197b7e7e602150938049d8e7f30de1660cfb87e4c8ee349b42b67bdb2e1/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:59b3e2c40f6706b05a9cd299c836c6aa2378cabe25d021acd80f13abf81181cf", size = 666581, upload-time = "2026-02-20T21:02:51.526Z" }, { url = "https://files.pythonhosted.org/packages/8e/30/3a09155fbf728673a1dea713572d2d31159f824a37c22da82127056c44e4/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b26b0f4428b871a751968285a1ac9648944cea09807177ac639b030bddebcea4", size = 657907, upload-time = "2026-02-20T20:21:05.259Z" }, { url = "https://files.pythonhosted.org/packages/f3/fd/d05a4b7acd0154ed758797f0a43b4c0962a843bedfe980115e842c5b2d08/greenlet-3.3.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1fb39a11ee2e4d94be9a76671482be9398560955c9e568550de0224e41104727", size = 1618857, upload-time = "2026-02-20T20:49:37.309Z" }, { url = "https://files.pythonhosted.org/packages/6f/e1/50ee92a5db521de8f35075b5eff060dd43d39ebd46c2181a2042f7070385/greenlet-3.3.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:20154044d9085151bc309e7689d6f7ba10027f8f5a8c0676ad398b951913d89e", size = 1680010, upload-time = "2026-02-20T20:21:13.427Z" }, @@ -4857,7 +4855,7 @@ requires-dist = [ { name = "pydantic", marker = "extra == 'pydantic'", specifier = ">=2.0.0,<3" }, { name = "python-dateutil", marker = "python_full_version < '3.11'", specifier = ">=2.8.2,<3" }, { name = "types-aioboto3", extras = ["s3"], marker = "extra == 'aioboto3'", specifier = ">=10.4.0" }, - { name = "types-protobuf", specifier = ">=3.20" }, 
+ { name = "types-protobuf", specifier = ">=3.20,<7.0.0" }, { name = "typing-extensions", specifier = ">=4.2.0,<5" }, ] provides-extras = ["grpc", "opentelemetry", "pydantic", "openai-agents", "google-adk", "aioboto3"]