Skip to content

Commit 58f5a69

Browse files
committed
Fix stdio shutdown edge cases found in review
- Close the read stream before the writer-flush wait: the reader's stdout drain is what unwedges a server blocked writing, so a wedged writer's flush can now complete instead of burning the flush window (on trio the aborted send also truncated an accepted message mid-line). - Run the reader's drain phase shielded and on raw bytes: caller cancellation no longer skips the drain (which got well-behaved servers spuriously tree-killed), and a dying server flushing non-UTF-8 output can no longer abort shutdown with a decode error. - Re-check returncode when the grace wait times out, so a server dying in the final poll interval is not spuriously escalated. - Release the subprocess transport by duck type, not isinstance: uvloop's transport is not an asyncio.SubprocessTransport, so the deterministic fd release silently no-op'd there. Tolerate PermissionError from the close (Python <= 3.12 re-kills a setuid child without catching it). - Log killpg EPERM at warning, not error: macOS raises EPERM for groups containing only unreaped zombies, i.e. a cleanly exited server. - Test teardown tolerates EPERM from killpg (same macOS zombie case), and the interaction test's outer deadline now exceeds the grace period it configures. New regression tests pin each behavioral fix. - Document the terminate_posix_process_tree group-leader requirement and the win32 utilities logger rename in the migration guide.
1 parent 1ba4065 commit 58f5a69

6 files changed

Lines changed: 238 additions & 28 deletions

File tree

docs/migration.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,11 @@ pipe lingered until garbage collection); and a failed write to a server that is
140140
still running now surfaces as a closed connection (`CONNECTION_CLOSED`) on the read
141141
side instead of leaving requests waiting indefinitely.
142142

143+
`terminate_posix_process_tree` now requires the process to lead its own process
144+
group (spawned with `start_new_session=True`); the `getpgid()` lookup and the
145+
per-process terminate/kill fallback are gone. The win32 utilities logger is now
146+
named `mcp.os.win32.utilities` (was `client.stdio.win32`).
147+
143148
### Removed type aliases and classes
144149

145150
The following deprecated type aliases and classes have been removed from `mcp.types`:

src/mcp/client/stdio.py

Lines changed: 47 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import asyncio
21
import logging
32
import os
43
import sys
@@ -176,15 +175,30 @@ async def stdout_reader() -> None:
176175
lines = (buffer + chunk).split("\n")
177176
buffer = lines.pop()
178177
for line in lines:
179-
await read_stream_writer.send(_parse_line(line))
180-
except (anyio.ClosedResourceError, anyio.BrokenResourceError):
181-
# The read stream is gone (shutdown closed it, or the caller did).
178+
try:
179+
await read_stream_writer.send(_parse_line(line))
180+
except (anyio.ClosedResourceError, anyio.BrokenResourceError):
181+
# The read stream is gone (shutdown closed it, or
182+
# the caller did); stop delivering. The drain below
183+
# still runs on the way out.
184+
return
185+
finally:
182186
# Phase 2: keep consuming stdout without delivering, so a server
183187
# flushing its remaining output during shutdown does not block on
184188
# a full pipe and miss its chance to exit within the grace period.
185-
with suppress(anyio.EndOfStream):
186-
while True:
187-
await stdout.receive()
189+
# Shielded so caller cancellation cannot skip the drain (bounded:
190+
# shutdown closes process.stdout), and raw bytes so non-UTF-8 in
191+
# a dying server's flush cannot abort it.
192+
with anyio.CancelScope(shield=True):
193+
with suppress(
194+
anyio.EndOfStream,
195+
anyio.ClosedResourceError,
196+
anyio.BrokenResourceError,
197+
ConnectionError,
198+
OSError,
199+
):
200+
while True:
201+
await process.stdout.receive()
188202
except anyio.ClosedResourceError:
189203
# Our own shutdown closed/poisoned the stdout stream under the read.
190204
pass
@@ -222,14 +236,16 @@ async def stdin_writer() -> None:
222236
async def shutdown() -> None:
223237
"""Wind the transport down, leaving no live server process behind.
224238
225-
1. Close the session's write stream, then wait (bounded by
239+
1. Close the session's read stream, unblocking the reader from any
240+
undelivered message so it drains stdout for the rest of shutdown
241+
(see the drain note in `stdout_reader`). Draining before the flush
242+
wait below is what lets a wedged writer's flush succeed: a server
243+
blocked writing its stdout cannot get to reading its stdin.
244+
2. Close the session's write stream, then wait (bounded by
226245
`_WRITER_FLUSH_TIMEOUT`) for the writer task to hand any message the
227246
transport already accepted to the server's stdin before that stdin
228247
closes; a zero-buffer send completes at the rendezvous, before the
229248
writer has written.
230-
2. Close the session's read stream, unblocking the reader from any
231-
undelivered message so it drains stdout for the rest of shutdown
232-
(see the drain note in `stdout_reader`).
233249
3. Stop the server process: close its stdin, give it a grace period to
234250
exit on its own, and terminate its process tree if it does not
235251
(see `_stop_server_process`).
@@ -238,12 +254,12 @@ async def shutdown() -> None:
238254
run their exit/except paths (deterministic ClosedResourceError
239255
handling) before the caller cancels them.
240256
"""
257+
read_stream.close()
241258
write_stream.close()
242259
with anyio.move_on_after(_WRITER_FLUSH_TIMEOUT) as flush_scope:
243260
await writer_done.wait()
244261
if flush_scope.cancelled_caught:
245262
await anyio.lowlevel.cancel_shielded_checkpoint() # heal gh-106749
246-
read_stream.close()
247263
await _stop_server_process(process)
248264
await _aclose_all(read_stream, write_stream, read_stream_writer, write_stream_reader)
249265
await anyio.lowlevel.checkpoint()
@@ -257,15 +273,18 @@ async def shutdown() -> None:
257273
shutting_down = True
258274
# Shutdown must run to completion even when the caller is being
259275
# cancelled (skipping it would leak the server process); every wait
260-
# inside the shield is time-bounded. A native task.cancel() delivered
261-
# mid-cleanup can still abort it: the shield only holds against
262-
# anyio-level cancellation.
276+
# inside the shield is time-bounded, except that on the Windows
277+
# SelectorEventLoop fallback a worker thread parked in synchronous
278+
# pipe I/O (a read, write, or close) ignores cancellation and anyio
279+
# waits for it — a pre-existing limitation with no portable fix.
280+
# A native task.cancel() delivered mid-cleanup can still abort the
281+
# cleanup: the shield only holds against anyio-level cancellation.
263282
with anyio.CancelScope(shield=True):
264283
await shutdown()
265284
# Cancel the pipe tasks so the join cannot hang on a write into a pipe
266-
# whose read end a kill-surviving descendant still holds. (The Windows
267-
# fallback's reader thread parked in a synchronous ReadFile ignores
268-
# cancellation; anyio waits for it, with no portable way to abandon it.)
285+
# whose read end a kill-surviving descendant still holds. (Same Windows
286+
# fallback caveat as above: a worker thread parked in synchronous pipe
287+
# I/O can still hold the join.)
269288
tg.cancel_scope.cancel()
270289
# The cancel above is delivered via `coro.throw()` into this task at
271290
# the task-group join; on CPython 3.11 (gh-106749) that drops `'call'`
@@ -355,7 +374,9 @@ async def _wait_for_process_exit(process: ServerProcess, timeout: float) -> bool
355374
await anyio.sleep(_EXIT_POLL_INTERVAL)
356375
return True
357376
await anyio.lowlevel.cancel_shielded_checkpoint() # heal gh-106749 after the throw
358-
return False
377+
# Re-check: the process may have died during the very poll interval the
378+
# deadline cut short, and a dead process must not be escalated.
379+
return process.returncode is not None
359380

360381

361382
async def _terminate_process_tree(process: ServerProcess) -> None:
@@ -385,11 +406,16 @@ def _close_subprocess_transport(process: ServerProcess) -> None:
385406
return here.
386407
"""
387408
transport = getattr(getattr(process, "_process", None), "_transport", None)
388-
if isinstance(transport, asyncio.SubprocessTransport):
409+
# Duck-typed: uvloop's UVProcessTransport is not an asyncio.SubprocessTransport.
410+
close = getattr(transport, "close", None)
411+
if callable(close):
389412
# Unflushed stdin bytes defer the write-pipe close until their holder
390413
# exits, but close() still marks the transport closed, so nothing warns
391414
# at GC and the residual fd is bounded by the survivor's lifetime.
392-
transport.close()
415+
# CPython <= 3.12's close() can raise PermissionError re-killing a
416+
# setuid child; 3.13+ suppresses it internally.
417+
with suppress(PermissionError):
418+
close()
393419

394420

395421
def _get_executable_command(command: str) -> str:

src/mcp/os/posix/utilities.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,12 @@ async def terminate_posix_process_tree(process: Process, timeout_seconds: float
4444
# enough, and the rest may well have been signalled (current XNU also raises
4545
# it for all-zombie groups, where Linux succeeds). On no platform does it
4646
# mean the group is gone, so fall through to the grace wait and SIGKILL
47-
# escalation (both tolerate EPERM) instead of giving up.
48-
logger.exception("No permission to signal some of process group %d; waiting for it to exit anyway", pgid)
47+
# escalation (both tolerate EPERM) instead of giving up. A warning rather
48+
# than an error: on macOS this can simply mean the group already exited
49+
# cleanly and is waiting to be reaped.
50+
logger.warning(
51+
"No permission to signal some of process group %d; waiting for it to exit anyway", pgid, exc_info=True
52+
)
4953

5054
with anyio.move_on_after(timeout_seconds):
5155
while True:

0 commit comments

Comments
 (0)