From 2639f859f74dab3fdd142d94603970b83241d505 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 20 May 2026 17:33:32 +0200 Subject: [PATCH 1/2] fix(io_uring): handle EINTR more robustly in io_uring_enter() io_uring_enter() might return with an EINTR when called with IORING_ENTER_GETEVENTS. Make the submit() call a bit more robust by retrying when we observe this error. Retry 3 times. This is a semi-arbitrary choice. The assumption is that if an interrupt arrives, a subsequent call to the system call should most likely succeed. If we keep receiving interrupts, something is more severely broken, so propagate the error to the caller. Signed-off-by: Babis Chalios --- src/vmm/src/io_uring/queue/submission.rs | 53 +++++++++++++++--------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/vmm/src/io_uring/queue/submission.rs b/src/vmm/src/io_uring/queue/submission.rs index db308e63a54..072a3a427b3 100644 --- a/src/vmm/src/io_uring/queue/submission.rs +++ b/src/vmm/src/io_uring/queue/submission.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt::Debug; -use std::io::Error as IOError; +use std::io::{Error as IOError, ErrorKind}; use std::mem; use std::num::Wrapping; use std::os::unix::io::RawFd; @@ -130,26 +130,39 @@ impl SubmissionQueue { if min_complete > 0 { flags |= generated::IORING_ENTER_GETEVENTS; } - // SAFETY: Safe because values are valid and we check the return value. - let submitted = SyscallReturnCode(unsafe { - libc::syscall( - libc::SYS_io_uring_enter, - self.io_uring_fd, - self.to_submit, - min_complete, - flags, - std::ptr::null::<libc::sigset_t>(), - ) - }) - .into_result()?; - // It's safe to convert to u32 since the syscall didn't return an error. - let submitted = u32::try_from(submitted).unwrap(); - - // This is safe since submitted <= self.to_submit. However we use a saturating_sub - // for extra safety. - self.to_submit = self.to_submit.saturating_sub(submitted); - Ok(submitted) + // The number of retries is completely arbitrary here. 
I assume that this + // will happen rarely and that if it happens, a subsequent retry will immediately + // succeed. If we fall into a storm of interrupts, something else is probably wrong, + // so let the consumer know. + let mut eintr_retries = 3; + loop { + // SAFETY: Safe because values are valid and we check the return value. + let ret = SyscallReturnCode(unsafe { + libc::syscall( + libc::SYS_io_uring_enter, + self.io_uring_fd, + self.to_submit, + min_complete, + flags, + std::ptr::null::<libc::sigset_t>(), + ) + }) + .into_result(); + match ret { + Ok(num) => { + // It's safe to convert to u32 since the syscall didn't return an error. + let submitted = u32::try_from(num).unwrap(); + self.to_submit = self.to_submit.saturating_sub(submitted); + return Ok(submitted); + } + Err(err) if err.kind() == ErrorKind::Interrupted && eintr_retries > 0 => { + eintr_retries -= 1; + continue; + } + Err(err) => return Err(SQueueError::from(err)), + } + } } fn mmap( From a712043583dc2c2fe09eb812a8f237385a95cb43 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 20 May 2026 17:39:51 +0200 Subject: [PATCH 2/2] fix(block): make prepare_save() more robust If prepare_save() fails to drain the io_uring queues (when used) and sync the host filesystem, we might end up with a corrupted disk snapshot. Currently, Firecracker ignores that, only emitting an error message. Be more strict and expect no errors, so that we can have a better post-mortem analysis of what happened. 
Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/block/virtio/device.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index c2538188301..e8b3c1bd428 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -722,9 +722,12 @@ impl VirtioBlock { } fn drain_and_flush(&mut self, discard: bool) { - if let Err(err) = self.disk.file_engine.drain_and_flush(discard) { - error!("Failed to drain ops and flush block data: {:?}", err); - } + // If draining and/or flushing failed, hard crash to avoid continuing with a potentially + // inconsistent filesystem on the underlying disk. + self.disk + .file_engine + .drain_and_flush(discard) + .expect("virtio-block: failed to drain ops and flush block data"); } /// Prepare device for being snapshotted.