From 14b690beca36c7ed82e97312815116e117845919 Mon Sep 17 00:00:00 2001 From: "Hung-Han (Henry) Chen" Date: Fri, 29 May 2026 12:34:13 +0300 Subject: [PATCH 1/3] Emit named IN_CREATE/IN_DELETE for inotify directory watches EVFILT_VNODE reports that a watched directory changed but not which child, so directory events were queued with no name. fsnotify-based consumers (notably the k0s manifest applier, which re-applies only when a *.yaml entry appears) filter on the entry name and silently drop nameless events, so manifests written after the watch was established were never picked up. Keep a per-watch snapshot of the directory's entry names; on each NOTE_WRITE re-list the directory and diff against the snapshot to emit a named IN_CREATE per added child and IN_DELETE per removed one, matching real inotify semantics. The blocking-read and non-blocking collect paths share one process_vnode_event() helper. The snapshot is allocated on add_watch and freed on rm_watch and inotify_close. Add a regression test (test-inotify Test 6) that watches a fresh directory, creates a child, and asserts a named IN_CREATE for it is delivered; this fails before the fix (the event arrives without a name). Validated with make check on Apple Silicon. (cherry picked from commit 2a5fa28b5257ceb5ed116c231aecc7c4a2ef54ab) --- src/syscall/inotify.c | 226 ++++++++++++++++++++++++++++++++++-------- tests/test-inotify.c | 70 +++++++++++++ 2 files changed, 257 insertions(+), 39 deletions(-) diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index 19f0b37..3fa40bf 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -82,6 +84,11 @@ typedef struct { bool is_dir; /* true if watching a directory */ dev_t dev; /* Device ID (for re-add lookup by inode) */ ino_t ino; /* Inode number (for re-add lookup by inode) */ + /* Dir watches only: path + entry-name snapshot, diffed on change to + * recover the child name kqueue omits. NULL/0 for file watches. */ + char *path; + char **entries; + int n_entries; } inotify_watch_t; typedef struct { @@ -295,12 +302,136 @@ static void pipe_drain(inotify_instance_t *inst) ; } +/* Snapshot the entry names of a directory (excluding "." and ".."). On return + * *out is a malloc'd array of malloc'd strings with *n_out entries (free with + * free_dir_snapshot). On any failure the snapshot is left empty. + */ +static void dir_snapshot(const char *path, char ***out, int *n_out) +{ + *out = NULL; + *n_out = 0; + + DIR *d = opendir(path); + if (!d) + return; + + char **names = NULL; + int n = 0, cap = 0; + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, "..")) + continue; + if (n == cap) { + int ncap = cap ? cap * 2 : 16; + char **tmp = realloc(names, (size_t) ncap * sizeof(char *)); + if (!tmp) + break; + names = tmp; + cap = ncap; + } + names[n] = strdup(de->d_name); + if (!names[n]) + break; + n++; + } + closedir(d); + + *out = names; + *n_out = n; +} + +static void free_dir_snapshot(char **entries, int n) +{ + if (!entries) + return; + for (int i = 0; i < n; i++) + free(entries[i]); + free(entries); +} + +static bool snapshot_contains(char *const *entries, int n, const char *name) +{ + for (int i = 0; i < n; i++) + if (!strcmp(entries[i], name)) + return true; + return false; +} + /* Collect events from kqueue. */ -/* Poll the kqueue for pending vnode events and translate them into inotify - * events in the instance buffer. - * - * Returns the number of events collected. +/* Translate one EVFILT_VNODE notification into queued inotify events for the + * watch on host_fd. Returns the number queued, or -1 on buffer overflow (an + * IN_Q_OVERFLOW marker is queued). Caller holds inotify_lock. + */ +static int process_vnode_event(inotify_instance_t *inst, + int host_fd, + uint32_t fflags) +{ + int widx = watch_find_by_hostfd(inst, host_fd); + if (widx < 0) + return 0; + + inotify_watch_t *w = &inst->watches[widx]; + int queued = 0; + bool overflow = false; + + if (w->is_dir && (fflags & NOTE_WRITE) && w->path) { + char **now = NULL; + int now_n = 0; + dir_snapshot(w->path, &now, &now_n); + + for (int j = 0; j < now_n && !overflow; j++) { + if ((w->mask & IN_CREATE) && + !snapshot_contains(w->entries, w->n_entries, now[j])) { + if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0) + overflow = true; + else + queued++; + } + } + for (int j = 0; j < w->n_entries && !overflow; j++) { + if ((w->mask & IN_DELETE) && + !snapshot_contains(now, now_n, w->entries[j])) { + if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < 0) + overflow = true; + else + queued++; + } + } + + /* Advance the snapshot regardless: the directory state has moved on, + * and any names dropped under overflow are covered by IN_Q_OVERFLOW. + */ + free_dir_snapshot(w->entries, w->n_entries); + w->entries = now; + w->n_entries = now_n; + } + + if (!overflow) { + uint32_t in_mask = notes_to_in_mask(fflags, w->mask, w->is_dir); + /* The per-child create/delete is emitted by the diff above; only emit + * the bare-mask event for file watches or non-create/delete changes. + */ + if (in_mask != 0 && + !(w->is_dir && (in_mask & (IN_CREATE | IN_DELETE)))) { + if (queue_event(inst, w->wd, in_mask, 0, NULL) < 0) + overflow = true; + else + queued++; + } + } + + if (overflow) { + /* IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics. */ + queue_event(inst, -1, 0x4000, 0, NULL); + return -1; + } + return queued; +} + +/* Poll the kqueue for pending vnode events and translate them into + * inotify events in the instance buffer. Returns the number of + * events collected. */ static int collect_events(inotify_instance_t *inst) { @@ -312,35 +443,19 @@ static int collect_events(inotify_instance_t *inst) return 0; int collected = 0; + bool overflow = false; for (int i = 0; i < nev; i++) { - int host_fd = (int) kevs[i].ident; - int widx = watch_find_by_hostfd(inst, host_fd); - if (widx < 0) - continue; - - inotify_watch_t *w = &inst->watches[widx]; - uint32_t in_mask = - notes_to_in_mask((uint32_t) kevs[i].fflags, w->mask, w->is_dir); - if (in_mask == 0) - continue; - - /* Queue event without a filename for file watches. For directory - * watches, inotify emulation also omits the filename since kqueue - * EVFILT_VNODE does not report which child changed. - */ - if (queue_event(inst, w->wd, in_mask, 0, NULL) == 0) { - collected++; - } else { - /* Fixed inotify queue is full; queue IN_Q_OVERFLOW and stop. - * IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics. - */ - queue_event(inst, -1, 0x4000, 0, NULL); + int r = process_vnode_event(inst, (int) kevs[i].ident, + (uint32_t) kevs[i].fflags); + if (r < 0) { + overflow = true; break; } + collected += r; } /* Signal the self-pipe so poll/epoll sees readability */ - if (collected > 0) + if (collected > 0 || overflow) pipe_signal(inst); return collected; @@ -438,12 +553,27 @@ int64_t sys_inotify_add_watch(guest_t *g, /* Strip IN_MASK_ADD control flag before storing */ uint32_t event_mask = mask & ~(uint32_t) IN_MASK_ADD; + /* For directory watches, snapshot the path + current entries up-front + * (outside the lock) so collect_events can diff on each change to emit + * named IN_CREATE/IN_DELETE. Ownership moves to the watch slot on success; + * every early-exit path below frees these. + */ + char *wpath = NULL; + char **wentries = NULL; + int wn = 0; + if (is_dir) { + wpath = strdup(path); + dir_snapshot(path, &wentries, &wn); + } + pthread_mutex_lock(&inotify_lock); int slot = inotify_find(inotify_fd); if (slot < 0) { pthread_mutex_unlock(&inotify_lock); close(host_fd); + free_dir_snapshot(wentries, wn); + free(wpath); return -LINUX_EBADF; } @@ -467,8 +597,12 @@ int64_t sys_inotify_add_watch(guest_t *g, uint32_t snapshot_mask = w->mask; /* Snapshot before unlock */ pthread_mutex_unlock(&inotify_lock); - /* Close the duplicate fd; inotify emulation keeps the original */ + /* Close the duplicate fd; inotify emulation keeps the original. + * The existing watch keeps its snapshot; drop this call's copy. + */ close(host_fd); + free_dir_snapshot(wentries, wn); + free(wpath); /* Update kevent filter with the new mask (use snapshot -- w->mask may * be modified by another thread after unlock) @@ -487,6 +621,8 @@ int64_t sys_inotify_add_watch(guest_t *g, if (widx < 0) { pthread_mutex_unlock(&inotify_lock); close(host_fd); + free_dir_snapshot(wentries, wn); + free(wpath); return -LINUX_ENOSPC; } @@ -502,6 +638,9 @@ int64_t sys_inotify_add_watch(guest_t *g, w->is_dir = is_dir; w->dev = st.st_dev; w->ino = st.st_ino; + w->path = wpath; + w->entries = wentries; + w->n_entries = wn; /* Capture kq_fd while under lock */ int kq_fd = inst->kq_fd; @@ -522,6 +661,11 @@ int64_t sys_inotify_add_watch(guest_t *g, pthread_mutex_lock(&inotify_lock); w->wd = 0; w->host_fd = 0; + free_dir_snapshot(w->entries, w->n_entries); + w->entries = NULL; + w->n_entries = 0; + free(w->path); + w->path = NULL; pthread_mutex_unlock(&inotify_lock); close(host_fd); errno = saved; @@ -556,6 +700,11 @@ int64_t sys_inotify_rm_watch(int inotify_fd, int wd) w->host_fd = 0; w->mask = 0; w->is_dir = 0; + free_dir_snapshot(w->entries, w->n_entries); + w->entries = NULL; + w->n_entries = 0; + free(w->path); + w->path = NULL; pthread_mutex_unlock(&inotify_lock); /* Remove from kqueue and close outside lock */ @@ -619,18 +768,12 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) } inst = &inotify_state[slot]; - /* Process the received event */ + /* Process the received event (same named-directory diff as the + * non-blocking collect path). + */ int host_fd = (int) kev.ident; - int widx = watch_find_by_hostfd(inst, host_fd); - if (widx >= 0) { - inotify_watch_t *w = &inst->watches[widx]; - uint32_t in_mask = - notes_to_in_mask((uint32_t) kev.fflags, w->mask, w->is_dir); - if (in_mask != 0) { - queue_event(inst, w->wd, in_mask, 0, NULL); - pipe_signal(inst); - } - } + if (process_vnode_event(inst, host_fd, (uint32_t) kev.fflags) != 0) + pipe_signal(inst); } } @@ -711,6 +854,11 @@ static void inotify_close(int guest_fd) watch_fds[nfds++] = inst->watches[i].host_fd; inst->watches[i].wd = 0; } + free_dir_snapshot(inst->watches[i].entries, inst->watches[i].n_entries); + inst->watches[i].entries = NULL; + inst->watches[i].n_entries = 0; + free(inst->watches[i].path); + inst->watches[i].path = NULL; } inst->guest_fd = -1; diff --git a/tests/test-inotify.c b/tests/test-inotify.c index 9a85278..deffd45 100644 --- a/tests/test-inotify.c +++ b/tests/test-inotify.c @@ -15,7 +15,10 @@ #include #include +#include #include +#include +#include #include #include #include @@ -188,6 +191,72 @@ static void test_dir_create(void) close(fd); } +/* Test 6: a directory watch must deliver a NAMED IN_CREATE for the new child, + * not just a nameless event. + */ +static void test_dir_create_named(void) +{ + TEST("watch dir delivers named IN_CREATE"); + + char dir[] = "/tmp/elfuse-inotify-dir-XXXXXX"; + if (!mkdtemp(dir)) { + FAIL("mkdtemp"); + return; + } + + int fd = inotify_init1(IN_NONBLOCK); + if (fd < 0) { + FAIL("inotify_init1"); + rmdir(dir); + return; + } + + int wd = inotify_add_watch(fd, dir, IN_CREATE); + if (wd < 0) { + FAIL("inotify_add_watch"); + close(fd); + rmdir(dir); + return; + } + + const char *child = "manifest.yaml"; + char path[300]; + snprintf(path, sizeof(path), "%s/%s", dir, child); + int tfd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644); + if (tfd < 0) { + FAIL("create child in watched dir"); + inotify_rm_watch(fd, wd); + close(fd); + rmdir(dir); + return; + } + close(tfd); + + /* read() drives the diff; retry until the vnode notification is queued. */ + bool found = false; + for (int attempt = 0; attempt < 50 && !found; attempt++) { + char buf[1024]; + ssize_t n = read(fd, buf, sizeof(buf)); + for (ssize_t off = 0; + off + (ssize_t) sizeof(struct inotify_event) <= n;) { + struct inotify_event *ev = (struct inotify_event *) (buf + off); + if ((ev->mask & IN_CREATE) && ev->len > 0 && + strcmp(ev->name, child) == 0) + found = true; + off += (ssize_t) sizeof(struct inotify_event) + ev->len; + } + if (!found) + usleep(20000); /* 20ms */ + } + + EXPECT_TRUE(found, "named IN_CREATE for new child not delivered"); + + unlink(path); + inotify_rm_watch(fd, wd); + close(fd); + rmdir(dir); +} + /* Main */ int main(void) @@ -199,6 +268,7 @@ int main(void) test_modify_event(); test_nonblock(); test_dir_create(); + test_dir_create_named(); SUMMARY("test-inotify"); return fails > 0 ? 1 : 0; From c0a11825973b9f863e49ceddef4b1b8ac8071a64 Mon Sep 17 00:00:00 2001 From: "Hung-Han (Henry) Chen" Date: Sat, 30 May 2026 13:54:12 +0300 Subject: [PATCH 2/3] Keep inotify directory baseline when a snapshot fails A directory watch snapshots the child-name set on each change and diffs it against the previous baseline to recover the name kqueue omits. dir_snapshot returned an empty list on any failure (opendir error, a mid-read readdir error, or an allocation failure), which the diff could not tell apart from a genuinely empty directory. On a transient failure the IN_DELETE pass then saw every known child as missing and queued a spurious IN_DELETE for each, and the baseline was overwritten with the empty list, so every later change re-reported the whole directory as IN_CREATE. The corruption was permanent. Make dir_snapshot return bool: on failure it frees any partial result and reports false, and readdir read errors are now detected by clearing errno before each call. process_vnode_event only diffs against and advances to a snapshot that succeeded; on failure it keeps the previous baseline so the next successful snapshot reconciles whatever changed in between. The watch-add path documents that a failed initial snapshot is a best-effort empty baseline. Validated with make check on Apple Silicon; test-inotify still passes 6/6. --- src/syscall/inotify.c | 117 ++++++++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 43 deletions(-) diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index 3fa40bf..250ccd4 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -302,51 +302,74 @@ static void pipe_drain(inotify_instance_t *inst) ; } -/* Snapshot the entry names of a directory (excluding "." and ".."). On return - * *out is a malloc'd array of malloc'd strings with *n_out entries (free with - * free_dir_snapshot). On any failure the snapshot is left empty. +static void free_dir_snapshot(char **entries, int n) +{ + if (!entries) + return; + for (int i = 0; i < n; i++) + free(entries[i]); + free(entries); +} + +/* List a directory's child names, excluding "." and "..", into the out array + * (free with free_dir_snapshot). Returns false on any failure, leaving the + * result empty -- which the caller must treat as distinct from a true return + * with zero entries, since a failure mistaken for "empty" would diff every + * known child as deleted. */ -static void dir_snapshot(const char *path, char ***out, int *n_out) +static bool dir_snapshot(const char *path, char ***out, int *n_out) { *out = NULL; *n_out = 0; DIR *d = opendir(path); if (!d) - return; + return false; char **names = NULL; int n = 0, cap = 0; - struct dirent *de; - while ((de = readdir(d)) != NULL) { + bool ok = true; + for (;;) { + /* readdir returns NULL both at end-of-stream and on error; reset + * errno immediately before each call so a non-zero errno afterwards + * unambiguously signals a read error rather than EOF. + */ + errno = 0; + struct dirent *de = readdir(d); + if (!de) { + if (errno != 0) + ok = false; + break; + } if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, "..")) continue; if (n == cap) { int ncap = cap ? cap * 2 : 16; char **tmp = realloc(names, (size_t) ncap * sizeof(char *)); - if (!tmp) + if (!tmp) { + ok = false; break; + } names = tmp; cap = ncap; } names[n] = strdup(de->d_name); - if (!names[n]) + if (!names[n]) { + ok = false; break; + } n++; } closedir(d); + if (!ok) { + free_dir_snapshot(names, n); + return false; + } + *out = names; *n_out = n; -} - -static void free_dir_snapshot(char **entries, int n) -{ - if (!entries) - return; - for (int i = 0; i < n; i++) - free(entries[i]); - free(entries); + return true; } static bool snapshot_contains(char *const *entries, int n, const char *name) @@ -378,33 +401,38 @@ static int process_vnode_event(inotify_instance_t *inst, if (w->is_dir && (fflags & NOTE_WRITE) && w->path) { char **now = NULL; int now_n = 0; - dir_snapshot(w->path, &now, &now_n); - - for (int j = 0; j < now_n && !overflow; j++) { - if ((w->mask & IN_CREATE) && - !snapshot_contains(w->entries, w->n_entries, now[j])) { - if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0) - overflow = true; - else - queued++; + /* Only diff against -- and advance to -- a snapshot that succeeded. + * On failure keep the previous baseline; the next successful snapshot + * reconciles whatever changed in between. + */ + if (dir_snapshot(w->path, &now, &now_n)) { + for (int j = 0; j < now_n && !overflow; j++) { + if ((w->mask & IN_CREATE) && + !snapshot_contains(w->entries, w->n_entries, now[j])) { + if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0) + overflow = true; + else + queued++; + } } - } - for (int j = 0; j < w->n_entries && !overflow; j++) { - if ((w->mask & IN_DELETE) && - !snapshot_contains(now, now_n, w->entries[j])) { - if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < 0) - overflow = true; - else - queued++; + for (int j = 0; j < w->n_entries && !overflow; j++) { + if ((w->mask & IN_DELETE) && + !snapshot_contains(now, now_n, w->entries[j])) { + if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < + 0) + overflow = true; + else + queued++; + } } - } - /* Advance the snapshot regardless: the directory state has moved on, - * and any names dropped under overflow are covered by IN_Q_OVERFLOW. - */ - free_dir_snapshot(w->entries, w->n_entries); - w->entries = now; - w->n_entries = now_n; + /* Advance the snapshot: the directory state has moved on, and any + * names dropped under overflow are covered by IN_Q_OVERFLOW. + */ + free_dir_snapshot(w->entries, w->n_entries); + w->entries = now; + w->n_entries = now_n; + } } if (!overflow) { @@ -563,7 +591,10 @@ int64_t sys_inotify_add_watch(guest_t *g, int wn = 0; if (is_dir) { wpath = strdup(path); - dir_snapshot(path, &wentries, &wn); + /* Best-effort: a failed listing starts the watch with an empty + * baseline, which is the only state worth recording at add time. + */ + (void) dir_snapshot(path, &wentries, &wn); } pthread_mutex_lock(&inotify_lock); From 9c28c569f01b5b0ed94c96f7db9fa92132166f86 Mon Sep 17 00:00:00 2001 From: "Hung-Han (Henry) Chen" Date: Mon, 22 Jun 2026 12:51:46 +0800 Subject: [PATCH 3/3] Take inotify directory snapshot without holding inotify_lock process_vnode_event() ran opendir()/readdir() (via dir_snapshot) for a directory watch's NOTE_WRITE while holding the global inotify_lock, so a large or slow directory scan blocked every other inotify operation -- on all instances -- for its duration. This is new I/O-under-lock introduced with the named-event diffing. Take the snapshot with the lock released: copy the watch's path and dev/ino under the lock, drop the lock for the opendir/readdir I/O, then re-acquire and re-validate the instance (by guest_fd) and the watch (by host_fd + dev/ino) before applying the diff. A close or host_fd reuse during the window discards the stale snapshot. Both collect paths (collect_events and the blocking read) re-check the instance after the call and return EBADF if it was torn down, mirroring the existing post-kevent re-validation. No change to the emitted events (test-inotify still 6/6); addresses the lock-contention review finding. --- src/syscall/inotify.c | 131 ++++++++++++++++++++++++++++++++---------- 1 file changed, 100 insertions(+), 31 deletions(-) diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index 250ccd4..a599de0 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -384,9 +384,17 @@ static bool snapshot_contains(char *const *entries, int n, const char *name) /* Translate one EVFILT_VNODE notification into queued inotify events for the * watch on host_fd. Returns the number queued, or -1 on buffer overflow (an - * IN_Q_OVERFLOW marker is queued). Caller holds inotify_lock. + * IN_Q_OVERFLOW marker is queued). + * + * Caller holds inotify_lock; it is held again on return. For a directory write + * the lock is released around the opendir/readdir snapshot so filesystem I/O + * does not stall inotify operations on other instances. guest_fd identifies + * this instance: because the table can change while unlocked, the instance and + * the watch are re-validated (by host_fd and dev/ino) before the snapshot is + * applied, and a teardown or host_fd reuse during the window discards it. */ static int process_vnode_event(inotify_instance_t *inst, + int guest_fd, int host_fd, uint32_t fflags) { @@ -398,41 +406,77 @@ static int process_vnode_event(inotify_instance_t *inst, int queued = 0; bool overflow = false; + char **now = NULL; + int now_n = 0; + bool snap_ok = false; + if (w->is_dir && (fflags & NOTE_WRITE) && w->path) { - char **now = NULL; - int now_n = 0; + /* Copy the path + identity, then release the lock for the opendir/ + * readdir snapshot so filesystem I/O does not block other instances. + */ + char *path = strdup(w->path); + if (path) { + dev_t dev = w->dev; + ino_t ino = w->ino; + int slot = (int) (inst - inotify_state); + + pthread_mutex_unlock(&inotify_lock); + snap_ok = dir_snapshot(path, &now, &now_n); + free(path); + pthread_mutex_lock(&inotify_lock); + + /* Re-validate across the unlocked window: the instance may have + * been closed, or the watch removed and its host_fd reused for a + * different file. Any of these discards the stale snapshot. + */ + widx = watch_find_by_hostfd(inst, host_fd); + if (inotify_state[slot].guest_fd != guest_fd || widx < 0) { + free_dir_snapshot(now, now_n); + return 0; + } + w = &inst->watches[widx]; + if (!w->is_dir || w->dev != dev || w->ino != ino) { + free_dir_snapshot(now, now_n); + return 0; + } + } + } + + if (snap_ok) { /* Only diff against -- and advance to -- a snapshot that succeeded. * On failure keep the previous baseline; the next successful snapshot * reconciles whatever changed in between. */ - if (dir_snapshot(w->path, &now, &now_n)) { - for (int j = 0; j < now_n && !overflow; j++) { - if ((w->mask & IN_CREATE) && - !snapshot_contains(w->entries, w->n_entries, now[j])) { - if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0) - overflow = true; - else - queued++; - } + for (int j = 0; j < now_n && !overflow; j++) { + if ((w->mask & IN_CREATE) && + !snapshot_contains(w->entries, w->n_entries, now[j])) { + if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0) + overflow = true; + else + queued++; } - for (int j = 0; j < w->n_entries && !overflow; j++) { - if ((w->mask & IN_DELETE) && - !snapshot_contains(now, now_n, w->entries[j])) { - if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < - 0) - overflow = true; - else - queued++; - } + } + for (int j = 0; j < w->n_entries && !overflow; j++) { + if ((w->mask & IN_DELETE) && + !snapshot_contains(now, now_n, w->entries[j])) { + if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < 0) + overflow = true; + else + queued++; } - - /* Advance the snapshot: the directory state has moved on, and any - * names dropped under overflow are covered by IN_Q_OVERFLOW. - */ - free_dir_snapshot(w->entries, w->n_entries); - w->entries = now; - w->n_entries = now_n; } + + /* Advance the snapshot: the directory state has moved on, and any + * names dropped under overflow are covered by IN_Q_OVERFLOW. + */ + free_dir_snapshot(w->entries, w->n_entries); + w->entries = now; + w->n_entries = now_n; + } else { + /* File watch or failed snapshot: nothing to apply. free_dir_snapshot + * tolerates the NULL result dir_snapshot leaves on failure. + */ + free_dir_snapshot(now, now_n); } if (!overflow) { @@ -470,20 +514,28 @@ static int collect_events(inotify_instance_t *inst) if (nev <= 0) return 0; + /* process_vnode_event may release inotify_lock around directory I/O; + * capture the instance identity to detect teardown across that window. + */ + int slot = (int) (inst - inotify_state); + int guest_fd = inst->guest_fd; + int collected = 0; bool overflow = false; for (int i = 0; i < nev; i++) { - int r = process_vnode_event(inst, (int) kevs[i].ident, + int r = process_vnode_event(inst, guest_fd, (int) kevs[i].ident, (uint32_t) kevs[i].fflags); if (r < 0) { overflow = true; break; } collected += r; + if (inotify_state[slot].guest_fd != guest_fd) + return collected; /* instance closed during snapshot I/O */ } /* Signal the self-pipe so poll/epoll sees readability */ - if (collected > 0 || overflow) + if ((collected > 0 || overflow) && inotify_state[slot].guest_fd == guest_fd) pipe_signal(inst); return collected; @@ -762,6 +814,14 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) if (inst->event_used == 0) { int n = collect_events(inst); + /* collect_events may release the lock for directory I/O; bail if the + * instance was closed in that window. + */ + if (inotify_state[slot].guest_fd != guest_fd) { + pthread_mutex_unlock(&inotify_lock); + return -LINUX_EBADF; + } + if (n == 0) { if (inst->nonblock) { pthread_mutex_unlock(&inotify_lock); @@ -803,7 +863,16 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) * non-blocking collect path). */ int host_fd = (int) kev.ident; - if (process_vnode_event(inst, host_fd, (uint32_t) kev.fflags) != 0) + int r = process_vnode_event(inst, guest_fd, host_fd, + (uint32_t) kev.fflags); + /* process_vnode_event may release the lock for the snapshot; bail + * if the instance was closed in that window. + */ + if (inotify_state[slot].guest_fd != guest_fd) { + pthread_mutex_unlock(&inotify_lock); + return -LINUX_EBADF; + } + if (r != 0) pipe_signal(inst); } }