From 14b690beca36c7ed82e97312815116e117845919 Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen" <chenhungh@gmail.com>
Date: Fri, 29 May 2026 12:34:13 +0300
Subject: [PATCH 1/3] Emit named IN_CREATE/IN_DELETE for inotify directory
 watches

EVFILT_VNODE reports that a watched directory changed but not which child, so
directory events were queued with no name. fsnotify-based consumers (notably
the k0s manifest applier, which re-applies only when a *.yaml entry appears)
filter on the entry name and silently drop nameless events, so manifests
written after the watch was established were never picked up.

Keep a per-watch snapshot of the directory's entry names; on each NOTE_WRITE
re-list the directory and diff against the snapshot to emit a named IN_CREATE
per added child and IN_DELETE per removed one, matching real inotify
semantics. The blocking-read and non-blocking collect paths share one
process_vnode_event() helper. The snapshot is allocated on add_watch and
freed on rm_watch and inotify_close.

Add a regression test (test-inotify Test 6) that watches a fresh directory,
creates a child, and asserts a named IN_CREATE for it is delivered; this
fails before the fix (the event arrives without a name).

Validated with make check on Apple Silicon.

(cherry picked from commit 2a5fa28b5257ceb5ed116c231aecc7c4a2ef54ab)
---
 src/syscall/inotify.c | 226 ++++++++++++++++++++++++++++++++++--------
 tests/test-inotify.c  |  70 +++++++++++++
 2 files changed, 257 insertions(+), 39 deletions(-)

diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c
index 19f0b37..3fa40bf 100644
--- a/src/syscall/inotify.c
+++ b/src/syscall/inotify.c
@@ -28,6 +28,8 @@
 #include <errno.h>
 #include <limits.h>
 #include <pthread.h>
+#include <dirent.h>
+#include <stdlib.h>
 #include <sys/event.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
@@ -82,6 +84,11 @@ typedef struct {
     bool is_dir;   /* true if watching a directory */
     dev_t dev;     /* Device ID (for re-add lookup by inode) */
     ino_t ino;     /* Inode number (for re-add lookup by inode) */
+    /* Dir watches only: path + entry-name snapshot, diffed on change to
+     * recover the child name kqueue omits. NULL/0 for file watches. */
+    char *path;
+    char **entries;
+    int n_entries;
 } inotify_watch_t;
 
 typedef struct {
@@ -295,12 +302,136 @@ static void pipe_drain(inotify_instance_t *inst)
         ;
 }
 
+/* Snapshot the entry names of a directory (excluding "." and ".."). On return
+ * *out is a malloc'd array of malloc'd strings with *n_out entries (free with
+ * free_dir_snapshot). On any failure the snapshot is left empty.
+ */
+static void dir_snapshot(const char *path, char ***out, int *n_out)
+{
+    *out = NULL;
+    *n_out = 0;
+
+    DIR *d = opendir(path);
+    if (!d)
+        return;
+
+    char **names = NULL;
+    int n = 0, cap = 0;
+    struct dirent *de;
+    while ((de = readdir(d)) != NULL) {
+        if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."))
+            continue;
+        if (n == cap) {
+            int ncap = cap ? cap * 2 : 16;
+            char **tmp = realloc(names, (size_t) ncap * sizeof(char *));
+            if (!tmp)
+                break;
+            names = tmp;
+            cap = ncap;
+        }
+        names[n] = strdup(de->d_name);
+        if (!names[n])
+            break;
+        n++;
+    }
+    closedir(d);
+
+    *out = names;
+    *n_out = n;
+}
+
+static void free_dir_snapshot(char **entries, int n)
+{
+    if (!entries)
+        return;
+    for (int i = 0; i < n; i++)
+        free(entries[i]);
+    free(entries);
+}
+
+static bool snapshot_contains(char *const *entries, int n, const char *name)
+{
+    for (int i = 0; i < n; i++)
+        if (!strcmp(entries[i], name))
+            return true;
+    return false;
+}
+
 /* Collect events from kqueue. */
 
-/* Poll the kqueue for pending vnode events and translate them into inotify
- * events in the instance buffer.
- *
- * Returns the number of events collected.
+/* Translate one EVFILT_VNODE notification into queued inotify events for the
+ * watch on host_fd. Returns the number queued, or -1 on buffer overflow (an
+ * IN_Q_OVERFLOW marker is queued). Caller holds inotify_lock.
+ */
+static int process_vnode_event(inotify_instance_t *inst,
+                               int host_fd,
+                               uint32_t fflags)
+{
+    int widx = watch_find_by_hostfd(inst, host_fd);
+    if (widx < 0)
+        return 0;
+
+    inotify_watch_t *w = &inst->watches[widx];
+    int queued = 0;
+    bool overflow = false;
+
+    if (w->is_dir && (fflags & NOTE_WRITE) && w->path) {
+        char **now = NULL;
+        int now_n = 0;
+        dir_snapshot(w->path, &now, &now_n);
+
+        for (int j = 0; j < now_n && !overflow; j++) {
+            if ((w->mask & IN_CREATE) &&
+                !snapshot_contains(w->entries, w->n_entries, now[j])) {
+                if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0)
+                    overflow = true;
+                else
+                    queued++;
+            }
+        }
+        for (int j = 0; j < w->n_entries && !overflow; j++) {
+            if ((w->mask & IN_DELETE) &&
+                !snapshot_contains(now, now_n, w->entries[j])) {
+                if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < 0)
+                    overflow = true;
+                else
+                    queued++;
+            }
+        }
+
+        /* Advance the snapshot regardless: the directory state has moved on,
+         * and any names dropped under overflow are covered by IN_Q_OVERFLOW.
+         */
+        free_dir_snapshot(w->entries, w->n_entries);
+        w->entries = now;
+        w->n_entries = now_n;
+    }
+
+    if (!overflow) {
+        uint32_t in_mask = notes_to_in_mask(fflags, w->mask, w->is_dir);
+        /* The per-child create/delete is emitted by the diff above; only emit
+         * the bare-mask event for file watches or non-create/delete changes.
+         */
+        if (in_mask != 0 &&
+            !(w->is_dir && (in_mask & (IN_CREATE | IN_DELETE)))) {
+            if (queue_event(inst, w->wd, in_mask, 0, NULL) < 0)
+                overflow = true;
+            else
+                queued++;
+        }
+    }
+
+    if (overflow) {
+        /* IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics. */
+        queue_event(inst, -1, 0x4000, 0, NULL);
+        return -1;
+    }
+    return queued;
+}
+
+/* Poll the kqueue for pending vnode events and translate them into
+ * inotify events in the instance buffer. Returns the number of
+ * events collected.
  */
 static int collect_events(inotify_instance_t *inst)
 {
@@ -312,35 +443,19 @@ static int collect_events(inotify_instance_t *inst)
         return 0;
 
     int collected = 0;
+    bool overflow = false;
     for (int i = 0; i < nev; i++) {
-        int host_fd = (int) kevs[i].ident;
-        int widx = watch_find_by_hostfd(inst, host_fd);
-        if (widx < 0)
-            continue;
-
-        inotify_watch_t *w = &inst->watches[widx];
-        uint32_t in_mask =
-            notes_to_in_mask((uint32_t) kevs[i].fflags, w->mask, w->is_dir);
-        if (in_mask == 0)
-            continue;
-
-        /* Queue event without a filename for file watches. For directory
-         * watches, inotify emulation also omits the filename since kqueue
-         * EVFILT_VNODE does not report which child changed.
-         */
-        if (queue_event(inst, w->wd, in_mask, 0, NULL) == 0) {
-            collected++;
-        } else {
-            /* Fixed inotify queue is full; queue IN_Q_OVERFLOW and stop.
-             * IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics.
-             */
-            queue_event(inst, -1, 0x4000, 0, NULL);
+        int r = process_vnode_event(inst, (int) kevs[i].ident,
+                                    (uint32_t) kevs[i].fflags);
+        if (r < 0) {
+            overflow = true;
             break;
         }
+        collected += r;
     }
 
     /* Signal the self-pipe so poll/epoll sees readability */
-    if (collected > 0)
+    if (collected > 0 || overflow)
         pipe_signal(inst);
 
     return collected;
@@ -438,12 +553,27 @@ int64_t sys_inotify_add_watch(guest_t *g,
     /* Strip IN_MASK_ADD control flag before storing */
     uint32_t event_mask = mask & ~(uint32_t) IN_MASK_ADD;
 
+    /* For directory watches, snapshot the path + current entries up-front
+     * (outside the lock) so collect_events can diff on each change to emit
+     * named IN_CREATE/IN_DELETE. Ownership moves to the watch slot on success;
+     * every early-exit path below frees these.
+     */
+    char *wpath = NULL;
+    char **wentries = NULL;
+    int wn = 0;
+    if (is_dir) {
+        wpath = strdup(path);
+        dir_snapshot(path, &wentries, &wn);
+    }
+
     pthread_mutex_lock(&inotify_lock);
 
     int slot = inotify_find(inotify_fd);
     if (slot < 0) {
         pthread_mutex_unlock(&inotify_lock);
         close(host_fd);
+        free_dir_snapshot(wentries, wn);
+        free(wpath);
         return -LINUX_EBADF;
     }
 
@@ -467,8 +597,12 @@ int64_t sys_inotify_add_watch(guest_t *g,
         uint32_t snapshot_mask = w->mask; /* Snapshot before unlock */
         pthread_mutex_unlock(&inotify_lock);
 
-        /* Close the duplicate fd; inotify emulation keeps the original */
+        /* Close the duplicate fd; inotify emulation keeps the original.
+         * The existing watch keeps its snapshot; drop this call's copy.
+         */
         close(host_fd);
+        free_dir_snapshot(wentries, wn);
+        free(wpath);
 
         /* Update kevent filter with the new mask (use snapshot -- w->mask may
          * be modified by another thread after unlock)
@@ -487,6 +621,8 @@ int64_t sys_inotify_add_watch(guest_t *g,
     if (widx < 0) {
         pthread_mutex_unlock(&inotify_lock);
         close(host_fd);
+        free_dir_snapshot(wentries, wn);
+        free(wpath);
         return -LINUX_ENOSPC;
     }
 
@@ -502,6 +638,9 @@ int64_t sys_inotify_add_watch(guest_t *g,
     w->is_dir = is_dir;
     w->dev = st.st_dev;
     w->ino = st.st_ino;
+    w->path = wpath;
+    w->entries = wentries;
+    w->n_entries = wn;
 
     /* Capture kq_fd while under lock */
     int kq_fd = inst->kq_fd;
@@ -522,6 +661,11 @@ int64_t sys_inotify_add_watch(guest_t *g,
         pthread_mutex_lock(&inotify_lock);
         w->wd = 0;
         w->host_fd = 0;
+        free_dir_snapshot(w->entries, w->n_entries);
+        w->entries = NULL;
+        w->n_entries = 0;
+        free(w->path);
+        w->path = NULL;
         pthread_mutex_unlock(&inotify_lock);
         close(host_fd);
         errno = saved;
@@ -556,6 +700,11 @@ int64_t sys_inotify_rm_watch(int inotify_fd, int wd)
     w->host_fd = 0;
     w->mask = 0;
     w->is_dir = 0;
+    free_dir_snapshot(w->entries, w->n_entries);
+    w->entries = NULL;
+    w->n_entries = 0;
+    free(w->path);
+    w->path = NULL;
     pthread_mutex_unlock(&inotify_lock);
 
     /* Remove from kqueue and close outside lock */
@@ -619,18 +768,12 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
             }
             inst = &inotify_state[slot];
 
-            /* Process the received event */
+            /* Process the received event (same named-directory diff as the
+             * non-blocking collect path).
+             */
             int host_fd = (int) kev.ident;
-            int widx = watch_find_by_hostfd(inst, host_fd);
-            if (widx >= 0) {
-                inotify_watch_t *w = &inst->watches[widx];
-                uint32_t in_mask =
-                    notes_to_in_mask((uint32_t) kev.fflags, w->mask, w->is_dir);
-                if (in_mask != 0) {
-                    queue_event(inst, w->wd, in_mask, 0, NULL);
-                    pipe_signal(inst);
-                }
-            }
+            if (process_vnode_event(inst, host_fd, (uint32_t) kev.fflags) != 0)
+                pipe_signal(inst);
         }
     }
 
@@ -711,6 +854,11 @@ static void inotify_close(int guest_fd)
             watch_fds[nfds++] = inst->watches[i].host_fd;
             inst->watches[i].wd = 0;
         }
+        free_dir_snapshot(inst->watches[i].entries, inst->watches[i].n_entries);
+        inst->watches[i].entries = NULL;
+        inst->watches[i].n_entries = 0;
+        free(inst->watches[i].path);
+        inst->watches[i].path = NULL;
     }
 
     inst->guest_fd = -1;
diff --git a/tests/test-inotify.c b/tests/test-inotify.c
index 9a85278..deffd45 100644
--- a/tests/test-inotify.c
+++ b/tests/test-inotify.c
@@ -15,7 +15,10 @@
 
 #include <fcntl.h>
 #include <poll.h>
+#include <stdbool.h>
 #include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <sys/inotify.h>
 #include <unistd.h>
@@ -188,6 +191,72 @@ static void test_dir_create(void)
     close(fd);
 }
 
+/* Test 6: a directory watch must deliver a NAMED IN_CREATE for the new child,
+ * not just a nameless event.
+ */
+static void test_dir_create_named(void)
+{
+    TEST("watch dir delivers named IN_CREATE");
+
+    char dir[] = "/tmp/elfuse-inotify-dir-XXXXXX";
+    if (!mkdtemp(dir)) {
+        FAIL("mkdtemp");
+        return;
+    }
+
+    int fd = inotify_init1(IN_NONBLOCK);
+    if (fd < 0) {
+        FAIL("inotify_init1");
+        rmdir(dir);
+        return;
+    }
+
+    int wd = inotify_add_watch(fd, dir, IN_CREATE);
+    if (wd < 0) {
+        FAIL("inotify_add_watch");
+        close(fd);
+        rmdir(dir);
+        return;
+    }
+
+    const char *child = "manifest.yaml";
+    char path[300];
+    snprintf(path, sizeof(path), "%s/%s", dir, child);
+    int tfd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
+    if (tfd < 0) {
+        FAIL("create child in watched dir");
+        inotify_rm_watch(fd, wd);
+        close(fd);
+        rmdir(dir);
+        return;
+    }
+    close(tfd);
+
+    /* read() drives the diff; retry until the vnode notification is queued. */
+    bool found = false;
+    for (int attempt = 0; attempt < 50 && !found; attempt++) {
+        char buf[1024];
+        ssize_t n = read(fd, buf, sizeof(buf));
+        for (ssize_t off = 0;
+             off + (ssize_t) sizeof(struct inotify_event) <= n;) {
+            struct inotify_event *ev = (struct inotify_event *) (buf + off);
+            if ((ev->mask & IN_CREATE) && ev->len > 0 &&
+                strcmp(ev->name, child) == 0)
+                found = true;
+            off += (ssize_t) sizeof(struct inotify_event) + ev->len;
+        }
+        if (!found)
+            usleep(20000); /* 20ms */
+    }
+
+    EXPECT_TRUE(found, "named IN_CREATE for new child not delivered");
+
+    unlink(path);
+    inotify_rm_watch(fd, wd);
+    close(fd);
+    rmdir(dir);
+}
+
 /* Main */
 
 int main(void)
@@ -199,6 +268,7 @@ int main(void)
     test_modify_event();
     test_nonblock();
     test_dir_create();
+    test_dir_create_named();
 
     SUMMARY("test-inotify");
     return fails > 0 ? 1 : 0;

From c0a11825973b9f863e49ceddef4b1b8ac8071a64 Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen" <chenhungh@gmail.com>
Date: Sat, 30 May 2026 13:54:12 +0300
Subject: [PATCH 2/3] Keep inotify directory baseline when a snapshot fails

A directory watch snapshots the child-name set on each change and diffs it
against the previous baseline to recover the name kqueue omits. dir_snapshot
returned an empty list on any failure (opendir error, a mid-read readdir
error, or an allocation failure), which the diff could not tell apart from a
genuinely empty directory.

On a transient failure the IN_DELETE pass then saw every known child as
missing and queued a spurious IN_DELETE for each, and the baseline was
overwritten with the empty list, so the next successful snapshot
re-reported the whole directory as IN_CREATE. The spurious events, once
queued, could not be taken back.

Make dir_snapshot return bool: on failure it frees any partial result and
reports false, and readdir read errors are now detected by clearing errno
before each call. process_vnode_event only diffs against and advances to a
snapshot that succeeded; on failure it keeps the previous baseline so the next
successful snapshot reconciles whatever changed in between. The watch-add path
documents that a failed initial snapshot is a best-effort empty baseline.

Validated with make check on Apple Silicon; test-inotify still passes 6/6.
---
 src/syscall/inotify.c | 117 ++++++++++++++++++++++++++----------------
 1 file changed, 74 insertions(+), 43 deletions(-)

diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c
index 3fa40bf..250ccd4 100644
--- a/src/syscall/inotify.c
+++ b/src/syscall/inotify.c
@@ -302,51 +302,74 @@ static void pipe_drain(inotify_instance_t *inst)
         ;
 }
 
-/* Snapshot the entry names of a directory (excluding "." and ".."). On return
- * *out is a malloc'd array of malloc'd strings with *n_out entries (free with
- * free_dir_snapshot). On any failure the snapshot is left empty.
+static void free_dir_snapshot(char **entries, int n)
+{
+    if (!entries)
+        return;
+    for (int i = 0; i < n; i++)
+        free(entries[i]);
+    free(entries);
+}
+
+/* List a directory's child names, excluding "." and "..", into the out array
+ * (free with free_dir_snapshot). Returns false on any failure, leaving the
+ * result empty -- which the caller must treat as distinct from a true return
+ * with zero entries, since a failure mistaken for "empty" would diff every
+ * known child as deleted.
  */
-static void dir_snapshot(const char *path, char ***out, int *n_out)
+static bool dir_snapshot(const char *path, char ***out, int *n_out)
 {
     *out = NULL;
     *n_out = 0;
 
     DIR *d = opendir(path);
     if (!d)
-        return;
+        return false;
 
     char **names = NULL;
     int n = 0, cap = 0;
-    struct dirent *de;
-    while ((de = readdir(d)) != NULL) {
+    bool ok = true;
+    for (;;) {
+        /* readdir returns NULL both at end-of-stream and on error; reset
+         * errno immediately before each call so a non-zero errno afterwards
+         * unambiguously signals a read error rather than EOF.
+         */
+        errno = 0;
+        struct dirent *de = readdir(d);
+        if (!de) {
+            if (errno != 0)
+                ok = false;
+            break;
+        }
         if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."))
             continue;
         if (n == cap) {
             int ncap = cap ? cap * 2 : 16;
             char **tmp = realloc(names, (size_t) ncap * sizeof(char *));
-            if (!tmp)
+            if (!tmp) {
+                ok = false;
                 break;
+            }
             names = tmp;
             cap = ncap;
         }
         names[n] = strdup(de->d_name);
-        if (!names[n])
+        if (!names[n]) {
+            ok = false;
             break;
+        }
         n++;
     }
     closedir(d);
 
+    if (!ok) {
+        free_dir_snapshot(names, n);
+        return false;
+    }
+
     *out = names;
     *n_out = n;
-}
-
-static void free_dir_snapshot(char **entries, int n)
-{
-    if (!entries)
-        return;
-    for (int i = 0; i < n; i++)
-        free(entries[i]);
-    free(entries);
+    return true;
 }
 
 static bool snapshot_contains(char *const *entries, int n, const char *name)
@@ -378,33 +401,38 @@ static int process_vnode_event(inotify_instance_t *inst,
     if (w->is_dir && (fflags & NOTE_WRITE) && w->path) {
         char **now = NULL;
         int now_n = 0;
-        dir_snapshot(w->path, &now, &now_n);
-
-        for (int j = 0; j < now_n && !overflow; j++) {
-            if ((w->mask & IN_CREATE) &&
-                !snapshot_contains(w->entries, w->n_entries, now[j])) {
-                if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0)
-                    overflow = true;
-                else
-                    queued++;
+        /* Only diff against -- and advance to -- a snapshot that succeeded.
+         * On failure keep the previous baseline; the next successful snapshot
+         * reconciles whatever changed in between.
+         */
+        if (dir_snapshot(w->path, &now, &now_n)) {
+            for (int j = 0; j < now_n && !overflow; j++) {
+                if ((w->mask & IN_CREATE) &&
+                    !snapshot_contains(w->entries, w->n_entries, now[j])) {
+                    if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0)
+                        overflow = true;
+                    else
+                        queued++;
+                }
             }
-        }
-        for (int j = 0; j < w->n_entries && !overflow; j++) {
-            if ((w->mask & IN_DELETE) &&
-                !snapshot_contains(now, now_n, w->entries[j])) {
-                if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < 0)
-                    overflow = true;
-                else
-                    queued++;
+            for (int j = 0; j < w->n_entries && !overflow; j++) {
+                if ((w->mask & IN_DELETE) &&
+                    !snapshot_contains(now, now_n, w->entries[j])) {
+                    if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) <
+                        0)
+                        overflow = true;
+                    else
+                        queued++;
+                }
             }
-        }
 
-        /* Advance the snapshot regardless: the directory state has moved on,
-         * and any names dropped under overflow are covered by IN_Q_OVERFLOW.
-         */
-        free_dir_snapshot(w->entries, w->n_entries);
-        w->entries = now;
-        w->n_entries = now_n;
+            /* Advance the snapshot: the directory state has moved on, and any
+             * names dropped under overflow are covered by IN_Q_OVERFLOW.
+             */
+            free_dir_snapshot(w->entries, w->n_entries);
+            w->entries = now;
+            w->n_entries = now_n;
+        }
     }
 
     if (!overflow) {
@@ -563,7 +591,10 @@ int64_t sys_inotify_add_watch(guest_t *g,
     int wn = 0;
     if (is_dir) {
         wpath = strdup(path);
-        dir_snapshot(path, &wentries, &wn);
+        /* Best-effort: a failed listing starts the watch with an empty
+         * baseline, which is the only state worth recording at add time.
+         */
+        (void) dir_snapshot(path, &wentries, &wn);
     }
 
     pthread_mutex_lock(&inotify_lock);

From 9c28c569f01b5b0ed94c96f7db9fa92132166f86 Mon Sep 17 00:00:00 2001
From: "Hung-Han (Henry) Chen" <chenhungh@gmail.com>
Date: Mon, 22 Jun 2026 12:51:46 +0800
Subject: [PATCH 3/3] Take inotify directory snapshot without holding
 inotify_lock

process_vnode_event() ran opendir()/readdir() (via dir_snapshot) for a
directory watch's NOTE_WRITE while holding the global inotify_lock, so a
large or slow directory scan blocked every other inotify operation -- on
all instances -- for its duration. This is new I/O-under-lock introduced
with the named-event diffing.

Take the snapshot with the lock released: copy the watch's path and
dev/ino under the lock, drop the lock for the opendir/readdir I/O, then
re-acquire and re-validate the instance (by guest_fd) and the watch (by
host_fd + dev/ino) before applying the diff. A close or host_fd reuse
during the window discards the stale snapshot. Both collect paths
(collect_events and the blocking read) re-check the instance after the
call: collect_events stops early without signalling the self-pipe, and
inotify_read returns EBADF if the instance was torn down, mirroring the
existing post-kevent re-validation.

No change to the emitted events (test-inotify still 6/6); addresses the
lock-contention review finding.
---
 src/syscall/inotify.c | 131 ++++++++++++++++++++++++++++++++----------
 1 file changed, 100 insertions(+), 31 deletions(-)

diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c
index 250ccd4..a599de0 100644
--- a/src/syscall/inotify.c
+++ b/src/syscall/inotify.c
@@ -384,9 +384,17 @@ static bool snapshot_contains(char *const *entries, int n, const char *name)
 
 /* Translate one EVFILT_VNODE notification into queued inotify events for the
  * watch on host_fd. Returns the number queued, or -1 on buffer overflow (an
- * IN_Q_OVERFLOW marker is queued). Caller holds inotify_lock.
+ * IN_Q_OVERFLOW marker is queued).
+ *
+ * Caller holds inotify_lock; it is held again on return. For a directory write
+ * the lock is released around the opendir/readdir snapshot so filesystem I/O
+ * does not stall inotify operations on other instances. guest_fd identifies
+ * this instance: because the table can change while unlocked, the instance and
+ * the watch are re-validated (by host_fd and dev/ino) before the snapshot is
+ * applied, and a teardown or host_fd reuse during the window discards it.
  */
 static int process_vnode_event(inotify_instance_t *inst,
+                               int guest_fd,
                                int host_fd,
                                uint32_t fflags)
 {
@@ -398,41 +406,77 @@ static int process_vnode_event(inotify_instance_t *inst,
     int queued = 0;
     bool overflow = false;
 
+    char **now = NULL;
+    int now_n = 0;
+    bool snap_ok = false;
+
     if (w->is_dir && (fflags & NOTE_WRITE) && w->path) {
-        char **now = NULL;
-        int now_n = 0;
+        /* Copy the path + identity, then release the lock for the opendir/
+         * readdir snapshot so filesystem I/O does not block other instances.
+         */
+        char *path = strdup(w->path);
+        if (path) {
+            dev_t dev = w->dev;
+            ino_t ino = w->ino;
+            int slot = (int) (inst - inotify_state);
+
+            pthread_mutex_unlock(&inotify_lock);
+            snap_ok = dir_snapshot(path, &now, &now_n);
+            free(path);
+            pthread_mutex_lock(&inotify_lock);
+
+            /* Re-validate across the unlocked window: the instance may have
+             * been closed, or the watch removed and its host_fd reused for a
+             * different file. Any of these discards the stale snapshot.
+             */
+            widx = watch_find_by_hostfd(inst, host_fd);
+            if (inotify_state[slot].guest_fd != guest_fd || widx < 0) {
+                free_dir_snapshot(now, now_n);
+                return 0;
+            }
+            w = &inst->watches[widx];
+            if (!w->is_dir || w->dev != dev || w->ino != ino) {
+                free_dir_snapshot(now, now_n);
+                return 0;
+            }
+        }
+    }
+
+    if (snap_ok) {
         /* Only diff against -- and advance to -- a snapshot that succeeded.
          * On failure keep the previous baseline; the next successful snapshot
          * reconciles whatever changed in between.
          */
-        if (dir_snapshot(w->path, &now, &now_n)) {
-            for (int j = 0; j < now_n && !overflow; j++) {
-                if ((w->mask & IN_CREATE) &&
-                    !snapshot_contains(w->entries, w->n_entries, now[j])) {
-                    if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0)
-                        overflow = true;
-                    else
-                        queued++;
-                }
+        for (int j = 0; j < now_n && !overflow; j++) {
+            if ((w->mask & IN_CREATE) &&
+                !snapshot_contains(w->entries, w->n_entries, now[j])) {
+                if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0)
+                    overflow = true;
+                else
+                    queued++;
             }
-            for (int j = 0; j < w->n_entries && !overflow; j++) {
-                if ((w->mask & IN_DELETE) &&
-                    !snapshot_contains(now, now_n, w->entries[j])) {
-                    if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) <
-                        0)
-                        overflow = true;
-                    else
-                        queued++;
-                }
+        }
+        for (int j = 0; j < w->n_entries && !overflow; j++) {
+            if ((w->mask & IN_DELETE) &&
+                !snapshot_contains(now, now_n, w->entries[j])) {
+                if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < 0)
+                    overflow = true;
+                else
+                    queued++;
             }
-
-            /* Advance the snapshot: the directory state has moved on, and any
-             * names dropped under overflow are covered by IN_Q_OVERFLOW.
-             */
-            free_dir_snapshot(w->entries, w->n_entries);
-            w->entries = now;
-            w->n_entries = now_n;
         }
+
+        /* Advance the snapshot: the directory state has moved on, and any
+         * names dropped under overflow are covered by IN_Q_OVERFLOW.
+         */
+        free_dir_snapshot(w->entries, w->n_entries);
+        w->entries = now;
+        w->n_entries = now_n;
+    } else {
+        /* File watch or failed snapshot: nothing to apply. free_dir_snapshot
+         * tolerates the NULL result dir_snapshot leaves on failure.
+         */
+        free_dir_snapshot(now, now_n);
     }
 
     if (!overflow) {
@@ -470,20 +514,28 @@ static int collect_events(inotify_instance_t *inst)
     if (nev <= 0)
         return 0;
 
+    /* process_vnode_event may release inotify_lock around directory I/O;
+     * capture the instance identity to detect teardown across that window.
+     */
+    int slot = (int) (inst - inotify_state);
+    int guest_fd = inst->guest_fd;
+
     int collected = 0;
     bool overflow = false;
     for (int i = 0; i < nev; i++) {
-        int r = process_vnode_event(inst, (int) kevs[i].ident,
+        int r = process_vnode_event(inst, guest_fd, (int) kevs[i].ident,
                                     (uint32_t) kevs[i].fflags);
         if (r < 0) {
             overflow = true;
             break;
         }
         collected += r;
+        if (inotify_state[slot].guest_fd != guest_fd)
+            return collected; /* instance closed during snapshot I/O */
     }
 
     /* Signal the self-pipe so poll/epoll sees readability */
-    if (collected > 0 || overflow)
+    if ((collected > 0 || overflow) && inotify_state[slot].guest_fd == guest_fd)
         pipe_signal(inst);
 
     return collected;
@@ -762,6 +814,14 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
     if (inst->event_used == 0) {
         int n = collect_events(inst);
 
+        /* collect_events may release the lock for directory I/O; bail if the
+         * instance was closed in that window.
+         */
+        if (inotify_state[slot].guest_fd != guest_fd) {
+            pthread_mutex_unlock(&inotify_lock);
+            return -LINUX_EBADF;
+        }
+
         if (n == 0) {
             if (inst->nonblock) {
                 pthread_mutex_unlock(&inotify_lock);
@@ -803,7 +863,16 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
              * non-blocking collect path).
              */
             int host_fd = (int) kev.ident;
-            if (process_vnode_event(inst, host_fd, (uint32_t) kev.fflags) != 0)
+            int r = process_vnode_event(inst, guest_fd, host_fd,
+                                        (uint32_t) kev.fflags);
+            /* process_vnode_event may release the lock for the snapshot; bail
+             * if the instance was closed in that window.
+             */
+            if (inotify_state[slot].guest_fd != guest_fd) {
+                pthread_mutex_unlock(&inotify_lock);
+                return -LINUX_EBADF;
+            }
+            if (r != 0)
                 pipe_signal(inst);
         }
     }