diff --git a/src/core/guest.c b/src/core/guest.c
index 632a033..853ab13 100644
--- a/src/core/guest.c
+++ b/src/core/guest.c
@@ -77,6 +77,47 @@ static pthread_mutex_t pt_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 2 */
 /* Track whether the 80% warning has been emitted (avoid log spam) */
 static bool pt_pool_warned = false;
 
+static size_t guest_host_page_size_cached(void)
+{
+    static size_t cached;
+    if (!cached) {
+        long s = sysconf(_SC_PAGESIZE);
+        cached = (s > 0) ? (size_t) s : GUEST_PAGE_SIZE;
+    }
+    return cached;
+}
+
+static void guest_region_clear_overlay(guest_region_t *r)
+{
+    r->overlay_active = false;
+    r->overlay_start = 0;
+    r->overlay_end = 0;
+}
+
+static void guest_region_clip_overlay(guest_region_t *r)
+{
+    if (!r->overlay_active || r->end <= r->start) {
+        guest_region_clear_overlay(r);
+        return;
+    }
+
+    size_t hps = guest_host_page_size_cached();
+    uint64_t page_start = ALIGN_DOWN(r->start, hps);
+    uint64_t page_end = ALIGN_UP(r->end, hps);
+    uint64_t overlay_start =
+        r->overlay_start > page_start ? r->overlay_start : page_start;
+    uint64_t overlay_end =
+        r->overlay_end < page_end ? r->overlay_end : page_end;
+
+    if (overlay_end <= overlay_start) {
+        guest_region_clear_overlay(r);
+        return;
+    }
+
+    r->overlay_start = overlay_start;
+    r->overlay_end = overlay_end;
+}
+
 /* Allocate a zeroed 4KiB page from the page table pool.
  * Returns GPA of the page, or 0 on pool exhaustion.
  * Acquires pt_lock internally. Caller typically holds mmap_lock.
@@ -304,6 +345,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
         return -1;
     }
 
+    /* Seed HVF segment list with one entry covering the whole slab.
+     * sys_mmap may later split this for MAP_SHARED file overlays.
+     */
+    g->segments[0] = (hvf_segment_t) {.ipa = GUEST_IPA_BASE, .len = size};
+    g->n_segments = 1;
+
     return 0;
 }
 
@@ -371,6 +418,13 @@ int guest_init_from_shm(guest_t *g,
         return -1;
     }
 
+    /* Seed HVF segment list. The child re-establishes any per-region file
+     * overlays the parent had after this call (handled by fork-state
+     * deserialization).
+     */
+    g->segments[0] = (hvf_segment_t) {.ipa = GUEST_IPA_BASE, .len = size};
+    g->n_segments = 1;
+
     log_debug(
         "guest: CoW fork: mapped %llu GiB from shm "
         "(ipa=%u bits)",
@@ -390,6 +444,13 @@ void guest_destroy(guest_t *g)
         hv_vcpu_destroy(g->vcpu);
         g->vcpu = 0;
     }
+    /* Unmap each HVF segment. hv_vm_destroy releases all stage-2 state
+     * regardless, but unmapping explicitly keeps invariants clean for
+     * downstream tools (Instruments, leak detectors).
+     */
+    for (int i = 0; i < g->n_segments; i++)
+        hv_vm_unmap(g->segments[i].ipa, g->segments[i].len);
+    g->n_segments = 0;
     hv_vm_destroy();
     if (g->host_base) {
         munmap(g->host_base, g->guest_size);
@@ -901,6 +962,8 @@ static bool regions_mergeable(const guest_region_t *a, const guest_region_t *b)
      */
     if (a->noreserve != b->noreserve)
         return false;
+    if (a->overlay_active || b->overlay_active)
+        return false;
     if (strcmp(a->name, b->name) != 0)
         return false;
 
@@ -1014,6 +1077,7 @@ int guest_region_add_ex_owned(guest_t *g,
     r->backing_fd = owned_backing_fd;
     r->shared = (flags & 0x01) != 0;      /* LINUX_MAP_SHARED = 0x01 */
     r->noreserve = (flags & 0x4000) != 0; /* LINUX_MAP_NORESERVE = 0x4000 */
+    guest_region_clear_overlay(r);
     if (name) {
         str_copy_trunc(r->name, name, sizeof(r->name));
     } else {
@@ -1062,6 +1126,7 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end)
             uint64_t trimmed = end - r->start;
             r->offset += trimmed;
             r->start = end;
+            guest_region_clip_overlay(r);
             i++;
             continue;
         }
@@ -1069,6 +1134,7 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end)
         /* Partial overlap: removal range cuts the end */
         if (r->start < start && r->end > start && r->end <= end) {
             r->end = start;
+            guest_region_clip_overlay(r);
             i++;
             continue;
         }
@@ -1117,6 +1183,8 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end)
 
             /* Left half keeps the original entry and shortens its end. */
             r->end = start;
+            guest_region_clip_overlay(r);
+            guest_region_clip_overlay(right);
 
             g->nregions++;
             i += 2; /* skip both halves */
@@ -1173,6 +1241,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
             g->nregions++;
             /* Left half keeps original prot and backing_fd */
             g->regions[i].end = start;
+            guest_region_clip_overlay(&g->regions[i]);
             /* Right half will be processed next iteration */
             g->regions[i + 1].offset += (start - g->regions[i + 1].start);
             g->regions[i + 1].start = start;
@@ -1185,6 +1254,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
                         "split: %s",
                         strerror(errno));
             }
+            guest_region_clip_overlay(&g->regions[i + 1]);
             i++; /* advance to the right half */
             r = &g->regions[i];
         }
@@ -1213,6 +1283,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
             /* Left half: [r->start, end) with new prot */
             g->regions[i].end = end;
             g->regions[i].prot = prot;
+            guest_region_clip_overlay(&g->regions[i]);
             /* Right half: [end, old_end) keeps original prot */
             g->regions[i + 1].offset += (end - g->regions[i + 1].start);
             g->regions[i + 1].start = end;
@@ -1225,6 +1296,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
                         "end-split: %s",
                         strerror(errno));
             }
+            guest_region_clip_overlay(&g->regions[i + 1]);
             if (first_modified < 0)
                 first_modified = i;
             last_modified = i;
diff --git a/src/core/guest.h b/src/core/guest.h
index b21aaa4..7425d01 100644
--- a/src/core/guest.h
+++ b/src/core/guest.h
@@ -107,6 +107,26 @@ typedef struct {
  */
 #define GUEST_MAX_REGIONS 4096
 
+/* HVF stage-2 mapping segment. The slab is mapped to HVF in pieces so that
+ * file-backed MAP_SHARED regions can have real host-VA overlays applied via
+ * mmap MAP_FIXED|MAP_SHARED of a file fd. HVF requires hv_vm_unmap to target
+ * an exactly-previously-mapped range; sub-range unmap of a larger map fails
+ * with HV_BAD_ARGUMENT. To allow a 2 MiB-aligned middle range to be unmapped +
+ * remapped (refreshing HVF stage-2 caching after a host mmap MAP_FIXED), the
+ * slab is split into 2 MiB-aligned segments around each affected block. All
+ * segments are 2 MiB-aligned and 2 MiB-sized at minimum.
+ *
+ * 256 segments is generous: each MAP_SHARED file mmap costs at most 2 new
+ * segments (left/right of the carved block), and most workloads keep that count
+ * well under 50.
+ */
+typedef struct {
+    uint64_t ipa; /* 2 MiB-aligned IPA start */
+    uint64_t len; /* 2 MiB-aligned length */
+} hvf_segment_t;
+
+#define GUEST_MAX_HVF_SEGMENTS 256
+
 /* A semantic memory region tracked for munmap/mprotect and /proc/self/maps.
  * Distinct from mem_region_t which is used purely for page table construction.
  * Regions are kept sorted by start address in guest_t.regions[].
@@ -120,7 +140,17 @@ typedef struct {
     int backing_fd;  /* Duplicated host fd for file-backed mappings, or -1 */
     bool shared;     /* MAP_SHARED (writes should propagate) */
     bool noreserve;  /* MAP_NORESERVE: PTEs deferred until fault */
-    char name[64];   /* Label: "[heap]", "[stack]", ELF path, etc. */
+    bool overlay_active; /* Region has a live host MAP_FIXED|MAP_SHARED overlay
+                          * of backing_fd at host_base+start. The kernel's
+                          * page cache keeps it coherent with the file and
+                          * with peer overlays of the same file, so msync
+                          * skips the snapshot-style pwrite-the-diff and
+                          * refresh-from-file paths for these regions. */
+    uint64_t overlay_start; /* Host-page-aligned overlay start. May extend
+                             * outside [start, end) when only part of a host
+                             * page is guest-visible. */
+    uint64_t overlay_end;   /* Host-page-aligned overlay end (exclusive). */
+    char name[64];          /* Label: "[heap]", "[stack]", ELF path, etc. */
 } guest_region_t;
 
 /* Guest state. */
@@ -160,6 +190,15 @@ typedef struct {
     /* Semantic region tracking for munmap/mprotect/proc-self-maps */
     guest_region_t regions[GUEST_MAX_REGIONS];
     int nregions; /* Number of active regions */
+
+    /* HVF stage-2 segment list: the union of segments[0..n_segments) covers the
+     * live IPA range that is currently hv_vm_map'd to HVF. Sorted by ipa.
+     * Initially one segment spans the whole slab. See guest.h header comment on
+     * hvf_segment_t for the rationale.
+     */
+    hvf_segment_t segments[GUEST_MAX_HVF_SEGMENTS];
+    int n_segments;
+
     /* Page table generation counter: incremented on every PT modification.
      * Used by the per-thread GVA TLB cache to detect stale entries.
      * 64-bit to avoid wrap-around stale hits over long-running sessions.
diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c
index b40c324..1cb99f2 100644
--- a/src/runtime/fork-state.c
+++ b/src/runtime/fork-state.c
@@ -618,8 +618,20 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig)
         return -1;
     }
     g->nregions = (int) num_guest_regions;
-    for (int i = 0; i < g->nregions; i++)
+    for (int i = 0; i < g->nregions; i++) {
         g->regions[i].backing_fd = -1;
+        /* Demote inherited overlays: the child does not yet re-establish
+         * host MAP_FIXED|MAP_SHARED mappings from the parent's overlay
+         * fds, so msync, MADV_DONTNEED and friends must use the
+         * snapshot-style emulation. The CoW path's pre-fork sync of
+         * overlay bytes into shm_fd already gave the child snapshot the
+         * correct content at fork time. Live cross-fork MAP_SHARED
+         * coherence is the next P1 TODO item.
+         */
+        g->regions[i].overlay_active = false;
+        g->regions[i].overlay_start = 0;
+        g->regions[i].overlay_end = 0;
+    }
 
     if (fork_ipc_recv_backing_fds(ipc_fd, g) < 0)
         return -1;
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index e01e6a0..03a7683 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -984,8 +984,45 @@ int64_t sys_clone(hv_vcpu_t vcpu,
         goto fail_snapshot;
     }
 
-    /* CoW path: send shm fd to child via SCM_RIGHTS */
+    /* CoW path: sync MAP_SHARED file overlays back into shm_fd before
+     * sending it to the child. The parent's host VA at each overlay
+     * region maps the overlay file, not shm_fd, so shm_fd's content at
+     * those IPAs is stale (typically zero). The child's MAP_PRIVATE
+     * snapshot would expose that stale data at the overlay IPAs. Copy
+     * the live overlay bytes into shm_fd at the matching offsets so the
+     * child snapshot reflects the parent's view at fork time. Live
+     * cross-fork MAP_SHARED coherence (parent and child both seeing
+     * subsequent writes through the same file) is left to the cross-fork
+     * coherence TODO; this fix only avoids the stale-snapshot regression.
+     */
     if (use_shm) {
+        for (int i = 0; i < g->nregions; i++) {
+            const guest_region_t *r = &g->regions[i];
+            if (!r->overlay_active)
+                continue;
+            uint64_t len = r->end - r->start;
+            const uint8_t *src = (const uint8_t *) g->host_base + r->start;
+            uint64_t off = r->start;
+            while (len > 0) {
+                size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX
+                                                          : (size_t) len;
+                ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off);
+                if (nw < 0) {
+                    if (errno == EINTR)
+                        continue;
+                    log_error("clone: shm overlay sync pwrite failed: %s",
+                              strerror(errno));
+                    goto fail_snapshot;
+                }
+                if (nw == 0) {
+                    log_error("clone: shm overlay sync pwrite returned 0");
+                    goto fail_snapshot;
+                }
+                src += nw;
+                off += (uint64_t) nw;
+                len -= (uint64_t) nw;
+            }
+        }
         if (fork_ipc_send_fds(ipc_sock, &g->shm_fd, 1) < 0) {
             log_error("clone: failed to send shm fd");
             goto fail_snapshot;
diff --git a/src/syscall/mem.c b/src/syscall/mem.c
index 9a02a62..9cfdcd5 100644
--- a/src/syscall/mem.c
+++ b/src/syscall/mem.c
@@ -14,6 +14,7 @@
 #include <unistd.h>
 #include <errno.h>
 #include <limits.h>
+#include <sys/mman.h>
 #include <sys/stat.h>
 #include <pthread.h>
 
@@ -30,6 +31,22 @@
  */
 pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 1 */
 
+/* Host kernel page size (16 KiB on Apple Silicon, typically 4 KiB on
+ * Intel macOS). MAP_FIXED requires addr/length/offset multiples of this,
+ * so an overlay onto a guest 4 KiB-aligned IPA is only applicable when the
+ * IPA happens to land on a host page boundary; otherwise sys_mmap falls
+ * back to the pread snapshot path.
+ */
+static size_t host_page_size_cached(void)
+{
+    static size_t cached;
+    if (!cached) {
+        long s = sysconf(_SC_PAGESIZE);
+        cached = (s > 0) ? (size_t) s : 4096;
+    }
+    return cached;
+}
+
 /* Gap-finding allocator for mmap.
  *
  * find_free_gap_inner() scans guest_t.regions[] (sorted) for the first free gap
@@ -106,12 +123,125 @@ static int dup_region_backing_fd(const guest_region_t *region)
     return dup(region->backing_fd);
 }
 
+static bool region_has_live_overlay(const guest_region_t *r)
+{
+    return r->overlay_active && r->overlay_end > r->overlay_start;
+}
+
+static void region_clear_overlay(guest_region_t *r)
+{
+    r->overlay_active = false;
+    r->overlay_start = 0;
+    r->overlay_end = 0;
+}
+
+static void region_clip_overlay(guest_region_t *r);
+
+static void clear_overlay_metadata_range(guest_t *g,
+                                         uint64_t start,
+                                         uint64_t end)
+{
+    for (int i = 0; i < g->nregions; i++) {
+        guest_region_t *r = &g->regions[i];
+        if (!region_has_live_overlay(r))
+            continue;
+        if (r->overlay_start != start || r->overlay_end != end)
+            continue;
+        region_clear_overlay(r);
+    }
+}
+
+static void mark_overlay_metadata_range(guest_t *g,
+                                        uint64_t start,
+                                        uint64_t end,
+                                        uint64_t overlay_start,
+                                        uint64_t overlay_end)
+{
+    for (int i = 0; i < g->nregions; i++) {
+        guest_region_t *r = &g->regions[i];
+        if (r->start >= end)
+            break;
+        if (r->end <= start)
+            continue;
+        r->overlay_active = true;
+        r->overlay_start = overlay_start;
+        r->overlay_end = overlay_end;
+        region_clip_overlay(r);
+    }
+}
+
+static void region_clip_overlay(guest_region_t *r)
+{
+    if (!region_has_live_overlay(r) || r->end <= r->start) {
+        region_clear_overlay(r);
+        return;
+    }
+
+    size_t hps = host_page_size_cached();
+    uint64_t page_start = ALIGN_DOWN(r->start, hps);
+    uint64_t page_end = ALIGN_UP(r->end, hps);
+
+    if (r->overlay_start < page_start)
+        r->overlay_start = page_start;
+    if (r->overlay_end > page_end)
+        r->overlay_end = page_end;
+    if (r->overlay_end <= r->overlay_start)
+        region_clear_overlay(r);
+}
+
+static void split_regions_at_boundary(guest_t *g, uint64_t boundary)
+{
+    if (boundary == 0)
+        return;
+
+    for (int i = 0; i < g->nregions; i++) {
+        guest_region_t *r = &g->regions[i];
+        if (boundary <= r->start)
+            break;
+        if (boundary >= r->end)
+            continue;
+        if (g->nregions >= GUEST_MAX_REGIONS) {
+            log_error(
+                "guest: region table full, cleanup split skipped at "
+                "0x%llx",
+                (unsigned long long) boundary);
+            return;
+        }
+
+        memmove(&g->regions[i + 1], &g->regions[i],
+                (g->nregions - i) * sizeof(guest_region_t));
+        g->nregions++;
+
+        g->regions[i].end = boundary;
+        g->regions[i + 1].offset += (boundary - g->regions[i + 1].start);
+        g->regions[i + 1].start = boundary;
+        if (g->regions[i + 1].backing_fd >= 0) {
+            g->regions[i + 1].backing_fd = dup(g->regions[i + 1].backing_fd);
+            if (g->regions[i + 1].backing_fd < 0)
+                log_error("guest: dup() failed for cleanup split: %s",
+                          strerror(errno));
+        }
+        region_clip_overlay(&g->regions[i]);
+        region_clip_overlay(&g->regions[i + 1]);
+        return;
+    }
+}
+
 static uint64_t find_free_gap_inner(const guest_t *g,
                                     uint64_t length,
                                     uint64_t min_addr,
                                     uint64_t max_addr)
 {
-    uint64_t gap_start = min_addr;
+    /* Round the search start up to the next host-page boundary so an
+     * unaligned addr hint cannot return a result that lands inside a host
+     * page already covered by a preceding region's overlay tail (the
+     * overlay extends to ALIGN_UP(r->end, hps)). Apple Silicon enforces
+     * 16 KiB host pages; aligning to the guest 4 KiB page is not enough.
+     * Advance past each walked region to the same boundary for the same
+     * reason.
+     */
+    size_t hps = host_page_size_cached();
+    uint64_t gap_start = ALIGN_UP(min_addr, hps);
 
     for (int i = 0; i < g->nregions; i++) {
         /* Skip regions entirely before the current search position */
@@ -127,10 +257,8 @@ static uint64_t find_free_gap_inner(const guest_t *g,
             g->regions[i].start >= gap_start + length)
             return gap_start;
 
-        /* Region overlaps; advance past it */
-        gap_start = g->regions[i].end;
-        /* Page-align the next candidate position */
-        gap_start = PAGE_ALIGN_UP(gap_start);
+        /* Region overlaps; advance past it and round to the next host page */
+        gap_start = ALIGN_UP(g->regions[i].end, hps);
     }
 
     /* Check trailing space after all regions */
@@ -153,12 +281,20 @@ static uint64_t find_free_gap(guest_t *g,
     /* RX and RW mappings advance independently, so keep separate hints. */
     uint64_t *hint =
         (min_addr < MMAP_BASE) ? &g->mmap_rx_gap_hint : &g->mmap_rw_gap_hint;
+    /* Advance the hint to the next host-page boundary so the following
+     * sequential allocation lands on an address that the kernel accepts
+     * for mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The
+     * tradeoff is up to host_page-1 bytes of address-space waste per small
+     * allocation; physical pages are still demand-paged, so RAM cost is
+     * unchanged.
+     */
+    size_t hps = host_page_size_cached();
 
     /* Try cached hint first (only if within the valid range) */
     if (*hint >= min_addr && *hint < max_addr) {
         uint64_t result = find_free_gap_inner(g, length, *hint, max_addr);
         if (result != UINT64_MAX) {
-            *hint = result + length;
+            *hint = ALIGN_UP(result + length, hps);
             return result;
         }
     }
@@ -166,7 +302,7 @@ static uint64_t find_free_gap(guest_t *g,
     /* Full scan from base */
     uint64_t result = find_free_gap_inner(g, length, min_addr, max_addr);
     if (result != UINT64_MAX)
-        *hint = result + length;
+        *hint = ALIGN_UP(result + length, hps);
     return result;
 }
 
@@ -204,6 +340,652 @@ static int mremap_extend_range(guest_t *g,
     return 0;
 }
 
+static int hvf_apply_file_overlay(guest_t *g,
+                                  uint64_t ipa,
+                                  uint64_t len,
+                                  int fd,
+                                  off_t file_off);
+static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len);
+
+static int read_file_range_to_guest(guest_t *g,
+                                    uint64_t guest_off,
+                                    int fd,
+                                    uint64_t file_off,
+                                    uint64_t len)
+{
+    uint8_t *dst = (uint8_t *) g->host_base + guest_off;
+    size_t remaining = len;
+
+    while (remaining > 0) {
+        ssize_t nr = pread(fd, dst, remaining, (off_t) file_off);
+        if (nr < 0) {
+            if (errno == EINTR)
+                continue;
+            return linux_errno();
+        }
+        if (nr == 0)
+            break;
+        dst += nr;
+        remaining -= (size_t) nr;
+        file_off += (uint64_t) nr;
+    }
+
+    return 0;
+}
+
+static int restore_file_overlay_range(guest_t *g,
+                                      uint64_t start,
+                                      uint64_t end,
+                                      uint64_t overlay_start,
+                                      uint64_t overlay_end,
+                                      int fd,
+                                      uint64_t file_off)
+{
+    int err = hvf_apply_file_overlay(
+        g, overlay_start, overlay_end - overlay_start, fd, (off_t) file_off);
+    if (err < 0)
+        return err;
+    mark_overlay_metadata_range(g, start, end, overlay_start, overlay_end);
+    return 0;
+}
+
+typedef struct {
+    uint64_t start;
+    uint64_t end;
+    int prot;
+    int flags;
+    uint64_t offset;
+    int backing_fd;
+    bool overlay_active;
+    uint64_t overlay_start;
+    uint64_t overlay_end;
+    char name[sizeof(((guest_region_t *) 0)->name)];
+} region_snapshot_t;
+
+static void close_region_snapshots(region_snapshot_t *snaps, int n)
+{
+    for (int i = 0; i < n; i++) {
+        if (snaps[i].backing_fd >= 0) {
+            close(snaps[i].backing_fd);
+            snaps[i].backing_fd = -1;
+        }
+    }
+}
+
+/* Close any open dup'd backing fds in *snaps_ptr, free the heap buffer,
+ * and zero out the caller's pointer/count so a follow-on call is a no-op.
+ * Used for buffers allocated via malloc by sys_mmap and sys_mremap; the
+ * stack-allocated callers in capture_region_snapshots itself keep using
+ * close_region_snapshots directly.
+ */
+static void dispose_region_snapshots(region_snapshot_t **snaps_ptr, int *n_ptr)
+{
+    if (snaps_ptr && *snaps_ptr) {
+        close_region_snapshots(*snaps_ptr, n_ptr ? *n_ptr : 0);
+        free(*snaps_ptr);
+        *snaps_ptr = NULL;
+    }
+    if (n_ptr)
+        *n_ptr = 0;
+}
+
+static int capture_region_snapshots(guest_t *g,
+                                    uint64_t start,
+                                    uint64_t end,
+                                    region_snapshot_t *snaps,
+                                    int max_snaps)
+{
+    split_regions_at_boundary(g, start);
+    split_regions_at_boundary(g, end);
+
+    int n = 0;
+    for (int i = 0; i < g->nregions; i++) {
+        const guest_region_t *r = &g->regions[i];
+        if (r->start >= end)
+            break;
+        if (r->end <= start)
+            continue;
+        if (n >= max_snaps) {
+            close_region_snapshots(snaps, n);
+            return -LINUX_ENOMEM;
+        }
+
+        region_snapshot_t *snap = &snaps[n++];
+        snap->start = r->start;
+        snap->end = r->end;
+        snap->prot = r->prot;
+        snap->flags = r->flags;
+        snap->offset = r->offset;
+        snap->backing_fd = -1;
+        if (r->backing_fd >= 0) {
+            snap->backing_fd = dup(r->backing_fd);
+            if (snap->backing_fd < 0) {
+                close_region_snapshots(snaps, n);
+                return -LINUX_ENOMEM;
+            }
+        }
+        snap->overlay_active = r->overlay_active;
+        snap->overlay_start = r->overlay_start;
+        snap->overlay_end = r->overlay_end;
+        str_copy_trunc(snap->name, r->name, sizeof(snap->name));
+    }
+
+    return n;
+}
+
+static int restore_snapshot_overlays_in_place(guest_t *g,
+                                              const region_snapshot_t *snaps,
+                                              int n)
+{
+    for (int i = 0; i < n; i++) {
+        const region_snapshot_t *snap = &snaps[i];
+        if (!snap->overlay_active || snap->backing_fd < 0)
+            continue;
+
+        bool first = true;
+        uint64_t snap_file_off =
+            snap->offset + (snap->overlay_start - snap->start);
+        for (int j = 0; j < i; j++) {
+            const region_snapshot_t *prev = &snaps[j];
+            if (!prev->overlay_active || prev->backing_fd < 0)
+                continue;
+            uint64_t prev_file_off =
+                prev->offset + (prev->overlay_start - prev->start);
+            if (prev->overlay_start == snap->overlay_start &&
+                prev->overlay_end == snap->overlay_end &&
+                prev_file_off == snap_file_off) {
+                first = false;
+                break;
+            }
+        }
+
+        if (first) {
+            int err = restore_file_overlay_range(
+                g, snap->start, snap->end, snap->overlay_start,
+                snap->overlay_end, snap->backing_fd, snap_file_off);
+            if (err < 0)
+                return err;
+            continue;
+        }
+
+        mark_overlay_metadata_range(g, snap->start, snap->end,
+                                    snap->overlay_start, snap->overlay_end);
+    }
+
+    return 0;
+}
+
+static bool snapshot_has_materialized_ptes(const region_snapshot_t *snap)
+{
+    return snap->prot != LINUX_PROT_NONE &&
+           (snap->flags & LINUX_MAP_NORESERVE) == 0;
+}
+
+static int restore_snapshot_page_tables(guest_t *g,
+                                        uint64_t start,
+                                        uint64_t end,
+                                        const region_snapshot_t *snaps,
+                                        int n)
+{
+    if (guest_invalidate_ptes(g, start, end) < 0)
+        return -LINUX_ENOMEM;
+
+    for (int i = 0; i < n; i++) {
+        const region_snapshot_t *snap = &snaps[i];
+        if (!snapshot_has_materialized_ptes(snap))
+            continue;
+
+        int page_perms = prot_to_perms(snap->prot);
+        uint64_t ext_start = ALIGN_DOWN(snap->start, BLOCK_2MIB);
+        uint64_t ext_end = ALIGN_UP(snap->end, BLOCK_2MIB);
+        if (ext_end > g->guest_size)
+            ext_end = g->guest_size;
+
+        if (guest_extend_page_tables(g, ext_start, ext_end, page_perms) < 0)
+            return -LINUX_ENOMEM;
+        guest_update_perms(g, snap->start, snap->end, page_perms);
+    }
+
+    /* guest_extend_page_tables() repopulates whole 2 MiB blocks, so clear
+     * holes and deferred mappings again after all snapshot ranges are back.
+     */
+    uint64_t cursor = start;
+    for (int i = 0; i < n; i++) {
+        const region_snapshot_t *snap = &snaps[i];
+        if (cursor < snap->start &&
+            guest_invalidate_ptes(g, cursor, snap->start) < 0)
+            return -LINUX_ENOMEM;
+        if (!snapshot_has_materialized_ptes(snap) &&
+            guest_invalidate_ptes(g, snap->start, snap->end) < 0)
+            return -LINUX_ENOMEM;
+        cursor = snap->end;
+    }
+    if (cursor < end && guest_invalidate_ptes(g, cursor, end) < 0)
+        return -LINUX_ENOMEM;
+
+    return 0;
+}
+
+static int restore_region_snapshots(guest_t *g, region_snapshot_t *snaps, int n)
+{
+    for (int i = 0; i < n; i++) {
+        region_snapshot_t *snap = &snaps[i];
+        if (guest_region_add_ex_owned(g, snap->start, snap->end, snap->prot,
+                                      snap->flags, snap->offset,
+                                      snap->name[0] ? snap->name : NULL,
+                                      snap->backing_fd) < 0) {
+            snap->backing_fd = -1;
+            close_region_snapshots(snaps, n);
+            return -LINUX_ENOMEM;
+        }
+        snap->backing_fd = -1;
+    }
+
+    for (int i = 0; i < n; i++) {
+        const region_snapshot_t *snap = &snaps[i];
+        if (!snap->overlay_active)
+            continue;
+
+        bool first = true;
+        uint64_t snap_file_off =
+            snap->offset + (snap->overlay_start - snap->start);
+        for (int j = 0; j < i; j++) {
+            const region_snapshot_t *prev = &snaps[j];
+            if (!prev->overlay_active)
+                continue;
+            uint64_t prev_file_off =
+                prev->offset + (prev->overlay_start - prev->start);
+            if (prev->overlay_start == snap->overlay_start &&
+                prev->overlay_end == snap->overlay_end &&
+                prev_file_off == snap_file_off) {
+                first = false;
+                break;
+            }
+        }
+
+        if (first) {
+            const guest_region_t *r = guest_region_find(g, snap->start);
+            if (!r || r->backing_fd < 0)
+                return -LINUX_EFAULT;
+            int err = restore_file_overlay_range(
+                g, snap->start, snap->end, snap->overlay_start,
+                snap->overlay_end, r->backing_fd, snap_file_off);
+            if (err < 0)
+                return err;
+            continue;
+        }
+
+        mark_overlay_metadata_range(g, snap->start, snap->end,
+                                    snap->overlay_start, snap->overlay_end);
+    }
+
+    return 0;
+}
+
+static int rollback_fresh_mmap_allocation(guest_t *g,
+                                          uint64_t start,
+                                          uint64_t length,
+                                          bool overlay_installed,
+                                          uint64_t overlay_ipa,
+                                          uint64_t overlay_len,
+                                          uint64_t saved_mmap_next,
+                                          uint64_t saved_mmap_end,
+                                          uint64_t saved_mmap_rx_next,
+                                          uint64_t saved_mmap_rx_end,
+                                          uint64_t saved_rw_gap_hint,
+                                          uint64_t saved_rx_gap_hint)
+{
+    if (overlay_installed)
+        hvf_remove_file_overlay(g, overlay_ipa, overlay_len);
+    if (guest_invalidate_ptes(g, start, start + length) < 0)
+        return -LINUX_ENOMEM;
+    g->need_tlbi = true;
+    g->mmap_next = saved_mmap_next;
+    g->mmap_end = saved_mmap_end;
+    g->mmap_rx_next = saved_mmap_rx_next;
+    g->mmap_rx_end = saved_mmap_rx_end;
+    g->mmap_rw_gap_hint = saved_rw_gap_hint;
+    g->mmap_rx_gap_hint = saved_rx_gap_hint;
+    return 0;
+}
+
+/* HVF stage-2 segment management.
+ *
+ * The slab is mapped to HVF in 2 MiB-aligned segments tracked by
+ * g->segments[]. Initially the slab is one segment (set up by guest_init).
+ * MAP_SHARED file-backed mmap may need to overlay a sub-range of the slab
+ * with a real host mmap MAP_FIXED|MAP_SHARED of the file fd. HVF caches
+ * the host VA->PA mapping at hv_vm_map time and a plain MAP_FIXED overlay
+ * does not refresh it (see comment in src/runtime/forkipc.c near line 940
+ * for the empirical evidence). To force HVF to re-walk the host page
+ * tables after the overlay, the affected segment is hv_vm_unmap'd, the
+ * file is mmap'd MAP_FIXED|MAP_SHARED into its host VA, and the segment
+ * is hv_vm_map'd again.
+ *
+ * HVF rejects sub-range hv_vm_unmap of a larger map (HV_BAD_ARGUMENT).
+ * Therefore, before applying the first overlay inside a large segment,
+ * the segment is split into 2 MiB-aligned pieces around the affected
+ * range so each piece is independently unmappable.
+ */
+
+/* HVF flags applied to slab segments. The slab is mapped RWX so guest
+ * stage-1 page tables retain full control over per-page permissions
+ * (W^X is enforced by the guest's L2/L3 entries, not stage-2). File
+ * overlay segments use the same RWX flags so PROT_EXEC mmaps still
+ * work; the host file mmap is created PROT_READ|PROT_WRITE so HVF
+ * never asks the host kernel for execute permission on the file pages.
+ */
+#define HVF_SEGMENT_FLAGS (HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC)
+
+/* Find the index of the segment containing ipa, or -1 if none. */
+static int hvf_segment_find(const guest_t *g, uint64_t ipa)
+{
+    int lo = 0, hi = g->n_segments - 1;
+    while (lo <= hi) {
+        int mid = (lo + hi) / 2;
+        const hvf_segment_t *s = &g->segments[mid];
+        if (ipa >= s->ipa && ipa < s->ipa + s->len)
+            return mid;
+        if (ipa < s->ipa)
+            hi = mid - 1;
+        else
+            lo = mid + 1;
+    }
+    return -1;
+}
+
+/* Restore the slab backing for [ipa, ipa+len) in the host VA. Used to
+ * undo a previous file overlay. Maps shm_fd MAP_SHARED if the slab is
+ * shm-backed (so subsequent fork CoW snapshots see consistent content),
+ * otherwise MAP_ANON|MAP_PRIVATE. The IPA is unmapped from the guest's
+ * perspective by the caller (page tables invalidated, region removed),
+ * so the content of the restored backing is not directly observable
+ * to the guest until a subsequent mmap targets the same IPA.
+ *
+ * The caller must ensure no HVF segment currently covers [ipa, ipa+len).
+ * Returns 0 on success, -errno on failure.
+ */
+static int hvf_restore_slab_backing(guest_t *g, uint64_t ipa, uint64_t len)
+{
+    void *target = (uint8_t *) g->host_base + ipa;
+    void *p;
+    if (g->shm_fd >= 0) {
+        p = mmap(target, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
+                 g->shm_fd, (off_t) ipa);
+    } else {
+        p = mmap(target, len, PROT_READ | PROT_WRITE,
+                 MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+    }
+    if (p == MAP_FAILED)
+        return -linux_errno();
+    return 0;
+}
+
+/* Split the segment that exactly contains [aligned_start, aligned_end) so
+ * that the middle range becomes its own segment. The caller MUST have
+ * quiesced sibling vCPUs before calling so HVF's brief unmap window does
+ * not race with concurrent guest accesses through stage-2.
+ *
+ * Up to two new segments may be inserted on either side. If the segment
+ * already exactly matches the requested bounds, this is a no-op.
+ *
+ * Both bounds must be 2 MiB-aligned. Returns 0 on success, -errno on
+ * failure.
+ */
+static int hvf_segment_split(guest_t *g,
+                             uint64_t aligned_start,
+                             uint64_t aligned_end)
+{
+    int idx = hvf_segment_find(g, aligned_start);
+    if (idx < 0)
+        return -LINUX_EFAULT;
+    hvf_segment_t orig = g->segments[idx];
+    if (aligned_end > orig.ipa + orig.len)
+        return -LINUX_EFAULT;
+    if (aligned_start == orig.ipa && aligned_end == orig.ipa + orig.len)
+        return 0;
+
+    hvf_segment_t pieces[3];
+    int n_pieces = 0;
+    if (aligned_start > orig.ipa)
+        pieces[n_pieces++] =
+            (hvf_segment_t) {.ipa = orig.ipa, .len = aligned_start - orig.ipa};
+    pieces[n_pieces++] = (hvf_segment_t) {.ipa = aligned_start,
+                                          .len = aligned_end - aligned_start};
+    if (aligned_end < orig.ipa + orig.len)
+        pieces[n_pieces++] = (hvf_segment_t) {
+            .ipa = aligned_end, .len = orig.ipa + orig.len - aligned_end};
+
+    if (g->n_segments + n_pieces - 1 > GUEST_MAX_HVF_SEGMENTS)
+        return -LINUX_ENOMEM;
+
+    if (hv_vm_unmap(orig.ipa, orig.len) != HV_SUCCESS)
+        return -LINUX_EIO;
+
+    for (int i = 0; i < n_pieces; i++) {
+        void *host_va = (uint8_t *) g->host_base + pieces[i].ipa;
+        if (hv_vm_map(host_va, pieces[i].ipa, pieces[i].len,
+                      HVF_SEGMENT_FLAGS) != HV_SUCCESS) {
+            /* Best-effort recovery: tear down whatever pieces we already
+             * mapped (HVF would reject hv_vm_map(orig) as overlapping if we
+             * left them in place) and re-map the original segment. Sibling
+             * vCPUs are quiesced so they cannot observe the gap. If the
+             * final remap also fails the IPA range stays without stage-2
+             * entries and the guest will fault on access; log the
+             * unrecoverable state so post-mortem points at the right
+             * culprit instead of the unrelated downstream fault.
+             */
+            for (int j = 0; j < i; j++)
+                hv_vm_unmap(pieces[j].ipa, pieces[j].len);
+            hv_return_t r = hv_vm_map((uint8_t *) g->host_base + orig.ipa,
+                                      orig.ipa, orig.len, HVF_SEGMENT_FLAGS);
+            if (r != HV_SUCCESS)
+                log_error(
+                    "hvf_segment_split: recovery hv_vm_map(0x%llx, 0x%llx) "
+                    "failed with 0x%x; IPA range left without stage-2 "
+                    "entries",
+                    (unsigned long long) orig.ipa,
+                    (unsigned long long) orig.len, (int) r);
+            return -LINUX_EIO;
+        }
+    }
+
+    /* Replace orig with pieces in the segment array */
+    int tail = g->n_segments - idx - 1;
+    memmove(&g->segments[idx + n_pieces], &g->segments[idx + 1],
+            (size_t) tail * sizeof(hvf_segment_t));
+    for (int i = 0; i < n_pieces; i++)
+        g->segments[idx + i] = pieces[i];
+    g->n_segments += n_pieces - 1;
+    return 0;
+}
+
+/* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd,
+ * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB
+ * segment is split out first if it is not already isolated. Caller
+ * holds mmap_lock and has not quiesced siblings yet. The function
+ * quiesces siblings around the unmap+remap window so concurrent vCPUs
+ * cannot fault on the temporarily-unmapped IPA range.
+ */
+static int hvf_apply_file_overlay(guest_t *g,
+                                  uint64_t ipa,
+                                  uint64_t len,
+                                  int fd,
+                                  off_t file_off)
+{
+    uint64_t aligned_start = ALIGN_2MIB_DOWN(ipa);
+    uint64_t aligned_end = ALIGN_2MIB_UP(ipa + len);
+
+    thread_quiesce_siblings();
+
+    int err = hvf_segment_split(g, aligned_start, aligned_end);
+    if (err < 0) {
+        thread_resume_siblings();
+        return err;
+    }
+
+    int idx = hvf_segment_find(g, aligned_start);
+    if (idx < 0 || g->segments[idx].ipa != aligned_start ||
+        g->segments[idx].len != aligned_end - aligned_start) {
+        thread_resume_siblings();
+        return -LINUX_EFAULT;
+    }
+    hvf_segment_t seg = g->segments[idx];
+
+    if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS) {
+        thread_resume_siblings();
+        return -LINUX_EIO;
+    }
+
+    void *target = (uint8_t *) g->host_base + ipa;
+    void *p = mmap(target, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
+                   fd, file_off);
+    if (p == MAP_FAILED) {
+        int saved = linux_errno();
+        /* The overlay failed; restore the segment to slab backing so the
+         * host VA range stays consistent. The host VA was untouched by
+         * the failed mmap, so nothing else to undo.
+         */
+        hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len,
+                  HVF_SEGMENT_FLAGS);
+        thread_resume_siblings();
+        return saved < 0 ? saved : -saved;
+    }
+
+    if (hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len,
+                  HVF_SEGMENT_FLAGS) != HV_SUCCESS) {
+        /* Restore slab backing so the host VA stops referencing the
+         * caller's file fd (which they expect to take back), then
+         * re-issue hv_vm_map so the IPA range is not left without
+         * stage-2 entries. Without the second hv_vm_map, sibling vCPUs
+         * would page-fault on this IPA after thread_resume_siblings
+         * with no chance of recovery short of process exit.
+         */
+        hvf_restore_slab_backing(g, ipa, len);
+        hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len,
+                  HVF_SEGMENT_FLAGS);
+        thread_resume_siblings();
+        return -LINUX_EIO;
+    }
+
+    thread_resume_siblings();
+    return 0;
+}
+
+/* Undo a file overlay at [ipa, ipa+len) by restoring the slab backing
+ * and refreshing the containing HVF segment. Caller holds mmap_lock.
+ * Sibling vCPUs are quiesced around the brief unmap window.
+ */
+static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len)
+{
+    int idx = hvf_segment_find(g, ipa);
+    if (idx < 0)
+        return -LINUX_EFAULT;
+    hvf_segment_t seg = g->segments[idx];
+
+    thread_quiesce_siblings();
+
+    if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS) {
+        thread_resume_siblings();
+        return -LINUX_EIO;
+    }
+
+    int err = hvf_restore_slab_backing(g, ipa, len);
+    if (err < 0) {
+        /* Best-effort: re-establish the segment with whatever the host VA
+         * currently has (still the file overlay) so the guest can see
+         * something rather than nothing.
+         */
+        hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len,
+                  HVF_SEGMENT_FLAGS);
+        thread_resume_siblings();
+        return err;
+    }
+
+    if (hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len,
+                  HVF_SEGMENT_FLAGS) != HV_SUCCESS) {
+        thread_resume_siblings();
+        return -LINUX_EIO;
+    }
+
+    thread_resume_siblings();
+    return 0;
+}
+
+/* Walk semantic regions in [start, end) and undo any active MAP_SHARED file
+ * overlays on the underlying host VA. Used before sys_mmap MAP_FIXED replaces
+ * a previously-overlaid range with a new mapping (anonymous or different
+ * file): without restoring the slab backing first, stale file pages would
+ * leak into the new mapping. Returns 0 on success, -errno on failure;
+ * region overlay metadata is cleared only for ranges where the underlying
+ * host-VA overlay was successfully torn down so a partial failure does not
+ * leave the runtime believing an overlay is gone while the file mmap is
+ * still live (which would cause a later memset to write into the file).
+ * Caller holds mmap_lock.
+ */
+static int cleanup_overlays_in_range(guest_t *g, uint64_t start, uint64_t end)
+{
+    size_t hps = host_page_size_cached();
+    uint64_t host_start = ALIGN_DOWN(start, hps);
+    uint64_t host_end = ALIGN_UP(end, hps);
+
+    split_regions_at_boundary(g, host_start);
+    split_regions_at_boundary(g, host_end);
+
+    /* Snapshot affected ranges first; the host-side mmap calls below do not
+     * touch the region array, but a future caller invariant is to allow
+     * this loop to mutate metadata only after the unmap-and-restore dance
+     * succeeds. The bounded buffer keeps the function stack-only.
+     */
+    struct {
+        uint64_t off, len;
+    } overlays[GUEST_MAX_REGIONS];
+    int n = 0;
+    for (int i = 0; i < g->nregions && n < GUEST_MAX_REGIONS; i++) {
+        guest_region_t *r = &g->regions[i];
+        if (r->start >= host_end)
+            break;
+        if (r->end <= host_start)
+            continue;
+        if (!region_has_live_overlay(r))
+            continue;
+        uint64_t s =
+            r->overlay_start > host_start ? r->overlay_start : host_start;
+        uint64_t e = r->overlay_end < host_end ? r->overlay_end : host_end;
+        if (e <= s)
+            continue;
+        bool seen = false;
+        for (int j = 0; j < n; j++) {
+            if (overlays[j].off == s && overlays[j].len == e - s) {
+                seen = true;
+                break;
+            }
+        }
+        if (!seen) {
+            overlays[n].off = s;
+            overlays[n].len = e - s;
+            n++;
+        }
+    }
+    int err = 0;
+    for (int i = 0; i < n; i++) {
+        int rc = hvf_remove_file_overlay(g, overlays[i].off, overlays[i].len);
+        if (rc < 0) {
+            /* Stop on first failure; leave overlay_active set on regions
+             * we could not tear down so subsequent operations still see a
+             * live overlay there and route through the overlay-aware
+             * paths.
+             */
+            if (!err)
+                err = rc;
+            break;
+        }
+        clear_overlay_metadata_range(g, overlays[i].off,
+                                     overlays[i].off + overlays[i].len);
+    }
+    return err;
+}
+
 /* Memory syscalls (tightly coupled to guest.h). */
 
 int64_t sys_brk(guest_t *g, uint64_t addr)
@@ -290,6 +1072,31 @@ int64_t sys_mmap(guest_t *g,
     bool is_noreserve = is_anon && (flags & LINUX_MAP_NORESERVE) != 0;
     host_fd_ref_t backing_ref = {.fd = -1, .owned = 0};
     int host_backing_fd = -1, track_backing_fd = -1;
+    /* Tracks whether hvf_apply_file_overlay has installed a host
+     * MAP_FIXED|MAP_SHARED mapping that the failure paths must undo if
+     * later steps (page tables, region tracking) fall over. Without this,
+     * a partial-success rollback leaves the file mmap'd at host_base+ipa
+     * with no region tracking, and the next operation in that range would
+     * memset zeros directly into the user's file.
+     */
+    bool overlay_installed = false;
+    uint64_t overlay_ipa = 0;
+    uint64_t overlay_len = 0;
+    uint64_t saved_mmap_next = g->mmap_next;
+    uint64_t saved_mmap_end = g->mmap_end;
+    uint64_t saved_mmap_rx_next = g->mmap_rx_next;
+    uint64_t saved_mmap_rx_end = g->mmap_rx_end;
+    uint64_t saved_rw_gap_hint = g->mmap_rw_gap_hint;
+    uint64_t saved_rx_gap_hint = g->mmap_rx_gap_hint;
+    /* Heap-allocated to avoid blowing the ~512 KiB default stack on macOS
+     * worker threads: GUEST_MAX_REGIONS * sizeof(region_snapshot_t) is on
+     * the order of half a megabyte. Allocated lazily inside the FIXED
+     * path that actually consumes it; non-FIXED mmaps never touch this
+     * pointer. Always free()'d (free(NULL) is a no-op) before return.
+     */
+    region_snapshot_t *replaced_snaps = NULL;
+    int replaced_nsnaps = 0;
+    bool replaced_regions_removed = false;
     int track_flags =
         ((flags & LINUX_MAP_SHARED) ? LINUX_MAP_SHARED : LINUX_MAP_PRIVATE);
     if (is_anon)
@@ -388,9 +1195,23 @@ int64_t sys_mmap(guest_t *g,
             host_fd_ref_close(&backing_ref);
             return -LINUX_ENOMEM;
         }
+        replaced_snaps = malloc(GUEST_MAX_REGIONS * sizeof(*replaced_snaps));
+        if (!replaced_snaps) {
+            host_fd_ref_close(&backing_ref);
+            return -LINUX_ENOMEM;
+        }
+        replaced_nsnaps =
+            capture_region_snapshots(g, result_off, result_off + length,
+                                     replaced_snaps, GUEST_MAX_REGIONS);
+        if (replaced_nsnaps < 0) {
+            free(replaced_snaps);
+            host_fd_ref_close(&backing_ref);
+            return replaced_nsnaps;
+        }
         if (!is_anon) {
             track_backing_fd = dup(host_backing_fd);
             if (track_backing_fd < 0) {
+                dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps);
                 host_fd_ref_close(&backing_ref);
                 return -LINUX_ENOMEM;
             }
@@ -402,6 +1223,7 @@ int64_t sys_mmap(guest_t *g,
                 } while (nr < 0 && errno == EINTR);
                 if (nr < 0) {
                     close(track_backing_fd);
+                    dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps);
                     host_fd_ref_close(&backing_ref);
                     return linux_errno();
                 }
@@ -424,10 +1246,30 @@ int64_t sys_mmap(guest_t *g,
             if (ext_end > g->guest_size)
                 ext_end = g->guest_size;
 
+            /* Restore slab backing under any pre-existing MAP_SHARED file
+             * overlay in the replaced range. Without this, stale file pages
+             * leak into the new mapping. Must run before guest_region_remove
+             * because the cleanup walker reads the live region metadata.
+             */
+            int cleanup_err =
+                cleanup_overlays_in_range(g, result_off, result_off + length);
+            if (cleanup_err < 0) {
+                (void) restore_snapshot_overlays_in_place(g, replaced_snaps,
+                                                          replaced_nsnaps);
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps);
+                host_fd_ref_close(&backing_ref);
+                return cleanup_err;
+            }
+
             if (guest_extend_page_tables(g, ext_start, ext_end, page_perms) <
                 0) {
+                (void) restore_snapshot_overlays_in_place(g, replaced_snaps,
+                                                          replaced_nsnaps);
                 if (track_backing_fd >= 0)
                     close(track_backing_fd);
+                dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps);
                 host_fd_ref_close(&backing_ref);
                 return -LINUX_ENOMEM;
             }
@@ -436,6 +1278,7 @@ int64_t sys_mmap(guest_t *g,
              * succeeds.
              */
             guest_region_remove(g, result_off, result_off + length);
+            replaced_regions_removed = true;
 
             /* Fine-tune permissions for the exact range. Handles L3
              * splitting when MAP_FIXED overlays different permissions
@@ -445,15 +1288,46 @@ int64_t sys_mmap(guest_t *g,
 
             /* For MAP_ANONYMOUS: zero the region (host memory may contain
              * stale data from earlier mappings).
-             * For file-backed: read file contents into guest memory.
-             * Short reads leave the remainder zeroed (memset first).
+             * For MAP_SHARED + regular file: install a real host mmap
+             * MAP_FIXED|MAP_SHARED overlay so the guest sees live host
+             * writes and its own writes hit the file directly.
+             * For MAP_PRIVATE file-backed: read file contents into guest
+             * memory; private writes stay in the slab.  Short reads leave
+             * the remainder zeroed (memset first).
              */
             if (is_anon) {
                 memset((uint8_t *) g->host_base + result_off, 0, length);
+            } else if (fd >= 0 && (flags & LINUX_MAP_SHARED) &&
+                       (result_off % host_page_size_cached() == 0) &&
+                       ((uint64_t) offset % host_page_size_cached() == 0)) {
+                uint64_t fixed_overlay_len =
+                    ALIGN_UP(length, host_page_size_cached());
+                int oerr =
+                    hvf_apply_file_overlay(g, result_off, fixed_overlay_len,
+                                           host_backing_fd, (off_t) offset);
+                if (oerr < 0) {
+                    int restore_err = restore_region_snapshots(
+                        g, replaced_snaps, replaced_nsnaps);
+                    if (restore_err == 0)
+                        restore_err = restore_snapshot_page_tables(
+                            g, result_off, result_off + length, replaced_snaps,
+                            replaced_nsnaps);
+                    if (track_backing_fd >= 0)
+                        close(track_backing_fd);
+                    if (restore_err < 0) {
+                        dispose_region_snapshots(&replaced_snaps,
+                                                 &replaced_nsnaps);
+                        host_fd_ref_close(&backing_ref);
+                        return restore_err;
+                    }
+                    dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps);
+                    host_fd_ref_close(&backing_ref);
+                    return oerr;
+                }
+                overlay_installed = true;
+                overlay_ipa = result_off;
+                overlay_len = fixed_overlay_len;
             } else if (fd >= 0) {
-                /* Zero first, then overlay with file data. This matches
-                 * Linux MAP_FIXED semantics: pages beyond EOF are zeroed.
-                 */
                 memset((uint8_t *) g->host_base + result_off, 0, length);
                 uint8_t *dst = (uint8_t *) g->host_base + result_off;
                 size_t remaining = length;
@@ -474,8 +1348,24 @@ int64_t sys_mmap(guest_t *g,
                 }
             }
         } else {
+            /* Restore slab backing under any pre-existing MAP_SHARED file
+             * overlay before dropping the region tracking.
+             */
+            int cleanup_err =
+                cleanup_overlays_in_range(g, result_off, result_off + length);
+            if (cleanup_err < 0) {
+                (void) restore_snapshot_overlays_in_place(g, replaced_snaps,
+                                                          replaced_nsnaps);
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps);
+                host_fd_ref_close(&backing_ref);
+                return cleanup_err;
+            }
+
             /* Remove any existing region coverage in the fixed range. */
             guest_region_remove(g, result_off, result_off + length);
+            replaced_regions_removed = true;
 
             /* PROT_NONE with MAP_FIXED: invalidate existing page table
              * entries so the region becomes truly inaccessible. Without
@@ -650,37 +1540,77 @@ int64_t sys_mmap(guest_t *g,
         g->need_tlbi = true;
     }
 
-    /* For file-backed mmap, read file contents into the region.
-     * Short reads are acceptable (region is already zeroed above),
-     * but total failure means the mapping is useless.
-     * Skip for PROT_NONE: the region has no page table entries yet;
-     * data is faulted in when mprotect makes the pages accessible.
+    /* For file-backed mmap, populate the region with file contents.
+     * MAP_SHARED installs a real host mmap MAP_FIXED|MAP_SHARED overlay so
+     * guest reads observe concurrent host writes and guest writes hit the
+     * file directly.  MAP_PRIVATE pread-snapshots into private guest pages
+     * so writes stay local.  Skip for PROT_NONE: the region has no page
+     * table entries yet; data is faulted in when mprotect makes the pages
+     * accessible.
      */
     if (!is_anon && fd >= 0 && !is_prot_none) {
-        uint8_t *dst = (uint8_t *) g->host_base + result_off;
-        size_t remaining = length;
-        off_t file_off = offset;
-        bool read_err = false;
-        while (remaining > 0) {
-            ssize_t nr = pread(host_backing_fd, dst, remaining, file_off);
-            if (nr < 0) {
-                if (errno == EINTR)
-                    continue;
-                read_err = true;
-                break;
+        size_t hps = host_page_size_cached();
+        /* mmap rounds length up to the host page size internally; only
+         * addr and offset alignment matter for MAP_FIXED on macOS Apple
+         * Silicon (16 KiB host pages). The "extra" trailing bytes inside
+         * the host page are never reachable by the guest because the
+         * gap-finder advances the hint to the next host-page boundary
+         * after each allocation.
+         */
+        bool overlay_aligned = (flags & LINUX_MAP_SHARED) &&
+                               (result_off % hps == 0) &&
+                               ((uint64_t) offset % hps == 0);
+        if (overlay_aligned) {
+            uint64_t nf_overlay_len = ALIGN_UP(length, hps);
+            int oerr = hvf_apply_file_overlay(g, result_off, nf_overlay_len,
+                                              host_backing_fd, (off_t) offset);
+            if (oerr < 0) {
+                int rollback_err = rollback_fresh_mmap_allocation(
+                    g, result_off, length, false, 0, 0, saved_mmap_next,
+                    saved_mmap_end, saved_mmap_rx_next, saved_mmap_rx_end,
+                    saved_rw_gap_hint, saved_rx_gap_hint);
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                host_fd_ref_close(&backing_ref);
+                if (rollback_err < 0)
+                    return rollback_err;
+                return oerr;
+            }
+            overlay_installed = true;
+            overlay_ipa = result_off;
+            overlay_len = nf_overlay_len;
+        } else {
+            uint8_t *dst = (uint8_t *) g->host_base + result_off;
+            size_t remaining = length;
+            off_t file_off = offset;
+            bool read_err = false;
+            while (remaining > 0) {
+                ssize_t nr = pread(host_backing_fd, dst, remaining, file_off);
+                if (nr < 0) {
+                    if (errno == EINTR)
+                        continue;
+                    read_err = true;
+                    break;
+                }
+                if (nr == 0)
+                    break; /* EOF; remaining pages stay zeroed */
+                dst += nr;
+                remaining -= (size_t) nr;
+                file_off += nr;
+            }
+            if (read_err && remaining == length) {
+                /* Total failure (no bytes read). Undo the mapping. */
+                int rollback_err = rollback_fresh_mmap_allocation(
+                    g, result_off, length, false, 0, 0, saved_mmap_next,
+                    saved_mmap_end, saved_mmap_rx_next, saved_mmap_rx_end,
+                    saved_rw_gap_hint, saved_rx_gap_hint);
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                host_fd_ref_close(&backing_ref);
+                if (rollback_err < 0)
+                    return rollback_err;
+                return linux_errno();
             }
-            if (nr == 0)
-                break; /* EOF; remaining pages stay zeroed */
-            dst += nr;
-            remaining -= (size_t) nr;
-            file_off += nr;
-        }
-        if (read_err && remaining == length) {
-            /* Total failure (no bytes read). Undo the mapping. */
-            if (track_backing_fd >= 0)
-                close(track_backing_fd);
-            host_fd_ref_close(&backing_ref);
-            return linux_errno();
         }
     }
 
@@ -690,11 +1620,59 @@ int64_t sys_mmap(guest_t *g,
     if (guest_region_add_ex_owned(g, result_off, result_off + length, prot,
                                   track_flags, is_anon ? 0 : (uint64_t) offset,
                                   NULL, track_backing_fd) < 0) {
+        /* Region table was full: undo any host overlay we just installed
+         * so the file is not left mmap'd at host_base+ipa with no
+         * tracking. Without this, a later operation in that range would
+         * memset zeros directly into the user's file via the leaked
+         * overlay.
+         */
+        int rollback_err = 0;
+        if (replaced_regions_removed) {
+            if (overlay_installed)
+                hvf_remove_file_overlay(g, overlay_ipa, overlay_len);
+            rollback_err =
+                restore_region_snapshots(g, replaced_snaps, replaced_nsnaps);
+            if (rollback_err == 0)
+                rollback_err = restore_snapshot_page_tables(
+                    g, result_off, result_off + length, replaced_snaps,
+                    replaced_nsnaps);
+        } else {
+            rollback_err = rollback_fresh_mmap_allocation(
+                g, result_off, length, overlay_installed, overlay_ipa,
+                overlay_len, saved_mmap_next, saved_mmap_end,
+                saved_mmap_rx_next, saved_mmap_rx_end, saved_rw_gap_hint,
+                saved_rx_gap_hint);
+        }
+        dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps);
         host_fd_ref_close(&backing_ref);
+        if (rollback_err < 0)
+            return rollback_err;
         return -LINUX_ENOMEM;
     }
 
+    /* Mark the region as overlay-backed when sys_mmap installed a real
+     * MAP_FIXED|MAP_SHARED overlay on the host VA. Used by msync to skip
+     * the snapshot-style pwrite/refresh paths for regions that the kernel
+     * already keeps coherent with the file's page cache.
+     */
+    if (!is_anon && fd >= 0 && !is_prot_none && (flags & LINUX_MAP_SHARED)) {
+        size_t hps = host_page_size_cached();
+        if ((result_off % hps == 0) && ((uint64_t) offset % hps == 0)) {
+            for (int i = 0; i < g->nregions; i++) {
+                if (g->regions[i].start == result_off &&
+                    g->regions[i].end == result_off + length) {
+                    g->regions[i].overlay_active = true;
+                    g->regions[i].overlay_start = result_off;
+                    g->regions[i].overlay_end =
+                        result_off + ALIGN_UP(length, hps);
+                    break;
+                }
+            }
+        }
+    }
+
     host_fd_ref_close(&backing_ref);
+    dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps);
 
     /* Return IPA-based address to guest */
     return (int64_t) guest_ipa(g, result_off);
@@ -760,6 +1738,12 @@ int64_t sys_mremap(guest_t *g,
     /* Shrinking mremap keeps the base address and releases only the tail. */
     if (new_size < old_size && !(flags & LINUX_MREMAP_FIXED)) {
         uint64_t tail_off = old_off + new_size, tail_end = old_off + old_size;
+        /* Restore slab backing under any tail overlay before zeroing so the
+         * memset does not write zeros into a file.
+         */
+        int cleanup_err = cleanup_overlays_in_range(g, tail_off, tail_end);
+        if (cleanup_err < 0)
+            return cleanup_err;
         /* Zero the trimmed region */
         memset((uint8_t *) g->host_base + tail_off, 0, tail_end - tail_off);
         guest_region_remove(g, tail_off, tail_end);
@@ -805,11 +1789,100 @@ int64_t sys_mremap(guest_t *g,
         int track_backing_fd = dup_region_backing_fd(old_reg);
         if (old_reg && old_reg->backing_fd >= 0 && track_backing_fd < 0)
             return -LINUX_ENOMEM;
+        bool source_overlay = old_reg && region_has_live_overlay(old_reg);
+        uint64_t source_file_off =
+            old_reg ? old_reg->offset + (old_off - old_reg->start) : 0;
         char track_name[sizeof(old_reg->name)] = {0};
+        /* Heap-allocated to avoid blowing the ~512 KiB default macOS thread
+         * stack: each region_snapshot_t array is GUEST_MAX_REGIONS *
+         * sizeof(region_snapshot_t), so two of them on the stack would be
+         * close to a megabyte. Freed via dispose_region_snapshots on every
+         * exit path below.
+         */
+        region_snapshot_t *source_snaps = NULL;
+        region_snapshot_t *dest_snaps = NULL;
+        int source_nsnaps = 0, dest_nsnaps = 0;
         if (old_reg)
             str_copy_trunc(track_name, old_reg->name, sizeof(track_name));
 
+        source_snaps = malloc(GUEST_MAX_REGIONS * sizeof(*source_snaps));
+        dest_snaps = malloc(GUEST_MAX_REGIONS * sizeof(*dest_snaps));
+        if (!source_snaps || !dest_snaps) {
+            free(source_snaps);
+            free(dest_snaps);
+            if (track_backing_fd >= 0)
+                close(track_backing_fd);
+            return -LINUX_ENOMEM;
+        }
+
+        source_nsnaps = capture_region_snapshots(
+            g, old_off, old_off + old_size, source_snaps, GUEST_MAX_REGIONS);
+        if (source_nsnaps < 0) {
+            free(source_snaps);
+            free(dest_snaps);
+            if (track_backing_fd >= 0)
+                close(track_backing_fd);
+            return source_nsnaps;
+        }
+        dest_nsnaps = capture_region_snapshots(g, new_off, new_off + new_size,
+                                               dest_snaps, GUEST_MAX_REGIONS);
+        if (dest_nsnaps < 0) {
+            dispose_region_snapshots(&source_snaps, &source_nsnaps);
+            free(dest_snaps);
+            if (track_backing_fd >= 0)
+                close(track_backing_fd);
+            return dest_nsnaps;
+        }
+
+        if (source_overlay) {
+            int cleanup_err =
+                cleanup_overlays_in_range(g, old_off, old_off + old_size);
+            if (cleanup_err < 0) {
+                (void) restore_snapshot_overlays_in_place(g, source_snaps,
+                                                          source_nsnaps);
+                dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
+                dispose_region_snapshots(&source_snaps, &source_nsnaps);
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                return cleanup_err;
+            }
+        }
+
+        int cleanup_err =
+            cleanup_overlays_in_range(g, new_off, new_off + new_size);
+        if (cleanup_err < 0) {
+            int restore_err = restore_snapshot_overlays_in_place(
+                g, source_snaps, source_nsnaps);
+            if (restore_err < 0) {
+                dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
+                dispose_region_snapshots(&source_snaps, &source_nsnaps);
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                return restore_err;
+            }
+            (void) restore_snapshot_overlays_in_place(g, dest_snaps,
+                                                      dest_nsnaps);
+            dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
+            dispose_region_snapshots(&source_snaps, &source_nsnaps);
+            if (track_backing_fd >= 0)
+                close(track_backing_fd);
+            return cleanup_err;
+        }
+
         if (mremap_extend_range(g, new_off, new_size, prot) < 0) {
+            int restore_err = restore_snapshot_overlays_in_place(
+                g, source_snaps, source_nsnaps);
+            if (restore_err < 0) {
+                dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
+                dispose_region_snapshots(&source_snaps, &source_nsnaps);
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                return restore_err;
+            }
+            (void) restore_snapshot_overlays_in_place(g, dest_snaps,
+                                                      dest_nsnaps);
+            dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
+            dispose_region_snapshots(&source_snaps, &source_nsnaps);
             if (track_backing_fd >= 0)
                 close(track_backing_fd);
             return -LINUX_ENOMEM;
@@ -820,13 +1893,49 @@ int64_t sys_mremap(guest_t *g,
          */
         guest_region_remove(g, new_off, new_off + new_size);
 
-        /* Copy data (use memmove for potential overlap) */
+        /* Copy data (use memmove for potential overlap).  If the source
+         * has a live overlay, the read side of the memmove pulls live
+         * file content; the destination receives a private snapshot at
+         * mremap time (no overlay reapplied), and msync's emulated
+         * pwrite-the-diff path keeps subsequent writes consistent.
+         */
         uint64_t copy_len = old_size < new_size ? old_size : new_size;
-        if (prot == LINUX_PROT_NONE)
+        if (prot == LINUX_PROT_NONE) {
             memset((uint8_t *) g->host_base + new_off, 0, new_size);
-        else
+        } else if (source_overlay) {
+            memset((uint8_t *) g->host_base + new_off, 0, new_size);
+            int copy_err = read_file_range_to_guest(
+                g, new_off, track_backing_fd, source_file_off, copy_len);
+            if (copy_err < 0) {
+                int restore_err = restore_snapshot_overlays_in_place(
+                    g, source_snaps, source_nsnaps);
+                if (restore_err < 0)
+                    copy_err = restore_err;
+                restore_err =
+                    restore_region_snapshots(g, dest_snaps, dest_nsnaps);
+                /* Re-establish the destination's page-table state to match
+                 * the regions we just restored. mremap_extend_range above
+                 * had filled in PTEs for the new mremap target; without
+                 * this rollback those PTEs would outlive the regions and
+                 * the guest would see live mappings where its own metadata
+                 * (after the restore) says nothing is mapped.
+                 */
+                int pt_err = restore_snapshot_page_tables(
+                    g, new_off, new_off + new_size, dest_snaps, dest_nsnaps);
+                if (pt_err < 0 && restore_err >= 0)
+                    restore_err = pt_err;
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                dispose_region_snapshots(&source_snaps, &source_nsnaps);
+                dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
+                if (restore_err < 0)
+                    return restore_err;
+                return copy_err;
+            }
+        } else {
             memmove((uint8_t *) g->host_base + new_off,
                     (uint8_t *) g->host_base + old_off, copy_len);
+        }
         /* Zero any extension beyond old data */
         if (new_size > old_size)
             memset((uint8_t *) g->host_base + new_off + old_size, 0,
@@ -845,8 +1954,14 @@ int64_t sys_mremap(guest_t *g,
 
         if (guest_region_add_ex_owned(
                 g, new_off, new_off + new_size, prot, track_flags, track_offset,
-                track_name[0] ? track_name : NULL, track_backing_fd) < 0)
+                track_name[0] ? track_name : NULL, track_backing_fd) < 0) {
+            (void) restore_region_snapshots(g, dest_snaps, dest_nsnaps);
+            dispose_region_snapshots(&source_snaps, &source_nsnaps);
+            dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
             return -LINUX_ENOMEM;
+        }
+        dispose_region_snapshots(&source_snaps, &source_nsnaps);
+        dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
         g->need_tlbi = true;
         return (int64_t) guest_ipa(g, new_off);
     }
@@ -885,6 +2000,11 @@ int64_t sys_mremap(guest_t *g,
                             : (LINUX_MAP_PRIVATE | LINUX_MAP_ANONYMOUS);
                 uint64_t track_offset = old_reg ? old_reg->offset : 0;
                 int track_backing_fd = dup_region_backing_fd(old_reg);
+                bool old_overlay = old_reg && region_has_live_overlay(old_reg);
+                uint64_t old_overlay_start =
+                    old_overlay ? old_reg->overlay_start : 0;
+                uint64_t old_overlay_end =
+                    old_overlay ? old_reg->overlay_end : 0;
                 if (old_reg && old_reg->backing_fd >= 0 && track_backing_fd < 0)
                     return -LINUX_ENOMEM;
                 char track_name[sizeof(old_reg->name)] = {0};
@@ -907,6 +2027,10 @@ int64_t sys_mremap(guest_t *g,
                                               track_name[0] ? track_name : NULL,
                                               track_backing_fd) < 0)
                     return -LINUX_ENOMEM;
+                if (old_overlay)
+                    mark_overlay_metadata_range(g, old_off, old_off + old_size,
+                                                old_overlay_start,
+                                                old_overlay_end);
                 g->need_tlbi = true;
 
                 /* Update high-water marks */
@@ -937,6 +2061,16 @@ int64_t sys_mremap(guest_t *g,
         int track_backing_fd = dup_region_backing_fd(old_reg);
         if (old_reg && old_reg->backing_fd >= 0 && track_backing_fd < 0)
             return -LINUX_ENOMEM;
+        bool source_overlay = old_reg && region_has_live_overlay(old_reg);
+        uint64_t source_overlay_start =
+            source_overlay ? old_reg->overlay_start : 0;
+        uint64_t source_overlay_end = source_overlay ? old_reg->overlay_end : 0;
+        uint64_t source_file_off =
+            old_reg ? old_reg->offset + (old_off - old_reg->start) : 0;
+        uint64_t source_overlay_file_off =
+            source_overlay
+                ? old_reg->offset + (source_overlay_start - old_reg->start)
+                : 0;
         char track_name[sizeof(old_reg->name)] = {0};
         if (old_reg)
             str_copy_trunc(track_name, old_reg->name, sizeof(track_name));
@@ -961,22 +2095,71 @@ int64_t sys_mremap(guest_t *g,
             return -LINUX_ENOMEM;
         }
 
+        if (source_overlay) {
+            int cleanup_err =
+                cleanup_overlays_in_range(g, old_off, old_off + old_size);
+            if (cleanup_err < 0) {
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                return cleanup_err;
+            }
+        }
+
         if (mremap_extend_range(g, new_off, new_size, prot) < 0) {
+            if (source_overlay) {
+                int restore_err = restore_file_overlay_range(
+                    g, old_off, old_off + old_size, source_overlay_start,
+                    source_overlay_end, track_backing_fd,
+                    source_overlay_file_off);
+                if (restore_err < 0) {
+                    if (track_backing_fd >= 0)
+                        close(track_backing_fd);
+                    return restore_err;
+                }
+            }
             if (track_backing_fd >= 0)
                 close(track_backing_fd);
             return -LINUX_ENOMEM;
         }
 
-        /* Copy old data, zero extension */
-        if (prot == LINUX_PROT_NONE)
+        /* Copy old data, zero extension. The new range was just allocated
+         * from a free gap so it has no overlays to clean up; the source
+         * may have an overlay, which is read transparently by the memcpy
+         * before its underlying slab is restored below.
+         */
+        if (prot == LINUX_PROT_NONE) {
             memset((uint8_t *) g->host_base + new_off, 0, new_size);
-        else
+        } else if (source_overlay) {
+            memset((uint8_t *) g->host_base + new_off, 0, new_size);
+            int copy_err = read_file_range_to_guest(
+                g, new_off, track_backing_fd, source_file_off, old_size);
+            if (copy_err < 0) {
+                /* Roll back both sides: re-apply the source overlay so the
+                 * caller's MAP_SHARED is not silently demoted to a slab
+                 * snapshot, and tear down the destination PTEs we just
+                 * allocated via mremap_extend_range so the guest does not
+                 * see phantom zero pages where the failed mremap landed.
+                 */
+                (void) restore_file_overlay_range(
+                    g, old_off, old_off + old_size, source_overlay_start,
+                    source_overlay_end, track_backing_fd,
+                    source_overlay_file_off);
+                guest_invalidate_ptes(g, new_off, new_off + new_size);
+                g->need_tlbi = true;
+                if (track_backing_fd >= 0)
+                    close(track_backing_fd);
+                return copy_err;
+            }
+        } else {
             memcpy((uint8_t *) g->host_base + new_off,
                    (uint8_t *) g->host_base + old_off, old_size);
+        }
         memset((uint8_t *) g->host_base + new_off + old_size, 0,
                new_size - old_size);
 
-        /* Remove old mapping */
+        /* Remove old mapping. Any live source overlay was already torn down
+         * before the destination range was touched.
+         */
         memset((uint8_t *) g->host_base + old_off, 0, old_size);
         guest_region_remove(g, old_off, old_off + old_size);
         guest_invalidate_ptes(g, old_off, old_off + old_size);
@@ -1083,6 +2266,14 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice)
                 continue;
             if (r->shared && r->backing_fd >= 0 && (r->prot & LINUX_PROT_WRITE))
                 continue;
+            /* Overlay-backed regions already serve their content from the
+             * file's page cache. The "zero + pread" reset would write zeros
+             * straight into the file because the host VA is the file. Skip
+             * the reset; the next guest read already sees the current file
+             * image, which is what MADV_DONTNEED promises.
+             */
+            if (r->overlay_active)
+                continue;
 
             uint64_t zstart = (r->start > off) ? r->start : off;
             uint64_t zend = (r->end < end) ? r->end : end;
@@ -1184,6 +2375,16 @@ int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length)
             if (unmap_off < ELF_DEFAULT_BASE && end > PT_POOL_BASE)
                 return -LINUX_EINVAL;
 
+            /* Restore slab backing under any active MAP_SHARED file overlay
+             * before zeroing the host VA. Without this, the memset below
+             * would write zeros directly into the file. The cleanup walker
+             * reads live region metadata so it must run before
+             * guest_region_remove.
+             */
+            int cleanup_err = cleanup_overlays_in_range(g, unmap_off, end);
+            if (cleanup_err < 0)
+                return cleanup_err;
+
             /* Invalidate PTEs first. This may need to split a 2MiB block
              * which can fail if the page table pool is exhausted. Failing
              * before region removal keeps metadata consistent.
@@ -1433,10 +2634,19 @@ int64_t sys_msync(guest_t *g, uint64_t addr, uint64_t length, int flags)
         uint64_t file_start = r->offset + (sync_start - r->start);
         uint64_t file_end = file_start + (sync_end - sync_start);
 
-        int64_t err =
-            sync_shared_aliases_range(g, r->backing_fd, file_start, file_end);
-        if (err < 0)
-            return err;
+        /* Real overlay regions are kept coherent with the file by the
+         * kernel's page cache. The snapshot-style pwrite-the-diff would
+         * compare the live file against itself and may trip on macOS's
+         * page-cache write path; the refresh-from-file pass would do the
+         * same self-write. Both are no-ops for overlays, so MS_SYNC
+         * collapses to a plain fsync.
+         */
+        if (!r->overlay_active) {
+            int64_t err = sync_shared_aliases_range(g, r->backing_fd,
+                                                    file_start, file_end);
+            if (err < 0)
+                return err;
+        }
 
         if (flags & LINUX_MS_SYNC) {
             if (fsync(r->backing_fd) < 0)
@@ -1447,6 +2657,13 @@ int64_t sys_msync(guest_t *g, uint64_t addr, uint64_t length, int flags)
             guest_region_t *dst = &g->regions[j];
             if (!dst->shared || dst->backing_fd < 0)
                 continue;
+            /* Skip self and overlay-backed peers: the page cache already
+             * keeps them coherent with the file. Only legacy snapshot
+             * regions (e.g., a region created by mremap that lost its
+             * overlay) need refresh.
+             */
+            if (dst == r || dst->overlay_active)
+                continue;
             if (!same_backing_file(r->backing_fd, dst->backing_fd))
                 continue;
 
diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c
index 36f8c61..96f0a96 100644
--- a/src/syscall/syscall.c
+++ b/src/syscall/syscall.c
@@ -330,8 +330,112 @@ SC_FORWARD(sc_waitid, sc_waitid_impl(g, x0, x1, x2, x3, x4))
 /* Futex */
 SC_FORWARD(sc_futex, sys_futex(g, x0, (int) x1, (uint32_t) x2, x3, x4, (uint32_t) x5))
 
-/* Sync */
-SC_FORWARD(sc_sync, (sync(), 0))
+/* Sync.
+ *
+ * Linux sync(2) flushes all dirty buffers. Forwarding to host sync()
+ * stalls because the guest slab is mmap'd MAP_SHARED to an internal
+ * tempfile (g->shm_fd) for the CoW fork fast path: a global flush has
+ * to walk multi-GB of demand-paged dirty pages from that tempfile, plus
+ * the same from any other elfuse process running on the host. The slab
+ * tempfile is implementation detail; the guest never opened it. Iterate
+ * the guest fd table and the region overlay backing fds, dup each under
+ * its lock, release the lock, and fsync the dups outside any guest lock
+ * so a slow disk cannot stall concurrent mmap/fd operations on other
+ * threads. fsync on non-regular fds returns EINVAL on macOS, which is
+ * benign and ignored. Always returns 0 to mirror sync(2)'s "void" spirit.
+ */
+/* Inline fallback: under malloc failure the bulk-dup path cannot proceed,
+ * so iterate one fd at a time, dupping under the matching lock and fsync
+ * outside it. Slower (acquires/releases fd_lock per regular fd) but keeps
+ * sync(2) honest under memory pressure instead of silently no-opping.
+ */
+static void sc_sync_fdtable_inline(void)
+{
+    for (int i = 0; i < FD_TABLE_SIZE; i++) {
+        pthread_mutex_lock(&fd_lock);
+        int t = fd_table[i].type;
+        int duped = -1;
+        if (t == FD_REGULAR || t == FD_DIR)
+            duped = dup(fd_table[i].host_fd);
+        pthread_mutex_unlock(&fd_lock);
+        if (duped < 0)
+            continue;
+        (void) fsync(duped);
+        close(duped);
+    }
+}
+
+static void sc_sync_regions_inline(guest_t *g)
+{
+    /* Region count can change under us once mmap_lock is released, so
+     * resnapshot under the lock each iteration; the i index is a live
+     * cursor into g->regions[] so a concurrent insertion (always at the
+     * sorted position) cannot make us skip an entry permanently.
+     */
+    for (int i = 0;; i++) {
+        pthread_mutex_lock(&mmap_lock);
+        if (i >= g->nregions) {
+            pthread_mutex_unlock(&mmap_lock);
+            break;
+        }
+        const guest_region_t *r = &g->regions[i];
+        int duped = -1;
+        if (r->shared && r->backing_fd >= 0)
+            duped = dup(r->backing_fd);
+        pthread_mutex_unlock(&mmap_lock);
+        if (duped < 0)
+            continue;
+        (void) fsync(duped);
+        close(duped);
+    }
+}
+
+static int64_t sc_sync_impl(guest_t *g)
+{
+    size_t cap = FD_TABLE_SIZE + GUEST_MAX_REGIONS;
+    int *hosts = malloc(cap * sizeof(int));
+    if (!hosts) {
+        sc_sync_fdtable_inline();
+        sc_sync_regions_inline(g);
+        return 0;
+    }
+    int n = 0;
+
+    pthread_mutex_lock(&fd_lock);
+    for (int i = 0; i < FD_TABLE_SIZE && n < (int) cap; i++) {
+        int t = fd_table[i].type;
+        if (t != FD_REGULAR && t != FD_DIR)
+            continue;
+        int duped = dup(fd_table[i].host_fd);
+        if (duped < 0)
+            continue;
+        hosts[n++] = duped;
+    }
+    pthread_mutex_unlock(&fd_lock);
+
+    pthread_mutex_lock(&mmap_lock);
+    for (int i = 0; i < g->nregions && n < (int) cap; i++) {
+        const guest_region_t *r = &g->regions[i];
+        if (!r->shared || r->backing_fd < 0)
+            continue;
+        int duped = dup(r->backing_fd);
+        if (duped < 0)
+            continue;
+        hosts[n++] = duped;
+    }
+    pthread_mutex_unlock(&mmap_lock);
+
+    /* fsync each dup outside both locks so a slow disk does not stall
+     * concurrent FD or memory operations on other threads.
+     */
+    for (int i = 0; i < n; i++) {
+        (void) fsync(hosts[i]);
+        close(hosts[i]);
+    }
+    free(hosts);
+    return 0;
+}
+SC_FORWARD(sc_sync, sc_sync_impl(g))
 
 /* SysV IPC */
 SC_FORWARD(sc_shmget, sys_shmget(g, (int32_t) x0, x1, (int) x2))
diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh
index 39e06a6..9235ae3 100755
--- a/tests/test-matrix.sh
+++ b/tests/test-matrix.sh
@@ -177,11 +177,12 @@ run_elfuse_sysroot()
 
 # Generic test helpers.
 
-# Tests that exercise raw clone / PI futex / massive thread+mmap stress and
-# legitimately hang under qemu-system-aarch64 on Apple Silicon (HVF + Alpine
-# linux-virt 6.12 kernel).  They pass cleanly under elfuse and are kept in
-# the elfuse-aarch64 mode; the qemu reference run skips them with SKIP.
-QEMU_SKIP="test-thread test-stress test-futex-pi"
+# Tests that either hang under qemu-system-aarch64 on Apple Silicon
+# (raw clone / PI futex / massive thread+mmap stress) or currently diverge
+# from the Alpine linux-virt reference kernel on the deprecated oom_adj
+# procfs compatibility path exercised by test-io-opt. They still run in
+# elfuse-aarch64 mode and in `make check`; the qemu reference run skips them.
+QEMU_SKIP="test-thread test-stress test-futex-pi test-io-opt"
 
 is_qemu_skipped()
 {
diff --git a/tests/test-msync.c b/tests/test-msync.c
index e081d06..5eaec3b 100644
--- a/tests/test-msync.c
+++ b/tests/test-msync.c
@@ -10,6 +10,7 @@
  */
 
 #include <fcntl.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -250,6 +251,197 @@ static void test_shm_name_visible_after_fork(void)
     my_shm_unlink(name);
 }
 
+/* Real MAP_SHARED requires that host writes to the backing file are
+ * observable through the mapping without the guest calling msync. The
+ * pre-overlay implementation snapshotted file contents into private
+ * guest pages and only reconciled on msync, so this is the regression
+ * lock-in for the overlay path.
+ */
+static void test_shared_host_write_visible_without_msync(void)
+{
+    TEST("MAP_SHARED host pwrite visible without msync");
+
+    char name[64];
+    snprintf(name, sizeof(name), "/elfuse-msync-host-%ld", (long) getpid());
+    int fd = my_shm_open(name, O_CREAT | O_EXCL | O_RDWR, 0600);
+    if (fd < 0) {
+        FAIL("shm_open failed");
+        return;
+    }
+    my_shm_unlink(name);
+
+    if (ftruncate(fd, 4096) != 0) {
+        FAIL("ftruncate failed");
+        close(fd);
+        return;
+    }
+
+    /* Seed the file with a known pattern before mmap. */
+    char seed[16];
+    memset(seed, 0x11, sizeof(seed));
+    if (pwrite(fd, seed, sizeof(seed), 0) != (ssize_t) sizeof(seed)) {
+        FAIL("seed pwrite failed");
+        close(fd);
+        return;
+    }
+
+    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (p == MAP_FAILED) {
+        FAIL("mmap failed");
+        close(fd);
+        return;
+    }
+
+    /* Confirm the initial seed is visible through the mapping. */
+    if (p[0] != 0x11) {
+        FAIL("initial seed not visible through mapping");
+        munmap(p, 4096);
+        close(fd);
+        return;
+    }
+
+    /* Mutate the file via pwrite (host-side write). The mapping must
+     * reflect the new bytes immediately, with no msync from the guest.
+     */
+    char update[16];
+    memset(update, 0x22, sizeof(update));
+    if (pwrite(fd, update, sizeof(update), 0) != (ssize_t) sizeof(update)) {
+        FAIL("update pwrite failed");
+        munmap(p, 4096);
+        close(fd);
+        return;
+    }
+
+    bool ok = true;
+    for (int i = 0; i < (int) sizeof(update); i++) {
+        if ((unsigned char) p[i] != 0x22) {
+            ok = false;
+            break;
+        }
+    }
+    if (ok)
+        PASS();
+    else
+        FAIL("mapping did not reflect host pwrite without msync");
+
+    munmap(p, 4096);
+    close(fd);
+}
+
+/* Guest writes to a MAP_SHARED file mapping must reach the file
+ * immediately so other readers (here, a sibling pread) see them without
+ * the guest needing to call msync. This is the converse of the
+ * host-write-visible test.
+ */
+static void test_shared_guest_write_lands_in_file(void)
+{
+    TEST("MAP_SHARED guest write lands in file without msync");
+
+    char name[64];
+    snprintf(name, sizeof(name), "/elfuse-msync-guest-%ld", (long) getpid());
+    int fd = my_shm_open(name, O_CREAT | O_EXCL | O_RDWR, 0600);
+    if (fd < 0) {
+        FAIL("shm_open failed");
+        return;
+    }
+    my_shm_unlink(name);
+
+    if (ftruncate(fd, 4096) != 0) {
+        FAIL("ftruncate failed");
+        close(fd);
+        return;
+    }
+
+    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (p == MAP_FAILED) {
+        FAIL("mmap failed");
+        close(fd);
+        return;
+    }
+
+    /* Write through the mapping; do NOT call msync. */
+    const char marker[] = "elfuse-overlay";
+    memcpy(p, marker, sizeof(marker));
+
+    char buf[sizeof(marker)] = {0};
+    if (pread(fd, buf, sizeof(buf), 0) != (ssize_t) sizeof(buf))
+        FAIL("pread failed");
+    else if (!memcmp(buf, marker, sizeof(marker)))
+        PASS();
+    else
+        FAIL("guest write did not reach file without msync");
+
+    munmap(p, 4096);
+    close(fd);
+}
+
+/* On hosts with pages larger than the guest's 4 KiB granule, a MAP_SHARED
+ * overlay of one guest page can alias adjacent guest pages in the same host
+ * page. Replacing the adjacent guest page must tear down the shared overlay
+ * first so the new mapping cannot write through into the file.
+ */
+static void test_shared_adjacent_fixed_mapping_does_not_alias_file(void)
+{
+    TEST("MAP_FIXED neighbor does not inherit shared-file overlay");
+
+    size_t hps = (size_t) sysconf(_SC_PAGESIZE);
+    size_t file_len = hps > 8192 ? hps : 8192;
+
+    char name[64];
+    snprintf(name, sizeof(name), "/elfuse-msync-alias-%ld", (long) getpid());
+    int fd = my_shm_open(name, O_CREAT | O_EXCL | O_RDWR, 0600);
+    if (fd < 0) {
+        FAIL("shm_open failed");
+        return;
+    }
+    my_shm_unlink(name);
+
+    if (ftruncate(fd, (off_t) file_len) != 0) {
+        FAIL("ftruncate failed");
+        close(fd);
+        return;
+    }
+
+    unsigned char seed = 0x33;
+    if (pwrite(fd, &seed, 1, 4096) != 1) {
+        FAIL("seed pwrite failed");
+        close(fd);
+        return;
+    }
+
+    unsigned char *shared =
+        mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (shared == MAP_FAILED) {
+        FAIL("shared mmap failed");
+        close(fd);
+        return;
+    }
+
+    unsigned char *adjacent =
+        mmap(shared + 4096, 4096, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+    if (adjacent == MAP_FAILED) {
+        FAIL("adjacent MAP_FIXED mmap failed");
+        munmap(shared, 4096);
+        close(fd);
+        return;
+    }
+
+    adjacent[0] = 0x7a;
+
+    unsigned char file_byte = 0;
+    if (pread(fd, &file_byte, 1, 4096) != 1)
+        FAIL("pread failed");
+    else if (file_byte == seed)
+        PASS();
+    else
+        FAIL("adjacent anonymous write leaked into file");
+
+    munmap(shared, 4096);
+    munmap(adjacent, 4096);
+    close(fd);
+}
+
 int main(void)
 {
     printf("test-msync: MAP_SHARED msync tests\n\n");
@@ -257,6 +449,9 @@ int main(void)
     test_shared_msync_writes_file();
     test_shared_msync_refreshes_peer_mapping();
     test_shared_msync_preserves_alias_writes();
+    test_shared_host_write_visible_without_msync();
+    test_shared_guest_write_lands_in_file();
+    test_shared_adjacent_fixed_mapping_does_not_alias_file();
     test_shm_name_visible_after_fork();
 
     SUMMARY("test-msync");