Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 27 additions & 8 deletions src/core/guest.c
Original file line number Diff line number Diff line change
Expand Up @@ -441,12 +441,13 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
int guest_init_from_shm(guest_t *g,
int shm_fd,
uint64_t size,
uint32_t ipa_bits)
uint32_t ipa_bits,
bool retain_shared)
{
uint64_t t0;

memset(g, 0, sizeof(*g));
g->shm_fd = -1; /* Child does not own the shm */
g->shm_fd = -1; /* Child does not own the shm unless retain_shared */
g->ipa_base = GUEST_IPA_BASE;
g->elf_load_min = ELF_DEFAULT_BASE;
g->brk_base = BRK_BASE_DEFAULT;
Expand All @@ -471,13 +472,21 @@ int guest_init_from_shm(guest_t *g,
}
g->pt_pool_next = g->pt_pool_base;

/* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see the
* parent's frozen snapshot; writes are private to this process. macOS CoW
* is page-granular: only modified pages are duplicated.
/* Two mapping modes:
* retain_shared: shm_fd is an independent APFS clone of the parent's
* memory (already isolated from the parent). Map MAP_SHARED so the
* child's writes land in the clone file, then keep the fd so the child
* can fclonefileat it for its own nested CoW fork. guest_destroy closes
* it.
* otherwise: shm_fd may be the parent's live fd (clonefile fallback). Map
* MAP_PRIVATE so writes stay private to this process, then close the
* fd. macOS CoW is page-granular either way: only modified pages are
* duplicated.
*/
int map_flags = retain_shared ? MAP_SHARED : MAP_PRIVATE;
t0 = startup_trace_now_ns();
g->host_base =
mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0);
mmap(NULL, size, PROT_READ | PROT_WRITE, map_flags, shm_fd, 0);
startup_trace_step("shm_mmap", t0);
if (g->host_base == MAP_FAILED) {
perror("guest: mmap shm");
Expand All @@ -486,8 +495,10 @@ int guest_init_from_shm(guest_t *g,
return -1;
}

/* Close the shm fd; the mapping keeps the pages alive */
close(shm_fd);
if (retain_shared)
g->shm_fd = shm_fd; /* Child owns the clone; guest_destroy closes it */
else
close(shm_fd); /* MAP_PRIVATE mapping keeps the pages alive */

/* Create HVF VM with the same IPA width as the parent */
hv_return_t ret = HV_ERROR;
Expand All @@ -506,6 +517,10 @@ int guest_init_from_shm(guest_t *g,
log_error("guest: hv_vm_create (shm) failed: %d", (int) ret);
munmap(g->host_base, size);
g->host_base = NULL;
if (g->shm_fd >= 0) {
close(g->shm_fd);
g->shm_fd = -1;
}
return -1;
}

Expand All @@ -518,6 +533,10 @@ int guest_init_from_shm(guest_t *g,
hv_vm_destroy();
munmap(g->host_base, size);
g->host_base = NULL;
if (g->shm_fd >= 0) {
close(g->shm_fd);
g->shm_fd = -1;
}
return -1;
}

Expand Down
16 changes: 11 additions & 5 deletions src/core/guest.h
Original file line number Diff line number Diff line change
Expand Up @@ -774,16 +774,22 @@ static inline bool guest_addr_in_infra(const guest_t *g, uint64_t addr)
*/
int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits);

/* Initialize guest from a POSIX shared memory fd (CoW fork path). Maps shm_fd
* MAP_PRIVATE (copy-on-write), creates HVF VM, maps to hypervisor. The child
* gets an instant CoW snapshot of parent's guest memory without copying. shm_fd
* is closed after mapping.
/* Initialize guest from a shared memory fd (CoW fork path). Creates the HVF VM
* and maps the fd to the hypervisor. The child gets an instant CoW snapshot of
* the parent's guest memory without copying.
*
* retain_shared selects the mapping: when true, shm_fd is an independent APFS
* clone, so it is mapped MAP_SHARED and retained in g->shm_fd (guest_destroy
* closes it) so the child can fclonefileat it for nested CoW fork. When false,
* shm_fd may be the parent's live fd, so it is mapped MAP_PRIVATE and closed
* after mapping. This function takes ownership of shm_fd on every path.
* Returns 0 on success, -1 on failure.
*/
int guest_init_from_shm(guest_t *g,
int shm_fd,
uint64_t size,
uint32_t ipa_bits);
uint32_t ipa_bits,
bool retain_shared);

/* Tear down VM and free guest memory. */
void guest_destroy(guest_t *g);
Expand Down
16 changes: 16 additions & 0 deletions src/runtime/fork-state.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,22 @@ typedef struct {
uint64_t rosetta_entry;
uint64_t kbuf_gpa;
uint64_t ttbr1;
/* Clone TID-sync state for the fork path. glibc's fork wrapper passes
* CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child writes its new TID
* into the TCB and clears it on exit. The posix_spawn child has no access
* to the original clone() arguments, so the parent forwards them here:
* clone_flags carries the CHILD_SETTID / CHILD_CLEARTID bits and ctid_gva
* the guest address. Zero for callers (e.g. raw fork(2)) that pass neither.
*/
uint64_t clone_flags;
Comment thread
jserv marked this conversation as resolved.
uint64_t ctid_gva;
/* Nonzero when the shm fd sent below is an independent fclonefileat clone
* (not the parent's live fd). Only then may the child map it MAP_SHARED and
* retain it for its own nested CoW fork; the live-fd fallback must stay
* MAP_PRIVATE so the child does not share writes with the parent.
*/
uint32_t shm_is_clone;
uint32_t _pad2;
} ipc_header_t;

typedef struct {
Expand Down
53 changes: 42 additions & 11 deletions src/runtime/forkipc.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,18 @@
#include "debug/log.h"
#include "debug/syscall-hist.h"

/* Linux clone flags. Shared by the fork-child TID-sync emulation below and
 * sys_clone further down, so they are defined once here, ahead of both users.
 */
#define LINUX_CLONE_VM 0x00000100
#define LINUX_CLONE_VFORK 0x00004000
#define LINUX_CLONE_THREAD 0x00010000
#define LINUX_CLONE_SETTLS 0x00080000
#define LINUX_CLONE_PARENT_SETTID 0x00100000
/* Forwarded to the fork child in the IPC header's clone_flags, but
 * fork_child_main deliberately does not act on it.
 */
#define LINUX_CLONE_CHILD_CLEARTID 0x00200000
/* When set in the forwarded clone_flags, fork_child_main writes the child's TID
 * to the forwarded ctid guest address.
 */
#define LINUX_CLONE_CHILD_SETTID 0x01000000
/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */

/* fork_child_main. */

static int fork_child_vfork_notify_fd = -1;
Expand Down Expand Up @@ -166,7 +178,8 @@ int fork_child_main(int ipc_fd,
close(ipc_fd);
return 1;
}
if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits) < 0) {
if (guest_init_from_shm(&g, shm_fd, hdr.guest_size, hdr.ipa_bits,
hdr.shm_is_clone != 0) < 0) {
log_error("fork-child: guest_init_from_shm failed");
close(ipc_fd);
return 1;
Expand Down Expand Up @@ -363,6 +376,30 @@ int fork_child_main(int ipc_fd,
*/
thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1);

/* Emulate CLONE_CHILD_SETTID for the fork child. glibc's fork wrapper
* passes CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID so the child's TCB
* caches its own TID; without the SETTID write the child keeps the parent's
* cached TID and modern glibc trips stack-canary / TLS checks ("stack
* smashing detected"). The write goes through guest memory, valid for both
* the CoW and region-copy paths. A faulting ctid_gva is the guest's own bad
* pointer: warn and continue, matching how the kernel ignores a
* child_tidptr fault.
*
* CLONE_CHILD_CLEARTID is deliberately not honored here. The clear-and-wake
* on exit only matters to an in-process joiner waiting on the futex (that
* is how the worker-thread exit path serves pthread_join). A fork child is
* a separate process with its own address space, so its ctid lives in
* memory no other process can observe -- the parent reaps it via
* wait4/SIGCHLD, not a cross-process futex. Registering clear_child_tid
* would be inert.
*/
if (hdr.clone_flags & LINUX_CLONE_CHILD_SETTID) {
int32_t tid32 = (int32_t) hdr.child_pid;
if (guest_write_small(&g, hdr.ctid_gva, &tid32, sizeof(tid32)) < 0)
log_warn("fork-child: CHILD_SETTID write to 0x%llx failed",
(unsigned long long) hdr.ctid_gva);
}

/* Re-publish identity into the child's shim-globals cache: the CoW / region
* copy inherits the parent's pid/uid values, and the shim's identity fast
* path would otherwise return the parent's pid to the child. Identity is
Expand Down Expand Up @@ -420,16 +457,6 @@ int fork_child_main(int ipc_fd,

/* sys_clone. */

/* Linux clone flags */
#define LINUX_CLONE_VM 0x00000100
#define LINUX_CLONE_VFORK 0x00004000
#define LINUX_CLONE_THREAD 0x00010000
#define LINUX_CLONE_SETTLS 0x00080000
#define LINUX_CLONE_PARENT_SETTID 0x00100000
#define LINUX_CLONE_CHILD_CLEARTID 0x00200000
#define LINUX_CLONE_CHILD_SETTID 0x01000000
/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */

/* Namespace flags. elfuse implements no namespace isolation. Both sys_clone and
* sys_clone3 reject them.
*/
Expand Down Expand Up @@ -1528,6 +1555,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
.rosetta_entry = g->rosetta_entry,
.kbuf_gpa = g->kbuf_gpa,
.ttbr1 = g->ttbr1,
.clone_flags =
flags & (LINUX_CLONE_CHILD_SETTID | LINUX_CLONE_CHILD_CLEARTID),
.ctid_gva = ctid_gva,
.shm_is_clone = (snapshot_shm_fd >= 0) ? 1 : 0,
};
if (fork_ipc_write_all(ipc_sock, &hdr, sizeof(hdr)) < 0) {
log_error("clone: failed to send header");
Expand Down
1 change: 1 addition & 0 deletions tests/manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ test-signal-thread

[section] Fork edge cases
test-clone3 # diff=skip
test-clone-childtid
test-fork-exec $TESTDIR/echo-test
test-fork-lowbase

Expand Down
107 changes: 107 additions & 0 deletions tests/test-clone-childtid.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/* Test CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID on the fork (posix_spawn) path
*
* Copyright 2026 elfuse contributors
* SPDX-License-Identifier: Apache-2.0
*
* Issue #99: glibc's fork wrapper clones with CLONE_CHILD_SETTID |
* CLONE_CHILD_CLEARTID | SIGCHLD. The child's TID must be written into the
* ctid address so glibc's TCB caches the right value. This calls clone()
* directly with those exact flags (no CLONE_VM/THREAD/VFORK, so elfuse takes
* the fork helper-process path) and checks the child observes its own TID at
* the ctid slot -- glibc-version-independent, unlike the canary symptom.
*
* Raw syscall throughout: glibc's clone() wrapper requires a child function
* and stack, so it cannot issue this fork-style clone (no new stack, returns
* twice), and we want to exercise elfuse's handling rather than libc's.
*/

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sched.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <linux/sched.h>

/* Fallback values for the two child-TID clone flags, used only when the
 * headers included above did not already define them.
 */
#if !defined(CLONE_CHILD_SETTID)
#define CLONE_CHILD_SETTID 0x01000000
#endif
#if !defined(CLONE_CHILD_CLEARTID)
#define CLONE_CHILD_CLEARTID 0x00200000
#endif

/* ctid target handed to the first clone() in main(). volatile because it is
 * written from outside the compiled code (by whatever implements clone) and
 * must be re-read from memory when the child checks it.
 */
static volatile int child_tid_slot;

/* Exercise CLONE_CHILD_SETTID on the fork-style clone path, one and two levels
 * deep.
 *
 * The parent clones a child with CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID |
 * SIGCHLD. The child checks that child_tid_slot already holds its own TID and
 * then issues the same clone again for a grandchild, which performs the same
 * check on its own slot. Child and grandchild report only through their exit
 * status; the top-level parent turns that status into exactly one PASS/FAIL
 * line on stdout.
 *
 * Returns 0 on PASS, 1 on any failure.
 */
int main(void)
{
    /* Exit statuses the child and grandchild use to report back. The child
     * folds any grandchild failure into CHILD_GRANDCHILD_FAILED, so
     * GRANDCHILD_CTID_MISMATCH never reaches the top-level parent directly.
     */
    enum {
        CHILD_OK = 0,
        CHILD_CTID_UNSET = 2,          /* ctid slot still 0 */
        CHILD_CTID_WRONG = 3,          /* ctid slot holds some other value */
        CHILD_NESTED_CLONE_FAILED = 4, /* child could not clone again */
        GRANDCHILD_CTID_MISMATCH = 5,  /* grandchild's slot != its TID */
        CHILD_NESTED_WAIT_FAILED = 6,  /* child's waitpid() failed */
        CHILD_GRANDCHILD_FAILED = 7,   /* grandchild exited non-zero/abnormally */
    };

    /* aarch64 clone(2): clone(flags, stack, parent_tid, tls, child_tid). */
    unsigned long flags = CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD;
    long rc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0,
                      (void *) &child_tid_slot);
    if (rc < 0) {
        printf("test-clone-childtid: clone failed -- FAIL\n");
        return 1;
    }

    if (rc == 0) {
        /* Child: the kernel (here, elfuse) must have written our TID into the
         * ctid slot before we resumed.
         */
        pid_t tid = (pid_t) syscall(SYS_gettid);
        if (child_tid_slot != tid) {
            /* Cannot printf reliably from a possibly-confused child; encode the
             * result in the exit status instead.
             */
            _exit(child_tid_slot == 0 ? CHILD_CTID_UNSET : CHILD_CTID_WRONG);
        }

        /* Nested clone: the child forks a grandchild with the same flags. This
         * exercises the child-side CoW shm retention (issue #99 part 2): the
         * child must be able to clone its own memory again, and the grandchild
         * must likewise see a fresh TID at its ctid slot.
         */
        static volatile int grand_tid_slot;
        long grc = syscall(SYS_clone, flags, (void *) 0, (void *) 0, (void *) 0,
                           (void *) &grand_tid_slot);
        if (grc < 0)
            _exit(CHILD_NESTED_CLONE_FAILED);
        if (grc == 0) {
            pid_t gtid = (pid_t) syscall(SYS_gettid);
            _exit(grand_tid_slot == gtid ? CHILD_OK : GRANDCHILD_CTID_MISMATCH);
        }

        int gstatus;
        if (waitpid((pid_t) grc, &gstatus, 0) < 0)
            _exit(CHILD_NESTED_WAIT_FAILED);
        if (!WIFEXITED(gstatus) || WEXITSTATUS(gstatus) != CHILD_OK)
            _exit(CHILD_GRANDCHILD_FAILED);
        _exit(CHILD_OK);
    }

    /* Parent: reap the child and translate its exit status. */
    int status;
    if (waitpid((pid_t) rc, &status, 0) < 0) {
        printf("test-clone-childtid: waitpid failed -- FAIL\n");
        return 1;
    }
    if (!WIFEXITED(status)) {
        printf(
            "test-clone-childtid: child did not exit cleanly (0x%x) -- FAIL\n",
            (unsigned) status);
        return 1;
    }

    switch (WEXITSTATUS(status)) {
    case CHILD_OK:
        printf("test-clone-childtid: child saw its TID at ctid -- PASS\n");
        return 0;
    case CHILD_CTID_UNSET:
        printf(
            "test-clone-childtid: ctid slot still 0 (SETTID ignored) -- "
            "FAIL\n");
        return 1;
    case CHILD_CTID_WRONG:
        printf("test-clone-childtid: ctid slot holds wrong TID -- FAIL\n");
        return 1;
    case CHILD_NESTED_CLONE_FAILED:
        printf("test-clone-childtid: nested clone in child failed -- FAIL\n");
        return 1;
    case CHILD_NESTED_WAIT_FAILED:
        printf(
            "test-clone-childtid: child could not wait for grandchild -- "
            "FAIL\n");
        return 1;
    case CHILD_GRANDCHILD_FAILED:
        printf(
            "test-clone-childtid: grandchild failed (wrong ctid or abnormal "
            "exit) -- FAIL\n");
        return 1;
    default:
        printf("test-clone-childtid: unexpected child exit %d -- FAIL\n",
               WEXITSTATUS(status));
        return 1;
    }
}
1 change: 1 addition & 0 deletions tests/test-matrix.sh
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ run_unit_tests()

printf "\nProcess tests\n"
test_check "$runner" "test-fork" "PASS" "$bindir/test-fork"
test_check "$runner" "test-clone-childtid" "PASS" "$bindir/test-clone-childtid"
test_check "$runner" "test-exec" "exec-works" "$bindir/test-exec" "$bindir/echo-test" exec-works
test_check "$runner" "test-fork-exec" "PASS" "$bindir/test-fork-exec" "$bindir/echo-test"
test_check "$runner" "test-cloexec" "PASS" "$bindir/test-cloexec"
Expand Down
Loading