From 22d8532213457a3158132cbe31f53aacb5ac7528 Mon Sep 17 00:00:00 2001
From: Trung
Date: Wed, 10 Jun 2026 14:37:10 +0700
Subject: [PATCH] Improve Linux distro rootfs compatibility on Apple Silicon

Improve compatibility with real Linux distro rootfs environments on
Apple Silicon hosts. Package-manager and shell workflows need behavior
closer to Linux for credentials, script execution, fork/clone state,
wait handling, pipes, /proc, and shared mappings.

Preserve dynamic guest UID/GID state in auxv instead of always
reporting fixed guest IDs, and allow the initial guest identity to be
configured with ELFUSE_GUEST_UID and ELFUSE_GUEST_GID. This lets distro
workflows such as apt post-install scripts run with root-like guest
credentials when needed.

Probe ELF binaries quietly before falling back to shebang handling, so
script execution does not emit misleading "not an ELF" diagnostics.

Extend fork IPC state and child restore handling to carry more complete
CPU state: the remaining EL0 TLS registers (TPIDRRO_EL0 and
TPIDR2_EL0), the PAC keys, the clone flags, and the child TID address.
Restore the child's CPSR from the parent's saved SPSR instead of
forcing EL0t. Add child process monitoring so that a host child exit
queues SIGCHLD and can wake Linux-style wait and signal handling.

Align non-fixed file-backed MAP_SHARED mappings to 2 MiB stage-2
boundaries to avoid HVF mapping issues on Apple Silicon.

Create symlinks whose guest target is an absolute path as relative host
links inside the sysroot, and add small Linux compatibility behavior
for sync_file_range and pipe F_SETNOSIGPIPE.

These changes were tested with an Ubuntu arm64 rootfs using shell
pipelines, /proc checks, and apt-get update smoke testing.
--- src/core/elf.c | 36 +++++-- src/core/elf.h | 1 + src/core/guest.c | 13 +++ src/core/stack.c | 14 +-- src/runtime/fork-state.c | 2 +- src/runtime/fork-state.h | 28 ++++- src/runtime/forkipc.c | 201 +++++++++++++++++++++++++++++++++--- src/runtime/procemu.c | 66 ++++++++---- src/syscall/abi.h | 1 + src/syscall/dispatch.tbl | 1 + src/syscall/exec.c | 6 +- src/syscall/fs.c | 146 +++++++++++++++++++++++++- src/syscall/io.c | 4 - src/syscall/mem.c | 72 +++++++------ src/syscall/proc-identity.c | 86 ++++++++++----- src/syscall/proc-state.c | 19 ++-- src/syscall/proc.c | 1 + src/syscall/sys.c | 36 +------ src/syscall/syscall.c | 18 +++- 19 files changed, 580 insertions(+), 171 deletions(-) diff --git a/src/core/elf.c b/src/core/elf.c index 575e1c7..4b42c9c 100644 --- a/src/core/elf.c +++ b/src/core/elf.c @@ -8,6 +8,7 @@ * segments, and copies them into guest memory. */ +#include #include #include #include @@ -19,19 +20,21 @@ #include "debug/log.h" #include "utils.h" -int elf_load(const char *path, elf_info_t *info) +static int elf_load_impl(const char *path, elf_info_t *info, bool quiet) { memset(info, 0, sizeof(*info)); FILE *f = fopen(path, "rb"); if (!f) { - perror(path); + if (!quiet) + perror(path); return -1; } elf64_ehdr_t ehdr; if (fread(&ehdr, sizeof(ehdr), 1, f) != 1) { - log_error("%s: failed to read ELF header", path); + if (!quiet) + log_error("%s: failed to read ELF header", path); fclose(f); return -1; } @@ -39,21 +42,24 @@ int elf_load(const char *path, elf_info_t *info) /* Reject non-ELF inputs before interpreting the rest of the header. */ if (ehdr.e_ident[0] != ELFMAG0 || ehdr.e_ident[1] != ELFMAG1 || ehdr.e_ident[2] != ELFMAG2 || ehdr.e_ident[3] != ELFMAG3) { - log_error("%s: not an ELF file", path); + if (!quiet) + log_error("%s: not an ELF file", path); fclose(f); return -1; } /* elfuse only implements the 64-bit Linux ABI. 
*/ if (ehdr.e_ident[EI_CLASS] != ELFCLASS64) { - log_error("%s: not a 64-bit ELF", path); + if (!quiet) + log_error("%s: not a 64-bit ELF", path); fclose(f); return -1; } /* aarch64-linux user binaries are little-endian in the supported mode. */ if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { - log_error("%s: not little-endian", path); + if (!quiet) + log_error("%s: not little-endian", path); fclose(f); return -1; } @@ -62,8 +68,9 @@ int elf_load(const char *path, elf_info_t *info) * diagnostic instead of a generic parse failure. */ if (ehdr.e_machine != EM_AARCH64 && ehdr.e_machine != EM_X86_64) { - log_error("%s: unsupported architecture (e_machine=%u)", path, - ehdr.e_machine); + if (!quiet) + log_error("%s: unsupported architecture (e_machine=%u)", path, + ehdr.e_machine); fclose(f); return -1; } @@ -72,7 +79,8 @@ int elf_load(const char *path, elf_info_t *info) * the load base that keeps them away from elfuse's reserved regions. */ if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) { - log_error("%s: not an executable (e_type=%u)", path, ehdr.e_type); + if (!quiet) + log_error("%s: not an executable (e_type=%u)", path, ehdr.e_type); fclose(f); return -1; } @@ -204,6 +212,16 @@ int elf_load(const char *path, elf_info_t *info) return 0; } +int elf_load(const char *path, elf_info_t *info) +{ + return elf_load_impl(path, info, false); +} + +int elf_load_quiet(const char *path, elf_info_t *info) +{ + return elf_load_impl(path, info, true); +} + int elf_map_segments(const elf_info_t *info, const char *path, void *guest_base, diff --git a/src/core/elf.h b/src/core/elf.h index a8ce7ce..1f8428d 100644 --- a/src/core/elf.h +++ b/src/core/elf.h @@ -105,6 +105,7 @@ typedef struct { * Returns 0 on success, -1 on failure. Does NOT copy to guest yet. */ int elf_load(const char *path, elf_info_t *info); +int elf_load_quiet(const char *path, elf_info_t *info); /* Copy ELF segments into guest memory. Call after elf_load() and guest_init(). 
* Also copies program headers into guest memory for AT_PHDR. load_base is added diff --git a/src/core/guest.c b/src/core/guest.c index 01422b8..1648f6f 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -1661,6 +1661,19 @@ int guest_get_used_regions(const guest_t *g, n++; } + /* Interpreter high block. The dynamic linker stores process-global state + * such as __stack_chk_guard in its high mapping just above interp_base. + * Fork children that take the region-copy path must inherit those bytes; + * otherwise libc's post-fork canary check observes zeroed guard storage + * and aborts before the child can exec. + */ + if (n < max && g->interp_base > 0 && + g->interp_base <= g->guest_size - BLOCK_2MIB) { + out[n].offset = g->interp_base; + out[n].size = BLOCK_2MIB; + n++; + } + /* ELF + brk region: from elf_load_min (set by ELF loader) to brk_current. * The lower bound is the actual ELF load address, not ELF_DEFAULT_BASE: * ET_EXECs linked below 0x400000 (e.g. at 0x200000) have segments below the diff --git a/src/core/stack.c b/src/core/stack.c index e0a8d19..319d49b 100644 --- a/src/core/stack.c +++ b/src/core/stack.c @@ -16,7 +16,7 @@ #include #include "core/stack.h" -#include "syscall/abi.h" /* GUEST_UID, GUEST_GID */ +#include "syscall/proc.h" /* Linux aarch64 HWCAP bits (from asm/hwcap.h). Only the bits the VZ-sanitized * ID registers actually advertise are listed here; HWCAP bits left out (e.g., @@ -284,12 +284,12 @@ uint64_t build_linux_stack(guest_t *g, AUX(AT_PHENT, elf_info->phentsize); AUX(AT_PHNUM, elf_info->phnum); AUX(AT_ENTRY, elf_info->entry + elf_load_base); - AUX(AT_UID, GUEST_UID); - AUX(AT_EUID, GUEST_UID); - AUX(AT_GID, GUEST_GID); - AUX(AT_EGID, GUEST_GID); - /* Bionic's __libc_init_AT_SECURE aborts when AT_SECURE is absent. elfuse - * never elevates privileges, so AT_SECURE is always 0. 
+ AUX(AT_UID, proc_get_uid()); + AUX(AT_EUID, proc_get_euid()); + AUX(AT_GID, proc_get_gid()); + AUX(AT_EGID, proc_get_egid()); + /* Bionic's __libc_init_AT_SECURE aborts when AT_SECURE is absent. + * elfuse never elevates privileges, so AT_SECURE is always 0. */ AUX(AT_SECURE, 0); AUX(AT_HWCAP2, query_hwcap2()); diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index 8f00258..bca4881 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -71,7 +71,7 @@ int fork_ipc_read_all(int fd, void *buf, size_t len) * message comfortably below that limit and stream large fd sets in multiple * chunks. */ -#define FORK_IPC_FD_CHUNK 120 +#define FORK_IPC_FD_CHUNK 32 int fork_ipc_send_fds(int sock, const int *fds, int count) { diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h index 18f3443..a71c336 100644 --- a/src/runtime/fork-state.h +++ b/src/runtime/fork-state.h @@ -18,14 +18,30 @@ /* Magic values for IPC frame delimiters */ #define IPC_MAGIC_HEADER 0x454C464BU /* "ELFK" */ #define IPC_MAGIC_SENTINEL 0x454C4F4BU /* "ELOK" */ -/* Bumped to 11 when regions_tracker_stale was added to process state so forked - * children preserve mprotect fast-path correctness. +/* Bumped to 13 when pointer-authentication key registers and the remaining + * EL0 TLS registers were added so forked children and clone-created vCPUs + * resume with the same userspace CPU context as the parent. New Ubuntu arm64 + * userspace can use PAC in libc and TLS-adjacent state during fork return. + * + * Bumped to 12 when clone_flags/child_tid_gva were added so fork-process + * children can apply CLONE_CHILD_SETTID/CLEARTID inside their own snapshot. + * + * Bumped to 11 when regions_tracker_stale was added to process state so + * forked children preserve mprotect fast-path correctness. 
* * Bumped to 10 when the rosetta placement / kbuf / ttbr1 tuple was added so a * rosetta-aware child rejects an older parent's header instead of trying to * interpret unknown trailing fields. */ -#define IPC_VERSION 11 +#define IPC_VERSION 13 + +typedef struct { + uint64_t apiakeylo_el1, apiakeyhi_el1; + uint64_t apibkeylo_el1, apibkeyhi_el1; + uint64_t apdakeylo_el1, apdakeyhi_el1; + uint64_t apdbkeylo_el1, apdbkeyhi_el1; + uint64_t apgakeylo_el1, apgakeyhi_el1; +} ipc_pauth_keys_t; typedef struct { uint32_t magic; @@ -60,6 +76,8 @@ typedef struct { uint64_t rosetta_entry; uint64_t kbuf_gpa; uint64_t ttbr1; + uint64_t clone_flags; + uint64_t child_tid_gva; } ipc_header_t; typedef struct { @@ -74,8 +92,10 @@ typedef struct { * access faults. */ uint64_t ttbr1_el1; - uint64_t sctlr_el1, tcr_el1, mair_el1, cpacr_el1, tpidr_el0, sp_el1; + uint64_t sctlr_el1, tcr_el1, mair_el1, cpacr_el1; + uint64_t tpidr_el0, tpidrro_el0, tpidr2_el0, sp_el1; uint64_t x[31]; + ipc_pauth_keys_t pauth_keys; vcpu_simd_state_t simd_state; } ipc_registers_t; diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 19de841..2b311d0 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* fdopendir, for DIR* reconstruction in child */ #include #include /* fclonefileat for CoW shm snapshots */ @@ -48,10 +49,143 @@ #include "debug/log.h" #include "debug/syscall-hist.h" +typedef struct fork_child_monitor_arg { + pid_t host_pid; +} fork_child_monitor_arg_t; + +static void *fork_child_monitor_main(void *arg) +{ + fork_child_monitor_arg_t *m = (fork_child_monitor_arg_t *) arg; + pid_t host_pid = m->host_pid; + free(m); + + int kq = kqueue(); + if (kq < 0) { + log_warn("clone: child monitor kqueue failed for pid=%d: %s", + (int) host_pid, strerror(errno)); + return NULL; + } + + struct kevent kev; + EV_SET(&kev, (uintptr_t) host_pid, EVFILT_PROC, EV_ADD | EV_ONESHOT, + NOTE_EXIT, 0, NULL); + if (kevent(kq, &kev, 1, 
NULL, 0, NULL) < 0) { + if (errno != ESRCH) + log_warn("clone: child monitor kevent add pid=%d failed: %s", + (int) host_pid, strerror(errno)); + close(kq); + return NULL; + } + + do { + errno = 0; + } while (kevent(kq, NULL, 0, &kev, 1, NULL) < 0 && errno == EINTR); + close(kq); + signal_queue(LINUX_SIGCHLD); + return NULL; +} + +static void fork_child_monitor_start(pid_t host_pid) +{ + fork_child_monitor_arg_t *arg = calloc(1, sizeof(*arg)); + if (!arg) { + log_warn("clone: failed to allocate child monitor for pid=%d", + (int) host_pid); + return; + } + arg->host_pid = host_pid; + + pthread_t thread; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + int err = pthread_create(&thread, &attr, fork_child_monitor_main, arg); + pthread_attr_destroy(&attr); + if (err != 0) { + log_warn("clone: failed to start child monitor for pid=%d: %s", + (int) host_pid, strerror(err)); + free(arg); + } +} + +/* Pointer-authentication sysregs were added to recent macOS SDKs. Define the + * architectural encodings as a fallback so older SDK headers can still build + * the runtime. 
+ */ +#ifndef HV_SYS_REG_APIAKEYLO_EL1 +#define HV_SYS_REG_APIAKEYLO_EL1 ((hv_sys_reg_t) 0xc108) +#define HV_SYS_REG_APIAKEYHI_EL1 ((hv_sys_reg_t) 0xc109) +#define HV_SYS_REG_APIBKEYLO_EL1 ((hv_sys_reg_t) 0xc10a) +#define HV_SYS_REG_APIBKEYHI_EL1 ((hv_sys_reg_t) 0xc10b) +#define HV_SYS_REG_APDAKEYLO_EL1 ((hv_sys_reg_t) 0xc110) +#define HV_SYS_REG_APDAKEYHI_EL1 ((hv_sys_reg_t) 0xc111) +#define HV_SYS_REG_APDBKEYLO_EL1 ((hv_sys_reg_t) 0xc112) +#define HV_SYS_REG_APDBKEYHI_EL1 ((hv_sys_reg_t) 0xc113) +#define HV_SYS_REG_APGAKEYLO_EL1 ((hv_sys_reg_t) 0xc118) +#define HV_SYS_REG_APGAKEYHI_EL1 ((hv_sys_reg_t) 0xc119) +#endif +#ifndef HV_SYS_REG_TPIDRRO_EL0 +#define HV_SYS_REG_TPIDRRO_EL0 ((hv_sys_reg_t) 0xde83) +#endif +#ifndef HV_SYS_REG_TPIDR2_EL0 +#define HV_SYS_REG_TPIDR2_EL0 ((hv_sys_reg_t) 0xde85) +#endif + +static void capture_pauth_keys(hv_vcpu_t vcpu, ipc_pauth_keys_t *keys) +{ + keys->apiakeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APIAKEYLO_EL1); + keys->apiakeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APIAKEYHI_EL1); + keys->apibkeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APIBKEYLO_EL1); + keys->apibkeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APIBKEYHI_EL1); + keys->apdakeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APDAKEYLO_EL1); + keys->apdakeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APDAKEYHI_EL1); + keys->apdbkeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APDBKEYLO_EL1); + keys->apdbkeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APDBKEYHI_EL1); + keys->apgakeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APGAKEYLO_EL1); + keys->apgakeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APGAKEYHI_EL1); +} + +static hv_return_t restore_pauth_keys(hv_vcpu_t vcpu, + const ipc_pauth_keys_t *keys) +{ + hv_return_t r; + +#define SET_PAUTH_KEY(reg, val) \ + do { \ + r = hv_vcpu_set_sys_reg(vcpu, reg, val); \ + if (r != HV_SUCCESS) \ + return r; \ + } while (0) + + SET_PAUTH_KEY(HV_SYS_REG_APIAKEYLO_EL1, keys->apiakeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APIAKEYHI_EL1, 
keys->apiakeyhi_el1); + SET_PAUTH_KEY(HV_SYS_REG_APIBKEYLO_EL1, keys->apibkeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APIBKEYHI_EL1, keys->apibkeyhi_el1); + SET_PAUTH_KEY(HV_SYS_REG_APDAKEYLO_EL1, keys->apdakeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APDAKEYHI_EL1, keys->apdakeyhi_el1); + SET_PAUTH_KEY(HV_SYS_REG_APDBKEYLO_EL1, keys->apdbkeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APDBKEYHI_EL1, keys->apdbkeyhi_el1); + SET_PAUTH_KEY(HV_SYS_REG_APGAKEYLO_EL1, keys->apgakeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APGAKEYHI_EL1, keys->apgakeyhi_el1); +#undef SET_PAUTH_KEY + + return HV_SUCCESS; +} + /* fork_child_main. */ static int fork_child_vfork_notify_fd = -1; +/* Linux clone flags */ +#define LINUX_CLONE_VM 0x00000100 +#define LINUX_CLONE_VFORK 0x00004000 +#define LINUX_CLONE_THREAD 0x00010000 +#define LINUX_CLONE_SETTLS 0x00080000 +#define LINUX_CLONE_PARENT_SETTID 0x00100000 +#define LINUX_CLONE_CHILD_CLEARTID 0x00200000 +#define LINUX_CLONE_CHILD_SETTID 0x01000000 +/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */ + void fork_notify_vfork_exec(void) { if (fork_child_vfork_notify_fd < 0) @@ -275,6 +409,16 @@ int fork_child_main(int ipc_fd, return 1; } + if ((hdr.clone_flags & LINUX_CLONE_CHILD_SETTID) && hdr.child_tid_gva) { + int32_t tid32 = (int32_t) hdr.child_pid; + if (guest_write_small(&g, hdr.child_tid_gva, &tid32, sizeof(tid32)) < + 0) { + log_error("fork-child: failed to write CLONE_CHILD_SETTID"); + guest_destroy(&g); + return 1; + } + } + /* POSIX: "Signals pending to the parent shall not be pending to the child." * Clear pending bitmask and RT queue before applying state. 
* signal_set_state() is deferred until after thread_register_main() so that @@ -313,9 +457,14 @@ int fork_child_main(int ipc_fd, HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, regs.ttbr0_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR1_EL1, regs.ttbr1_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CPACR_EL1, regs.cpacr_el1)); + uint64_t child_sp_el1 = g.ipa_base + g.shim_data_base + BLOCK_2MIB; HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, regs.sp_el0)); - HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, regs.sp_el1)); + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, child_sp_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, regs.tpidr_el0)); + HV_CHECK( + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDRRO_EL0, regs.tpidrro_el0)); + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR2_EL0, regs.tpidr2_el0)); + HV_CHECK(restore_pauth_keys(vcpu, &regs.pauth_keys)); /* TPIDR_EL1 is set by the host (never inherited from the parent's register * snapshot) because it must point at the child's own shim_globals base in @@ -355,13 +504,15 @@ int fork_child_main(int ipc_fd, HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, regs.elr_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SPSR_EL1, regs.spsr_el1)); HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_PC, regs.elr_el1)); - HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_CPSR, 0)); /* EL0t */ + HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_CPSR, regs.spsr_el1)); /* Register the fork child's main thread in the thread table. Without this, * current_thread is NULL and any syscall handler that accesses per-thread * state (signal masks, ptrace, CLONE_THREAD) will dereference NULL. 
*/ - thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1); + thread_register_main(vcpu, vexit, hdr.child_pid, child_sp_el1); + if ((hdr.clone_flags & LINUX_CLONE_CHILD_CLEARTID) && hdr.child_tid_gva) + current_thread->clear_child_tid = hdr.child_tid_gva; /* Re-publish identity into the child's shim-globals cache: the CoW / region * copy inherits the parent's pid/uid values, and the shim's identity fast @@ -420,16 +571,6 @@ int fork_child_main(int ipc_fd, /* sys_clone. */ -/* Linux clone flags */ -#define LINUX_CLONE_VM 0x00000100 -#define LINUX_CLONE_VFORK 0x00004000 -#define LINUX_CLONE_THREAD 0x00010000 -#define LINUX_CLONE_SETTLS 0x00080000 -#define LINUX_CLONE_PARENT_SETTID 0x00100000 -#define LINUX_CLONE_CHILD_CLEARTID 0x00200000 -#define LINUX_CLONE_CHILD_SETTID 0x01000000 -/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */ - /* Namespace flags. elfuse implements no namespace isolation. Both sys_clone and * sys_clone3 reject them. */ @@ -467,7 +608,8 @@ typedef struct { uint64_t child_stack, flags, tls; /* Parent system regs to copy into the new vCPU */ uint64_t elr, spsr, vbar, ttbr0, sctlr, tcr, mair, cpacr; - uint64_t tpidr; + uint64_t tpidr, tpidrro, tpidr2; + ipc_pauth_keys_t pauth_keys; uint64_t gprs[31]; uint64_t sp_el1; vcpu_simd_state_t simd_state; @@ -559,6 +701,12 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu, parent_mair = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_MAIR_EL1); parent_cpacr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_CPACR_EL1); parent_tpidr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDR_EL0); + uint64_t parent_tpidrro = + vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDRRO_EL0); + uint64_t parent_tpidr2 = + vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDR2_EL0); + ipc_pauth_keys_t parent_pauth_keys; + capture_pauth_keys(parent_vcpu, &parent_pauth_keys); uint64_t parent_gprs[31]; vcpu_snapshot_gprs(parent_vcpu, parent_gprs); @@ -587,6 +735,9 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu, tca->mair = 
parent_mair; tca->cpacr = parent_cpacr; tca->tpidr = parent_tpidr; + tca->tpidrro = parent_tpidrro; + tca->tpidr2 = parent_tpidr2; + tca->pauth_keys = parent_pauth_keys; memcpy(tca->gprs, parent_gprs, sizeof(parent_gprs)); tca->sp_el1 = child_sp_el1; vcpu_snapshot_simd(parent_vcpu, &tca->simd_state); @@ -762,6 +913,9 @@ static void *thread_create_and_run(void *arg) } else { WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, tca->tpidr)); } + WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDRRO_EL0, tca->tpidrro)); + WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR2_EL0, tca->tpidr2)); + WORKER_HV(restore_pauth_keys(vcpu, &tca->pauth_keys)); /* ELR_EL1 = clone return point (same as parent) */ WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, tca->elr)); @@ -943,6 +1097,12 @@ static int64_t sys_clone_vm(hv_vcpu_t parent_vcpu, uint64_t parent_mair = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_MAIR_EL1); uint64_t parent_cpacr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_CPACR_EL1); uint64_t parent_tpidr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDR_EL0); + uint64_t parent_tpidrro = + vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDRRO_EL0); + uint64_t parent_tpidr2 = + vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDR2_EL0); + ipc_pauth_keys_t parent_pauth_keys; + capture_pauth_keys(parent_vcpu, &parent_pauth_keys); uint64_t parent_gprs[31]; vcpu_snapshot_gprs(parent_vcpu, parent_gprs); @@ -970,6 +1130,9 @@ static int64_t sys_clone_vm(hv_vcpu_t parent_vcpu, tca->mair = parent_mair; tca->cpacr = parent_cpacr; tca->tpidr = parent_tpidr; + tca->tpidrro = parent_tpidrro; + tca->tpidr2 = parent_tpidr2; + tca->pauth_keys = parent_pauth_keys; memcpy(tca->gprs, parent_gprs, sizeof(parent_gprs)); tca->sp_el1 = child_sp_el1; vcpu_snapshot_simd(parent_vcpu, &tca->simd_state); @@ -1070,6 +1233,9 @@ static void *vm_clone_thread_run(void *arg) } else { HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, tca->tpidr)); } + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, 
HV_SYS_REG_TPIDRRO_EL0, tca->tpidrro)); + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR2_EL0, tca->tpidr2)); + HV_CHECK(restore_pauth_keys(vcpu, &tca->pauth_keys)); /* ELR_EL1 = clone return point */ HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, tca->elr)); @@ -1528,6 +1694,8 @@ int64_t sys_clone(hv_vcpu_t vcpu, .rosetta_entry = g->rosetta_entry, .kbuf_gpa = g->kbuf_gpa, .ttbr1 = g->ttbr1, + .clone_flags = flags, + .child_tid_gva = ctid_gva, }; if (fork_ipc_write_all(ipc_sock, &hdr, sizeof(hdr)) < 0) { log_error("clone: failed to send header"); @@ -1561,7 +1729,10 @@ int64_t sys_clone(hv_vcpu_t vcpu, regs.mair_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_MAIR_EL1); regs.cpacr_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_CPACR_EL1); regs.tpidr_el0 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TPIDR_EL0); + regs.tpidrro_el0 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TPIDRRO_EL0); + regs.tpidr2_el0 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TPIDR2_EL0); regs.sp_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_SP_EL1); + capture_pauth_keys(vcpu, &regs.pauth_keys); vcpu_snapshot_gprs(vcpu, regs.x); vcpu_snapshot_simd(vcpu, &regs.simd_state); @@ -1660,6 +1831,8 @@ int64_t sys_clone(hv_vcpu_t vcpu, if (waited == child_host_pid) proc_mark_child_exited(child_host_pid, status); } + } else { + fork_child_monitor_start(child_host_pid); } log_debug("clone: child pid=%lld (host=%d)", (long long) child_guest_pid, diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index 7d28599..816df4c 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -1676,7 +1676,13 @@ static int pty_keepalive_register_locked(int master_host_fd, if (pty_keepalive_table[slot].slave_host_fd >= 0 && pty_keepalive_table[slot].slave_host_fd != slave_host_fd) close(pty_keepalive_table[slot].slave_host_fd); - pty_keepalive_table[slot].slave_host_fd = slave_host_fd; + if (stale_open_once) { + pty_keepalive_table[slot].slave_host_fd = slave_host_fd; + } else { + if (slave_host_fd >= 0) + close(slave_host_fd); + 
pty_keepalive_table[slot].slave_host_fd = PTY_KEEPALIVE_FREE; + } pty_keepalive_table[slot].linux_pts_num = linux_pts_num; pty_keepalive_table[slot].stale_open_once = stale_open_once; if (slave_path) @@ -2051,21 +2057,27 @@ void proc_pty_dup_keepalive_locked(int src_master_host_fd, int slot = pty_keepalive_find_master_locked(src_master_host_fd); if (slot < 0) return; - int dst_slave = dup(pty_keepalive_table[slot].slave_host_fd); - if (dst_slave < 0) - return; + int dst_slave = -1; + if (pty_keepalive_table[slot].slave_host_fd >= 0) { + dst_slave = dup(pty_keepalive_table[slot].slave_host_fd); + if (dst_slave < 0) + return; + } uint32_t src_pts_num = pty_keepalive_table[slot].linux_pts_num; char src_slave_path[PTY_SLAVE_PATH_MAX]; memcpy(src_slave_path, pty_keepalive_table[slot].slave_path, PTY_SLAVE_PATH_MAX); - /* dup(2) clears FD_CLOEXEC; the keepalive must not survive exec into a - * guest child that has no map back to it. - */ - int fdflags = fcntl(dst_slave, F_GETFD); - if (fdflags < 0 || fcntl(dst_slave, F_SETFD, fdflags | FD_CLOEXEC) < 0) { - close(dst_slave); - return; + if (dst_slave >= 0) { + /* dup(2) clears FD_CLOEXEC; the keepalive must not survive exec into + * a guest child that has no map back to it. + */ + int fdflags = fcntl(dst_slave, F_GETFD); + if (fdflags < 0 || + fcntl(dst_slave, F_SETFD, fdflags | FD_CLOEXEC) < 0) { + close(dst_slave); + return; + } } int rc = pty_keepalive_register_locked(dst_master_host_fd, dst_slave, @@ -2076,7 +2088,8 @@ void proc_pty_dup_keepalive_locked(int src_master_host_fd, * fd that should not already be in the table unless a prior close * skipped proc_pty_close_keepalive. 
*/ - close(dst_slave); + if (dst_slave >= 0) + close(dst_slave); } } @@ -2156,11 +2169,18 @@ int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries, if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE) continue; - /* dup under the lock so the slave fd cannot be closed and the host fd - * number recycled before SCM_RIGHTS reads it. The caller closes the dup - * after the send completes. + /* Live entries keep only the slave path so the master can observe HUP + * when the real child-side slave closes. Open a temporary slave fd only + * for SCM_RIGHTS handoff to the fork child; stale one-shot entries may + * already carry a retained slave fd and can still be duped. */ - int duped = dup(pty_keepalive_table[i].slave_host_fd); + int duped = -1; + if (pty_keepalive_table[i].slave_host_fd >= 0) { + duped = dup(pty_keepalive_table[i].slave_host_fd); + } else if (pty_keepalive_table[i].slave_path[0] != '\0') { + duped = open(pty_keepalive_table[i].slave_path, + O_RDWR | O_NOCTTY | O_CLOEXEC); + } if (duped < 0) continue; @@ -2554,10 +2574,11 @@ int proc_intercept_open(const guest_t *g, "VmRSS:\t%llu kB\n" "Threads:\t%d\n", name, (long long) proc_get_pid(), (long long) proc_get_pid(), - (long long) proc_get_ppid(), GUEST_UID, GUEST_UID, GUEST_UID, - GUEST_UID, GUEST_GID, GUEST_GID, GUEST_GID, GUEST_GID, - (unsigned long long) vm_size_kb, (unsigned long long) vm_size_kb, - (unsigned long long) vm_rss_kb, threads); + (long long) proc_get_ppid(), proc_get_uid(), proc_get_euid(), + proc_get_suid(), proc_get_euid(), proc_get_gid(), proc_get_egid(), + proc_get_sgid(), proc_get_egid(), (unsigned long long) vm_size_kb, + (unsigned long long) vm_size_kb, (unsigned long long) vm_rss_kb, + threads); } /* /proc/self/limits -> resource limits from prlimit64 cache */ @@ -2660,8 +2681,9 @@ int proc_intercept_open(const guest_t *g, "Gid:\t%d\t%d\t%d\t%d\n" "Threads:\t%d\n", proc_comm_name(), (long long) proc_get_pid(), tid, - (long long) proc_get_ppid(), GUEST_UID, 
GUEST_UID, GUEST_UID, - GUEST_UID, GUEST_GID, GUEST_GID, GUEST_GID, GUEST_GID, + (long long) proc_get_ppid(), proc_get_uid(), proc_get_euid(), + proc_get_suid(), proc_get_euid(), proc_get_gid(), + proc_get_egid(), proc_get_sgid(), proc_get_egid(), thread_active_count()); } diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 8154281..f3459c7 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -75,6 +75,7 @@ #define SYS_sync 81 #define SYS_fsync 82 #define SYS_fdatasync 83 +#define SYS_sync_file_range 84 #define SYS_utimensat 88 #define SYS_exit 93 #define SYS_exit_group 94 diff --git a/src/syscall/dispatch.tbl b/src/syscall/dispatch.tbl index 3bb0156..fe23a30 100644 --- a/src/syscall/dispatch.tbl +++ b/src/syscall/dispatch.tbl @@ -100,6 +100,7 @@ SYS_fremovexattr sc_fremovexattr 1 SYS_sync sc_sync 1 SYS_fsync sc_fsync 1 SYS_fdatasync sc_fdatasync 1 +SYS_sync_file_range sc_sync_file_range 0 SYS_msync sc_msync 0 SYS_membarrier sc_membarrier 0 diff --git a/src/syscall/exec.c b/src/syscall/exec.c index ef57f7f..b95481a 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -328,9 +328,9 @@ int64_t sys_execve(hv_vcpu_t vcpu, * binfmt_script. */ elf_info_t elf_info; - if (elf_load(path_host, &elf_info) < 0) { - /* Not a valid ELF. Check if it's a script with a shebang line. Read the - * first 256 bytes and look for "#!" at the start. + if (elf_load_quiet(path_host, &elf_info) < 0) { + /* Not a valid ELF. Check if it's a script with a shebang line. + * Read the first 256 bytes and look for "#!" at the start. */ int script_fd = open(path_host, O_RDONLY); if (script_fd < 0) { diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 51264be..6eede8d 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -288,6 +288,136 @@ static int64_t reject_unsupported_fuse_path_op(const path_translation_t *tx) return tx && tx->fuse_path ? 
-LINUX_ENOSYS : INT64_MIN; } +static int path_parent_copy(const char *path, char *out, size_t outsz) +{ + size_t len = str_copy_trunc(out, path, outsz); + if (len >= outsz) { + errno = ENAMETOOLONG; + return -1; + } + + char *slash = strrchr(out, '/'); + if (!slash) { + str_copy_trunc(out, ".", outsz); + } else if (slash == out) { + out[1] = '\0'; + } else { + *slash = '\0'; + } + return 0; +} + +static int append_path_part(char *out, + size_t outsz, + size_t *used, + const char *part, + size_t part_len) +{ + if (*used + part_len >= outsz) { + errno = ENAMETOOLONG; + return -1; + } + memcpy(out + *used, part, part_len); + *used += part_len; + out[*used] = '\0'; + return 0; +} + +static int relative_path_between(const char *from_dir, + const char *to_path, + char *out, + size_t outsz) +{ + size_t common = 0; + for (size_t i = 0; from_dir[i] && to_path[i] && from_dir[i] == to_path[i]; + i++) { + if (from_dir[i] == '/') + common = i; + } + + if (common == 0) { + errno = EXDEV; + return -1; + } + + const char *from_tail = from_dir + common; + while (*from_tail == '/') + from_tail++; + const char *to_tail = to_path + common; + while (*to_tail == '/') + to_tail++; + + size_t used = 0; + out[0] = '\0'; + for (const char *p = from_tail; *p;) { + while (*p == '/') + p++; + if (!*p) + break; + const char *next = strchr(p, '/'); + if (append_path_part(out, outsz, &used, "../", 3) < 0) + return -1; + p = next ? 
next + 1 : p + strlen(p); + } + + if (*to_tail) { + if (append_path_part(out, outsz, &used, to_tail, strlen(to_tail)) < 0) + return -1; + } else if (used == 0) { + if (append_path_part(out, outsz, &used, ".", 1) < 0) + return -1; + } else if (used >= 1 && out[used - 1] == '/') { + out[used - 1] = '\0'; + } + return 0; +} + +static const char *host_relative_symlink_target(const char *guest_target, + host_fd_ref_t *dir_ref, + const path_translation_t *tx, + char *buf, + size_t bufsz) +{ + char sysroot[LINUX_PATH_MAX]; + if (!guest_target || guest_target[0] != '/' || + !proc_sysroot_snapshot(sysroot, sizeof(sysroot))) + return guest_target; + + char target_host[LINUX_PATH_MAX]; + int n = snprintf(target_host, sizeof(target_host), "%s%s", sysroot, + guest_target); + if (n < 0 || (size_t) n >= sizeof(target_host)) { + errno = ENAMETOOLONG; + return NULL; + } + + char link_host[LINUX_PATH_MAX]; + if (tx->host_path[0] == '/') { + if (str_copy_trunc(link_host, tx->host_path, sizeof(link_host)) >= + sizeof(link_host)) { + errno = ENAMETOOLONG; + return NULL; + } + } else { + char dir_host[LINUX_PATH_MAX]; + if (fcntl(dir_ref->fd, F_GETPATH, dir_host) < 0) + return NULL; + n = snprintf(link_host, sizeof(link_host), "%s/%s", dir_host, + tx->host_path); + if (n < 0 || (size_t) n >= sizeof(link_host)) { + errno = ENAMETOOLONG; + return NULL; + } + } + + char link_parent[LINUX_PATH_MAX]; + if (path_parent_copy(link_host, link_parent, sizeof(link_parent)) < 0) + return NULL; + if (relative_path_between(link_parent, target_host, buf, bufsz) < 0) + return NULL; + return buf; +} + /* open/close. 
*/ int64_t sys_openat_path(guest_t *g, @@ -1250,6 +1380,10 @@ int64_t sys_pipe2(guest_t *g, uint64_t fds_gva, int linux_flags) if (pipe(host_fds) < 0) return linux_errno(); +#ifdef F_SETNOSIGPIPE + (void) fcntl(host_fds[1], F_SETNOSIGPIPE, 1); +#endif + int guest_fds[2]; guest_fds[0] = fd_alloc(FD_PIPE, host_fds[0], NULL); if (guest_fds[0] < 0) { @@ -1623,8 +1757,16 @@ int64_t sys_symlinkat(guest_t *g, if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; - /* Resolve linkpath (the new symlink location) through sysroot */ - if (symlinkat(target, dir_ref.fd, tx.host_path) < 0) { + char relative_target[LINUX_PATH_MAX]; + const char *host_target = host_relative_symlink_target( + target, &dir_ref, &tx, relative_target, sizeof(relative_target)); + if (!host_target) { + host_fd_ref_close(&dir_ref); + return linux_errno(); + } + + /* Resolve linkpath (the new symlink location) through sysroot. */ + if (symlinkat(host_target, dir_ref.fd, tx.host_path) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } diff --git a/src/syscall/io.c b/src/syscall/io.c index acdddfe..e88e9bf 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -833,10 +833,6 @@ static int64_t io_write_result(ssize_t ret) if (ret >= 0) return ret; - int saved_errno = errno; - if (saved_errno == EPIPE) - signal_queue(LINUX_SIGPIPE); - errno = saved_errno; return linux_errno(); } diff --git a/src/syscall/mem.c b/src/syscall/mem.c index b682b75..d6edb05 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -266,7 +266,8 @@ static void split_regions_at_boundary(guest_t *g, uint64_t boundary) static uint64_t find_free_gap_inner(const guest_t *g, uint64_t length, uint64_t min_addr, - uint64_t max_addr) + uint64_t max_addr, + uint64_t align) { /* Round the search start up to the next host-page boundary so an unaligned * addr hint cannot return a result that lands inside a host page already @@ -275,8 +276,7 @@ static uint64_t find_free_gap_inner(const guest_t *g, * aligning to the guest 4 
KiB page is not enough. Advance past each walked * region to the same boundary for the same reason. */ - size_t hps = host_page_size_cached(); - uint64_t gap_start = ALIGN_UP(min_addr, hps); + uint64_t gap_start = ALIGN_UP(min_addr, align); /* Skip the prefix of regions entirely below gap_start in O(log n). After a * successful allocation the gap hint advances near or past the existing @@ -308,7 +308,7 @@ static uint64_t find_free_gap_inner(const guest_t *g, return gap_start; /* Region overlaps; advance past it and round to the next host page */ - gap_start = ALIGN_UP(g->regions[i].end, hps); + gap_start = ALIGN_UP(g->regions[i].end, align); } /* Check trailing space after all regions */ @@ -326,33 +326,27 @@ static uint64_t find_free_gap_inner(const guest_t *g, static uint64_t find_free_gap(guest_t *g, uint64_t length, uint64_t min_addr, - uint64_t max_addr) + uint64_t max_addr, + uint64_t align) { /* RX and RW mappings advance independently, so keep separate hints. */ uint64_t *hint = (min_addr < MMAP_BASE) ? &g->mmap_rx_gap_hint : &g->mmap_rw_gap_hint; - /* Advance the hint to the next host-page boundary so the following - * sequential allocation lands on an address that the kernel accepts for - * mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The tradeoff - * is up to host_page-1 bytes of address-space waste per small allocation; - * physical pages are still demand-paged, so RAM cost is unchanged. 
- */ - size_t hps = host_page_size_cached(); - /* Try cached hint first (only if within the valid range) */ if (*hint >= min_addr && *hint < max_addr) { - uint64_t result = find_free_gap_inner(g, length, *hint, max_addr); + uint64_t result = + find_free_gap_inner(g, length, *hint, max_addr, align); if (result != UINT64_MAX) { - *hint = ALIGN_UP(result + length, hps); + *hint = ALIGN_UP(result + length, align); return result; } } /* Full scan from base */ - uint64_t result = find_free_gap_inner(g, length, min_addr, max_addr); + uint64_t result = find_free_gap_inner(g, length, min_addr, max_addr, align); if (result != UINT64_MAX) - *hint = ALIGN_UP(result + length, hps); + *hint = ALIGN_UP(result + length, align); return result; } @@ -1869,14 +1863,6 @@ int64_t sys_mmap(guest_t *g, return -LINUX_ENODEV; } - /* Round length up to page size (overflow-safe) */ - if (length > UINT64_MAX - 4095) - return -LINUX_ENOMEM; - length = PAGE_ALIGN_UP(length); - if (length == 0) - return -LINUX_ENOMEM; - - /* Linux kernel rejects MAP_FIXED with non-page-aligned address */ bool is_fixed = (flags & LINUX_MAP_FIXED) || (flags & LINUX_MAP_FIXED_NOREPLACE); if (is_fixed && (addr & 4095)) @@ -1887,6 +1873,19 @@ int64_t sys_mmap(guest_t *g, */ bool is_noreplace = (flags & LINUX_MAP_FIXED_NOREPLACE) != 0; + size_t hps = host_page_size_cached(); + uint64_t align = hps; + if (!is_fixed && !is_anon && fd >= 0 && (flags & LINUX_MAP_SHARED)) { + align = BLOCK_2MIB; + } + + /* Round length up to align size (overflow-safe) */ + if (length > UINT64_MAX - (align - 1)) + return -LINUX_ENOMEM; + length = ALIGN_UP(length, align); + if (length == 0) + return -LINUX_ENOMEM; + uint64_t result_off; /* Result as offset (0-based) */ if (is_fixed) { /* Addresses above TASK_SIZE (bit 63 set or beyond user VA range) are @@ -2191,7 +2190,8 @@ int64_t sys_mmap(guest_t *g, * ones. The RX region at MMAP_RX_BASE is pre-mapped with execute * permission. 
*/ - result_off = find_free_gap(g, length, MMAP_RX_BASE, g->mmap_limit); + result_off = + find_free_gap(g, length, MMAP_RX_BASE, g->mmap_limit, align); if (result_off == UINT64_MAX) { log_debug( "mmap: RX address space exhausted " @@ -2232,12 +2232,13 @@ int64_t sys_mmap(guest_t *g, */ uint64_t hint_max = (hint_off < MMAP_BASE) ? MMAP_BASE : g->mmap_limit; - result_off = - find_free_gap_inner(g, length, hint_off, hint_max); + result_off = find_free_gap_inner(g, length, hint_off, + hint_max, align); } } if (result_off == UINT64_MAX) - result_off = find_free_gap(g, length, MMAP_BASE, g->mmap_limit); + result_off = + find_free_gap(g, length, MMAP_BASE, g->mmap_limit, align); if (result_off == UINT64_MAX) { log_debug( "mmap: RW address space exhausted " @@ -2366,6 +2367,12 @@ int64_t sys_mmap(guest_t *g, * host pages). The "extra" trailing bytes inside the host page are * never reachable by the guest because the gap-finder advances the hint * to the next host-page boundary after each allocation. + * The same reasoning applies at 2 MiB granularity when a + * non-fixed file-backed MAP_SHARED request raises the + * alignment to BLOCK_2MIB: length is rounded up to that + * block size and the gap-finder advances the hint to the + * next 2 MiB boundary after the allocation, rather than to + * the next host-page boundary. */ /* overlay_fd_writable rejects read-only backing fds inside * hvf_apply_file_overlay; mirror the check here so a read-only mmap @@ -2480,7 +2487,6 @@ int64_t sys_mmap(guest_t *g, * keeps coherent with the file's page cache.
*/ if (!is_anon && fd >= 0 && !is_prot_none && (flags & LINUX_MAP_SHARED)) { - size_t hps = host_page_size_cached(); if ((result_off % hps == 0) && ((uint64_t) offset % hps == 0)) { for (int i = 0; i < g->nregions; i++) { if (g->regions[i].start == result_off && @@ -2932,9 +2938,11 @@ int64_t sys_mremap(guest_t *g, uint64_t new_off; if (needs_exec && !(prot & LINUX_PROT_WRITE)) - new_off = find_free_gap(g, new_size, MMAP_RX_BASE, g->mmap_limit); + new_off = find_free_gap(g, new_size, MMAP_RX_BASE, g->mmap_limit, + host_page_size_cached()); else - new_off = find_free_gap(g, new_size, MMAP_BASE, g->mmap_limit); + new_off = find_free_gap(g, new_size, MMAP_BASE, g->mmap_limit, + host_page_size_cached()); if (new_off == UINT64_MAX) { if (track_backing_fd >= 0) diff --git a/src/syscall/proc-identity.c b/src/syscall/proc-identity.c index 5a157cd..ef11d4a 100644 --- a/src/syscall/proc-identity.c +++ b/src/syscall/proc-identity.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #include "syscall/abi.h" #include "core/shim-globals.h" @@ -24,16 +27,33 @@ static _Atomic int64_t guest_sid = 1, guest_pgid = 1; static _Atomic int64_t guest_fg_pgrp = 1; static _Atomic int32_t guest_has_ctty = 1; +static uint32_t proc_identity_env_u32(const char *name, uint32_t fallback) +{ + const char *value = getenv(name); + if (!value || !*value) + return fallback; + + errno = 0; + char *end = NULL; + unsigned long parsed = strtoul(value, &end, 10); + if (errno != 0 || end == value || *end != '\0' || parsed > UINT32_MAX) + return fallback; + return (uint32_t) parsed; +} + void proc_identity_init(void) { + uint32_t initial_uid = proc_identity_env_u32("ELFUSE_GUEST_UID", GUEST_UID); + uint32_t initial_gid = proc_identity_env_u32("ELFUSE_GUEST_GID", GUEST_GID); + guest_pid = 1; parent_pid = 0; - emu_uid = GUEST_UID; - emu_euid = GUEST_UID; - emu_suid = GUEST_UID; - emu_gid = GUEST_GID; - emu_egid = GUEST_GID; - emu_sgid = GUEST_GID; + emu_uid = initial_uid; + emu_euid = 
initial_uid; + emu_suid = initial_uid; + emu_gid = initial_gid; + emu_egid = initial_gid; + emu_sgid = initial_gid; emu_nice = 0; guest_sid = 1; guest_pgid = 1; @@ -93,36 +113,47 @@ static bool gid_is_permitted(uint32_t val) int64_t proc_sys_setuid(uint32_t uid) { - if (!uid_is_permitted(uid)) + bool privileged = (emu_euid == 0); + if (!privileged && !uid_is_permitted(uid)) return -LINUX_EPERM; + if (privileged) { + emu_uid = uid; + emu_suid = uid; + } emu_euid = uid; return 0; } int64_t proc_sys_setgid(uint32_t gid) { - if (!gid_is_permitted(gid)) + bool privileged = (emu_euid == 0); + if (!privileged && !gid_is_permitted(gid)) return -LINUX_EPERM; + if (privileged) { + emu_gid = gid; + emu_sgid = gid; + } emu_egid = gid; return 0; } -#define DEFINE_SETRE(suffix, real, eff, saved, perm_fn) \ - int64_t proc_sys_setre##suffix(uint32_t r, uint32_t e) \ - { \ - uint32_t old_real = real; \ - if (r != (uint32_t) -1 && r != real && r != eff) \ - return -LINUX_EPERM; \ - if (e != (uint32_t) -1 && !perm_fn(e)) \ - return -LINUX_EPERM; \ - if (r != (uint32_t) -1) \ - real = r; \ - if (e != (uint32_t) -1) { \ - eff = e; \ - if (r != (uint32_t) -1 || e != old_real) \ - saved = e; \ - } \ - return 0; \ +#define DEFINE_SETRE(suffix, real, eff, saved, perm_fn) \ + int64_t proc_sys_setre##suffix(uint32_t r, uint32_t e) \ + { \ + uint32_t old_real = real; \ + bool privileged = (emu_euid == 0); \ + if (!privileged && r != (uint32_t) -1 && r != real && r != eff) \ + return -LINUX_EPERM; \ + if (!privileged && e != (uint32_t) -1 && !perm_fn(e)) \ + return -LINUX_EPERM; \ + if (r != (uint32_t) -1) \ + real = r; \ + if (e != (uint32_t) -1) { \ + eff = e; \ + if (r != (uint32_t) -1 || e != old_real) \ + saved = e; \ + } \ + return 0; \ } DEFINE_SETRE(uid, emu_uid, emu_euid, emu_suid, uid_is_permitted) @@ -133,11 +164,12 @@ DEFINE_SETRE(gid, emu_gid, emu_egid, emu_sgid, gid_is_permitted) #define DEFINE_SETRES(suffix, real, eff, saved, perm_fn) \ int64_t proc_sys_setres##suffix(uint32_t 
r, uint32_t e, uint32_t s) \ { \ - if (r != (uint32_t) -1 && !perm_fn(r)) \ + bool privileged = (emu_euid == 0); \ + if (!privileged && r != (uint32_t) -1 && !perm_fn(r)) \ return -LINUX_EPERM; \ - if (e != (uint32_t) -1 && !perm_fn(e)) \ + if (!privileged && e != (uint32_t) -1 && !perm_fn(e)) \ return -LINUX_EPERM; \ - if (s != (uint32_t) -1 && !perm_fn(s)) \ + if (!privileged && s != (uint32_t) -1 && !perm_fn(s)) \ return -LINUX_EPERM; \ if (r != (uint32_t) -1) \ real = r; \ diff --git a/src/syscall/proc-state.c b/src/syscall/proc-state.c index 81dc4e0..260f50b 100644 --- a/src/syscall/proc-state.c +++ b/src/syscall/proc-state.c @@ -553,7 +553,16 @@ static const char *proc_resolve_sysroot_path_flags(const char *path, errno = ENAMETOOLONG; return NULL; } - return path; + char parent[LINUX_PATH_MAX]; + str_copy_trunc(parent, buf, sizeof(parent)); + char *slash = strrchr(parent, '/'); + if (!slash || slash == parent) + return buf; + *slash = '\0'; + + if (sysroot_validate_dir_prefix(parent) < 0) + return NULL; + return buf; } const char *proc_resolve_sysroot_path(const char *path, char *buf, size_t bufsz) @@ -615,14 +624,6 @@ const char *proc_resolve_sysroot_create_path(const char *path, if (errno != ENOENT && errno != ENOTDIR) return NULL; - /* Parent doesn't exist in sysroot. Only /tmp, /var/tmp, and ccache get - * forcefully redirected to the sysroot to avoid host case-collisions; - * everything else falls back to the host literal. 
- */ - if (strncmp(path, "/tmp/", 5) && strncmp(path, "/var/tmp/", 9) && - !strstr(path, "/.ccache/")) - return path; - if (!create_parents) { if (sysroot_validate_dir_prefix(parent) < 0) return NULL; diff --git a/src/syscall/proc.c b/src/syscall/proc.c index beab7ea..88b5db1 100644 --- a/src/syscall/proc.c +++ b/src/syscall/proc.c @@ -313,6 +313,7 @@ void proc_mark_child_exited(pid_t host_pid, int status) pthread_cond_broadcast(&pid_cond); pthread_mutex_unlock(&pid_lock); proc_pidfd_notify_exit(gpid); + signal_queue(LINUX_SIGCHLD); return; } pthread_mutex_unlock(&pid_lock); diff --git a/src/syscall/sys.c b/src/syscall/sys.c index da6b7d4..3530be2 100644 --- a/src/syscall/sys.c +++ b/src/syscall/sys.c @@ -31,9 +31,6 @@ /* System info syscall handlers. */ -static pthread_once_t groups_once = PTHREAD_ONCE_INIT; -static uint32_t cached_linux_groups[64]; -static int cached_ngroups = -1; static const linux_utsname_t cached_uname = { .sysname = "Linux", .nodename = "elfuse", @@ -81,18 +78,6 @@ _Static_assert(offsetof(struct rusage, ru_maxrss) == */ static bool sched_pid_alive(int pid); -static void groups_init_cached_linux_groups(void) -{ - gid_t groups[64]; - int ngroups = getgroups(64, groups); - if (ngroups < 0) - return; - - for (int i = 0; i < ngroups; i++) - cached_linux_groups[i] = (uint32_t) groups[i]; - cached_ngroups = ngroups; -} - static void sysinfo_init_cached_host_state(void) { struct timeval boottime; @@ -158,19 +143,6 @@ static void sysinfo_refresh_cached_locked(time_t now_sec) cached_sysinfo_sec = now_sec; } -static int get_cached_linux_groups(void) -{ - if (thread_is_single_active()) { - if (cached_ngroups >= 0) - return cached_ngroups; - groups_init_cached_linux_groups(); - return cached_ngroups; - } - - pthread_once(&groups_once, groups_init_cached_linux_groups); - return cached_ngroups; -} - int64_t sys_uname(guest_t *g, uint64_t buf_gva) { if (guest_write_small(g, buf_gva, &cached_uname, sizeof(cached_uname)) < 0) @@ -487,17 +459,15 @@ int64_t 
sys_sched_rr_get_interval(guest_t *g, int pid, uint64_t ts_gva) int64_t sys_getgroups(guest_t *g, int size, uint64_t list_gva) { - int ngroups = get_cached_linux_groups(); - if (ngroups < 0) - return linux_errno(); + const int ngroups = 1; if (size == 0) return ngroups; if (size < ngroups) return -LINUX_EINVAL; - size_t bytes = (size_t) ngroups * sizeof(cached_linux_groups[0]); - if (guest_write_small(g, list_gva, cached_linux_groups, bytes) < 0) + uint32_t group = proc_get_gid(); + if (guest_write_small(g, list_gva, &group, sizeof(group)) < 0) return -LINUX_EFAULT; return ngroups; diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index 6f97fc0..cb0237d 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -556,6 +556,7 @@ SC_FORWARD(sc_setfsuid, (int64_t) proc_get_euid()) SC_FORWARD(sc_setfsgid, (int64_t) proc_get_egid()) SC_FORWARD(sc_setpgid, proc_sys_setpgid(g, (int64_t) x0, (int64_t) x1)) SC_STUB(sc_fadvise64, 0) +SC_STUB(sc_sync_file_range, 0) SC_STUB(sc_sched_yield, (sched_yield(), 0)) SC_STUB(sc_mlock, 0) SC_STUB(sc_munlock, 0) @@ -1952,8 +1953,6 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) result = ret; goto fast_done; } - if (nr == SYS_write && errno == EPIPE) - signal_queue(LINUX_SIGPIPE); result = linux_errno(); goto fast_done; } @@ -2057,12 +2056,23 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) if (verbose) { log_debug(" -> %lld (0x%llx)", (long long) result, (unsigned long long) (uint64_t) result); - /* Log file paths for openat/readlinkat */ + /* Log path-bearing syscalls. 
*/ if ((int) x8 == SYS_openat || (int) x8 == SYS_readlinkat || - (int) x8 == SYS_faccessat) { + (int) x8 == SYS_faccessat || (int) x8 == SYS_newfstatat || + (int) x8 == SYS_unlinkat || (int) x8 == SYS_mkdirat || + (int) x8 == SYS_fchmodat || (int) x8 == SYS_fchownat) { char pathbuf[256]; if (guest_read_str(g, x1, pathbuf, sizeof(pathbuf)) >= 0) log_debug(" path=\"%s\"", pathbuf); + } else if ((int) x8 == SYS_renameat || (int) x8 == SYS_renameat2 || + (int) x8 == SYS_linkat || (int) x8 == SYS_symlinkat) { + char path_a[256]; + char path_b[256]; + uint64_t a_gva = ((int) x8 == SYS_symlinkat) ? x0 : x1; + uint64_t b_gva = ((int) x8 == SYS_symlinkat) ? x2 : x3; + if (guest_read_str(g, a_gva, path_a, sizeof(path_a)) >= 0 && + guest_read_str(g, b_gva, path_b, sizeof(path_b)) >= 0) + log_debug(" path=\"%s\" -> \"%s\"", path_a, path_b); } } /* Write result back to X0 */