diff --git a/src/core/elf.c b/src/core/elf.c index 575e1c7..4b42c9c 100644 --- a/src/core/elf.c +++ b/src/core/elf.c @@ -8,6 +8,7 @@ * segments, and copies them into guest memory. */ +#include #include #include #include @@ -19,19 +20,21 @@ #include "debug/log.h" #include "utils.h" -int elf_load(const char *path, elf_info_t *info) +static int elf_load_impl(const char *path, elf_info_t *info, bool quiet) { memset(info, 0, sizeof(*info)); FILE *f = fopen(path, "rb"); if (!f) { - perror(path); + if (!quiet) + perror(path); return -1; } elf64_ehdr_t ehdr; if (fread(&ehdr, sizeof(ehdr), 1, f) != 1) { - log_error("%s: failed to read ELF header", path); + if (!quiet) + log_error("%s: failed to read ELF header", path); fclose(f); return -1; } @@ -39,21 +42,24 @@ int elf_load(const char *path, elf_info_t *info) /* Reject non-ELF inputs before interpreting the rest of the header. */ if (ehdr.e_ident[0] != ELFMAG0 || ehdr.e_ident[1] != ELFMAG1 || ehdr.e_ident[2] != ELFMAG2 || ehdr.e_ident[3] != ELFMAG3) { - log_error("%s: not an ELF file", path); + if (!quiet) + log_error("%s: not an ELF file", path); fclose(f); return -1; } /* elfuse only implements the 64-bit Linux ABI. */ if (ehdr.e_ident[EI_CLASS] != ELFCLASS64) { - log_error("%s: not a 64-bit ELF", path); + if (!quiet) + log_error("%s: not a 64-bit ELF", path); fclose(f); return -1; } /* aarch64-linux user binaries are little-endian in the supported mode. */ if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { - log_error("%s: not little-endian", path); + if (!quiet) + log_error("%s: not little-endian", path); fclose(f); return -1; } @@ -62,8 +68,9 @@ int elf_load(const char *path, elf_info_t *info) * diagnostic instead of a generic parse failure. 
*/ if (ehdr.e_machine != EM_AARCH64 && ehdr.e_machine != EM_X86_64) { - log_error("%s: unsupported architecture (e_machine=%u)", path, - ehdr.e_machine); + if (!quiet) + log_error("%s: unsupported architecture (e_machine=%u)", path, + ehdr.e_machine); fclose(f); return -1; } @@ -72,7 +79,8 @@ int elf_load(const char *path, elf_info_t *info) * the load base that keeps them away from elfuse's reserved regions. */ if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) { - log_error("%s: not an executable (e_type=%u)", path, ehdr.e_type); + if (!quiet) + log_error("%s: not an executable (e_type=%u)", path, ehdr.e_type); fclose(f); return -1; } @@ -204,6 +212,16 @@ int elf_load(const char *path, elf_info_t *info) return 0; } +int elf_load(const char *path, elf_info_t *info) +{ + return elf_load_impl(path, info, false); +} + +int elf_load_quiet(const char *path, elf_info_t *info) +{ + return elf_load_impl(path, info, true); +} + int elf_map_segments(const elf_info_t *info, const char *path, void *guest_base, diff --git a/src/core/elf.h b/src/core/elf.h index a8ce7ce..1f8428d 100644 --- a/src/core/elf.h +++ b/src/core/elf.h @@ -105,6 +105,7 @@ typedef struct { * Returns 0 on success, -1 on failure. Does NOT copy to guest yet. */ int elf_load(const char *path, elf_info_t *info); +int elf_load_quiet(const char *path, elf_info_t *info); /* Copy ELF segments into guest memory. Call after elf_load() and guest_init(). * Also copies program headers into guest memory for AT_PHDR. load_base is added diff --git a/src/core/guest.c b/src/core/guest.c index 01422b8..1648f6f 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -1661,6 +1661,19 @@ int guest_get_used_regions(const guest_t *g, n++; } + /* Interpreter high block. The dynamic linker stores process-global state + * such as __stack_chk_guard in its high mapping just above interp_base. 
+ * Fork children that take the region-copy path must inherit those bytes; + * otherwise libc's post-fork canary check observes zeroed guard storage + * and aborts before the child can exec. + */ + if (n < max && g->interp_base > 0 && + g->interp_base <= g->guest_size - BLOCK_2MIB) { + out[n].offset = g->interp_base; + out[n].size = BLOCK_2MIB; + n++; + } + /* ELF + brk region: from elf_load_min (set by ELF loader) to brk_current. * The lower bound is the actual ELF load address, not ELF_DEFAULT_BASE: * ET_EXECs linked below 0x400000 (e.g. at 0x200000) have segments below the diff --git a/src/core/stack.c b/src/core/stack.c index e0a8d19..319d49b 100644 --- a/src/core/stack.c +++ b/src/core/stack.c @@ -16,7 +16,7 @@ #include #include "core/stack.h" -#include "syscall/abi.h" /* GUEST_UID, GUEST_GID */ +#include "syscall/proc.h" /* Linux aarch64 HWCAP bits (from asm/hwcap.h). Only the bits the VZ-sanitized * ID registers actually advertise are listed here; HWCAP bits left out (e.g., @@ -284,12 +284,12 @@ uint64_t build_linux_stack(guest_t *g, AUX(AT_PHENT, elf_info->phentsize); AUX(AT_PHNUM, elf_info->phnum); AUX(AT_ENTRY, elf_info->entry + elf_load_base); - AUX(AT_UID, GUEST_UID); - AUX(AT_EUID, GUEST_UID); - AUX(AT_GID, GUEST_GID); - AUX(AT_EGID, GUEST_GID); - /* Bionic's __libc_init_AT_SECURE aborts when AT_SECURE is absent. elfuse - * never elevates privileges, so AT_SECURE is always 0. + AUX(AT_UID, proc_get_uid()); + AUX(AT_EUID, proc_get_euid()); + AUX(AT_GID, proc_get_gid()); + AUX(AT_EGID, proc_get_egid()); + /* Bionic's __libc_init_AT_SECURE aborts when AT_SECURE is absent. + * elfuse never elevates privileges, so AT_SECURE is always 0. 
*/ AUX(AT_SECURE, 0); AUX(AT_HWCAP2, query_hwcap2()); diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index 8f00258..bca4881 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -71,7 +71,7 @@ int fork_ipc_read_all(int fd, void *buf, size_t len) * message comfortably below that limit and stream large fd sets in multiple * chunks. */ -#define FORK_IPC_FD_CHUNK 120 +#define FORK_IPC_FD_CHUNK 32 int fork_ipc_send_fds(int sock, const int *fds, int count) { diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h index 18f3443..a71c336 100644 --- a/src/runtime/fork-state.h +++ b/src/runtime/fork-state.h @@ -18,14 +18,30 @@ /* Magic values for IPC frame delimiters */ #define IPC_MAGIC_HEADER 0x454C464BU /* "ELFK" */ #define IPC_MAGIC_SENTINEL 0x454C4F4BU /* "ELOK" */ -/* Bumped to 11 when regions_tracker_stale was added to process state so forked - * children preserve mprotect fast-path correctness. +/* Bumped to 13 when pointer-authentication key registers and the remaining + * EL0 TLS registers were added so forked children and clone-created vCPUs + * resume with the same userspace CPU context as the parent. New Ubuntu arm64 + * userspace can use PAC in libc and TLS-adjacent state during fork return. + * + * Bumped to 12 when clone_flags/child_tid_gva were added so fork-process + * children can apply CLONE_CHILD_SETTID/CLEARTID inside their own snapshot. + * + * Bumped to 11 when regions_tracker_stale was added to process state so + * forked children preserve mprotect fast-path correctness. * * Bumped to 10 when the rosetta placement / kbuf / ttbr1 tuple was added so a * rosetta-aware child rejects an older parent's header instead of trying to * interpret unknown trailing fields. 
*/ -#define IPC_VERSION 11 +#define IPC_VERSION 13 + +typedef struct { + uint64_t apiakeylo_el1, apiakeyhi_el1; + uint64_t apibkeylo_el1, apibkeyhi_el1; + uint64_t apdakeylo_el1, apdakeyhi_el1; + uint64_t apdbkeylo_el1, apdbkeyhi_el1; + uint64_t apgakeylo_el1, apgakeyhi_el1; +} ipc_pauth_keys_t; typedef struct { uint32_t magic; @@ -60,6 +76,8 @@ typedef struct { uint64_t rosetta_entry; uint64_t kbuf_gpa; uint64_t ttbr1; + uint64_t clone_flags; + uint64_t child_tid_gva; } ipc_header_t; typedef struct { @@ -74,8 +92,10 @@ typedef struct { * access faults. */ uint64_t ttbr1_el1; - uint64_t sctlr_el1, tcr_el1, mair_el1, cpacr_el1, tpidr_el0, sp_el1; + uint64_t sctlr_el1, tcr_el1, mair_el1, cpacr_el1; + uint64_t tpidr_el0, tpidrro_el0, tpidr2_el0, sp_el1; uint64_t x[31]; + ipc_pauth_keys_t pauth_keys; vcpu_simd_state_t simd_state; } ipc_registers_t; diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 19de841..2b311d0 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* fdopendir, for DIR* reconstruction in child */ #include #include /* fclonefileat for CoW shm snapshots */ @@ -48,10 +49,143 @@ #include "debug/log.h" #include "debug/syscall-hist.h" +typedef struct fork_child_monitor_arg { + pid_t host_pid; +} fork_child_monitor_arg_t; + +static void *fork_child_monitor_main(void *arg) +{ + fork_child_monitor_arg_t *m = (fork_child_monitor_arg_t *) arg; + pid_t host_pid = m->host_pid; + free(m); + + int kq = kqueue(); + if (kq < 0) { + log_warn("clone: child monitor kqueue failed for pid=%d: %s", + (int) host_pid, strerror(errno)); + return NULL; + } + + struct kevent kev; + EV_SET(&kev, (uintptr_t) host_pid, EVFILT_PROC, EV_ADD | EV_ONESHOT, + NOTE_EXIT, 0, NULL); + if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0) { + if (errno != ESRCH) + log_warn("clone: child monitor kevent add pid=%d failed: %s", + (int) host_pid, strerror(errno)); + close(kq); + return NULL; + } + + do { + errno = 
0; + } while (kevent(kq, NULL, 0, &kev, 1, NULL) < 0 && errno == EINTR); + close(kq); + signal_queue(LINUX_SIGCHLD); + return NULL; +} + +static void fork_child_monitor_start(pid_t host_pid) +{ + fork_child_monitor_arg_t *arg = calloc(1, sizeof(*arg)); + if (!arg) { + log_warn("clone: failed to allocate child monitor for pid=%d", + (int) host_pid); + return; + } + arg->host_pid = host_pid; + + pthread_t thread; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + int err = pthread_create(&thread, &attr, fork_child_monitor_main, arg); + pthread_attr_destroy(&attr); + if (err != 0) { + log_warn("clone: failed to start child monitor for pid=%d: %s", + (int) host_pid, strerror(err)); + free(arg); + } +} + +/* Pointer-authentication sysregs were added to recent macOS SDKs. Define the + * architectural encodings as a fallback so older SDK headers can still build + * the runtime. + */ +#ifndef HV_SYS_REG_APIAKEYLO_EL1 +#define HV_SYS_REG_APIAKEYLO_EL1 ((hv_sys_reg_t) 0xc108) +#define HV_SYS_REG_APIAKEYHI_EL1 ((hv_sys_reg_t) 0xc109) +#define HV_SYS_REG_APIBKEYLO_EL1 ((hv_sys_reg_t) 0xc10a) +#define HV_SYS_REG_APIBKEYHI_EL1 ((hv_sys_reg_t) 0xc10b) +#define HV_SYS_REG_APDAKEYLO_EL1 ((hv_sys_reg_t) 0xc110) +#define HV_SYS_REG_APDAKEYHI_EL1 ((hv_sys_reg_t) 0xc111) +#define HV_SYS_REG_APDBKEYLO_EL1 ((hv_sys_reg_t) 0xc112) +#define HV_SYS_REG_APDBKEYHI_EL1 ((hv_sys_reg_t) 0xc113) +#define HV_SYS_REG_APGAKEYLO_EL1 ((hv_sys_reg_t) 0xc118) +#define HV_SYS_REG_APGAKEYHI_EL1 ((hv_sys_reg_t) 0xc119) +#endif +#ifndef HV_SYS_REG_TPIDRRO_EL0 +#define HV_SYS_REG_TPIDRRO_EL0 ((hv_sys_reg_t) 0xde83) +#endif +#ifndef HV_SYS_REG_TPIDR2_EL0 +#define HV_SYS_REG_TPIDR2_EL0 ((hv_sys_reg_t) 0xde85) +#endif + +static void capture_pauth_keys(hv_vcpu_t vcpu, ipc_pauth_keys_t *keys) +{ + keys->apiakeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APIAKEYLO_EL1); + keys->apiakeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APIAKEYHI_EL1); + 
keys->apibkeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APIBKEYLO_EL1); + keys->apibkeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APIBKEYHI_EL1); + keys->apdakeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APDAKEYLO_EL1); + keys->apdakeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APDAKEYHI_EL1); + keys->apdbkeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APDBKEYLO_EL1); + keys->apdbkeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APDBKEYHI_EL1); + keys->apgakeylo_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APGAKEYLO_EL1); + keys->apgakeyhi_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_APGAKEYHI_EL1); +} + +static hv_return_t restore_pauth_keys(hv_vcpu_t vcpu, + const ipc_pauth_keys_t *keys) +{ + hv_return_t r; + +#define SET_PAUTH_KEY(reg, val) \ + do { \ + r = hv_vcpu_set_sys_reg(vcpu, reg, val); \ + if (r != HV_SUCCESS) \ + return r; \ + } while (0) + + SET_PAUTH_KEY(HV_SYS_REG_APIAKEYLO_EL1, keys->apiakeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APIAKEYHI_EL1, keys->apiakeyhi_el1); + SET_PAUTH_KEY(HV_SYS_REG_APIBKEYLO_EL1, keys->apibkeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APIBKEYHI_EL1, keys->apibkeyhi_el1); + SET_PAUTH_KEY(HV_SYS_REG_APDAKEYLO_EL1, keys->apdakeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APDAKEYHI_EL1, keys->apdakeyhi_el1); + SET_PAUTH_KEY(HV_SYS_REG_APDBKEYLO_EL1, keys->apdbkeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APDBKEYHI_EL1, keys->apdbkeyhi_el1); + SET_PAUTH_KEY(HV_SYS_REG_APGAKEYLO_EL1, keys->apgakeylo_el1); + SET_PAUTH_KEY(HV_SYS_REG_APGAKEYHI_EL1, keys->apgakeyhi_el1); +#undef SET_PAUTH_KEY + + return HV_SUCCESS; +} + /* fork_child_main. 
*/ static int fork_child_vfork_notify_fd = -1; +/* Linux clone flags */ +#define LINUX_CLONE_VM 0x00000100 +#define LINUX_CLONE_VFORK 0x00004000 +#define LINUX_CLONE_THREAD 0x00010000 +#define LINUX_CLONE_SETTLS 0x00080000 +#define LINUX_CLONE_PARENT_SETTID 0x00100000 +#define LINUX_CLONE_CHILD_CLEARTID 0x00200000 +#define LINUX_CLONE_CHILD_SETTID 0x01000000 +/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */ + void fork_notify_vfork_exec(void) { if (fork_child_vfork_notify_fd < 0) @@ -275,6 +409,16 @@ int fork_child_main(int ipc_fd, return 1; } + if ((hdr.clone_flags & LINUX_CLONE_CHILD_SETTID) && hdr.child_tid_gva) { + int32_t tid32 = (int32_t) hdr.child_pid; + if (guest_write_small(&g, hdr.child_tid_gva, &tid32, sizeof(tid32)) < + 0) { + log_error("fork-child: failed to write CLONE_CHILD_SETTID"); + guest_destroy(&g); + return 1; + } + } + /* POSIX: "Signals pending to the parent shall not be pending to the child." * Clear pending bitmask and RT queue before applying state. 
* signal_set_state() is deferred until after thread_register_main() so that @@ -313,9 +457,14 @@ int fork_child_main(int ipc_fd, HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, regs.ttbr0_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR1_EL1, regs.ttbr1_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CPACR_EL1, regs.cpacr_el1)); + uint64_t child_sp_el1 = g.ipa_base + g.shim_data_base + BLOCK_2MIB; HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, regs.sp_el0)); - HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, regs.sp_el1)); + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, child_sp_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, regs.tpidr_el0)); + HV_CHECK( + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDRRO_EL0, regs.tpidrro_el0)); + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR2_EL0, regs.tpidr2_el0)); + HV_CHECK(restore_pauth_keys(vcpu, &regs.pauth_keys)); /* TPIDR_EL1 is set by the host (never inherited from the parent's register * snapshot) because it must point at the child's own shim_globals base in @@ -355,13 +504,15 @@ int fork_child_main(int ipc_fd, HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, regs.elr_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SPSR_EL1, regs.spsr_el1)); HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_PC, regs.elr_el1)); - HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_CPSR, 0)); /* EL0t */ + HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_CPSR, regs.spsr_el1)); /* Register the fork child's main thread in the thread table. Without this, * current_thread is NULL and any syscall handler that accesses per-thread * state (signal masks, ptrace, CLONE_THREAD) will dereference NULL. 
*/ - thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1); + thread_register_main(vcpu, vexit, hdr.child_pid, child_sp_el1); + if ((hdr.clone_flags & LINUX_CLONE_CHILD_CLEARTID) && hdr.child_tid_gva) + current_thread->clear_child_tid = hdr.child_tid_gva; /* Re-publish identity into the child's shim-globals cache: the CoW / region * copy inherits the parent's pid/uid values, and the shim's identity fast @@ -420,16 +571,6 @@ int fork_child_main(int ipc_fd, /* sys_clone. */ -/* Linux clone flags */ -#define LINUX_CLONE_VM 0x00000100 -#define LINUX_CLONE_VFORK 0x00004000 -#define LINUX_CLONE_THREAD 0x00010000 -#define LINUX_CLONE_SETTLS 0x00080000 -#define LINUX_CLONE_PARENT_SETTID 0x00100000 -#define LINUX_CLONE_CHILD_CLEARTID 0x00200000 -#define LINUX_CLONE_CHILD_SETTID 0x01000000 -/* LINUX_SIGCHLD defined in syscall_signal.h (included above) */ - /* Namespace flags. elfuse implements no namespace isolation. Both sys_clone and * sys_clone3 reject them. */ @@ -467,7 +608,8 @@ typedef struct { uint64_t child_stack, flags, tls; /* Parent system regs to copy into the new vCPU */ uint64_t elr, spsr, vbar, ttbr0, sctlr, tcr, mair, cpacr; - uint64_t tpidr; + uint64_t tpidr, tpidrro, tpidr2; + ipc_pauth_keys_t pauth_keys; uint64_t gprs[31]; uint64_t sp_el1; vcpu_simd_state_t simd_state; @@ -559,6 +701,12 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu, parent_mair = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_MAIR_EL1); parent_cpacr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_CPACR_EL1); parent_tpidr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDR_EL0); + uint64_t parent_tpidrro = + vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDRRO_EL0); + uint64_t parent_tpidr2 = + vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDR2_EL0); + ipc_pauth_keys_t parent_pauth_keys; + capture_pauth_keys(parent_vcpu, &parent_pauth_keys); uint64_t parent_gprs[31]; vcpu_snapshot_gprs(parent_vcpu, parent_gprs); @@ -587,6 +735,9 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu, tca->mair = 
parent_mair; tca->cpacr = parent_cpacr; tca->tpidr = parent_tpidr; + tca->tpidrro = parent_tpidrro; + tca->tpidr2 = parent_tpidr2; + tca->pauth_keys = parent_pauth_keys; memcpy(tca->gprs, parent_gprs, sizeof(parent_gprs)); tca->sp_el1 = child_sp_el1; vcpu_snapshot_simd(parent_vcpu, &tca->simd_state); @@ -762,6 +913,9 @@ static void *thread_create_and_run(void *arg) } else { WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, tca->tpidr)); } + WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDRRO_EL0, tca->tpidrro)); + WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR2_EL0, tca->tpidr2)); + WORKER_HV(restore_pauth_keys(vcpu, &tca->pauth_keys)); /* ELR_EL1 = clone return point (same as parent) */ WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, tca->elr)); @@ -943,6 +1097,12 @@ static int64_t sys_clone_vm(hv_vcpu_t parent_vcpu, uint64_t parent_mair = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_MAIR_EL1); uint64_t parent_cpacr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_CPACR_EL1); uint64_t parent_tpidr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDR_EL0); + uint64_t parent_tpidrro = + vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDRRO_EL0); + uint64_t parent_tpidr2 = + vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_TPIDR2_EL0); + ipc_pauth_keys_t parent_pauth_keys; + capture_pauth_keys(parent_vcpu, &parent_pauth_keys); uint64_t parent_gprs[31]; vcpu_snapshot_gprs(parent_vcpu, parent_gprs); @@ -970,6 +1130,9 @@ static int64_t sys_clone_vm(hv_vcpu_t parent_vcpu, tca->mair = parent_mair; tca->cpacr = parent_cpacr; tca->tpidr = parent_tpidr; + tca->tpidrro = parent_tpidrro; + tca->tpidr2 = parent_tpidr2; + tca->pauth_keys = parent_pauth_keys; memcpy(tca->gprs, parent_gprs, sizeof(parent_gprs)); tca->sp_el1 = child_sp_el1; vcpu_snapshot_simd(parent_vcpu, &tca->simd_state); @@ -1070,6 +1233,9 @@ static void *vm_clone_thread_run(void *arg) } else { HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, tca->tpidr)); } + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, 
HV_SYS_REG_TPIDRRO_EL0, tca->tpidrro)); + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR2_EL0, tca->tpidr2)); + HV_CHECK(restore_pauth_keys(vcpu, &tca->pauth_keys)); /* ELR_EL1 = clone return point */ HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_ELR_EL1, tca->elr)); @@ -1528,6 +1694,8 @@ int64_t sys_clone(hv_vcpu_t vcpu, .rosetta_entry = g->rosetta_entry, .kbuf_gpa = g->kbuf_gpa, .ttbr1 = g->ttbr1, + .clone_flags = flags, + .child_tid_gva = ctid_gva, }; if (fork_ipc_write_all(ipc_sock, &hdr, sizeof(hdr)) < 0) { log_error("clone: failed to send header"); @@ -1561,7 +1729,10 @@ int64_t sys_clone(hv_vcpu_t vcpu, regs.mair_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_MAIR_EL1); regs.cpacr_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_CPACR_EL1); regs.tpidr_el0 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TPIDR_EL0); + regs.tpidrro_el0 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TPIDRRO_EL0); + regs.tpidr2_el0 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TPIDR2_EL0); regs.sp_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_SP_EL1); + capture_pauth_keys(vcpu, &regs.pauth_keys); vcpu_snapshot_gprs(vcpu, regs.x); vcpu_snapshot_simd(vcpu, &regs.simd_state); @@ -1660,6 +1831,8 @@ int64_t sys_clone(hv_vcpu_t vcpu, if (waited == child_host_pid) proc_mark_child_exited(child_host_pid, status); } + } else { + fork_child_monitor_start(child_host_pid); } log_debug("clone: child pid=%lld (host=%d)", (long long) child_guest_pid, diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index 7d28599..816df4c 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -1676,7 +1676,13 @@ static int pty_keepalive_register_locked(int master_host_fd, if (pty_keepalive_table[slot].slave_host_fd >= 0 && pty_keepalive_table[slot].slave_host_fd != slave_host_fd) close(pty_keepalive_table[slot].slave_host_fd); - pty_keepalive_table[slot].slave_host_fd = slave_host_fd; + if (stale_open_once) { + pty_keepalive_table[slot].slave_host_fd = slave_host_fd; + } else { + if (slave_host_fd >= 0) + close(slave_host_fd); + 
pty_keepalive_table[slot].slave_host_fd = PTY_KEEPALIVE_FREE; + } pty_keepalive_table[slot].linux_pts_num = linux_pts_num; pty_keepalive_table[slot].stale_open_once = stale_open_once; if (slave_path) @@ -2051,21 +2057,27 @@ void proc_pty_dup_keepalive_locked(int src_master_host_fd, int slot = pty_keepalive_find_master_locked(src_master_host_fd); if (slot < 0) return; - int dst_slave = dup(pty_keepalive_table[slot].slave_host_fd); - if (dst_slave < 0) - return; + int dst_slave = -1; + if (pty_keepalive_table[slot].slave_host_fd >= 0) { + dst_slave = dup(pty_keepalive_table[slot].slave_host_fd); + if (dst_slave < 0) + return; + } uint32_t src_pts_num = pty_keepalive_table[slot].linux_pts_num; char src_slave_path[PTY_SLAVE_PATH_MAX]; memcpy(src_slave_path, pty_keepalive_table[slot].slave_path, PTY_SLAVE_PATH_MAX); - /* dup(2) clears FD_CLOEXEC; the keepalive must not survive exec into a - * guest child that has no map back to it. - */ - int fdflags = fcntl(dst_slave, F_GETFD); - if (fdflags < 0 || fcntl(dst_slave, F_SETFD, fdflags | FD_CLOEXEC) < 0) { - close(dst_slave); - return; + if (dst_slave >= 0) { + /* dup(2) clears FD_CLOEXEC; the keepalive must not survive exec into + * a guest child that has no map back to it. + */ + int fdflags = fcntl(dst_slave, F_GETFD); + if (fdflags < 0 || + fcntl(dst_slave, F_SETFD, fdflags | FD_CLOEXEC) < 0) { + close(dst_slave); + return; + } } int rc = pty_keepalive_register_locked(dst_master_host_fd, dst_slave, @@ -2076,7 +2088,8 @@ void proc_pty_dup_keepalive_locked(int src_master_host_fd, * fd that should not already be in the table unless a prior close * skipped proc_pty_close_keepalive. 
*/ - close(dst_slave); + if (dst_slave >= 0) + close(dst_slave); } } @@ -2156,11 +2169,18 @@ int proc_pty_snapshot_keepalive(proc_pty_ipc_entry_t *out_entries, if (pty_keepalive_table[i].master_host_fd == PTY_KEEPALIVE_FREE) continue; - /* dup under the lock so the slave fd cannot be closed and the host fd - * number recycled before SCM_RIGHTS reads it. The caller closes the dup - * after the send completes. + /* Live entries keep only the slave path so the master can observe HUP + * when the real child-side slave closes. Open a temporary slave fd only + * for SCM_RIGHTS handoff to the fork child; stale one-shot entries may + * already carry a retained slave fd and can still be duped. */ - int duped = dup(pty_keepalive_table[i].slave_host_fd); + int duped = -1; + if (pty_keepalive_table[i].slave_host_fd >= 0) { + duped = dup(pty_keepalive_table[i].slave_host_fd); + } else if (pty_keepalive_table[i].slave_path[0] != '\0') { + duped = open(pty_keepalive_table[i].slave_path, + O_RDWR | O_NOCTTY | O_CLOEXEC); + } if (duped < 0) continue; @@ -2554,10 +2574,11 @@ int proc_intercept_open(const guest_t *g, "VmRSS:\t%llu kB\n" "Threads:\t%d\n", name, (long long) proc_get_pid(), (long long) proc_get_pid(), - (long long) proc_get_ppid(), GUEST_UID, GUEST_UID, GUEST_UID, - GUEST_UID, GUEST_GID, GUEST_GID, GUEST_GID, GUEST_GID, - (unsigned long long) vm_size_kb, (unsigned long long) vm_size_kb, - (unsigned long long) vm_rss_kb, threads); + (long long) proc_get_ppid(), proc_get_uid(), proc_get_euid(), + proc_get_suid(), proc_get_euid(), proc_get_gid(), proc_get_egid(), + proc_get_sgid(), proc_get_egid(), (unsigned long long) vm_size_kb, + (unsigned long long) vm_size_kb, (unsigned long long) vm_rss_kb, + threads); } /* /proc/self/limits -> resource limits from prlimit64 cache */ @@ -2660,8 +2681,9 @@ int proc_intercept_open(const guest_t *g, "Gid:\t%d\t%d\t%d\t%d\n" "Threads:\t%d\n", proc_comm_name(), (long long) proc_get_pid(), tid, - (long long) proc_get_ppid(), GUEST_UID, 
GUEST_UID, GUEST_UID, - GUEST_UID, GUEST_GID, GUEST_GID, GUEST_GID, GUEST_GID, + (long long) proc_get_ppid(), proc_get_uid(), proc_get_euid(), + proc_get_suid(), proc_get_euid(), proc_get_gid(), + proc_get_egid(), proc_get_sgid(), proc_get_egid(), thread_active_count()); } diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 8154281..f3459c7 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -75,6 +75,7 @@ #define SYS_sync 81 #define SYS_fsync 82 #define SYS_fdatasync 83 +#define SYS_sync_file_range 84 #define SYS_utimensat 88 #define SYS_exit 93 #define SYS_exit_group 94 diff --git a/src/syscall/dispatch.tbl b/src/syscall/dispatch.tbl index 3bb0156..fe23a30 100644 --- a/src/syscall/dispatch.tbl +++ b/src/syscall/dispatch.tbl @@ -100,6 +100,7 @@ SYS_fremovexattr sc_fremovexattr 1 SYS_sync sc_sync 1 SYS_fsync sc_fsync 1 SYS_fdatasync sc_fdatasync 1 +SYS_sync_file_range sc_sync_file_range 0 SYS_msync sc_msync 0 SYS_membarrier sc_membarrier 0 diff --git a/src/syscall/exec.c b/src/syscall/exec.c index ef57f7f..b95481a 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -328,9 +328,9 @@ int64_t sys_execve(hv_vcpu_t vcpu, * binfmt_script. */ elf_info_t elf_info; - if (elf_load(path_host, &elf_info) < 0) { - /* Not a valid ELF. Check if it's a script with a shebang line. Read the - * first 256 bytes and look for "#!" at the start. + if (elf_load_quiet(path_host, &elf_info) < 0) { + /* Not a valid ELF. Check if it's a script with a shebang line. + * Read the first 256 bytes and look for "#!" at the start. */ int script_fd = open(path_host, O_RDONLY); if (script_fd < 0) { diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 51264be..6eede8d 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -288,6 +288,136 @@ static int64_t reject_unsupported_fuse_path_op(const path_translation_t *tx) return tx && tx->fuse_path ? 
-LINUX_ENOSYS : INT64_MIN; } +static int path_parent_copy(const char *path, char *out, size_t outsz) +{ + size_t len = str_copy_trunc(out, path, outsz); + if (len >= outsz) { + errno = ENAMETOOLONG; + return -1; + } + + char *slash = strrchr(out, '/'); + if (!slash) { + str_copy_trunc(out, ".", outsz); + } else if (slash == out) { + out[1] = '\0'; + } else { + *slash = '\0'; + } + return 0; +} + +static int append_path_part(char *out, + size_t outsz, + size_t *used, + const char *part, + size_t part_len) +{ + if (*used + part_len >= outsz) { + errno = ENAMETOOLONG; + return -1; + } + memcpy(out + *used, part, part_len); + *used += part_len; + out[*used] = '\0'; + return 0; +} + +static int relative_path_between(const char *from_dir, + const char *to_path, + char *out, + size_t outsz) +{ + size_t common = 0; + for (size_t i = 0; from_dir[i] && to_path[i] && from_dir[i] == to_path[i]; + i++) { + if (from_dir[i] == '/') + common = i; + } + + if (common == 0) { + errno = EXDEV; + return -1; + } + + const char *from_tail = from_dir + common; + while (*from_tail == '/') + from_tail++; + const char *to_tail = to_path + common; + while (*to_tail == '/') + to_tail++; + + size_t used = 0; + out[0] = '\0'; + for (const char *p = from_tail; *p;) { + while (*p == '/') + p++; + if (!*p) + break; + const char *next = strchr(p, '/'); + if (append_path_part(out, outsz, &used, "../", 3) < 0) + return -1; + p = next ? 
next + 1 : p + strlen(p); + } + + if (*to_tail) { + if (append_path_part(out, outsz, &used, to_tail, strlen(to_tail)) < 0) + return -1; + } else if (used == 0) { + if (append_path_part(out, outsz, &used, ".", 1) < 0) + return -1; + } else if (used >= 1 && out[used - 1] == '/') { + out[used - 1] = '\0'; + } + return 0; +} + +static const char *host_relative_symlink_target(const char *guest_target, + host_fd_ref_t *dir_ref, + const path_translation_t *tx, + char *buf, + size_t bufsz) +{ + char sysroot[LINUX_PATH_MAX]; + if (!guest_target || guest_target[0] != '/' || + !proc_sysroot_snapshot(sysroot, sizeof(sysroot))) + return guest_target; + + char target_host[LINUX_PATH_MAX]; + int n = snprintf(target_host, sizeof(target_host), "%s%s", sysroot, + guest_target); + if (n < 0 || (size_t) n >= sizeof(target_host)) { + errno = ENAMETOOLONG; + return NULL; + } + + char link_host[LINUX_PATH_MAX]; + if (tx->host_path[0] == '/') { + if (str_copy_trunc(link_host, tx->host_path, sizeof(link_host)) >= + sizeof(link_host)) { + errno = ENAMETOOLONG; + return NULL; + } + } else { + char dir_host[LINUX_PATH_MAX]; + if (fcntl(dir_ref->fd, F_GETPATH, dir_host) < 0) + return NULL; + n = snprintf(link_host, sizeof(link_host), "%s/%s", dir_host, + tx->host_path); + if (n < 0 || (size_t) n >= sizeof(link_host)) { + errno = ENAMETOOLONG; + return NULL; + } + } + + char link_parent[LINUX_PATH_MAX]; + if (path_parent_copy(link_host, link_parent, sizeof(link_parent)) < 0) + return NULL; + if (relative_path_between(link_parent, target_host, buf, bufsz) < 0) + return NULL; + return buf; +} + /* open/close. 
*/ int64_t sys_openat_path(guest_t *g, @@ -1250,6 +1380,10 @@ int64_t sys_pipe2(guest_t *g, uint64_t fds_gva, int linux_flags) if (pipe(host_fds) < 0) return linux_errno(); +#ifdef F_SETNOSIGPIPE + (void) fcntl(host_fds[1], F_SETNOSIGPIPE, 1); +#endif + int guest_fds[2]; guest_fds[0] = fd_alloc(FD_PIPE, host_fds[0], NULL); if (guest_fds[0] < 0) { @@ -1623,8 +1757,16 @@ int64_t sys_symlinkat(guest_t *g, if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; - /* Resolve linkpath (the new symlink location) through sysroot */ - if (symlinkat(target, dir_ref.fd, tx.host_path) < 0) { + char relative_target[LINUX_PATH_MAX]; + const char *host_target = host_relative_symlink_target( + target, &dir_ref, &tx, relative_target, sizeof(relative_target)); + if (!host_target) { + host_fd_ref_close(&dir_ref); + return linux_errno(); + } + + /* Resolve linkpath (the new symlink location) through sysroot. */ + if (symlinkat(host_target, dir_ref.fd, tx.host_path) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } diff --git a/src/syscall/io.c b/src/syscall/io.c index acdddfe..e88e9bf 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -833,10 +833,6 @@ static int64_t io_write_result(ssize_t ret) if (ret >= 0) return ret; - int saved_errno = errno; - if (saved_errno == EPIPE) - signal_queue(LINUX_SIGPIPE); - errno = saved_errno; return linux_errno(); } diff --git a/src/syscall/mem.c b/src/syscall/mem.c index b682b75..d6edb05 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -266,7 +266,8 @@ static void split_regions_at_boundary(guest_t *g, uint64_t boundary) static uint64_t find_free_gap_inner(const guest_t *g, uint64_t length, uint64_t min_addr, - uint64_t max_addr) + uint64_t max_addr, + uint64_t align) { /* Round the search start up to the next host-page boundary so an unaligned * addr hint cannot return a result that lands inside a host page already @@ -275,8 +276,7 @@ static uint64_t find_free_gap_inner(const guest_t *g, * aligning to the guest 4 
KiB page is not enough. Advance past each walked * region to the same boundary for the same reason. */ - size_t hps = host_page_size_cached(); - uint64_t gap_start = ALIGN_UP(min_addr, hps); + uint64_t gap_start = ALIGN_UP(min_addr, align); /* Skip the prefix of regions entirely below gap_start in O(log n). After a * successful allocation the gap hint advances near or past the existing @@ -308,7 +308,7 @@ static uint64_t find_free_gap_inner(const guest_t *g, return gap_start; /* Region overlaps; advance past it and round to the next host page */ - gap_start = ALIGN_UP(g->regions[i].end, hps); + gap_start = ALIGN_UP(g->regions[i].end, align); } /* Check trailing space after all regions */ @@ -326,33 +326,27 @@ static uint64_t find_free_gap_inner(const guest_t *g, static uint64_t find_free_gap(guest_t *g, uint64_t length, uint64_t min_addr, - uint64_t max_addr) + uint64_t max_addr, + uint64_t align) { /* RX and RW mappings advance independently, so keep separate hints. */ uint64_t *hint = (min_addr < MMAP_BASE) ? &g->mmap_rx_gap_hint : &g->mmap_rw_gap_hint; - /* Advance the hint to the next host-page boundary so the following - * sequential allocation lands on an address that the kernel accepts for - * mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The tradeoff - * is up to host_page-1 bytes of address-space waste per small allocation; - * physical pages are still demand-paged, so RAM cost is unchanged. 
- */ - size_t hps = host_page_size_cached(); - /* Try cached hint first (only if within the valid range) */ if (*hint >= min_addr && *hint < max_addr) { - uint64_t result = find_free_gap_inner(g, length, *hint, max_addr); + uint64_t result = + find_free_gap_inner(g, length, *hint, max_addr, align); if (result != UINT64_MAX) { - *hint = ALIGN_UP(result + length, hps); + *hint = ALIGN_UP(result + length, align); return result; } } /* Full scan from base */ - uint64_t result = find_free_gap_inner(g, length, min_addr, max_addr); + uint64_t result = find_free_gap_inner(g, length, min_addr, max_addr, align); if (result != UINT64_MAX) - *hint = ALIGN_UP(result + length, hps); + *hint = ALIGN_UP(result + length, align); return result; } @@ -1869,14 +1863,6 @@ int64_t sys_mmap(guest_t *g, return -LINUX_ENODEV; } - /* Round length up to page size (overflow-safe) */ - if (length > UINT64_MAX - 4095) - return -LINUX_ENOMEM; - length = PAGE_ALIGN_UP(length); - if (length == 0) - return -LINUX_ENOMEM; - - /* Linux kernel rejects MAP_FIXED with non-page-aligned address */ bool is_fixed = (flags & LINUX_MAP_FIXED) || (flags & LINUX_MAP_FIXED_NOREPLACE); if (is_fixed && (addr & 4095)) @@ -1887,6 +1873,19 @@ int64_t sys_mmap(guest_t *g, */ bool is_noreplace = (flags & LINUX_MAP_FIXED_NOREPLACE) != 0; + size_t hps = host_page_size_cached(); + uint64_t align = hps; + if (!is_fixed && !is_anon && fd >= 0 && (flags & LINUX_MAP_SHARED)) { + align = BLOCK_2MIB; + } + + /* Round length up to align size (overflow-safe) */ + if (length > UINT64_MAX - (align - 1)) + return -LINUX_ENOMEM; + length = ALIGN_UP(length, align); + if (length == 0) + return -LINUX_ENOMEM; + uint64_t result_off; /* Result as offset (0-based) */ if (is_fixed) { /* Addresses above TASK_SIZE (bit 63 set or beyond user VA range) are @@ -2191,7 +2190,8 @@ int64_t sys_mmap(guest_t *g, * ones. The RX region at MMAP_RX_BASE is pre-mapped with execute * permission. 
*/ - result_off = find_free_gap(g, length, MMAP_RX_BASE, g->mmap_limit); + result_off = + find_free_gap(g, length, MMAP_RX_BASE, g->mmap_limit, align); if (result_off == UINT64_MAX) { log_debug( "mmap: RX address space exhausted " @@ -2232,12 +2232,13 @@ int64_t sys_mmap(guest_t *g, */ uint64_t hint_max = (hint_off < MMAP_BASE) ? MMAP_BASE : g->mmap_limit; - result_off = - find_free_gap_inner(g, length, hint_off, hint_max); + result_off = find_free_gap_inner(g, length, hint_off, + hint_max, align); } } if (result_off == UINT64_MAX) - result_off = find_free_gap(g, length, MMAP_BASE, g->mmap_limit); + result_off = + find_free_gap(g, length, MMAP_BASE, g->mmap_limit, align); if (result_off == UINT64_MAX) { log_debug( "mmap: RW address space exhausted " @@ -2366,6 +2367,12 @@ int64_t sys_mmap(guest_t *g, * host pages). The "extra" trailing bytes inside the host page are * never reachable by the guest because the gap-finder advances the hint * to the next host-page boundary after each allocation. + * mmap rounds length up to the host page size internally; only + * addr and offset alignment matter for MAP_FIXED on macOS Apple + * Silicon (16 KiB host pages). The "extra" trailing bytes inside + * the host page are never reachable by the guest because the + * gap-finder advances the hint to the next host-page boundary + * after each allocation. */ /* overlay_fd_writable rejects read-only backing fds inside * hvf_apply_file_overlay; mirror the check here so a read-only mmap @@ -2480,7 +2487,6 @@ int64_t sys_mmap(guest_t *g, * keeps coherent with the file's page cache. 
*/ if (!is_anon && fd >= 0 && !is_prot_none && (flags & LINUX_MAP_SHARED)) { - size_t hps = host_page_size_cached(); if ((result_off % hps == 0) && ((uint64_t) offset % hps == 0)) { for (int i = 0; i < g->nregions; i++) { if (g->regions[i].start == result_off && @@ -2932,9 +2938,11 @@ int64_t sys_mremap(guest_t *g, uint64_t new_off; if (needs_exec && !(prot & LINUX_PROT_WRITE)) - new_off = find_free_gap(g, new_size, MMAP_RX_BASE, g->mmap_limit); + new_off = find_free_gap(g, new_size, MMAP_RX_BASE, g->mmap_limit, + host_page_size_cached()); else - new_off = find_free_gap(g, new_size, MMAP_BASE, g->mmap_limit); + new_off = find_free_gap(g, new_size, MMAP_BASE, g->mmap_limit, + host_page_size_cached()); if (new_off == UINT64_MAX) { if (track_backing_fd >= 0) diff --git a/src/syscall/proc-identity.c b/src/syscall/proc-identity.c index 5a157cd..ef11d4a 100644 --- a/src/syscall/proc-identity.c +++ b/src/syscall/proc-identity.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #include "syscall/abi.h" #include "core/shim-globals.h" @@ -24,16 +27,33 @@ static _Atomic int64_t guest_sid = 1, guest_pgid = 1; static _Atomic int64_t guest_fg_pgrp = 1; static _Atomic int32_t guest_has_ctty = 1; +static uint32_t proc_identity_env_u32(const char *name, uint32_t fallback) +{ + const char *value = getenv(name); + if (!value || !*value) + return fallback; + + errno = 0; + char *end = NULL; + unsigned long parsed = strtoul(value, &end, 10); + if (errno != 0 || end == value || *end != '\0' || parsed > UINT32_MAX) + return fallback; + return (uint32_t) parsed; +} + void proc_identity_init(void) { + uint32_t initial_uid = proc_identity_env_u32("ELFUSE_GUEST_UID", GUEST_UID); + uint32_t initial_gid = proc_identity_env_u32("ELFUSE_GUEST_GID", GUEST_GID); + guest_pid = 1; parent_pid = 0; - emu_uid = GUEST_UID; - emu_euid = GUEST_UID; - emu_suid = GUEST_UID; - emu_gid = GUEST_GID; - emu_egid = GUEST_GID; - emu_sgid = GUEST_GID; + emu_uid = initial_uid; + emu_euid = 
initial_uid; + emu_suid = initial_uid; + emu_gid = initial_gid; + emu_egid = initial_gid; + emu_sgid = initial_gid; emu_nice = 0; guest_sid = 1; guest_pgid = 1; @@ -93,36 +113,47 @@ static bool gid_is_permitted(uint32_t val) int64_t proc_sys_setuid(uint32_t uid) { - if (!uid_is_permitted(uid)) + bool privileged = (emu_euid == 0); + if (!privileged && !uid_is_permitted(uid)) return -LINUX_EPERM; + if (privileged) { + emu_uid = uid; + emu_suid = uid; + } emu_euid = uid; return 0; } int64_t proc_sys_setgid(uint32_t gid) { - if (!gid_is_permitted(gid)) + bool privileged = (emu_euid == 0); + if (!privileged && !gid_is_permitted(gid)) return -LINUX_EPERM; + if (privileged) { + emu_gid = gid; + emu_sgid = gid; + } emu_egid = gid; return 0; } -#define DEFINE_SETRE(suffix, real, eff, saved, perm_fn) \ - int64_t proc_sys_setre##suffix(uint32_t r, uint32_t e) \ - { \ - uint32_t old_real = real; \ - if (r != (uint32_t) -1 && r != real && r != eff) \ - return -LINUX_EPERM; \ - if (e != (uint32_t) -1 && !perm_fn(e)) \ - return -LINUX_EPERM; \ - if (r != (uint32_t) -1) \ - real = r; \ - if (e != (uint32_t) -1) { \ - eff = e; \ - if (r != (uint32_t) -1 || e != old_real) \ - saved = e; \ - } \ - return 0; \ +#define DEFINE_SETRE(suffix, real, eff, saved, perm_fn) \ + int64_t proc_sys_setre##suffix(uint32_t r, uint32_t e) \ + { \ + uint32_t old_real = real; \ + bool privileged = (emu_euid == 0); \ + if (!privileged && r != (uint32_t) -1 && r != real && r != eff) \ + return -LINUX_EPERM; \ + if (!privileged && e != (uint32_t) -1 && !perm_fn(e)) \ + return -LINUX_EPERM; \ + if (r != (uint32_t) -1) \ + real = r; \ + if (e != (uint32_t) -1) { \ + eff = e; \ + if (r != (uint32_t) -1 || e != old_real) \ + saved = e; \ + } \ + return 0; \ } DEFINE_SETRE(uid, emu_uid, emu_euid, emu_suid, uid_is_permitted) @@ -133,11 +164,12 @@ DEFINE_SETRE(gid, emu_gid, emu_egid, emu_sgid, gid_is_permitted) #define DEFINE_SETRES(suffix, real, eff, saved, perm_fn) \ int64_t proc_sys_setres##suffix(uint32_t 
r, uint32_t e, uint32_t s) \ { \ - if (r != (uint32_t) -1 && !perm_fn(r)) \ + bool privileged = (emu_euid == 0); \ + if (!privileged && r != (uint32_t) -1 && !perm_fn(r)) \ return -LINUX_EPERM; \ - if (e != (uint32_t) -1 && !perm_fn(e)) \ + if (!privileged && e != (uint32_t) -1 && !perm_fn(e)) \ return -LINUX_EPERM; \ - if (s != (uint32_t) -1 && !perm_fn(s)) \ + if (!privileged && s != (uint32_t) -1 && !perm_fn(s)) \ return -LINUX_EPERM; \ if (r != (uint32_t) -1) \ real = r; \ diff --git a/src/syscall/proc-state.c b/src/syscall/proc-state.c index 81dc4e0..260f50b 100644 --- a/src/syscall/proc-state.c +++ b/src/syscall/proc-state.c @@ -553,7 +553,16 @@ static const char *proc_resolve_sysroot_path_flags(const char *path, errno = ENAMETOOLONG; return NULL; } - return path; + char parent[LINUX_PATH_MAX]; + str_copy_trunc(parent, buf, sizeof(parent)); + char *slash = strrchr(parent, '/'); + if (!slash || slash == parent) + return buf; + *slash = '\0'; + + if (sysroot_validate_dir_prefix(parent) < 0) + return NULL; + return buf; } const char *proc_resolve_sysroot_path(const char *path, char *buf, size_t bufsz) @@ -615,14 +624,6 @@ const char *proc_resolve_sysroot_create_path(const char *path, if (errno != ENOENT && errno != ENOTDIR) return NULL; - /* Parent doesn't exist in sysroot. Only /tmp, /var/tmp, and ccache get - * forcefully redirected to the sysroot to avoid host case-collisions; - * everything else falls back to the host literal. 
- */ - if (strncmp(path, "/tmp/", 5) && strncmp(path, "/var/tmp/", 9) && - !strstr(path, "/.ccache/")) - return path; - if (!create_parents) { if (sysroot_validate_dir_prefix(parent) < 0) return NULL; diff --git a/src/syscall/proc.c b/src/syscall/proc.c index beab7ea..88b5db1 100644 --- a/src/syscall/proc.c +++ b/src/syscall/proc.c @@ -313,6 +313,7 @@ void proc_mark_child_exited(pid_t host_pid, int status) pthread_cond_broadcast(&pid_cond); pthread_mutex_unlock(&pid_lock); proc_pidfd_notify_exit(gpid); + signal_queue(LINUX_SIGCHLD); return; } pthread_mutex_unlock(&pid_lock); diff --git a/src/syscall/sys.c b/src/syscall/sys.c index da6b7d4..3530be2 100644 --- a/src/syscall/sys.c +++ b/src/syscall/sys.c @@ -31,9 +31,6 @@ /* System info syscall handlers. */ -static pthread_once_t groups_once = PTHREAD_ONCE_INIT; -static uint32_t cached_linux_groups[64]; -static int cached_ngroups = -1; static const linux_utsname_t cached_uname = { .sysname = "Linux", .nodename = "elfuse", @@ -81,18 +78,6 @@ _Static_assert(offsetof(struct rusage, ru_maxrss) == */ static bool sched_pid_alive(int pid); -static void groups_init_cached_linux_groups(void) -{ - gid_t groups[64]; - int ngroups = getgroups(64, groups); - if (ngroups < 0) - return; - - for (int i = 0; i < ngroups; i++) - cached_linux_groups[i] = (uint32_t) groups[i]; - cached_ngroups = ngroups; -} - static void sysinfo_init_cached_host_state(void) { struct timeval boottime; @@ -158,19 +143,6 @@ static void sysinfo_refresh_cached_locked(time_t now_sec) cached_sysinfo_sec = now_sec; } -static int get_cached_linux_groups(void) -{ - if (thread_is_single_active()) { - if (cached_ngroups >= 0) - return cached_ngroups; - groups_init_cached_linux_groups(); - return cached_ngroups; - } - - pthread_once(&groups_once, groups_init_cached_linux_groups); - return cached_ngroups; -} - int64_t sys_uname(guest_t *g, uint64_t buf_gva) { if (guest_write_small(g, buf_gva, &cached_uname, sizeof(cached_uname)) < 0) @@ -487,17 +459,15 @@ int64_t 
sys_sched_rr_get_interval(guest_t *g, int pid, uint64_t ts_gva) int64_t sys_getgroups(guest_t *g, int size, uint64_t list_gva) { - int ngroups = get_cached_linux_groups(); - if (ngroups < 0) - return linux_errno(); + const int ngroups = 1; if (size == 0) return ngroups; if (size < ngroups) return -LINUX_EINVAL; - size_t bytes = (size_t) ngroups * sizeof(cached_linux_groups[0]); - if (guest_write_small(g, list_gva, cached_linux_groups, bytes) < 0) + uint32_t group = proc_get_gid(); + if (guest_write_small(g, list_gva, &group, sizeof(group)) < 0) return -LINUX_EFAULT; return ngroups; diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index 6f97fc0..cb0237d 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -556,6 +556,7 @@ SC_FORWARD(sc_setfsuid, (int64_t) proc_get_euid()) SC_FORWARD(sc_setfsgid, (int64_t) proc_get_egid()) SC_FORWARD(sc_setpgid, proc_sys_setpgid(g, (int64_t) x0, (int64_t) x1)) SC_STUB(sc_fadvise64, 0) +SC_STUB(sc_sync_file_range, 0) SC_STUB(sc_sched_yield, (sched_yield(), 0)) SC_STUB(sc_mlock, 0) SC_STUB(sc_munlock, 0) @@ -1952,8 +1953,6 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) result = ret; goto fast_done; } - if (nr == SYS_write && errno == EPIPE) - signal_queue(LINUX_SIGPIPE); result = linux_errno(); goto fast_done; } @@ -2057,12 +2056,23 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) if (verbose) { log_debug(" -> %lld (0x%llx)", (long long) result, (unsigned long long) (uint64_t) result); - /* Log file paths for openat/readlinkat */ + /* Log path-bearing syscalls. 
*/ if ((int) x8 == SYS_openat || (int) x8 == SYS_readlinkat || - (int) x8 == SYS_faccessat) { + (int) x8 == SYS_faccessat || (int) x8 == SYS_newfstatat || + (int) x8 == SYS_unlinkat || (int) x8 == SYS_mkdirat || + (int) x8 == SYS_fchmodat || (int) x8 == SYS_fchownat) { char pathbuf[256]; if (guest_read_str(g, x1, pathbuf, sizeof(pathbuf)) >= 0) log_debug(" path=\"%s\"", pathbuf); + } else if ((int) x8 == SYS_renameat || (int) x8 == SYS_renameat2 || + (int) x8 == SYS_linkat || (int) x8 == SYS_symlinkat) { + char path_a[256]; + char path_b[256]; + uint64_t a_gva = ((int) x8 == SYS_symlinkat) ? x0 : x1; + uint64_t b_gva = ((int) x8 == SYS_symlinkat) ? x2 : x3; + if (guest_read_str(g, a_gva, path_a, sizeof(path_a)) >= 0 && + guest_read_str(g, b_gva, path_b, sizeof(path_b)) >= 0) + log_debug(" path=\"%s\" -> \"%s\"", path_a, path_b); } } /* Write result back to X0 */