diff --git a/fs/namei.c b/fs/namei.c index 9155ecb547ce62..57cc727134de6c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -4928,6 +4929,10 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); + if (!error) { + struct path rpath = { .dentry = old_dentry, .mnt = new_path.mnt }; + lttle_check_rename(&rpath); + } exit5: dput(new_dentry); exit4: diff --git a/fs/open.c b/fs/open.c index a81319b6177f69..6292edb165f500 100644 --- a/fs/open.c +++ b/fs/open.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "internal.h" @@ -1423,6 +1424,7 @@ int filp_close(struct file *filp, fl_owner_t id) dnotify_flush(filp, id); locks_remove_posix(filp, id); } + lttle_check_close(filp); fput(filp); return retval; } diff --git a/include/lttle/trigger.h b/include/lttle/trigger.h new file mode 100644 index 00000000000000..6336fa27f59fc8 --- /dev/null +++ b/include/lttle/trigger.h @@ -0,0 +1,46 @@ +#ifndef _LTTLE_TRIGGER_H +#define _LTTLE_TRIGGER_H + +#include +#include + +struct file; +struct path; + +struct lttle_watch_req { + __u32 index; + char path[256]; +}; + +#define LTTLE_IOC_MAGIC 'L' +#define LTTLE_IOC_WATCH _IOW(LTTLE_IOC_MAGIC, 1, struct lttle_watch_req) + +/* Triggers are sent via PIO port 0x510 (8-byte outl pairs). + * No MMIO mapping needed. */ +#define LTTLE_TRIGGER_PIO_PORT (0x510) + +#define LTTLE_SYS_AFTER_OFFSET 127 +#define LTTLE_SYS_CMD_OFFSET 64 + +#define LTTLE_SYS_LISTEN_BEFORE 1 +#define LTTLE_SYS_BIND_BEFORE 2 +#define LTTLE_USERSPACE_READY 3 +#define LTTLE_MANUAL_TRIGGER 10 + +#define LTTLE_SYS_LISTEN_AFTER (LTTLE_SYS_LISTEN_BEFORE + LTTLE_SYS_AFTER_OFFSET) +#define LTTLE_SYS_BIND_AFTER (LTTLE_SYS_BIND_BEFORE + LTTLE_SYS_AFTER_OFFSET) + +#define LTTLE_CMD_FLASH_LOCK (LTTLE_SYS_CMD_OFFSET + 0) +#define LTTLE_CMD_FLASH_UNLOCK (LTTLE_SYS_CMD_OFFSET + 1) + +#define LTTLE_FILE_WATCH_MAX 2 + +int __init lttle_subsystem_init(void); +void __exit lttle_subsystem_exit(void); +void lttle_sys_trigger(unsigned char code, char data[7]); +void lttle_sys_cmd(unsigned char cmd); +void lttle_set_watch(int index, const char *path); +void lttle_check_close(struct file *file); +void lttle_check_rename(const struct path *path); + +#endif // _LTTLE_TRIGGER_H diff --git a/kernel/Makefile b/kernel/Makefile index d754e0be1176df..837cc9cd123816 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,8 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o ucount.o regset.o + async.o range.o smpboot.o ucount.o regset.o \ + lttle.o obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o diff --git a/kernel/lttle.c b/kernel/lttle.c new file mode 100644 index 00000000000000..0b5495744365fd --- /dev/null +++ b/kernel/lttle.c @@ -0,0 +1,407 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* No MMIO mapping needed — triggers go via PIO port 0x510. */ + +/* + * Fork resume via NMI — solves the AMD KVM_SET_LAPIC hrtimer bug. + * + * On AMD, KVM_SET_LAPIC restores LAPIC register values but doesn't arm + * the internal hrtimer. The timer only starts when the guest writes to + * APIC_TMICT (initial count register). After fork restore, all vCPUs are + * halted and the timer is dead. + * + * The VMM writes FORK_PHASE_REQ_RESUME_FORK (3) to guest physical address + * 0x6348 (fork mailbox + 0x348) and injects an NMI before starting vCPUs. + * This NMI handler checks the mailbox, and if a fork resume is requested: + * 1. Writes to APIC_TMICT to trigger KVM to create an hrtimer + * 2. Reconfigures network, hostname, entropy from fork mailbox + * + * The fork mailbox physical address and phase offset must match the VMM. + */ +#define FORK_MAILBOX_PHYS 0x5000 +#define FORK_PHASE_OFFSET 0x348 +#define FORK_PHASE_REQ_RESUME 3 + +static volatile void *fork_mailbox_page; + +/* Workqueue handler: signal takeoff (PID 1) to do fork reconfiguration. + * Now that drain_io() is called before snapshot, disk I/O works after + * restore, so takeoff can read /dev/mem and run ip commands normally. */ +static void fork_reconfig_work_fn(struct work_struct *work) +{ + struct pid *pid; + int ret = -1; + const char *magic; + + if (!fork_mailbox_page) + return; + + /* Verify the fork mailbox has "BOXDFORK" magic at offset 0 */ + magic = (const char *)fork_mailbox_page; + if (memcmp(magic, "BOXDFORK", 8) != 0) { + pr_warn("lttle: fork mailbox magic mismatch, skipping signal\n"); + return; + } + + pr_info("lttle: fork detected — ip=%s gw=%s name=%s\n", + (const char *)fork_mailbox_page + 0x48, + (const char *)fork_mailbox_page + 0xC8, + (const char *)fork_mailbox_page + 0x248); + + /* Send signal 42 (SIGRTMIN+10) to PID 1 (takeoff) */ + rcu_read_lock(); + pid = find_pid_ns(1, &init_pid_ns); + if (pid) + ret = kill_pid(pid, 42, 1); + rcu_read_unlock(); + pr_info("lttle: kill_pid(1, 42) = %d\n", ret); +} + +static DECLARE_WORK(fork_reconfig_work, fork_reconfig_work_fn); + +/* irq_work runs from normal interrupt context, not NMI context. + * This is where we re-arm the LAPIC timer and re-init kvmclock. */ + +static void fork_resume_irq_work(struct irq_work *work) +{ + u32 lvt, lvt_new, tmict, apicbase_lo, apicbase_hi; + u64 msr_val; + int cpu = smp_processor_id(); + + /* + * Step 1: Re-initialize kvmclock. + * + * After fork, the pvclock shared page has stale parameters from the + * source VM. The guest computes timer periods using these stale values, + * producing absurd results (e.g. 992ns periods). Re-writing + * MSR_KVM_SYSTEM_TIME_NEW with its current value forces KVM to + * recompute the pvclock parameters for this VM's TSC. + */ + rdmsrl(0x4b564d01, msr_val); /* MSR_KVM_SYSTEM_TIME_NEW */ + if (msr_val) { + pr_info("lttle: CPU %d re-init kvmclock: MSR_KVM_SYSTEM_TIME_NEW=0x%llx\n", + cpu, msr_val); + wrmsrl(0x4b564d01, msr_val); + } + + /* + * Step 2: Re-arm LAPIC timer. + * + * On AMD, KVM_SET_LAPIC restores register values but doesn't arm + * the host hrtimer. Writing APIC_TMICT triggers a VM exit that + * creates the hrtimer. Use periodic mode at ~10Hz. + */ + lvt = apic_read(APIC_LVTT); + tmict = apic_read(APIC_TMICT); + rdmsr(MSR_IA32_APICBASE, apicbase_lo, apicbase_hi); + + pr_info("lttle: CPU %d BEFORE: LVTT=0x%08x TMICT=%u APICBASE=0x%08x\n", + cpu, lvt, tmict, apicbase_lo); + + lvt_new = (lvt & 0xFF) | (1 << 17); /* periodic + keep vector */ + apic_write(APIC_LVTT, lvt_new); + apic_write(APIC_TMICT, 23700000); + + pr_info("lttle: CPU %d AFTER: LVTT=0x%08x TMICT=%u TMCCT=%u\n", + cpu, apic_read(APIC_LVTT), apic_read(APIC_TMICT), + apic_read(APIC_TMCCT)); + + /* Step 3: Schedule fork reconfiguration (network, hostname, entropy). + * Only CPU 0 does this. call_usermodehelper needs process context, + * so we schedule it on a workqueue. */ + if (cpu == 0 && fork_mailbox_page) { + /* Clear the fork phase so subsequent NMIs are ignored */ + *(volatile u8 *)((char *)fork_mailbox_page + FORK_PHASE_OFFSET) = 0; + schedule_work(&fork_reconfig_work); + } +} + +static DEFINE_PER_CPU(struct irq_work, fork_resume_work) = + IRQ_WORK_INIT(fork_resume_irq_work); + +static int lttle_nmi_handler(unsigned int type, struct pt_regs *regs) +{ + u8 phase; + + if (!fork_mailbox_page) + return NMI_DONE; + + phase = *(volatile u8 *)((char *)fork_mailbox_page + FORK_PHASE_OFFSET); + if (phase != FORK_PHASE_REQ_RESUME) + return NMI_DONE; + + pr_info("lttle: NMI handler CPU %d: phase=%d LVTT=0x%08x TMICT=%u TMCCT=%u\n", + smp_processor_id(), phase, + apic_read(APIC_LVTT), apic_read(APIC_TMICT), apic_read(APIC_TMCCT)); + + /* Schedule the actual timer re-arm from normal context. + * KVM on AMD may not arm the hrtimer from NMI context. + * Use per-CPU irq_work so every vCPU re-inits its own timers. + * Don't clear the phase here — both CPUs need to see it. + * The irq_work handler on CPU 0 will clear it after signaling takeoff. */ + irq_work_queue(this_cpu_ptr(&fork_resume_work)); + + return NMI_HANDLED; +} + +/* File watch state */ +static char watched_paths[LTTLE_FILE_WATCH_MAX][256]; +static char watched_names[LTTLE_FILE_WATCH_MAX][64]; +static bool watch_active[LTTLE_FILE_WATCH_MAX]; + +static void lttle_emit_event(int file_index) +{ + struct pid *pid; + int sig = (file_index == 0) ? SIGUSR1 : SIGUSR2; + int ret; + + /* Send signal to the process (not a specific thread) so that + * any thread waiting via sigwait() can pick it up. */ + rcu_read_lock(); + pid = find_pid_ns(1, &init_pid_ns); + if (pid) { + struct task_struct *task = pid_task(pid, PIDTYPE_PID); + pr_info("lttle: emit signal %d to pid 1 (task=%s, pid=%d, tgid=%d)\n", + sig, task ? task->comm : "NULL", + task ? task->pid : -1, + task ? task->tgid : -1); + ret = kill_pid(pid, sig, 1); + pr_info("lttle: kill_pid returned %d\n", ret); + } else { + pr_warn("lttle: find_pid_ns(1, init_pid_ns) returned NULL!\n"); + } + rcu_read_unlock(); +} + +void lttle_set_watch(int index, const char *path) +{ + const char *slash; + + if (index < 0 || index >= LTTLE_FILE_WATCH_MAX) + return; + + strscpy(watched_paths[index], path, sizeof(watched_paths[index])); + slash = strrchr(path, '/'); + strscpy(watched_names[index], slash ? slash + 1 : path, + sizeof(watched_names[index])); + watch_active[index] = true; + pr_info("lttle: watching path[%d] = %s (name=%s)\n", + index, watched_paths[index], watched_names[index]); +} + +void lttle_check_close(struct file *file) +{ + int i; + const char *name; + char buf[256]; + char *path; + + if (!(file->f_mode & FMODE_WRITE)) + return; + + name = file->f_path.dentry->d_name.name; + for (i = 0; i < LTTLE_FILE_WATCH_MAX; i++) { + if (!watch_active[i]) + continue; + if (strcmp(name, watched_names[i]) != 0) + continue; + /* Filename matches, do full path check */ + path = d_path(&file->f_path, buf, sizeof(buf)); + if (IS_ERR(path)) + continue; + if (strcmp(path, watched_paths[i]) == 0) { + pr_info("lttle: file close detected [%d] %s\n", + i, path); + lttle_emit_event(i); + break; + } + } +} + +void lttle_check_rename(const struct path *rpath) +{ + int i; + const char *name; + char buf[256]; + char *path; + + name = rpath->dentry->d_name.name; + for (i = 0; i < LTTLE_FILE_WATCH_MAX; i++) { + if (!watch_active[i]) + continue; + if (strcmp(name, watched_names[i]) != 0) + continue; + path = d_path(rpath, buf, sizeof(buf)); + if (IS_ERR(path)) + continue; + if (strcmp(path, watched_paths[i]) == 0) { + pr_info("lttle: file rename detected [%d] %s\n", + i, path); + lttle_emit_event(i); + break; + } + } +} + +typedef struct { + unsigned char code; + // 7 bytes data + unsigned char data[7]; +} lttle_sys_trigger_data; + +typedef char lttle_sys_trigger_data_incomplete_size[sizeof(lttle_sys_trigger_data) == 8 ? 1 : -1]; // Ensure the size of the struct is 8 bytes + +static ssize_t lttle_proc_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + /* Boot time counters are no longer available via MMIO. Return 0. */ + unsigned long long last_boot_time = 0; + unsigned long long first_boot_time = 0; + char msg[1024]; + int len = snprintf(msg, sizeof(msg), "{\"last_boot_time_us\": %llu, \"first_boot_time_us\": %llu}", last_boot_time, first_boot_time); + + return simple_read_from_buffer(buf, count, ppos, msg, len); +} + +static ssize_t lttle_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + int len; + char tbuf[256] = {0}; + + if (count >= sizeof(tbuf)) { + return -ENOSPC; + } + + len = simple_write_to_buffer(tbuf, sizeof(tbuf) - 1, ppos, buf, count); + if (len < 0) { + return len; + } + + tbuf[len] = '\0'; + + if (strcmp(tbuf, "manual_trigger") == 0) { + lttle_sys_trigger(LTTLE_MANUAL_TRIGGER, NULL); + *ppos = 0; // Reset position for next write + return count; + } + + if (strcmp(tbuf, "flash_lock") == 0) { + lttle_sys_cmd(LTTLE_CMD_FLASH_LOCK); + *ppos = 0; // Reset position for next write + return count; + } + + if (strcmp(tbuf, "flash_unlock") == 0) { + lttle_sys_cmd(LTTLE_CMD_FLASH_UNLOCK); + *ppos = 0; // Reset position for next write + return count; + } + + *ppos = 0; // Reset position for next write + return count; +} + +static long lttle_proc_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case LTTLE_IOC_WATCH: { + struct lttle_watch_req req; + + if (copy_from_user(&req, (void __user *)arg, sizeof(req))) + return -EFAULT; + req.path[sizeof(req.path) - 1] = '\0'; + if (req.index >= LTTLE_FILE_WATCH_MAX) + return -EINVAL; + lttle_set_watch(req.index, req.path); + return 0; + } + default: + return -ENOTTY; + } +} + +static const struct proc_ops lttle_proc_ops = { + .proc_read = lttle_proc_read, + .proc_write = lttle_proc_write, + .proc_ioctl = lttle_proc_ioctl, +}; + +static struct proc_dir_entry *lttle_proc_entry; + +int __init lttle_subsystem_init(void) +{ + lttle_proc_entry = proc_create("lttle", 0666, NULL, <tle_proc_ops); + if (!lttle_proc_entry) { + pr_err("Failed to create /proc/lttle\n"); + return -1; + } + + /* Map the fork mailbox page for the NMI handler */ + fork_mailbox_page = ioremap(FORK_MAILBOX_PHYS, PAGE_SIZE); + if (!fork_mailbox_page) + pr_warn("LTTLE: failed to map fork mailbox at 0x%x\n", FORK_MAILBOX_PHYS); + + /* Register NMI handler for fork resume */ + register_nmi_handler(NMI_UNKNOWN, lttle_nmi_handler, 0, "lttle_fork"); + + pr_info("LTTLE subsystem initialized (PIO triggers on port 0x%x)\n", LTTLE_TRIGGER_PIO_PORT); + return 0; +} + +void __exit lttle_subsystem_exit(void) +{ + unregister_nmi_handler(NMI_UNKNOWN, "lttle_fork"); + + if (lttle_proc_entry) { + proc_remove(lttle_proc_entry); + lttle_proc_entry = NULL; + } + if (fork_mailbox_page) { + iounmap(fork_mailbox_page); + fork_mailbox_page = NULL; + } + + pr_info("LTTLE subsystem exited cleanly\n"); +} + +void lttle_sys_trigger(unsigned char code, char data[7]) +{ + lttle_sys_trigger_data trigger_data; + trigger_data.code = code; + if (data) { + memcpy(trigger_data.data, data, sizeof(trigger_data.data)); + } else { + memset(trigger_data.data, 0, sizeof(trigger_data.data)); + } + + /* Send 8-byte trigger as two 4-byte PIO writes to port 0x510 and 0x514. + * The VMM reconstructs the 8 bytes from these two writes. */ + outl(*(u32 *)&trigger_data, LTTLE_TRIGGER_PIO_PORT); + outl(*((u32 *)&trigger_data + 1), LTTLE_TRIGGER_PIO_PORT + 4); +} + +void lttle_sys_cmd(unsigned char cmd) +{ + /* Commands use port 0x518 (single byte) */ + outb(cmd, LTTLE_TRIGGER_PIO_PORT + 8); +} + +subsys_initcall(lttle_subsystem_init); +module_exit(lttle_subsystem_exit); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5d85014d59b5f8..f3de0f4752dfbc 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -607,6 +607,7 @@ void clockevents_resume(void) if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } +EXPORT_SYMBOL_GPL(clockevents_resume); #ifdef CONFIG_HOTPLUG_CPU diff --git a/net/socket.c b/net/socket.c index 00da9ce3dba0bf..b2d272c89dc66c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -107,6 +107,8 @@ #include #include +#include + #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; @@ -1632,6 +1634,7 @@ int __sys_socket(int family, int type, int protocol) { struct socket *sock; int flags; + int fd; sock = __sys_socket_create(family, type, protocol); if (IS_ERR(sock)) @@ -1641,7 +1644,8 @@ int __sys_socket(int family, int type, int protocol) if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); + fd = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); + return fd; } SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) @@ -1763,12 +1767,34 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) { struct socket *sock; struct sockaddr_storage address; + struct sockaddr *addr; + struct sockaddr_in *addr_in; int err, fput_needed; + bool compatible_family_trigger = false; + char trigger_data[7] = {0}; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, &address); if (!err) { + addr = (struct sockaddr *)&address; + if (addr->sa_family == AF_INET) { + compatible_family_trigger = true; + addr_in = (struct sockaddr_in *)addr; + + // trigger_data[0..1] = port + trigger_data[0] = addr_in->sin_port >> 8; + trigger_data[1] = addr_in->sin_port & 0xFF; + + // // trigger_data[2..5] = address + trigger_data[2] = addr_in->sin_addr.s_addr >> 24; + trigger_data[3] = addr_in->sin_addr.s_addr >> 16; + trigger_data[4] = addr_in->sin_addr.s_addr >> 8; + trigger_data[5] = addr_in->sin_addr.s_addr; + + lttle_sys_trigger(LTTLE_SYS_BIND_BEFORE, trigger_data); + } + err = security_socket_bind(sock, (struct sockaddr *)&address, addrlen); @@ -1779,6 +1805,11 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) } fput_light(sock->file, fput_needed); } + + if (err == 0 && compatible_family_trigger) { + lttle_sys_trigger(LTTLE_SYS_BIND_AFTER, trigger_data); + } + return err; } @@ -1798,9 +1829,27 @@ int __sys_listen(int fd, int backlog) struct socket *sock; int err, fput_needed; int somaxconn; + bool compatible_family_trigger = false; + char trigger_data[7] = {0}; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { + if (sock->sk) { + compatible_family_trigger = true; + + // trigger_data[0..1] = port (local port for listening socket) + trigger_data[0] = sock->sk->sk_num & 0xFF; + trigger_data[1] = sock->sk->sk_num >> 8; + // trigger_data[2..5] = address (local address) + trigger_data[2] = sock->sk->sk_rcv_saddr >> 24; + trigger_data[3] = sock->sk->sk_rcv_saddr >> 16; + trigger_data[4] = sock->sk->sk_rcv_saddr >> 8; + trigger_data[5] = sock->sk->sk_rcv_saddr; + + lttle_sys_trigger(LTTLE_SYS_LISTEN_BEFORE, trigger_data); + } + + somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); if ((unsigned int)backlog > somaxconn) backlog = somaxconn; @@ -1811,6 +1860,11 @@ int __sys_listen(int fd, int backlog) fput_light(sock->file, fput_needed); } + + if (err == 0 && compatible_family_trigger) { + lttle_sys_trigger(LTTLE_SYS_LISTEN_AFTER, trigger_data); + } + return err; }