diff --git a/bench_vs/lambda/recursion/Cargo.toml b/bench_vs/lambda/recursion/Cargo.toml index bdfeb38dc..9a4a9e138 100644 --- a/bench_vs/lambda/recursion/Cargo.toml +++ b/bench_vs/lambda/recursion/Cargo.toml @@ -9,3 +9,6 @@ edition = "2024" lambda-vm-prover = { path = "../../../prover", default-features = false } lambda-vm-syscalls = { path = "../../../syscalls" } postcard = { version = "1.0", features = ["alloc"] } + +[profile.release] +debug = 2 diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs index 2b053755c..b5e494d7b 100644 --- a/bin/cli/src/main.rs +++ b/bin/cli/src/main.rs @@ -10,11 +10,7 @@ use clap::{Parser, Subcommand, ValueHint}; #[global_allocator] static ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -use executor::{ - elf::{Elf, SymbolTable}, - flamegraph::FlamegraphGenerator, - vm::execution::Executor, -}; +use executor::{elf::Elf, flamegraph::FlamegraphGenerator, vm::execution::Executor}; use prover::VmProof; use stark::proof::options::GoldilocksCubicProofOptions; @@ -112,6 +108,22 @@ enum Commands { #[arg(long, value_hint = ValueHint::FilePath)] flamegraph: Option, + /// Key the folded stacks by raw hex address instead of resolving + /// through the ELF symtab (pairs with scripts/enrich_flamegraph.py). + /// Only meaningful with --flamegraph. + #[arg(long, requires = "flamegraph")] + flamegraph_raw: bool, + + /// Checkpoint the flamegraph's folded output to --flamegraph every N + /// cycles, so a killed run still leaves usable (partial) output on + /// disk. Only meaningful with --flamegraph. + #[arg(long, requires = "flamegraph")] + flamegraph_checkpoint_cycles: Option, + + /// Stop execution early once at least this many cycles have run. + #[arg(long)] + cycle_budget: Option, + /// Print the dynamic instruction (cycle) count #[arg(long)] cycles: bool, @@ -207,8 +219,21 @@ fn main() -> ExitCode { elf, private_input, flamegraph, + flamegraph_raw, + flamegraph_checkpoint_cycles, + cycle_budget, + cycles, + } => cmd_execute( + elf, + private_input, + FlamegraphCliOptions { + path: flamegraph, + raw: flamegraph_raw, + checkpoint_cycles: flamegraph_checkpoint_cycles, + }, + cycle_budget, cycles, - } => cmd_execute(elf, private_input, flamegraph, cycles), + ), Commands::Prove { elf, output, @@ -272,10 +297,38 @@ fn count_cycles(elf_data: &[u8], private_inputs: &[u8]) -> Result { .map_err(|e| format!("Execution failed during cycle count: {e:?}")) } +/// Write the flamegraph's current (possibly partial) folded output to +/// `output_path`, overwriting any previous contents. Used both for the final +/// write and for periodic checkpoints during a long run. +fn write_flamegraph_checkpoint( + output_path: &PathBuf, + generator: &FlamegraphGenerator, + raw: bool, +) -> Result<(), String> { + let file = + File::create(output_path).map_err(|e| format!("Failed to create output file: {e}"))?; + let mut writer = BufWriter::new(file); + let result = if raw { + generator.write_folded_raw(&mut writer) + } else { + generator.write_folded(&mut writer) + }; + result.map_err(|e| format!("Failed to write flamegraph output: {e:?}")) +} + +/// Flamegraph-related flags grouped so `cmd_execute` doesn't need a flat +/// 8-argument signature. +struct FlamegraphCliOptions { + path: Option, + raw: bool, + checkpoint_cycles: Option, +} + fn cmd_execute( elf_path: PathBuf, private_input_path: Option, - flamegraph_path: Option, + flamegraph: FlamegraphCliOptions, + cycle_budget: Option, cycles: bool, ) -> ExitCode { let elf_data = match std::fs::read(&elf_path) { @@ -302,71 +355,83 @@ fn cmd_execute( } }; - let mut executor = match Executor::new(&program, private_inputs) { - Ok(e) => e, - Err(e) => { - eprintln!("Failed to create executor: {:?}", e); - return ExitCode::FAILURE; - } - }; - - // Set up flamegraph generator if requested - let mut generator = flamegraph_path.as_ref().map(|_| { - let symbols = SymbolTable::parse(&elf_data); - FlamegraphGenerator::new(symbols, program.entry_point) - }); + let cycle_count = if let Some(ref output_path) = flamegraph.path { + // Shared execute+flamegraph path (executor::flamegraph) instead of + // hand-rolling the SymbolTable/Executor/drive-loop wiring here. + let mut next_checkpoint = flamegraph.checkpoint_cycles; + let result = executor::flamegraph::run_with_flamegraph( + &elf_data, + &program, + private_inputs, + executor::flamegraph::FlamegraphRunOptions { cycle_budget }, + |total_cycles, generator| { + let Some(threshold) = next_checkpoint else { + return; + }; + if total_cycles < threshold { + return; + } + if let Err(e) = write_flamegraph_checkpoint(output_path, generator, flamegraph.raw) + { + eprintln!("Warning: flamegraph checkpoint failed: {e}"); + } + next_checkpoint = flamegraph.checkpoint_cycles.map(|step| threshold + step); + }, + ); - // Execute in chunks, counting cycles and (if requested) feeding the flamegraph. - let mut cycle_count: u64 = 0; - loop { - let logs = match executor.resume() { - Ok(logs) => logs, + let (generator, total_cycles) = match result { + Ok(r) => r, Err(e) => { eprintln!("Execution failed: {:?}", e); return ExitCode::FAILURE; } }; - match logs { - Some(logs) => { - cycle_count += logs.len() as u64; - if let Some(ref mut fg) = generator { - let logs: Vec<_> = logs.to_vec(); - if let Err(e) = fg.process_logs(&logs, &executor.instructions) { - eprintln!("Failed to process logs for flamegraph: {:?}", e); - return ExitCode::FAILURE; - } - } - } - None => break, - } - } - if let Err(e) = executor.finish() { - eprintln!("Failed to finish execution: {:?}", e); - return ExitCode::FAILURE; - } + if let Err(e) = write_flamegraph_checkpoint(output_path, &generator, flamegraph.raw) { + eprintln!("{e}"); + return ExitCode::FAILURE; + } + eprintln!( + "Flamegraph written to {:?} ({} instructions)", + output_path, + generator.total_instructions() + ); - // Write flamegraph output if requested - if let (Some(output_path), Some(generator)) = (flamegraph_path, generator) { - let file = match File::create(&output_path) { - Ok(f) => f, + total_cycles + } else { + let mut executor = match Executor::new(&program, private_inputs) { + Ok(e) => e, Err(e) => { - eprintln!("Failed to create flamegraph output file: {}", e); + eprintln!("Failed to create executor: {:?}", e); return ExitCode::FAILURE; } }; - let mut writer = BufWriter::new(file); - if let Err(e) = generator.write_folded(&mut writer) { - eprintln!("Failed to write flamegraph output: {:?}", e); + + let mut cycle_count: u64 = 0; + loop { + let logs = match executor.resume() { + Ok(logs) => logs, + Err(e) => { + eprintln!("Execution failed: {:?}", e); + return ExitCode::FAILURE; + } + }; + match logs { + Some(logs) => cycle_count += logs.len() as u64, + None => break, + } + if cycle_budget.is_some_and(|budget| cycle_count >= budget) { + break; + } + } + + if let Err(e) = executor.finish() { + eprintln!("Failed to finish execution: {:?}", e); return ExitCode::FAILURE; } - eprintln!( - "Flamegraph written to {:?} ({} instructions)", - output_path, - generator.total_instructions() - ); - } + cycle_count + }; if cycles { println!("Cycles: {}", cycle_count); diff --git a/executor/src/flamegraph.rs b/executor/src/flamegraph.rs index f9b447d19..bbd2a7260 100644 --- a/executor/src/flamegraph.rs +++ b/executor/src/flamegraph.rs @@ -8,26 +8,58 @@ use std::io::{self, Write}; use rustc_demangle::demangle as rustc_demangle; -use crate::elf::SymbolTable; -use crate::vm::execution::InstructionCache; +use crate::elf::{Elf, SymbolTable}; +use crate::vm::execution::{Executor, ExecutorError, InstructionCache}; use crate::vm::instruction::decoding::Instruction; use crate::vm::logs::Log; /// Errors that can occur during flamegraph generation. -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum FlamegraphError { /// Instruction not found for a given program counter. + #[error("instruction not found for a given program counter")] InstructionNotFound, } +/// Errors from the shared execute+flamegraph drive loop. +#[derive(Debug, thiserror::Error)] +pub enum FlamegraphDriveError { + #[error(transparent)] + Executor(#[from] ExecutorError), + #[error(transparent)] + Flamegraph(#[from] FlamegraphError), +} + +/// One node of the call-graph trie. `addr` is the function-entry address of +/// the frame this node represents; `count` is the number of instructions +/// attributed directly to this exact call-stack state. +struct TrieNode { + parent: u32, + addr: u64, + count: u64, + children: HashMap, +} + +/// Root node index. Its own `parent` field is a self-loop sentinel and is +/// never followed — `pop` refuses to move past it. +const ROOT: u32 = 0; + /// Generates flamegraph data by tracking function calls during execution. +/// +/// Instruction counts are stored in a call-graph trie keyed by address, not a +/// demangled string per stack — pushing/popping/counting are all O(1) +/// pointer/hashmap operations independent of call-stack depth. Symbol +/// resolution and demangling happen once per unique address, only when +/// `write_folded` walks the trie. pub struct FlamegraphGenerator { - /// Symbol table for address-to-name resolution + /// Symbol table for address-to-name resolution. symbols: SymbolTable, - /// Current call stack (function entry addresses) - call_stack: Vec, - /// Instruction counts per stack state: "main;foo;bar" -> count - stack_counts: HashMap, + /// Arena of trie nodes; index 0 is the root (the entry-point frame). + nodes: Vec, + /// Index into `nodes` of the current call-stack leaf. + current: u32, + /// Sum of `count` across all nodes, tracked incrementally. + total_counted: u64, } impl FlamegraphGenerator { @@ -35,23 +67,28 @@ impl FlamegraphGenerator { pub fn new(symbols: SymbolTable, entry_point: u64) -> Self { Self { symbols, - call_stack: vec![entry_point], // Start with entry point on stack - stack_counts: HashMap::new(), + nodes: vec![TrieNode { + parent: ROOT, + addr: entry_point, + count: 0, + children: HashMap::new(), + }], + current: ROOT, + total_counted: 0, } } - /// Process a batch of execution logs, updating call stack and instruction counts. + /// Process a batch of execution logs, updating the call stack and + /// instruction counts. pub fn process_logs( &mut self, logs: &[Log], instructions: &InstructionCache, ) -> Result<(), FlamegraphError> { for log in logs { - // Count this instruction under the current stack - let stack_key = self.format_stack(); - *self.stack_counts.entry(stack_key).or_insert(0) += 1; + self.nodes[self.current as usize].count += 1; + self.total_counted += 1; - // Update call stack based on instruction type let instruction = instructions .get(log.current_pc) .copied() @@ -61,19 +98,6 @@ impl FlamegraphGenerator { Ok(()) } - /// Format the current call stack as a semicolon-separated string. - fn format_stack(&self) -> String { - if self.call_stack.is_empty() { - return "".to_string(); - } - - self.call_stack - .iter() - .map(|&addr| self.resolve_address(addr)) - .collect::>() - .join(";") - } - /// Resolve an address to a function name, or hex address if unknown. fn resolve_address(&self, address: u64) -> String { self.symbols @@ -82,73 +106,266 @@ impl FlamegraphGenerator { .unwrap_or_else(|| format!("0x{:x}", address)) } + /// Descend to (or create) the child of the current node keyed by `addr`. + fn push(&mut self, addr: u64) { + let current = self.current as usize; + if let Some(&child) = self.nodes[current].children.get(&addr) { + self.current = child; + return; + } + let new_idx = self.nodes.len() as u32; + self.nodes.push(TrieNode { + parent: self.current, + addr, + count: 0, + children: HashMap::new(), + }); + self.nodes[current].children.insert(addr, new_idx); + self.current = new_idx; + } + + /// Move to the parent node. Refuses to pop past the root. + fn pop(&mut self) { + if self.current != ROOT { + self.current = self.nodes[self.current as usize].parent; + } + } + /// Update the call stack based on the instruction type. fn update_stack(&mut self, log: &Log, instruction: Instruction) { match instruction { // Function CALL: JAL with dst=ra (register 1) // Saves return address to ra and jumps to offset - Instruction::JumpAndLink { dst: 1, .. } => { - self.call_stack.push(log.next_pc); - } + Instruction::JumpAndLink { dst: 1, .. } => self.push(log.next_pc), // Function CALL: JALR with dst=ra (register 1) // Indirect call through register - Instruction::JumpAndLinkRegister { dst: 1, .. } => { - self.call_stack.push(log.next_pc); - } + Instruction::JumpAndLinkRegister { dst: 1, .. } => self.push(log.next_pc), // Function RETURN: JALR with base=ra (register 1), dst=zero (register 0) // This is the standard "ret" instruction (jalr x0, ra, 0) - // Only pop if we have more than the root frame to prevent stack underflow Instruction::JumpAndLinkRegister { base, dst, .. } if base == 1 && dst == 0 => { - if self.call_stack.len() > 1 { - self.call_stack.pop(); - } + self.pop(); } - // Tail call: JAL/JALR with dst=zero (doesn't save return address) - // Pop current function and push the new one - // Only pop if we have more than the root frame to prevent stack underflow - Instruction::JumpAndLink { dst: 0, .. } => { - if self.call_stack.len() > 1 { - self.call_stack.pop(); - } - self.call_stack.push(log.next_pc); - } + // JAL/JALR with dst=zero doesn't save a return address. This + // covers both true tail calls AND ordinary intra-function jumps + // (loop back-edges, if/else, jump tables, self-tail-recursion) — + // only a jump that actually crosses a function boundary is a + // tail call; same-function jumps must not mutate the stack. + Instruction::JumpAndLink { dst: 0, .. } => self.maybe_tail_call(log), Instruction::JumpAndLinkRegister { dst: 0, base, .. } if base != 1 => { - // Tail call through register (not a return) - if self.call_stack.len() > 1 { - self.call_stack.pop(); - } - self.call_stack.push(log.next_pc); + self.maybe_tail_call(log) } _ => {} } } + /// A `dst=0` jump: pop+push only if `next_pc` lands in a different + /// function than `current_pc` (a true tail call). Same function (or + /// either address unresolved) is treated as an ordinary jump — no stack + /// mutation. Symbols with `size == 0` (stripped/ASM) accept any address + /// at or past their start, so a `dst=0` jump landing exactly on such a + /// boundary can misattribute — not fixed here, see flamegraph_plan.md + /// bug #1. + fn maybe_tail_call(&mut self, log: &Log) { + let same_function = match ( + self.symbols.lookup(log.current_pc), + self.symbols.lookup(log.next_pc), + ) { + (Some(a), Some(b)) => a.address == b.address, + // Either address unresolved: treat as an ordinary jump (no stack + // mutation) rather than a tail call, matching the doc comment and + // this PR's stance against spurious pop+push in unsymbolized code. + _ => true, + }; + if same_function { + return; + } + self.pop(); + self.push(log.next_pc); + } + /// Write the folded stack output to a writer. /// /// Output format: `stack;frame;names count` /// Example: `main;quicksort;partition 12345` + /// + /// Symbol resolution/demangling happens here, once per unique address + /// (memoized), rather than per instruction. pub fn write_folded(&self, writer: &mut W) -> io::Result<()> { - // Sort by stack path for deterministic output - let mut stacks: Vec<_> = self.stack_counts.iter().collect(); - stacks.sort_by_key(|(k, _)| k.as_str()); + let mut name_cache: HashMap = HashMap::new(); + let mut path = Vec::new(); + // Keyed by the *resolved* stack string, not by trie node: distinct + // nodes (e.g. two different call-site addresses inside the same + // function) can resolve to the same name path and must be summed + // into one line, matching the pre-trie String-keyed behavior. + let mut counts: HashMap = HashMap::new(); + self.collect(ROOT, &mut path, &mut name_cache, &mut counts); - for (stack, count) in stacks { - if !stack.is_empty() { - writeln!(writer, "{} {}", stack, count)?; - } + // Sort by stack path for deterministic output. + let mut entries: Vec<_> = counts.into_iter().collect(); + entries.sort_by(|(a, _), (b, _)| a.cmp(b)); + + for (stack, count) in entries { + writeln!(writer, "{} {}", stack, count)?; } Ok(()) } - /// Get the total number of instructions processed. + fn collect( + &self, + node_idx: u32, + path: &mut Vec, + name_cache: &mut HashMap, + counts: &mut HashMap, + ) { + let node = &self.nodes[node_idx as usize]; + path.push(node.addr); + + if node.count > 0 { + let stack = path + .iter() + .map(|addr| { + name_cache + .entry(*addr) + .or_insert_with(|| self.resolve_address(*addr)) + .clone() + }) + .collect::>() + .join(";"); + *counts.entry(stack).or_insert(0) += node.count; + } + + for &child in node.children.values() { + self.collect(child, path, name_cache, counts); + } + + path.pop(); + } + + /// Get the total number of instructions counted so far. pub fn total_instructions(&self) -> u64 { - self.stack_counts.values().sum() + self.total_counted + } + + /// Raw (unresolved) call-stack address paths and their counts — one + /// entry per counted trie node, root-to-leaf. + pub fn raw_stacks(&self) -> Vec<(Vec, u64)> { + let mut path = Vec::new(); + let mut out = Vec::new(); + self.collect_raw(ROOT, &mut path, &mut out); + out + } + + /// Write folded stack output keyed by raw hex addresses instead of + /// resolved names (pairs with scripts/enrich_flamegraph.py). + pub fn write_folded_raw(&self, writer: &mut W) -> io::Result<()> { + let mut entries: Vec<(String, u64)> = self + .raw_stacks() + .into_iter() + .map(|(addrs, count)| { + let stack = addrs + .iter() + .map(|addr| format!("0x{addr:x}")) + .collect::>() + .join(";"); + (stack, count) + }) + .collect(); + entries.sort_by(|(a, _), (b, _)| a.cmp(b)); + + for (stack, count) in entries { + writeln!(writer, "{stack} {count}")?; + } + Ok(()) + } + + fn collect_raw(&self, node_idx: u32, path: &mut Vec, out: &mut Vec<(Vec, u64)>) { + let node = &self.nodes[node_idx as usize]; + path.push(node.addr); + + if node.count > 0 { + out.push((path.clone(), node.count)); + } + for &child in node.children.values() { + self.collect_raw(child, path, out); + } + + path.pop(); + } +} + +/// Drive `executor` to completion (or until `cycle_budget` is hit), feeding +/// every log to `generator` and calling `on_chunk(total_cycles_so_far, +/// generator)` after each processed chunk so callers can implement periodic +/// partial persistence (e.g. checkpoint `write_folded` to disk every N +/// cycles) without reimplementing the drive loop. Returns the total number +/// of cycles processed. +/// +/// `cycle_budget` of `None` runs to completion; `Some(n)` stops once at +/// least `n` cycles have been processed (the last chunk may overshoot +/// slightly, since chunks aren't split mid-way). +pub fn drive_with_flamegraph( + executor: &mut Executor, + generator: &mut FlamegraphGenerator, + cycle_budget: Option, + mut on_chunk: impl FnMut(u64, &FlamegraphGenerator), +) -> Result { + // The program's code never changes during execution, so cloning this + // once up front (not per chunk) means `process_logs` never needs to + // borrow `executor` again inside the loop — avoiding a conflict with the + // `&mut self` borrow `resume()`'s returned slice is tied to, without + // paying to copy every log chunk just to end that borrow early. + let instructions = executor.instructions.clone(); + + let mut total_cycles: u64 = 0; + while let Some(logs) = executor.resume()? { + total_cycles += logs.len() as u64; + generator.process_logs(logs, &instructions)?; + on_chunk(total_cycles, generator); + + if cycle_budget.is_some_and(|budget| total_cycles >= budget) { + break; + } } + Ok(total_cycles) +} + +/// Options for [`run_with_flamegraph`]. +#[derive(Debug, Clone, Copy, Default)] +pub struct FlamegraphRunOptions { + /// Stop once at least this many cycles have been processed. `None` runs + /// to completion. + pub cycle_budget: Option, +} + +/// Reusable execute+flamegraph path: build the `SymbolTable`, construct the +/// `Executor`, and drive it via [`drive_with_flamegraph`]. This is what the +/// CLI's `execute --flamegraph` path and any test/caller should use instead +/// of hand-rolling the same `SymbolTable`/`Executor`/drive-loop wiring. +/// +/// `on_chunk` is forwarded to `drive_with_flamegraph` for periodic partial +/// persistence; pass `|_, _| {}` if not needed. +pub fn run_with_flamegraph( + elf_bytes: &[u8], + program: &Elf, + private_inputs: Vec, + options: FlamegraphRunOptions, + on_chunk: impl FnMut(u64, &FlamegraphGenerator), +) -> Result<(FlamegraphGenerator, u64), FlamegraphDriveError> { + let symbols = SymbolTable::parse(elf_bytes); + let mut generator = FlamegraphGenerator::new(symbols, program.entry_point); + let mut executor = Executor::new(program, private_inputs)?; + let total_cycles = drive_with_flamegraph( + &mut executor, + &mut generator, + options.cycle_budget, + on_chunk, + )?; + Ok((generator, total_cycles)) } /// Demangle a Rust symbol name using the official rustc-demangle crate. diff --git a/executor/src/vm/execution.rs b/executor/src/vm/execution.rs index 99eb0a00f..23ca50993 100644 --- a/executor/src/vm/execution.rs +++ b/executor/src/vm/execution.rs @@ -184,6 +184,7 @@ fn load_program(segments: &[crate::elf::Segment], memory: &mut Memory) -> Result Ok(()) } +#[derive(Clone)] pub struct InstructionSegment { base_addr: u64, instructions: Vec, @@ -195,6 +196,7 @@ impl InstructionSegment { } } +#[derive(Clone)] pub struct InstructionCache { segments: Vec, } diff --git a/executor/tests/flamegraph.rs b/executor/tests/flamegraph.rs index d064bdb7d..0d3bf4fc4 100644 --- a/executor/tests/flamegraph.rs +++ b/executor/tests/flamegraph.rs @@ -32,6 +32,18 @@ fn nop_instruction() -> Instruction { Instruction::LoadUpperImm { dst: 0, imm: 0 } } +/// Helper to build a `Log` for a plain PC transition (no register values needed +/// by any flamegraph test). +fn mk_log(current_pc: u64, next_pc: u64) -> Log { + Log { + current_pc, + next_pc, + src1_val: 0, + src2_val: 0, + dst_val: 0, + } +} + // ============================================================================ // SymbolTable::lookup tests // ============================================================================ @@ -497,3 +509,333 @@ fn test_flamegraph_instruction_not_found_error() { let result = generator.process_logs(&logs, &instructions); assert!(result.is_err()); } + +// ============================================================================ +// Tail-call misdetection regression tests (flamegraph_plan.md bug #1) +// ============================================================================ + +#[test] +fn test_flamegraph_intra_function_jal_x0_does_not_alter_stack() { + // `jal x0,