diff --git a/src/vmaware.hpp b/src/vmaware.hpp
index 1608779b..d02b9fd5 100644
--- a/src/vmaware.hpp
+++ b/src/vmaware.hpp
@@ -5575,7 +5575,7 @@ struct VM {
         // the robust center: median M and MAD -> approximate sigma
         const u64 M = median_of_sorted(s, 0, s.size());
 
-        // Faster MAD: select the median deviation in linear time instead of sorting all deviations.
+        // select the median deviation in linear time instead of sorting all deviations
         std::vector<u64> absdev;
         absdev.resize(N);
         for (size_t i = 0; i < N; ++i) {
@@ -5741,7 +5741,9 @@ struct VM {
         std::uniform_int_distribution<size_t> batch_dist(30000, 70000);
         const size_t BATCH_SIZE = batch_dist(gen);
         i32 dummy_res[4]{};
-        size_t valid = 0; // end of setup phase
+        size_t valid = 0;
+        i16 invalid = 0;
+        bool apply_multiplier = false; // end of setup phase
 
         SleepEx(0, FALSE); // try to get fresh quantum before starting warm-up phase, give time to kernel to set up priorities
 
@@ -5765,10 +5767,13 @@ struct VM {
             v_pre = state.counter;
             std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences
 
-            // force cpuid here so that the hypervisor is either forced to keep interception and try to bypass latency, or disable interception and try to bypass XSAVE states
-            trigger_vmexit(dummy_res, 0x0, 0); // fastest cpuid path, on purpose for stability in the measurement
-            // scaled by 2x to negate cache invalidation ping-ponging across distant NUMA nodes, as our core randomizer will pin the threads on distant cores
-            for (int i = 0; i < 2; ++i) {
+            // vmexit here so that the hypervisor is either forced to keep interception and try to bypass latency, or disable interception and try to bypass XSAVE states
+            if (!apply_multiplier) {
+                trigger_vmexit(dummy_res, 0x0, 0);
+            }
+            else {
+                // scaled by 2x if we dynamically detect cache invalidation ping-ponging across distant NUMA nodes, as our core randomizer pins our threads on different CPUs
+                trigger_vmexit(dummy_res, 0x0, 0);
                 trigger_vmexit(dummy_res, 0x0, 0);
             }
             std::atomic_signal_fence(std::memory_order_seq_cst);
@@ -5779,7 +5784,13 @@ struct VM {
             r_pre = state.counter;
             std::atomic_signal_fence(std::memory_order_seq_cst); // ensure compiler-level ordering
 
-            for (int i = 0; i < 16; ++i) _mm_lfence(); // 16 LFENCES is enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race (so that the counter thread sees an increment)
+            if (!apply_multiplier) {
+                for (int i = 0; i < 8; ++i) _mm_lfence(); // 8 LFENCEs are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race (so that the counter thread sees an increment)
+            }
+            else {
+                // scaled if the counter thread is not able to increment in time due to CPUID being too fast
+                for (int i = 0; i < 16; ++i) _mm_lfence();
+            }
 
             std::atomic_signal_fence(std::memory_order_seq_cst);
             r_post = state.counter;
@@ -5789,6 +5800,10 @@ struct VM {
                 ref_samples[valid] = r_post - r_pre;
                 valid++;
             }
+            else if (v_post <= v_pre && !apply_multiplier) {
+                invalid++;
+                if (invalid >= 1000) apply_multiplier = true;
+            }
         }
 
         state.test_done.store(true, std::memory_order_release);
@@ -12213,6 +12228,7 @@ struct VM {
 
     /**
      * @brief Check whether a hypervisor uses EPT/NPT hooking to intercept hardware breakpoints
+     * @note This hypervisor detection also affects debuggers
      * @category Windows
      * @implements VM::HYPERVISOR_HOOK
      */
@@ -12523,6 +12539,7 @@ struct VM {
         // bit 0 = 1
         // bits 17:16 = 11b
         // bits 19:18 = 00b
+        ctx.Dr7 = 0x30001;
         status = nt_set_context_thread(current_thread, &ctx);
         if (status < 0) {
             cleanup_pages();
@@ -12666,6 +12683,7 @@ struct VM {
         // bit 0 = 1
         // bits 17:16 = 11b
         // bits 19:18 = 00b
+        ctx.Dr7 = 0x30001;
         status = nt_set_context_thread(current_thread, &ctx);
         if (status < 0) {
             rtl_remove_vectored_exception_handler(veh_handle);
@@ -12702,7 +12720,6 @@ struct VM {
 #endif
     }
 
-
    /**
     * @brief Check whether a hypervisor delays trap flags over exiting instructions
     * @category Windows
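
The escalation logic added in the @@ -5741, @@ -5765, and @@ -5789 hunks is easier to follow outside the harness. Below is a minimal, self-contained sketch of the idea, not VMAware's actual probe: a sibling thread spins on a shared counter, the measuring loop samples it around a forced CPUID, and after 1000 windows in which the counter failed to advance the probe doubles its CPUID work rather than discarding samples indefinitely. The helper names (cpuid_leaf0, ticker), the loop bounds, and the omitted LFENCE batching, core pinning, and timing statistics are our assumptions for illustration only.

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <thread>
    #if defined(_MSC_VER)
    #include <intrin.h>
    #else
    #include <cpuid.h>
    #endif

    // cheapest CPUID path (leaf 0); a hypervisor must still intercept it
    static void cpuid_leaf0() {
    #if defined(_MSC_VER)
        int regs[4]{};
        __cpuid(regs, 0);
    #else
        unsigned int a = 0, b = 0, c = 0, d = 0;
        __get_cpuid(0, &a, &b, &c, &d);
    #endif
    }

    int main() {
        std::atomic<std::uint64_t> counter{0};
        std::atomic<bool> done{false};

        // cross-core "clock": increments as fast as it can (the real harness
        // pins this to a distant core; here we rely on the scheduler)
        std::thread ticker([&] {
            while (!done.load(std::memory_order_relaxed))
                counter.fetch_add(1, std::memory_order_relaxed);
        });

        std::size_t valid = 0;
        int invalid = 0;
        bool apply_multiplier = false; // escalate to 2x CPUIDs after too many dead windows

        for (int iter = 0; iter < 50000 && valid < 4096; ++iter) {
            const std::uint64_t v_pre = counter.load(std::memory_order_relaxed);
            std::atomic_signal_fence(std::memory_order_seq_cst); // compiler barrier only, as in the patch

            cpuid_leaf0();                 // the window the ticker should straddle
            if (apply_multiplier)
                cpuid_leaf0();             // widen the window when one CPUID is too fast

            std::atomic_signal_fence(std::memory_order_seq_cst);
            const std::uint64_t v_post = counter.load(std::memory_order_relaxed);

            if (v_post > v_pre) {
                ++valid;                   // ticker advanced: the sample is usable
            } else if (!apply_multiplier && ++invalid >= 1000) {
                apply_multiplier = true;   // same 1000-failure threshold as the patch
            }
        }

        done.store(true, std::memory_order_release);
        ticker.join();
        std::printf("valid=%zu escalated=%d\n", valid, apply_multiplier ? 1 : 0);
        return 0;
    }

Starting at the cheaper single-CPUID window and only escalating after sustained failure keeps the common case fast while still recovering on topologies where the cross-core cache bounce is too slow for one CPUID to catch.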
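Likewise, the ctx.Dr7 = 0x30001; lines added at @@ -12523 and @@ -12666 are the literal encoding of the comment block directly above them. A small sketch of that encoding, assuming the standard DR7 layout from the Intel SDM; the constant names are ours, not VMAware's:

    #include <cstdint>
    #include <cstdio>

    int main() {
        constexpr std::uint64_t L0       = 1ull << 0;  // bit 0      = 1   -> local enable for breakpoint slot 0
        constexpr std::uint64_t RW0_RDWR = 3ull << 16; // bits 17:16 = 11b -> break on data read or write
        constexpr std::uint64_t LEN0_1B  = 0ull << 18; // bits 19:18 = 00b -> 1-byte breakpoint length
        constexpr std::uint64_t dr7 = L0 | RW0_RDWR | LEN0_1B;
        static_assert(dr7 == 0x30001, "matches the literal written in the patch");
        std::printf("Dr7 = 0x%llx\n", (unsigned long long)dr7);
        return 0;
    }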