From a6d15bb2ed210726dcee9f2413e929097287b093 Mon Sep 17 00:00:00 2001
From: Requiem
Date: Tue, 3 Feb 2026 18:30:32 +0100
Subject: [PATCH 1/2] feat: improved cpuid trap check

---
 src/vmaware.hpp | 158 ++++++++++++++---------------------------------
 1 file changed, 44 insertions(+), 114 deletions(-)

diff --git a/src/vmaware.hpp b/src/vmaware.hpp
index 2793139..174b703 100644
--- a/src/vmaware.hpp
+++ b/src/vmaware.hpp
@@ -4943,7 +4943,7 @@ struct VM {
 
         // we used a rng before running the traditional rdtsc-cpuid-rdtsc trick,
         // which is sometimes not intercepted by some hypervisors (like VirtualBox) under compat mode
-        auto cpuid_ex = [&](int leaf, int subleaf) noexcept -> u64 {
+        auto cpuid = [&](unsigned int leaf) noexcept -> u64 {
 #if (MSVC)
             // make regs volatile so writes cannot be optimized out; without this, release builds measure around 40 cycles even under Hyper-V
             volatile int regs[4]{};
@@ -4956,7 +4956,7 @@ struct VM {
 
             // prevent the compiler from moving the __cpuid call before the t1 read
             COMPILER_BARRIER();
-            __cpuidex((int*)regs, leaf, subleaf);
+            __cpuid((int*)regs, static_cast<int>(leaf)); // not using cpu::cpuid to get a chance of inlining
             COMPILER_BARRIER();
 
@@ -4984,7 +4984,7 @@ struct VM {
             // because the compiler must honor the write to a volatile variable.
             asm volatile("cpuid"
                          : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
-                         : "a"(leaf), "c"(subleaf)
+                         : "a"(leaf)
                          : "memory");
 
             COMPILER_BARRIER();
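
Note: the non-MSVC branch of the lambda above boils down to a plain rdtsc / cpuid / rdtsc sequence fenced by compiler barriers. A minimal self-contained sketch of that primitive follows (GCC/Clang on x86-64; rdtsc_now, timed_cpuid, and the barrier macro body here are illustrative names for this note, not part of vmaware.hpp):

    #include <cstdint>

    #define COMPILER_BARRIER() asm volatile("" ::: "memory")

    static inline uint64_t rdtsc_now() {
        uint32_t lo, hi;
        asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); // TSC is returned in EDX:EAX
        return (static_cast<uint64_t>(hi) << 32) | lo;
    }

    // measures one cpuid on `leaf`, in TSC reference cycles
    static inline uint64_t timed_cpuid(unsigned int leaf) {
        unsigned int a, b, c, d;
        const uint64_t t1 = rdtsc_now();
        COMPILER_BARRIER();
        asm volatile("cpuid"
                     : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                     : "a"(leaf)
                     : "memory");
        COMPILER_BARRIER();
        const uint64_t t2 = rdtsc_now();
        (void)a; (void)b; (void)c; (void)d;
        return t2 - t1;
    }

Keeping a barrier on both sides of the cpuid matters: without them the compiler is free to hoist the second rdtsc above, or sink the first one below, the instruction being measured.
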
@@ -4999,8 +4999,6 @@ struct VM {
 #endif
         };
 
-        constexpr u16 iterations = 1000;
-
         auto calculate_latency = [&](const std::vector<u64>& samples_in) -> u64 {
             if (samples_in.empty()) return 0;
             const size_t N = samples_in.size();
@@ -5079,10 +5077,28 @@ struct VM {
             return result;
         };
 
+        // Intel leaves on AMD and vice versa will still work for this probe
+        constexpr unsigned int leaves[] = {
+            0xB,         // topology
+            0xD,         // xsave/xstate
+            0x4,         // deterministic cache params
+            0x1,         // basic features
+            0x7,         // extended features
+            0xA,         // architectural performance monitoring
+            0x12,        // SGX/enclave
+            0x5,         // MONITOR/MWAIT
+            0x40000000u, // hypervisor range start
+            0x80000008u, // extended address limits (amd/intel ext)
+            0x0          // fallback to leaf 0 occasionally
+        };
+        constexpr size_t n_leaves = sizeof(leaves) / sizeof(leaves[0]);
+
+        constexpr u16 iterations = 1000;
+
         // pre-allocate sample buffer and touch pages to avoid page faults by MMU during measurement
         std::vector<u64> samples;
-        samples.resize(iterations);
-        for (unsigned i = 0; i < iterations; ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
+        samples.resize(n_leaves * iterations);
+        for (size_t i = 0; i < samples.size(); ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
@@ -5097,126 +5113,40 @@ struct VM {
         /*
          * We want to move our thread from the Running state to the Waiting state
         ...
          * This gives us more time for sampling before we're rescheduled again
          */
-    #if (WINDOWS)
-        // voluntary context switch to get a fresh quantum
-        SleepEx(1, FALSE);
-    #else
-        // should work similarly in Unix-like operating systems
-        std::this_thread::sleep_for(std::chrono::milliseconds(1));
-    #endif
+        #if (WINDOWS)
+            // voluntary context switch to get a fresh quantum
+            SleepEx(1, FALSE);
+        #else
+            // should work similarly in Unix-like operating systems
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        #endif
+
+        // warm up by rotating through leaves to exercise different cpuid paths
         for (int w = 0; w < 128; ++w) {
-            volatile u64 tmp = cpuid_ex(0, 0);
+            volatile u64 tmp = cpuid(leaves[w % n_leaves]);
             VMAWARE_UNUSED(tmp);
         }
 
-        for (unsigned i = 0; i < iterations; ++i) {
-            samples[i] = cpuid_ex(0, 0); // leaf 0 just returns static data so it should be fast
-        }
-
-        const u64 cpuid_latency_leaf0 = calculate_latency(samples);
-
-        // Extended Topology requires the hypervisor to calculate dynamic x2APIC IDs
-        // we expect this to crash entire VMs if the kernel developer is not enough
-        for (unsigned i = 0; i < iterations; ++i) {
-            samples[i] = cpuid_ex(0xB, 0);
+        // 1000 iterations per leaf, stored contiguously per leaf
+        for (size_t li = 0; li < n_leaves; ++li) {
+            const unsigned int leaf = leaves[li];
+            for (unsigned i = 0; i < iterations; ++i) {
+                samples[li * iterations + i] = cpuid(leaf);
+            }
         }
-        const u64 cpuid_latency_leafB = calculate_latency(samples);
 
-        debug("TIMER: Leaf 0 latency -> ", cpuid_latency_leaf0);
-        debug("TIMER: Leaf 0xB latency -> ", cpuid_latency_leafB);
+        const u64 cpuid_latency = calculate_latency(samples);
 
-        // simple differential analysis
-        if (cpuid_latency_leaf0 > 0) {
-            if (cpuid_latency_leafB > (cpuid_latency_leaf0 * 1.6)) {
-                debug("TIMER: VMAware detected a CPUID patch");
-                return true;
-            }
-        }
+        debug("TIMER: VMEXIT latency -> ", cpuid_latency);
 
-        if (cpuid_latency_leaf0 >= cycle_threshold) {
-            return true;
-        }
-        if (cpuid_latency_leafB >= cycle_threshold) {
+        if (cpuid_latency >= cycle_threshold) {
             return true;
         }
-        else if (cpuid_latency_leaf0 <= 20) { // cpuid is fully serializing; not even old CPUs average this few cycles in real-world scenarios
+        else if (cpuid_latency <= 20) { // cpuid is fully serializing; not even old CPUs average this few cycles in real-world scenarios
             return true;
         }
 
-        // the core idea is to force the host scheduler's pending signal check (kvm_vcpu_check_block)
-        // We detect cpuid patches that just do fast vmexits by spawning a thread on the SAME core that spams the patched instruction
-        // If patched, the host core enters an uninterruptible loop, starving the timer interrupt needed for the sleep syscall
-    #if (WINDOWS)
-        {
-            using NtCreateThreadEx_t = NTSTATUS(__stdcall*)(PHANDLE, ACCESS_MASK, PVOID, HANDLE, PVOID, PVOID, ULONG, ULONG_PTR, ULONG_PTR, ULONG_PTR, PVOID);
-            using NtTerminateThread_t = NTSTATUS(__stdcall*)(HANDLE, NTSTATUS);
-            using NtWaitForSingleObject_t = NTSTATUS(__stdcall*)(HANDLE, BOOLEAN, PLARGE_INTEGER);
-
-            const HMODULE ntdll = util::get_ntdll();
-            if (ntdll) {
-                const char* names[] = { "NtCreateThreadEx", "NtTerminateThread", "NtWaitForSingleObject" };
-                void* funcs[3] = {};
-                util::get_function_address(ntdll, names, funcs, 3);
-
-                auto pNtCreateThreadEx = (NtCreateThreadEx_t)funcs[0];
-                auto pNtTerminateThread = (NtTerminateThread_t)funcs[1];
-                auto pNtWaitForSingleObject = (NtWaitForSingleObject_t)funcs[2];
-
-                if (pNtCreateThreadEx && pNtTerminateThread && pNtWaitForSingleObject) {
-
-                    // stateless lambda castable to thread routine
-                    auto spammer_routine = [](PVOID) -> DWORD {
-                        // This loop exploits the patch's lack of interrupt window checking
-                        while (true) {
-                            int regs[4];
-                            __cpuid(regs, 0);
-                        }
-                        return 0;
-                    };
-
-                    HANDLE hSpammer = nullptr;
-                    const NTSTATUS status = pNtCreateThreadEx(&hSpammer, MAXIMUM_ALLOWED, nullptr, GetCurrentProcess(),
-                        (PVOID)(uintptr_t(+spammer_routine)), nullptr, TRUE, 0, 0, 0, nullptr);
-
-                    if (status >= 0 && hSpammer) {
-                        // forcing contention
-                        THREAD_BASIC_INFORMATION tbi_local{};
-                        if (pNtQueryInformationThread(hCurrentThread, ThreadBasicInformation, &tbi_local, sizeof(tbi_local), nullptr) >= 0) {
-                            pNtSetInformationThread(hSpammer, ThreadAffinityMask, &tbi_local.AffinityMask, sizeof(ULONG_PTR));
-                        }
-
-                        ResumeThread(hSpammer);
-
-                        LARGE_INTEGER qpc_start, qpc_end, qpc_freq;
-                        QueryPerformanceFrequency(&qpc_freq);
-                        QueryPerformanceCounter(&qpc_start);
-
-                        // expecting gibberish cpuid patches to lock the interrupt timer
-                        // by the infinite fastpath loop on the physical core, causing a massive overshoot
-                        SleepEx(10, FALSE);
-
-                        QueryPerformanceCounter(&qpc_end);
-
-                        // Cleanup
-                        pNtTerminateThread(hSpammer, 0);
-                        pNtWaitForSingleObject(hSpammer, FALSE, nullptr);
-                        CloseHandle(hSpammer);
-
-                        double elapsed_ms = (double)(qpc_end.QuadPart - qpc_start.QuadPart) * 1000.0 / (double)qpc_freq.QuadPart;
-
-                        debug("TIMER: Timer interrupt starvation -> ", elapsed_ms, " ms");
-
-                        if (elapsed_ms > 40.0) {
-                            debug("TIMER: VMAware detected a CPUID patch");
-                            return true;
-                        }
-                    }
-                }
-            }
-        }
-    #endif
 
         // TLB flushes or side channel cache attacks are not even tried due to how unreliable they are against stealthy hypervisors
-#endif
+    #endif
 
         return false;
     }
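
Note: calculate_latency's body is mostly elided in the hunks above. As a mental model only, a robust aggregation over the pooled per-leaf samples could look like the following trimmed-mean sketch (an assumption for illustration, not VMAware's actual implementation):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    uint64_t trimmed_mean(std::vector<uint64_t> samples) {
        if (samples.empty()) return 0;
        std::sort(samples.begin(), samples.end());
        // discard the slowest/fastest 10% to suppress interrupt and cache noise
        const size_t cut = samples.size() / 10;
        uint64_t sum = 0;
        size_t n = 0;
        for (size_t i = cut; i < samples.size() - cut; ++i) {
            sum += samples[i];
            ++n;
        }
        return n ? sum / n : samples[samples.size() / 2];
    }

Trimming both tails keeps a stray interrupt or SMI from dragging the aggregate over, or under, the detection thresholds.
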
From 465ee4dcd0275be0e1d794ed0b76b75ed699150a Mon Sep 17 00:00:00 2001
From: Requiem
Date: Tue, 3 Feb 2026 18:35:34 +0100
Subject: [PATCH 2/2] chore: small threshold tweak

---
 src/vmaware.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/vmaware.hpp b/src/vmaware.hpp
index 8000b93..509a2b1 100644
--- a/src/vmaware.hpp
+++ b/src/vmaware.hpp
@@ -4583,7 +4583,7 @@ struct VM {
             return false;
         }
 
         // will be used in cpuid measurements later
-        u16 cycle_threshold = 1000;
+        u16 cycle_threshold = 800;
 
         if (util::hyper_x() == HYPERV_ARTIFACT_VM) {
            cycle_threshold = 3500; // if we're running under Hyper-V, make VMAware detect nested virtualization
        }
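
Note: one way to sanity-check values like the 800 and 3500 above is a small standalone harness that prints the median measured cpuid latency on the current machine (illustrative sketch; timed_cpuid is the hypothetical helper from the first note, not a vmaware.hpp symbol):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    uint64_t timed_cpuid(unsigned int leaf); // from the sketch in the first note

    int main() {
        std::vector<uint64_t> v(1000);
        for (auto& s : v) s = timed_cpuid(0x1); // basic-features leaf
        std::sort(v.begin(), v.end());
        // bare metal usually sits well below 800 cycles at the median, while a
        // trapping hypervisor pays a vmexit round-trip on every iteration
        std::printf("median cpuid latency: %llu cycles\n",
                    static_cast<unsigned long long>(v[v.size() / 2]));
        return 0;
    }

On bare metal the median typically lands in the low hundreds of reference cycles; a hypervisor that traps cpuid adds a vmexit round-trip on every call, which is what pushes the measurement toward and past cycle_threshold.
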