diff --git a/src/vmaware.hpp b/src/vmaware.hpp
index 58d682c..2793139 100644
--- a/src/vmaware.hpp
+++ b/src/vmaware.hpp
@@ -4943,7 +4943,7 @@ struct VM {
 
         // we used a rng before running the traditional rdtsc-cpuid-rdtsc trick
         // sometimes not intercepted in some hvs (like VirtualBox) under compat mode
-        auto cpuid = [&]() noexcept -> u64 {
+        auto cpuid_ex = [&](int leaf, int subleaf) noexcept -> u64 {
 #if (MSVC)
             // make regs volatile so writes cannot be optimized out, if this isn't added and the code is compiled in release mode, cycles would be around 40 even under Hyper-V
             volatile int regs[4]{};
@@ -4956,7 +4956,7 @@ struct VM {
             // prevent the compiler from moving the __cpuid call before the t1 read
             COMPILER_BARRIER();
 
-            __cpuid((int*)regs, 0); // not using cpu::cpuid to get a chance of inlining
+            __cpuidex((int*)regs, leaf, subleaf); // not using cpu::cpuid to get a chance of inlining
 
             COMPILER_BARRIER();
 
@@ -4984,7 +4984,7 @@ struct VM {
             // because the compiler must honor the write to a volatile variable.
             asm volatile("cpuid"
                 : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
-                : "a"(0)
+                : "a"(leaf), "c"(subleaf)
                 : "memory");
 
             COMPILER_BARRIER();
@@ -5001,40 +5001,6 @@ struct VM {
 
         constexpr u16 iterations = 1000;
 
-        // pre-allocate sample buffer and touch pages to avoid page faults by MMU during measurement
-        std::vector<u64> samples;
-        samples.resize(iterations);
-        for (unsigned i = 0; i < iterations; ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
-
-        /*
-         * We want to move our thread from the Running state to the Waiting state
-         * When the sleep expires (at the next timer tick), the OS moves VMAware's thread to the Ready state
-         * When it picks us up again, it grants VMAware a fresh quantum, typically varying between 2 ticks (30ms) and 6 ticks (90ms) on Windows Client editions
-         * The default resolution of the Windows clock we're using is 64Hz
-         * Because we're calling NtDelayExecution with only 1ms, the kernel interprets this as "Sleep for at least 1ms"
-         * Since the hardware interrupt (tick) only fires every 15.6ms and we're not using timeBeginPeriod, the kernel cannot wake us after exactly 1ms
-         * So instead, it does what we want and wakes us up at the very next timer interrupt
-         * That's the reason why it's only 1ms and we're not using CreateWaitableTimerEx / SetWaitableTimerEx
-         * Sleep(0) would return instantly in some circumstances
-         * This gives us more time for sampling before we're rescheduled again
-         */
-
-#if (WINDOWS)
-        // voluntary context switch to get a fresh quantum
-        SleepEx(1, FALSE);
-#else
-        // should work similarly in Unix-like operating systems
-        std::this_thread::sleep_for(std::chrono::milliseconds(1));
-#endif
-        for (int w = 0; w < 128; ++w) {
-            volatile u64 tmp = cpuid();
-            VMAWARE_UNUSED(tmp);
-        }
-
-        for (unsigned i = 0; i < iterations; ++i) {
-            samples[i] = cpuid();
-        }
-
         auto calculate_latency = [&](const std::vector<u64>& samples_in) -> u64 {
             if (samples_in.empty()) return 0;
             const size_t N = samples_in.size();
@@ -5113,18 +5079,144 @@ struct VM {
             return result;
         };
 
-        u64 cpuid_latency = calculate_latency(samples);
+        // pre-allocate sample buffer and touch pages to avoid page faults by MMU during measurement
+        std::vector<u64> samples;
+        samples.resize(iterations);
+        for (unsigned i = 0; i < iterations; ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
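+
+        /*
+         * Worked numbers for the sleep strategy described in the comment below
+         * (illustrative, assuming the default 64Hz clock and no timeBeginPeriod in effect):
+         *   tick period    = 1s / 64        = 15.625ms
+         *   2-tick quantum = 2 * 15.625ms   = 31.25ms  (~30ms)
+         *   6-tick quantum = 6 * 15.625ms   = 93.75ms  (~90ms)
+         * so a 1ms NtDelayExecution actually sleeps until the next timer interrupt fires,
+         * i.e. anywhere from ~0ms up to one full tick (~15.6ms)
+         */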
+
+        /*
+         * We want to move our thread from the Running state to the Waiting state
+         * When the sleep expires (at the next timer tick), the OS moves VMAware's thread to the Ready state
+         * When it picks us up again, it grants VMAware a fresh quantum, typically varying between 2 ticks (30ms) and 6 ticks (90ms) on Windows Client editions
+         * The default resolution of the Windows clock we're using is 64Hz
+         * Because we're calling NtDelayExecution with only 1ms, the kernel interprets this as "Sleep for at least 1ms"
+         * Since the hardware interrupt (tick) only fires every 15.6ms and we're not using timeBeginPeriod, the kernel cannot wake us after exactly 1ms
+         * So instead, it does what we want and wakes us up at the very next timer interrupt
+         * That's the reason why it's only 1ms and we're not using CreateWaitableTimerEx / SetWaitableTimerEx
+         * Sleep(0) would return instantly in some circumstances
+         * This gives us more time for sampling before we're rescheduled again
+         */
+
+#if (WINDOWS)
+        // voluntary context switch to get a fresh quantum
+        SleepEx(1, FALSE);
+#else
+        // should work similarly in Unix-like operating systems
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+#endif
+        for (int w = 0; w < 128; ++w) {
+            volatile u64 tmp = cpuid_ex(0, 0); // warm up caches and branch predictors before measuring
+            VMAWARE_UNUSED(tmp);
+        }
+
+        for (unsigned i = 0; i < iterations; ++i) {
+            samples[i] = cpuid_ex(0, 0); // leaf 0 just returns static data so it should be fast
+        }
 
-        debug("TIMER: VMEXIT latency -> ", cpuid_latency);
+        const u64 cpuid_latency_leaf0 = calculate_latency(samples);
 
-        if (cpuid_latency >= cycle_threshold) {
+        // Extended Topology requires the hypervisor to calculate dynamic x2APIC IDs
+        // we expect this to crash entire VMs if the kernel developer's patch is not careful enough
+        for (unsigned i = 0; i < iterations; ++i) {
+            samples[i] = cpuid_ex(0xB, 0);
+        }
+        const u64 cpuid_latency_leafB = calculate_latency(samples);
+
+        debug("TIMER: Leaf 0 latency -> ", cpuid_latency_leaf0);
+        debug("TIMER: Leaf 0xB latency -> ", cpuid_latency_leafB);
+
+        // simple differential analysis
+        if (cpuid_latency_leaf0 > 0) {
+            if (cpuid_latency_leafB > (cpuid_latency_leaf0 * 1.6)) {
+                debug("TIMER: VMAware detected a CPUID patch");
+                return true;
+            }
+        }
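+
+        /*
+         * Worked example for the 1.6x heuristic above (numbers are illustrative only):
+         * on bare metal both leaves are answered by the CPU itself, so the ratio stays
+         * close to 1.0, while a patch that fastpaths leaf 0 but still computes x2APIC
+         * topology for leaf 0xB might yield leaf0 = 150 and leafB = 1200 cycles:
+         *   1200 > 150 * 1.6 = 240  ->  flagged as a CPUID patch
+         */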
+
+        if (cpuid_latency_leaf0 >= cycle_threshold) {
             return true;
         }
-        else if (cpuid_latency <= 20) { // cpuid is fully serializing, not even old CPUs have this low average cycles in real-world scenarios
+        if (cpuid_latency_leafB >= cycle_threshold) {
             return true;
         }
-        // TLB flushes or side channel cache attacks are not even tried due to how unreliable they are against stealthy hypervisors
+        else if (cpuid_latency_leaf0 <= 20) { // cpuid is fully serializing, not even old CPUs reach such a low cycle average in real-world scenarios
+            return true;
+        }
+
+        // The core idea is to force the host scheduler's pending signal check (kvm_vcpu_check_block)
+        // We detect cpuid patches that just do fast vmexits by spawning a thread on the SAME core that spams the patched instruction
+        // If patched, the host core enters an uninterruptible loop, starving the timer interrupt needed for the sleep syscall
+#if (WINDOWS)
+        {
+            using NtCreateThreadEx_t = NTSTATUS(__stdcall*)(PHANDLE, ACCESS_MASK, PVOID, HANDLE, PVOID, PVOID, ULONG, ULONG_PTR, ULONG_PTR, ULONG_PTR, PVOID);
+            using NtTerminateThread_t = NTSTATUS(__stdcall*)(HANDLE, NTSTATUS);
+            using NtWaitForSingleObject_t = NTSTATUS(__stdcall*)(HANDLE, BOOLEAN, PLARGE_INTEGER);
+            // info class passed as ULONG so we don't depend on the THREADINFOCLASS enum definition
+            using NtQueryInformationThread_t = NTSTATUS(__stdcall*)(HANDLE, ULONG, PVOID, ULONG, PULONG);
+            using NtSetInformationThread_t = NTSTATUS(__stdcall*)(HANDLE, ULONG, PVOID, ULONG);
+
+            const HMODULE ntdll = util::get_ntdll();
+            if (ntdll) {
+                const char* names[] = { "NtCreateThreadEx", "NtTerminateThread", "NtWaitForSingleObject", "NtQueryInformationThread", "NtSetInformationThread" };
+                void* funcs[5] = {};
+                util::get_function_address(ntdll, names, funcs, 5);
+
+                auto pNtCreateThreadEx = (NtCreateThreadEx_t)funcs[0];
+                auto pNtTerminateThread = (NtTerminateThread_t)funcs[1];
+                auto pNtWaitForSingleObject = (NtWaitForSingleObject_t)funcs[2];
+                auto pNtQueryInformationThread = (NtQueryInformationThread_t)funcs[3];
+                auto pNtSetInformationThread = (NtSetInformationThread_t)funcs[4];
+
+                if (pNtCreateThreadEx && pNtTerminateThread && pNtWaitForSingleObject && pNtQueryInformationThread && pNtSetInformationThread) {
+
+                    // stateless lambda castable to a thread routine
+                    auto spammer_routine = [](PVOID) -> DWORD {
+                        // this loop exploits the patch's lack of interrupt window checking
+                        while (true) {
+                            int regs[4];
+                            __cpuid(regs, 0);
+                        }
+                        return 0;
+                    };
+
+                    HANDLE hSpammer = nullptr;
+                    // unary + converts the captureless lambda to a plain function pointer; TRUE = create suspended
+                    const NTSTATUS status = pNtCreateThreadEx(&hSpammer, MAXIMUM_ALLOWED, nullptr, GetCurrentProcess(),
+                        (PVOID)(uintptr_t)(+spammer_routine), nullptr, TRUE, 0, 0, 0, nullptr);
+
+                    if (status >= 0 && hSpammer) {
+                        // force contention: pin the spammer to the same affinity mask as the current thread
+                        THREAD_BASIC_INFORMATION tbi_local{};
+                        if (pNtQueryInformationThread(GetCurrentThread(), ThreadBasicInformation, &tbi_local, sizeof(tbi_local), nullptr) >= 0) {
+                            pNtSetInformationThread(hSpammer, ThreadAffinityMask, &tbi_local.AffinityMask, sizeof(ULONG_PTR));
+                        }
+
+                        ResumeThread(hSpammer);
+
+                        LARGE_INTEGER qpc_start, qpc_end, qpc_freq;
+                        QueryPerformanceFrequency(&qpc_freq);
+                        QueryPerformanceCounter(&qpc_start);
+
+                        // expecting naive cpuid patches to lock out the timer interrupt
+                        // via the infinite fastpath loop on the physical core, causing a massive overshoot
+                        SleepEx(10, FALSE);
+
+                        QueryPerformanceCounter(&qpc_end);
+
+                        // cleanup
+                        pNtTerminateThread(hSpammer, 0);
+                        pNtWaitForSingleObject(hSpammer, FALSE, nullptr);
+                        CloseHandle(hSpammer);
+
+                        const double elapsed_ms = (double)(qpc_end.QuadPart - qpc_start.QuadPart) * 1000.0 / (double)qpc_freq.QuadPart;
+
+                        debug("TIMER: Timer interrupt starvation -> ", elapsed_ms, " ms");
+
+                        if (elapsed_ms > 40.0) {
+                            debug("TIMER: VMAware detected a CPUID patch");
+                            return true;
+                        }
+                    }
+                }
+            }
+        }
 #endif
+        // TLB flushes or side channel cache attacks are not even tried due to how unreliable they are against stealthy hypervisors
+#endif
 
         return false;
     }
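
For reference, the timer-starvation probe added in the last hunk boils down to the standalone sketch below. It is a minimal illustration only, using plain Win32 calls (CreateThread / SetThreadAffinityMask) instead of the dynamically resolved ntdll functions in the patch; pinning both threads to CPU 0 and the printed interpretation are assumptions for the demo, not VMAware behavior.

    #include <windows.h>
    #include <intrin.h>
    #include <cstdio>

    // spins CPUID forever; under a hypervisor every iteration forces a VM-exit
    // (or hits the suspected patched fastpath)
    static DWORD WINAPI cpuid_spammer(LPVOID) {
        for (;;) {
            int regs[4];
            __cpuid(regs, 0);
        }
    }

    int main() {
        // pin both threads to the same logical CPU so they must share it
        SetThreadAffinityMask(GetCurrentThread(), 1);

        HANDLE spammer = CreateThread(nullptr, 0, cpuid_spammer, nullptr, 0, nullptr);
        if (!spammer) return 1;
        SetThreadAffinityMask(spammer, 1);

        LARGE_INTEGER freq, t0, t1;
        QueryPerformanceFrequency(&freq);
        QueryPerformanceCounter(&t0);

        SleepEx(10, FALSE); // normally wakes at the next ~15.6ms timer tick

        QueryPerformanceCounter(&t1);
        const double ms = (double)(t1.QuadPart - t0.QuadPart) * 1000.0 / (double)freq.QuadPart;
        std::printf("SleepEx(10) took %.2f ms\n", ms); // a large overshoot hints at tick starvation

        TerminateThread(spammer, 0); // acceptable for a throwaway probe, never for production code
        CloseHandle(spammer);
        return 0;
    }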