diff --git a/src/vmaware.hpp b/src/vmaware.hpp
index 58d682c..2793139 100644
--- a/src/vmaware.hpp
+++ b/src/vmaware.hpp
@@ -4943,7 +4943,7 @@ struct VM {
 
         // we used a rng before running the traditional rdtsc-cpuid-rdtsc trick
         // sometimes not intercepted in some hvs (like VirtualBox) under compat mode
-        auto cpuid = [&]() noexcept -> u64 {
+        auto cpuid_ex = [&](int leaf, int subleaf) noexcept -> u64 {
 #if (MSVC)
             // make regs volatile so writes cannot be optimized out, if this isn't added and the code is compiled in release mode, cycles would be around 40 even under Hyper-V
             volatile int regs[4]{};
@@ -4956,7 +4956,7 @@ struct VM {
             // prevent the compiler from moving the __cpuid call before the t1 read
             COMPILER_BARRIER();
 
-            __cpuid((int*)regs, 0); // not using cpu::cpuid to get a chance of inlining
+            __cpuidex((int*)regs, leaf, subleaf); // not using cpu::cpuid to get a chance of inlining
 
             COMPILER_BARRIER();
 
@@ -4984,7 +4984,7 @@ struct VM {
             // because the compiler must honor the write to a volatile variable.
             asm volatile("cpuid"
                 : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
-                : "a"(0)
+                : "a"(leaf), "c"(subleaf)
                 : "memory");
 
             COMPILER_BARRIER();
@@ -5001,40 +5001,6 @@ struct VM {
 
         constexpr u16 iterations = 1000;
 
-        // pre-allocate sample buffer and touch pages to avoid page faults by MMU during measurement
-        std::vector<u64> samples;
-        samples.resize(iterations);
-        for (unsigned i = 0; i < iterations; ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
-
-        /*
-         * We want to move our thread from the Running state to the Waiting state
-         * When the sleep expires (at the next timer tick), the OS moves VMAware's thread to the Ready state
-         * When it picks us up again, it grants VMAware a fresh quantum, typically varying between 2 ticks (30ms) and 6 ticks (90ms) on Windows Client editions
-         * The default resolution of the Windows clock we're using is 64Hz
-         * Because we're calling NtDelayExecution with only 1ms, the kernel interprets this as "Sleep for at least 1ms"
-         * Since the hardware interrupt (tick) only fires every 15.6ms and we're not using timeBeginPeriod, the kernel cannot wake us after exactly 1ms
-         * So instead, it does what we want and wakes us up at the very next timer interrupt
-         * That's the reason why it's only 1ms and we're not using CreateWaitableTimerEx / SetWaitableTimerEx
-         * Sleep(0) would return instantly in some circumstances
-         * This gives us more time for sampling before we're rescheduled again
-         */
-
-#if (WINDOWS)
-        // voluntary context switch to get a fresh quantum
-        SleepEx(1, FALSE);
-#else
-        // should work similarly in Unix-like operating systems
-        std::this_thread::sleep_for(std::chrono::milliseconds(1));
-#endif
-        for (int w = 0; w < 128; ++w) {
-            volatile u64 tmp = cpuid();
-            VMAWARE_UNUSED(tmp);
-        }
-
-        for (unsigned i = 0; i < iterations; ++i) {
-            samples[i] = cpuid();
-        }
-
         auto calculate_latency = [&](const std::vector<u64>& samples_in) -> u64 {
             if (samples_in.empty()) return 0;
             const size_t N = samples_in.size();
@@ -5113,18 +5079,144 @@ struct VM {
             return result;
         };
 
-        u64 cpuid_latency = calculate_latency(samples);
+        // pre-allocate sample buffer and touch pages to avoid page faults by MMU during measurement
+        std::vector<u64> samples;
+        samples.resize(iterations);
+        for (unsigned i = 0; i < iterations; ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
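+
+        /*
+         * Worked numbers for the sleep strategy described in the comment below
+         * (illustrative, assuming the default 64Hz clock and no timeBeginPeriod in effect):
+         *   tick period    = 1s / 64        = 15.625ms
+         *   2-tick quantum = 2 * 15.625ms   = 31.25ms  (~30ms)
+         *   6-tick quantum = 6 * 15.625ms   = 93.75ms  (~90ms)
+         * so a 1ms NtDelayExecution actually sleeps until the next timer interrupt fires,
+         * i.e. anywhere from ~0ms up to one full tick (~15.6ms)
+         */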
+
+        /*
+         * We want to move our thread from the Running state to the Waiting state
+         * When the sleep expires (at the next timer tick), the OS moves VMAware's thread to the Ready state
+         * When it picks us up again, it grants VMAware a fresh quantum, typically varying between 2 ticks (30ms) and 6 ticks (90ms) on Windows Client editions
+         * The default resolution of the Windows clock we're using is 64Hz
+         * Because we're calling NtDelayExecution with only 1ms, the kernel interprets this as "Sleep for at least 1ms"
+         * Since the hardware interrupt (tick) only fires every 15.6ms and we're not using timeBeginPeriod, the kernel cannot wake us after exactly 1ms
+         * So instead, it does what we want and wakes us up at the very next timer interrupt
+         * That's the reason why it's only 1ms and we're not using CreateWaitableTimerEx / SetWaitableTimerEx
+         * Sleep(0) would return instantly in some circumstances
+         * This gives us more time for sampling before we're rescheduled again
+         */
+
+#if (WINDOWS)
+        // voluntary context switch to get a fresh quantum
+        SleepEx(1, FALSE);
+#else
+        // should work similarly in Unix-like operating systems
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+#endif
+        for (int w = 0; w < 128; ++w) {
+            volatile u64 tmp = cpuid_ex(0, 0); // warm up caches and branch predictors before measuring
+            VMAWARE_UNUSED(tmp);
+        }
+
+        for (unsigned i = 0; i < iterations; ++i) {
+            samples[i] = cpuid_ex(0, 0); // leaf 0 just returns static data so it should be fast
+        }
 
-        debug("TIMER: VMEXIT latency -> ", cpuid_latency);
+        const u64 cpuid_latency_leaf0 = calculate_latency(samples);
 
-        if (cpuid_latency >= cycle_threshold) {
+        // Extended Topology requires the hypervisor to calculate dynamic x2APIC IDs
+        // we expect this to crash entire VMs if the kernel developer's patch is not careful enough
+        for (unsigned i = 0; i < iterations; ++i) {
+            samples[i] = cpuid_ex(0xB, 0);
+        }
+        const u64 cpuid_latency_leafB = calculate_latency(samples);
+
+        debug("TIMER: Leaf 0 latency -> ", cpuid_latency_leaf0);
+        debug("TIMER: Leaf 0xB latency -> ", cpuid_latency_leafB);
+
+        // simple differential analysis
+        if (cpuid_latency_leaf0 > 0) {
+            if (cpuid_latency_leafB > (cpuid_latency_leaf0 * 1.6)) {
+                debug("TIMER: VMAware detected a CPUID patch");
+                return true;
+            }
+        }
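+
+        /*
+         * Worked example for the 1.6x heuristic above (numbers are illustrative only):
+         * on bare metal both leaves are answered by the CPU itself, so the ratio stays
+         * close to 1.0, while a patch that fastpaths leaf 0 but still computes x2APIC
+         * topology for leaf 0xB might yield leaf0 = 150 and leafB = 1200 cycles:
+         *   1200 > 150 * 1.6 = 240  ->  flagged as a CPUID patch
+         */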
+
+        if (cpuid_latency_leaf0 >= cycle_threshold) {
             return true;
         }
-        else if (cpuid_latency <= 20) { // cpuid is fully serializing, not even old CPUs have this low average cycles in real-world scenarios
+        if (cpuid_latency_leafB >= cycle_threshold) {
             return true;
         }
-        // TLB flushes or side channel cache attacks are not even tried due to how unreliable they are against stealthy hypervisors
+        else if (cpuid_latency_leaf0 <= 20) { // cpuid is fully serializing, not even old CPUs reach such a low cycle average in real-world scenarios
+            return true;
+        }
+
+        // The core idea is to force the host scheduler's pending signal check (kvm_vcpu_check_block)
+        // We detect cpuid patches that just do fast vmexits by spawning a thread on the SAME core that spams the patched instruction
+        // If patched, the host core enters an uninterruptible loop, starving the timer interrupt needed for the sleep syscall
+#if (WINDOWS)
+        {
+            using NtCreateThreadEx_t = NTSTATUS(__stdcall*)(PHANDLE, ACCESS_MASK, PVOID, HANDLE, PVOID, PVOID, ULONG, ULONG_PTR, ULONG_PTR, ULONG_PTR, PVOID);
+            using NtTerminateThread_t = NTSTATUS(__stdcall*)(HANDLE, NTSTATUS);
+            using NtWaitForSingleObject_t = NTSTATUS(__stdcall*)(HANDLE, BOOLEAN, PLARGE_INTEGER);
+            // info class passed as ULONG so we don't depend on the THREADINFOCLASS enum definition
+            using NtQueryInformationThread_t = NTSTATUS(__stdcall*)(HANDLE, ULONG, PVOID, ULONG, PULONG);
+            using NtSetInformationThread_t = NTSTATUS(__stdcall*)(HANDLE, ULONG, PVOID, ULONG);
+
+            const HMODULE ntdll = util::get_ntdll();
+            if (ntdll) {
+                const char* names[] = { "NtCreateThreadEx", "NtTerminateThread", "NtWaitForSingleObject", "NtQueryInformationThread", "NtSetInformationThread" };
+                void* funcs[5] = {};
+                util::get_function_address(ntdll, names, funcs, 5);
+
+                auto pNtCreateThreadEx = (NtCreateThreadEx_t)funcs[0];
+                auto pNtTerminateThread = (NtTerminateThread_t)funcs[1];
+                auto pNtWaitForSingleObject = (NtWaitForSingleObject_t)funcs[2];
+                auto pNtQueryInformationThread = (NtQueryInformationThread_t)funcs[3];
+                auto pNtSetInformationThread = (NtSetInformationThread_t)funcs[4];
+
+                if (pNtCreateThreadEx && pNtTerminateThread && pNtWaitForSingleObject && pNtQueryInformationThread && pNtSetInformationThread) {
+
+                    // stateless lambda castable to a thread routine
+                    auto spammer_routine = [](PVOID) -> DWORD {
+                        // this loop exploits the patch's lack of interrupt window checking
+                        while (true) {
+                            int regs[4];
+                            __cpuid(regs, 0);
+                        }
+                        return 0;
+                    };
+
+                    HANDLE hSpammer = nullptr;
+                    // unary + converts the captureless lambda to a plain function pointer; TRUE = create suspended
+                    const NTSTATUS status = pNtCreateThreadEx(&hSpammer, MAXIMUM_ALLOWED, nullptr, GetCurrentProcess(),
+                        (PVOID)(uintptr_t)(+spammer_routine), nullptr, TRUE, 0, 0, 0, nullptr);
+
+                    if (status >= 0 && hSpammer) {
+                        // force contention: pin the spammer to the same affinity mask as the current thread
+                        THREAD_BASIC_INFORMATION tbi_local{};
+                        if (pNtQueryInformationThread(GetCurrentThread(), ThreadBasicInformation, &tbi_local, sizeof(tbi_local), nullptr) >= 0) {
+                            pNtSetInformationThread(hSpammer, ThreadAffinityMask, &tbi_local.AffinityMask, sizeof(ULONG_PTR));
+                        }
+
+                        ResumeThread(hSpammer);
+
+                        LARGE_INTEGER qpc_start, qpc_end, qpc_freq;
+                        QueryPerformanceFrequency(&qpc_freq);
+                        QueryPerformanceCounter(&qpc_start);
+
+                        // expecting naive cpuid patches to lock out the timer interrupt
+                        // via the infinite fastpath loop on the physical core, causing a massive overshoot
+                        SleepEx(10, FALSE);
+
+                        QueryPerformanceCounter(&qpc_end);
+
+                        // cleanup
+                        pNtTerminateThread(hSpammer, 0);
+                        pNtWaitForSingleObject(hSpammer, FALSE, nullptr);
+                        CloseHandle(hSpammer);
+
+                        const double elapsed_ms = (double)(qpc_end.QuadPart - qpc_start.QuadPart) * 1000.0 / (double)qpc_freq.QuadPart;
+
+                        debug("TIMER: Timer interrupt starvation -> ", elapsed_ms, " ms");
+
+                        if (elapsed_ms > 40.0) {
+                            debug("TIMER: VMAware detected a CPUID patch");
+                            return true;
+                        }
+                    }
+                }
+            }
+        }
 #endif
+        // TLB flushes or side channel cache attacks are not even tried due to how unreliable they are against stealthy hypervisors
+#endif
 
         return false;
     }
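
For reference, the timer-starvation probe added in the last hunk boils down to the standalone sketch below. It is a minimal illustration only, using plain Win32 calls (CreateThread / SetThreadAffinityMask) instead of the dynamically resolved ntdll functions in the patch; pinning both threads to CPU 0 and the printed interpretation are assumptions for the demo, not VMAware behavior.

    #include <windows.h>
    #include <intrin.h>
    #include <cstdio>

    // spins CPUID forever; under a hypervisor every iteration forces a VM-exit
    // (or hits the suspected patched fastpath)
    static DWORD WINAPI cpuid_spammer(LPVOID) {
        for (;;) {
            int regs[4];
            __cpuid(regs, 0);
        }
    }

    int main() {
        // pin both threads to the same logical CPU so they must share it
        SetThreadAffinityMask(GetCurrentThread(), 1);

        HANDLE spammer = CreateThread(nullptr, 0, cpuid_spammer, nullptr, 0, nullptr);
        if (!spammer) return 1;
        SetThreadAffinityMask(spammer, 1);

        LARGE_INTEGER freq, t0, t1;
        QueryPerformanceFrequency(&freq);
        QueryPerformanceCounter(&t0);

        SleepEx(10, FALSE); // normally wakes at the next ~15.6ms timer tick

        QueryPerformanceCounter(&t1);
        const double ms = (double)(t1.QuadPart - t0.QuadPart) * 1000.0 / (double)freq.QuadPart;
        std::printf("SleepEx(10) took %.2f ms\n", ms); // a large overshoot hints at tick starvation

        TerminateThread(spammer, 0); // acceptable for a throwaway probe, never for production code
        CloseHandle(spammer);
        return 0;
    }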