From abf876b14aa3b9c2cc186ebdf139523a4880b881 Mon Sep 17 00:00:00 2001 From: Scott Spencer Date: Sat, 7 Feb 2026 10:03:27 -0500 Subject: [PATCH] Set thread QoS to USER_INITIATED on Apple Silicon On Apple Silicon Macs, TBB worker threads are created with the default QoS class, which macOS may schedule to efficiency cores even when performance cores are available. This significantly degrades parallel performance. This adds a pthread_set_qos_class_self_np() call in on_scheduler_entry() to set USER_INITIATED QoS, signaling to macOS that these are compute threads the user is waiting for. This causes macOS to prefer performance cores when available. Fixes #3277 --- stan/math/rev/core/init_chainablestack.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/stan/math/rev/core/init_chainablestack.hpp b/stan/math/rev/core/init_chainablestack.hpp index 197bd611729..511cef41990 100644 --- a/stan/math/rev/core/init_chainablestack.hpp +++ b/stan/math/rev/core/init_chainablestack.hpp @@ -11,6 +11,11 @@ #include #include +#ifdef __APPLE__ +#include +#include +#endif + namespace stan { namespace math { @@ -20,6 +25,10 @@ namespace math { * hook ensures that each worker thread has an initialized AD tape * ready for use. * + * On Apple Silicon, this also sets the thread QoS class to + * USER_INITIATED so that macOS prefers scheduling compute threads + * on performance cores rather than efficiency cores. + * * Refer to * https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-reference/task-scheduler/taskschedulerobserver.html * for details on the observer concept. @@ -37,6 +46,13 @@ class ad_tape_observer final : public tbb::task_scheduler_observer { ~ad_tape_observer() { observe(false); } void on_scheduler_entry(bool worker) { +#ifdef __APPLE__ +#if defined(__arm64__) || defined(__aarch64__) + // Set thread QoS to USER_INITIATED so macOS prefers scheduling + // TBB worker threads on performance cores rather than efficiency cores. + pthread_set_qos_class_self_np(QOS_CLASS_USER_INITIATED, 0); +#endif +#endif std::lock_guard thread_tape_map_lock(thread_tape_map_mutex_); const std::thread::id thread_id = std::this_thread::get_id(); if (thread_tape_map_.find(thread_id) == thread_tape_map_.end()) {