From 746bf31961cf89c7bcf5d700751e74659f127998 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 09:37:58 +0200 Subject: [PATCH 01/51] Add SQ8-to-SQ8 distance functions and optimizations - Implemented inner product and cosine distance functions for SQ8-to-SQ8 vectors in SVE, NEON, and AVX512 architectures. - Added corresponding distance function selection logic in IP_space.cpp and function headers in IP_space.h. - Created benchmarks for SQ8-to-SQ8 distance functions to evaluate performance across different architectures. - Developed unit tests to validate the correctness of the new distance functions against expected results. - Ensured compatibility with existing optimization features for various CPU architectures. --- src/VecSim/spaces/IP/IP.cpp | 47 ++++ src/VecSim/spaces/IP/IP.h | 6 + src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h | 4 +- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 10 +- .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h | 9 +- .../spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h | 136 +++++++++++ src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 157 +++++++++++++ src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 168 ++++++++++++++ src/VecSim/spaces/IP_space.cpp | 78 +++++++ src/VecSim/spaces/IP_space.h | 5 + .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 14 ++ .../spaces/functions/AVX512F_BW_VL_VNNI.h | 4 + src/VecSim/spaces/functions/NEON.cpp | 14 ++ src/VecSim/spaces/functions/NEON.h | 4 + src/VecSim/spaces/functions/SVE.cpp | 15 ++ src/VecSim/spaces/functions/SVE.h | 4 + src/VecSim/spaces/functions/SVE2.cpp | 15 ++ src/VecSim/spaces/functions/SVE2.h | 5 + .../spaces_benchmarks/bm_spaces_sq8.cpp | 52 ++++- tests/unit/test_spaces.cpp | 218 +++++++++++++++++- 20 files changed, 951 insertions(+), 14 deletions(-) create mode 100644 src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h create mode 100644 src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h create mode 100644 src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 5e2c4b4dc..645fc739d 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -51,6 +51,53 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { return 1.0f - res; } +// SQ8-to-SQ8: Both vectors are uint8 quantized +float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); + const auto *pVect2 = static_cast(pVect2v); + + // Get quantization parameters from pVect1 + const float min_val1 = *reinterpret_cast(pVect1 + dimension); + const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); + + // Get quantization parameters from pVect2 + const float min_val2 = *reinterpret_cast(pVect2 + dimension); + const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + + // Compute inner product with dequantization of both vectors + float res = 0; + for (size_t i = 0; i < dimension; i++) { + float dequant1 = pVect1[i] * delta1 + min_val1; + float dequant2 = pVect2[i] * delta2 + min_val2; + res += dequant1 * dequant2; + } + return 1.0f - res; +} + +// SQ8-to-SQ8: Both vectors are uint8 quantized (cosine version) +float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); + const auto *pVect2 = static_cast(pVect2v); + + // Get quantization parameters from pVect1 + const float min_val1 = *reinterpret_cast(pVect1 + dimension); + const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); + + // Get quantization parameters from pVect2 + const float min_val2 = *reinterpret_cast(pVect2 + dimension); + const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + + // Compute inner product with dequantization of both vectors + float res = 0; + for (size_t i = 0; i < dimension; i++) { + float dequant1 = pVect1[i] * delta1 + min_val1; + float dequant2 = pVect2[i] * delta2 + min_val2; + res += dequant1 * dequant2; + } + // Assume both vectors are normalized. + return 1.0f - res; +} + float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) { auto *vec1 = (float *)pVect1; auto *vec2 = (float *)pVect2; diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index d4796cbd6..748377ec0 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -16,6 +16,12 @@ float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensio // pVect1v vector of type fp32 and pVect2v vector of type uint8 float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); +// SQ8-to-SQ8: Both vectors are uint8 quantized +float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); + +// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized +float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); + float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h index 007ee333e..7c4e6b0d4 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h @@ -85,10 +85,10 @@ float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t d // We dealt with the residual part. We are left with some multiple of 16 floats. // In each iteration we calculate 16 floats = 512 bits. - do { + while (pVect1 < pEnd1) { InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); - } while (pVect1 < pEnd1); + } return my_mm256_reduce_add_ps(sum256); } diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 89b1c0b6b..1df263122 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -33,7 +33,7 @@ static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVe } template // 0..15 -float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { const float *pVect1 = static_cast(pVect1v); // pVect2 is a quantized uint8_t vector const uint8_t *pVect2 = static_cast(pVect2v); @@ -79,17 +79,17 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen // We dealt with the residual part. We are left with some multiple of 16 floats. // In each iteration we calculate 16 floats = 512 bits. - do { + while (pVect1 < pEnd1) { InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); - } while (pVect1 < pEnd1); + } return my_mm256_reduce_add_ps(sum256); } template // 0..15 float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProductImp(pVect1v, pVect2v, dimension); + return 1.0f - SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); } template // 0..15 @@ -99,7 +99,7 @@ float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dim const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp(pVect1v, pVect2v, dimension); + float ip = SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); // For cosine, we need to account for the vector norms // The inv_norm parameter is stored after min_val and delta in the quantized vector diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index 3fd665111..481672504 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -36,8 +36,7 @@ static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVe // Common implementation for both inner product and cosine similarity template // 0..15 -float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, - float inv_norm = 1.0f) { +float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { const float *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); const float *pEnd1 = pVec1 + dimension; @@ -92,7 +91,7 @@ template // 0..15 float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // Calculate inner product using common implementation - float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); // The inner product similarity is 1 - ip return 1.0f - ip; @@ -106,8 +105,8 @@ float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v const float inv_norm = *reinterpret_cast(pVec2 + dimension + 2 * sizeof(float)); // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension, inv_norm); + float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); // The cosine similarity is 1 - ip - return 1.0f - ip; + return 1.0f - ip * inv_norm; } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h new file mode 100644 index 000000000..679f967fc --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +/** + * SQ8-to-SQ8 distance functions. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized and dequantization is applied to both + * during computation. + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Dequantization formula: dequantized_value = quantized_value * delta + min_val + */ + +// Helper function to perform inner product step for 16 elements with dual dequantization +static inline void SQ8_SQ8_InnerProductStep(const uint8_t *&pVec1, const uint8_t *&pVec2, + __m512 &sum, const __m512 &min_val_vec1, + const __m512 &delta_vec1, const __m512 &min_val_vec2, + const __m512 &delta_vec2) { + // Load 16 uint8 elements from pVec1 and convert to float + __m128i v1_128 = _mm_loadu_si128(reinterpret_cast(pVec1)); + __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128); + __m512 v1_f = _mm512_cvtepi32_ps(v1_512); + + // Dequantize v1: (val * delta1) + min_val1 + __m512 v1_dequant = _mm512_fmadd_ps(v1_f, delta_vec1, min_val_vec1); + + // Load 16 uint8 elements from pVec2 and convert to float + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize v2: (val * delta2) + min_val2 + __m512 v2_dequant = _mm512_fmadd_ps(v2_f, delta_vec2, min_val_vec2); + + // Compute dot product and add to sum: sum += v1_dequant * v2_dequant + sum = _mm512_fmadd_ps(v1_dequant, v2_dequant, sum); + + // Advance pointers + pVec1 += 16; + pVec2 += 16; +} + +// Common implementation for inner product between two SQ8 vectors +template // 0..15 +float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + const uint8_t *pEnd1 = pVec1 + dimension; + + // Get dequantization parameters from the end of pVec1 + const float min_val1 = *reinterpret_cast(pVec1 + dimension); + const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); + + // Get dequantization parameters from the end of pVec2 + const float min_val2 = *reinterpret_cast(pVec2 + dimension); + const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m512 min_val_vec1 = _mm512_set1_ps(min_val1); + __m512 delta_vec1 = _mm512_set1_ps(delta1); + __m512 min_val_vec2 = _mm512_set1_ps(min_val2); + __m512 delta_vec2 = _mm512_set1_ps(delta2); + + // Initialize sum accumulator + __m512 sum = _mm512_setzero_ps(); + + // Deal with remainder first + if constexpr (residual > 0) { + // Handle less than 16 elements + __mmask16 mask = (1U << residual) - 1; + + // Load and convert v1 elements (safe to load 16 elements, masked later) + __m128i v1_128 = _mm_loadu_si128(reinterpret_cast(pVec1)); + __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128); + __m512 v1_f = _mm512_cvtepi32_ps(v1_512); + + // Dequantize v1 + __m512 v1_dequant = _mm512_fmadd_ps(v1_f, delta_vec1, min_val_vec1); + + // Load and convert v2 elements + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize v2 + __m512 v2_dequant = _mm512_fmadd_ps(v2_f, delta_vec2, min_val_vec2); + + // Compute masked dot product + __m512 product = _mm512_mul_ps(v1_dequant, v2_dequant); + sum = _mm512_maskz_mov_ps(mask, product); + + pVec1 += residual; + pVec2 += residual; + } + + // Process remaining full chunks of 16 elements + do { + SQ8_SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec1, delta_vec1, min_val_vec2, + delta_vec2); + } while (pVec1 < pEnd1); + + // Horizontal sum and return + return _mm512_reduce_add_ps(sum); +} + +// SQ8-to-SQ8 Inner Product distance function +// Assumes both vectors are normalized. +// Returns 1 - inner_product (distance form) +template // 0..15 +float SQ8_SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + float ip = SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + return 1.0f - ip; +} + +// SQ8-to-SQ8 Cosine distance function +// Assumes both vectors are normalized. +// Returns 1 - (inner_product) +template // 0..15 +float SQ8_SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Calculate inner product + float ip = SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + + // Return cosine similarity + return 1.0f - ip; +} \ No newline at end of file diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h new file mode 100644 index 000000000..23d191735 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +/** + * SQ8-to-SQ8 distance functions for NEON. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized and dequantization is applied to both + * during computation. + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Dequantization formula: dequantized_value = quantized_value * delta + min_val + */ + +// Helper function to perform inner product step for 4 elements with dual dequantization +static inline void SQ8_SQ8_InnerProductStep_NEON(const uint8_t *&pVec1, const uint8_t *&pVec2, + float32x4_t &sum, const float32x4_t &min_val_vec1, + const float32x4_t &delta_vec1, + const float32x4_t &min_val_vec2, + const float32x4_t &delta_vec2) { + // Load 4 uint8 elements from pVec1 and convert to float + uint8x8_t v1_u8 = vld1_u8(pVec1); + uint32x4_t v1_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))); + float32x4_t v1_f = vcvtq_f32_u32(v1_u32); + + // Dequantize v1: (val * delta1) + min_val1 + float32x4_t v1_dequant = vmlaq_f32(min_val_vec1, v1_f, delta_vec1); + + // Load 4 uint8 elements from pVec2 and convert to float + uint8x8_t v2_u8 = vld1_u8(pVec2); + uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); + float32x4_t v2_f = vcvtq_f32_u32(v2_u32); + + // Dequantize v2: (val * delta2) + min_val2 + float32x4_t v2_dequant = vmlaq_f32(min_val_vec2, v2_f, delta_vec2); + + // Compute dot product and add to sum + sum = vmlaq_f32(sum, v1_dequant, v2_dequant); + + // Advance pointers + pVec1 += 4; + pVec2 += 4; +} + +// Common implementation for inner product between two SQ8 vectors +template // 0..15 +float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + + // Get dequantization parameters from the end of pVec1 + const float min_val1 = *reinterpret_cast(pVec1 + dimension); + const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); + + // Get dequantization parameters from the end of pVec2 + const float min_val2 = *reinterpret_cast(pVec2 + dimension); + const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + float32x4_t min_val_vec1 = vdupq_n_f32(min_val1); + float32x4_t delta_vec1 = vdupq_n_f32(delta1); + float32x4_t min_val_vec2 = vdupq_n_f32(min_val2); + float32x4_t delta_vec2 = vdupq_n_f32(delta2); + + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); + + const size_t num_of_chunks = dimension / 16; + + // Process 16 elements at a time in the main loop + for (size_t i = 0; i < num_of_chunks; i++) { + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum0, min_val_vec1, delta_vec1, min_val_vec2, + delta_vec2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum1, min_val_vec1, delta_vec1, min_val_vec2, + delta_vec2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum2, min_val_vec1, delta_vec1, min_val_vec2, + delta_vec2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum3, min_val_vec1, delta_vec1, min_val_vec2, + delta_vec2); + } + + // Handle remaining complete 4-element blocks within residual + if constexpr (residual >= 4) { + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum0, min_val_vec1, delta_vec1, min_val_vec2, + delta_vec2); + } + if constexpr (residual >= 8) { + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum1, min_val_vec1, delta_vec1, min_val_vec2, + delta_vec2); + } + if constexpr (residual >= 12) { + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum2, min_val_vec1, delta_vec1, min_val_vec2, + delta_vec2); + } + + // Handle final residual elements (0-3 elements) + constexpr size_t final_residual = residual % 4; + if constexpr (final_residual > 0) { + float32x4_t v1_dequant = vdupq_n_f32(0.0f); + float32x4_t v2_dequant = vdupq_n_f32(0.0f); + + if constexpr (final_residual >= 1) { + float dequant1_0 = pVec1[0] * delta1 + min_val1; + float dequant2_0 = pVec2[0] * delta2 + min_val2; + v1_dequant = vld1q_lane_f32(&dequant1_0, v1_dequant, 0); + v2_dequant = vld1q_lane_f32(&dequant2_0, v2_dequant, 0); + } + if constexpr (final_residual >= 2) { + float dequant1_1 = pVec1[1] * delta1 + min_val1; + float dequant2_1 = pVec2[1] * delta2 + min_val2; + v1_dequant = vld1q_lane_f32(&dequant1_1, v1_dequant, 1); + v2_dequant = vld1q_lane_f32(&dequant2_1, v2_dequant, 1); + } + if constexpr (final_residual >= 3) { + float dequant1_2 = pVec1[2] * delta1 + min_val1; + float dequant2_2 = pVec2[2] * delta2 + min_val2; + v1_dequant = vld1q_lane_f32(&dequant1_2, v1_dequant, 2); + v2_dequant = vld1q_lane_f32(&dequant2_2, v2_dequant, 2); + } + + sum3 = vmlaq_f32(sum3, v1_dequant, v2_dequant); + } + + // Combine all four sum accumulators + float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)); + + // Horizontal sum of the 4 elements in the combined NEON register + float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined)); + float32x2_t summed = vpadd_f32(sum_halves, sum_halves); + return vget_lane_f32(summed, 0); +} + +// SQ8-to-SQ8 Inner Product distance function +// Assumes both vectors are normalized. +// Returns 1 - inner_product (distance form) +template // 0..15 +float SQ8_SQ8_InnerProductSIMD16_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); +} + +// SQ8-to-SQ8 Cosine distance function +// Assumes both vectors are normalized. +// Returns 1 - inner_product +template // 0..15 +float SQ8_SQ8_CosineSIMD16_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); +} \ No newline at end of file diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h new file mode 100644 index 000000000..af49f8ce6 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +/** + * SQ8-to-SQ8 distance functions for SVE. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized and dequantization is applied to both + * during computation. + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Dequantization formula: dequantized_value = quantized_value * delta + min_val + */ + +// Helper function to perform inner product step for one chunk with dual dequantization +static inline void SQ8_SQ8_InnerProductStep_SVE(const uint8_t *&pVec1, const uint8_t *&pVec2, + size_t &offset, svfloat32_t &sum, + const svfloat32_t &min_val_vec1, + const svfloat32_t &delta_vec1, + const svfloat32_t &min_val_vec2, + const svfloat32_t &delta_vec2, const size_t chunk) { + svbool_t pg = svptrue_b32(); + + // Load uint8 elements from pVec1 and convert to float + svuint32_t v1_u32 = svld1ub_u32(pg, pVec1 + offset); + svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); + + // Dequantize v1: (val * delta1) + min_val1 + svfloat32_t v1_dequant = svmla_f32_x(pg, min_val_vec1, v1_f, delta_vec1); + + // Load uint8 elements from pVec2 and convert to float + svuint32_t v2_u32 = svld1ub_u32(pg, pVec2 + offset); + svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32); + + // Dequantize v2: (val * delta2) + min_val2 + svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec2, v2_f, delta_vec2); + + // Compute dot product and add to sum: sum += v1_dequant * v2_dequant + sum = svmla_f32_x(pg, sum, v1_dequant, v2_dequant); + + // Move to the next set of elements + offset += chunk; +} + +// Common implementation for inner product between two SQ8 vectors +template +float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + size_t offset = 0; + + // Get dequantization parameters from the end of pVec1 + const float min1 = *reinterpret_cast(pVec1 + dimension); + const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); + + // Get dequantization parameters from the end of pVec2 + const float min2 = *reinterpret_cast(pVec2 + dimension); + const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + svbool_t pg = svptrue_b32(); + svfloat32_t min_val_vec1 = svdup_f32(min1); + svfloat32_t delta_vec1 = svdup_f32(delta1); + svfloat32_t min_val_vec2 = svdup_f32(min2); + svfloat32_t delta_vec2 = svdup_f32(delta2); + + // Get the number of 32-bit elements per vector at runtime + uint64_t chunk = svcntw(); + + // Multiple accumulators to increase instruction-level parallelism + svfloat32_t sum0 = svdup_f32(0.0f); + svfloat32_t sum1 = svdup_f32(0.0f); + svfloat32_t sum2 = svdup_f32(0.0f); + svfloat32_t sum3 = svdup_f32(0.0f); + + // Handle partial chunk if needed + if constexpr (partial_chunk) { + size_t remaining = dimension % chunk; + if (remaining > 0) { + // Create predicate for the remaining elements + svbool_t pg_partial = + svwhilelt_b32(static_cast(0), static_cast(remaining)); + + // Load and convert v1 elements + svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVec1 + offset); + svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); + + // Dequantize v1 + svfloat32_t v1_dequant = svmla_f32_z(pg_partial, min_val_vec1, v1_f, delta_vec1); + + // Load and convert v2 elements + svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVec2 + offset); + svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); + + // Dequantize v2 + svfloat32_t v2_dequant = svmla_f32_z(pg_partial, min_val_vec2, v2_f, delta_vec2); + + // Compute dot product and add to sum + sum0 = svmla_f32_z(pg_partial, sum0, v1_dequant, v2_dequant); + + // Move past the partial chunk + offset += remaining; + } + } + + // Process 4 chunks at a time in the main loop + auto chunk_size = 4 * chunk; + const size_t number_of_chunks = + (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; + + for (size_t i = 0; i < number_of_chunks; i++) { + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum0, min_val_vec1, delta_vec1, + min_val_vec2, delta_vec2, chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum1, min_val_vec1, delta_vec1, + min_val_vec2, delta_vec2, chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum2, min_val_vec1, delta_vec1, + min_val_vec2, delta_vec2, chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum3, min_val_vec1, delta_vec1, + min_val_vec2, delta_vec2, chunk); + } + + // Handle remaining steps (0-3) + if constexpr (additional_steps > 0) { + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum0, min_val_vec1, delta_vec1, + min_val_vec2, delta_vec2, chunk); + } + if constexpr (additional_steps > 1) { + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum1, min_val_vec1, delta_vec1, + min_val_vec2, delta_vec2, chunk); + } + if constexpr (additional_steps > 2) { + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum2, min_val_vec1, delta_vec1, + min_val_vec2, delta_vec2, chunk); + } + + // Combine the accumulators + svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); + sum = svadd_f32_z(pg, sum, sum2); + sum = svadd_f32_z(pg, sum, sum3); + + // Horizontal sum of all elements in the vector + return svaddv_f32(pg, sum); +} + +// SQ8-to-SQ8 Inner Product distance function +// Returns 1 - inner_product (distance form) +template +float SQ8_SQ8_InnerProductSIMD_SVE(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductSIMD_SVE_IMP(pVec1v, pVec2v, + dimension); +} + +// SQ8-to-SQ8 Cosine distance function +// Returns 1 - inner_product (assumes vectors are pre-normalized) +template +float SQ8_SQ8_CosineSIMD_SVE(const void *pVec1v, const void *pVec2v, size_t dimension) { + float ip = SQ8_SQ8_InnerProductSIMD_SVE_IMP(pVec1v, pVec2v, + dimension); + return 1.0f - ip; +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index d24c1d142..fc4ee8375 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -150,6 +150,82 @@ dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, return ret_dist_func; } +// SQ8-to-SQ8 Inner Product distance function (both vectors are uint8 quantized) +dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_SQ8_InnerProduct; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_SQ8_IP_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_SQ8_IP_implementation_NEON(dim); + } +#endif +#endif // AARCH64 + +#ifdef CPU_FEATURES_ARCH_X86_64 + // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vnni) { + return Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} + +// SQ8-to-SQ8 Cosine distance function (both vectors are uint8 quantized) +dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_SQ8_Cosine; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_SQ8_Cosine_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_SQ8_Cosine_implementation_NEON(dim); + } +#endif +#endif // AARCH64 + +#ifdef CPU_FEATURES_ARCH_X86_64 + // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vnni) { + return Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} + dist_func_t IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; if (alignment == nullptr) { @@ -589,4 +665,6 @@ dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment return ret_dist_func; } + + } // namespace spaces diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index db2d0b2d9..2f504e6bc 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -31,4 +31,9 @@ dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment const void *arch_opt = nullptr); dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); +dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index fef34dd22..22086a971 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -17,6 +17,8 @@ #include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h" #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h" + namespace spaces { #include "implementation_chooser.h" @@ -72,6 +74,18 @@ dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} #include "implementation_chooser_cleanup.h" diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 745a339fb..205235bbd 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -24,4 +24,8 @@ dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON.cpp b/src/VecSim/spaces/functions/NEON.cpp index d0b5c9160..5b5070bc7 100644 --- a/src/VecSim/spaces/functions/NEON.cpp +++ b/src/VecSim/spaces/functions/NEON.cpp @@ -17,6 +17,7 @@ #include "VecSim/spaces/IP/IP_NEON_FP64.h" #include "VecSim/spaces/L2/L2_NEON_SQ8.h" #include "VecSim/spaces/IP/IP_NEON_SQ8.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h" namespace spaces { @@ -99,6 +100,19 @@ dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_InnerProductSIMD16_NEON); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_CosineSIMD16_NEON); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h index 1449c6ac5..011d41b0d 100644 --- a/src/VecSim/spaces/functions/NEON.h +++ b/src/VecSim/spaces/functions/NEON.h @@ -30,4 +30,8 @@ dist_func_t Choose_SQ8_L2_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_IP_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index 208763779..9e18346cb 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -25,6 +25,8 @@ #include "VecSim/spaces/IP/IP_SVE_SQ8.h" #include "VecSim/spaces/L2/L2_SVE_SQ8.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" + namespace spaces { #include "implementation_chooser.h" @@ -116,6 +118,19 @@ dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_InnerProductSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_CosineSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 680e906e6..d505d88dc 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -33,4 +33,8 @@ dist_func_t Choose_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index 9df4b3b08..0d24c3fc7 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -22,6 +22,8 @@ #include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/IP/IP_SVE_SQ8.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/L2/L2_SVE_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE + namespace spaces { @@ -114,6 +116,19 @@ dist_func_t Choose_SQ8_L2_implementation_SVE2(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_InnerProductSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_CosineSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index 059f38b1b..c2445bb2a 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -33,4 +33,9 @@ dist_func_t Choose_SQ8_IP_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_L2_implementation_SVE2(size_t dim); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim); + + } // namespace spaces diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index 1349a3512..8c74fe1ff 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -96,6 +96,56 @@ INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, InnerProduct, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, Cosine, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, L2Sqr, 16); -// Naive +/** + * SQ8-to-SQ8 benchmarks: Both vectors are uint8 quantized with dequantization applied to both. + */ +class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { +protected: + std::mt19937 rng; + size_t dim; + uint8_t *v1; + uint8_t *v2; + +public: + BM_VecSimSpaces_SQ8_SQ8() { rng.seed(47); } + ~BM_VecSimSpaces_SQ8_SQ8() = default; + + void SetUp(const ::benchmark::State &state) { + dim = state.range(0); + // Allocate both vectors with extra space for min, delta and inv_norm + v1 = new uint8_t[dim + sizeof(float) * 3]; + v2 = new uint8_t[dim + sizeof(float) * 3]; + test_utils::populate_float_vec_to_sq8(v1, dim, 123); + test_utils::populate_float_vec_to_sq8(v2, dim, 1234); + } + void TearDown(const ::benchmark::State &state) { + delete[] v1; + delete[] v2; + } +}; + +#ifdef CPU_FEATURES_ARCH_AARCH64 +// NEON SQ8-to-SQ8 functions +#ifdef OPT_NEON +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); +#endif // NEON +// SVE SQ8-to-SQ8 functions +#ifdef OPT_SVE +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +#endif // SVE +#endif // AARCH64 + +#ifdef CPU_FEATURES_ARCH_X86_64 +// AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions +#ifdef OPT_AVX512_F_BW_VL_VNNI +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 16, + avx512_f_bw_vl_vnni_supported); +#endif // AVX512_F_BW_VL_VNNI +#endif // x86_64 + +// Naive SQ8-to-SQ8 algorithms +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, InnerProduct, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, Cosine, 16); BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index dabe9c794..641c2933d 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2348,7 +2348,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { } // Instantiate the test suite with dimensions to test -INSTANTIATE_TEST_SUITE_P(SQ8InnerProductTest, SQ8SpacesOptimizationTest, +INSTANTIATE_TEST_SUITE_P(SQ8OptFuncs, SQ8SpacesOptimizationTest, testing::Range(16UL, 16 * 2UL + 1)); TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { @@ -2467,3 +2467,219 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } + +/* ======================== Tests SQ8_SQ8 ========================= */ + +TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { + size_t dim = 5; + + // Create original vectors + float v1_orig[dim], v2_orig[dim]; + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i + 1.5); + } + + // Normalize both vectors for IP test + spaces::GetNormalizeFunc()(v1_orig, dim); + spaces::GetNormalizeFunc()(v2_orig, dim); + + // Create SQ8 compressed versions of both vectors + std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig, dim); + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig, dim); + + float dist = + SQ8_SQ8_InnerProduct((const void *)v1_compressed.data(), (const void *)v2_compressed.data(), + dim); + + // Since we're comparing identical normalized vectors, distance should be close to 0 + ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_SQ8_InnerProduct failed to match expected distance"; +} + +TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { + size_t dim = 5; + + // Create original vectors + float v1_orig[dim], v2_orig[dim]; + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i + 1.5); + } + + // Normalize both vectors for Cosine test + spaces::GetNormalizeFunc()(v1_orig, dim); + spaces::GetNormalizeFunc()(v2_orig, dim); + + // Create SQ8 compressed versions of both vectors + std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig, dim); + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig, dim); + + float dist = + SQ8_SQ8_Cosine((const void *)v1_compressed.data(), (const void *)v2_compressed.data(), dim); + + // Since we're comparing identical normalized vectors, cosine distance should be close to 0 + ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_SQ8_Cosine failed to match expected distance"; +} + +class SQ8_SQ8_SpacesOptimizationTest : public testing::TestWithParam {}; + +TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + // Create original vectors + std::vector v1_orig(dim); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } + + // Normalize both vectors + spaces::GetNormalizeFunc()(v1_orig.data(), dim); + spaces::GetNormalizeFunc()(v2_orig.data(), dim); + + // Create SQ8 compressed versions of both vectors + std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig.data(), dim); + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + + dist_func_t arch_opt_func; + float baseline = SQ8_SQ8_InnerProduct(v1_compressed.data(), v2_compressed.data(), dim); + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "SVE with dim " << dim; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "NEON with dim " << dim; + optimization.asimd = 0; + } +#endif + +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } +#endif + + // Test default implementation + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + // Create original vectors + std::vector v1_orig(dim); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } + + // Normalize both vectors + spaces::GetNormalizeFunc()(v1_orig.data(), dim); + spaces::GetNormalizeFunc()(v2_orig.data(), dim); + + // Create SQ8 compressed versions of both vectors + std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig.data(), dim); + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + + dist_func_t arch_opt_func; + float baseline = SQ8_SQ8_Cosine(v1_compressed.data(), v2_compressed.data(), dim); + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "SVE with dim " << dim; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "NEON with dim " << dim; + optimization.asimd = 0; + } +#endif + +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } +#endif + + // Test default implementation + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +INSTANTIATE_TEST_SUITE_P(SQ8_SQ8OptFuncs, SQ8_SQ8_SpacesOptimizationTest, + testing::Range(16UL, 16 * 2UL + 1)); From 8697a3e1b5a2e979c9ed70a18eb75d912a4af8b1 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 10:01:51 +0200 Subject: [PATCH 02/51] Add SQ8-to-SQ8 benchmark tests and update related scripts --- tests/benchmark/CMakeLists.txt | 2 +- tests/benchmark/benchmarks.sh | 5 ++ .../spaces_benchmarks/bm_spaces_sq8.cpp | 52 ------------ .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 81 +++++++++++++++++++ 4 files changed, 87 insertions(+), 53 deletions(-) create mode 100644 tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt index d898fa85c..052207214 100644 --- a/tests/benchmark/CMakeLists.txt +++ b/tests/benchmark/CMakeLists.txt @@ -39,7 +39,7 @@ endif() # Spaces benchmarks # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # -set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8) +set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8 sq8_sq8) foreach(data_type IN LISTS DATA_TYPE) add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp) target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark) diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index bc8db7535..feb28f129 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -20,6 +20,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then echo spaces_int8 echo spaces_uint8 echo spaces_sq8 + echo spaces_sq8_sq8 elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo basics_single_fp32 @@ -31,6 +32,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo spaces_int8 echo spaces_uint8 echo spaces_sq8 + echo spaces_sq8_sq8 # Basic benchmarks @@ -107,6 +109,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_int8 echo spaces_uint8 echo spaces_sq8 + echo spaces_sq8_sq8 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then echo spaces_fp32 @@ -122,4 +125,6 @@ elif [ "$BM_TYPE" = "bm-spaces-uint8" ] ; then echo spaces_uint8 elif [ "$BM_TYPE" = "bm-spaces-sq8" ] ; then echo spaces_sq8 +elif [ "$BM_TYPE" = "bm-spaces-sq8-sq8" ] ; then + echo spaces_sq8_sq8 fi diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index 8c74fe1ff..29a4319ad 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -96,56 +96,4 @@ INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, InnerProduct, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, Cosine, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, L2Sqr, 16); -/** - * SQ8-to-SQ8 benchmarks: Both vectors are uint8 quantized with dequantization applied to both. - */ -class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { -protected: - std::mt19937 rng; - size_t dim; - uint8_t *v1; - uint8_t *v2; - -public: - BM_VecSimSpaces_SQ8_SQ8() { rng.seed(47); } - ~BM_VecSimSpaces_SQ8_SQ8() = default; - - void SetUp(const ::benchmark::State &state) { - dim = state.range(0); - // Allocate both vectors with extra space for min, delta and inv_norm - v1 = new uint8_t[dim + sizeof(float) * 3]; - v2 = new uint8_t[dim + sizeof(float) * 3]; - test_utils::populate_float_vec_to_sq8(v1, dim, 123); - test_utils::populate_float_vec_to_sq8(v2, dim, 1234); - } - void TearDown(const ::benchmark::State &state) { - delete[] v1; - delete[] v2; - } -}; - -#ifdef CPU_FEATURES_ARCH_AARCH64 -// NEON SQ8-to-SQ8 functions -#ifdef OPT_NEON -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); -#endif // NEON -// SVE SQ8-to-SQ8 functions -#ifdef OPT_SVE -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); -#endif // SVE -#endif // AARCH64 - -#ifdef CPU_FEATURES_ARCH_X86_64 -// AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions -#ifdef OPT_AVX512_F_BW_VL_VNNI -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 16, - avx512_f_bw_vl_vnni_supported); -#endif // AVX512_F_BW_VL_VNNI -#endif // x86_64 - -// Naive SQ8-to-SQ8 algorithms -INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, InnerProduct, 16); -INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, Cosine, 16); - BENCHMARK_MAIN(); diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp new file mode 100644 index 000000000..ca888fff1 --- /dev/null +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#include "bm_spaces.h" +#include "utils/tests_utils.h" + +/** + * SQ8-to-SQ8 benchmarks: Both vectors are uint8 quantized with dequantization applied to both. + */ +class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { +protected: + std::mt19937 rng; + size_t dim; + uint8_t *v1; + uint8_t *v2; + +public: + BM_VecSimSpaces_SQ8_SQ8() { rng.seed(47); } + ~BM_VecSimSpaces_SQ8_SQ8() = default; + + void SetUp(const ::benchmark::State &state) { + dim = state.range(0); + // Allocate both vectors with extra space for min, delta and inv_norm + v1 = new uint8_t[dim + sizeof(float) * 3]; + v2 = new uint8_t[dim + sizeof(float) * 3]; + test_utils::populate_float_vec_to_sq8(v1, dim, 123); + test_utils::populate_float_vec_to_sq8(v2, dim, 1234); + } + void TearDown(const ::benchmark::State &state) { + delete[] v1; + delete[] v2; + } +}; + +#ifdef CPU_FEATURES_ARCH_AARCH64 +cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; + +// NEON SQ8-to-SQ8 functions +#ifdef OPT_NEON +bool neon_supported = opt.asimd; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); +#endif // NEON +// SVE SQ8-to-SQ8 functions +#ifdef OPT_SVE +bool sve_supported = opt.sve; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +#endif // SVE +// SVE2 SQ8-to-SQ8 functions +#ifdef OPT_SVE2 +bool sve2_supported = opt.sve2; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); +#endif // SVE2 +#endif // AARCH64 + +#ifdef CPU_FEATURES_ARCH_X86_64 +cpu_features::X86Features opt = cpu_features::GetX86Info().features; + +// AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions +#ifdef OPT_AVX512_F_BW_VL_VNNI +bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 16, + avx512_f_bw_vl_vnni_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 16, + avx512_f_bw_vl_vnni_supported); +#endif // AVX512_F_BW_VL_VNNI +#endif // x86_64 + +// Naive SQ8-to-SQ8 algorithms +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, InnerProduct, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, Cosine, 16); + +BENCHMARK_MAIN(); + From e0ce2688cd349e294f84dd238f4cb3dbd292cec1 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 10:55:02 +0200 Subject: [PATCH 03/51] Format --- src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 11 ++++++----- src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 5 ++--- src/VecSim/spaces/IP_space.cpp | 2 -- src/VecSim/spaces/functions/SVE2.cpp | 15 +++++++-------- src/VecSim/spaces/functions/SVE2.h | 1 - .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 1 - tests/unit/test_spaces.cpp | 5 ++--- 7 files changed, 17 insertions(+), 23 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index 23d191735..0398b4f61 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -22,10 +22,10 @@ // Helper function to perform inner product step for 4 elements with dual dequantization static inline void SQ8_SQ8_InnerProductStep_NEON(const uint8_t *&pVec1, const uint8_t *&pVec2, - float32x4_t &sum, const float32x4_t &min_val_vec1, - const float32x4_t &delta_vec1, - const float32x4_t &min_val_vec2, - const float32x4_t &delta_vec2) { + float32x4_t &sum, const float32x4_t &min_val_vec1, + const float32x4_t &delta_vec1, + const float32x4_t &min_val_vec2, + const float32x4_t &delta_vec2) { // Load 4 uint8 elements from pVec1 and convert to float uint8x8_t v1_u8 = vld1_u8(pVec1); uint32x4_t v1_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))); @@ -52,7 +52,8 @@ static inline void SQ8_SQ8_InnerProductStep_NEON(const uint8_t *&pVec1, const ui // Common implementation for inner product between two SQ8 vectors template // 0..15 -float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { +float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v, + size_t dimension) { const uint8_t *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index af49f8ce6..3993ab56e 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -162,7 +162,6 @@ float SQ8_SQ8_InnerProductSIMD_SVE(const void *pVec1v, const void *pVec2v, size_ // Returns 1 - inner_product (assumes vectors are pre-normalized) template float SQ8_SQ8_CosineSIMD_SVE(const void *pVec1v, const void *pVec2v, size_t dimension) { - float ip = SQ8_SQ8_InnerProductSIMD_SVE_IMP(pVec1v, pVec2v, - dimension); - return 1.0f - ip; + return 1.0f - SQ8_SQ8_InnerProductSIMD_SVE_IMP(pVec1v, pVec2v, + dimension); } diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index fc4ee8375..699c3be86 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -665,6 +665,4 @@ dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment return ret_dist_func; } - - } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index 0d24c3fc7..f6780e331 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -16,14 +16,13 @@ #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" -#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_SQ8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_SQ8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE - +#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE namespace spaces { diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index c2445bb2a..bf1a717c9 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -37,5 +37,4 @@ dist_func_t Choose_SQ8_L2_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim); - } // namespace spaces diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index ca888fff1..cf027b70d 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -78,4 +78,3 @@ INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, InnerProduct, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, Cosine, 16); BENCHMARK_MAIN(); - diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 641c2933d..cde00e3a3 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2488,9 +2488,8 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig, dim); std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig, dim); - float dist = - SQ8_SQ8_InnerProduct((const void *)v1_compressed.data(), (const void *)v2_compressed.data(), - dim); + float dist = SQ8_SQ8_InnerProduct((const void *)v1_compressed.data(), + (const void *)v2_compressed.data(), dim); // Since we're comparing identical normalized vectors, distance should be close to 0 ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_SQ8_InnerProduct failed to match expected distance"; From ab6b07749265b609682650dcfe809d5d2221a415 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 11:32:55 +0200 Subject: [PATCH 04/51] Orgnizing --- src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h | 4 ++-- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 4 ++-- src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h | 2 +- src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 2 +- tests/unit/test_spaces.cpp | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h index 7c4e6b0d4..007ee333e 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h @@ -85,10 +85,10 @@ float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t d // We dealt with the residual part. We are left with some multiple of 16 floats. // In each iteration we calculate 16 floats = 512 bits. - while (pVect1 < pEnd1) { + do { InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); - } + } while (pVect1 < pEnd1); return my_mm256_reduce_add_ps(sum256); } diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 1df263122..dd3276de8 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -79,10 +79,10 @@ float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t // We dealt with the residual part. We are left with some multiple of 16 floats. // In each iteration we calculate 16 floats = 512 bits. - while (pVect1 < pEnd1) { + do { InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); - } + } while (pVect1 < pEnd1); return my_mm256_reduce_add_ps(sum256); } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h index 679f967fc..a88a53252 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h @@ -133,4 +133,4 @@ float SQ8_SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pV // Return cosine similarity return 1.0f - ip; -} \ No newline at end of file +} diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index 0398b4f61..589be826c 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -155,4 +155,4 @@ float SQ8_SQ8_InnerProductSIMD16_NEON(const void *pVec1v, const void *pVec2v, si template // 0..15 float SQ8_SQ8_CosineSIMD16_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { return 1.0f - SQ8_SQ8_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); -} \ No newline at end of file +} diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index cde00e3a3..69bff9b31 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2480,7 +2480,7 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { v2_orig[i] = float(i + 1.5); } - // Normalize both vectors for IP test + // Normalize both vectors because it expects normalized vectors spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); @@ -2505,7 +2505,7 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { v2_orig[i] = float(i + 1.5); } - // Normalize both vectors for Cosine test + // Normalize both vectors because it expects normalized vectors spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); From 931e339589bb894312ef0bfd607d86782b648969 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 11:44:13 +0200 Subject: [PATCH 05/51] Add full sq8 bencharks --- src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h | 1 - src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 1 - tests/benchmark/benchmarks.sh | 3 +++ 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h index a88a53252..a9531677a 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h @@ -113,7 +113,6 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim } // SQ8-to-SQ8 Inner Product distance function -// Assumes both vectors are normalized. // Returns 1 - inner_product (distance form) template // 0..15 float SQ8_SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index 589be826c..f47e04d6b 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -142,7 +142,6 @@ float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v } // SQ8-to-SQ8 Inner Product distance function -// Assumes both vectors are normalized. // Returns 1 - inner_product (distance form) template // 0..15 float SQ8_SQ8_InnerProductSIMD16_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index feb28f129..00eaf47a0 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -99,6 +99,9 @@ elif [ "$BM_TYPE" = "bm-svs-train-fp16" ] ; then elif [ "$BM_TYPE" = "bm-basics-svs-fp32-single" ] ; then echo basics_svs_single_fp32 echo basics_svs_single_fp32_LVQ8 +elif [ "$BM_TYPE" = "bm-spaces-sq8-full" ] ; then + echo spaces_sq8 + echo spaces_sq8_sq8 # Spaces benchmarks elif [ "$BM_TYPE" = "bm-spaces" ] ; then From a56474d2f160fd407df7aad15c2a2db86587c5d0 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 12:05:58 +0200 Subject: [PATCH 06/51] Optimize the sq8 sq8 --- .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h | 119 ++++++---- .../spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h | 209 +++++++++++------- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 116 +++++----- src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 124 ++++++----- src/VecSim/spaces/IP_space.cpp | 14 +- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 4 +- .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 4 +- tests/unit/test_spaces.cpp | 2 +- 8 files changed, 353 insertions(+), 239 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index 481672504..f20f3e6c3 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -9,29 +9,41 @@ #pragma once #include "VecSim/spaces/space_includes.h" #include -#include -static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, - const __m512 &min_val_vec, const __m512 &delta_vec) { - // Load 16 float elements from pVec1 +/** + * SQ8 distance functions (float32 query vs uint8 stored) using AVX512. + * + * Uses algebraic optimization to reduce operations per element: + * + * IP = Σ query[i] * (val[i] * δ + min) + * = δ * Σ(query[i] * val[i]) + min * Σ(query[i]) + * + * This saves one FMA per 16 elements by separating: + * - dot_sum: accumulates query[i] * val[i] + * - query_sum: accumulates query[i] + * Then combines at the end: result = δ * dot_sum + min * query_sum + * + * Also uses multiple accumulators for better instruction-level parallelism. + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + */ + +// Process 16 elements with algebraic optimization +static inline void SQ8_InnerProductStep(const float *pVec1, const uint8_t *pVec2, + __m512 &dot_sum, __m512 &query_sum) { + // Load 16 float elements from query __m512 v1 = _mm512_loadu_ps(pVec1); - // Load 16 uint8 elements from pVec2 and convert to __m512i - __m128i v2_128 = _mm_loadu_si128((__m128i *)pVec2); + // Load 16 uint8 elements and convert to float + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - - // Convert uint8 to float __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - // Dequantize: (val * delta) + min_val - __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + // Accumulate query * val (without dequantization) + dot_sum = _mm512_fmadd_ps(v1, v2_f, dot_sum); - // Compute dot product and add to sum - sum = _mm512_fmadd_ps(v1, dequantized, sum); - - // Advance pointers - pVec1 += 16; - pVec2 += 16; + // Accumulate query sum + query_sum = _mm512_add_ps(query_sum, v1); } // Common implementation for both inner product and cosine similarity @@ -39,52 +51,83 @@ template // 0..15 float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { const float *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); - const float *pEnd1 = pVec1 + dimension; // Get dequantization parameters from the end of pVec2 const float min_val = *reinterpret_cast(pVec2 + dimension); const float delta = *reinterpret_cast(pVec2 + dimension + sizeof(float)); - // Create broadcast vectors for SIMD operations - __m512 min_val_vec = _mm512_set1_ps(min_val); - __m512 delta_vec = _mm512_set1_ps(delta); + // Multiple accumulators for instruction-level parallelism + __m512 dot_sum0 = _mm512_setzero_ps(); + __m512 dot_sum1 = _mm512_setzero_ps(); + __m512 dot_sum2 = _mm512_setzero_ps(); + __m512 dot_sum3 = _mm512_setzero_ps(); + __m512 query_sum0 = _mm512_setzero_ps(); + __m512 query_sum1 = _mm512_setzero_ps(); + __m512 query_sum2 = _mm512_setzero_ps(); + __m512 query_sum3 = _mm512_setzero_ps(); - // Initialize sum accumulator - __m512 sum = _mm512_setzero_ps(); + size_t offset = 0; // Deal with remainder first if constexpr (residual > 0) { // Handle less than 16 elements __mmask16 mask = (1U << residual) - 1; - // Load masked float elements + // Load masked float elements from query __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); - // Load full uint8 elements - we know that the first 16 elements are safe to load + // Load uint8 elements and convert to float __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - // Dequantize - __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + // Masked accumulation (mask already zeroed unused elements in v1) + dot_sum0 = _mm512_mul_ps(v1, v2_f); + query_sum0 = v1; + + offset = residual; + } - // Compute dot product - __m512 product = _mm512_mul_ps(v1, dequantized); + // Calculate number of full 64-element chunks (4 x 16) + size_t num_chunks = (dimension - residual) / 64; - // Apply mask to product and add to sum - sum = _mm512_fmadd_ps(sum, sum, product); + // Process 4 chunks at a time for maximum ILP + for (size_t i = 0; i < num_chunks; i++) { + SQ8_InnerProductStep(pVec1 + offset, pVec2 + offset, dot_sum0, query_sum0); + SQ8_InnerProductStep(pVec1 + offset + 16, pVec2 + offset + 16, dot_sum1, query_sum1); + SQ8_InnerProductStep(pVec1 + offset + 32, pVec2 + offset + 32, dot_sum2, query_sum2); + SQ8_InnerProductStep(pVec1 + offset + 48, pVec2 + offset + 48, dot_sum3, query_sum3); + offset += 64; + } - pVec1 += residual; - pVec2 += residual; + // Handle remaining 16-element chunks (0-3 remaining) + size_t remaining = (dimension - residual) % 64; + if (remaining >= 16) { + SQ8_InnerProductStep(pVec1 + offset, pVec2 + offset, dot_sum0, query_sum0); + offset += 16; + remaining -= 16; } + if (remaining >= 16) { + SQ8_InnerProductStep(pVec1 + offset, pVec2 + offset, dot_sum1, query_sum1); + offset += 16; + remaining -= 16; + } + if (remaining >= 16) { + SQ8_InnerProductStep(pVec1 + offset, pVec2 + offset, dot_sum2, query_sum2); + } + + // Combine accumulators + __m512 dot_total = _mm512_add_ps(_mm512_add_ps(dot_sum0, dot_sum1), + _mm512_add_ps(dot_sum2, dot_sum3)); + __m512 query_total = _mm512_add_ps(_mm512_add_ps(query_sum0, query_sum1), + _mm512_add_ps(query_sum2, query_sum3)); - // Process remaining full chunks of 16 elements - do { - SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - } while (pVec1 < pEnd1); + // Reduce to scalar + float dot_product = _mm512_reduce_add_ps(dot_total); + float query_sum = _mm512_reduce_add_ps(query_total); - // Return the raw inner product result - return _mm512_reduce_add_ps(sum); + // Apply algebraic formula: IP = δ * Σ(query*val) + min * Σ(query) + return delta * dot_product + min_val * query_sum; } template // 0..15 diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h index a9531677a..0cc9fe8df 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h @@ -11,111 +11,166 @@ #include /** - * SQ8-to-SQ8 distance functions. + * SQ8-to-SQ8 distance functions using AVX512 VNNI. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, - * where BOTH vectors are uint8 quantized and dequantization is applied to both - * during computation. + * where BOTH vectors are uint8 quantized. + * + * Uses algebraic optimization to leverage integer VNNI instructions: + * + * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) + * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * + * This allows using VNNI's _mm512_dpwssd_epi32 for efficient integer dot product, + * then applying scalar corrections at the end. * * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] - * Dequantization formula: dequantized_value = quantized_value * delta + min_val */ -// Helper function to perform inner product step for 16 elements with dual dequantization -static inline void SQ8_SQ8_InnerProductStep(const uint8_t *&pVec1, const uint8_t *&pVec2, - __m512 &sum, const __m512 &min_val_vec1, - const __m512 &delta_vec1, const __m512 &min_val_vec2, - const __m512 &delta_vec2) { - // Load 16 uint8 elements from pVec1 and convert to float - __m128i v1_128 = _mm_loadu_si128(reinterpret_cast(pVec1)); - __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128); - __m512 v1_f = _mm512_cvtepi32_ps(v1_512); - - // Dequantize v1: (val * delta1) + min_val1 - __m512 v1_dequant = _mm512_fmadd_ps(v1_f, delta_vec1, min_val_vec1); - - // Load 16 uint8 elements from pVec2 and convert to float - __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); - __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - - // Dequantize v2: (val * delta2) + min_val2 - __m512 v2_dequant = _mm512_fmadd_ps(v2_f, delta_vec2, min_val_vec2); - - // Compute dot product and add to sum: sum += v1_dequant * v2_dequant - sum = _mm512_fmadd_ps(v1_dequant, v2_dequant, sum); - - // Advance pointers - pVec1 += 16; - pVec2 += 16; +// Process 64 uint8 elements using VNNI with multiple accumulators for ILP +static inline void SQ8_SQ8_InnerProductStep64(const uint8_t *pVec1, const uint8_t *pVec2, + __m512i &dot_acc0, __m512i &dot_acc1, + __m512i &sum1_acc, __m512i &sum2_acc) { + // Load 64 bytes from each vector + __m512i v1_full = _mm512_loadu_si512(reinterpret_cast(pVec1)); + __m512i v2_full = _mm512_loadu_si512(reinterpret_cast(pVec2)); + + // Extract lower and upper 256-bit halves + __m256i v1_lo = _mm512_castsi512_si256(v1_full); + __m256i v1_hi = _mm512_extracti64x4_epi64(v1_full, 1); + __m256i v2_lo = _mm512_castsi512_si256(v2_full); + __m256i v2_hi = _mm512_extracti64x4_epi64(v2_full, 1); + + // Convert to int16 (zero-extend) and compute dot products using VNNI + // dpwssd: multiply pairs of int16, sum pairs to int32, accumulate + dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), + _mm512_cvtepu8_epi16(v2_lo)); + dot_acc1 = _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), + _mm512_cvtepu8_epi16(v2_hi)); + + // Sum of elements using SAD with zero (sums bytes in groups of 8 -> 8x 64-bit results) + __m512i zero = _mm512_setzero_si512(); + sum1_acc = _mm512_add_epi64(sum1_acc, _mm512_sad_epu8(v1_full, zero)); + sum2_acc = _mm512_add_epi64(sum2_acc, _mm512_sad_epu8(v2_full, zero)); +} + +// Process 32 uint8 elements using VNNI +static inline void SQ8_SQ8_InnerProductStep32(const uint8_t *pVec1, const uint8_t *pVec2, + __m512i &dot_acc, __m512i &sum1_acc, + __m512i &sum2_acc) { + // Load 32 bytes from each vector + __m256i v1_256 = _mm256_loadu_si256(reinterpret_cast(pVec1)); + __m256i v2_256 = _mm256_loadu_si256(reinterpret_cast(pVec2)); + + // Convert to int16 (zero-extend) and compute dot product using VNNI + dot_acc = _mm512_dpwssd_epi32(dot_acc, _mm512_cvtepu8_epi16(v1_256), + _mm512_cvtepu8_epi16(v2_256)); + + // Sum of elements - extend to 512-bit and use SAD + // Use zextsi256_si512 to properly zero the upper half + __m512i v1_full = _mm512_zextsi256_si512(v1_256); + __m512i v2_full = _mm512_zextsi256_si512(v2_256); + __m512i zero = _mm512_setzero_si512(); + sum1_acc = _mm512_add_epi64(sum1_acc, _mm512_sad_epu8(v1_full, zero)); + sum2_acc = _mm512_add_epi64(sum2_acc, _mm512_sad_epu8(v2_full, zero)); } // Common implementation for inner product between two SQ8 vectors -template // 0..15 +template // 0..63 float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { const uint8_t *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); const uint8_t *pEnd1 = pVec1 + dimension; // Get dequantization parameters from the end of pVec1 - const float min_val1 = *reinterpret_cast(pVec1 + dimension); + const float min1 = *reinterpret_cast(pVec1 + dimension); const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); // Get dequantization parameters from the end of pVec2 - const float min_val2 = *reinterpret_cast(pVec2 + dimension); + const float min2 = *reinterpret_cast(pVec2 + dimension); const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); - // Create broadcast vectors for SIMD operations - __m512 min_val_vec1 = _mm512_set1_ps(min_val1); - __m512 delta_vec1 = _mm512_set1_ps(delta1); - __m512 min_val_vec2 = _mm512_set1_ps(min_val2); - __m512 delta_vec2 = _mm512_set1_ps(delta2); - - // Initialize sum accumulator - __m512 sum = _mm512_setzero_ps(); + // Multiple accumulators for instruction-level parallelism + __m512i dot_acc0 = _mm512_setzero_si512(); + __m512i dot_acc1 = _mm512_setzero_si512(); + __m512i sum1_acc = _mm512_setzero_si512(); // Sum of v1 elements + __m512i sum2_acc = _mm512_setzero_si512(); // Sum of v2 elements - // Deal with remainder first + // Handle residual first (0..63 elements) if constexpr (residual > 0) { - // Handle less than 16 elements - __mmask16 mask = (1U << residual) - 1; - - // Load and convert v1 elements (safe to load 16 elements, masked later) - __m128i v1_128 = _mm_loadu_si128(reinterpret_cast(pVec1)); - __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128); - __m512 v1_f = _mm512_cvtepi32_ps(v1_512); - - // Dequantize v1 - __m512 v1_dequant = _mm512_fmadd_ps(v1_f, delta_vec1, min_val_vec1); - - // Load and convert v2 elements - __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); - __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - - // Dequantize v2 - __m512 v2_dequant = _mm512_fmadd_ps(v2_f, delta_vec2, min_val_vec2); - - // Compute masked dot product - __m512 product = _mm512_mul_ps(v1_dequant, v2_dequant); - sum = _mm512_maskz_mov_ps(mask, product); - + if constexpr (residual < 32) { + // Handle less than 32 elements with mask + constexpr __mmask32 mask = (1LU << residual) - 1; + __m256i v1_256 = _mm256_maskz_loadu_epi8(mask, pVec1); + __m256i v2_256 = _mm256_maskz_loadu_epi8(mask, pVec2); + + // Convert to int16 and compute dot product + dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_256), + _mm512_cvtepu8_epi16(v2_256)); + + // Sum using SAD (masked load already zeroed unused bytes) + __m512i v1_full = _mm512_zextsi256_si512(v1_256); + __m512i v2_full = _mm512_zextsi256_si512(v2_256); + __m512i zero = _mm512_setzero_si512(); + sum1_acc = _mm512_sad_epu8(v1_full, zero); + sum2_acc = _mm512_sad_epu8(v2_full, zero); + } else if constexpr (residual == 32) { + // Exactly 32 elements + SQ8_SQ8_InnerProductStep32(pVec1, pVec2, dot_acc0, sum1_acc, sum2_acc); + } else { + // 33-63 elements: use masked 64-byte load + constexpr __mmask64 mask = (1LLU << residual) - 1; + __m512i v1_full = _mm512_maskz_loadu_epi8(mask, pVec1); + __m512i v2_full = _mm512_maskz_loadu_epi8(mask, pVec2); + + // Extract halves and compute dot products + __m256i v1_lo = _mm512_castsi512_si256(v1_full); + __m256i v1_hi = _mm512_extracti64x4_epi64(v1_full, 1); + __m256i v2_lo = _mm512_castsi512_si256(v2_full); + __m256i v2_hi = _mm512_extracti64x4_epi64(v2_full, 1); + + dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), + _mm512_cvtepu8_epi16(v2_lo)); + dot_acc1 = _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), + _mm512_cvtepu8_epi16(v2_hi)); + + // Sum using SAD (masked load already zeroed unused bytes) + __m512i zero = _mm512_setzero_si512(); + sum1_acc = _mm512_sad_epu8(v1_full, zero); + sum2_acc = _mm512_sad_epu8(v2_full, zero); + } pVec1 += residual; pVec2 += residual; } - // Process remaining full chunks of 16 elements - do { - SQ8_SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec1, delta_vec1, min_val_vec2, - delta_vec2); - } while (pVec1 < pEnd1); + // Process full 64-byte chunks + while (pVec1 < pEnd1) { + SQ8_SQ8_InnerProductStep64(pVec1, pVec2, dot_acc0, dot_acc1, sum1_acc, sum2_acc); + pVec1 += 64; + pVec2 += 64; + } + + // Combine dot product accumulators and reduce + __m512i dot_total = _mm512_add_epi32(dot_acc0, dot_acc1); + int64_t dot_product = _mm512_reduce_add_epi32(dot_total); + + // Reduce sum accumulators (SAD produces 8 x 64-bit sums) + int64_t sum_v1 = _mm512_reduce_add_epi64(sum1_acc); + int64_t sum_v2 = _mm512_reduce_add_epi64(sum2_acc); + + // Apply the algebraic formula: + // IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + float result = delta1 * delta2 * static_cast(dot_product) + + delta1 * min2 * static_cast(sum_v1) + + delta2 * min1 * static_cast(sum_v2) + + static_cast(dimension) * min1 * min2; - // Horizontal sum and return - return _mm512_reduce_add_ps(sum); + return result; } // SQ8-to-SQ8 Inner Product distance function // Returns 1 - inner_product (distance form) -template // 0..15 -float SQ8_SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { float ip = SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, dimension); return 1.0f - ip; @@ -124,8 +179,8 @@ float SQ8_SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const vo // SQ8-to-SQ8 Cosine distance function // Assumes both vectors are normalized. // Returns 1 - (inner_product) -template // 0..15 -float SQ8_SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, +template // 0..63 +float SQ8_SQ8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // Calculate inner product float ip = SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, dimension); diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 7b9bd86bc..9d0afdad4 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -8,31 +8,38 @@ */ #include "VecSim/spaces/space_includes.h" #include -#include -#include -static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, - svfloat32_t &sum, const svfloat32_t &min_val_vec, - const svfloat32_t &delta_vec, const size_t chunk) { +/** + * SQ8 distance functions (float32 query vs uint8 stored) for SVE. + * + * Uses algebraic optimization to reduce operations per element: + * + * IP = Σ query[i] * (val[i] * δ + min) + * = δ * Σ(query[i] * val[i]) + min * Σ(query[i]) + * + * This saves 1 FMA per chunk by deferring dequantization to scalar math at the end. + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + */ + +// Helper function to perform inner product step with algebraic optimization +static inline void InnerProductStep(const float *pVect1, const uint8_t *pVect2, size_t offset, + svfloat32_t &dot_sum, svfloat32_t &query_sum, + const size_t chunk) { svbool_t pg = svptrue_b32(); - // Load float elements from pVect1 + // Load float elements from query svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); - // Convert uint8 to uint32 - svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // LD1UB: loa - - // Convert uint32 to float32 + // Load uint8 elements and convert to float + svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32); - // Dequantize: (val * delta) + min_val - svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec); + // Accumulate query * val (without dequantization) + dot_sum = svmla_f32_x(pg, dot_sum, v1, v2_f); - // Compute dot product and add to sum - sum = svmla_f32_x(pg, sum, v1, v2_dequant); - - // Move to the next set of elements - offset += chunk; + // Accumulate query sum + query_sum = svadd_f32_x(pg, query_sum, v1); } template @@ -42,22 +49,25 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz size_t offset = 0; // Get dequantization parameters from the end of quantized vector - float min = *(float *)(pVect2 + dimension); - float delta = *(float *)(pVect2 + dimension + sizeof(float)); + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - // Create broadcast vectors for SIMD operations svbool_t pg = svptrue_b32(); - svfloat32_t min_val_vec = svdup_f32(min); - svfloat32_t delta_vec = svdup_f32(delta); // Get the number of 32-bit elements per vector at runtime uint64_t chunk = svcntw(); - // Multiple accumulators to increase instruction-level parallelism - svfloat32_t sum0 = svdup_f32(0.0f); - svfloat32_t sum1 = svdup_f32(0.0f); - svfloat32_t sum2 = svdup_f32(0.0f); - svfloat32_t sum3 = svdup_f32(0.0f); + // Multiple accumulators for instruction-level parallelism + // dot_sum: accumulates query[i] * val[i] + // query_sum: accumulates query[i] + svfloat32_t dot_sum0 = svdup_f32(0.0f); + svfloat32_t dot_sum1 = svdup_f32(0.0f); + svfloat32_t dot_sum2 = svdup_f32(0.0f); + svfloat32_t dot_sum3 = svdup_f32(0.0f); + svfloat32_t query_sum0 = svdup_f32(0.0f); + svfloat32_t query_sum1 = svdup_f32(0.0f); + svfloat32_t query_sum2 = svdup_f32(0.0f); + svfloat32_t query_sum3 = svdup_f32(0.0f); // Handle partial chunk if needed if constexpr (partial_chunk) { @@ -67,24 +77,20 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz svbool_t pg_partial = svwhilelt_b32(static_cast(0), static_cast(remaining)); - // Load float elements from pVect1 with predicate + // Load query float elements with predicate svfloat32_t v1 = svld1_f32(pg_partial, pVect1); - // load 8-bit bytes from pVect2+offset and zero-extend each into a 32-bit lane - svuint32_t v2_u32 = svld1ub_u32( - pg_partial, pVect2 + offset); // LD1UB: load 8-bit, zero-extend to 32-bit - // :contentReference[oaicite:0]{index=0} - - // Convert uint32 to float32 + // Load uint8 elements and convert to float + svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset); svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); - // Dequantize: (val * delta) + min_val - svfloat32_t v2_dequant = svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec); + // Accumulate dot product (no dequantization) + dot_sum0 = svmla_f32_z(pg_partial, dot_sum0, v1, v2_f); - // Compute dot product and add to sum - sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant); + // Accumulate query sum + query_sum0 = svadd_f32_z(pg_partial, query_sum0, v1); - // Move pointers past the partial chunk + // Move past the partial chunk offset += remaining; } } @@ -95,32 +101,38 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); - InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); - InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); - InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, dot_sum0, query_sum0, chunk); + InnerProductStep(pVect1, pVect2, offset + chunk, dot_sum1, query_sum1, chunk); + InnerProductStep(pVect1, pVect2, offset + 2 * chunk, dot_sum2, query_sum2, chunk); + InnerProductStep(pVect1, pVect2, offset + 3 * chunk, dot_sum3, query_sum3, chunk); + offset += chunk_size; } // Handle remaining steps (0-3) if constexpr (additional_steps > 0) { - InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, dot_sum0, query_sum0, chunk); + offset += chunk; } if constexpr (additional_steps > 1) { - InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, dot_sum1, query_sum1, chunk); + offset += chunk; } if constexpr (additional_steps > 2) { - InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, dot_sum2, query_sum2, chunk); } // Combine the accumulators - svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); - sum = svadd_f32_z(pg, sum, sum2); - sum = svadd_f32_z(pg, sum, sum3); + svfloat32_t dot_total = svadd_f32_x(pg, svadd_f32_x(pg, dot_sum0, dot_sum1), + svadd_f32_x(pg, dot_sum2, dot_sum3)); + svfloat32_t query_total = svadd_f32_x(pg, svadd_f32_x(pg, query_sum0, query_sum1), + svadd_f32_x(pg, query_sum2, query_sum3)); - // Horizontal sum of all elements in the vector - float result = svaddv_f32(pg, sum); + // Horizontal sum of all elements + float dot_product = svaddv_f32(pg, dot_total); + float query_sum = svaddv_f32(pg, query_total); - return result; + // Apply algebraic formula: IP = δ * Σ(query*val) + min * Σ(query) + return delta * dot_product + min_val * query_sum; } template diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index 3993ab56e..fdff78cd2 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -13,41 +13,39 @@ /** * SQ8-to-SQ8 distance functions for SVE. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, - * where BOTH vectors are uint8 quantized and dequantization is applied to both - * during computation. + * where BOTH vectors are uint8 quantized. + * + * Uses algebraic optimization to reduce operations per element: + * + * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) + * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * + * This saves 2 FMAs per chunk by deferring dequantization to scalar math at the end. * * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] - * Dequantization formula: dequantized_value = quantized_value * delta + min_val */ -// Helper function to perform inner product step for one chunk with dual dequantization -static inline void SQ8_SQ8_InnerProductStep_SVE(const uint8_t *&pVec1, const uint8_t *&pVec2, - size_t &offset, svfloat32_t &sum, - const svfloat32_t &min_val_vec1, - const svfloat32_t &delta_vec1, - const svfloat32_t &min_val_vec2, - const svfloat32_t &delta_vec2, const size_t chunk) { +// Helper function to perform inner product step with algebraic optimization +static inline void SQ8_SQ8_InnerProductStep_SVE(const uint8_t *pVec1, const uint8_t *pVec2, + size_t offset, svfloat32_t &dot_sum, + svfloat32_t &sum1, svfloat32_t &sum2, + const size_t chunk) { svbool_t pg = svptrue_b32(); // Load uint8 elements from pVec1 and convert to float svuint32_t v1_u32 = svld1ub_u32(pg, pVec1 + offset); svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); - // Dequantize v1: (val * delta1) + min_val1 - svfloat32_t v1_dequant = svmla_f32_x(pg, min_val_vec1, v1_f, delta_vec1); - // Load uint8 elements from pVec2 and convert to float svuint32_t v2_u32 = svld1ub_u32(pg, pVec2 + offset); svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32); - // Dequantize v2: (val * delta2) + min_val2 - svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec2, v2_f, delta_vec2); + // Accumulate dot product: dot_sum += v1 * v2 (no dequantization) + dot_sum = svmla_f32_x(pg, dot_sum, v1_f, v2_f); - // Compute dot product and add to sum: sum += v1_dequant * v2_dequant - sum = svmla_f32_x(pg, sum, v1_dequant, v2_dequant); - - // Move to the next set of elements - offset += chunk; + // Accumulate element sums + sum1 = svadd_f32_x(pg, sum1, v1_f); + sum2 = svadd_f32_x(pg, sum2, v2_f); } // Common implementation for inner product between two SQ8 vectors @@ -65,21 +63,27 @@ float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, s const float min2 = *reinterpret_cast(pVec2 + dimension); const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); - // Create broadcast vectors for SIMD operations svbool_t pg = svptrue_b32(); - svfloat32_t min_val_vec1 = svdup_f32(min1); - svfloat32_t delta_vec1 = svdup_f32(delta1); - svfloat32_t min_val_vec2 = svdup_f32(min2); - svfloat32_t delta_vec2 = svdup_f32(delta2); // Get the number of 32-bit elements per vector at runtime uint64_t chunk = svcntw(); - // Multiple accumulators to increase instruction-level parallelism - svfloat32_t sum0 = svdup_f32(0.0f); - svfloat32_t sum1 = svdup_f32(0.0f); - svfloat32_t sum2 = svdup_f32(0.0f); - svfloat32_t sum3 = svdup_f32(0.0f); + // Multiple accumulators for instruction-level parallelism + // dot_sum: accumulates v1[i] * v2[i] + // sum1: accumulates v1[i] + // sum2: accumulates v2[i] + svfloat32_t dot_sum0 = svdup_f32(0.0f); + svfloat32_t dot_sum1 = svdup_f32(0.0f); + svfloat32_t dot_sum2 = svdup_f32(0.0f); + svfloat32_t dot_sum3 = svdup_f32(0.0f); + svfloat32_t sum1_0 = svdup_f32(0.0f); + svfloat32_t sum1_1 = svdup_f32(0.0f); + svfloat32_t sum1_2 = svdup_f32(0.0f); + svfloat32_t sum1_3 = svdup_f32(0.0f); + svfloat32_t sum2_0 = svdup_f32(0.0f); + svfloat32_t sum2_1 = svdup_f32(0.0f); + svfloat32_t sum2_2 = svdup_f32(0.0f); + svfloat32_t sum2_3 = svdup_f32(0.0f); // Handle partial chunk if needed if constexpr (partial_chunk) { @@ -93,18 +97,16 @@ float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, s svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVec1 + offset); svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); - // Dequantize v1 - svfloat32_t v1_dequant = svmla_f32_z(pg_partial, min_val_vec1, v1_f, delta_vec1); - // Load and convert v2 elements svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVec2 + offset); svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); - // Dequantize v2 - svfloat32_t v2_dequant = svmla_f32_z(pg_partial, min_val_vec2, v2_f, delta_vec2); + // Accumulate dot product (no dequantization) + dot_sum0 = svmla_f32_z(pg_partial, dot_sum0, v1_f, v2_f); - // Compute dot product and add to sum - sum0 = svmla_f32_z(pg_partial, sum0, v1_dequant, v2_dequant); + // Accumulate element sums + sum1_0 = svadd_f32_z(pg_partial, sum1_0, v1_f); + sum2_0 = svadd_f32_z(pg_partial, sum2_0, v2_f); // Move past the partial chunk offset += remaining; @@ -117,37 +119,45 @@ float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, s (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum0, min_val_vec1, delta_vec1, - min_val_vec2, delta_vec2, chunk); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum1, min_val_vec1, delta_vec1, - min_val_vec2, delta_vec2, chunk); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum2, min_val_vec1, delta_vec1, - min_val_vec2, delta_vec2, chunk); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum3, min_val_vec1, delta_vec1, - min_val_vec2, delta_vec2, chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, sum1_0, sum2_0, chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset + chunk, dot_sum1, sum1_1, sum2_1, chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset + 2 * chunk, dot_sum2, sum1_2, sum2_2, + chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset + 3 * chunk, dot_sum3, sum1_3, sum2_3, + chunk); + offset += chunk_size; } // Handle remaining steps (0-3) if constexpr (additional_steps > 0) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum0, min_val_vec1, delta_vec1, - min_val_vec2, delta_vec2, chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, sum1_0, sum2_0, chunk); + offset += chunk; } if constexpr (additional_steps > 1) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum1, min_val_vec1, delta_vec1, - min_val_vec2, delta_vec2, chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, sum1_1, sum2_1, chunk); + offset += chunk; } if constexpr (additional_steps > 2) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, sum2, min_val_vec1, delta_vec1, - min_val_vec2, delta_vec2, chunk); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, sum1_2, sum2_2, chunk); } // Combine the accumulators - svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); - sum = svadd_f32_z(pg, sum, sum2); - sum = svadd_f32_z(pg, sum, sum3); - - // Horizontal sum of all elements in the vector - return svaddv_f32(pg, sum); + svfloat32_t dot_total = svadd_f32_x(pg, svadd_f32_x(pg, dot_sum0, dot_sum1), + svadd_f32_x(pg, dot_sum2, dot_sum3)); + svfloat32_t sum1_total = svadd_f32_x(pg, svadd_f32_x(pg, sum1_0, sum1_1), + svadd_f32_x(pg, sum1_2, sum1_3)); + svfloat32_t sum2_total = svadd_f32_x(pg, svadd_f32_x(pg, sum2_0, sum2_1), + svadd_f32_x(pg, sum2_2, sum2_3)); + + // Horizontal sum of all elements + float dot_product = svaddv_f32(pg, dot_total); + float v1_sum = svaddv_f32(pg, sum1_total); + float v2_sum = svaddv_f32(pg, sum2_total); + + // Apply algebraic formula: + // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 + return delta1 * delta2 * dot_product + delta1 * min2 * v1_sum + delta2 * min1 * v2_sum + + static_cast(dimension) * min1 * min2; } // SQ8-to-SQ8 Inner Product distance function diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 699c3be86..9214bba54 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -175,12 +175,9 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, #endif // AARCH64 #ifdef CPU_FEATURES_ARCH_X86_64 - // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. - if (dim < 16) { - return ret_dist_func; - } #ifdef OPT_AVX512_F_BW_VL_VNNI - if (features.avx512f && features.avx512bw && features.avx512vnni) { + // AVX512 VNNI SQ8_SQ8 uses 64-element chunks + if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { return Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); } #endif @@ -213,12 +210,9 @@ dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignme #endif // AARCH64 #ifdef CPU_FEATURES_ARCH_X86_64 - // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. - if (dim < 16) { - return ret_dist_func; - } #ifdef OPT_AVX512_F_BW_VL_VNNI - if (features.avx512f && features.avx512bw && features.avx512vnni) { + // AVX512 VNNI SQ8_SQ8 uses 64-element chunks + if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { return Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); } #endif diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 22086a971..c8988daf2 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -77,13 +77,13 @@ dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI); return ret_dist_func; } dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_CosineSIMD64_AVX512F_BW_VL_VNNI); return ret_dist_func; } diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index cf027b70d..0673dac18 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -66,9 +66,9 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions #ifdef OPT_AVX512_F_BW_VL_VNNI bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 16, +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 16, +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); #endif // AVX512_F_BW_VL_VNNI #endif // x86_64 diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 69bff9b31..7681d5013 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2681,4 +2681,4 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { } INSTANTIATE_TEST_SUITE_P(SQ8_SQ8OptFuncs, SQ8_SQ8_SpacesOptimizationTest, - testing::Range(16UL, 16 * 2UL + 1)); + testing::Range(64UL, 64 * 2UL + 1)); From a25f45cf879e6b2baa0c03d6b3bb789237812bc1 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 12:32:11 +0200 Subject: [PATCH 07/51] Optimize SQ8 distance functions for NEON by reducing operations and improving performance --- src/VecSim/spaces/IP/IP_NEON_SQ8.h | 109 ++++++++++-------- src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 148 ++++++++++++++----------- src/VecSim/spaces/IP_space.cpp | 5 +- 3 files changed, 153 insertions(+), 109 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index 3e632dcdb..74fc445d5 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -9,27 +9,37 @@ #include "VecSim/spaces/space_includes.h" #include -static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum, - const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { - // Load 4 float elements from pVect1 +/** + * SQ8 distance functions (float32 query vs uint8 stored) for NEON. + * + * Uses algebraic optimization to reduce operations per element: + * + * IP = Σ query[i] * (val[i] * δ + min) + * = δ * Σ(query[i] * val[i]) + min * Σ(query[i]) + * + * This saves 1 FMA per 4-element step by deferring dequantization to scalar math at the end. + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + */ + +// Helper function with algebraic optimization +static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, + float32x4_t &dot_sum, float32x4_t &query_sum) { + // Load 4 float elements from query float32x4_t v1 = vld1q_f32(pVect1); pVect1 += 4; - // Load 4 uint8 elements from pVect2 + // Load 4 uint8 elements and convert to float uint8x8_t v2_u8 = vld1_u8(pVect2); pVect2 += 4; - - // Convert uint8 to uint32 uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); - - // Convert uint32 to float32 float32x4_t v2_f = vcvtq_f32_u32(v2_u32); - // Dequantize: (val * delta) + min_val - float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec); + // Accumulate query * val (without dequantization) + dot_sum = vmlaq_f32(dot_sum, v1, v2_f); - // Compute dot product and add to sum - sum = vmlaq_f32(sum, v1, v2_dequant); + // Accumulate query sum + query_sum = vaddq_f32(query_sum, v1); } template // 0..15 @@ -41,70 +51,81 @@ float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - // Create broadcast vectors for SIMD operations - float32x4_t min_val_vec = vdupq_n_f32(min_val); - float32x4_t delta_vec = vdupq_n_f32(delta); - - float32x4_t sum0 = vdupq_n_f32(0.0f); - float32x4_t sum1 = vdupq_n_f32(0.0f); - float32x4_t sum2 = vdupq_n_f32(0.0f); - float32x4_t sum3 = vdupq_n_f32(0.0f); + // Multiple accumulators for instruction-level parallelism + // dot_sum: accumulates query[i] * val[i] + // query_sum: accumulates query[i] + float32x4_t dot_sum0 = vdupq_n_f32(0.0f); + float32x4_t dot_sum1 = vdupq_n_f32(0.0f); + float32x4_t dot_sum2 = vdupq_n_f32(0.0f); + float32x4_t dot_sum3 = vdupq_n_f32(0.0f); + float32x4_t query_sum0 = vdupq_n_f32(0.0f); + float32x4_t query_sum1 = vdupq_n_f32(0.0f); + float32x4_t query_sum2 = vdupq_n_f32(0.0f); + float32x4_t query_sum3 = vdupq_n_f32(0.0f); const size_t num_of_chunks = dimension / 16; // Process 16 elements at a time in the main loop for (size_t i = 0; i < num_of_chunks; i++) { - InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); - InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); - InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); - InnerProductStep(pVect1, pVect2, sum3, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, dot_sum0, query_sum0); + InnerProductStep(pVect1, pVect2, dot_sum1, query_sum1); + InnerProductStep(pVect1, pVect2, dot_sum2, query_sum2); + InnerProductStep(pVect1, pVect2, dot_sum3, query_sum3); } - // Handle remaining complete 4-float blocks within residual + // Handle remaining complete 4-element blocks within residual if constexpr (residual >= 4) { - InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, dot_sum0, query_sum0); } if constexpr (residual >= 8) { - InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, dot_sum1, query_sum1); } if constexpr (residual >= 12) { - InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, dot_sum2, query_sum2); } - // Handle final residual elements (0-3 elements) + // Handle final residual elements (0-3 elements) with scalar math constexpr size_t final_residual = residual % 4; if constexpr (final_residual > 0) { float32x4_t v1 = vdupq_n_f32(0.0f); - float32x4_t v2_dequant = vdupq_n_f32(0.0f); + float32x4_t v2_f = vdupq_n_f32(0.0f); if constexpr (final_residual >= 1) { v1 = vld1q_lane_f32(pVect1, v1, 0); - float dequant0 = pVect2[0] * delta + min_val; - v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0); + float val0 = static_cast(pVect2[0]); + v2_f = vld1q_lane_f32(&val0, v2_f, 0); } if constexpr (final_residual >= 2) { v1 = vld1q_lane_f32(pVect1 + 1, v1, 1); - float dequant1 = pVect2[1] * delta + min_val; - v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1); + float val1 = static_cast(pVect2[1]); + v2_f = vld1q_lane_f32(&val1, v2_f, 1); } if constexpr (final_residual >= 3) { v1 = vld1q_lane_f32(pVect1 + 2, v1, 2); - float dequant2 = pVect2[2] * delta + min_val; - v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2); + float val2 = static_cast(pVect2[2]); + v2_f = vld1q_lane_f32(&val2, v2_f, 2); } - sum3 = vmlaq_f32(sum3, v1, v2_dequant); + dot_sum3 = vmlaq_f32(dot_sum3, v1, v2_f); + query_sum3 = vaddq_f32(query_sum3, v1); } - // Combine all four sum accumulators - float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)); + // Combine accumulators + float32x4_t dot_total = vaddq_f32(vaddq_f32(dot_sum0, dot_sum1), vaddq_f32(dot_sum2, dot_sum3)); + float32x4_t query_total = + vaddq_f32(vaddq_f32(query_sum0, query_sum1), vaddq_f32(query_sum2, query_sum3)); + + // Horizontal sum + float32x2_t dot_halves = vadd_f32(vget_low_f32(dot_total), vget_high_f32(dot_total)); + float32x2_t dot_summed = vpadd_f32(dot_halves, dot_halves); + float dot_product = vget_lane_f32(dot_summed, 0); - // Horizontal sum of the 4 elements in the combined NEON register - float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined)); - float32x2_t summed = vpadd_f32(sum_halves, sum_halves); - float sum = vget_lane_f32(summed, 0); + float32x2_t query_halves = vadd_f32(vget_low_f32(query_total), vget_high_f32(query_total)); + float32x2_t query_summed = vpadd_f32(query_halves, query_halves); + float query_sum = vget_lane_f32(query_summed, 0); - return sum; + // Apply algebraic formula: IP = δ * Σ(query*val) + min * Σ(query) + return delta * dot_product + min_val * query_sum; } template // 0..15 diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index f47e04d6b..68c3683fd 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -13,37 +13,38 @@ /** * SQ8-to-SQ8 distance functions for NEON. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, - * where BOTH vectors are uint8 quantized and dequantization is applied to both - * during computation. + * where BOTH vectors are uint8 quantized. + * + * Uses algebraic optimization to reduce operations per element: + * + * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) + * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * + * This saves 2 FMAs per 4-element step by deferring dequantization to scalar math at the end. * * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] - * Dequantization formula: dequantized_value = quantized_value * delta + min_val */ -// Helper function to perform inner product step for 4 elements with dual dequantization +// Helper function with algebraic optimization static inline void SQ8_SQ8_InnerProductStep_NEON(const uint8_t *&pVec1, const uint8_t *&pVec2, - float32x4_t &sum, const float32x4_t &min_val_vec1, - const float32x4_t &delta_vec1, - const float32x4_t &min_val_vec2, - const float32x4_t &delta_vec2) { + float32x4_t &dot_sum, float32x4_t &sum1, + float32x4_t &sum2) { // Load 4 uint8 elements from pVec1 and convert to float uint8x8_t v1_u8 = vld1_u8(pVec1); uint32x4_t v1_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))); float32x4_t v1_f = vcvtq_f32_u32(v1_u32); - // Dequantize v1: (val * delta1) + min_val1 - float32x4_t v1_dequant = vmlaq_f32(min_val_vec1, v1_f, delta_vec1); - // Load 4 uint8 elements from pVec2 and convert to float uint8x8_t v2_u8 = vld1_u8(pVec2); uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); float32x4_t v2_f = vcvtq_f32_u32(v2_u32); - // Dequantize v2: (val * delta2) + min_val2 - float32x4_t v2_dequant = vmlaq_f32(min_val_vec2, v2_f, delta_vec2); + // Accumulate dot product: dot_sum += v1 * v2 (no dequantization) + dot_sum = vmlaq_f32(dot_sum, v1_f, v2_f); - // Compute dot product and add to sum - sum = vmlaq_f32(sum, v1_dequant, v2_dequant); + // Accumulate element sums + sum1 = vaddq_f32(sum1, v1_f); + sum2 = vaddq_f32(sum2, v2_f); // Advance pointers pVec1 += 4; @@ -58,87 +59,106 @@ float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v const uint8_t *pVec2 = static_cast(pVec2v); // Get dequantization parameters from the end of pVec1 - const float min_val1 = *reinterpret_cast(pVec1 + dimension); + const float min1 = *reinterpret_cast(pVec1 + dimension); const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); // Get dequantization parameters from the end of pVec2 - const float min_val2 = *reinterpret_cast(pVec2 + dimension); + const float min2 = *reinterpret_cast(pVec2 + dimension); const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); - // Create broadcast vectors for SIMD operations - float32x4_t min_val_vec1 = vdupq_n_f32(min_val1); - float32x4_t delta_vec1 = vdupq_n_f32(delta1); - float32x4_t min_val_vec2 = vdupq_n_f32(min_val2); - float32x4_t delta_vec2 = vdupq_n_f32(delta2); - - float32x4_t sum0 = vdupq_n_f32(0.0f); - float32x4_t sum1 = vdupq_n_f32(0.0f); - float32x4_t sum2 = vdupq_n_f32(0.0f); - float32x4_t sum3 = vdupq_n_f32(0.0f); + // Multiple accumulators for instruction-level parallelism + // dot_sum: accumulates v1[i] * v2[i] + // sum1: accumulates v1[i] + // sum2: accumulates v2[i] + float32x4_t dot_sum0 = vdupq_n_f32(0.0f); + float32x4_t dot_sum1 = vdupq_n_f32(0.0f); + float32x4_t dot_sum2 = vdupq_n_f32(0.0f); + float32x4_t dot_sum3 = vdupq_n_f32(0.0f); + float32x4_t sum1_0 = vdupq_n_f32(0.0f); + float32x4_t sum1_1 = vdupq_n_f32(0.0f); + float32x4_t sum1_2 = vdupq_n_f32(0.0f); + float32x4_t sum1_3 = vdupq_n_f32(0.0f); + float32x4_t sum2_0 = vdupq_n_f32(0.0f); + float32x4_t sum2_1 = vdupq_n_f32(0.0f); + float32x4_t sum2_2 = vdupq_n_f32(0.0f); + float32x4_t sum2_3 = vdupq_n_f32(0.0f); const size_t num_of_chunks = dimension / 16; // Process 16 elements at a time in the main loop for (size_t i = 0; i < num_of_chunks; i++) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum0, min_val_vec1, delta_vec1, min_val_vec2, - delta_vec2); - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum1, min_val_vec1, delta_vec1, min_val_vec2, - delta_vec2); - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum2, min_val_vec1, delta_vec1, min_val_vec2, - delta_vec2); - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum3, min_val_vec1, delta_vec1, min_val_vec2, - delta_vec2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum2, sum1_2, sum2_2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum3, sum1_3, sum2_3); } // Handle remaining complete 4-element blocks within residual if constexpr (residual >= 4) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum0, min_val_vec1, delta_vec1, min_val_vec2, - delta_vec2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); } if constexpr (residual >= 8) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum1, min_val_vec1, delta_vec1, min_val_vec2, - delta_vec2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); } if constexpr (residual >= 12) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, sum2, min_val_vec1, delta_vec1, min_val_vec2, - delta_vec2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum2, sum1_2, sum2_2); } - // Handle final residual elements (0-3 elements) + // Handle final residual elements (0-3 elements) with scalar math constexpr size_t final_residual = residual % 4; if constexpr (final_residual > 0) { - float32x4_t v1_dequant = vdupq_n_f32(0.0f); - float32x4_t v2_dequant = vdupq_n_f32(0.0f); + float32x4_t v1_f = vdupq_n_f32(0.0f); + float32x4_t v2_f = vdupq_n_f32(0.0f); if constexpr (final_residual >= 1) { - float dequant1_0 = pVec1[0] * delta1 + min_val1; - float dequant2_0 = pVec2[0] * delta2 + min_val2; - v1_dequant = vld1q_lane_f32(&dequant1_0, v1_dequant, 0); - v2_dequant = vld1q_lane_f32(&dequant2_0, v2_dequant, 0); + float val1_0 = static_cast(pVec1[0]); + float val2_0 = static_cast(pVec2[0]); + v1_f = vld1q_lane_f32(&val1_0, v1_f, 0); + v2_f = vld1q_lane_f32(&val2_0, v2_f, 0); } if constexpr (final_residual >= 2) { - float dequant1_1 = pVec1[1] * delta1 + min_val1; - float dequant2_1 = pVec2[1] * delta2 + min_val2; - v1_dequant = vld1q_lane_f32(&dequant1_1, v1_dequant, 1); - v2_dequant = vld1q_lane_f32(&dequant2_1, v2_dequant, 1); + float val1_1 = static_cast(pVec1[1]); + float val2_1 = static_cast(pVec2[1]); + v1_f = vld1q_lane_f32(&val1_1, v1_f, 1); + v2_f = vld1q_lane_f32(&val2_1, v2_f, 1); } if constexpr (final_residual >= 3) { - float dequant1_2 = pVec1[2] * delta1 + min_val1; - float dequant2_2 = pVec2[2] * delta2 + min_val2; - v1_dequant = vld1q_lane_f32(&dequant1_2, v1_dequant, 2); - v2_dequant = vld1q_lane_f32(&dequant2_2, v2_dequant, 2); + float val1_2 = static_cast(pVec1[2]); + float val2_2 = static_cast(pVec2[2]); + v1_f = vld1q_lane_f32(&val1_2, v1_f, 2); + v2_f = vld1q_lane_f32(&val2_2, v2_f, 2); } - sum3 = vmlaq_f32(sum3, v1_dequant, v2_dequant); + dot_sum3 = vmlaq_f32(dot_sum3, v1_f, v2_f); + sum1_3 = vaddq_f32(sum1_3, v1_f); + sum2_3 = vaddq_f32(sum2_3, v2_f); } - // Combine all four sum accumulators - float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)); - - // Horizontal sum of the 4 elements in the combined NEON register - float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined)); - float32x2_t summed = vpadd_f32(sum_halves, sum_halves); - return vget_lane_f32(summed, 0); + // Combine accumulators + float32x4_t dot_total = + vaddq_f32(vaddq_f32(dot_sum0, dot_sum1), vaddq_f32(dot_sum2, dot_sum3)); + float32x4_t sum1_total = vaddq_f32(vaddq_f32(sum1_0, sum1_1), vaddq_f32(sum1_2, sum1_3)); + float32x4_t sum2_total = vaddq_f32(vaddq_f32(sum2_0, sum2_1), vaddq_f32(sum2_2, sum2_3)); + + // Horizontal sum for dot product + float32x2_t dot_halves = vadd_f32(vget_low_f32(dot_total), vget_high_f32(dot_total)); + float32x2_t dot_summed = vpadd_f32(dot_halves, dot_halves); + float dot_product = vget_lane_f32(dot_summed, 0); + + // Horizontal sum for v1 sum + float32x2_t sum1_halves = vadd_f32(vget_low_f32(sum1_total), vget_high_f32(sum1_total)); + float32x2_t sum1_summed = vpadd_f32(sum1_halves, sum1_halves); + float v1_sum = vget_lane_f32(sum1_summed, 0); + + // Horizontal sum for v2 sum + float32x2_t sum2_halves = vadd_f32(vget_low_f32(sum2_total), vget_high_f32(sum2_total)); + float32x2_t sum2_summed = vpadd_f32(sum2_halves, sum2_halves); + float v2_sum = vget_lane_f32(sum2_summed, 0); + + // Apply algebraic formula: + // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 + return delta1 * delta2 * dot_product + delta1 * min2 * v1_sum + delta2 * min1 * v2_sum + + static_cast(dimension) * min1 * min2; } // SQ8-to-SQ8 Inner Product distance function diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 9214bba54..7a812db27 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -177,7 +177,10 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, #ifdef CPU_FEATURES_ARCH_X86_64 #ifdef OPT_AVX512_F_BW_VL_VNNI // AVX512 VNNI SQ8_SQ8 uses 64-element chunks - if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { + if (dim < 64){ + return ret_dist_func; + } + if (features.avx512f && features.avx512bw && features.avx512vnni) { return Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); } #endif From 0ad941e48fb017da168e12d4b311639c1d2e3466 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 12:38:58 +0200 Subject: [PATCH 08/51] format --- src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h | 12 ++++++------ src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h | 12 ++++++------ src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 3 +-- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 4 ++-- src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 12 ++++++------ src/VecSim/spaces/IP_space.cpp | 2 +- 6 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index f20f3e6c3..8ee096622 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -29,8 +29,8 @@ */ // Process 16 elements with algebraic optimization -static inline void SQ8_InnerProductStep(const float *pVec1, const uint8_t *pVec2, - __m512 &dot_sum, __m512 &query_sum) { +static inline void SQ8_InnerProductStep(const float *pVec1, const uint8_t *pVec2, __m512 &dot_sum, + __m512 &query_sum) { // Load 16 float elements from query __m512 v1 = _mm512_loadu_ps(pVec1); @@ -117,10 +117,10 @@ float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t } // Combine accumulators - __m512 dot_total = _mm512_add_ps(_mm512_add_ps(dot_sum0, dot_sum1), - _mm512_add_ps(dot_sum2, dot_sum3)); - __m512 query_total = _mm512_add_ps(_mm512_add_ps(query_sum0, query_sum1), - _mm512_add_ps(query_sum2, query_sum3)); + __m512 dot_total = + _mm512_add_ps(_mm512_add_ps(dot_sum0, dot_sum1), _mm512_add_ps(dot_sum2, dot_sum3)); + __m512 query_total = + _mm512_add_ps(_mm512_add_ps(query_sum0, query_sum1), _mm512_add_ps(query_sum2, query_sum3)); // Reduce to scalar float dot_product = _mm512_reduce_add_ps(dot_total); diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h index 0cc9fe8df..3a680ac24 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h @@ -42,10 +42,10 @@ static inline void SQ8_SQ8_InnerProductStep64(const uint8_t *pVec1, const uint8_ // Convert to int16 (zero-extend) and compute dot products using VNNI // dpwssd: multiply pairs of int16, sum pairs to int32, accumulate - dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), - _mm512_cvtepu8_epi16(v2_lo)); - dot_acc1 = _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), - _mm512_cvtepu8_epi16(v2_hi)); + dot_acc0 = + _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), _mm512_cvtepu8_epi16(v2_lo)); + dot_acc1 = + _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), _mm512_cvtepu8_epi16(v2_hi)); // Sum of elements using SAD with zero (sums bytes in groups of 8 -> 8x 64-bit results) __m512i zero = _mm512_setzero_si512(); @@ -62,8 +62,8 @@ static inline void SQ8_SQ8_InnerProductStep32(const uint8_t *pVec1, const uint8_ __m256i v2_256 = _mm256_loadu_si256(reinterpret_cast(pVec2)); // Convert to int16 (zero-extend) and compute dot product using VNNI - dot_acc = _mm512_dpwssd_epi32(dot_acc, _mm512_cvtepu8_epi16(v1_256), - _mm512_cvtepu8_epi16(v2_256)); + dot_acc = + _mm512_dpwssd_epi32(dot_acc, _mm512_cvtepu8_epi16(v1_256), _mm512_cvtepu8_epi16(v2_256)); // Sum of elements - extend to 512-bit and use SAD // Use zextsi256_si512 to properly zero the upper half diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index 68c3683fd..36be5f428 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -135,8 +135,7 @@ float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v } // Combine accumulators - float32x4_t dot_total = - vaddq_f32(vaddq_f32(dot_sum0, dot_sum1), vaddq_f32(dot_sum2, dot_sum3)); + float32x4_t dot_total = vaddq_f32(vaddq_f32(dot_sum0, dot_sum1), vaddq_f32(dot_sum2, dot_sum3)); float32x4_t sum1_total = vaddq_f32(vaddq_f32(sum1_0, sum1_1), vaddq_f32(sum1_2, sum1_3)); float32x4_t sum2_total = vaddq_f32(vaddq_f32(sum2_0, sum2_1), vaddq_f32(sum2_2, sum2_3)); diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 9d0afdad4..d142fb6f9 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -122,8 +122,8 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz } // Combine the accumulators - svfloat32_t dot_total = svadd_f32_x(pg, svadd_f32_x(pg, dot_sum0, dot_sum1), - svadd_f32_x(pg, dot_sum2, dot_sum3)); + svfloat32_t dot_total = + svadd_f32_x(pg, svadd_f32_x(pg, dot_sum0, dot_sum1), svadd_f32_x(pg, dot_sum2, dot_sum3)); svfloat32_t query_total = svadd_f32_x(pg, svadd_f32_x(pg, query_sum0, query_sum1), svadd_f32_x(pg, query_sum2, query_sum3)); diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index fdff78cd2..c16cf9a97 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -142,12 +142,12 @@ float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, s } // Combine the accumulators - svfloat32_t dot_total = svadd_f32_x(pg, svadd_f32_x(pg, dot_sum0, dot_sum1), - svadd_f32_x(pg, dot_sum2, dot_sum3)); - svfloat32_t sum1_total = svadd_f32_x(pg, svadd_f32_x(pg, sum1_0, sum1_1), - svadd_f32_x(pg, sum1_2, sum1_3)); - svfloat32_t sum2_total = svadd_f32_x(pg, svadd_f32_x(pg, sum2_0, sum2_1), - svadd_f32_x(pg, sum2_2, sum2_3)); + svfloat32_t dot_total = + svadd_f32_x(pg, svadd_f32_x(pg, dot_sum0, dot_sum1), svadd_f32_x(pg, dot_sum2, dot_sum3)); + svfloat32_t sum1_total = + svadd_f32_x(pg, svadd_f32_x(pg, sum1_0, sum1_1), svadd_f32_x(pg, sum1_2, sum1_3)); + svfloat32_t sum2_total = + svadd_f32_x(pg, svadd_f32_x(pg, sum2_0, sum2_1), svadd_f32_x(pg, sum2_2, sum2_3)); // Horizontal sum of all elements float dot_product = svaddv_f32(pg, dot_total); diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 7a812db27..677d30a71 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -177,7 +177,7 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, #ifdef CPU_FEATURES_ARCH_X86_64 #ifdef OPT_AVX512_F_BW_VL_VNNI // AVX512 VNNI SQ8_SQ8 uses 64-element chunks - if (dim < 64){ + if (dim < 64) { return ret_dist_func; } if (features.avx512f && features.avx512bw && features.avx512vnni) { From 68cd0682271c9bad9bff2682016111321ff92b96 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 15:43:36 +0200 Subject: [PATCH 09/51] Add NEON DOTPROD-optimized distance functions for SQ8-to-SQ8 calculations --- .../spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h.h | 166 +++++++++++++++++ src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 174 ++++++++---------- src/VecSim/spaces/IP_space.cpp | 6 + src/VecSim/spaces/functions/NEON.h | 13 ++ src/VecSim/spaces/functions/NEON_DOTPROD.cpp | 14 ++ src/VecSim/spaces/functions/NEON_DOTPROD.h | 4 + src/VecSim/spaces/functions/SVE.cpp | 5 +- src/VecSim/spaces/functions/SVE2.cpp | 5 +- tests/unit/test_spaces.cpp | 22 +++ 9 files changed, 306 insertions(+), 103 deletions(-) create mode 100644 src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h.h diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h.h new file mode 100644 index 000000000..9217fe6ed --- /dev/null +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +/** + * SQ8-to-SQ8 distance functions for NEON with DOTPROD extension. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized. + * + * Uses algebraic optimization with INTEGER arithmetic throughout: + * + * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) + * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * + * All sums are computed using integer arithmetic, converted to float only at the end. + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + */ + +// Ones vector for computing element sums via dot product +static const uint8x16_t ones = vdupq_n_u8(1); + +// Helper function: computes dot product and element sums using integer arithmetic +__attribute__((always_inline)) static inline void +SQ8_SQ8_InnerProductStep_NEON_DOTPROD(const uint8_t *&pVec1, const uint8_t *&pVec2, + uint32x4_t &dot_sum, uint32x4_t &sum1, uint32x4_t &sum2) { + // Load 16 uint8 elements + uint8x16_t v1 = vld1q_u8(pVec1); + uint8x16_t v2 = vld1q_u8(pVec2); + + // Compute dot product using DOTPROD instruction: dot_sum += v1 . v2 + dot_sum = vdotq_u32(dot_sum, v1, v2); + + // Compute element sums using dot product with ones vector + // sum1 += Σv1[i], sum2 += Σv2[i] + sum1 = vdotq_u32(sum1, v1, ones); + sum2 = vdotq_u32(sum2, v2, ones); + + pVec1 += 16; + pVec2 += 16; +} + +// Common implementation for inner product between two SQ8 vectors +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void *pVec2v, + size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + + // Get dequantization parameters from the end of pVec1 + const float min1 = *reinterpret_cast(pVec1 + dimension); + const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); + + // Get dequantization parameters from the end of pVec2 + const float min2 = *reinterpret_cast(pVec2 + dimension); + const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + + // Integer accumulators for dot product and element sums + uint32x4_t dot_sum0 = vdupq_n_u32(0); + uint32x4_t dot_sum1 = vdupq_n_u32(0); + uint32x4_t sum1_0 = vdupq_n_u32(0); + uint32x4_t sum1_1 = vdupq_n_u32(0); + uint32x4_t sum2_0 = vdupq_n_u32(0); + uint32x4_t sum2_1 = vdupq_n_u32(0); + + // Handle residual elements first (0-15 elements) + constexpr size_t final_residual = residual % 16; + if constexpr (final_residual > 0) { + constexpr uint8x16_t mask = { + 0xFF, + (final_residual >= 2) ? 0xFF : 0, + (final_residual >= 3) ? 0xFF : 0, + (final_residual >= 4) ? 0xFF : 0, + (final_residual >= 5) ? 0xFF : 0, + (final_residual >= 6) ? 0xFF : 0, + (final_residual >= 7) ? 0xFF : 0, + (final_residual >= 8) ? 0xFF : 0, + (final_residual >= 9) ? 0xFF : 0, + (final_residual >= 10) ? 0xFF : 0, + (final_residual >= 11) ? 0xFF : 0, + (final_residual >= 12) ? 0xFF : 0, + (final_residual >= 13) ? 0xFF : 0, + (final_residual >= 14) ? 0xFF : 0, + (final_residual >= 15) ? 0xFF : 0, + 0, + }; + + uint8x16_t v1 = vld1q_u8(pVec1); + uint8x16_t v2 = vld1q_u8(pVec2); + uint8x16_t zeros = vdupq_n_u8(0); + + // Zero out irrelevant elements + v1 = vbslq_u8(mask, v1, zeros); + v2 = vbslq_u8(mask, v2, zeros); + + // Accumulate using integer arithmetic + dot_sum1 = vdotq_u32(dot_sum1, v1, v2); + sum1_1 = vdotq_u32(sum1_1, v1, ones); + sum2_1 = vdotq_u32(sum2_1, v2, ones); + + pVec1 += final_residual; + pVec2 += final_residual; + } + + // Process 64 elements at a time in the main loop + const size_t num_of_chunks = dimension / 64; + + for (size_t i = 0; i < num_of_chunks; i++) { + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); + } + + // Handle remaining 16-element chunks (0-3 chunks within residual) + constexpr size_t residual_chunks = residual / 16; + if constexpr (residual_chunks >= 1) { + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); + } + if constexpr (residual_chunks >= 2) { + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); + } + if constexpr (residual_chunks >= 3) { + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); + } + + // Combine accumulators + uint32x4_t dot_total = vaddq_u32(dot_sum0, dot_sum1); + uint32x4_t sum1_total = vaddq_u32(sum1_0, sum1_1); + uint32x4_t sum2_total = vaddq_u32(sum2_0, sum2_1); + + // Horizontal sum to scalar (integer) + uint32_t dot_product = vaddvq_u32(dot_total); + uint32_t v1_sum = vaddvq_u32(sum1_total); + uint32_t v2_sum = vaddvq_u32(sum2_total); + + // Apply algebraic formula with float conversion only at the end: + // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 + return delta1 * delta2 * static_cast(dot_product) + + delta1 * min2 * static_cast(v1_sum) + delta2 * min1 * static_cast(v2_sum) + + static_cast(dimension) * min1 * min2; +} + +// SQ8-to-SQ8 Inner Product distance function +// Returns 1 - inner_product (distance form) +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, dimension); +} + +// SQ8-to-SQ8 Cosine distance function +// Assumes both vectors are normalized. +// Returns 1 - inner_product +template // 0..63 +float SQ8_SQ8_CosineSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index c16cf9a97..e43800a03 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -15,40 +15,40 @@ * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, * where BOTH vectors are uint8 quantized. * - * Uses algebraic optimization to reduce operations per element: + * Uses algebraic optimization with INTEGER arithmetic throughout: * * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 * - * This saves 2 FMAs per chunk by deferring dequantization to scalar math at the end. + * All sums are computed using integer dot product instructions, converted to float only at the end. * * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] */ -// Helper function to perform inner product step with algebraic optimization +// Helper function to perform inner product step using integer dot product static inline void SQ8_SQ8_InnerProductStep_SVE(const uint8_t *pVec1, const uint8_t *pVec2, - size_t offset, svfloat32_t &dot_sum, - svfloat32_t &sum1, svfloat32_t &sum2, + size_t &offset, svuint32_t &dot_sum, + svuint32_t &sum1, svuint32_t &sum2, const size_t chunk) { - svbool_t pg = svptrue_b32(); + svbool_t pg = svptrue_b8(); - // Load uint8 elements from pVec1 and convert to float - svuint32_t v1_u32 = svld1ub_u32(pg, pVec1 + offset); - svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); + // Load uint8 vectors + svuint8_t v1_u8 = svld1_u8(pg, pVec1 + offset); + svuint8_t v2_u8 = svld1_u8(pg, pVec2 + offset); - // Load uint8 elements from pVec2 and convert to float - svuint32_t v2_u32 = svld1ub_u32(pg, pVec2 + offset); - svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32); + // Compute dot product using integer svdot instruction + dot_sum = svdot_u32(dot_sum, v1_u8, v2_u8); - // Accumulate dot product: dot_sum += v1 * v2 (no dequantization) - dot_sum = svmla_f32_x(pg, dot_sum, v1_f, v2_f); + // Compute element sums using dot product with ones vector + svuint8_t ones = svdup_u8(1); + sum1 = svdot_u32(sum1, v1_u8, ones); + sum2 = svdot_u32(sum2, v2_u8, ones); - // Accumulate element sums - sum1 = svadd_f32_x(pg, sum1, v1_f); - sum2 = svadd_f32_x(pg, sum2, v2_f); + offset += chunk; } // Common implementation for inner product between two SQ8 vectors +// Uses integer arithmetic throughout for maximum performance template float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { const uint8_t *pVec1 = static_cast(pVec1v); @@ -63,100 +63,76 @@ float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, s const float min2 = *reinterpret_cast(pVec2 + dimension); const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); - svbool_t pg = svptrue_b32(); - - // Get the number of 32-bit elements per vector at runtime - uint64_t chunk = svcntw(); - - // Multiple accumulators for instruction-level parallelism - // dot_sum: accumulates v1[i] * v2[i] - // sum1: accumulates v1[i] - // sum2: accumulates v2[i] - svfloat32_t dot_sum0 = svdup_f32(0.0f); - svfloat32_t dot_sum1 = svdup_f32(0.0f); - svfloat32_t dot_sum2 = svdup_f32(0.0f); - svfloat32_t dot_sum3 = svdup_f32(0.0f); - svfloat32_t sum1_0 = svdup_f32(0.0f); - svfloat32_t sum1_1 = svdup_f32(0.0f); - svfloat32_t sum1_2 = svdup_f32(0.0f); - svfloat32_t sum1_3 = svdup_f32(0.0f); - svfloat32_t sum2_0 = svdup_f32(0.0f); - svfloat32_t sum2_1 = svdup_f32(0.0f); - svfloat32_t sum2_2 = svdup_f32(0.0f); - svfloat32_t sum2_3 = svdup_f32(0.0f); - - // Handle partial chunk if needed - if constexpr (partial_chunk) { - size_t remaining = dimension % chunk; - if (remaining > 0) { - // Create predicate for the remaining elements - svbool_t pg_partial = - svwhilelt_b32(static_cast(0), static_cast(remaining)); - - // Load and convert v1 elements - svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVec1 + offset); - svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); - - // Load and convert v2 elements - svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVec2 + offset); - svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); - - // Accumulate dot product (no dequantization) - dot_sum0 = svmla_f32_z(pg_partial, dot_sum0, v1_f, v2_f); - - // Accumulate element sums - sum1_0 = svadd_f32_z(pg_partial, sum1_0, v1_f); - sum2_0 = svadd_f32_z(pg_partial, sum2_0, v2_f); - - // Move past the partial chunk - offset += remaining; - } - } + // Get the number of 8-bit elements per vector at runtime + const size_t vl = svcntb(); + const size_t chunk_size = 4 * vl; + + // Integer accumulators for dot product and element sums + svuint32_t dot_sum0 = svdup_u32(0); + svuint32_t dot_sum1 = svdup_u32(0); + svuint32_t dot_sum2 = svdup_u32(0); + svuint32_t dot_sum3 = svdup_u32(0); + svuint32_t sum1_0 = svdup_u32(0); + svuint32_t sum1_1 = svdup_u32(0); + svuint32_t sum1_2 = svdup_u32(0); + svuint32_t sum1_3 = svdup_u32(0); + svuint32_t sum2_0 = svdup_u32(0); + svuint32_t sum2_1 = svdup_u32(0); + svuint32_t sum2_2 = svdup_u32(0); + svuint32_t sum2_3 = svdup_u32(0); // Process 4 chunks at a time in the main loop - auto chunk_size = 4 * chunk; - const size_t number_of_chunks = - (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; + const size_t number_of_chunks = dimension / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, sum1_0, sum2_0, chunk); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset + chunk, dot_sum1, sum1_1, sum2_1, chunk); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset + 2 * chunk, dot_sum2, sum1_2, sum2_2, - chunk); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset + 3 * chunk, dot_sum3, sum1_3, sum2_3, - chunk); - offset += chunk_size; + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, sum1_0, sum2_0, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, sum1_1, sum2_1, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, sum1_2, sum2_2, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum3, sum1_3, sum2_3, vl); } - // Handle remaining steps (0-3) - if constexpr (additional_steps > 0) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, sum1_0, sum2_0, chunk); - offset += chunk; + // Handle remaining steps (0-3 complete chunks) + if constexpr (additional_steps >= 1) { + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, sum1_0, sum2_0, vl); } - if constexpr (additional_steps > 1) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, sum1_1, sum2_1, chunk); - offset += chunk; + if constexpr (additional_steps >= 2) { + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, sum1_1, sum2_1, vl); } - if constexpr (additional_steps > 2) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, sum1_2, sum2_2, chunk); + if constexpr (additional_steps >= 3) { + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, sum1_2, sum2_2, vl); } - // Combine the accumulators - svfloat32_t dot_total = - svadd_f32_x(pg, svadd_f32_x(pg, dot_sum0, dot_sum1), svadd_f32_x(pg, dot_sum2, dot_sum3)); - svfloat32_t sum1_total = - svadd_f32_x(pg, svadd_f32_x(pg, sum1_0, sum1_1), svadd_f32_x(pg, sum1_2, sum1_3)); - svfloat32_t sum2_total = - svadd_f32_x(pg, svadd_f32_x(pg, sum2_0, sum2_1), svadd_f32_x(pg, sum2_2, sum2_3)); - - // Horizontal sum of all elements - float dot_product = svaddv_f32(pg, dot_total); - float v1_sum = svaddv_f32(pg, sum1_total); - float v2_sum = svaddv_f32(pg, sum2_total); + // Handle partial chunk if needed + if constexpr (partial_chunk) { + svbool_t pg = svwhilelt_b8_u64(offset, dimension); + svuint8_t v1_u8 = svld1_u8(pg, pVec1 + offset); + svuint8_t v2_u8 = svld1_u8(pg, pVec2 + offset); + + // Compute dot product and sums (inactive lanes are already zeroed by svld1) + dot_sum3 = svdot_u32(dot_sum3, v1_u8, v2_u8); + svuint8_t ones = svdup_u8(1); + sum1_3 = svdot_u32(sum1_3, v1_u8, ones); + sum2_3 = svdot_u32(sum2_3, v2_u8, ones); + } - // Apply algebraic formula: + // Combine the integer accumulators + svbool_t pg32 = svptrue_b32(); + svuint32_t dot_total = svadd_u32_x(pg32, svadd_u32_x(pg32, dot_sum0, dot_sum1), + svadd_u32_x(pg32, dot_sum2, dot_sum3)); + svuint32_t sum1_total = + svadd_u32_x(pg32, svadd_u32_x(pg32, sum1_0, sum1_1), svadd_u32_x(pg32, sum1_2, sum1_3)); + svuint32_t sum2_total = + svadd_u32_x(pg32, svadd_u32_x(pg32, sum2_0, sum2_1), svadd_u32_x(pg32, sum2_2, sum2_3)); + + // Horizontal sum to scalar integers + uint32_t dot_product = svaddv_u32(pg32, dot_total); + uint32_t v1_sum = svaddv_u32(pg32, sum1_total); + uint32_t v2_sum = svaddv_u32(pg32, sum2_total); + + // Apply algebraic formula with float conversion only at the end: // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 - return delta1 * delta2 * dot_product + delta1 * min2 * v1_sum + delta2 * min1 * v2_sum + + return delta1 * delta2 * static_cast(dot_product) + + delta1 * min2 * static_cast(v1_sum) + delta2 * min1 * static_cast(v2_sum) + static_cast(dimension) * min1 * min2; } diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 677d30a71..4429278ce 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -167,6 +167,12 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, return Choose_SQ8_SQ8_IP_implementation_SVE(dim); } #endif +#ifdef OPT_NEON_DOTPROD + // DOTPROD uses integer arithmetic - much faster than float-based NEON + if (features.asimddp && dim >= 64) { + return Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(dim); + } +#endif #ifdef OPT_NEON if (features.asimd) { return Choose_SQ8_SQ8_IP_implementation_NEON(dim); diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h index 011d41b0d..c962e6bb5 100644 --- a/src/VecSim/spaces/functions/NEON.h +++ b/src/VecSim/spaces/functions/NEON.h @@ -34,4 +34,17 @@ dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim); +// NEON DOTPROD-optimized functions +dist_func_t Choose_INT8_IP_implementation_NEON_DOTPROD(size_t dim); +dist_func_t Choose_INT8_Cosine_implementation_NEON_DOTPROD(size_t dim); +dist_func_t Choose_INT8_L2_implementation_NEON_DOTPROD(size_t dim); + +dist_func_t Choose_UINT8_IP_implementation_NEON_DOTPROD(size_t dim); +dist_func_t Choose_UINT8_Cosine_implementation_NEON_DOTPROD(size_t dim); +dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim); + +// SQ8-to-SQ8 DOTPROD-optimized distance functions +dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp index 02f098420..56e032e6f 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp +++ b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp @@ -9,6 +9,7 @@ #include "NEON.h" #include "VecSim/spaces/IP/IP_NEON_DOTPROD_INT8.h" #include "VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h" +#include "VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h" #include "VecSim/spaces/L2/L2_NEON_DOTPROD_INT8.h" #include "VecSim/spaces/L2/L2_NEON_DOTPROD_UINT8.h" @@ -52,6 +53,19 @@ dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_CosineSIMD64_NEON_DOTPROD); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.h b/src/VecSim/spaces/functions/NEON_DOTPROD.h index 199e57708..232de725a 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.h +++ b/src/VecSim/spaces/functions/NEON_DOTPROD.h @@ -21,4 +21,8 @@ dist_func_t Choose_UINT8_Cosine_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_INT8_L2_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim); +// SQ8-to-SQ8 DOTPROD-optimized distance functions +dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index 9e18346cb..98be856a2 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -119,15 +119,16 @@ dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim) { } // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_InnerProductSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_InnerProductSIMD_SVE, dim, svcntb); return ret_dist_func; } dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_CosineSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_CosineSIMD_SVE, dim, svcntb); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index f6780e331..8d03fbe97 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -116,15 +116,16 @@ dist_func_t Choose_SQ8_L2_implementation_SVE2(size_t dim) { } // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_InnerProductSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_InnerProductSIMD_SVE, dim, svcntb); return ret_dist_func; } dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_CosineSIMD_SVE, dim, svcntw); + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_CosineSIMD_SVE, dim, svcntb); return ret_dist_func; } diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 7681d5013..33fd15bd6 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2567,6 +2567,17 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { optimization.sve = 0; } #endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp && dim >= 64) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "NEON_DOTPROD with dim " << dim; + optimization.asimddp = 0; + } +#endif #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; @@ -2646,6 +2657,17 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { optimization.sve = 0; } #endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp && dim >= 64) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + << "NEON_DOTPROD with dim " << dim; + optimization.asimddp = 0; + } +#endif #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; From 0b4b5687e404a6d91536c9ca747c4688fdfeb803 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 16:22:10 +0200 Subject: [PATCH 10/51] PR --- ..._SQ8_SQ8.h.h => IP_NEON_DOTPROD_SQ8_SQ8.h} | 0 src/VecSim/spaces/IP_space.cpp | 13 ++++++- tests/unit/test_spaces.cpp | 38 ++++++++++++------- 3 files changed, 36 insertions(+), 15 deletions(-) rename src/VecSim/spaces/IP/{IP_NEON_DOTPROD_SQ8_SQ8.h.h => IP_NEON_DOTPROD_SQ8_SQ8.h} (100%) diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h similarity index 100% rename from src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h.h rename to src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 4429278ce..2762a1216 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -169,7 +169,7 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, #endif #ifdef OPT_NEON_DOTPROD // DOTPROD uses integer arithmetic - much faster than float-based NEON - if (features.asimddp && dim >= 64) { + if (features.asimddp) { return Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(dim); } #endif @@ -206,11 +206,22 @@ dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignme [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); #ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_SQ8_Cosine_implementation_SVE2(dim); + } +#endif #ifdef OPT_SVE if (features.sve) { return Choose_SQ8_SQ8_Cosine_implementation_SVE(dim); } #endif +#ifdef OPT_NEON_DOTPROD + // DOTPROD uses integer arithmetic - much faster than float-based NEON + if (features.asimddp) { + return Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(dim); + } +#endif #ifdef OPT_NEON if (features.asimd) { return Choose_SQ8_SQ8_Cosine_implementation_NEON(dim); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 33fd15bd6..faf3eb802 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2136,7 +2136,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) << "AVX512 with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2149,7 +2149,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2162,7 +2162,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset avx flag as well, so we'll choose the next optimization (SSE). @@ -2175,7 +2175,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) << "SSE with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; // Unset sse flag as well, so we'll choose the next optimization (default). @@ -2189,7 +2189,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) << "SVE2 with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve2 flag as well, so we'll choose the next option (default). @@ -2202,7 +2202,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) << "SVE with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve flag as well, so we'll choose the next option (default). @@ -2215,7 +2215,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) << "NEON with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2227,7 +2227,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2348,8 +2348,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { } // Instantiate the test suite with dimensions to test +// Range includes dimensions up to 128+ to cover AVX512 64-element chunk processing +// and remaining chunk handling (needs dim >= 64 for loop, dim % 64 >= 48 for third remainder) INSTANTIATE_TEST_SUITE_P(SQ8OptFuncs, SQ8SpacesOptimizationTest, - testing::Range(16UL, 16 * 2UL + 1)); + testing::Range(16UL, 16 * 8UL + 1)); TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { auto optimization = getCpuOptimizationFeatures(); @@ -2480,7 +2482,8 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { v2_orig[i] = float(i + 1.5); } - // Normalize both vectors because it expects normalized vectors + // Normalize vectors so identical vectors have IP = 1, making distance = 1 - IP = 0 + // (Inner product doesn't require normalization, but it simplifies expected value calculation) spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); @@ -2488,8 +2491,10 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig, dim); std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig, dim); - float dist = SQ8_SQ8_InnerProduct((const void *)v1_compressed.data(), - (const void *)v2_compressed.data(), dim); + // Get distance function with nullptr alignment to cover that code path + auto dist_func = IP_SQ8_SQ8_GetDistFunc(dim, nullptr, nullptr); + float dist = + dist_func((const void *)v1_compressed.data(), (const void *)v2_compressed.data(), dim); // Since we're comparing identical normalized vectors, distance should be close to 0 ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_SQ8_InnerProduct failed to match expected distance"; @@ -2505,7 +2510,7 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { v2_orig[i] = float(i + 1.5); } - // Normalize both vectors because it expects normalized vectors + // Normalize vectors so identical vectors have cosine similarity = 1, making distance = 0 spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); @@ -2513,8 +2518,10 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig, dim); std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig, dim); + // Get distance function with nullptr alignment to cover that code path + auto dist_func = Cosine_SQ8_SQ8_GetDistFunc(dim, nullptr, nullptr); float dist = - SQ8_SQ8_Cosine((const void *)v1_compressed.data(), (const void *)v2_compressed.data(), dim); + dist_func((const void *)v1_compressed.data(), (const void *)v2_compressed.data(), dim); // Since we're comparing identical normalized vectors, cosine distance should be close to 0 ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_SQ8_Cosine failed to match expected distance"; @@ -2702,5 +2709,8 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } +// Note: This suite intentionally uses a larger dimension range (64–128) than SQ8OptFuncs. +// It is designed to exercise SQ8–SQ8 cosine implementations, including SIMD paths +// that are only enabled or meaningfully stressed for dimensions >= 64. INSTANTIATE_TEST_SUITE_P(SQ8_SQ8OptFuncs, SQ8_SQ8_SpacesOptimizationTest, testing::Range(64UL, 64 * 2UL + 1)); From d0fd2e493fe24ee267f37c084112b40745f1d75d Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 16:30:22 +0200 Subject: [PATCH 11/51] Remove NEON DOTPROD-optimized distance functions for INT8, UINT8, and SQ8-to-SQ8 calculations --- src/VecSim/spaces/functions/NEON.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h index c962e6bb5..011d41b0d 100644 --- a/src/VecSim/spaces/functions/NEON.h +++ b/src/VecSim/spaces/functions/NEON.h @@ -34,17 +34,4 @@ dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim); -// NEON DOTPROD-optimized functions -dist_func_t Choose_INT8_IP_implementation_NEON_DOTPROD(size_t dim); -dist_func_t Choose_INT8_Cosine_implementation_NEON_DOTPROD(size_t dim); -dist_func_t Choose_INT8_L2_implementation_NEON_DOTPROD(size_t dim); - -dist_func_t Choose_UINT8_IP_implementation_NEON_DOTPROD(size_t dim); -dist_func_t Choose_UINT8_Cosine_implementation_NEON_DOTPROD(size_t dim); -dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim); - -// SQ8-to-SQ8 DOTPROD-optimized distance functions -dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim); -dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim); - } // namespace spaces From 9de61630a1ed0108b364000975374d6ab65b737e Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 16:38:23 +0200 Subject: [PATCH 12/51] Fix vector layout documentation by removing inv_norm from comments in NEON and AVX512 headers --- src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h | 2 +- src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h | 11 +++++++---- src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h index 3a680ac24..90d8787f4 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h @@ -23,7 +23,7 @@ * This allows using VNNI's _mm512_dpwssd_epi32 for efficient integer dot product, * then applying scalar corrections at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] */ // Process 64 uint8 elements using VNNI with multiple accumulators for ILP diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h index 9217fe6ed..3e718b9b7 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h @@ -22,16 +22,17 @@ * * All sums are computed using integer arithmetic, converted to float only at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] */ -// Ones vector for computing element sums via dot product -static const uint8x16_t ones = vdupq_n_u8(1); - // Helper function: computes dot product and element sums using integer arithmetic __attribute__((always_inline)) static inline void SQ8_SQ8_InnerProductStep_NEON_DOTPROD(const uint8_t *&pVec1, const uint8_t *&pVec2, uint32x4_t &dot_sum, uint32x4_t &sum1, uint32x4_t &sum2) { + // Ones vector for computing element sums via dot product (function-local to avoid + // multiple definitions when header is included in multiple translation units) + static const constexpr uint8x16_t ones = vdupq_n_u8(1); + // Load 16 uint8 elements uint8x16_t v1 = vld1q_u8(pVec1); uint8x16_t v2 = vld1q_u8(pVec2); @@ -74,6 +75,8 @@ float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void // Handle residual elements first (0-15 elements) constexpr size_t final_residual = residual % 16; if constexpr (final_residual > 0) { + // Ones vector for computing element sums via dot product + static const uint8x16_t ones = vdupq_n_u8(1); constexpr uint8x16_t mask = { 0xFF, (final_residual >= 2) ? 0xFF : 0, diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index 36be5f428..778c201e1 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -22,7 +22,7 @@ * * This saves 2 FMAs per 4-element step by deferring dequantization to scalar math at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] */ // Helper function with algebraic optimization From 63a46a170b168763aec928ee1043a9f831f02cec Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 28 Dec 2025 16:39:12 +0200 Subject: [PATCH 13/51] Remove 'constexpr' from ones vector declaration in NEON inner product function --- src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h index 3e718b9b7..2dc69eae4 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h @@ -31,7 +31,7 @@ SQ8_SQ8_InnerProductStep_NEON_DOTPROD(const uint8_t *&pVec1, const uint8_t *&pVe uint32x4_t &dot_sum, uint32x4_t &sum1, uint32x4_t &sum2) { // Ones vector for computing element sums via dot product (function-local to avoid // multiple definitions when header is included in multiple translation units) - static const constexpr uint8x16_t ones = vdupq_n_u8(1); + static const uint8x16_t ones = vdupq_n_u8(1); // Load 16 uint8 elements uint8x16_t v1 = vld1q_u8(pVec1); From 525f8dab465d69e90d9f4f6f71cbaee7497d3f4c Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 29 Dec 2025 11:13:53 +0200 Subject: [PATCH 14/51] Refactor distance functions to remove inv_norm parameter and update documentation accordingly --- src/VecSim/spaces/IP/IP.cpp | 11 +- src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h | 12 +-- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 12 +-- .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h | 13 +-- .../spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h | 12 +-- .../spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h | 3 +- src/VecSim/spaces/IP/IP_NEON_SQ8.h | 15 +-- src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 14 +-- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 17 +-- src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 2 +- src/VecSim/spaces/L2/L2.cpp | 2 +- .../spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h | 6 +- .../scripts/run_serialization_benchmarks.sh | 102 ++++++++++++++++++ .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 6 +- tests/unit/test_spaces.cpp | 45 +++----- 15 files changed, 149 insertions(+), 123 deletions(-) create mode 100755 tests/benchmark/scripts/run_serialization_benchmarks.sh diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 645fc739d..b7bc73327 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -15,13 +15,13 @@ using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension, - float min_val, float delta, float inv_norm) { + float min_val, float delta) { float res = 0; for (size_t i = 0; i < dimension; i++) { float dequantized_V2 = (pVect2v[i] * delta + min_val); res += pVect1v[i] * dequantized_V2; } - return res * inv_norm; + return res; } float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { @@ -29,11 +29,11 @@ float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensio const auto *pVect2 = static_cast(pVect2v); // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply // it. it is structured as [quantized values (int8_t * dim)][min_val (float)][delta - // (float)][inv_norm (float)] The last two values are used to dequantize the vector. + // (float)]] The last two values are used to dequantize the vector. const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); // Compute inner product with dequantization - const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, 1.0f); + const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta); return 1.0f - res; } @@ -44,10 +44,9 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { // Get quantization parameters const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); // Compute inner product with dequantization const float res = - FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm); + FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta); return 1.0f - res; } diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h index 007ee333e..4d2927213 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h @@ -100,14 +100,6 @@ float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, template // 0..15 float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Get dequantization parameters from the end of quantized vector - const uint8_t *pVect2 = static_cast(pVect2v); - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - - // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - ip * inv_norm; + // Assume vectors are normalized. + return 1.0f - SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index dd3276de8..3b0303e6d 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -94,14 +94,6 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Get dequantization parameters from the end of quantized vector - const uint8_t *pVect2 = static_cast(pVect2v); - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - - // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - ip * inv_norm; + // Assume vectors are normalized. + return 1.0f - SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index 8ee096622..347ead210 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -25,7 +25,7 @@ * * Also uses multiple accumulators for better instruction-level parallelism. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] */ // Process 16 elements with algebraic optimization @@ -143,13 +143,6 @@ float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void * template // 0..15 float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { - // Get the inverse norm factor stored after min_val and delta - const uint8_t *pVec2 = static_cast(pVec2v); - const float inv_norm = *reinterpret_cast(pVec2 + dimension + 2 * sizeof(float)); - - // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); - - // The cosine similarity is 1 - ip - return 1.0f - ip * inv_norm; + // Assume vectors are normalized. + return 1.0f - SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h index 90d8787f4..69205a49f 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h @@ -19,6 +19,7 @@ * * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * TODO: Can store the vector's norm and sum of elements in the vector data, and use it here. * * This allows using VNNI's _mm512_dpwssd_epi32 for efficient integer dot product, * then applying scalar corrections at the end. @@ -172,19 +173,14 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim template // 0..63 float SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { - float ip = SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, dimension); - return 1.0f - ip; + return 1.0f - SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, dimension); } // SQ8-to-SQ8 Cosine distance function -// Assumes both vectors are normalized. // Returns 1 - (inner_product) template // 0..63 float SQ8_SQ8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { - // Calculate inner product - float ip = SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, dimension); - - // Return cosine similarity - return 1.0f - ip; + // Assume vectors are normalized. + return 1.0f - SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h index 2dc69eae4..bd339e94f 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h @@ -161,9 +161,10 @@ float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pV } // SQ8-to-SQ8 Cosine distance function -// Assumes both vectors are normalized. + // Returns 1 - inner_product template // 0..63 float SQ8_SQ8_CosineSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, size_t dimension) { + // Assumes both vectors are normalized. return 1.0f - SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index 74fc445d5..609c86123 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -19,7 +19,7 @@ * * This saves 1 FMA per 4-element step by deferring dequantization to scalar math at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] */ // Helper function with algebraic optimization @@ -135,15 +135,6 @@ float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { - const uint8_t *pVect2 = static_cast(pVect2v); - - // Get quantization parameters - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - - // Compute inner product with dequantization using the common function - const float res = SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - res * inv_norm; + // Assume vectors are normalized. + return 1.0f - SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index 5e47af2b6..2bbd9f582 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -104,16 +104,6 @@ float SQ8_InnerProductSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { - - const uint8_t *pVect2 = static_cast(pVect2v); - // Get quantization parameters - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - - // Compute inner product with dequantization using the common function - // We need to cast away const for the inner product function, but it doesn't modify the vectors - const float res = SQ8_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - res * inv_norm; + // Assume vectors are normalized. + return 1.0f - SQ8_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index d142fb6f9..7295843f5 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -19,7 +19,7 @@ * * This saves 1 FMA per chunk by deferring dequantization to scalar math at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] */ // Helper function to perform inner product step with algebraic optimization @@ -143,16 +143,7 @@ float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t template float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { - const uint8_t *pVect2 = static_cast(pVect2v); - - // Get quantization parameters - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - - // Compute inner product with dequantization using the common function - const float res = - SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - res * inv_norm; + // Assume vectors are normalized. + return 1.0f - SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, + dimension); } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index e43800a03..7bb494561 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -22,7 +22,7 @@ * * All sums are computed using integer dot product instructions, converted to float only at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] */ // Helper function to perform inner product step using integer dot product diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index a68ea5114..cac76d0e2 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -20,7 +20,7 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect2 = static_cast(pVect2v); // pvect2 is a vector of uint8_t, so we need to dequantize it, normalize it and then multiply // it. it structred as [quantized values (uint8_t * dim)][min_val (float)][delta - // (float)][inv_norm (float)] The last two values are used to dequantize the vector. + // (float)]] The last two values are used to dequantize the vector. const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h index d2775f5be..f76b2d915 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h @@ -21,7 +21,7 @@ static inline void SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, _ // Convert uint8 to float __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - // Dequantize: (val * delta + min_val) * inv_norm + // Dequantize: (val * delta) + min_val __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); // Compute difference @@ -42,7 +42,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 const uint8_t *pVect2 = static_cast(pVect2v); const float *pEnd1 = pVect1 + dimension; - // Get dequantization parameters from the end of pVect2 + // Get dequantization parameters const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); @@ -66,7 +66,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - // Dequantize: (val * delta + min_val) * inv_norm + // Dequantize: (val * delta) + min_val __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); // Compute difference diff --git a/tests/benchmark/scripts/run_serialization_benchmarks.sh b/tests/benchmark/scripts/run_serialization_benchmarks.sh new file mode 100755 index 000000000..087bc6f98 --- /dev/null +++ b/tests/benchmark/scripts/run_serialization_benchmarks.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Serialization Benchmark Script +# Runs HNSW disk serialization benchmarks for different dataset sizes and thread counts + +set -e + +# Configuration +SERIALIZER="./bin/Linux-x86_64-release/hnsw_disk_serializer/hnsw_disk_serializer" +DATA_DIR="tests/benchmark/data" +OUTPUT_DIR="tests/benchmark/data/serialization_benchmarks" +RESULTS_FILE="$OUTPUT_DIR/results.csv" + +# Dataset parameters +DIM=96 +METRIC="L2" +DATA_TYPE="FLOAT32" +M=32 +EFC=200 +BATCH_SIZE=1000 + +# Datasets and thread counts +DATASETS=("100K") +THREADS=(4 8) + +# Get branch name +BRANCH=$(git rev-parse --abbrev-ref HEAD) +BRANCH_SAFE=$(echo "$BRANCH" | tr '/' '-') + +mkdir -p "$OUTPUT_DIR" +# Build +make -j8 +# Initialize results file if it doesn't exist +if [ ! -f "$RESULTS_FILE" ]; then + echo "branch,dataset,threads,time_seconds,vectors_per_second" > "$RESULTS_FILE" +fi + +echo "==========================================" +echo "Serialization Benchmark" +echo "Branch: $BRANCH" +echo "==========================================" + +for dataset in "${DATASETS[@]}"; do + INPUT_FILE="$DATA_DIR/deep.base.${dataset}.fbin" + + if [ ! -f "$INPUT_FILE" ]; then + echo "ERROR: Input file not found: $INPUT_FILE" + continue + fi + + for threads in "${THREADS[@]}"; do + OUTPUT_NAME="$OUTPUT_DIR/deep-${dataset}-L2-dim${DIM}-M${M}-efc${EFC}-${BRANCH_SAFE}-${threads}t" + + echo "" + echo "----------------------------------------" + echo "Dataset: $dataset, Threads: $threads" + echo "Output: $OUTPUT_NAME" + echo "----------------------------------------" + + # Remove existing output if present + rm -rf "$OUTPUT_NAME" + + # Run benchmark and capture time + START_TIME=$(date +%s.%N) + + "$SERIALIZER" \ + "$INPUT_FILE" \ + "$OUTPUT_NAME" \ + "$DIM" "$METRIC" "$DATA_TYPE" \ + "$M" "$EFC" "$threads" "$BATCH_SIZE" + + END_TIME=$(date +%s.%N) + ELAPSED=$(echo "$END_TIME - $START_TIME" | bc) + + # Calculate vectors per second + if [ "$dataset" = "100K" ]; then + NUM_VECTORS=100000 + elif [ "$dataset" = "1M" ]; then + NUM_VECTORS=1000000 + fi + + VPS=$(echo "scale=2; $NUM_VECTORS / $ELAPSED" | bc) + + echo "" + echo "Time: ${ELAPSED}s" + echo "Vectors/sec: $VPS" + + # Append to results + echo "$BRANCH,$dataset,$threads,$ELAPSED,$VPS" >> "$RESULTS_FILE" + done +done + +echo "" +echo "==========================================" +echo "Benchmark Complete" +echo "Results saved to: $RESULTS_FILE" +echo "==========================================" + +# Display results +echo "" +echo "Results:" +cat "$RESULTS_FILE" + diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index 0673dac18..3a3bb5cc3 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -25,9 +25,9 @@ class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { void SetUp(const ::benchmark::State &state) { dim = state.range(0); - // Allocate both vectors with extra space for min, delta and inv_norm - v1 = new uint8_t[dim + sizeof(float) * 3]; - v2 = new uint8_t[dim + sizeof(float) * 3]; + // Allocate both vectors with extra space for min and delta + v1 = new uint8_t[dim + sizeof(float) * 2]; + v2 = new uint8_t[dim + sizeof(float) * 2]; test_utils::populate_float_vec_to_sq8(v1, dim, 123); test_utils::populate_float_vec_to_sq8(v2, dim, 1234); } diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index faf3eb802..bd368dcb0 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -319,8 +319,8 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { } // Create SQ8 compressed version of v2 - // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) - size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + // Size: dim (uint8_t) + min_val (float) + delta (float) + size_t compressed_size = dim * sizeof(uint8_t) + 2 * sizeof(float); if (should_normalize) { spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); @@ -334,7 +334,7 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { max_val = std::max(max_val, v2_orig[i]); } - // Calculate delta and inverse norm + // Calculate delta float delta = (max_val - min_val) / 255.0f; if (delta == 0) delta = 1.0f; // Avoid division by zero @@ -372,7 +372,6 @@ TEST_F(SpacesTest, SQ8_ip_no_optimization_func_test) { TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { common_ip_sq8(true, 0.0f); } TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { - // create a vector with extra space for the norm size_t dim = 5; // Create original vectors @@ -382,8 +381,8 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { v2_orig[i] = float(i + 1.5); } - // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) - size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + // Size: dim (uint8_t) + min_val (float) + delta (float) + size_t compressed_size = dim * sizeof(uint8_t) + 2 * sizeof(float); spaces::GetNormalizeFunc()(v1_orig, dim); // Find min and max for quantization float min_val = v2_orig[0]; @@ -392,7 +391,7 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { min_val = std::min(min_val, v2_orig[i]); max_val = std::max(max_val, v2_orig[i]); } - // Calculate delta and inverse norm + // Calculate delta float delta = (max_val - min_val) / 255.0f; if (delta == 0) delta = 1.0f; // Avoid division by zero @@ -408,17 +407,9 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { normalized = std::max(0.0f, std::min(255.0f, normalized)); quant_values[i] = static_cast(std::round(normalized)); } - // Calculate inverse norm from decompressed values - float inv_norm = 0.0f; - for (size_t i = 0; i < dim; i++) { - float decompressed_value = min_val + quant_values[i] * delta; - inv_norm += decompressed_value * decompressed_value; - } - inv_norm = 1.0f / std::sqrt(inv_norm); // Store parameters params[0] = min_val; params[1] = delta; - params[2] = inv_norm; float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim); ASSERT_NEAR(dist, 0.0f, 0.000001f) << "SQ8_Cosine failed to match expected distance"; @@ -434,8 +425,8 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { v2_orig[i] = float(i + 1.5); } - // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) - size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + // Size: dim (uint8_t) + min_val (float) + delta (float) + size_t compressed_size = dim * sizeof(uint8_t) + 2 * sizeof(float); spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); // Find min and max for quantization @@ -445,7 +436,7 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { min_val = std::min(min_val, v2_orig[i]); max_val = std::max(max_val, v2_orig[i]); } - // Calculate delta and inverse norm + // Calculate delta float delta = (max_val - min_val) / 255.0f; if (delta == 0) delta = 1.0f; // Avoid division by zero @@ -461,17 +452,10 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { normalized = std::max(0.0f, std::min(255.0f, normalized)); quant_values[i] = static_cast(std::round(normalized)); } - // Calculate inverse norm from decompressed values - float inv_norm = 0.0f; - for (size_t i = 0; i < dim; i++) { - float decompressed_value = min_val + quant_values[i] * delta; - inv_norm += decompressed_value * decompressed_value; - } - inv_norm = 1.0f / std::sqrt(inv_norm); + // Store parameters params[0] = min_val; params[1] = delta; - params[2] = inv_norm; float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim); ASSERT_NEAR(dist, 0.0f, 0.00001f) << "SQ8_Cosine failed to match expected distance"; @@ -2067,8 +2051,8 @@ std::vector CreateSQ8CompressedVector(const float *original, size_t dim // Create a copy of the original vector that we can modify std::vector vec_copy(original, original + dim); - // Size: dim (uint8_t) + min_val (float) + delta (float) + norm (float) - size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + // Size: dim (uint8_t) + min_val (float) + delta (float) + size_t compressed_size = dim * sizeof(uint8_t) + 2 * sizeof(float); std::vector compressed(compressed_size); // Find min and max for quantization @@ -2086,21 +2070,16 @@ std::vector CreateSQ8CompressedVector(const float *original, size_t dim // Quantize vector uint8_t *quant_values = compressed.data(); - float norm = 0.0f; // Quantize each value for (size_t i = 0; i < dim; i++) { float normalized = (vec_copy[i] - min_val) / delta; normalized = std::max(0.0f, std::min(255.0f, normalized)); quant_values[i] = static_cast(std::round(normalized)); - norm += (quant_values[i] * delta + min_val) * (quant_values[i] * delta + min_val); } - - float inv_norm = 1.0f / std::sqrt(norm); // Store parameters float *params = reinterpret_cast(quant_values + dim); params[0] = min_val; params[1] = delta; - params[2] = inv_norm; return compressed; } From 13a477b39383c962798ee96779eef4d1a8333c06 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 29 Dec 2025 11:27:13 +0200 Subject: [PATCH 15/51] Update SQ8 Cosine test to normalize both input vectors and adjust distance assertion tolerance --- tests/unit/test_spaces.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index bd368dcb0..b22ef947c 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -384,6 +384,7 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { // Size: dim (uint8_t) + min_val (float) + delta (float) size_t compressed_size = dim * sizeof(uint8_t) + 2 * sizeof(float); spaces::GetNormalizeFunc()(v1_orig, dim); + spaces::GetNormalizeFunc()(v2_orig, dim); // Find min and max for quantization float min_val = v2_orig[0]; float max_val = v2_orig[0]; @@ -412,7 +413,7 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { params[1] = delta; float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.000001f) << "SQ8_Cosine failed to match expected distance"; + ASSERT_NEAR(dist, 0.0f, 0.001f) << "SQ8_Cosine failed to match expected distance"; } TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // create a vector with extra space for the norm @@ -458,7 +459,7 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { params[1] = delta; float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.00001f) << "SQ8_Cosine failed to match expected distance"; + ASSERT_NEAR(dist, 0.0f, 0.0001f) << "SQ8_Cosine failed to match expected distance"; } /* ======================== Test Getters ======================== */ From c18000eb9679a30cb115cd670073dc6857ad74af Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 29 Dec 2025 12:06:15 +0200 Subject: [PATCH 16/51] Rename 'compressed' to 'quantized' in SQ8 functions for clarity and consistency --- tests/unit/test_spaces.cpp | 180 ++++++++++++++++++------------------- 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index b22ef947c..9343d295d 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -318,9 +318,9 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { v2_orig[i] = float(i + 1.5); } - // Create SQ8 compressed version of v2 + // Create SQ8 quantized version of v2 // Size: dim (uint8_t) + min_val (float) + delta (float) - size_t compressed_size = dim * sizeof(uint8_t) + 2 * sizeof(float); + size_t quantized_size = dim * sizeof(uint8_t) + 2 * sizeof(float); if (should_normalize) { spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); @@ -339,10 +339,10 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { if (delta == 0) delta = 1.0f; // Avoid division by zero - std::vector v2_compressed(compressed_size); + std::vector v2_quantized(quantized_size); // Quantize v2 - uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + uint8_t *quant_values = reinterpret_cast(v2_quantized.data()); float *params = reinterpret_cast(quant_values + dim); // Store parameters @@ -356,7 +356,7 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { quant_values[i] = static_cast(std::round(normalized)); } - float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim); + float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_quantized.data(), dim); // Since we're comparing identical vectors, the inner product distance should be close to // expected @@ -382,7 +382,7 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { } // Size: dim (uint8_t) + min_val (float) + delta (float) - size_t compressed_size = dim * sizeof(uint8_t) + 2 * sizeof(float); + size_t quantized_size = dim * sizeof(uint8_t) + 2 * sizeof(float); spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); // Find min and max for quantization @@ -397,22 +397,22 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { if (delta == 0) delta = 1.0f; // Avoid division by zero - // Compress v2 - std::vector v2_compressed(compressed_size); - uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + // Quantize v2 + std::vector v2_quantized(quantized_size); + uint8_t *quant_values = reinterpret_cast(v2_quantized.data()); float *params = reinterpret_cast(quant_values + dim); // Quantize each value for (size_t i = 0; i < dim; i++) { - float normalized = (v2_orig[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, normalized)); - quant_values[i] = static_cast(std::round(normalized)); + float quantized = (v2_orig[i] - min_val) / delta; + quantized = std::max(0.0f, std::min(255.0f, quantized)); + quant_values[i] = static_cast(std::round(quantized)); } // Store parameters params[0] = min_val; params[1] = delta; - float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim); + float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_quantized.data(), dim); ASSERT_NEAR(dist, 0.0f, 0.001f) << "SQ8_Cosine failed to match expected distance"; } TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { @@ -427,7 +427,7 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { } // Size: dim (uint8_t) + min_val (float) + delta (float) - size_t compressed_size = dim * sizeof(uint8_t) + 2 * sizeof(float); + size_t quantized_size = dim * sizeof(uint8_t) + 2 * sizeof(float); spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); // Find min and max for quantization @@ -442,24 +442,24 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { if (delta == 0) delta = 1.0f; // Avoid division by zero - // Compress v2 - std::vector v2_compressed(compressed_size); - uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + // Quantize v2 + std::vector v2_quantized(quantized_size); + uint8_t *quant_values = reinterpret_cast(v2_quantized.data()); float *params = reinterpret_cast(quant_values + dim); // Quantize each value for (size_t i = 0; i < dim; i++) { - float normalized = (v2_orig[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, normalized)); - quant_values[i] = static_cast(std::round(normalized)); + float quantized = (v2_orig[i] - min_val) / delta; + quantized = std::max(0.0f, std::min(255.0f, quantized)); + quant_values[i] = static_cast(std::round(quantized)); } // Store parameters params[0] = min_val; params[1] = delta; - float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.0001f) << "SQ8_Cosine failed to match expected distance"; + float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_quantized.data(), dim); + ASSERT_NEAR(dist, 0.0f, 0.0001f) << "SQ8_L2Sqr failed to match expected distance"; } /* ======================== Test Getters ======================== */ @@ -2047,14 +2047,14 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) { INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest, testing::Range(32UL, 64 * 2UL + 1)); -// Helper function to create SQ8 compressed vector -std::vector CreateSQ8CompressedVector(const float *original, size_t dim) { +// Helper function to create SQ8 quantized vector +std::vector CreateSQ8QuantizedVector(const float *original, size_t dim) { // Create a copy of the original vector that we can modify std::vector vec_copy(original, original + dim); // Size: dim (uint8_t) + min_val (float) + delta (float) - size_t compressed_size = dim * sizeof(uint8_t) + 2 * sizeof(float); - std::vector compressed(compressed_size); + size_t quantized_size = dim * sizeof(uint8_t) + 2 * sizeof(float); + std::vector quantized(quantized_size); // Find min and max for quantization float min_val = vec_copy[0]; @@ -2070,7 +2070,7 @@ std::vector CreateSQ8CompressedVector(const float *original, size_t dim delta = 1.0f; // Avoid division by zero // Quantize vector - uint8_t *quant_values = compressed.data(); + uint8_t *quant_values = quantized.data(); // Quantize each value for (size_t i = 0; i < dim; i++) { float normalized = (vec_copy[i] - min_val) / delta; @@ -2082,7 +2082,7 @@ std::vector CreateSQ8CompressedVector(const float *original, size_t dim params[0] = min_val; params[1] = delta; - return compressed; + return quantized; } class SQ8SpacesOptimizationTest : public testing::TestWithParam {}; @@ -2099,8 +2099,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { v2_orig[i] = float(i * 0.75 + 1.0); } - // Create SQ8 compressed version of v2 - std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + // Create SQ8 quantized version of v2 + std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2108,7 +2108,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { }; dist_func_t arch_opt_func; - float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); + float baseline = SQ8_L2Sqr(v1_orig.data(), v2_quantized.data(), dim); // Test different optimizations based on CPU features #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { @@ -2116,7 +2116,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) << "AVX512 with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2129,7 +2129,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2142,7 +2142,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset avx flag as well, so we'll choose the next optimization (SSE). @@ -2155,7 +2155,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) << "SSE with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; // Unset sse flag as well, so we'll choose the next optimization (default). @@ -2169,7 +2169,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) << "SVE2 with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve2 flag as well, so we'll choose the next option (default). @@ -2182,7 +2182,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) << "SVE with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve flag as well, so we'll choose the next option (default). @@ -2195,7 +2195,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) << "NEON with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2207,7 +2207,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2225,10 +2225,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { } spaces::GetNormalizeFunc()(v1_orig.data(), dim); - // Create SQ8 compressed version of v2 - std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + // Create SQ8 quantized version of v2 + std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); // print min and delta - float *params = reinterpret_cast(v2_compressed.data() + dim); + float *params = reinterpret_cast(v2_quantized.data() + dim); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2236,7 +2236,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { }; dist_func_t arch_opt_func; - float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); + float baseline = SQ8_InnerProduct(v1_orig.data(), v2_quantized.data(), dim); // Test different optimizations based on CPU features #ifdef OPT_AVX512_F_BW_VL_VNNI @@ -2245,7 +2245,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "AVX512 with dim " << dim; optimization.avx512f = 0; } @@ -2256,7 +2256,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "AVX with dim " << dim; optimization.fma3 = 0; } @@ -2267,7 +2267,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "AVX with dim " << dim; optimization.avx2 = 0; } @@ -2278,7 +2278,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "SSE with dim " << dim; optimization.sse4_1 = 0; } @@ -2289,7 +2289,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "SVE2 with dim " << dim; optimization.sve2 = 0; } @@ -2300,7 +2300,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "SVE with dim " << dim; optimization.sve = 0; } @@ -2311,7 +2311,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "NEON with dim " << dim; optimization.asimd = 0; } @@ -2322,7 +2322,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2349,8 +2349,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { spaces::GetNormalizeFunc()(v1_orig.data(), dim); spaces::GetNormalizeFunc()(v2_orig.data(), dim); - // Create SQ8 compressed version of v2 (with normalization) - std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + // Create SQ8 quantized version of v2 (with normalization) + std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2358,7 +2358,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { }; dist_func_t arch_opt_func; - float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim); + float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); #ifdef OPT_SVE2 if (optimization.sve2) { @@ -2366,7 +2366,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "SVE2 with dim " << dim; optimization.sve2 = 0; } @@ -2377,7 +2377,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "SVE with dim " << dim; optimization.sve = 0; } @@ -2388,7 +2388,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "NEON with dim " << dim; optimization.asimd = 0; } @@ -2401,7 +2401,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "AVX512 with dim " << dim; optimization.avx512f = 0; } @@ -2412,7 +2412,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "AVX with dim " << dim; optimization.fma3 = 0; } @@ -2423,7 +2423,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "AVX with dim " << dim; optimization.avx2 = 0; } @@ -2435,7 +2435,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "SSE with dim " << dim; optimization.sse4_1 = 0; } @@ -2445,7 +2445,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2467,14 +2467,14 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); - // Create SQ8 compressed versions of both vectors - std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig, dim); - std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig, dim); + // Create SQ8 quantized versions of both vectors + std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig, dim); + std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig, dim); // Get distance function with nullptr alignment to cover that code path auto dist_func = IP_SQ8_SQ8_GetDistFunc(dim, nullptr, nullptr); float dist = - dist_func((const void *)v1_compressed.data(), (const void *)v2_compressed.data(), dim); + dist_func((const void *)v1_quantized.data(), (const void *)v2_quantized.data(), dim); // Since we're comparing identical normalized vectors, distance should be close to 0 ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_SQ8_InnerProduct failed to match expected distance"; @@ -2494,14 +2494,14 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); - // Create SQ8 compressed versions of both vectors - std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig, dim); - std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig, dim); + // Create SQ8 quantized versions of both vectors + std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig, dim); + std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig, dim); // Get distance function with nullptr alignment to cover that code path auto dist_func = Cosine_SQ8_SQ8_GetDistFunc(dim, nullptr, nullptr); float dist = - dist_func((const void *)v1_compressed.data(), (const void *)v2_compressed.data(), dim); + dist_func((const void *)v1_quantized.data(), (const void *)v2_quantized.data(), dim); // Since we're comparing identical normalized vectors, cosine distance should be close to 0 ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_SQ8_Cosine failed to match expected distance"; @@ -2525,12 +2525,12 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { spaces::GetNormalizeFunc()(v1_orig.data(), dim); spaces::GetNormalizeFunc()(v2_orig.data(), dim); - // Create SQ8 compressed versions of both vectors - std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig.data(), dim); - std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + // Create SQ8 quantized versions of both vectors + std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); + std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); dist_func_t arch_opt_func; - float baseline = SQ8_SQ8_InnerProduct(v1_compressed.data(), v2_compressed.data(), dim); + float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); #ifdef OPT_SVE2 if (optimization.sve2) { @@ -2538,7 +2538,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "SVE2 with dim " << dim; optimization.sve2 = 0; } @@ -2549,7 +2549,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "SVE with dim " << dim; optimization.sve = 0; } @@ -2560,7 +2560,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "NEON_DOTPROD with dim " << dim; optimization.asimddp = 0; } @@ -2571,7 +2571,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "NEON with dim " << dim; optimization.asimd = 0; } @@ -2583,7 +2583,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "AVX512 with dim " << dim; optimization.avx512f = 0; } @@ -2594,7 +2594,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2615,12 +2615,12 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { spaces::GetNormalizeFunc()(v1_orig.data(), dim); spaces::GetNormalizeFunc()(v2_orig.data(), dim); - // Create SQ8 compressed versions of both vectors - std::vector v1_compressed = CreateSQ8CompressedVector(v1_orig.data(), dim); - std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + // Create SQ8 quantized versions of both vectors + std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); + std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); dist_func_t arch_opt_func; - float baseline = SQ8_SQ8_Cosine(v1_compressed.data(), v2_compressed.data(), dim); + float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); #ifdef OPT_SVE2 if (optimization.sve2) { @@ -2628,7 +2628,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "SVE2 with dim " << dim; optimization.sve2 = 0; } @@ -2639,7 +2639,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "SVE with dim " << dim; optimization.sve = 0; } @@ -2650,7 +2650,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "NEON_DOTPROD with dim " << dim; optimization.asimddp = 0; } @@ -2661,7 +2661,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "NEON with dim " << dim; optimization.asimd = 0; } @@ -2673,7 +2673,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "AVX512 with dim " << dim; optimization.avx512f = 0; } @@ -2684,7 +2684,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_compressed.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } From bbf810ee2376b6af1b75ea613ea64b41373c22a6 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 29 Dec 2025 15:22:24 +0200 Subject: [PATCH 17/51] Implement SQ8-to-SQ8 distance functions with precomputed sum and norm using AVX512 VNNI; add benchmarks and tests for new functionality --- src/VecSim/spaces/IP/IP.cpp | 3 +- .../spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h | 2 +- ...P_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h | 163 ++++++++++++++++++ .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 16 ++ .../spaces/functions/AVX512F_BW_VL_VNNI.h | 4 + .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 35 ++++ tests/unit/test_spaces.cpp | 136 +++++++++++++++ tests/utils/tests_utils.h | 55 +++++- 8 files changed, 407 insertions(+), 7 deletions(-) create mode 100644 src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index b7bc73327..02f5e383e 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -45,8 +45,7 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); // Compute inner product with dequantization - const float res = - FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta); + const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta); return 1.0f - res; } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h index 69205a49f..e79b179e4 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h @@ -19,7 +19,7 @@ * * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 - * TODO: Can store the vector's norm and sum of elements in the vector data, and use it here. + * TODO: Can store the vector's sum and norm of elements in the vector data, and use it here. * * This allows using VNNI's _mm512_dpwssd_epi32 for efficient integer dot product, * then applying scalar corrections at the end. diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h new file mode 100644 index 000000000..7533bd405 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +/** + * SQ8-to-SQ8 distance functions using AVX512 VNNI with precomputed sum and norm. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized. + * + * This version uses precomputed sum and norm stored in the vector data, + * eliminating the need to compute them during distance calculation. + * + * Uses algebraic optimization to leverage integer VNNI instructions: + * + * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) + * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * + * Since sum = Σv[i] is precomputed, we only need to compute the dot product Σ(v1[i]*v2[i]). + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm + * (float)] + */ + +// Process 64 uint8 elements using VNNI with multiple accumulators for ILP (dot product only) +static inline void SQ8_SQ8_Precomputed_InnerProductStep64(const uint8_t *pVec1, + const uint8_t *pVec2, __m512i &dot_acc0, + __m512i &dot_acc1) { + // Load 64 bytes from each vector + __m512i v1_full = _mm512_loadu_si512(reinterpret_cast(pVec1)); + __m512i v2_full = _mm512_loadu_si512(reinterpret_cast(pVec2)); + + // Extract lower and upper 256-bit halves + __m256i v1_lo = _mm512_castsi512_si256(v1_full); + __m256i v1_hi = _mm512_extracti64x4_epi64(v1_full, 1); + __m256i v2_lo = _mm512_castsi512_si256(v2_full); + __m256i v2_hi = _mm512_extracti64x4_epi64(v2_full, 1); + + // Convert to int16 (zero-extend) and compute dot products using VNNI + // dpwssd: multiply pairs of int16, sum pairs to int32, accumulate + dot_acc0 = + _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), _mm512_cvtepu8_epi16(v2_lo)); + dot_acc1 = + _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), _mm512_cvtepu8_epi16(v2_hi)); +} + +// Process 32 uint8 elements using VNNI (dot product only) +static inline void SQ8_SQ8_Precomputed_InnerProductStep32(const uint8_t *pVec1, + const uint8_t *pVec2, __m512i &dot_acc) { + // Load 32 bytes from each vector + __m256i v1_256 = _mm256_loadu_si256(reinterpret_cast(pVec1)); + __m256i v2_256 = _mm256_loadu_si256(reinterpret_cast(pVec2)); + + // Convert to int16 (zero-extend) and compute dot product using VNNI + dot_acc = + _mm512_dpwssd_epi32(dot_acc, _mm512_cvtepu8_epi16(v1_256), _mm512_cvtepu8_epi16(v2_256)); +} + +// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm +template // 0..63 +float SQ8_SQ8_Precomputed_InnerProductImp(const void *pVec1v, const void *pVec2v, + size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + const uint8_t *pEnd1 = pVec1 + dimension; + + // Get dequantization parameters and precomputed values from the end of pVec1 + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of uint8 elements + // const float norm1 = params1[3]; // Precomputed norm (sum of squares) - not used for IP + + // Get dequantization parameters and precomputed values from the end of pVec2 + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of uint8 elements + // const float norm2 = params2[3]; // Precomputed norm (sum of squares) - not used for IP + + // Multiple accumulators for instruction-level parallelism (dot product only) + __m512i dot_acc0 = _mm512_setzero_si512(); + __m512i dot_acc1 = _mm512_setzero_si512(); + + // Handle residual first (0..63 elements) + if constexpr (residual > 0) { + if constexpr (residual < 32) { + // Handle less than 32 elements with mask + constexpr __mmask32 mask = (1LU << residual) - 1; + __m256i v1_256 = _mm256_maskz_loadu_epi8(mask, pVec1); + __m256i v2_256 = _mm256_maskz_loadu_epi8(mask, pVec2); + + // Convert to int16 and compute dot product + dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_256), + _mm512_cvtepu8_epi16(v2_256)); + } else if constexpr (residual == 32) { + // Exactly 32 elements + SQ8_SQ8_Precomputed_InnerProductStep32(pVec1, pVec2, dot_acc0); + } else { + // 33-63 elements: use masked 64-byte load + constexpr __mmask64 mask = (1LLU << residual) - 1; + __m512i v1_full = _mm512_maskz_loadu_epi8(mask, pVec1); + __m512i v2_full = _mm512_maskz_loadu_epi8(mask, pVec2); + + // Extract halves and compute dot products + __m256i v1_lo = _mm512_castsi512_si256(v1_full); + __m256i v1_hi = _mm512_extracti64x4_epi64(v1_full, 1); + __m256i v2_lo = _mm512_castsi512_si256(v2_full); + __m256i v2_hi = _mm512_extracti64x4_epi64(v2_full, 1); + + dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), + _mm512_cvtepu8_epi16(v2_lo)); + dot_acc1 = _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), + _mm512_cvtepu8_epi16(v2_hi)); + } + pVec1 += residual; + pVec2 += residual; + } + + // Process full 64-byte chunks + while (pVec1 < pEnd1) { + SQ8_SQ8_Precomputed_InnerProductStep64(pVec1, pVec2, dot_acc0, dot_acc1); + pVec1 += 64; + pVec2 += 64; + } + + // Combine dot product accumulators and reduce + __m512i dot_total = _mm512_add_epi32(dot_acc0, dot_acc1); + int64_t dot_product = _mm512_reduce_add_epi32(dot_total); + + // Apply the algebraic formula using precomputed sums: + // IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + float result = delta1 * delta2 * static_cast(dot_product) + delta1 * min2 * sum1 + + delta2 * min1 * sum2 + static_cast(dimension) * min1 * min2; + + return result; +} + +// SQ8-to-SQ8 Inner Product distance function with precomputed sum/norm +// Returns 1 - inner_product (distance form) +template // 0..63 +float SQ8_SQ8_Precomputed_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, + const void *pVec2v, + size_t dimension) { + return 1.0f - SQ8_SQ8_Precomputed_InnerProductImp(pVec1v, pVec2v, dimension); +} + +// SQ8-to-SQ8 Cosine distance function with precomputed sum/norm +// Returns 1 - (inner_product) +template // 0..63 +float SQ8_SQ8_Precomputed_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Assume vectors are normalized. + return 1.0f - SQ8_SQ8_Precomputed_InnerProductImp(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index c8988daf2..09f39480e 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -18,6 +18,7 @@ #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h" #include "VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h" +#include "VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h" namespace spaces { @@ -87,6 +88,21 @@ dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_ return ret_dist_func; } +// SQ8-to-SQ8 distance functions with precomputed sum and norm +dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, + SQ8_SQ8_Precomputed_InnerProductSIMD64_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, + SQ8_SQ8_Precomputed_CosineSIMD64_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 205235bbd..5bcdd63de 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -28,4 +28,8 @@ dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); +// SQ8-to-SQ8 distance functions with precomputed sum and norm +dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); + } // namespace spaces diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index 3a3bb5cc3..96aff6df6 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -37,6 +37,35 @@ class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { } }; +/** + * SQ8-to-SQ8 Precomputed benchmarks: Same as above but with precomputed sum and norm. + * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + */ +class BM_VecSimSpaces_SQ8_SQ8_Precomputed : public benchmark::Fixture { +protected: + std::mt19937 rng; + size_t dim; + uint8_t *v1; + uint8_t *v2; + +public: + BM_VecSimSpaces_SQ8_SQ8_Precomputed() { rng.seed(47); } + ~BM_VecSimSpaces_SQ8_SQ8_Precomputed() = default; + + void SetUp(const ::benchmark::State &state) { + dim = state.range(0); + // Allocate both vectors with extra space for min, delta, sum, and norm (4 floats) + v1 = new uint8_t[dim + sizeof(float) * 4]; + v2 = new uint8_t[dim + sizeof(float) * 4]; + test_utils::populate_float_vec_to_sq8_with_sum_norm(v1, dim, 123); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2, dim, 1234); + } + void TearDown(const ::benchmark::State &state) { + delete[] v1; + delete[] v2; + } +}; + #ifdef CPU_FEATURES_ARCH_AARCH64 cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; @@ -70,6 +99,12 @@ INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNN avx512_f_bw_vl_vnni_supported); INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); + +// AVX512_F_BW_VL_VNNI SQ8-to-SQ8 Precomputed functions (using precomputed sum and norm) +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8_Precomputed, + AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8_Precomputed, + AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); #endif // AVX512_F_BW_VL_VNNI #endif // x86_64 diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 9343d295d..721951711 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2694,3 +2694,139 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { // that are only enabled or meaningfully stressed for dimensions >= 64. INSTANTIATE_TEST_SUITE_P(SQ8_SQ8OptFuncs, SQ8_SQ8_SpacesOptimizationTest, testing::Range(64UL, 64 * 2UL + 1)); + +/* ======================== Tests SQ8_SQ8 Precomputed ========================= */ + +// Helper function to create SQ8 quantized vector with precomputed sum and norm +// Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] +std::vector CreateSQ8QuantizedVectorWithSumNorm(const float *original, size_t dim) { + std::vector vec_copy(original, original + dim); + + // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + norm (float) + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector quantized(quantized_size); + + // Find min and max for quantization + float min_val = vec_copy[0]; + float max_val = vec_copy[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, vec_copy[i]); + max_val = std::max(max_val, vec_copy[i]); + } + + // Calculate delta + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; + + // Quantize vector and compute sum and norm + uint8_t *quant_values = quantized.data(); + float sum = 0.0f; + float norm = 0.0f; + for (size_t i = 0; i < dim; i++) { + float normalized = (vec_copy[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + sum += static_cast(quant_values[i]); + norm += static_cast(quant_values[i]) * static_cast(quant_values[i]); + } + + // Store parameters: [min, delta, sum, norm] + float *params = reinterpret_cast(quant_values + dim); + params[0] = min_val; + params[1] = delta; + params[2] = sum; + params[3] = norm; + + return quantized; +} + +class SQ8_SQ8_Precomputed_SpacesOptimizationTest : public testing::TestWithParam {}; + +TEST_P(SQ8_SQ8_Precomputed_SpacesOptimizationTest, SQ8_SQ8_Precomputed_InnerProductTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + // Create original vectors + std::vector v1_orig(dim); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } + + // Normalize both vectors + spaces::GetNormalizeFunc()(v1_orig.data(), dim); + spaces::GetNormalizeFunc()(v2_orig.data(), dim); + + // Create SQ8 quantized versions (standard format for baseline) + std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); + std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); + + // Create SQ8 quantized versions with precomputed sum/norm + std::vector v1_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v1_orig.data(), dim); + std::vector v2_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v2_orig.data(), dim); + + // Baseline: original SQ8_SQ8 implementation + float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); + +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + // Test precomputed version + auto precomputed_func = + spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + float precomputed_result = + precomputed_func(v1_precomputed.data(), v2_precomputed.data(), dim); + + // Precomputed should match baseline (within tolerance for float precision) + ASSERT_NEAR(baseline, precomputed_result, 0.01) + << "AVX512 Precomputed IP with dim " << dim; + } +#endif +} + +TEST_P(SQ8_SQ8_Precomputed_SpacesOptimizationTest, SQ8_SQ8_Precomputed_CosineTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + // Create original vectors + std::vector v1_orig(dim); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } + + // Normalize both vectors + spaces::GetNormalizeFunc()(v1_orig.data(), dim); + spaces::GetNormalizeFunc()(v2_orig.data(), dim); + + // Create SQ8 quantized versions (standard format for baseline) + std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); + std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); + + // Create SQ8 quantized versions with precomputed sum/norm + std::vector v1_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v1_orig.data(), dim); + std::vector v2_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v2_orig.data(), dim); + + // Baseline: original SQ8_SQ8 implementation + float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); + +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + // Test precomputed version + auto precomputed_func = + spaces::Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + float precomputed_result = + precomputed_func(v1_precomputed.data(), v2_precomputed.data(), dim); + + // Precomputed should match baseline (within tolerance for float precision) + ASSERT_NEAR(baseline, precomputed_result, 0.01) + << "AVX512 Precomputed Cosine with dim " << dim; + } +#endif +} + +// Test suite covers dimensions 64-128 to exercise AVX512 SIMD paths +INSTANTIATE_TEST_SUITE_P(SQ8_SQ8_PrecomputedOptFuncs, SQ8_SQ8_Precomputed_SpacesOptimizationTest, + testing::Range(64UL, 64 * 2UL + 1)); diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 64211b7ef..fd1d41bd6 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -78,20 +78,16 @@ static void quantize_float_vec_to_uint8(float *v, size_t dim, uint8_t *qv, int s float delta = (max_val - min_val) / 255.0f; if (delta == 0) delta = 1.0f; // Avoid division by zero - float norm = 0.0f; // Quantize each value for (size_t i = 0; i < dim; i++) { float normalized = (v[i] - min_val) / delta; normalized = std::max(0.0f, std::min(255.0f, normalized)); qv[i] = static_cast(std::round(normalized)); - norm += (qv[i] * delta + min_val) * (qv[i] * delta + min_val); } - float inv_norm = 1.0f / std::sqrt(norm); // Store parameters float *params = reinterpret_cast(qv + dim); params[0] = min_val; params[1] = delta; - params[2] = inv_norm; } static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { @@ -105,6 +101,57 @@ static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { quantize_float_vec_to_uint8(vec.data(), dim, v, seed); } +/** + * Quantize float vector to SQ8 with precomputed sum and norm. + * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + * where sum = Σv[i] and norm = Σv[i]² (sum of squares of uint8 elements) + */ +static void quantize_float_vec_to_uint8_with_sum_norm(float *v, size_t dim, uint8_t *qv, + int seed = 1234) { + float min_val = v[0]; + float max_val = v[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, v[i]); + max_val = std::max(max_val, v[i]); + } + // Calculate delta + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero + + // Quantize each value and compute sum and norm + float sum = 0.0f; + float norm = 0.0f; + for (size_t i = 0; i < dim; i++) { + float normalized = (v[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + qv[i] = static_cast(std::round(normalized)); + sum += static_cast(qv[i]); + norm += static_cast(qv[i]) * static_cast(qv[i]); + } + + // Store parameters: [min, delta, sum, norm] + float *params = reinterpret_cast(qv + dim); + params[0] = min_val; + params[1] = delta; + params[2] = sum; + params[3] = norm; +} + +/** + * Populate a float vector and quantize to SQ8 with precomputed sum and norm. + * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + */ +static void populate_float_vec_to_sq8_with_sum_norm(uint8_t *v, size_t dim, int seed = 1234) { + std::mt19937 gen(seed); + std::uniform_real_distribution dis(-1.0f, 1.0f); + std::vector vec(dim); + for (size_t i = 0; i < dim; i++) { + vec[i] = dis(gen); + } + quantize_float_vec_to_uint8_with_sum_norm(vec.data(), dim, v, seed); +} + template float integral_compute_norm(const datatype *vec, size_t dim) { return spaces::IntegralType_ComputeNorm(vec, dim); From dbbb7d9b2453922cc9de1b37caa12e992999734d Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 29 Dec 2025 16:00:43 +0200 Subject: [PATCH 18/51] Add edge case tests for SQ8-to-SQ8 precomputed cosine distance functions --- tests/unit/test_spaces.cpp | 215 +++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 721951711..1b82b89e6 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2830,3 +2830,218 @@ TEST_P(SQ8_SQ8_Precomputed_SpacesOptimizationTest, SQ8_SQ8_Precomputed_CosineTes // Test suite covers dimensions 64-128 to exercise AVX512 SIMD paths INSTANTIATE_TEST_SUITE_P(SQ8_SQ8_PrecomputedOptFuncs, SQ8_SQ8_Precomputed_SpacesOptimizationTest, testing::Range(64UL, 64 * 2UL + 1)); + +// Additional test suite for smaller dimensions (1-63) to test residual handling +INSTANTIATE_TEST_SUITE_P(SQ8_SQ8_PrecomputedOptFuncs_SmallDim, + SQ8_SQ8_Precomputed_SpacesOptimizationTest, + testing::Values(1UL, 7UL, 15UL, 16UL, 31UL, 32UL, 33UL, 48UL, 63UL)); + +// Test suite for larger dimensions to stress-test the implementation +INSTANTIATE_TEST_SUITE_P(SQ8_SQ8_PrecomputedOptFuncs_LargeDim, + SQ8_SQ8_Precomputed_SpacesOptimizationTest, + testing::Values(256UL, 512UL, 768UL, 1024UL, 1536UL)); + +#ifdef OPT_AVX512_F_BW_VL_VNNI +// Test self-distance: distance to itself should be 0 for cosine (normalized vectors) +TEST(SQ8_SQ8_Precomputed_EdgeCases, SelfDistanceCosine) { + auto optimization = getCpuOptimizationFeatures(); + if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { + GTEST_SKIP() << "AVX512 VNNI not available"; + } + + size_t dim = 128; + std::vector v_orig(dim); + + // Create a normalized vector + std::mt19937 rng(42); + std::uniform_real_distribution dist(-1.0f, 1.0f); + float norm = 0.0f; + for (size_t i = 0; i < dim; i++) { + v_orig[i] = dist(rng); + norm += v_orig[i] * v_orig[i]; + } + norm = std::sqrt(norm); + for (size_t i = 0; i < dim; i++) { + v_orig[i] /= norm; + } + + auto v_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_orig.data(), dim); + + auto precomputed_func = + spaces::Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + float self_distance = precomputed_func(v_quantized.data(), v_quantized.data(), dim); + + // Self-distance for cosine should be close to 0 + ASSERT_NEAR(self_distance, 0.0f, 0.02f) << "Self-distance should be ~0 for cosine"; +} + +// Test symmetry: dist(v1, v2) == dist(v2, v1) +TEST(SQ8_SQ8_Precomputed_EdgeCases, SymmetryTest) { + auto optimization = getCpuOptimizationFeatures(); + if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { + GTEST_SKIP() << "AVX512 VNNI not available"; + } + + size_t dim = 128; + std::vector v1_orig(dim), v2_orig(dim); + + std::mt19937 rng(123); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = dist(rng); + v2_orig[i] = dist(rng); + } + + auto v1_quantized = CreateSQ8QuantizedVectorWithSumNorm(v1_orig.data(), dim); + auto v2_quantized = CreateSQ8QuantizedVectorWithSumNorm(v2_orig.data(), dim); + + auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + auto cosine_func = + spaces::Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + + float ip_12 = ip_func(v1_quantized.data(), v2_quantized.data(), dim); + float ip_21 = ip_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_NEAR(ip_12, ip_21, 1e-6f) << "IP should be symmetric"; + + float cos_12 = cosine_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = cosine_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Cosine should be symmetric"; +} + +// Test with zero vector +TEST(SQ8_SQ8_Precomputed_EdgeCases, ZeroVectorTest) { + auto optimization = getCpuOptimizationFeatures(); + if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { + GTEST_SKIP() << "AVX512 VNNI not available"; + } + + size_t dim = 128; + std::vector v_zero(dim, 0.0f); + std::vector v_nonzero(dim); + + std::mt19937 rng(456); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (size_t i = 0; i < dim; i++) { + v_nonzero[i] = dist(rng); + } + + auto v_zero_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_zero.data(), dim); + auto v_nonzero_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_nonzero.data(), dim); + + // Get baseline from original implementation + auto orig_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + auto orig_v_zero = populate_float_vec_to_sq8(v_zero.data(), dim); + auto orig_v_nonzero = populate_float_vec_to_sq8(v_nonzero.data(), dim); + float baseline = orig_func(orig_v_zero.data(), orig_v_nonzero.data(), dim); + + auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + float result = ip_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + + ASSERT_NEAR(result, baseline, 0.01f) << "Zero vector IP should match baseline"; +} + +// Test with constant vector (all same values) +TEST(SQ8_SQ8_Precomputed_EdgeCases, ConstantVectorTest) { + auto optimization = getCpuOptimizationFeatures(); + if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { + GTEST_SKIP() << "AVX512 VNNI not available"; + } + + size_t dim = 128; + std::vector v_const(dim, 0.5f); + std::vector v_random(dim); + + std::mt19937 rng(789); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (size_t i = 0; i < dim; i++) { + v_random[i] = dist(rng); + } + + auto v_const_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_const.data(), dim); + auto v_random_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_random.data(), dim); + + // Get baseline from original implementation + auto orig_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + auto orig_v_const = populate_float_vec_to_sq8(v_const.data(), dim); + auto orig_v_random = populate_float_vec_to_sq8(v_random.data(), dim); + float baseline = orig_func(orig_v_const.data(), orig_v_random.data(), dim); + + auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + float result = ip_func(v_const_quantized.data(), v_random_quantized.data(), dim); + + ASSERT_NEAR(result, baseline, 0.01f) << "Constant vector IP should match baseline"; +} + +// Test with extreme values (-1 and 1 only) +TEST(SQ8_SQ8_Precomputed_EdgeCases, ExtremeValuesTest) { + auto optimization = getCpuOptimizationFeatures(); + if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { + GTEST_SKIP() << "AVX512 VNNI not available"; + } + + size_t dim = 128; + std::vector v1(dim), v2(dim); + + // Alternating extreme values + for (size_t i = 0; i < dim; i++) { + v1[i] = (i % 2 == 0) ? 1.0f : -1.0f; + v2[i] = (i % 3 == 0) ? 1.0f : -1.0f; + } + + auto v1_quantized = CreateSQ8QuantizedVectorWithSumNorm(v1.data(), dim); + auto v2_quantized = CreateSQ8QuantizedVectorWithSumNorm(v2.data(), dim); + + // Get baseline from original implementation + auto orig_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + auto orig_v1 = populate_float_vec_to_sq8(v1.data(), dim); + auto orig_v2 = populate_float_vec_to_sq8(v2.data(), dim); + float baseline = orig_func(orig_v1.data(), orig_v2.data(), dim); + + auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + float result = ip_func(v1_quantized.data(), v2_quantized.data(), dim); + + ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values IP should match baseline"; +} + +// Test accuracy across multiple random vector pairs +TEST(SQ8_SQ8_Precomputed_EdgeCases, AccuracyStressTest) { + auto optimization = getCpuOptimizationFeatures(); + if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { + GTEST_SKIP() << "AVX512 VNNI not available"; + } + + size_t dim = 256; + const int num_iterations = 100; + std::mt19937 rng(999); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + auto orig_ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + auto precomputed_ip_func = + spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + + float max_error = 0.0f; + for (int iter = 0; iter < num_iterations; iter++) { + std::vector v1(dim), v2(dim); + for (size_t i = 0; i < dim; i++) { + v1[i] = dist(rng); + v2[i] = dist(rng); + } + + auto orig_v1 = populate_float_vec_to_sq8(v1.data(), dim); + auto orig_v2 = populate_float_vec_to_sq8(v2.data(), dim); + float baseline = orig_ip_func(orig_v1.data(), orig_v2.data(), dim); + + auto precomp_v1 = CreateSQ8QuantizedVectorWithSumNorm(v1.data(), dim); + auto precomp_v2 = CreateSQ8QuantizedVectorWithSumNorm(v2.data(), dim); + float result = precomputed_ip_func(precomp_v1.data(), precomp_v2.data(), dim); + + float error = std::abs(result - baseline); + max_error = std::max(max_error, error); + + ASSERT_NEAR(result, baseline, 0.01f) << "Iteration " << iter << " failed"; + } + + // Log max error for informational purposes + ASSERT_LT(max_error, 0.01f) << "Max error across all iterations: " << max_error; +} +#endif From 36ab068595131bceef104ce7725f790655476ba2 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 29 Dec 2025 16:04:04 +0200 Subject: [PATCH 19/51] Refactor SQ8 test cases to use CreateSQ8QuantizedVector for vector population --- tests/unit/test_spaces.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 1b82b89e6..5b60d74ec 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2930,8 +2930,8 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, ZeroVectorTest) { // Get baseline from original implementation auto orig_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - auto orig_v_zero = populate_float_vec_to_sq8(v_zero.data(), dim); - auto orig_v_nonzero = populate_float_vec_to_sq8(v_nonzero.data(), dim); + auto orig_v_zero = CreateSQ8QuantizedVector(v_zero.data(), dim); + auto orig_v_nonzero = CreateSQ8QuantizedVector(v_nonzero.data(), dim); float baseline = orig_func(orig_v_zero.data(), orig_v_nonzero.data(), dim); auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); @@ -2962,8 +2962,8 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, ConstantVectorTest) { // Get baseline from original implementation auto orig_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - auto orig_v_const = populate_float_vec_to_sq8(v_const.data(), dim); - auto orig_v_random = populate_float_vec_to_sq8(v_random.data(), dim); + auto orig_v_const = CreateSQ8QuantizedVector(v_const.data(), dim); + auto orig_v_random = CreateSQ8QuantizedVector(v_random.data(), dim); float baseline = orig_func(orig_v_const.data(), orig_v_random.data(), dim); auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); @@ -2988,17 +2988,17 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, ExtremeValuesTest) { v2[i] = (i % 3 == 0) ? 1.0f : -1.0f; } - auto v1_quantized = CreateSQ8QuantizedVectorWithSumNorm(v1.data(), dim); - auto v2_quantized = CreateSQ8QuantizedVectorWithSumNorm(v2.data(), dim); + auto v1_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v1.data(), dim); + auto v2_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v2.data(), dim); // Get baseline from original implementation auto orig_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - auto orig_v1 = populate_float_vec_to_sq8(v1.data(), dim); - auto orig_v2 = populate_float_vec_to_sq8(v2.data(), dim); + auto orig_v1 = CreateSQ8QuantizedVector(v1.data(), dim); + auto orig_v2 = CreateSQ8QuantizedVector(v2.data(), dim); float baseline = orig_func(orig_v1.data(), orig_v2.data(), dim); auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); - float result = ip_func(v1_quantized.data(), v2_quantized.data(), dim); + float result = ip_func(v1_precomputed.data(), v2_precomputed.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values IP should match baseline"; } @@ -3027,8 +3027,8 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, AccuracyStressTest) { v2[i] = dist(rng); } - auto orig_v1 = populate_float_vec_to_sq8(v1.data(), dim); - auto orig_v2 = populate_float_vec_to_sq8(v2.data(), dim); + auto orig_v1 = CreateSQ8QuantizedVector(v1.data(), dim); + auto orig_v2 = CreateSQ8QuantizedVector(v2.data(), dim); float baseline = orig_ip_func(orig_v1.data(), orig_v2.data(), dim); auto precomp_v1 = CreateSQ8QuantizedVectorWithSumNorm(v1.data(), dim); From 00617d799d5cb6c82a1ae6267bed23b750b87b85 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 29 Dec 2025 16:21:41 +0200 Subject: [PATCH 20/51] Implement SQ8-to-SQ8 precomputed distance functions using ARM NEON, SVE, and AVX512; add corresponding selection functions and update tests for consistency. --- .../IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h | 130 +++++++++++++++++ .../spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h | 138 ++++++++++++++++++ .../spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h | 136 +++++++++++++++++ src/VecSim/spaces/IP_space.cpp | 78 ++++++++++ src/VecSim/spaces/IP_space.h | 8 + src/VecSim/spaces/functions/NEON.cpp | 14 ++ src/VecSim/spaces/functions/NEON.h | 4 + src/VecSim/spaces/functions/NEON_DOTPROD.cpp | 15 ++ src/VecSim/spaces/functions/NEON_DOTPROD.h | 4 + src/VecSim/spaces/functions/SVE.cpp | 14 ++ src/VecSim/spaces/functions/SVE.h | 4 + tests/unit/test_spaces.cpp | 3 +- tests/utils/tests_utils.h | 2 +- 13 files changed, 547 insertions(+), 3 deletions(-) create mode 100644 src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h create mode 100644 src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h create mode 100644 src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h new file mode 100644 index 000000000..a9f372aac --- /dev/null +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +/** + * SQ8-to-SQ8 distance functions using ARM NEON DOTPROD with precomputed sum and norm. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized. + * + * This version uses precomputed sum and norm stored in the vector data, + * eliminating the need to compute them during distance calculation. + * + * Uses algebraic optimization with DOTPROD instruction: + * IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * + * Since sum = Σv[i] is precomputed, we only need to compute the dot product Σ(v1[i]*v2[i]). + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm + * (float)] + */ + +// Helper function: computes dot product using DOTPROD instruction (no sum computation needed) +__attribute__((always_inline)) static inline void +SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(const uint8_t *&pVec1, const uint8_t *&pVec2, + uint32x4_t &dot_sum) { + // Load 16 uint8 elements + uint8x16_t v1 = vld1q_u8(pVec1); + uint8x16_t v2 = vld1q_u8(pVec2); + + // Compute dot product using DOTPROD instruction: dot_sum += v1 . v2 + dot_sum = vdotq_u32(dot_sum, v1, v2); + + pVec1 += 16; + pVec2 += 16; +} + +// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm +template // 0..63 +float SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, + const void *pVec2v, + size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + + // Get dequantization parameters and precomputed values from the end of pVec1 + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of uint8 elements + + // Get dequantization parameters and precomputed values from the end of pVec2 + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of uint8 elements + + // Calculate number of 64-element chunks + size_t num_of_chunks = (dimension - residual) / 64; + + // Multiple accumulators for ILP (dot product only) + uint32x4_t dot_sum0 = vdupq_n_u32(0); + uint32x4_t dot_sum1 = vdupq_n_u32(0); + uint32x4_t dot_sum2 = vdupq_n_u32(0); + uint32x4_t dot_sum3 = vdupq_n_u32(0); + + // Process 64 elements at a time (4 x 16) in the main loop + for (size_t i = 0; i < num_of_chunks; i++) { + SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0); + SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1); + SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum2); + SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum3); + } + + // Handle remaining complete 16-element blocks within residual + if constexpr (residual >= 16) { + SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0); + } + if constexpr (residual >= 32) { + SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1); + } + if constexpr (residual >= 48) { + SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum2); + } + + // Combine accumulators + uint32x4_t dot_total = vaddq_u32(vaddq_u32(dot_sum0, dot_sum1), vaddq_u32(dot_sum2, dot_sum3)); + + // Horizontal sum for dot product + uint32_t dot_product = vaddvq_u32(dot_total); + + // Handle remaining scalar elements (0-15) + constexpr unsigned char remaining = residual % 16; + if constexpr (remaining > 0) { + for (unsigned char i = 0; i < remaining; i++) { + dot_product += static_cast(pVec1[i]) * static_cast(pVec2[i]); + } + } + + // Apply algebraic formula using precomputed sums: + // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 + return delta1 * delta2 * static_cast(dot_product) + delta1 * min2 * sum1 + + delta2 * min1 * sum2 + static_cast(dimension) * min1 * min2; +} + +// SQ8-to-SQ8 Inner Product distance function with precomputed sum/norm +// Returns 1 - inner_product (distance form) +template // 0..63 +float SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, + dimension); +} + +// SQ8-to-SQ8 Cosine distance function with precomputed sum/norm +// Returns 1 - inner_product (assumes vectors are pre-normalized) +template // 0..63 +float SQ8_SQ8_Precomputed_CosineSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, + dimension); +} diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h new file mode 100644 index 000000000..9fb7c7d17 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +/** + * SQ8-to-SQ8 distance functions using ARM NEON with precomputed sum and norm. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized. + * + * This version uses precomputed sum and norm stored in the vector data, + * eliminating the need to compute them during distance calculation. + * + * Uses algebraic optimization: + * IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * + * Since sum = Σv[i] is precomputed, we only need to compute the dot product Σ(v1[i]*v2[i]). + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm + * (float)] + */ + +// Helper function with dot product only (no sum computation needed) +static inline void SQ8_SQ8_Precomputed_InnerProductStep_NEON(const uint8_t *&pVec1, + const uint8_t *&pVec2, + float32x4_t &dot_sum) { + // Load 4 uint8 elements from pVec1 and convert to float + uint8x8_t v1_u8 = vld1_u8(pVec1); + uint32x4_t v1_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))); + float32x4_t v1_f = vcvtq_f32_u32(v1_u32); + + // Load 4 uint8 elements from pVec2 and convert to float + uint8x8_t v2_u8 = vld1_u8(pVec2); + uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); + float32x4_t v2_f = vcvtq_f32_u32(v2_u32); + + // Accumulate dot product: dot_sum += v1 * v2 (no dequantization) + dot_sum = vmlaq_f32(dot_sum, v1_f, v2_f); + + // Advance pointers + pVec1 += 4; + pVec2 += 4; +} + +// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm +template // 0..15 +float SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v, + size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + + // Get dequantization parameters and precomputed values from the end of pVec1 + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of uint8 elements + + // Get dequantization parameters and precomputed values from the end of pVec2 + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of uint8 elements + + // Calculate number of 16-element chunks + size_t num_of_chunks = (dimension - residual) / 16; + + // Multiple accumulators for ILP (dot product only) + float32x4_t dot_sum0 = vdupq_n_f32(0.0f); + float32x4_t dot_sum1 = vdupq_n_f32(0.0f); + float32x4_t dot_sum2 = vdupq_n_f32(0.0f); + float32x4_t dot_sum3 = vdupq_n_f32(0.0f); + + // Process 16 elements at a time in the main loop + for (size_t i = 0; i < num_of_chunks; i++) { + SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum0); + SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum1); + SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum2); + SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum3); + } + + // Handle remaining complete 4-element blocks within residual + if constexpr (residual >= 4) { + SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum0); + } + if constexpr (residual >= 8) { + SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum1); + } + if constexpr (residual >= 12) { + SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum2); + } + + // Combine dot product accumulators + float32x4_t dot_total = vaddq_f32(vaddq_f32(dot_sum0, dot_sum1), vaddq_f32(dot_sum2, dot_sum3)); + + // Horizontal sum for dot product + float32x2_t dot_halves = vadd_f32(vget_low_f32(dot_total), vget_high_f32(dot_total)); + float32x2_t dot_summed = vpadd_f32(dot_halves, dot_halves); + float dot_product = vget_lane_f32(dot_summed, 0); + + // Handle remaining scalar elements (0-3) + constexpr unsigned char remaining = residual % 4; + if constexpr (remaining > 0) { + for (unsigned char i = 0; i < remaining; i++) { + dot_product += static_cast(pVec1[i]) * static_cast(pVec2[i]); + } + } + + // Apply algebraic formula using precomputed sums: + // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 + return delta1 * delta2 * dot_product + delta1 * min2 * sum1 + delta2 * min1 * sum2 + + static_cast(dimension) * min1 * min2; +} + +// SQ8-to-SQ8 Inner Product distance function with precomputed sum/norm +// Returns 1 - inner_product (distance form) +template // 0..15 +float SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - + SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); +} + +// SQ8-to-SQ8 Cosine distance function with precomputed sum/norm +// Returns 1 - inner_product (assumes vectors are pre-normalized) +template // 0..15 +float SQ8_SQ8_Precomputed_CosineSIMD16_NEON(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - + SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h new file mode 100644 index 000000000..5a78705c6 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +/** + * SQ8-to-SQ8 distance functions using ARM SVE with precomputed sum and norm. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized. + * + * This version uses precomputed sum and norm stored in the vector data, + * eliminating the need to compute them during distance calculation. + * + * Uses algebraic optimization with SVE dot product instruction: + * IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * + * Since sum = Σv[i] is precomputed, we only need to compute the dot product Σ(v1[i]*v2[i]). + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm + * (float)] + */ + +// Helper function to perform inner product step using integer dot product (no sum computation) +static inline void SQ8_SQ8_Precomputed_InnerProductStep_SVE(const uint8_t *pVec1, + const uint8_t *pVec2, size_t &offset, + svuint32_t &dot_sum, + const size_t chunk) { + svbool_t pg = svptrue_b8(); + + // Load uint8 vectors + svuint8_t v1_u8 = svld1_u8(pg, pVec1 + offset); + svuint8_t v2_u8 = svld1_u8(pg, pVec2 + offset); + + // Compute dot product using integer svdot instruction + dot_sum = svdot_u32(dot_sum, v1_u8, v2_u8); + + offset += chunk; +} + +// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm +template +float SQ8_SQ8_Precomputed_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, + size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + size_t offset = 0; + + // Get dequantization parameters and precomputed values from the end of pVec1 + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of uint8 elements + + // Get dequantization parameters and precomputed values from the end of pVec2 + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of uint8 elements + + // Get the vector length for uint8 elements + const size_t vl = svcntb(); + + // Calculate number of complete 4-chunk groups + size_t number_of_chunks = dimension / (vl * 4); + + // Multiple accumulators for ILP (dot product only) + svuint32_t dot_sum0 = svdup_u32(0); + svuint32_t dot_sum1 = svdup_u32(0); + svuint32_t dot_sum2 = svdup_u32(0); + svuint32_t dot_sum3 = svdup_u32(0); + + for (size_t i = 0; i < number_of_chunks; i++) { + SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, vl); + SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, vl); + SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, vl); + SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum3, vl); + } + + // Handle remaining steps (0-3 complete chunks) + if constexpr (additional_steps >= 1) { + SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, vl); + } + if constexpr (additional_steps >= 2) { + SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, vl); + } + if constexpr (additional_steps >= 3) { + SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, vl); + } + + // Handle partial chunk if needed + if constexpr (partial_chunk) { + svbool_t pg = svwhilelt_b8(offset, dimension); + svuint8_t v1_u8 = svld1_u8(pg, pVec1 + offset); + svuint8_t v2_u8 = svld1_u8(pg, pVec2 + offset); + dot_sum3 = svdot_u32(dot_sum3, v1_u8, v2_u8); + } + + // Combine all accumulators + svuint32_t dot_total = svadd_u32_x(svptrue_b32(), dot_sum0, dot_sum1); + dot_total = svadd_u32_x(svptrue_b32(), dot_total, dot_sum2); + dot_total = svadd_u32_x(svptrue_b32(), dot_total, dot_sum3); + + // Horizontal sum to scalar integer + svbool_t pg32 = svptrue_b32(); + uint32_t dot_product = svaddv_u32(pg32, dot_total); + + // Apply algebraic formula with float conversion only at the end: + // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 + return delta1 * delta2 * static_cast(dot_product) + delta1 * min2 * sum1 + + delta2 * min1 * sum2 + static_cast(dimension) * min1 * min2; +} + +// SQ8-to-SQ8 Inner Product distance function with precomputed sum/norm +// Returns 1 - inner_product (distance form) +template +float SQ8_SQ8_Precomputed_InnerProductSIMD_SVE(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - SQ8_SQ8_Precomputed_InnerProductSIMD_SVE_IMP( + pVec1v, pVec2v, dimension); +} + +// SQ8-to-SQ8 Cosine distance function with precomputed sum/norm +// Returns 1 - inner_product (assumes vectors are pre-normalized) +template +float SQ8_SQ8_Precomputed_CosineSIMD_SVE(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_SQ8_Precomputed_InnerProductSIMD_SVE_IMP( + pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 2762a1216..235ce156e 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -240,6 +240,84 @@ dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignme return ret_dist_func; } +// SQ8-to-SQ8 Precomputed Inner Product distance function (with precomputed sum/norm) +dist_func_t IP_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_SQ8_InnerProduct; // Fallback to original + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_SQ8_Precomputed_IP_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON_DOTPROD + if (features.asimddp) { + return Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON_DOTPROD(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON(dim); + } +#endif +#endif // AARCH64 + +#ifdef CPU_FEATURES_ARCH_X86_64 +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { + return Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} + +// SQ8-to-SQ8 Precomputed Cosine distance function (with precomputed sum/norm) +dist_func_t Cosine_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_SQ8_Cosine; // Fallback to original + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_SQ8_Precomputed_Cosine_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON_DOTPROD + if (features.asimddp) { + return Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON_DOTPROD(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON(dim); + } +#endif +#endif // AARCH64 + +#ifdef CPU_FEATURES_ARCH_X86_64 +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { + return Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} + dist_func_t IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; if (alignment == nullptr) { diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index 2f504e6bc..40d5aab72 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -36,4 +36,12 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = const void *arch_opt = nullptr); dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); + +// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) +dist_func_t IP_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, + unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); +dist_func_t Cosine_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, + unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON.cpp b/src/VecSim/spaces/functions/NEON.cpp index 5b5070bc7..f5125a9b8 100644 --- a/src/VecSim/spaces/functions/NEON.cpp +++ b/src/VecSim/spaces/functions/NEON.cpp @@ -18,6 +18,7 @@ #include "VecSim/spaces/L2/L2_NEON_SQ8.h" #include "VecSim/spaces/IP/IP_NEON_SQ8.h" #include "VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h" namespace spaces { @@ -113,6 +114,19 @@ dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) +dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_Precomputed_CosineSIMD16_NEON); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h index 011d41b0d..27efa5593 100644 --- a/src/VecSim/spaces/functions/NEON.h +++ b/src/VecSim/spaces/functions/NEON.h @@ -34,4 +34,8 @@ dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim); +// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) +dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON(size_t dim); +dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp index 56e032e6f..5e8c4d3ba 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp +++ b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp @@ -10,6 +10,7 @@ #include "VecSim/spaces/IP/IP_NEON_DOTPROD_INT8.h" #include "VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h" #include "VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h" +#include "VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h" #include "VecSim/spaces/L2/L2_NEON_DOTPROD_INT8.h" #include "VecSim/spaces/L2/L2_NEON_DOTPROD_UINT8.h" @@ -66,6 +67,20 @@ dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim) return ret_dist_func; } +// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) +dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON_DOTPROD(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, + SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON_DOTPROD(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_Precomputed_CosineSIMD64_NEON_DOTPROD); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.h b/src/VecSim/spaces/functions/NEON_DOTPROD.h index 232de725a..7bc24c726 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.h +++ b/src/VecSim/spaces/functions/NEON_DOTPROD.h @@ -25,4 +25,8 @@ dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim); +// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) +dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON_DOTPROD(size_t dim); +dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON_DOTPROD(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index 98be856a2..3c1013298 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -26,6 +26,7 @@ #include "VecSim/spaces/L2/L2_SVE_SQ8.h" #include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h" namespace spaces { @@ -132,6 +133,19 @@ dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) +dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_Precomputed_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_Precomputed_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index d505d88dc..70dcccb47 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -37,4 +37,8 @@ dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim); +// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) +dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_SVE(size_t dim); + } // namespace spaces diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 5b60d74ec..340bfc03a 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2779,8 +2779,7 @@ TEST_P(SQ8_SQ8_Precomputed_SpacesOptimizationTest, SQ8_SQ8_Precomputed_InnerProd precomputed_func(v1_precomputed.data(), v2_precomputed.data(), dim); // Precomputed should match baseline (within tolerance for float precision) - ASSERT_NEAR(baseline, precomputed_result, 0.01) - << "AVX512 Precomputed IP with dim " << dim; + ASSERT_NEAR(baseline, precomputed_result, 0.01) << "AVX512 Precomputed IP with dim " << dim; } #endif } diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index fd1d41bd6..4f90781a7 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -107,7 +107,7 @@ static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { * where sum = Σv[i] and norm = Σv[i]² (sum of squares of uint8 elements) */ static void quantize_float_vec_to_uint8_with_sum_norm(float *v, size_t dim, uint8_t *qv, - int seed = 1234) { + int seed = 1234) { float min_val = v[0]; float max_val = v[0]; for (size_t i = 1; i < dim; i++) { From 4331d913821a0e8c556bd3f77f9e1adeb9130fd2 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 29 Dec 2025 17:45:45 +0200 Subject: [PATCH 21/51] Implement SQ8-to-SQ8 precomputed inner product and cosine functions; update benchmarks and tests for new functionality --- src/VecSim/spaces/IP/IP.cpp | 50 ++++++++++++ src/VecSim/spaces/IP/IP.h | 6 ++ src/VecSim/spaces/IP_space.cpp | 4 +- .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 76 ++++++++++--------- tests/unit/test_spaces.cpp | 6 ++ 5 files changed, 103 insertions(+), 39 deletions(-) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 02f5e383e..fc1c2f912 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -72,6 +72,31 @@ float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dime return 1.0f - res; } +// SQ8-to-SQ8: Both vectors are uint8 quantized +float SQ8_SQ8_InnerProduct_Precomputed(const void *pVect1v, const void *pVect2v, size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); + const auto *pVect2 = static_cast(pVect2v); + + // Get quantization parameters from pVect1 + const float min_val1 = *reinterpret_cast(pVect1 + dimension); + const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); + const float sum1 = *reinterpret_cast(pVect1 + dimension + 2 * sizeof(float)); + + // Get quantization parameters from pVect2 + const float min_val2 = *reinterpret_cast(pVect2 + dimension); + const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + const float sum2 = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + // Compute inner product with dequantization of both vectors + float product = 0; + for (size_t i = 0; i < dimension; i++) { + product += pVect1[i] * pVect2[i]; + } + float res = min_val1 * sum2 + min_val2 * sum1 - dimension * min_val1 * min_val2 + + delta1 * delta2 * product; + return 1.0f - res; +} + // SQ8-to-SQ8: Both vectors are uint8 quantized (cosine version) float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); @@ -96,6 +121,31 @@ float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) return 1.0f - res; } +// SQ8-to-SQ8: Both vectors are uint8 quantized (cosine version) +float SQ8_SQ8_Cosine_Precomputed(const void *pVect1v, const void *pVect2v, size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); + const auto *pVect2 = static_cast(pVect2v); + + // Get quantization parameters from pVect1 + const float min_val1 = *reinterpret_cast(pVect1 + dimension); + const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); + const float sum1 = *reinterpret_cast(pVect1 + dimension + 2 * sizeof(float)); + + // Get quantization parameters from pVect2 + const float min_val2 = *reinterpret_cast(pVect2 + dimension); + const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + const float sum2 = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + float product = 0; + for (size_t i = 0; i < dimension; i++) { + product += pVect1[i] * pVect2[i]; + } + + float res = min_val1 * sum2 + min_val2 * sum1 - dimension * min_val1 * min_val2 + + delta1 * delta2 * product; + return 1.0f - res; +} + float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) { auto *vec1 = (float *)pVect1; auto *vec2 = (float *)pVect2; diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 748377ec0..40ce8c14d 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -19,9 +19,15 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); // SQ8-to-SQ8: Both vectors are uint8 quantized float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); +// SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum/norm +float SQ8_SQ8_InnerProduct_Precomputed(const void *pVect1v, const void *pVect2v, size_t dimension); + // SQ8-to-SQ8: Both vectors are uint8 quantized and normalized float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); +// SQ8-to-SQ8: Both vectors are uint8 quantized (cosine version) with precomputed sum/norm +float SQ8_SQ8_Cosine_Precomputed(const void *pVect1v, const void *pVect2v, size_t dimension); + float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 235ce156e..de47b7afc 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -248,7 +248,7 @@ dist_func_t IP_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, unsigned char alignment = &dummy_alignment; } - dist_func_t ret_dist_func = SQ8_SQ8_InnerProduct; // Fallback to original + dist_func_t ret_dist_func = SQ8_SQ8_InnerProduct_Precomputed; // Fallback to original [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); #ifdef CPU_FEATURES_ARCH_AARCH64 @@ -287,7 +287,7 @@ dist_func_t Cosine_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, unsigned c alignment = &dummy_alignment; } - dist_func_t ret_dist_func = SQ8_SQ8_Cosine; // Fallback to original + dist_func_t ret_dist_func = SQ8_SQ8_Cosine_Precomputed; // Fallback to original [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); #ifdef CPU_FEATURES_ARCH_AARCH64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index 96aff6df6..34a9c3689 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -66,50 +66,52 @@ class BM_VecSimSpaces_SQ8_SQ8_Precomputed : public benchmark::Fixture { } }; -#ifdef CPU_FEATURES_ARCH_AARCH64 -cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; +// #ifdef CPU_FEATURES_ARCH_AARCH64 +// cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; -// NEON SQ8-to-SQ8 functions -#ifdef OPT_NEON -bool neon_supported = opt.asimd; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); -#endif // NEON -// SVE SQ8-to-SQ8 functions -#ifdef OPT_SVE -bool sve_supported = opt.sve; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); -#endif // SVE -// SVE2 SQ8-to-SQ8 functions -#ifdef OPT_SVE2 -bool sve2_supported = opt.sve2; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); -#endif // SVE2 -#endif // AARCH64 +// // NEON SQ8-to-SQ8 functions +// #ifdef OPT_NEON +// bool neon_supported = opt.asimd; +// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); +// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); +// #endif // NEON +// // SVE SQ8-to-SQ8 functions +// #ifdef OPT_SVE +// bool sve_supported = opt.sve; +// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +// #endif // SVE +// // SVE2 SQ8-to-SQ8 functions +// #ifdef OPT_SVE2 +// bool sve2_supported = opt.sve2; +// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); +// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); +// #endif // SVE2 +// #endif // AARCH64 -#ifdef CPU_FEATURES_ARCH_X86_64 -cpu_features::X86Features opt = cpu_features::GetX86Info().features; +// #ifdef CPU_FEATURES_ARCH_X86_64 +// cpu_features::X86Features opt = cpu_features::GetX86Info().features; -// AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions -#ifdef OPT_AVX512_F_BW_VL_VNNI -bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, - avx512_f_bw_vl_vnni_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, - avx512_f_bw_vl_vnni_supported); +// // AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions +// #ifdef OPT_AVX512_F_BW_VL_VNNI +// bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; +// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, +// avx512_f_bw_vl_vnni_supported); +// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, +// avx512_f_bw_vl_vnni_supported); -// AVX512_F_BW_VL_VNNI SQ8-to-SQ8 Precomputed functions (using precomputed sum and norm) -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8_Precomputed, - AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8_Precomputed, - AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); -#endif // AVX512_F_BW_VL_VNNI -#endif // x86_64 +// // AVX512_F_BW_VL_VNNI SQ8-to-SQ8 Precomputed functions (using precomputed sum and norm) +// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8_Precomputed, +// AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); +// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8_Precomputed, +// AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); +// #endif // AVX512_F_BW_VL_VNNI +// #endif // x86_64 // Naive SQ8-to-SQ8 algorithms INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, InnerProduct, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, Cosine, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8, InnerProduct, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8, Cosine, 16); BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 340bfc03a..92f828d2d 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2622,6 +2622,9 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { dist_func_t arch_opt_func; float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); + // Note: CreateSQ8QuantizedVectorWithSumNorm is defined later in this file + // We test precomputed version in the dedicated SQ8_SQ8_PrecomputedOptFuncs tests + #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; @@ -2810,6 +2813,9 @@ TEST_P(SQ8_SQ8_Precomputed_SpacesOptimizationTest, SQ8_SQ8_Precomputed_CosineTes // Baseline: original SQ8_SQ8 implementation float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); + float baseline_precomputed = SQ8_SQ8_Cosine_Precomputed(v1_precomputed.data(), v2_precomputed.data(), dim); + + ASSERT_NEAR(baseline, baseline_precomputed, 0.01) << "Precomputed should match baseline"; #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { From 2e7b30da76e6fb6835afaa290e5f4f3a18cf659a Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 30 Dec 2025 09:20:23 +0200 Subject: [PATCH 22/51] Refactor SQ8 distance functions and remove precomputed variants - Updated distance function declarations in IP_space.h to clarify that SQ8-to-SQ8 functions use precomputed sum/norm. - Removed precomputed distance function implementations for AVX512F, NEON, and SVE architectures from their respective source files. - Adjusted benchmark tests to remove references to precomputed distance functions and ensure they utilize the updated quantization methods. - Modified utility functions to support the creation of SQ8 quantized vectors with precomputed sum and norm. - Updated unit tests to reflect changes in the quantization process and removed tests specifically for precomputed distance functions. --- src/VecSim/spaces/IP/IP.cpp | 68 +---- src/VecSim/spaces/IP/IP.h | 14 +- .../spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h | 98 +++--- ...P_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h | 163 ---------- .../spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h | 158 ++++------ .../IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h | 130 -------- src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 137 +++------ .../spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h | 138 --------- src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 108 +++---- .../spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h | 136 --------- src/VecSim/spaces/IP_space.cpp | 92 +----- src/VecSim/spaces/IP_space.h | 10 +- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 18 +- .../spaces/functions/AVX512F_BW_VL_VNNI.h | 6 +- src/VecSim/spaces/functions/NEON.cpp | 16 +- src/VecSim/spaces/functions/NEON.h | 6 +- src/VecSim/spaces/functions/NEON_DOTPROD.cpp | 17 +- src/VecSim/spaces/functions/NEON_DOTPROD.h | 6 +- src/VecSim/spaces/functions/SVE.cpp | 16 +- src/VecSim/spaces/functions/SVE.h | 6 +- .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 5 +- tests/unit/test_spaces.cpp | 286 ++++-------------- tests/utils/tests_utils.h | 28 +- 23 files changed, 301 insertions(+), 1361 deletions(-) delete mode 100644 src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h delete mode 100644 src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h delete mode 100644 src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h delete mode 100644 src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index fc1c2f912..b4b0a7cb6 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -49,34 +49,13 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { return 1.0f - res; } -// SQ8-to-SQ8: Both vectors are uint8 quantized +// SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum/norm +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm +// (float)] float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); - // Get quantization parameters from pVect1 - const float min_val1 = *reinterpret_cast(pVect1 + dimension); - const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); - - // Get quantization parameters from pVect2 - const float min_val2 = *reinterpret_cast(pVect2 + dimension); - const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - - // Compute inner product with dequantization of both vectors - float res = 0; - for (size_t i = 0; i < dimension; i++) { - float dequant1 = pVect1[i] * delta1 + min_val1; - float dequant2 = pVect2[i] * delta2 + min_val2; - res += dequant1 * dequant2; - } - return 1.0f - res; -} - -// SQ8-to-SQ8: Both vectors are uint8 quantized -float SQ8_SQ8_InnerProduct_Precomputed(const void *pVect1v, const void *pVect2v, size_t dimension) { - const auto *pVect1 = static_cast(pVect1v); - const auto *pVect2 = static_cast(pVect2v); - // Get quantization parameters from pVect1 const float min_val1 = *reinterpret_cast(pVect1 + dimension); const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); @@ -88,44 +67,24 @@ float SQ8_SQ8_InnerProduct_Precomputed(const void *pVect1v, const void *pVect2v, const float sum2 = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); // Compute inner product with dequantization of both vectors + // With sum = Σv[i] (sum of original float values), the formula is: + // IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2 float product = 0; for (size_t i = 0; i < dimension; i++) { product += pVect1[i] * pVect2[i]; } - float res = min_val1 * sum2 + min_val2 * sum1 - dimension * min_val1 * min_val2 + - delta1 * delta2 * product; + float res = min_val1 * sum2 + min_val2 * sum1 - + static_cast(dimension) * min_val1 * min_val2 + delta1 * delta2 * product; return 1.0f - res; } -// SQ8-to-SQ8: Both vectors are uint8 quantized (cosine version) +// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum/norm +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm +// (float)] float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); - // Get quantization parameters from pVect1 - const float min_val1 = *reinterpret_cast(pVect1 + dimension); - const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); - - // Get quantization parameters from pVect2 - const float min_val2 = *reinterpret_cast(pVect2 + dimension); - const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - - // Compute inner product with dequantization of both vectors - float res = 0; - for (size_t i = 0; i < dimension; i++) { - float dequant1 = pVect1[i] * delta1 + min_val1; - float dequant2 = pVect2[i] * delta2 + min_val2; - res += dequant1 * dequant2; - } - // Assume both vectors are normalized. - return 1.0f - res; -} - -// SQ8-to-SQ8: Both vectors are uint8 quantized (cosine version) -float SQ8_SQ8_Cosine_Precomputed(const void *pVect1v, const void *pVect2v, size_t dimension) { - const auto *pVect1 = static_cast(pVect1v); - const auto *pVect2 = static_cast(pVect2v); - // Get quantization parameters from pVect1 const float min_val1 = *reinterpret_cast(pVect1 + dimension); const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); @@ -136,13 +95,16 @@ float SQ8_SQ8_Cosine_Precomputed(const void *pVect1v, const void *pVect2v, size_ const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); const float sum2 = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + // Compute inner product with dequantization of both vectors + // With sum = Σv[i] (sum of original float values), the formula is: + // IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2 float product = 0; for (size_t i = 0; i < dimension; i++) { product += pVect1[i] * pVect2[i]; } - float res = min_val1 * sum2 + min_val2 * sum1 - dimension * min_val1 * min_val2 + - delta1 * delta2 * product; + float res = min_val1 * sum2 + min_val2 * sum1 - + static_cast(dimension) * min_val1 * min_val2 + delta1 * delta2 * product; return 1.0f - res; } diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 40ce8c14d..3e706a5dc 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -16,18 +16,16 @@ float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensio // pVect1v vector of type fp32 and pVect2v vector of type uint8 float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); -// SQ8-to-SQ8: Both vectors are uint8 quantized -float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); - // SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum/norm -float SQ8_SQ8_InnerProduct_Precomputed(const void *pVect1v, const void *pVect2v, size_t dimension); +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm +// (float)] +float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); -// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized +// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum/norm +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm +// (float)] float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); -// SQ8-to-SQ8: Both vectors are uint8 quantized (cosine version) with precomputed sum/norm -float SQ8_SQ8_Cosine_Precomputed(const void *pVect1v, const void *pVect2v, size_t dimension); - float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h index e79b179e4..607ca79c2 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h @@ -11,26 +11,27 @@ #include /** - * SQ8-to-SQ8 distance functions using AVX512 VNNI. + * SQ8-to-SQ8 distance functions using AVX512 VNNI with precomputed sum and norm. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, * where BOTH vectors are uint8 quantized. * + * Uses precomputed sum and norm stored in the vector data, + * eliminating the need to compute them during distance calculation. + * * Uses algebraic optimization to leverage integer VNNI instructions: * - * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) - * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 - * TODO: Can store the vector's sum and norm of elements in the vector data, and use it here. + * With sum = Σv[i] (sum of original float values), the formula is: + * IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) * - * This allows using VNNI's _mm512_dpwssd_epi32 for efficient integer dot product, - * then applying scalar corrections at the end. + * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm + * (float)] */ -// Process 64 uint8 elements using VNNI with multiple accumulators for ILP +// Process 64 uint8 elements using VNNI with multiple accumulators for ILP (dot product only) static inline void SQ8_SQ8_InnerProductStep64(const uint8_t *pVec1, const uint8_t *pVec2, - __m512i &dot_acc0, __m512i &dot_acc1, - __m512i &sum1_acc, __m512i &sum2_acc) { + __m512i &dot_acc0, __m512i &dot_acc1) { // Load 64 bytes from each vector __m512i v1_full = _mm512_loadu_si512(reinterpret_cast(pVec1)); __m512i v2_full = _mm512_loadu_si512(reinterpret_cast(pVec2)); @@ -47,17 +48,11 @@ static inline void SQ8_SQ8_InnerProductStep64(const uint8_t *pVec1, const uint8_ _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), _mm512_cvtepu8_epi16(v2_lo)); dot_acc1 = _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), _mm512_cvtepu8_epi16(v2_hi)); - - // Sum of elements using SAD with zero (sums bytes in groups of 8 -> 8x 64-bit results) - __m512i zero = _mm512_setzero_si512(); - sum1_acc = _mm512_add_epi64(sum1_acc, _mm512_sad_epu8(v1_full, zero)); - sum2_acc = _mm512_add_epi64(sum2_acc, _mm512_sad_epu8(v2_full, zero)); } -// Process 32 uint8 elements using VNNI +// Process 32 uint8 elements using VNNI (dot product only) static inline void SQ8_SQ8_InnerProductStep32(const uint8_t *pVec1, const uint8_t *pVec2, - __m512i &dot_acc, __m512i &sum1_acc, - __m512i &sum2_acc) { + __m512i &dot_acc) { // Load 32 bytes from each vector __m256i v1_256 = _mm256_loadu_si256(reinterpret_cast(pVec1)); __m256i v2_256 = _mm256_loadu_si256(reinterpret_cast(pVec2)); @@ -65,36 +60,33 @@ static inline void SQ8_SQ8_InnerProductStep32(const uint8_t *pVec1, const uint8_ // Convert to int16 (zero-extend) and compute dot product using VNNI dot_acc = _mm512_dpwssd_epi32(dot_acc, _mm512_cvtepu8_epi16(v1_256), _mm512_cvtepu8_epi16(v2_256)); - - // Sum of elements - extend to 512-bit and use SAD - // Use zextsi256_si512 to properly zero the upper half - __m512i v1_full = _mm512_zextsi256_si512(v1_256); - __m512i v2_full = _mm512_zextsi256_si512(v2_256); - __m512i zero = _mm512_setzero_si512(); - sum1_acc = _mm512_add_epi64(sum1_acc, _mm512_sad_epu8(v1_full, zero)); - sum2_acc = _mm512_add_epi64(sum2_acc, _mm512_sad_epu8(v2_full, zero)); } -// Common implementation for inner product between two SQ8 vectors +// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm template // 0..63 float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { const uint8_t *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); const uint8_t *pEnd1 = pVec1 + dimension; - // Get dequantization parameters from the end of pVec1 - const float min1 = *reinterpret_cast(pVec1 + dimension); - const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); - - // Get dequantization parameters from the end of pVec2 - const float min2 = *reinterpret_cast(pVec2 + dimension); - const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); - - // Multiple accumulators for instruction-level parallelism + // Get dequantization parameters and precomputed values from the end of pVec1 + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of original float elements + // const float norm1 = params1[3]; // Precomputed norm (sum of squares) - not used for IP + + // Get dequantization parameters and precomputed values from the end of pVec2 + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of original float elements + // const float norm2 = params2[3]; // Precomputed norm (sum of squares) - not used for IP + + // Multiple accumulators for instruction-level parallelism (dot product only) __m512i dot_acc0 = _mm512_setzero_si512(); __m512i dot_acc1 = _mm512_setzero_si512(); - __m512i sum1_acc = _mm512_setzero_si512(); // Sum of v1 elements - __m512i sum2_acc = _mm512_setzero_si512(); // Sum of v2 elements // Handle residual first (0..63 elements) if constexpr (residual > 0) { @@ -107,16 +99,9 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim // Convert to int16 and compute dot product dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_256), _mm512_cvtepu8_epi16(v2_256)); - - // Sum using SAD (masked load already zeroed unused bytes) - __m512i v1_full = _mm512_zextsi256_si512(v1_256); - __m512i v2_full = _mm512_zextsi256_si512(v2_256); - __m512i zero = _mm512_setzero_si512(); - sum1_acc = _mm512_sad_epu8(v1_full, zero); - sum2_acc = _mm512_sad_epu8(v2_full, zero); } else if constexpr (residual == 32) { // Exactly 32 elements - SQ8_SQ8_InnerProductStep32(pVec1, pVec2, dot_acc0, sum1_acc, sum2_acc); + SQ8_SQ8_InnerProductStep32(pVec1, pVec2, dot_acc0); } else { // 33-63 elements: use masked 64-byte load constexpr __mmask64 mask = (1LLU << residual) - 1; @@ -133,11 +118,6 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim _mm512_cvtepu8_epi16(v2_lo)); dot_acc1 = _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), _mm512_cvtepu8_epi16(v2_hi)); - - // Sum using SAD (masked load already zeroed unused bytes) - __m512i zero = _mm512_setzero_si512(); - sum1_acc = _mm512_sad_epu8(v1_full, zero); - sum2_acc = _mm512_sad_epu8(v2_full, zero); } pVec1 += residual; pVec2 += residual; @@ -145,7 +125,7 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim // Process full 64-byte chunks while (pVec1 < pEnd1) { - SQ8_SQ8_InnerProductStep64(pVec1, pVec2, dot_acc0, dot_acc1, sum1_acc, sum2_acc); + SQ8_SQ8_InnerProductStep64(pVec1, pVec2, dot_acc0, dot_acc1); pVec1 += 64; pVec2 += 64; } @@ -154,16 +134,10 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim __m512i dot_total = _mm512_add_epi32(dot_acc0, dot_acc1); int64_t dot_product = _mm512_reduce_add_epi32(dot_total); - // Reduce sum accumulators (SAD produces 8 x 64-bit sums) - int64_t sum_v1 = _mm512_reduce_add_epi64(sum1_acc); - int64_t sum_v2 = _mm512_reduce_add_epi64(sum2_acc); - - // Apply the algebraic formula: - // IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 - float result = delta1 * delta2 * static_cast(dot_product) + - delta1 * min2 * static_cast(sum_v1) + - delta2 * min1 * static_cast(sum_v2) + - static_cast(dimension) * min1 * min2; + // Apply the algebraic formula using precomputed sums: + // IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) + float result = min1 * sum2 + min2 * sum1 - static_cast(dimension) * min1 * min2 + + delta1 * delta2 * static_cast(dot_product); return result; } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h deleted file mode 100644 index 7533bd405..000000000 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include - -/** - * SQ8-to-SQ8 distance functions using AVX512 VNNI with precomputed sum and norm. - * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, - * where BOTH vectors are uint8 quantized. - * - * This version uses precomputed sum and norm stored in the vector data, - * eliminating the need to compute them during distance calculation. - * - * Uses algebraic optimization to leverage integer VNNI instructions: - * - * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) - * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 - * - * Since sum = Σv[i] is precomputed, we only need to compute the dot product Σ(v1[i]*v2[i]). - * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm - * (float)] - */ - -// Process 64 uint8 elements using VNNI with multiple accumulators for ILP (dot product only) -static inline void SQ8_SQ8_Precomputed_InnerProductStep64(const uint8_t *pVec1, - const uint8_t *pVec2, __m512i &dot_acc0, - __m512i &dot_acc1) { - // Load 64 bytes from each vector - __m512i v1_full = _mm512_loadu_si512(reinterpret_cast(pVec1)); - __m512i v2_full = _mm512_loadu_si512(reinterpret_cast(pVec2)); - - // Extract lower and upper 256-bit halves - __m256i v1_lo = _mm512_castsi512_si256(v1_full); - __m256i v1_hi = _mm512_extracti64x4_epi64(v1_full, 1); - __m256i v2_lo = _mm512_castsi512_si256(v2_full); - __m256i v2_hi = _mm512_extracti64x4_epi64(v2_full, 1); - - // Convert to int16 (zero-extend) and compute dot products using VNNI - // dpwssd: multiply pairs of int16, sum pairs to int32, accumulate - dot_acc0 = - _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), _mm512_cvtepu8_epi16(v2_lo)); - dot_acc1 = - _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), _mm512_cvtepu8_epi16(v2_hi)); -} - -// Process 32 uint8 elements using VNNI (dot product only) -static inline void SQ8_SQ8_Precomputed_InnerProductStep32(const uint8_t *pVec1, - const uint8_t *pVec2, __m512i &dot_acc) { - // Load 32 bytes from each vector - __m256i v1_256 = _mm256_loadu_si256(reinterpret_cast(pVec1)); - __m256i v2_256 = _mm256_loadu_si256(reinterpret_cast(pVec2)); - - // Convert to int16 (zero-extend) and compute dot product using VNNI - dot_acc = - _mm512_dpwssd_epi32(dot_acc, _mm512_cvtepu8_epi16(v1_256), _mm512_cvtepu8_epi16(v2_256)); -} - -// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm -template // 0..63 -float SQ8_SQ8_Precomputed_InnerProductImp(const void *pVec1v, const void *pVec2v, - size_t dimension) { - const uint8_t *pVec1 = static_cast(pVec1v); - const uint8_t *pVec2 = static_cast(pVec2v); - const uint8_t *pEnd1 = pVec1 + dimension; - - // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] - const float *params1 = reinterpret_cast(pVec1 + dimension); - const float min1 = params1[0]; - const float delta1 = params1[1]; - const float sum1 = params1[2]; // Precomputed sum of uint8 elements - // const float norm1 = params1[3]; // Precomputed norm (sum of squares) - not used for IP - - // Get dequantization parameters and precomputed values from the end of pVec2 - const float *params2 = reinterpret_cast(pVec2 + dimension); - const float min2 = params2[0]; - const float delta2 = params2[1]; - const float sum2 = params2[2]; // Precomputed sum of uint8 elements - // const float norm2 = params2[3]; // Precomputed norm (sum of squares) - not used for IP - - // Multiple accumulators for instruction-level parallelism (dot product only) - __m512i dot_acc0 = _mm512_setzero_si512(); - __m512i dot_acc1 = _mm512_setzero_si512(); - - // Handle residual first (0..63 elements) - if constexpr (residual > 0) { - if constexpr (residual < 32) { - // Handle less than 32 elements with mask - constexpr __mmask32 mask = (1LU << residual) - 1; - __m256i v1_256 = _mm256_maskz_loadu_epi8(mask, pVec1); - __m256i v2_256 = _mm256_maskz_loadu_epi8(mask, pVec2); - - // Convert to int16 and compute dot product - dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_256), - _mm512_cvtepu8_epi16(v2_256)); - } else if constexpr (residual == 32) { - // Exactly 32 elements - SQ8_SQ8_Precomputed_InnerProductStep32(pVec1, pVec2, dot_acc0); - } else { - // 33-63 elements: use masked 64-byte load - constexpr __mmask64 mask = (1LLU << residual) - 1; - __m512i v1_full = _mm512_maskz_loadu_epi8(mask, pVec1); - __m512i v2_full = _mm512_maskz_loadu_epi8(mask, pVec2); - - // Extract halves and compute dot products - __m256i v1_lo = _mm512_castsi512_si256(v1_full); - __m256i v1_hi = _mm512_extracti64x4_epi64(v1_full, 1); - __m256i v2_lo = _mm512_castsi512_si256(v2_full); - __m256i v2_hi = _mm512_extracti64x4_epi64(v2_full, 1); - - dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), - _mm512_cvtepu8_epi16(v2_lo)); - dot_acc1 = _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), - _mm512_cvtepu8_epi16(v2_hi)); - } - pVec1 += residual; - pVec2 += residual; - } - - // Process full 64-byte chunks - while (pVec1 < pEnd1) { - SQ8_SQ8_Precomputed_InnerProductStep64(pVec1, pVec2, dot_acc0, dot_acc1); - pVec1 += 64; - pVec2 += 64; - } - - // Combine dot product accumulators and reduce - __m512i dot_total = _mm512_add_epi32(dot_acc0, dot_acc1); - int64_t dot_product = _mm512_reduce_add_epi32(dot_total); - - // Apply the algebraic formula using precomputed sums: - // IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 - float result = delta1 * delta2 * static_cast(dot_product) + delta1 * min2 * sum1 + - delta2 * min1 * sum2 + static_cast(dimension) * min1 * min2; - - return result; -} - -// SQ8-to-SQ8 Inner Product distance function with precomputed sum/norm -// Returns 1 - inner_product (distance form) -template // 0..63 -float SQ8_SQ8_Precomputed_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, - const void *pVec2v, - size_t dimension) { - return 1.0f - SQ8_SQ8_Precomputed_InnerProductImp(pVec1v, pVec2v, dimension); -} - -// SQ8-to-SQ8 Cosine distance function with precomputed sum/norm -// Returns 1 - (inner_product) -template // 0..63 -float SQ8_SQ8_Precomputed_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, - size_t dimension) { - // Assume vectors are normalized. - return 1.0f - SQ8_SQ8_Precomputed_InnerProductImp(pVec1v, pVec2v, dimension); -} diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h index bd339e94f..de1ba5f53 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h @@ -11,28 +11,28 @@ #include /** - * SQ8-to-SQ8 distance functions for NEON with DOTPROD extension. + * SQ8-to-SQ8 distance functions using ARM NEON DOTPROD with precomputed sum and norm. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, * where BOTH vectors are uint8 quantized. * - * Uses algebraic optimization with INTEGER arithmetic throughout: + * Uses precomputed sum and norm stored in the vector data, + * eliminating the need to compute them during distance calculation. * - * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) - * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * Uses algebraic optimization with DOTPROD instruction: * - * All sums are computed using integer arithmetic, converted to float only at the end. + * With sum = Σv[i] (sum of original float values), the formula is: + * IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] + * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm + * (float)] */ -// Helper function: computes dot product and element sums using integer arithmetic +// Helper function: computes dot product using DOTPROD instruction (no sum computation needed) __attribute__((always_inline)) static inline void SQ8_SQ8_InnerProductStep_NEON_DOTPROD(const uint8_t *&pVec1, const uint8_t *&pVec2, - uint32x4_t &dot_sum, uint32x4_t &sum1, uint32x4_t &sum2) { - // Ones vector for computing element sums via dot product (function-local to avoid - // multiple definitions when header is included in multiple translation units) - static const uint8x16_t ones = vdupq_n_u8(1); - + uint32x4_t &dot_sum) { // Load 16 uint8 elements uint8x16_t v1 = vld1q_u8(pVec1); uint8x16_t v2 = vld1q_u8(pVec2); @@ -40,116 +40,76 @@ SQ8_SQ8_InnerProductStep_NEON_DOTPROD(const uint8_t *&pVec1, const uint8_t *&pVe // Compute dot product using DOTPROD instruction: dot_sum += v1 . v2 dot_sum = vdotq_u32(dot_sum, v1, v2); - // Compute element sums using dot product with ones vector - // sum1 += Σv1[i], sum2 += Σv2[i] - sum1 = vdotq_u32(sum1, v1, ones); - sum2 = vdotq_u32(sum2, v2, ones); - pVec1 += 16; pVec2 += 16; } -// Common implementation for inner product between two SQ8 vectors +// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm template // 0..63 float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { const uint8_t *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); - // Get dequantization parameters from the end of pVec1 - const float min1 = *reinterpret_cast(pVec1 + dimension); - const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); + // Get dequantization parameters and precomputed values from the end of pVec1 + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of original float elements - // Get dequantization parameters from the end of pVec2 - const float min2 = *reinterpret_cast(pVec2 + dimension); - const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + // Get dequantization parameters and precomputed values from the end of pVec2 + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of original float elements - // Integer accumulators for dot product and element sums + // Calculate number of 64-element chunks + size_t num_of_chunks = (dimension - residual) / 64; + + // Multiple accumulators for ILP (dot product only) uint32x4_t dot_sum0 = vdupq_n_u32(0); uint32x4_t dot_sum1 = vdupq_n_u32(0); - uint32x4_t sum1_0 = vdupq_n_u32(0); - uint32x4_t sum1_1 = vdupq_n_u32(0); - uint32x4_t sum2_0 = vdupq_n_u32(0); - uint32x4_t sum2_1 = vdupq_n_u32(0); - - // Handle residual elements first (0-15 elements) - constexpr size_t final_residual = residual % 16; - if constexpr (final_residual > 0) { - // Ones vector for computing element sums via dot product - static const uint8x16_t ones = vdupq_n_u8(1); - constexpr uint8x16_t mask = { - 0xFF, - (final_residual >= 2) ? 0xFF : 0, - (final_residual >= 3) ? 0xFF : 0, - (final_residual >= 4) ? 0xFF : 0, - (final_residual >= 5) ? 0xFF : 0, - (final_residual >= 6) ? 0xFF : 0, - (final_residual >= 7) ? 0xFF : 0, - (final_residual >= 8) ? 0xFF : 0, - (final_residual >= 9) ? 0xFF : 0, - (final_residual >= 10) ? 0xFF : 0, - (final_residual >= 11) ? 0xFF : 0, - (final_residual >= 12) ? 0xFF : 0, - (final_residual >= 13) ? 0xFF : 0, - (final_residual >= 14) ? 0xFF : 0, - (final_residual >= 15) ? 0xFF : 0, - 0, - }; - - uint8x16_t v1 = vld1q_u8(pVec1); - uint8x16_t v2 = vld1q_u8(pVec2); - uint8x16_t zeros = vdupq_n_u8(0); - - // Zero out irrelevant elements - v1 = vbslq_u8(mask, v1, zeros); - v2 = vbslq_u8(mask, v2, zeros); - - // Accumulate using integer arithmetic - dot_sum1 = vdotq_u32(dot_sum1, v1, v2); - sum1_1 = vdotq_u32(sum1_1, v1, ones); - sum2_1 = vdotq_u32(sum2_1, v2, ones); - - pVec1 += final_residual; - pVec2 += final_residual; - } - - // Process 64 elements at a time in the main loop - const size_t num_of_chunks = dimension / 64; + uint32x4_t dot_sum2 = vdupq_n_u32(0); + uint32x4_t dot_sum3 = vdupq_n_u32(0); + // Process 64 elements at a time (4 x 16) in the main loop for (size_t i = 0; i < num_of_chunks; i++) { - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0); + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1); + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum2); + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum3); } - // Handle remaining 16-element chunks (0-3 chunks within residual) - constexpr size_t residual_chunks = residual / 16; - if constexpr (residual_chunks >= 1) { - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); + // Handle remaining complete 16-element blocks within residual + if constexpr (residual >= 16) { + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0); } - if constexpr (residual_chunks >= 2) { - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); + if constexpr (residual >= 32) { + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1); } - if constexpr (residual_chunks >= 3) { - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); + if constexpr (residual >= 48) { + SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum2); } // Combine accumulators - uint32x4_t dot_total = vaddq_u32(dot_sum0, dot_sum1); - uint32x4_t sum1_total = vaddq_u32(sum1_0, sum1_1); - uint32x4_t sum2_total = vaddq_u32(sum2_0, sum2_1); + uint32x4_t dot_total = vaddq_u32(vaddq_u32(dot_sum0, dot_sum1), vaddq_u32(dot_sum2, dot_sum3)); - // Horizontal sum to scalar (integer) + // Horizontal sum for dot product uint32_t dot_product = vaddvq_u32(dot_total); - uint32_t v1_sum = vaddvq_u32(sum1_total); - uint32_t v2_sum = vaddvq_u32(sum2_total); - - // Apply algebraic formula with float conversion only at the end: - // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 - return delta1 * delta2 * static_cast(dot_product) + - delta1 * min2 * static_cast(v1_sum) + delta2 * min1 * static_cast(v2_sum) + - static_cast(dimension) * min1 * min2; + + // Handle remaining scalar elements (0-15) + constexpr unsigned char remaining = residual % 16; + if constexpr (remaining > 0) { + for (unsigned char i = 0; i < remaining; i++) { + dot_product += static_cast(pVec1[i]) * static_cast(pVec2[i]); + } + } + + // Apply algebraic formula using precomputed sums: + // IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1*q2) + return min1 * sum2 + min2 * sum1 - static_cast(dimension) * min1 * min2 + + delta1 * delta2 * static_cast(dot_product); } // SQ8-to-SQ8 Inner Product distance function @@ -161,10 +121,8 @@ float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pV } // SQ8-to-SQ8 Cosine distance function - -// Returns 1 - inner_product +// Returns 1 - inner_product (assumes vectors are pre-normalized) template // 0..63 float SQ8_SQ8_CosineSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, size_t dimension) { - // Assumes both vectors are normalized. return 1.0f - SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h deleted file mode 100644 index a9f372aac..000000000 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include - -/** - * SQ8-to-SQ8 distance functions using ARM NEON DOTPROD with precomputed sum and norm. - * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, - * where BOTH vectors are uint8 quantized. - * - * This version uses precomputed sum and norm stored in the vector data, - * eliminating the need to compute them during distance calculation. - * - * Uses algebraic optimization with DOTPROD instruction: - * IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 - * - * Since sum = Σv[i] is precomputed, we only need to compute the dot product Σ(v1[i]*v2[i]). - * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm - * (float)] - */ - -// Helper function: computes dot product using DOTPROD instruction (no sum computation needed) -__attribute__((always_inline)) static inline void -SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(const uint8_t *&pVec1, const uint8_t *&pVec2, - uint32x4_t &dot_sum) { - // Load 16 uint8 elements - uint8x16_t v1 = vld1q_u8(pVec1); - uint8x16_t v2 = vld1q_u8(pVec2); - - // Compute dot product using DOTPROD instruction: dot_sum += v1 . v2 - dot_sum = vdotq_u32(dot_sum, v1, v2); - - pVec1 += 16; - pVec2 += 16; -} - -// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm -template // 0..63 -float SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, - const void *pVec2v, - size_t dimension) { - const uint8_t *pVec1 = static_cast(pVec1v); - const uint8_t *pVec2 = static_cast(pVec2v); - - // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] - const float *params1 = reinterpret_cast(pVec1 + dimension); - const float min1 = params1[0]; - const float delta1 = params1[1]; - const float sum1 = params1[2]; // Precomputed sum of uint8 elements - - // Get dequantization parameters and precomputed values from the end of pVec2 - const float *params2 = reinterpret_cast(pVec2 + dimension); - const float min2 = params2[0]; - const float delta2 = params2[1]; - const float sum2 = params2[2]; // Precomputed sum of uint8 elements - - // Calculate number of 64-element chunks - size_t num_of_chunks = (dimension - residual) / 64; - - // Multiple accumulators for ILP (dot product only) - uint32x4_t dot_sum0 = vdupq_n_u32(0); - uint32x4_t dot_sum1 = vdupq_n_u32(0); - uint32x4_t dot_sum2 = vdupq_n_u32(0); - uint32x4_t dot_sum3 = vdupq_n_u32(0); - - // Process 64 elements at a time (4 x 16) in the main loop - for (size_t i = 0; i < num_of_chunks; i++) { - SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0); - SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1); - SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum2); - SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum3); - } - - // Handle remaining complete 16-element blocks within residual - if constexpr (residual >= 16) { - SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0); - } - if constexpr (residual >= 32) { - SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1); - } - if constexpr (residual >= 48) { - SQ8_SQ8_Precomputed_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum2); - } - - // Combine accumulators - uint32x4_t dot_total = vaddq_u32(vaddq_u32(dot_sum0, dot_sum1), vaddq_u32(dot_sum2, dot_sum3)); - - // Horizontal sum for dot product - uint32_t dot_product = vaddvq_u32(dot_total); - - // Handle remaining scalar elements (0-15) - constexpr unsigned char remaining = residual % 16; - if constexpr (remaining > 0) { - for (unsigned char i = 0; i < remaining; i++) { - dot_product += static_cast(pVec1[i]) * static_cast(pVec2[i]); - } - } - - // Apply algebraic formula using precomputed sums: - // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 - return delta1 * delta2 * static_cast(dot_product) + delta1 * min2 * sum1 + - delta2 * min1 * sum2 + static_cast(dimension) * min1 * min2; -} - -// SQ8-to-SQ8 Inner Product distance function with precomputed sum/norm -// Returns 1 - inner_product (distance form) -template // 0..63 -float SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, - size_t dimension) { - return 1.0f - SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, - dimension); -} - -// SQ8-to-SQ8 Cosine distance function with precomputed sum/norm -// Returns 1 - inner_product (assumes vectors are pre-normalized) -template // 0..63 -float SQ8_SQ8_Precomputed_CosineSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, - size_t dimension) { - return 1.0f - SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, - dimension); -} diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index 778c201e1..af1bef2af 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -11,24 +11,27 @@ #include /** - * SQ8-to-SQ8 distance functions for NEON. + * SQ8-to-SQ8 distance functions using ARM NEON with precomputed sum and norm. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, * where BOTH vectors are uint8 quantized. * - * Uses algebraic optimization to reduce operations per element: + * Uses precomputed sum and norm stored in the vector data, + * eliminating the need to compute them during distance calculation. * - * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) - * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * Uses algebraic optimization: * - * This saves 2 FMAs per 4-element step by deferring dequantization to scalar math at the end. + * With sum = Σv[i] (sum of original float values), the formula is: + * IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] + * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm + * (float)] */ -// Helper function with algebraic optimization +// Helper function with dot product only (no sum computation needed) static inline void SQ8_SQ8_InnerProductStep_NEON(const uint8_t *&pVec1, const uint8_t *&pVec2, - float32x4_t &dot_sum, float32x4_t &sum1, - float32x4_t &sum2) { + float32x4_t &dot_sum) { // Load 4 uint8 elements from pVec1 and convert to float uint8x8_t v1_u8 = vld1_u8(pVec1); uint32x4_t v1_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))); @@ -42,122 +45,79 @@ static inline void SQ8_SQ8_InnerProductStep_NEON(const uint8_t *&pVec1, const ui // Accumulate dot product: dot_sum += v1 * v2 (no dequantization) dot_sum = vmlaq_f32(dot_sum, v1_f, v2_f); - // Accumulate element sums - sum1 = vaddq_f32(sum1, v1_f); - sum2 = vaddq_f32(sum2, v2_f); - // Advance pointers pVec1 += 4; pVec2 += 4; } -// Common implementation for inner product between two SQ8 vectors +// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm template // 0..15 float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { const uint8_t *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); - // Get dequantization parameters from the end of pVec1 - const float min1 = *reinterpret_cast(pVec1 + dimension); - const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); + // Get dequantization parameters and precomputed values from the end of pVec1 + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of original float elements - // Get dequantization parameters from the end of pVec2 - const float min2 = *reinterpret_cast(pVec2 + dimension); - const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + // Get dequantization parameters and precomputed values from the end of pVec2 + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of original float elements - // Multiple accumulators for instruction-level parallelism - // dot_sum: accumulates v1[i] * v2[i] - // sum1: accumulates v1[i] - // sum2: accumulates v2[i] + // Calculate number of 16-element chunks + size_t num_of_chunks = (dimension - residual) / 16; + + // Multiple accumulators for ILP (dot product only) float32x4_t dot_sum0 = vdupq_n_f32(0.0f); float32x4_t dot_sum1 = vdupq_n_f32(0.0f); float32x4_t dot_sum2 = vdupq_n_f32(0.0f); float32x4_t dot_sum3 = vdupq_n_f32(0.0f); - float32x4_t sum1_0 = vdupq_n_f32(0.0f); - float32x4_t sum1_1 = vdupq_n_f32(0.0f); - float32x4_t sum1_2 = vdupq_n_f32(0.0f); - float32x4_t sum1_3 = vdupq_n_f32(0.0f); - float32x4_t sum2_0 = vdupq_n_f32(0.0f); - float32x4_t sum2_1 = vdupq_n_f32(0.0f); - float32x4_t sum2_2 = vdupq_n_f32(0.0f); - float32x4_t sum2_3 = vdupq_n_f32(0.0f); - - const size_t num_of_chunks = dimension / 16; // Process 16 elements at a time in the main loop for (size_t i = 0; i < num_of_chunks; i++) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum2, sum1_2, sum2_2); - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum3, sum1_3, sum2_3); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum0); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum1); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum3); } // Handle remaining complete 4-element blocks within residual if constexpr (residual >= 4) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum0, sum1_0, sum2_0); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum0); } if constexpr (residual >= 8) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum1, sum1_1, sum2_1); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum1); } if constexpr (residual >= 12) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum2, sum1_2, sum2_2); + SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum2); } - // Handle final residual elements (0-3 elements) with scalar math - constexpr size_t final_residual = residual % 4; - if constexpr (final_residual > 0) { - float32x4_t v1_f = vdupq_n_f32(0.0f); - float32x4_t v2_f = vdupq_n_f32(0.0f); - - if constexpr (final_residual >= 1) { - float val1_0 = static_cast(pVec1[0]); - float val2_0 = static_cast(pVec2[0]); - v1_f = vld1q_lane_f32(&val1_0, v1_f, 0); - v2_f = vld1q_lane_f32(&val2_0, v2_f, 0); - } - if constexpr (final_residual >= 2) { - float val1_1 = static_cast(pVec1[1]); - float val2_1 = static_cast(pVec2[1]); - v1_f = vld1q_lane_f32(&val1_1, v1_f, 1); - v2_f = vld1q_lane_f32(&val2_1, v2_f, 1); - } - if constexpr (final_residual >= 3) { - float val1_2 = static_cast(pVec1[2]); - float val2_2 = static_cast(pVec2[2]); - v1_f = vld1q_lane_f32(&val1_2, v1_f, 2); - v2_f = vld1q_lane_f32(&val2_2, v2_f, 2); - } - - dot_sum3 = vmlaq_f32(dot_sum3, v1_f, v2_f); - sum1_3 = vaddq_f32(sum1_3, v1_f); - sum2_3 = vaddq_f32(sum2_3, v2_f); - } - - // Combine accumulators + // Combine dot product accumulators float32x4_t dot_total = vaddq_f32(vaddq_f32(dot_sum0, dot_sum1), vaddq_f32(dot_sum2, dot_sum3)); - float32x4_t sum1_total = vaddq_f32(vaddq_f32(sum1_0, sum1_1), vaddq_f32(sum1_2, sum1_3)); - float32x4_t sum2_total = vaddq_f32(vaddq_f32(sum2_0, sum2_1), vaddq_f32(sum2_2, sum2_3)); // Horizontal sum for dot product float32x2_t dot_halves = vadd_f32(vget_low_f32(dot_total), vget_high_f32(dot_total)); float32x2_t dot_summed = vpadd_f32(dot_halves, dot_halves); float dot_product = vget_lane_f32(dot_summed, 0); - // Horizontal sum for v1 sum - float32x2_t sum1_halves = vadd_f32(vget_low_f32(sum1_total), vget_high_f32(sum1_total)); - float32x2_t sum1_summed = vpadd_f32(sum1_halves, sum1_halves); - float v1_sum = vget_lane_f32(sum1_summed, 0); - - // Horizontal sum for v2 sum - float32x2_t sum2_halves = vadd_f32(vget_low_f32(sum2_total), vget_high_f32(sum2_total)); - float32x2_t sum2_summed = vpadd_f32(sum2_halves, sum2_halves); - float v2_sum = vget_lane_f32(sum2_summed, 0); + // Handle remaining scalar elements (0-3) + constexpr unsigned char remaining = residual % 4; + if constexpr (remaining > 0) { + for (unsigned char i = 0; i < remaining; i++) { + dot_product += static_cast(pVec1[i]) * static_cast(pVec2[i]); + } + } - // Apply algebraic formula: - // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 - return delta1 * delta2 * dot_product + delta1 * min2 * v1_sum + delta2 * min1 * v2_sum + - static_cast(dimension) * min1 * min2; + // Apply algebraic formula using precomputed sums: + // IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1*q2) + return min1 * sum2 + min2 * sum1 - static_cast(dimension) * min1 * min2 + + delta1 * delta2 * dot_product; } // SQ8-to-SQ8 Inner Product distance function @@ -168,8 +128,7 @@ float SQ8_SQ8_InnerProductSIMD16_NEON(const void *pVec1v, const void *pVec2v, si } // SQ8-to-SQ8 Cosine distance function -// Assumes both vectors are normalized. -// Returns 1 - inner_product +// Returns 1 - inner_product (assumes vectors are pre-normalized) template // 0..15 float SQ8_SQ8_CosineSIMD16_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { return 1.0f - SQ8_SQ8_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h deleted file mode 100644 index 9fb7c7d17..000000000 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include - -/** - * SQ8-to-SQ8 distance functions using ARM NEON with precomputed sum and norm. - * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, - * where BOTH vectors are uint8 quantized. - * - * This version uses precomputed sum and norm stored in the vector data, - * eliminating the need to compute them during distance calculation. - * - * Uses algebraic optimization: - * IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 - * - * Since sum = Σv[i] is precomputed, we only need to compute the dot product Σ(v1[i]*v2[i]). - * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm - * (float)] - */ - -// Helper function with dot product only (no sum computation needed) -static inline void SQ8_SQ8_Precomputed_InnerProductStep_NEON(const uint8_t *&pVec1, - const uint8_t *&pVec2, - float32x4_t &dot_sum) { - // Load 4 uint8 elements from pVec1 and convert to float - uint8x8_t v1_u8 = vld1_u8(pVec1); - uint32x4_t v1_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))); - float32x4_t v1_f = vcvtq_f32_u32(v1_u32); - - // Load 4 uint8 elements from pVec2 and convert to float - uint8x8_t v2_u8 = vld1_u8(pVec2); - uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); - float32x4_t v2_f = vcvtq_f32_u32(v2_u32); - - // Accumulate dot product: dot_sum += v1 * v2 (no dequantization) - dot_sum = vmlaq_f32(dot_sum, v1_f, v2_f); - - // Advance pointers - pVec1 += 4; - pVec2 += 4; -} - -// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm -template // 0..15 -float SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v, - size_t dimension) { - const uint8_t *pVec1 = static_cast(pVec1v); - const uint8_t *pVec2 = static_cast(pVec2v); - - // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] - const float *params1 = reinterpret_cast(pVec1 + dimension); - const float min1 = params1[0]; - const float delta1 = params1[1]; - const float sum1 = params1[2]; // Precomputed sum of uint8 elements - - // Get dequantization parameters and precomputed values from the end of pVec2 - const float *params2 = reinterpret_cast(pVec2 + dimension); - const float min2 = params2[0]; - const float delta2 = params2[1]; - const float sum2 = params2[2]; // Precomputed sum of uint8 elements - - // Calculate number of 16-element chunks - size_t num_of_chunks = (dimension - residual) / 16; - - // Multiple accumulators for ILP (dot product only) - float32x4_t dot_sum0 = vdupq_n_f32(0.0f); - float32x4_t dot_sum1 = vdupq_n_f32(0.0f); - float32x4_t dot_sum2 = vdupq_n_f32(0.0f); - float32x4_t dot_sum3 = vdupq_n_f32(0.0f); - - // Process 16 elements at a time in the main loop - for (size_t i = 0; i < num_of_chunks; i++) { - SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum0); - SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum1); - SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum2); - SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum3); - } - - // Handle remaining complete 4-element blocks within residual - if constexpr (residual >= 4) { - SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum0); - } - if constexpr (residual >= 8) { - SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum1); - } - if constexpr (residual >= 12) { - SQ8_SQ8_Precomputed_InnerProductStep_NEON(pVec1, pVec2, dot_sum2); - } - - // Combine dot product accumulators - float32x4_t dot_total = vaddq_f32(vaddq_f32(dot_sum0, dot_sum1), vaddq_f32(dot_sum2, dot_sum3)); - - // Horizontal sum for dot product - float32x2_t dot_halves = vadd_f32(vget_low_f32(dot_total), vget_high_f32(dot_total)); - float32x2_t dot_summed = vpadd_f32(dot_halves, dot_halves); - float dot_product = vget_lane_f32(dot_summed, 0); - - // Handle remaining scalar elements (0-3) - constexpr unsigned char remaining = residual % 4; - if constexpr (remaining > 0) { - for (unsigned char i = 0; i < remaining; i++) { - dot_product += static_cast(pVec1[i]) * static_cast(pVec2[i]); - } - } - - // Apply algebraic formula using precomputed sums: - // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 - return delta1 * delta2 * dot_product + delta1 * min2 * sum1 + delta2 * min1 * sum2 + - static_cast(dimension) * min1 * min2; -} - -// SQ8-to-SQ8 Inner Product distance function with precomputed sum/norm -// Returns 1 - inner_product (distance form) -template // 0..15 -float SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON(const void *pVec1v, const void *pVec2v, - size_t dimension) { - return 1.0f - - SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); -} - -// SQ8-to-SQ8 Cosine distance function with precomputed sum/norm -// Returns 1 - inner_product (assumes vectors are pre-normalized) -template // 0..15 -float SQ8_SQ8_Precomputed_CosineSIMD16_NEON(const void *pVec1v, const void *pVec2v, - size_t dimension) { - return 1.0f - - SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); -} diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index 7bb494561..7a02a8ce0 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -11,24 +11,27 @@ #include /** - * SQ8-to-SQ8 distance functions for SVE. + * SQ8-to-SQ8 distance functions using ARM SVE with precomputed sum and norm. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, * where BOTH vectors are uint8 quantized. * - * Uses algebraic optimization with INTEGER arithmetic throughout: + * Uses precomputed sum and norm stored in the vector data, + * eliminating the need to compute them during distance calculation. * - * IP = Σ (v1[i]*δ1 + min1) * (v2[i]*δ2 + min2) - * = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 + * Uses algebraic optimization with SVE dot product instruction: * - * All sums are computed using integer dot product instructions, converted to float only at the end. + * With sum = Σv[i] (sum of original float values), the formula is: + * IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] + * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm + * (float)] */ -// Helper function to perform inner product step using integer dot product +// Helper function to perform inner product step using integer dot product (no sum computation) static inline void SQ8_SQ8_InnerProductStep_SVE(const uint8_t *pVec1, const uint8_t *pVec2, size_t &offset, svuint32_t &dot_sum, - svuint32_t &sum1, svuint32_t &sum2, const size_t chunk) { svbool_t pg = svptrue_b8(); @@ -39,101 +42,80 @@ static inline void SQ8_SQ8_InnerProductStep_SVE(const uint8_t *pVec1, const uint // Compute dot product using integer svdot instruction dot_sum = svdot_u32(dot_sum, v1_u8, v2_u8); - // Compute element sums using dot product with ones vector - svuint8_t ones = svdup_u8(1); - sum1 = svdot_u32(sum1, v1_u8, ones); - sum2 = svdot_u32(sum2, v2_u8, ones); - offset += chunk; } -// Common implementation for inner product between two SQ8 vectors -// Uses integer arithmetic throughout for maximum performance +// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm template float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { const uint8_t *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); size_t offset = 0; - // Get dequantization parameters from the end of pVec1 - const float min1 = *reinterpret_cast(pVec1 + dimension); - const float delta1 = *reinterpret_cast(pVec1 + dimension + sizeof(float)); + // Get dequantization parameters and precomputed values from the end of pVec1 + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of original float elements - // Get dequantization parameters from the end of pVec2 - const float min2 = *reinterpret_cast(pVec2 + dimension); - const float delta2 = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + // Get dequantization parameters and precomputed values from the end of pVec2 + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of original float elements - // Get the number of 8-bit elements per vector at runtime + // Get the vector length for uint8 elements const size_t vl = svcntb(); - const size_t chunk_size = 4 * vl; - // Integer accumulators for dot product and element sums + // Calculate number of complete 4-chunk groups + size_t number_of_chunks = dimension / (vl * 4); + + // Multiple accumulators for ILP (dot product only) svuint32_t dot_sum0 = svdup_u32(0); svuint32_t dot_sum1 = svdup_u32(0); svuint32_t dot_sum2 = svdup_u32(0); svuint32_t dot_sum3 = svdup_u32(0); - svuint32_t sum1_0 = svdup_u32(0); - svuint32_t sum1_1 = svdup_u32(0); - svuint32_t sum1_2 = svdup_u32(0); - svuint32_t sum1_3 = svdup_u32(0); - svuint32_t sum2_0 = svdup_u32(0); - svuint32_t sum2_1 = svdup_u32(0); - svuint32_t sum2_2 = svdup_u32(0); - svuint32_t sum2_3 = svdup_u32(0); - - // Process 4 chunks at a time in the main loop - const size_t number_of_chunks = dimension / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, sum1_0, sum2_0, vl); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, sum1_1, sum2_1, vl); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, sum1_2, sum2_2, vl); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum3, sum1_3, sum2_3, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum3, vl); } // Handle remaining steps (0-3 complete chunks) if constexpr (additional_steps >= 1) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, sum1_0, sum2_0, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, vl); } if constexpr (additional_steps >= 2) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, sum1_1, sum2_1, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, vl); } if constexpr (additional_steps >= 3) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, sum1_2, sum2_2, vl); + SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, vl); } // Handle partial chunk if needed if constexpr (partial_chunk) { - svbool_t pg = svwhilelt_b8_u64(offset, dimension); + svbool_t pg = svwhilelt_b8(offset, dimension); svuint8_t v1_u8 = svld1_u8(pg, pVec1 + offset); svuint8_t v2_u8 = svld1_u8(pg, pVec2 + offset); - - // Compute dot product and sums (inactive lanes are already zeroed by svld1) dot_sum3 = svdot_u32(dot_sum3, v1_u8, v2_u8); - svuint8_t ones = svdup_u8(1); - sum1_3 = svdot_u32(sum1_3, v1_u8, ones); - sum2_3 = svdot_u32(sum2_3, v2_u8, ones); } - // Combine the integer accumulators + // Combine all accumulators + svuint32_t dot_total = svadd_u32_x(svptrue_b32(), dot_sum0, dot_sum1); + dot_total = svadd_u32_x(svptrue_b32(), dot_total, dot_sum2); + dot_total = svadd_u32_x(svptrue_b32(), dot_total, dot_sum3); + + // Horizontal sum to scalar integer svbool_t pg32 = svptrue_b32(); - svuint32_t dot_total = svadd_u32_x(pg32, svadd_u32_x(pg32, dot_sum0, dot_sum1), - svadd_u32_x(pg32, dot_sum2, dot_sum3)); - svuint32_t sum1_total = - svadd_u32_x(pg32, svadd_u32_x(pg32, sum1_0, sum1_1), svadd_u32_x(pg32, sum1_2, sum1_3)); - svuint32_t sum2_total = - svadd_u32_x(pg32, svadd_u32_x(pg32, sum2_0, sum2_1), svadd_u32_x(pg32, sum2_2, sum2_3)); - - // Horizontal sum to scalar integers uint32_t dot_product = svaddv_u32(pg32, dot_total); - uint32_t v1_sum = svaddv_u32(pg32, sum1_total); - uint32_t v2_sum = svaddv_u32(pg32, sum2_total); // Apply algebraic formula with float conversion only at the end: - // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 - return delta1 * delta2 * static_cast(dot_product) + - delta1 * min2 * static_cast(v1_sum) + delta2 * min1 * static_cast(v2_sum) + - static_cast(dimension) * min1 * min2; + // IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1*q2) + return min1 * sum2 + min2 * sum1 - static_cast(dimension) * min1 * min2 + + delta1 * delta2 * static_cast(dot_product); } // SQ8-to-SQ8 Inner Product distance function diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h deleted file mode 100644 index 5a78705c6..000000000 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include - -/** - * SQ8-to-SQ8 distance functions using ARM SVE with precomputed sum and norm. - * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, - * where BOTH vectors are uint8 quantized. - * - * This version uses precomputed sum and norm stored in the vector data, - * eliminating the need to compute them during distance calculation. - * - * Uses algebraic optimization with SVE dot product instruction: - * IP = δ1*δ2 * Σ(v1[i]*v2[i]) + δ1*min2 * Σv1[i] + δ2*min1 * Σv2[i] + dim*min1*min2 - * - * Since sum = Σv[i] is precomputed, we only need to compute the dot product Σ(v1[i]*v2[i]). - * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm - * (float)] - */ - -// Helper function to perform inner product step using integer dot product (no sum computation) -static inline void SQ8_SQ8_Precomputed_InnerProductStep_SVE(const uint8_t *pVec1, - const uint8_t *pVec2, size_t &offset, - svuint32_t &dot_sum, - const size_t chunk) { - svbool_t pg = svptrue_b8(); - - // Load uint8 vectors - svuint8_t v1_u8 = svld1_u8(pg, pVec1 + offset); - svuint8_t v2_u8 = svld1_u8(pg, pVec2 + offset); - - // Compute dot product using integer svdot instruction - dot_sum = svdot_u32(dot_sum, v1_u8, v2_u8); - - offset += chunk; -} - -// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm -template -float SQ8_SQ8_Precomputed_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, - size_t dimension) { - const uint8_t *pVec1 = static_cast(pVec1v); - const uint8_t *pVec2 = static_cast(pVec2v); - size_t offset = 0; - - // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] - const float *params1 = reinterpret_cast(pVec1 + dimension); - const float min1 = params1[0]; - const float delta1 = params1[1]; - const float sum1 = params1[2]; // Precomputed sum of uint8 elements - - // Get dequantization parameters and precomputed values from the end of pVec2 - const float *params2 = reinterpret_cast(pVec2 + dimension); - const float min2 = params2[0]; - const float delta2 = params2[1]; - const float sum2 = params2[2]; // Precomputed sum of uint8 elements - - // Get the vector length for uint8 elements - const size_t vl = svcntb(); - - // Calculate number of complete 4-chunk groups - size_t number_of_chunks = dimension / (vl * 4); - - // Multiple accumulators for ILP (dot product only) - svuint32_t dot_sum0 = svdup_u32(0); - svuint32_t dot_sum1 = svdup_u32(0); - svuint32_t dot_sum2 = svdup_u32(0); - svuint32_t dot_sum3 = svdup_u32(0); - - for (size_t i = 0; i < number_of_chunks; i++) { - SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, vl); - SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, vl); - SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, vl); - SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum3, vl); - } - - // Handle remaining steps (0-3 complete chunks) - if constexpr (additional_steps >= 1) { - SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, vl); - } - if constexpr (additional_steps >= 2) { - SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, vl); - } - if constexpr (additional_steps >= 3) { - SQ8_SQ8_Precomputed_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, vl); - } - - // Handle partial chunk if needed - if constexpr (partial_chunk) { - svbool_t pg = svwhilelt_b8(offset, dimension); - svuint8_t v1_u8 = svld1_u8(pg, pVec1 + offset); - svuint8_t v2_u8 = svld1_u8(pg, pVec2 + offset); - dot_sum3 = svdot_u32(dot_sum3, v1_u8, v2_u8); - } - - // Combine all accumulators - svuint32_t dot_total = svadd_u32_x(svptrue_b32(), dot_sum0, dot_sum1); - dot_total = svadd_u32_x(svptrue_b32(), dot_total, dot_sum2); - dot_total = svadd_u32_x(svptrue_b32(), dot_total, dot_sum3); - - // Horizontal sum to scalar integer - svbool_t pg32 = svptrue_b32(); - uint32_t dot_product = svaddv_u32(pg32, dot_total); - - // Apply algebraic formula with float conversion only at the end: - // IP = δ1*δ2 * Σ(v1*v2) + δ1*min2 * Σv1 + δ2*min1 * Σv2 + dim*min1*min2 - return delta1 * delta2 * static_cast(dot_product) + delta1 * min2 * sum1 + - delta2 * min1 * sum2 + static_cast(dimension) * min1 * min2; -} - -// SQ8-to-SQ8 Inner Product distance function with precomputed sum/norm -// Returns 1 - inner_product (distance form) -template -float SQ8_SQ8_Precomputed_InnerProductSIMD_SVE(const void *pVec1v, const void *pVec2v, - size_t dimension) { - return 1.0f - SQ8_SQ8_Precomputed_InnerProductSIMD_SVE_IMP( - pVec1v, pVec2v, dimension); -} - -// SQ8-to-SQ8 Cosine distance function with precomputed sum/norm -// Returns 1 - inner_product (assumes vectors are pre-normalized) -template -float SQ8_SQ8_Precomputed_CosineSIMD_SVE(const void *pVec1v, const void *pVec2v, size_t dimension) { - return 1.0f - SQ8_SQ8_Precomputed_InnerProductSIMD_SVE_IMP( - pVec1v, pVec2v, dimension); -} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index de47b7afc..e6dfbe8e0 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -150,7 +150,8 @@ dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, return ret_dist_func; } -// SQ8-to-SQ8 Inner Product distance function (both vectors are uint8 quantized) +// SQ8-to-SQ8 Inner Product distance function (both vectors are uint8 quantized with precomputed +// sum/norm) dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; @@ -168,7 +169,6 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, } #endif #ifdef OPT_NEON_DOTPROD - // DOTPROD uses integer arithmetic - much faster than float-based NEON if (features.asimddp) { return Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(dim); } @@ -182,11 +182,7 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, #ifdef CPU_FEATURES_ARCH_X86_64 #ifdef OPT_AVX512_F_BW_VL_VNNI - // AVX512 VNNI SQ8_SQ8 uses 64-element chunks - if (dim < 64) { - return ret_dist_func; - } - if (features.avx512f && features.avx512bw && features.avx512vnni) { + if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { return Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); } #endif @@ -194,7 +190,7 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, return ret_dist_func; } -// SQ8-to-SQ8 Cosine distance function (both vectors are uint8 quantized) +// SQ8-to-SQ8 Cosine distance function (both vectors are uint8 quantized with precomputed sum/norm) dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; @@ -217,7 +213,6 @@ dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignme } #endif #ifdef OPT_NEON_DOTPROD - // DOTPROD uses integer arithmetic - much faster than float-based NEON if (features.asimddp) { return Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(dim); } @@ -231,7 +226,6 @@ dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignme #ifdef CPU_FEATURES_ARCH_X86_64 #ifdef OPT_AVX512_F_BW_VL_VNNI - // AVX512 VNNI SQ8_SQ8 uses 64-element chunks if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { return Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); } @@ -240,84 +234,6 @@ dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignme return ret_dist_func; } -// SQ8-to-SQ8 Precomputed Inner Product distance function (with precomputed sum/norm) -dist_func_t IP_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, unsigned char *alignment, - const void *arch_opt) { - unsigned char dummy_alignment; - if (alignment == nullptr) { - alignment = &dummy_alignment; - } - - dist_func_t ret_dist_func = SQ8_SQ8_InnerProduct_Precomputed; // Fallback to original - [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); - -#ifdef CPU_FEATURES_ARCH_AARCH64 -#ifdef OPT_SVE - if (features.sve) { - return Choose_SQ8_SQ8_Precomputed_IP_implementation_SVE(dim); - } -#endif -#ifdef OPT_NEON_DOTPROD - if (features.asimddp) { - return Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON_DOTPROD(dim); - } -#endif -#ifdef OPT_NEON - if (features.asimd) { - return Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON(dim); - } -#endif -#endif // AARCH64 - -#ifdef CPU_FEATURES_ARCH_X86_64 -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { - return Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); - } -#endif -#endif // __x86_64__ - return ret_dist_func; -} - -// SQ8-to-SQ8 Precomputed Cosine distance function (with precomputed sum/norm) -dist_func_t Cosine_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, unsigned char *alignment, - const void *arch_opt) { - unsigned char dummy_alignment; - if (alignment == nullptr) { - alignment = &dummy_alignment; - } - - dist_func_t ret_dist_func = SQ8_SQ8_Cosine_Precomputed; // Fallback to original - [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); - -#ifdef CPU_FEATURES_ARCH_AARCH64 -#ifdef OPT_SVE - if (features.sve) { - return Choose_SQ8_SQ8_Precomputed_Cosine_implementation_SVE(dim); - } -#endif -#ifdef OPT_NEON_DOTPROD - if (features.asimddp) { - return Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON_DOTPROD(dim); - } -#endif -#ifdef OPT_NEON - if (features.asimd) { - return Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON(dim); - } -#endif -#endif // AARCH64 - -#ifdef CPU_FEATURES_ARCH_X86_64 -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { - return Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); - } -#endif -#endif // __x86_64__ - return ret_dist_func; -} - dist_func_t IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; if (alignment == nullptr) { diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index 40d5aab72..c7c5fc17d 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -31,17 +31,9 @@ dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment const void *arch_opt = nullptr); dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); - -// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) -dist_func_t IP_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, - unsigned char *alignment = nullptr, - const void *arch_opt = nullptr); -dist_func_t Cosine_SQ8_SQ8_Precomputed_GetDistFunc(size_t dim, - unsigned char *alignment = nullptr, - const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 09f39480e..d784e3b1b 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -18,7 +18,6 @@ #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h" #include "VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h" -#include "VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI_Precomputed.h" namespace spaces { @@ -75,7 +74,7 @@ dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI); @@ -88,21 +87,6 @@ dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_ return ret_dist_func; } -// SQ8-to-SQ8 distance functions with precomputed sum and norm -dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, - SQ8_SQ8_Precomputed_InnerProductSIMD64_AVX512F_BW_VL_VNNI); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, - SQ8_SQ8_Precomputed_CosineSIMD64_AVX512F_BW_VL_VNNI); - return ret_dist_func; -} - #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 5bcdd63de..0105eab1f 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -24,12 +24,8 @@ dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); -// SQ8-to-SQ8 distance functions with precomputed sum and norm -dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); -dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); - } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON.cpp b/src/VecSim/spaces/functions/NEON.cpp index f5125a9b8..4f3e9eef5 100644 --- a/src/VecSim/spaces/functions/NEON.cpp +++ b/src/VecSim/spaces/functions/NEON.cpp @@ -18,7 +18,6 @@ #include "VecSim/spaces/L2/L2_NEON_SQ8.h" #include "VecSim/spaces/IP/IP_NEON_SQ8.h" #include "VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h" -#include "VecSim/spaces/IP/IP_NEON_SQ8_SQ8_Precomputed.h" namespace spaces { @@ -101,7 +100,7 @@ dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim) { return ret_dist_func; } -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_InnerProductSIMD16_NEON); @@ -114,19 +113,6 @@ dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim) { return ret_dist_func; } -// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) -dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_Precomputed_InnerProductSIMD16_NEON); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_Precomputed_CosineSIMD16_NEON); - return ret_dist_func; -} - #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h index 27efa5593..4e6324cac 100644 --- a/src/VecSim/spaces/functions/NEON.h +++ b/src/VecSim/spaces/functions/NEON.h @@ -30,12 +30,8 @@ dist_func_t Choose_SQ8_L2_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_IP_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim); -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim); -// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) -dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON(size_t dim); -dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON(size_t dim); - } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp index 5e8c4d3ba..8ab510c99 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp +++ b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp @@ -10,7 +10,6 @@ #include "VecSim/spaces/IP/IP_NEON_DOTPROD_INT8.h" #include "VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h" #include "VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h" -#include "VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8_Precomputed.h" #include "VecSim/spaces/L2/L2_NEON_DOTPROD_INT8.h" #include "VecSim/spaces/L2/L2_NEON_DOTPROD_UINT8.h" @@ -54,7 +53,7 @@ dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim) { return ret_dist_func; } -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD); @@ -67,20 +66,6 @@ dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim) return ret_dist_func; } -// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) -dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON_DOTPROD(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, - SQ8_SQ8_Precomputed_InnerProductSIMD64_NEON_DOTPROD); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON_DOTPROD(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_Precomputed_CosineSIMD64_NEON_DOTPROD); - return ret_dist_func; -} - #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.h b/src/VecSim/spaces/functions/NEON_DOTPROD.h index 7bc24c726..0487a5b6f 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.h +++ b/src/VecSim/spaces/functions/NEON_DOTPROD.h @@ -21,12 +21,8 @@ dist_func_t Choose_UINT8_Cosine_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_INT8_L2_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim); -// SQ8-to-SQ8 DOTPROD-optimized distance functions +// SQ8-to-SQ8 DOTPROD-optimized distance functions (with precomputed sum/norm) dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim); -// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) -dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_NEON_DOTPROD(size_t dim); -dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_NEON_DOTPROD(size_t dim); - } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index 3c1013298..da08009ec 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -26,7 +26,6 @@ #include "VecSim/spaces/L2/L2_SVE_SQ8.h" #include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" -#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8_Precomputed.h" namespace spaces { @@ -119,7 +118,7 @@ dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim) { return ret_dist_func; } -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim) { dist_func_t ret_dist_func; @@ -133,19 +132,6 @@ dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim) { return ret_dist_func; } -// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) -dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_SVE(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_Precomputed_InnerProductSIMD_SVE, dim, svcntb); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_SVE(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_Precomputed_CosineSIMD_SVE, dim, svcntb); - return ret_dist_func; -} - #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 70dcccb47..3b88573d2 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -33,12 +33,8 @@ dist_func_t Choose_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim); -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim); -// SQ8-to-SQ8 precomputed distance functions (with precomputed sum/norm) -dist_func_t Choose_SQ8_SQ8_Precomputed_IP_implementation_SVE(size_t dim); -dist_func_t Choose_SQ8_SQ8_Precomputed_Cosine_implementation_SVE(size_t dim); - } // namespace spaces diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index 34a9c3689..827de7323 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -94,8 +94,9 @@ class BM_VecSimSpaces_SQ8_SQ8_Precomputed : public benchmark::Fixture { // // AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions // #ifdef OPT_AVX512_F_BW_VL_VNNI -// bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; -// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, +// bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && +// opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, +// AVX512F_BW_VL_VNNI, 64, // avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, // avx512_f_bw_vl_vnni_supported); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 92f828d2d..49088f759 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2047,21 +2047,27 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) { INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest, testing::Range(32UL, 64 * 2UL + 1)); -// Helper function to create SQ8 quantized vector +// Helper function to create SQ8 quantized vector with precomputed sum and norm +// Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] std::vector CreateSQ8QuantizedVector(const float *original, size_t dim) { // Create a copy of the original vector that we can modify std::vector vec_copy(original, original + dim); - // Size: dim (uint8_t) + min_val (float) + delta (float) - size_t quantized_size = dim * sizeof(uint8_t) + 2 * sizeof(float); + // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + norm (float) + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector quantized(quantized_size); - // Find min and max for quantization + // Find min and max for quantization, and compute sum and square_sum of original float values float min_val = vec_copy[0]; float max_val = vec_copy[0]; - for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, vec_copy[i]); - max_val = std::max(max_val, vec_copy[i]); + float sum = 0.0f; + float square_sum = 0.0f; + for (size_t i = 0; i < dim; i++) { + auto val = vec_copy[i]; + min_val = std::min(min_val, val); + max_val = std::max(max_val, val); + sum += val; + square_sum += val * val; } // Calculate delta @@ -2069,18 +2075,21 @@ std::vector CreateSQ8QuantizedVector(const float *original, size_t dim) if (delta == 0) delta = 1.0f; // Avoid division by zero - // Quantize vector + // Quantize vector and compute sum and square_sum uint8_t *quant_values = quantized.data(); - // Quantize each value + for (size_t i = 0; i < dim; i++) { float normalized = (vec_copy[i] - min_val) / delta; normalized = std::max(0.0f, std::min(255.0f, normalized)); quant_values[i] = static_cast(std::round(normalized)); } - // Store parameters + + // Store parameters: [min, delta, sum, square_sum] float *params = reinterpret_cast(quant_values + dim); params[0] = min_val; params[1] = delta; + params[2] = sum; + params[3] = square_sum; return quantized; } @@ -2603,28 +2612,17 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); - // Create original vectors - std::vector v1_orig(dim); - std::vector v2_orig(dim); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i * 0.75 + 1.0); - } - - // Normalize both vectors - spaces::GetNormalizeFunc()(v1_orig.data(), dim); - spaces::GetNormalizeFunc()(v2_orig.data(), dim); - - // Create SQ8 quantized versions of both vectors - std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); - std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); + // Create quantized vectors + // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + norm (float) + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim, 1234); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim, 5678); dist_func_t arch_opt_func; float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); - // Note: CreateSQ8QuantizedVectorWithSumNorm is defined later in this file - // We test precomputed version in the dedicated SQ8_SQ8_PrecomputedOptFuncs tests - #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; @@ -2698,157 +2696,9 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { INSTANTIATE_TEST_SUITE_P(SQ8_SQ8OptFuncs, SQ8_SQ8_SpacesOptimizationTest, testing::Range(64UL, 64 * 2UL + 1)); -/* ======================== Tests SQ8_SQ8 Precomputed ========================= */ - -// Helper function to create SQ8 quantized vector with precomputed sum and norm -// Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] -std::vector CreateSQ8QuantizedVectorWithSumNorm(const float *original, size_t dim) { - std::vector vec_copy(original, original + dim); - - // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + norm (float) - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector quantized(quantized_size); - - // Find min and max for quantization - float min_val = vec_copy[0]; - float max_val = vec_copy[0]; - for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, vec_copy[i]); - max_val = std::max(max_val, vec_copy[i]); - } - - // Calculate delta - float delta = (max_val - min_val) / 255.0f; - if (delta == 0) - delta = 1.0f; - - // Quantize vector and compute sum and norm - uint8_t *quant_values = quantized.data(); - float sum = 0.0f; - float norm = 0.0f; - for (size_t i = 0; i < dim; i++) { - float normalized = (vec_copy[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, normalized)); - quant_values[i] = static_cast(std::round(normalized)); - sum += static_cast(quant_values[i]); - norm += static_cast(quant_values[i]) * static_cast(quant_values[i]); - } - - // Store parameters: [min, delta, sum, norm] - float *params = reinterpret_cast(quant_values + dim); - params[0] = min_val; - params[1] = delta; - params[2] = sum; - params[3] = norm; - - return quantized; -} - -class SQ8_SQ8_Precomputed_SpacesOptimizationTest : public testing::TestWithParam {}; - -TEST_P(SQ8_SQ8_Precomputed_SpacesOptimizationTest, SQ8_SQ8_Precomputed_InnerProductTest) { - auto optimization = getCpuOptimizationFeatures(); - size_t dim = GetParam(); - - // Create original vectors - std::vector v1_orig(dim); - std::vector v2_orig(dim); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i * 0.75 + 1.0); - } - - // Normalize both vectors - spaces::GetNormalizeFunc()(v1_orig.data(), dim); - spaces::GetNormalizeFunc()(v2_orig.data(), dim); - - // Create SQ8 quantized versions (standard format for baseline) - std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); - std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); - - // Create SQ8 quantized versions with precomputed sum/norm - std::vector v1_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v1_orig.data(), dim); - std::vector v2_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v2_orig.data(), dim); - - // Baseline: original SQ8_SQ8 implementation - float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); - -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { - // Test precomputed version - auto precomputed_func = - spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); - float precomputed_result = - precomputed_func(v1_precomputed.data(), v2_precomputed.data(), dim); - - // Precomputed should match baseline (within tolerance for float precision) - ASSERT_NEAR(baseline, precomputed_result, 0.01) << "AVX512 Precomputed IP with dim " << dim; - } -#endif -} - -TEST_P(SQ8_SQ8_Precomputed_SpacesOptimizationTest, SQ8_SQ8_Precomputed_CosineTest) { - auto optimization = getCpuOptimizationFeatures(); - size_t dim = GetParam(); - - // Create original vectors - std::vector v1_orig(dim); - std::vector v2_orig(dim); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i * 0.75 + 1.0); - } - - // Normalize both vectors - spaces::GetNormalizeFunc()(v1_orig.data(), dim); - spaces::GetNormalizeFunc()(v2_orig.data(), dim); - - // Create SQ8 quantized versions (standard format for baseline) - std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); - std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); - - // Create SQ8 quantized versions with precomputed sum/norm - std::vector v1_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v1_orig.data(), dim); - std::vector v2_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v2_orig.data(), dim); - - // Baseline: original SQ8_SQ8 implementation - float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); - float baseline_precomputed = SQ8_SQ8_Cosine_Precomputed(v1_precomputed.data(), v2_precomputed.data(), dim); - - ASSERT_NEAR(baseline, baseline_precomputed, 0.01) << "Precomputed should match baseline"; - -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { - // Test precomputed version - auto precomputed_func = - spaces::Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); - float precomputed_result = - precomputed_func(v1_precomputed.data(), v2_precomputed.data(), dim); - - // Precomputed should match baseline (within tolerance for float precision) - ASSERT_NEAR(baseline, precomputed_result, 0.01) - << "AVX512 Precomputed Cosine with dim " << dim; - } -#endif -} - -// Test suite covers dimensions 64-128 to exercise AVX512 SIMD paths -INSTANTIATE_TEST_SUITE_P(SQ8_SQ8_PrecomputedOptFuncs, SQ8_SQ8_Precomputed_SpacesOptimizationTest, - testing::Range(64UL, 64 * 2UL + 1)); - -// Additional test suite for smaller dimensions (1-63) to test residual handling -INSTANTIATE_TEST_SUITE_P(SQ8_SQ8_PrecomputedOptFuncs_SmallDim, - SQ8_SQ8_Precomputed_SpacesOptimizationTest, - testing::Values(1UL, 7UL, 15UL, 16UL, 31UL, 32UL, 33UL, 48UL, 63UL)); - -// Test suite for larger dimensions to stress-test the implementation -INSTANTIATE_TEST_SUITE_P(SQ8_SQ8_PrecomputedOptFuncs_LargeDim, - SQ8_SQ8_Precomputed_SpacesOptimizationTest, - testing::Values(256UL, 512UL, 768UL, 1024UL, 1536UL)); - #ifdef OPT_AVX512_F_BW_VL_VNNI // Test self-distance: distance to itself should be 0 for cosine (normalized vectors) -TEST(SQ8_SQ8_Precomputed_EdgeCases, SelfDistanceCosine) { +TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { auto optimization = getCpuOptimizationFeatures(); if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { GTEST_SKIP() << "AVX512 VNNI not available"; @@ -2870,18 +2720,17 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, SelfDistanceCosine) { v_orig[i] /= norm; } - auto v_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_orig.data(), dim); + auto v_quantized = CreateSQ8QuantizedVector(v_orig.data(), dim); - auto precomputed_func = - spaces::Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); - float self_distance = precomputed_func(v_quantized.data(), v_quantized.data(), dim); + auto cosine_func = spaces::Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + float self_distance = cosine_func(v_quantized.data(), v_quantized.data(), dim); // Self-distance for cosine should be close to 0 ASSERT_NEAR(self_distance, 0.0f, 0.02f) << "Self-distance should be ~0 for cosine"; } // Test symmetry: dist(v1, v2) == dist(v2, v1) -TEST(SQ8_SQ8_Precomputed_EdgeCases, SymmetryTest) { +TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { auto optimization = getCpuOptimizationFeatures(); if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { GTEST_SKIP() << "AVX512 VNNI not available"; @@ -2897,12 +2746,11 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, SymmetryTest) { v2_orig[i] = dist(rng); } - auto v1_quantized = CreateSQ8QuantizedVectorWithSumNorm(v1_orig.data(), dim); - auto v2_quantized = CreateSQ8QuantizedVectorWithSumNorm(v2_orig.data(), dim); + auto v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); + auto v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); - auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); - auto cosine_func = - spaces::Choose_SQ8_SQ8_Precomputed_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + auto cosine_func = spaces::Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); float ip_12 = ip_func(v1_quantized.data(), v2_quantized.data(), dim); float ip_21 = ip_func(v2_quantized.data(), v1_quantized.data(), dim); @@ -2914,7 +2762,7 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, SymmetryTest) { } // Test with zero vector -TEST(SQ8_SQ8_Precomputed_EdgeCases, ZeroVectorTest) { +TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { auto optimization = getCpuOptimizationFeatures(); if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { GTEST_SKIP() << "AVX512 VNNI not available"; @@ -2930,23 +2778,20 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, ZeroVectorTest) { v_nonzero[i] = dist(rng); } - auto v_zero_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_zero.data(), dim); - auto v_nonzero_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_nonzero.data(), dim); + auto v_zero_quantized = CreateSQ8QuantizedVector(v_zero.data(), dim); + auto v_nonzero_quantized = CreateSQ8QuantizedVector(v_nonzero.data(), dim); - // Get baseline from original implementation - auto orig_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - auto orig_v_zero = CreateSQ8QuantizedVector(v_zero.data(), dim); - auto orig_v_nonzero = CreateSQ8QuantizedVector(v_nonzero.data(), dim); - float baseline = orig_func(orig_v_zero.data(), orig_v_nonzero.data(), dim); + // Compute baseline using fallback function + float baseline = SQ8_SQ8_InnerProduct(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); float result = ip_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Zero vector IP should match baseline"; } // Test with constant vector (all same values) -TEST(SQ8_SQ8_Precomputed_EdgeCases, ConstantVectorTest) { +TEST(SQ8_SQ8_EdgeCases, ConstantVectorTest) { auto optimization = getCpuOptimizationFeatures(); if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { GTEST_SKIP() << "AVX512 VNNI not available"; @@ -2962,23 +2807,20 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, ConstantVectorTest) { v_random[i] = dist(rng); } - auto v_const_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_const.data(), dim); - auto v_random_quantized = CreateSQ8QuantizedVectorWithSumNorm(v_random.data(), dim); + auto v_const_quantized = CreateSQ8QuantizedVector(v_const.data(), dim); + auto v_random_quantized = CreateSQ8QuantizedVector(v_random.data(), dim); - // Get baseline from original implementation - auto orig_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - auto orig_v_const = CreateSQ8QuantizedVector(v_const.data(), dim); - auto orig_v_random = CreateSQ8QuantizedVector(v_random.data(), dim); - float baseline = orig_func(orig_v_const.data(), orig_v_random.data(), dim); + // Compute baseline using fallback function + float baseline = SQ8_SQ8_InnerProduct(v_const_quantized.data(), v_random_quantized.data(), dim); - auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); float result = ip_func(v_const_quantized.data(), v_random_quantized.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Constant vector IP should match baseline"; } // Test with extreme values (-1 and 1 only) -TEST(SQ8_SQ8_Precomputed_EdgeCases, ExtremeValuesTest) { +TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { auto optimization = getCpuOptimizationFeatures(); if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { GTEST_SKIP() << "AVX512 VNNI not available"; @@ -2993,23 +2835,20 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, ExtremeValuesTest) { v2[i] = (i % 3 == 0) ? 1.0f : -1.0f; } - auto v1_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v1.data(), dim); - auto v2_precomputed = CreateSQ8QuantizedVectorWithSumNorm(v2.data(), dim); + auto v1_quantized = CreateSQ8QuantizedVector(v1.data(), dim); + auto v2_quantized = CreateSQ8QuantizedVector(v2.data(), dim); - // Get baseline from original implementation - auto orig_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - auto orig_v1 = CreateSQ8QuantizedVector(v1.data(), dim); - auto orig_v2 = CreateSQ8QuantizedVector(v2.data(), dim); - float baseline = orig_func(orig_v1.data(), orig_v2.data(), dim); + // Compute baseline using fallback function + float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); - auto ip_func = spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); - float result = ip_func(v1_precomputed.data(), v2_precomputed.data(), dim); + auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + float result = ip_func(v1_quantized.data(), v2_quantized.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values IP should match baseline"; } // Test accuracy across multiple random vector pairs -TEST(SQ8_SQ8_Precomputed_EdgeCases, AccuracyStressTest) { +TEST(SQ8_SQ8_EdgeCases, AccuracyStressTest) { auto optimization = getCpuOptimizationFeatures(); if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { GTEST_SKIP() << "AVX512 VNNI not available"; @@ -3020,9 +2859,7 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, AccuracyStressTest) { std::mt19937 rng(999); std::uniform_real_distribution dist(-10.0f, 10.0f); - auto orig_ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - auto precomputed_ip_func = - spaces::Choose_SQ8_SQ8_Precomputed_IP_implementation_AVX512F_BW_VL_VNNI(dim); + auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); float max_error = 0.0f; for (int iter = 0; iter < num_iterations; iter++) { @@ -3032,13 +2869,12 @@ TEST(SQ8_SQ8_Precomputed_EdgeCases, AccuracyStressTest) { v2[i] = dist(rng); } - auto orig_v1 = CreateSQ8QuantizedVector(v1.data(), dim); - auto orig_v2 = CreateSQ8QuantizedVector(v2.data(), dim); - float baseline = orig_ip_func(orig_v1.data(), orig_v2.data(), dim); + auto v1_quantized = CreateSQ8QuantizedVector(v1.data(), dim); + auto v2_quantized = CreateSQ8QuantizedVector(v2.data(), dim); - auto precomp_v1 = CreateSQ8QuantizedVectorWithSumNorm(v1.data(), dim); - auto precomp_v2 = CreateSQ8QuantizedVectorWithSumNorm(v2.data(), dim); - float result = precomputed_ip_func(precomp_v1.data(), precomp_v2.data(), dim); + // Compute baseline using fallback function + float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); + float result = ip_func(v1_quantized.data(), v2_quantized.data(), dim); float error = std::abs(result - baseline); max_error = std::max(max_error, error); diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 4f90781a7..4bc1d53aa 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -11,6 +11,7 @@ #include #include #include "VecSim/spaces/normalize/compute_norm.h" +#include "VecSim/spaces/spaces.h" #include "VecSim/types/float16.h" namespace test_utils { @@ -106,7 +107,7 @@ static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] * where sum = Σv[i] and norm = Σv[i]² (sum of squares of uint8 elements) */ -static void quantize_float_vec_to_uint8_with_sum_norm(float *v, size_t dim, uint8_t *qv, +static void quantize_float_vec_to_uint8_with_sum_norm(const float *v, size_t dim, uint8_t *qv, int seed = 1234) { float min_val = v[0]; float max_val = v[0]; @@ -114,20 +115,25 @@ static void quantize_float_vec_to_uint8_with_sum_norm(float *v, size_t dim, uint min_val = std::min(min_val, v[i]); max_val = std::max(max_val, v[i]); } + + float sum = 0.0f; + float square_sum = 0.0f; + for (size_t i = 0; i < dim; i++) { + sum += v[i]; + square_sum += v[i] * v[i]; + } + // Calculate delta float delta = (max_val - min_val) / 255.0f; if (delta == 0) delta = 1.0f; // Avoid division by zero - // Quantize each value and compute sum and norm - float sum = 0.0f; - float norm = 0.0f; + // Quantize each value + for (size_t i = 0; i < dim; i++) { float normalized = (v[i] - min_val) / delta; normalized = std::max(0.0f, std::min(255.0f, normalized)); qv[i] = static_cast(std::round(normalized)); - sum += static_cast(qv[i]); - norm += static_cast(qv[i]) * static_cast(qv[i]); } // Store parameters: [min, delta, sum, norm] @@ -135,7 +141,7 @@ static void quantize_float_vec_to_uint8_with_sum_norm(float *v, size_t dim, uint params[0] = min_val; params[1] = delta; params[2] = sum; - params[3] = norm; + params[3] = square_sum; } /** @@ -143,12 +149,10 @@ static void quantize_float_vec_to_uint8_with_sum_norm(float *v, size_t dim, uint * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] */ static void populate_float_vec_to_sq8_with_sum_norm(uint8_t *v, size_t dim, int seed = 1234) { - std::mt19937 gen(seed); - std::uniform_real_distribution dis(-1.0f, 1.0f); std::vector vec(dim); - for (size_t i = 0; i < dim; i++) { - vec[i] = dis(gen); - } + populate_float_vec(vec.data(), dim, seed); + // Normalize vector + spaces::GetNormalizeFunc()(vec.data(), dim); quantize_float_vec_to_uint8_with_sum_norm(vec.data(), dim, v, seed); } From a111e36882ddb1eae7c44ac172449c2eefd79ad6 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 30 Dec 2025 17:45:37 +0200 Subject: [PATCH 23/51] Refactor SQ8 distance functions and tests for improved clarity and consistency - Updated include paths in AVX512F_BW_VL_VNNI.cpp to reflect new naming conventions. - Modified unit tests in test_spaces.cpp to streamline vector initialization and quantization processes. - Replaced repetitive code with utility functions for populating and quantizing vectors. - Enhanced assertions in tests to ensure optimized distance functions are correctly chosen and validated. - Removed unnecessary parameters from utility functions to simplify their interfaces. - Improved test coverage for edge cases, including zero and constant vectors, ensuring accuracy across various scenarios. --- ..._VL_VNNI.h => IP_AVX512F_BW_VL_VNNI_SQ8.h} | 0 ...VNNI.h => IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h} | 14 +- .../spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h | 12 +- src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 12 +- src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 14 +- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 4 +- tests/unit/test_spaces.cpp | 627 +++++++++--------- tests/utils/tests_utils.h | 10 +- 8 files changed, 342 insertions(+), 351 deletions(-) rename src/VecSim/spaces/IP/{IP_AVX512F_SQ8_BW_VL_VNNI.h => IP_AVX512F_BW_VL_VNNI_SQ8.h} (100%) rename src/VecSim/spaces/IP/{IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h => IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h} (92%) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h similarity index 100% rename from src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h rename to src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h similarity index 92% rename from src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h rename to src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h index 607ca79c2..1643775c4 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h @@ -21,12 +21,12 @@ * Uses algebraic optimization to leverage integer VNNI instructions: * * With sum = Σv[i] (sum of original float values), the formula is: - * IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) + * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm - * (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [sum of + * squares (float)] */ // Process 64 uint8 elements using VNNI with multiple accumulators for ILP (dot product only) @@ -75,14 +75,12 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim const float min1 = params1[0]; const float delta1 = params1[1]; const float sum1 = params1[2]; // Precomputed sum of original float elements - // const float norm1 = params1[3]; // Precomputed norm (sum of squares) - not used for IP // Get dequantization parameters and precomputed values from the end of pVec2 const float *params2 = reinterpret_cast(pVec2 + dimension); const float min2 = params2[0]; const float delta2 = params2[1]; const float sum2 = params2[2]; // Precomputed sum of original float elements - // const float norm2 = params2[3]; // Precomputed norm (sum of squares) - not used for IP // Multiple accumulators for instruction-level parallelism (dot product only) __m512i dot_acc0 = _mm512_setzero_si512(); @@ -135,9 +133,9 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim int64_t dot_product = _mm512_reduce_add_epi32(dot_total); // Apply the algebraic formula using precomputed sums: - // IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) - float result = min1 * sum2 + min2 * sum1 - static_cast(dimension) * min1 * min2 + - delta1 * delta2 * static_cast(dot_product); + // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 + float result = min1 * sum2 + min2 * sum1 + delta1 * delta2 * static_cast(dot_product) - + static_cast(dimension) * min1 * min2; return result; } diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h index de1ba5f53..af9f5739a 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h @@ -21,12 +21,12 @@ * Uses algebraic optimization with DOTPROD instruction: * * With sum = Σv[i] (sum of original float values), the formula is: - * IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) + * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm - * (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [sum of + * squares (float)] */ // Helper function: computes dot product using DOTPROD instruction (no sum computation needed) @@ -107,9 +107,9 @@ float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void } // Apply algebraic formula using precomputed sums: - // IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1*q2) - return min1 * sum2 + min2 * sum1 - static_cast(dimension) * min1 * min2 + - delta1 * delta2 * static_cast(dot_product); + // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1*q2) - dim*min1*min2 + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * static_cast(dot_product) - + static_cast(dimension) * min1 * min2; } // SQ8-to-SQ8 Inner Product distance function diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index af1bef2af..61d2797fa 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -21,12 +21,12 @@ * Uses algebraic optimization: * * With sum = Σv[i] (sum of original float values), the formula is: - * IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) + * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm - * (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [sum of + * squares (float)] */ // Helper function with dot product only (no sum computation needed) @@ -115,9 +115,9 @@ float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v } // Apply algebraic formula using precomputed sums: - // IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1*q2) - return min1 * sum2 + min2 * sum1 - static_cast(dimension) * min1 * min2 + - delta1 * delta2 * dot_product; + // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1*q2) - dim*min1*min2 + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * dot_product - + static_cast(dimension) * min1 * min2; } // SQ8-to-SQ8 Inner Product distance function diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index 7a02a8ce0..e6f1301cd 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -21,12 +21,12 @@ * Uses algebraic optimization with SVE dot product instruction: * * With sum = Σv[i] (sum of original float values), the formula is: - * IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1[i]*q2[i]) + * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm - * (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [sum of + * squares (float)] */ // Helper function to perform inner product step using integer dot product (no sum computation) @@ -53,7 +53,7 @@ float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, s size_t offset = 0; // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [sum of squares (float)] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; @@ -113,9 +113,9 @@ float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, s uint32_t dot_product = svaddv_u32(pg32, dot_total); // Apply algebraic formula with float conversion only at the end: - // IP = min1*sum2 + min2*sum1 - dim*min1*min2 + δ1*δ2 * Σ(q1*q2) - return min1 * sum2 + min2 * sum1 - static_cast(dimension) * min1 * min2 + - delta1 * delta2 * static_cast(dot_product); + // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1*q2) - dim*min1*min2 + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * static_cast(dot_product) - + static_cast(dimension) * min1 * min2; } // SQ8-to-SQ8 Inner Product distance function diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index d784e3b1b..89bcabf11 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -14,10 +14,10 @@ #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" -#include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h" #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h" -#include "VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h" namespace spaces { diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 49088f759..5f501810a 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -43,6 +43,7 @@ using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; +using namespace spaces; class SpacesTest : public ::testing::Test { @@ -356,11 +357,19 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { quant_values[i] = static_cast(std::round(normalized)); } - float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_quantized.data(), dim); + float baseline = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_quantized.data(), dim); // Since we're comparing identical vectors, the inner product distance should be close to // expected - ASSERT_NEAR(dist, expected_dist, 0.01) << "SQ8_InnerProduct failed to match expected distance"; + ASSERT_NEAR(baseline, expected_dist, 0.01) << "SQ8_InnerProduct failed to match expected distance"; + + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, nullptr); + ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig, v2_quantized.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } /* ======================== Tests SQ8 ========================= */ @@ -375,91 +384,46 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { size_t dim = 5; // Create original vectors - float v1_orig[dim], v2_orig[dim]; - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i + 1.5); - } - - // Size: dim (uint8_t) + min_val (float) + delta (float) - size_t quantized_size = dim * sizeof(uint8_t) + 2 * sizeof(float); - spaces::GetNormalizeFunc()(v1_orig, dim); - spaces::GetNormalizeFunc()(v2_orig, dim); - // Find min and max for quantization - float min_val = v2_orig[0]; - float max_val = v2_orig[0]; - for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, v2_orig[i]); - max_val = std::max(max_val, v2_orig[i]); - } - // Calculate delta - float delta = (max_val - min_val) / 255.0f; - if (delta == 0) - delta = 1.0f; // Avoid division by zero + std::vector v1_orig(dim); + test_utils::populate_float_vec(v1_orig.data(), dim); + spaces::GetNormalizeFunc()(v1_orig.data(), dim); - // Quantize v2 + // Create SQ8 quantized version of v2 + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v2_quantized(quantized_size); - uint8_t *quant_values = reinterpret_cast(v2_quantized.data()); - float *params = reinterpret_cast(quant_values + dim); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); - // Quantize each value - for (size_t i = 0; i < dim; i++) { - float quantized = (v2_orig[i] - min_val) / delta; - quantized = std::max(0.0f, std::min(255.0f, quantized)); - quant_values[i] = static_cast(std::round(quantized)); - } - // Store parameters - params[0] = min_val; - params[1] = delta; + float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); - float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_quantized.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.001f) << "SQ8_Cosine failed to match expected distance"; + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr); + ASSERT_EQ(arch_opt_func, SQ8_Cosine) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { - // create a vector with extra space for the norm size_t dim = 5; // Create original vectors - float v1_orig[dim], v2_orig[dim]; - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i + 1.5); - } - - // Size: dim (uint8_t) + min_val (float) + delta (float) - size_t quantized_size = dim * sizeof(uint8_t) + 2 * sizeof(float); - spaces::GetNormalizeFunc()(v1_orig, dim); - spaces::GetNormalizeFunc()(v2_orig, dim); - // Find min and max for quantization - float min_val = v2_orig[0]; - float max_val = v2_orig[0]; - for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, v2_orig[i]); - max_val = std::max(max_val, v2_orig[i]); - } - // Calculate delta - float delta = (max_val - min_val) / 255.0f; - if (delta == 0) - delta = 1.0f; // Avoid division by zero + std::vector v1_orig(dim); + test_utils::populate_float_vec(v1_orig.data(), dim); - // Quantize v2 + // Create SQ8 quantized version of v2 + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v2_quantized(quantized_size); - uint8_t *quant_values = reinterpret_cast(v2_quantized.data()); - float *params = reinterpret_cast(quant_values + dim); - - // Quantize each value - for (size_t i = 0; i < dim; i++) { - float quantized = (v2_orig[i] - min_val) / delta; - quantized = std::max(0.0f, std::min(255.0f, quantized)); - quant_values[i] = static_cast(std::round(quantized)); - } + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); - // Store parameters - params[0] = min_val; - params[1] = delta; + float baseline = SQ8_L2Sqr(v1_orig.data(), v2_quantized.data(), dim); - float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_quantized.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.0001f) << "SQ8_L2Sqr failed to match expected distance"; + unsigned char alignment = 0; + auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, nullptr); + ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } /* ======================== Test Getters ======================== */ @@ -494,8 +458,6 @@ TEST_F(SpacesTest, GetDistFuncInvalidMetricUINT8) { (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), std::invalid_argument); } - -using namespace spaces; #ifdef CPU_FEATURES_ARCH_X86_64 TEST_F(SpacesTest, smallDimChooser) { // Verify that small dimensions gets the no optimization function. @@ -2047,69 +2009,19 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) { INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest, testing::Range(32UL, 64 * 2UL + 1)); -// Helper function to create SQ8 quantized vector with precomputed sum and norm -// Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] -std::vector CreateSQ8QuantizedVector(const float *original, size_t dim) { - // Create a copy of the original vector that we can modify - std::vector vec_copy(original, original + dim); - - // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + norm (float) - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector quantized(quantized_size); - - // Find min and max for quantization, and compute sum and square_sum of original float values - float min_val = vec_copy[0]; - float max_val = vec_copy[0]; - float sum = 0.0f; - float square_sum = 0.0f; - for (size_t i = 0; i < dim; i++) { - auto val = vec_copy[i]; - min_val = std::min(min_val, val); - max_val = std::max(max_val, val); - sum += val; - square_sum += val * val; - } - - // Calculate delta - float delta = (max_val - min_val) / 255.0f; - if (delta == 0) - delta = 1.0f; // Avoid division by zero - - // Quantize vector and compute sum and square_sum - uint8_t *quant_values = quantized.data(); - - for (size_t i = 0; i < dim; i++) { - float normalized = (vec_copy[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, normalized)); - quant_values[i] = static_cast(std::round(normalized)); - } - - // Store parameters: [min, delta, sum, square_sum] - float *params = reinterpret_cast(quant_values + dim); - params[0] = min_val; - params[1] = delta; - params[2] = sum; - params[3] = square_sum; - - return quantized; -} - class SQ8SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); - // Create original vectors std::vector v1_orig(dim); - std::vector v2_orig(dim); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i * 0.75 + 1.0); - } - + test_utils::populate_float_vec(v1_orig.data(), dim); + spaces::GetNormalizeFunc()(v1_orig.data(), dim); // Create SQ8 quantized version of v2 - std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2227,15 +2139,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Create original vectors std::vector v1_orig(dim); - std::vector v2_orig(dim); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i * 0.75 + 1.0); - } + test_utils::populate_float_vec(v1_orig.data(), dim); spaces::GetNormalizeFunc()(v1_orig.data(), dim); // Create SQ8 quantized version of v2 - std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); // print min and delta float *params = reinterpret_cast(v2_quantized.data() + dim); @@ -2348,18 +2258,14 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // Create original vectors std::vector v1_orig(dim); - std::vector v2_orig(dim); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i * 0.75 + 1.0); - } + test_utils::populate_float_vec(v1_orig.data(), dim); - // Normalize v1 spaces::GetNormalizeFunc()(v1_orig.data(), dim); - spaces::GetNormalizeFunc()(v2_orig.data(), dim); // Create SQ8 quantized version of v2 (with normalization) - std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2464,56 +2370,43 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { size_t dim = 5; - // Create original vectors - float v1_orig[dim], v2_orig[dim]; - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i + 1.5); - } - - // Normalize vectors so identical vectors have IP = 1, making distance = 1 - IP = 0 - // (Inner product doesn't require normalization, but it simplifies expected value calculation) - spaces::GetNormalizeFunc()(v1_orig, dim); - spaces::GetNormalizeFunc()(v2_orig, dim); - // Create SQ8 quantized versions of both vectors - std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig, dim); - std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig, dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); - // Get distance function with nullptr alignment to cover that code path - auto dist_func = IP_SQ8_SQ8_GetDistFunc(dim, nullptr, nullptr); - float dist = - dist_func((const void *)v1_quantized.data(), (const void *)v2_quantized.data(), dim); + float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); - // Since we're comparing identical normalized vectors, distance should be close to 0 - ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_SQ8_InnerProduct failed to match expected distance"; + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { size_t dim = 5; - // Create original vectors - float v1_orig[dim], v2_orig[dim]; - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i + 1.5); - } - - // Normalize vectors so identical vectors have cosine similarity = 1, making distance = 0 - spaces::GetNormalizeFunc()(v1_orig, dim); - spaces::GetNormalizeFunc()(v2_orig, dim); - // Create SQ8 quantized versions of both vectors - std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig, dim); - std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig, dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); - // Get distance function with nullptr alignment to cover that code path - auto dist_func = Cosine_SQ8_SQ8_GetDistFunc(dim, nullptr, nullptr); - float dist = - dist_func((const void *)v1_quantized.data(), (const void *)v2_quantized.data(), dim); + float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); - // Since we're comparing identical normalized vectors, cosine distance should be close to 0 - ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_SQ8_Cosine failed to match expected distance"; + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } class SQ8_SQ8_SpacesOptimizationTest : public testing::TestWithParam {}; @@ -2522,25 +2415,18 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); - // Create original vectors - std::vector v1_orig(dim); - std::vector v2_orig(dim); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i * 0.75 + 1.0); - } - - // Normalize both vectors - spaces::GetNormalizeFunc()(v1_orig.data(), dim); - spaces::GetNormalizeFunc()(v2_orig.data(), dim); // Create SQ8 quantized versions of both vectors - std::vector v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); - std::vector v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); dist_func_t arch_opt_func; float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); + #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; @@ -2696,61 +2582,85 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { INSTANTIATE_TEST_SUITE_P(SQ8_SQ8OptFuncs, SQ8_SQ8_SpacesOptimizationTest, testing::Range(64UL, 64 * 2UL + 1)); -#ifdef OPT_AVX512_F_BW_VL_VNNI // Test self-distance: distance to itself should be 0 for cosine (normalized vectors) TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { auto optimization = getCpuOptimizationFeatures(); - if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { - GTEST_SKIP() << "AVX512 VNNI not available"; - } - size_t dim = 128; - std::vector v_orig(dim); - - // Create a normalized vector - std::mt19937 rng(42); - std::uniform_real_distribution dist(-1.0f, 1.0f); - float norm = 0.0f; - for (size_t i = 0; i < dim; i++) { - v_orig[i] = dist(rng); - norm += v_orig[i] * v_orig[i]; - } - norm = std::sqrt(norm); - for (size_t i = 0; i < dim; i++) { - v_orig[i] /= norm; - } - auto v_quantized = CreateSQ8QuantizedVector(v_orig.data(), dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v_quantized.data(), dim); - auto cosine_func = spaces::Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); - float self_distance = cosine_func(v_quantized.data(), v_quantized.data(), dim); + float baseline = SQ8_SQ8_Cosine(v_quantized.data(), v_quantized.data(), dim); // Self-distance for cosine should be close to 0 - ASSERT_NEAR(self_distance, 0.0f, 0.02f) << "Self-distance should be ~0 for cosine"; + ASSERT_NEAR(baseline, 0.0f, 0.02f) << "Self-distance should be ~0 for cosine"; + + #ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.sve2 = 0; + } + #endif + #ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.sve = 0; + } + #endif + #ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.asimddp = 0; + } + #endif + #ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.asimd = 0; + } + #endif + #ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.avx512f = 0; + } + #endif + + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; } // Test symmetry: dist(v1, v2) == dist(v2, v1) TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { - auto optimization = getCpuOptimizationFeatures(); - if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { - GTEST_SKIP() << "AVX512 VNNI not available"; - } - size_t dim = 128; - std::vector v1_orig(dim), v2_orig(dim); - - std::mt19937 rng(123); - std::uniform_real_distribution dist(-1.0f, 1.0f); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = dist(rng); - v2_orig[i] = dist(rng); - } - auto v1_quantized = CreateSQ8QuantizedVector(v1_orig.data(), dim); - auto v2_quantized = CreateSQ8QuantizedVector(v2_orig.data(), dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim, 123, -1.0f, 1.0f); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim, 123, -1.0f, 1.0f); - auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - auto cosine_func = spaces::Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + unsigned char alignment = 0; + auto ip_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + auto cosine_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); float ip_12 = ip_func(v1_quantized.data(), v2_quantized.data(), dim); float ip_21 = ip_func(v2_quantized.data(), v1_quantized.data(), dim); @@ -2764,28 +2674,65 @@ TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { // Test with zero vector TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { auto optimization = getCpuOptimizationFeatures(); - if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { - GTEST_SKIP() << "AVX512 VNNI not available"; - } - size_t dim = 128; std::vector v_zero(dim, 0.0f); - std::vector v_nonzero(dim); - - std::mt19937 rng(456); - std::uniform_real_distribution dist(-1.0f, 1.0f); - for (size_t i = 0; i < dim; i++) { - v_nonzero[i] = dist(rng); - } - auto v_zero_quantized = CreateSQ8QuantizedVector(v_zero.data(), dim); - auto v_nonzero_quantized = CreateSQ8QuantizedVector(v_nonzero.data(), dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v_zero_quantized(quantized_size); + std::vector v_nonzero_quantized(quantized_size); + test_utils::quantize_float_vec_to_uint8_with_sum_norm(v_zero.data(), dim, v_zero_quantized.data()); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v_nonzero_quantized.data(), dim); - // Compute baseline using fallback function float baseline = SQ8_SQ8_InnerProduct(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - float result = ip_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + #ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.sve2 = 0; + } + #endif + #ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.sve = 0; + } + #endif + #ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.asimddp = 0; + } + #endif + #ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.asimd = 0; + } + #endif + #ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.avx512f = 0; + } + #endif + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Zero vector IP should match baseline"; } @@ -2793,39 +2740,71 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { // Test with constant vector (all same values) TEST(SQ8_SQ8_EdgeCases, ConstantVectorTest) { auto optimization = getCpuOptimizationFeatures(); - if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { - GTEST_SKIP() << "AVX512 VNNI not available"; - } - size_t dim = 128; std::vector v_const(dim, 0.5f); - std::vector v_random(dim); - std::mt19937 rng(789); - std::uniform_real_distribution dist(-1.0f, 1.0f); - for (size_t i = 0; i < dim; i++) { - v_random[i] = dist(rng); - } - - auto v_const_quantized = CreateSQ8QuantizedVector(v_const.data(), dim); - auto v_random_quantized = CreateSQ8QuantizedVector(v_random.data(), dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v_const_quantized(quantized_size); + std::vector v_random_quantized(quantized_size); + test_utils::quantize_float_vec_to_uint8_with_sum_norm(v_const.data(), dim, v_const_quantized.data()); + test_utils::populate_float_vec_to_sq8_with_sum_norm(v_random_quantized.data(), dim); - // Compute baseline using fallback function float baseline = SQ8_SQ8_InnerProduct(v_const_quantized.data(), v_random_quantized.data(), dim); + #ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + optimization.sve2 = 0; + } + #endif + #ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + optimization.sve = 0; + } + #endif + #ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + optimization.asimddp = 0; + } + #endif + #ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + optimization.asimd = 0; + } + #endif + #ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + optimization.avx512f = 0; + } + #endif - auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - float result = ip_func(v_const_quantized.data(), v_random_quantized.data(), dim); + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); ASSERT_NEAR(result, baseline, 0.01f) << "Constant vector IP should match baseline"; } // Test with extreme values (-1 and 1 only) TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { - auto optimization = getCpuOptimizationFeatures(); - if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { - GTEST_SKIP() << "AVX512 VNNI not available"; - } - size_t dim = 128; std::vector v1(dim), v2(dim); @@ -2835,54 +2814,68 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { v2[i] = (i % 3 == 0) ? 1.0f : -1.0f; } - auto v1_quantized = CreateSQ8QuantizedVector(v1.data(), dim); - auto v2_quantized = CreateSQ8QuantizedVector(v2.data(), dim); + spaces::GetNormalizeFunc()(v1.data(), dim); + spaces::GetNormalizeFunc()(v2.data(), dim); - // Compute baseline using fallback function - float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); - - auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - float result = ip_func(v1_quantized.data(), v2_quantized.data(), dim); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::quantize_float_vec_to_uint8_with_sum_norm(v1.data(), dim, v1_quantized.data()); + test_utils::quantize_float_vec_to_uint8_with_sum_norm(v2.data(), dim, v2_quantized.data()); - ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values IP should match baseline"; -} + float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_FALSE(std::isnan(baseline)) << "Extreme values IP should not produce NaN"; -// Test accuracy across multiple random vector pairs -TEST(SQ8_SQ8_EdgeCases, AccuracyStressTest) { auto optimization = getCpuOptimizationFeatures(); - if (!(optimization.avx512f && optimization.avx512bw && optimization.avx512vnni)) { - GTEST_SKIP() << "AVX512 VNNI not available"; + #ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; + optimization.sve2 = 0; } - - size_t dim = 256; - const int num_iterations = 100; - std::mt19937 rng(999); - std::uniform_real_distribution dist(-10.0f, 10.0f); - - auto ip_func = spaces::Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - - float max_error = 0.0f; - for (int iter = 0; iter < num_iterations; iter++) { - std::vector v1(dim), v2(dim); - for (size_t i = 0; i < dim; i++) { - v1[i] = dist(rng); - v2[i] = dist(rng); - } - - auto v1_quantized = CreateSQ8QuantizedVector(v1.data(), dim); - auto v2_quantized = CreateSQ8QuantizedVector(v2.data(), dim); - - // Compute baseline using fallback function - float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); - float result = ip_func(v1_quantized.data(), v2_quantized.data(), dim); - - float error = std::abs(result - baseline); - max_error = std::max(max_error, error); - - ASSERT_NEAR(result, baseline, 0.01f) << "Iteration " << iter << " failed"; + #endif + #ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; + optimization.sve = 0; + } + #endif + #ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; + optimization.asimddp = 0; } + #endif + #ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; + optimization.asimd = 0; + } + #endif + #ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; + optimization.avx512f = 0; + } + #endif - // Log max error for informational purposes - ASSERT_LT(max_error, 0.01f) << "Max error across all iterations: " << max_error; + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + + ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values IP should match baseline"; } -#endif diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 4bc1d53aa..3bd6ad1c7 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -107,8 +107,7 @@ static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] * where sum = Σv[i] and norm = Σv[i]² (sum of squares of uint8 elements) */ -static void quantize_float_vec_to_uint8_with_sum_norm(const float *v, size_t dim, uint8_t *qv, - int seed = 1234) { +static void quantize_float_vec_to_uint8_with_sum_norm(const float *v, size_t dim, uint8_t *qv) { float min_val = v[0]; float max_val = v[0]; for (size_t i = 1; i < dim; i++) { @@ -148,12 +147,13 @@ static void quantize_float_vec_to_uint8_with_sum_norm(const float *v, size_t dim * Populate a float vector and quantize to SQ8 with precomputed sum and norm. * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] */ -static void populate_float_vec_to_sq8_with_sum_norm(uint8_t *v, size_t dim, int seed = 1234) { +static void populate_float_vec_to_sq8_with_sum_norm(uint8_t *v, size_t dim, int seed = 1234, float min = -1.0f, + float max = 1.0f) { std::vector vec(dim); - populate_float_vec(vec.data(), dim, seed); + populate_float_vec(vec.data(), dim, seed, min, max); // Normalize vector spaces::GetNormalizeFunc()(vec.data(), dim); - quantize_float_vec_to_uint8_with_sum_norm(vec.data(), dim, v, seed); + quantize_float_vec_to_uint8_with_sum_norm(vec.data(), dim, v); } template From d510b8ae979725f7bae7b444119585bcea24330c Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 30 Dec 2025 17:48:03 +0200 Subject: [PATCH 24/51] Refactor SQ8 benchmarks by removing precomputed variants and updating vector population methods --- .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 103 ++++++------------ 1 file changed, 34 insertions(+), 69 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index 827de7323..2eb2149bd 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -28,35 +28,6 @@ class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { // Allocate both vectors with extra space for min and delta v1 = new uint8_t[dim + sizeof(float) * 2]; v2 = new uint8_t[dim + sizeof(float) * 2]; - test_utils::populate_float_vec_to_sq8(v1, dim, 123); - test_utils::populate_float_vec_to_sq8(v2, dim, 1234); - } - void TearDown(const ::benchmark::State &state) { - delete[] v1; - delete[] v2; - } -}; - -/** - * SQ8-to-SQ8 Precomputed benchmarks: Same as above but with precomputed sum and norm. - * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] - */ -class BM_VecSimSpaces_SQ8_SQ8_Precomputed : public benchmark::Fixture { -protected: - std::mt19937 rng; - size_t dim; - uint8_t *v1; - uint8_t *v2; - -public: - BM_VecSimSpaces_SQ8_SQ8_Precomputed() { rng.seed(47); } - ~BM_VecSimSpaces_SQ8_SQ8_Precomputed() = default; - - void SetUp(const ::benchmark::State &state) { - dim = state.range(0); - // Allocate both vectors with extra space for min, delta, sum, and norm (4 floats) - v1 = new uint8_t[dim + sizeof(float) * 4]; - v2 = new uint8_t[dim + sizeof(float) * 4]; test_utils::populate_float_vec_to_sq8_with_sum_norm(v1, dim, 123); test_utils::populate_float_vec_to_sq8_with_sum_norm(v2, dim, 1234); } @@ -66,53 +37,47 @@ class BM_VecSimSpaces_SQ8_SQ8_Precomputed : public benchmark::Fixture { } }; -// #ifdef CPU_FEATURES_ARCH_AARCH64 -// cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; +#ifdef CPU_FEATURES_ARCH_AARCH64 +cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; -// // NEON SQ8-to-SQ8 functions -// #ifdef OPT_NEON -// bool neon_supported = opt.asimd; -// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); -// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); -// #endif // NEON -// // SVE SQ8-to-SQ8 functions -// #ifdef OPT_SVE -// bool sve_supported = opt.sve; -// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); -// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); -// #endif // SVE -// // SVE2 SQ8-to-SQ8 functions -// #ifdef OPT_SVE2 -// bool sve2_supported = opt.sve2; -// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); -// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); -// #endif // SVE2 -// #endif // AARCH64 +// NEON SQ8-to-SQ8 functions +#ifdef OPT_NEON +bool neon_supported = opt.asimd; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); +#endif // NEON +// SVE SQ8-to-SQ8 functions +#ifdef OPT_SVE +bool sve_supported = opt.sve; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +#endif // SVE +// SVE2 SQ8-to-SQ8 functions +#ifdef OPT_SVE2 +bool sve2_supported = opt.sve2; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); +#endif // SVE2 +#endif // AARCH64 -// #ifdef CPU_FEATURES_ARCH_X86_64 -// cpu_features::X86Features opt = cpu_features::GetX86Info().features; +#ifdef CPU_FEATURES_ARCH_X86_64 +cpu_features::X86Features opt = cpu_features::GetX86Info().features; -// // AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions -// #ifdef OPT_AVX512_F_BW_VL_VNNI -// bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && -// opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, -// AVX512F_BW_VL_VNNI, 64, -// avx512_f_bw_vl_vnni_supported); -// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, -// avx512_f_bw_vl_vnni_supported); +// AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions +#ifdef OPT_AVX512_F_BW_VL_VNNI +bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && +opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, +AVX512F_BW_VL_VNNI, 64, + avx512_f_bw_vl_vnni_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, + avx512_f_bw_vl_vnni_supported); -// // AVX512_F_BW_VL_VNNI SQ8-to-SQ8 Precomputed functions (using precomputed sum and norm) -// INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8_Precomputed, -// AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); -// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8_Precomputed, -// AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); -// #endif // AVX512_F_BW_VL_VNNI -// #endif // x86_64 +#endif // AVX512_F_BW_VL_VNNI +#endif // x86_64 // Naive SQ8-to-SQ8 algorithms INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, InnerProduct, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, Cosine, 16); -INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8, InnerProduct, 16); -INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8_Precomputed, SQ8_SQ8, Cosine, 16); + BENCHMARK_MAIN(); From ee2674080615577e1c1b83f1479673327b6181de Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 30 Dec 2025 18:16:09 +0200 Subject: [PATCH 25/51] foramt --- .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 6 +- tests/unit/test_spaces.cpp | 115 +++++++++--------- tests/utils/tests_utils.h | 6 +- 3 files changed, 65 insertions(+), 62 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index 2eb2149bd..c8a6b254b 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -65,9 +65,8 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions #ifdef OPT_AVX512_F_BW_VL_VNNI -bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && -opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, -AVX512F_BW_VL_VNNI, 64, +bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, avx512_f_bw_vl_vnni_supported); @@ -79,5 +78,4 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, InnerProduct, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, Cosine, 16); - BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 5f501810a..3810ebcf6 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -357,11 +357,13 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { quant_values[i] = static_cast(std::round(normalized)); } - float baseline = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_quantized.data(), dim); + float baseline = + SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_quantized.data(), dim); // Since we're comparing identical vectors, the inner product distance should be close to // expected - ASSERT_NEAR(baseline, expected_dist, 0.01) << "SQ8_InnerProduct failed to match expected distance"; + ASSERT_NEAR(baseline, expected_dist, 0.01) + << "SQ8_InnerProduct failed to match expected distance"; unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, nullptr); @@ -397,8 +399,7 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { unsigned char alignment = 0; auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr); - ASSERT_EQ(arch_opt_func, SQ8_Cosine) - << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; @@ -419,8 +420,7 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { unsigned char alignment = 0; auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, nullptr); - ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) - << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; @@ -2415,7 +2415,6 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); - // Create SQ8 quantized versions of both vectors size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); @@ -2426,7 +2425,6 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { dist_func_t arch_opt_func; float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); - #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; @@ -2596,7 +2594,7 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { // Self-distance for cosine should be close to 0 ASSERT_NEAR(baseline, 0.0f, 0.02f) << "Self-distance should be ~0 for cosine"; - #ifdef OPT_SVE2 +#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2604,8 +2602,8 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2613,8 +2611,8 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.sve = 0; } - #endif - #ifdef OPT_NEON_DOTPROD +#endif +#ifdef OPT_NEON_DOTPROD if (optimization.asimddp) { unsigned char alignment = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2622,8 +2620,8 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.asimddp = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2631,8 +2629,8 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.asimd = 0; } - #endif - #ifdef OPT_AVX512_F_BW_VL_VNNI +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2640,7 +2638,7 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; optimization.avx512f = 0; } - #endif +#endif unsigned char alignment = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2680,12 +2678,13 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_zero_quantized(quantized_size); std::vector v_nonzero_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum_norm(v_zero.data(), dim, v_zero_quantized.data()); + test_utils::quantize_float_vec_to_uint8_with_sum_norm(v_zero.data(), dim, + v_zero_quantized.data()); test_utils::populate_float_vec_to_sq8_with_sum_norm(v_nonzero_quantized.data(), dim); float baseline = SQ8_SQ8_InnerProduct(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - #ifdef OPT_SVE2 +#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2693,8 +2692,8 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2702,8 +2701,8 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; optimization.sve = 0; } - #endif - #ifdef OPT_NEON_DOTPROD +#endif +#ifdef OPT_NEON_DOTPROD if (optimization.asimddp) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2711,8 +2710,8 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; optimization.asimddp = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2720,8 +2719,8 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; optimization.asimd = 0; } - #endif - #ifdef OPT_AVX512_F_BW_VL_VNNI +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2729,7 +2728,7 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; optimization.avx512f = 0; } - #endif +#endif unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); @@ -2746,55 +2745,61 @@ TEST(SQ8_SQ8_EdgeCases, ConstantVectorTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_const_quantized(quantized_size); std::vector v_random_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum_norm(v_const.data(), dim, v_const_quantized.data()); + test_utils::quantize_float_vec_to_uint8_with_sum_norm(v_const.data(), dim, + v_const_quantized.data()); test_utils::populate_float_vec_to_sq8_with_sum_norm(v_random_quantized.data(), dim); float baseline = SQ8_SQ8_InnerProduct(v_const_quantized.data(), v_random_quantized.data(), dim); - #ifdef OPT_SVE2 +#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector IP should match baseline"; optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector IP should match baseline"; optimization.sve = 0; } - #endif - #ifdef OPT_NEON_DOTPROD +#endif +#ifdef OPT_NEON_DOTPROD if (optimization.asimddp) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector IP should match baseline"; optimization.asimddp = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector IP should match baseline"; optimization.asimd = 0; } - #endif - #ifdef OPT_AVX512_F_BW_VL_VNNI +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized constant vector IP should match baseline"; + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector IP should match baseline"; optimization.avx512f = 0; } - #endif +#endif unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); @@ -2827,7 +2832,7 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { ASSERT_FALSE(std::isnan(baseline)) << "Extreme values IP should not produce NaN"; auto optimization = getCpuOptimizationFeatures(); - #ifdef OPT_SVE2 +#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2835,8 +2840,8 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2844,8 +2849,8 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; optimization.sve = 0; } - #endif - #ifdef OPT_NEON_DOTPROD +#endif +#ifdef OPT_NEON_DOTPROD if (optimization.asimddp) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2853,8 +2858,8 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; optimization.asimddp = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2862,8 +2867,8 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; optimization.asimd = 0; } - #endif - #ifdef OPT_AVX512_F_BW_VL_VNNI +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2871,7 +2876,7 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; optimization.avx512f = 0; } - #endif +#endif unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 3bd6ad1c7..dd34d2312 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -121,7 +121,7 @@ static void quantize_float_vec_to_uint8_with_sum_norm(const float *v, size_t dim sum += v[i]; square_sum += v[i] * v[i]; } - + // Calculate delta float delta = (max_val - min_val) / 255.0f; if (delta == 0) @@ -147,8 +147,8 @@ static void quantize_float_vec_to_uint8_with_sum_norm(const float *v, size_t dim * Populate a float vector and quantize to SQ8 with precomputed sum and norm. * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] */ -static void populate_float_vec_to_sq8_with_sum_norm(uint8_t *v, size_t dim, int seed = 1234, float min = -1.0f, - float max = 1.0f) { +static void populate_float_vec_to_sq8_with_sum_norm(uint8_t *v, size_t dim, int seed = 1234, + float min = -1.0f, float max = 1.0f) { std::vector vec(dim); populate_float_vec(vec.data(), dim, seed, min, max); // Normalize vector From afe1a4fee9573ba31e749485572f019117121e27 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 30 Dec 2025 18:19:26 +0200 Subject: [PATCH 26/51] Remove serialization benchmark script for HNSW disk serialization --- .../scripts/run_serialization_benchmarks.sh | 102 ------------------ 1 file changed, 102 deletions(-) delete mode 100755 tests/benchmark/scripts/run_serialization_benchmarks.sh diff --git a/tests/benchmark/scripts/run_serialization_benchmarks.sh b/tests/benchmark/scripts/run_serialization_benchmarks.sh deleted file mode 100755 index 087bc6f98..000000000 --- a/tests/benchmark/scripts/run_serialization_benchmarks.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash -# Serialization Benchmark Script -# Runs HNSW disk serialization benchmarks for different dataset sizes and thread counts - -set -e - -# Configuration -SERIALIZER="./bin/Linux-x86_64-release/hnsw_disk_serializer/hnsw_disk_serializer" -DATA_DIR="tests/benchmark/data" -OUTPUT_DIR="tests/benchmark/data/serialization_benchmarks" -RESULTS_FILE="$OUTPUT_DIR/results.csv" - -# Dataset parameters -DIM=96 -METRIC="L2" -DATA_TYPE="FLOAT32" -M=32 -EFC=200 -BATCH_SIZE=1000 - -# Datasets and thread counts -DATASETS=("100K") -THREADS=(4 8) - -# Get branch name -BRANCH=$(git rev-parse --abbrev-ref HEAD) -BRANCH_SAFE=$(echo "$BRANCH" | tr '/' '-') - -mkdir -p "$OUTPUT_DIR" -# Build -make -j8 -# Initialize results file if it doesn't exist -if [ ! -f "$RESULTS_FILE" ]; then - echo "branch,dataset,threads,time_seconds,vectors_per_second" > "$RESULTS_FILE" -fi - -echo "==========================================" -echo "Serialization Benchmark" -echo "Branch: $BRANCH" -echo "==========================================" - -for dataset in "${DATASETS[@]}"; do - INPUT_FILE="$DATA_DIR/deep.base.${dataset}.fbin" - - if [ ! -f "$INPUT_FILE" ]; then - echo "ERROR: Input file not found: $INPUT_FILE" - continue - fi - - for threads in "${THREADS[@]}"; do - OUTPUT_NAME="$OUTPUT_DIR/deep-${dataset}-L2-dim${DIM}-M${M}-efc${EFC}-${BRANCH_SAFE}-${threads}t" - - echo "" - echo "----------------------------------------" - echo "Dataset: $dataset, Threads: $threads" - echo "Output: $OUTPUT_NAME" - echo "----------------------------------------" - - # Remove existing output if present - rm -rf "$OUTPUT_NAME" - - # Run benchmark and capture time - START_TIME=$(date +%s.%N) - - "$SERIALIZER" \ - "$INPUT_FILE" \ - "$OUTPUT_NAME" \ - "$DIM" "$METRIC" "$DATA_TYPE" \ - "$M" "$EFC" "$threads" "$BATCH_SIZE" - - END_TIME=$(date +%s.%N) - ELAPSED=$(echo "$END_TIME - $START_TIME" | bc) - - # Calculate vectors per second - if [ "$dataset" = "100K" ]; then - NUM_VECTORS=100000 - elif [ "$dataset" = "1M" ]; then - NUM_VECTORS=1000000 - fi - - VPS=$(echo "scale=2; $NUM_VECTORS / $ELAPSED" | bc) - - echo "" - echo "Time: ${ELAPSED}s" - echo "Vectors/sec: $VPS" - - # Append to results - echo "$BRANCH,$dataset,$threads,$ELAPSED,$VPS" >> "$RESULTS_FILE" - done -done - -echo "" -echo "==========================================" -echo "Benchmark Complete" -echo "Results saved to: $RESULTS_FILE" -echo "==========================================" - -# Display results -echo "" -echo "Results:" -cat "$RESULTS_FILE" - From a31f95c445e27f9cf5ba3153eb2901bc5f243ed1 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Wed, 31 Dec 2025 16:54:07 +0200 Subject: [PATCH 27/51] Refactor SQ8 distance functions and tests to remove precomputed norm references --- src/VecSim/spaces/IP/IP.cpp | 37 ++-- src/VecSim/spaces/IP/IP.h | 8 +- src/VecSim/spaces/IP_space.cpp | 4 +- .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 4 +- tests/unit/test_spaces.cpp | 180 ++++-------------- tests/utils/tests_utils.h | 6 +- 6 files changed, 63 insertions(+), 176 deletions(-) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index b4b0a7cb6..d1d4e7578 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -49,13 +49,19 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { return 1.0f - res; } -// SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum/norm -// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm -// (float)] +// SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); + // Compute inner product of quantized values: Σ(q1[i]*q2[i]) + float product = 0; + for (size_t i = 0; i < dimension; i++) { + product += pVect1[i] * pVect2[i]; + } + + // Extract metadata from the end of vectors (likely already prefetched) // Get quantization parameters from pVect1 const float min_val1 = *reinterpret_cast(pVect1 + dimension); const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); @@ -66,25 +72,26 @@ float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dime const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); const float sum2 = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - // Compute inner product with dequantization of both vectors - // With sum = Σv[i] (sum of original float values), the formula is: + // Apply the algebraic formula using precomputed sums: // IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2 - float product = 0; - for (size_t i = 0; i < dimension; i++) { - product += pVect1[i] * pVect2[i]; - } float res = min_val1 * sum2 + min_val2 * sum1 - static_cast(dimension) * min_val1 * min_val2 + delta1 * delta2 * product; return 1.0f - res; } // SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum/norm -// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm -// (float)] +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); + // Compute inner product of quantized values: Σ(q1[i]*q2[i]) + float product = 0; + for (size_t i = 0; i < dimension; i++) { + product += pVect1[i] * pVect2[i]; + } + + // Extract metadata from the end of vectors // Get quantization parameters from pVect1 const float min_val1 = *reinterpret_cast(pVect1 + dimension); const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); @@ -95,14 +102,8 @@ float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); const float sum2 = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - // Compute inner product with dequantization of both vectors - // With sum = Σv[i] (sum of original float values), the formula is: + // Apply the algebraic formula using precomputed sums: // IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2 - float product = 0; - for (size_t i = 0; i < dimension; i++) { - product += pVect1[i] * pVect2[i]; - } - float res = min_val1 * sum2 + min_val2 * sum1 - static_cast(dimension) * min_val1 * min_val2 + delta1 * delta2 * product; return 1.0f - res; diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 3e706a5dc..3b0a4aaac 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -17,13 +17,11 @@ float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensio float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); // SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum/norm -// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm -// (float)] +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); -// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum/norm -// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [norm -// (float)] +// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index e6dfbe8e0..7ad23d2a6 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -151,7 +151,7 @@ dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, } // SQ8-to-SQ8 Inner Product distance function (both vectors are uint8 quantized with precomputed -// sum/norm) +// sum) dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; @@ -190,7 +190,7 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, return ret_dist_func; } -// SQ8-to-SQ8 Cosine distance function (both vectors are uint8 quantized with precomputed sum/norm) +// SQ8-to-SQ8 Cosine distance function (both vectors are uint8 quantized with precomputed sum) dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index c8a6b254b..61f8a0ac7 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -28,8 +28,8 @@ class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { // Allocate both vectors with extra space for min and delta v1 = new uint8_t[dim + sizeof(float) * 2]; v2 = new uint8_t[dim + sizeof(float) * 2]; - test_utils::populate_float_vec_to_sq8_with_sum_norm(v1, dim, 123); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2, dim, 1234); + test_utils::populate_float_vec_to_sq8_with_sum(v1, dim, 123); + test_utils::populate_float_vec_to_sq8_with_sum(v2, dim, 1234); } void TearDown(const ::benchmark::State &state) { delete[] v1; diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 3810ebcf6..16f050188 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -352,9 +352,9 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { // Quantize each value for (size_t i = 0; i < dim; i++) { - float normalized = (v2_orig[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, normalized)); - quant_values[i] = static_cast(std::round(normalized)); + float quantized = (v2_orig[i] - min_val) / delta; + quantized = std::max(0.0f, std::min(255.0f, quantized)); + quant_values[i] = static_cast(std::round(quantized)); } float baseline = @@ -393,7 +393,7 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { // Create SQ8 quantized version of v2 size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); @@ -414,14 +414,14 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // Create SQ8 quantized version of v2 size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); float baseline = SQ8_L2Sqr(v1_orig.data(), v2_quantized.data(), dim); unsigned char alignment = 0; auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, nullptr); ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) + ASSERT_EQ(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2016,12 +2016,11 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { size_t dim = GetParam(); // Create original vectors std::vector v1_orig(dim); - test_utils::populate_float_vec(v1_orig.data(), dim); - spaces::GetNormalizeFunc()(v1_orig.data(), dim); + test_utils::populate_float_vec(v1_orig.data(), dim, 1234); // Create SQ8 quantized version of v2 size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim, 5678); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2128,120 +2127,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) - << "No optimization with dim " << dim; - ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; -} - -TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { - auto optimization = getCpuOptimizationFeatures(); - size_t dim = GetParam(); - - // Create original vectors - std::vector v1_orig(dim); - test_utils::populate_float_vec(v1_orig.data(), dim); - spaces::GetNormalizeFunc()(v1_orig.data(), dim); - - // Create SQ8 quantized version of v2 - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); - // print min and delta - float *params = reinterpret_cast(v2_quantized.data() + dim); - - auto expected_alignment = [](size_t reg_bit_size, size_t dim) { - size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; - return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0; - }; - - dist_func_t arch_opt_func; - float baseline = SQ8_InnerProduct(v1_orig.data(), v2_quantized.data(), dim); - -// Test different optimizations based on CPU features -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { - unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) - << "AVX512 with dim " << dim; - optimization.avx512f = 0; - } -#endif -#ifdef OPT_AVX2_FMA - if (optimization.avx2 && optimization.fma3) { - unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2_FMA(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) - << "AVX with dim " << dim; - optimization.fma3 = 0; - } -#endif -#ifdef OPT_AVX2 - if (optimization.avx2) { - unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) - << "AVX with dim " << dim; - optimization.avx2 = 0; - } -#endif -#ifdef OPT_SSE - if (optimization.sse4_1) { - unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SSE4(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) - << "SSE with dim " << dim; - optimization.sse4_1 = 0; - } -#endif -#ifdef OPT_SVE2 - if (optimization.sve2) { - unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) - << "SVE2 with dim " << dim; - optimization.sve2 = 0; - } -#endif -#ifdef OPT_SVE - if (optimization.sve) { - unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) - << "SVE with dim " << dim; - optimization.sve = 0; - } -#endif -#ifdef OPT_NEON - if (optimization.asimd) { - unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_NEON(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) - << "NEON with dim " << dim; - optimization.asimd = 0; - } -#endif - - // Test default implementation - unsigned char alignment = 0; - arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_EQ(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2265,7 +2151,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // Create SQ8 quantized version of v2 (with normalization) size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2360,7 +2246,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_EQ(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2374,8 +2260,8 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); @@ -2395,8 +2281,8 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); @@ -2419,8 +2305,8 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim, 1234); + test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim, 5678); dist_func_t arch_opt_func; float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); @@ -2487,7 +2373,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + ASSERT_EQ(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2501,8 +2387,8 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim, 1234); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim, 5678); + test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim, 1234); + test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim, 5678); dist_func_t arch_opt_func; float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); @@ -2569,7 +2455,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + ASSERT_EQ(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2587,7 +2473,7 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v_quantized.data(), dim); float baseline = SQ8_SQ8_Cosine(v_quantized.data(), v_quantized.data(), dim); @@ -2642,8 +2528,10 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { unsigned char alignment = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + ASSERT_EQ(baseline, arch_opt_func(v_quantized.data(), v_quantized.data(), dim)) + << "No optimization self-distance should match baseline"; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + } // Test symmetry: dist(v1, v2) == dist(v2, v1) @@ -2653,8 +2541,8 @@ TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v1_quantized.data(), dim, 123, -1.0f, 1.0f); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v2_quantized.data(), dim, 123, -1.0f, 1.0f); + test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim, 123, -1.0f, 1.0f); + test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim, 123, -1.0f, 1.0f); unsigned char alignment = 0; auto ip_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); @@ -2678,9 +2566,9 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_zero_quantized(quantized_size); std::vector v_nonzero_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum_norm(v_zero.data(), dim, + test_utils::quantize_float_vec_to_uint8_with_sum(v_zero.data(), dim, v_zero_quantized.data()); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v_nonzero_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v_nonzero_quantized.data(), dim); float baseline = SQ8_SQ8_InnerProduct(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); @@ -2745,9 +2633,9 @@ TEST(SQ8_SQ8_EdgeCases, ConstantVectorTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_const_quantized(quantized_size); std::vector v_random_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum_norm(v_const.data(), dim, + test_utils::quantize_float_vec_to_uint8_with_sum(v_const.data(), dim, v_const_quantized.data()); - test_utils::populate_float_vec_to_sq8_with_sum_norm(v_random_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_sum(v_random_quantized.data(), dim); float baseline = SQ8_SQ8_InnerProduct(v_const_quantized.data(), v_random_quantized.data(), dim); #ifdef OPT_SVE2 @@ -2825,8 +2713,8 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum_norm(v1.data(), dim, v1_quantized.data()); - test_utils::quantize_float_vec_to_uint8_with_sum_norm(v2.data(), dim, v2_quantized.data()); + test_utils::quantize_float_vec_to_uint8_with_sum(v1.data(), dim, v1_quantized.data()); + test_utils::quantize_float_vec_to_uint8_with_sum(v2.data(), dim, v2_quantized.data()); float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); ASSERT_FALSE(std::isnan(baseline)) << "Extreme values IP should not produce NaN"; diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index dd34d2312..ef166ea50 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -107,7 +107,7 @@ static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] * where sum = Σv[i] and norm = Σv[i]² (sum of squares of uint8 elements) */ -static void quantize_float_vec_to_uint8_with_sum_norm(const float *v, size_t dim, uint8_t *qv) { +static void quantize_float_vec_to_uint8_with_sum(const float *v, size_t dim, uint8_t *qv) { float min_val = v[0]; float max_val = v[0]; for (size_t i = 1; i < dim; i++) { @@ -147,13 +147,13 @@ static void quantize_float_vec_to_uint8_with_sum_norm(const float *v, size_t dim * Populate a float vector and quantize to SQ8 with precomputed sum and norm. * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] */ -static void populate_float_vec_to_sq8_with_sum_norm(uint8_t *v, size_t dim, int seed = 1234, +static void populate_float_vec_to_sq8_with_sum(uint8_t *v, size_t dim, int seed = 1234, float min = -1.0f, float max = 1.0f) { std::vector vec(dim); populate_float_vec(vec.data(), dim, seed, min, max); // Normalize vector spaces::GetNormalizeFunc()(vec.data(), dim); - quantize_float_vec_to_uint8_with_sum_norm(vec.data(), dim, v); + quantize_float_vec_to_uint8_with_sum(vec.data(), dim, v); } template From f12ecf4849a9adf5b682074df83ffc0ebd31a33a Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Wed, 31 Dec 2025 17:12:20 +0200 Subject: [PATCH 28/51] format --- tests/unit/test_spaces.cpp | 7 ++----- tests/utils/tests_utils.h | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 16f050188..5de4a11a7 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2531,7 +2531,6 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { ASSERT_EQ(baseline, arch_opt_func(v_quantized.data(), v_quantized.data(), dim)) << "No optimization self-distance should match baseline"; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; - } // Test symmetry: dist(v1, v2) == dist(v2, v1) @@ -2566,8 +2565,7 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_zero_quantized(quantized_size); std::vector v_nonzero_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum(v_zero.data(), dim, - v_zero_quantized.data()); + test_utils::quantize_float_vec_to_uint8_with_sum(v_zero.data(), dim, v_zero_quantized.data()); test_utils::populate_float_vec_to_sq8_with_sum(v_nonzero_quantized.data(), dim); float baseline = SQ8_SQ8_InnerProduct(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); @@ -2633,8 +2631,7 @@ TEST(SQ8_SQ8_EdgeCases, ConstantVectorTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_const_quantized(quantized_size); std::vector v_random_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum(v_const.data(), dim, - v_const_quantized.data()); + test_utils::quantize_float_vec_to_uint8_with_sum(v_const.data(), dim, v_const_quantized.data()); test_utils::populate_float_vec_to_sq8_with_sum(v_random_quantized.data(), dim); float baseline = SQ8_SQ8_InnerProduct(v_const_quantized.data(), v_random_quantized.data(), dim); diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index ef166ea50..95cd30e44 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -148,7 +148,7 @@ static void quantize_float_vec_to_uint8_with_sum(const float *v, size_t dim, uin * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] */ static void populate_float_vec_to_sq8_with_sum(uint8_t *v, size_t dim, int seed = 1234, - float min = -1.0f, float max = 1.0f) { + float min = -1.0f, float max = 1.0f) { std::vector vec(dim); populate_float_vec(vec.data(), dim, seed, min, max); // Normalize vector From fdc16c6e20e77480d897921b4655f8f4bd5ce13a Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Wed, 31 Dec 2025 18:25:02 +0200 Subject: [PATCH 29/51] Refactor SQ8 distance tests to use compressed vectors and improve normalization calculations --- tests/unit/test_spaces.cpp | 139 ++++++++++++++++++++++++++++--------- 1 file changed, 107 insertions(+), 32 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 5de4a11a7..744c0f982 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -405,25 +405,57 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { + // create a vector with extra space for the norm size_t dim = 5; // Create original vectors - std::vector v1_orig(dim); - test_utils::populate_float_vec(v1_orig.data(), dim); + float v1_orig[dim], v2_orig[dim]; + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i + 1.5); + } - // Create SQ8 quantized version of v2 - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); + // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) + size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + spaces::GetNormalizeFunc()(v1_orig, dim); + spaces::GetNormalizeFunc()(v2_orig, dim); + // Find min and max for quantization + float min_val = v2_orig[0]; + float max_val = v2_orig[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, v2_orig[i]); + max_val = std::max(max_val, v2_orig[i]); + } + // Calculate delta and inverse norm + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero - float baseline = SQ8_L2Sqr(v1_orig.data(), v2_quantized.data(), dim); + // Compress v2 + std::vector v2_compressed(compressed_size); + uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + float *params = reinterpret_cast(quant_values + dim); - unsigned char alignment = 0; - auto arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, nullptr); - ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim)) - << "No optimization with dim " << dim; - ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (v2_orig[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + } + // Calculate inverse norm from decompressed values + float inv_norm = 0.0f; + for (size_t i = 0; i < dim; i++) { + float decompressed_value = min_val + quant_values[i] * delta; + inv_norm += decompressed_value * decompressed_value; + } + inv_norm = 1.0f / std::sqrt(inv_norm); + // Store parameters + params[0] = min_val; + params[1] = delta; + params[2] = inv_norm; + + float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim); + ASSERT_NEAR(dist, 0.0f, 0.00001f) << "SQ8_Cosine failed to match expected distance"; } /* ======================== Test Getters ======================== */ @@ -2009,18 +2041,65 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) { INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest, testing::Range(32UL, 64 * 2UL + 1)); +// Helper function to create SQ8 compressed vector +std::vector CreateSQ8CompressedVector(const float *original, size_t dim) { + // Create a copy of the original vector that we can modify + std::vector vec_copy(original, original + dim); + + // Size: dim (uint8_t) + min_val (float) + delta (float) + norm (float) + size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + std::vector compressed(compressed_size); + + // Find min and max for quantization + float min_val = vec_copy[0]; + float max_val = vec_copy[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, vec_copy[i]); + max_val = std::max(max_val, vec_copy[i]); + } + + // Calculate delta + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero + + // Quantize vector + uint8_t *quant_values = compressed.data(); + float norm = 0.0f; + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (vec_copy[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + norm += (quant_values[i] * delta + min_val) * (quant_values[i] * delta + min_val); + } + + float inv_norm = 1.0f / std::sqrt(norm); + // Store parameters + float *params = reinterpret_cast(quant_values + dim); + params[0] = min_val; + params[1] = delta; + params[2] = inv_norm; + + return compressed; +} + class SQ8SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); + // Create original vectors std::vector v1_orig(dim); - test_utils::populate_float_vec(v1_orig.data(), dim, 1234); - // Create SQ8 quantized version of v2 - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim, 5678); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } + + // Create SQ8 compressed version of v2 + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2028,7 +2107,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { }; dist_func_t arch_opt_func; - float baseline = SQ8_L2Sqr(v1_orig.data(), v2_quantized.data(), dim); + float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); // Test different optimizations based on CPU features #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { @@ -2036,7 +2115,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2049,7 +2128,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2062,7 +2141,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset avx flag as well, so we'll choose the next optimization (SSE). @@ -2075,7 +2154,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SSE with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; // Unset sse flag as well, so we'll choose the next optimization (default). @@ -2089,7 +2168,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE2 with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve2 flag as well, so we'll choose the next option (default). @@ -2102,7 +2181,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve flag as well, so we'll choose the next option (default). @@ -2115,7 +2194,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.02) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "NEON with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. @@ -2127,16 +2206,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim)) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } -// Instantiate the test suite with dimensions to test -// Range includes dimensions up to 128+ to cover AVX512 64-element chunk processing -// and remaining chunk handling (needs dim >= 64 for loop, dim % 64 >= 48 for third remainder) INSTANTIATE_TEST_SUITE_P(SQ8OptFuncs, SQ8SpacesOptimizationTest, - testing::Range(16UL, 16 * 8UL + 1)); + testing::Range(16UL, 16 * 2UL + 1)); TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { auto optimization = getCpuOptimizationFeatures(); @@ -2229,7 +2305,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { optimization.avx2 = 0; } #endif - #ifdef OPT_SSE if (optimization.sse4_1) { unsigned char alignment = 0; From e5f519c5b978da18bea4614086f13c8339c4c7a5 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Wed, 31 Dec 2025 19:25:11 +0200 Subject: [PATCH 30/51] Update vector layout documentation to reflect removal of sum of squares in SQ8 implementations --- src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h | 5 ++--- src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h | 5 ++--- src/VecSim/spaces/IP/IP_NEON_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 5 ++--- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 5 ++--- src/VecSim/spaces/L2/L2.cpp | 2 +- src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h | 6 +++--- tests/unit/test_spaces.cpp | 2 +- tests/utils/tests_utils.h | 4 ++-- 11 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h index 347ead210..dd83cc81a 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h @@ -25,7 +25,7 @@ * * Also uses multiple accumulators for better instruction-level parallelism. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] */ // Process 16 elements with algebraic optimization diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h index 1643775c4..c19f57ada 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h @@ -25,8 +25,7 @@ * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [sum of - * squares (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] */ // Process 64 uint8 elements using VNNI with multiple accumulators for ILP (dot product only) @@ -70,7 +69,7 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim const uint8_t *pEnd1 = pVec1 + dimension; // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)]] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h index af9f5739a..ddc66a13e 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h @@ -25,8 +25,7 @@ * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [sum of - * squares (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] */ // Helper function: computes dot product using DOTPROD instruction (no sum computation needed) @@ -52,7 +51,7 @@ float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void const uint8_t *pVec2 = static_cast(pVec2v); // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)]] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index 609c86123..602897b83 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -19,7 +19,7 @@ * * This saves 1 FMA per 4-element step by deferring dequantization to scalar math at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] */ // Helper function with algebraic optimization diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index 61d2797fa..f381a15aa 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -25,8 +25,7 @@ * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [sum of - * squares (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] */ // Helper function with dot product only (no sum computation needed) @@ -58,7 +57,7 @@ float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v const uint8_t *pVec2 = static_cast(pVec2v); // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)]] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 7295843f5..82896d002 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -19,7 +19,7 @@ * * This saves 1 FMA per chunk by deferring dequantization to scalar math at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] */ // Helper function to perform inner product step with algebraic optimization diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index e6f1301cd..d3220d142 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -25,8 +25,7 @@ * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] [sum of - * squares (float)] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] */ // Helper function to perform inner product step using integer dot product (no sum computation) @@ -53,7 +52,7 @@ float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, s size_t offset = 0; // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] [sum of squares (float)] + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)]] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index cac76d0e2..a68ea5114 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -20,7 +20,7 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect2 = static_cast(pVect2v); // pvect2 is a vector of uint8_t, so we need to dequantize it, normalize it and then multiply // it. it structred as [quantized values (uint8_t * dim)][min_val (float)][delta - // (float)]] The last two values are used to dequantize the vector. + // (float)][inv_norm (float)] The last two values are used to dequantize the vector. const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h index f76b2d915..d2775f5be 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h @@ -21,7 +21,7 @@ static inline void SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, _ // Convert uint8 to float __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - // Dequantize: (val * delta) + min_val + // Dequantize: (val * delta + min_val) * inv_norm __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); // Compute difference @@ -42,7 +42,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 const uint8_t *pVect2 = static_cast(pVect2v); const float *pEnd1 = pVect1 + dimension; - // Get dequantization parameters + // Get dequantization parameters from the end of pVect2 const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); @@ -66,7 +66,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - // Dequantize: (val * delta) + min_val + // Dequantize: (val * delta + min_val) * inv_norm __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); // Compute difference diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 744c0f982..06ff31b1a 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2458,7 +2458,7 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { size_t dim = GetParam(); // Create quantized vectors - // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + norm (float) + // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + sum_squares (float) size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 95cd30e44..1d0314626 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -104,7 +104,7 @@ static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { /** * Quantize float vector to SQ8 with precomputed sum and norm. - * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)]] * where sum = Σv[i] and norm = Σv[i]² (sum of squares of uint8 elements) */ static void quantize_float_vec_to_uint8_with_sum(const float *v, size_t dim, uint8_t *qv) { @@ -145,7 +145,7 @@ static void quantize_float_vec_to_uint8_with_sum(const float *v, size_t dim, uin /** * Populate a float vector and quantize to SQ8 with precomputed sum and norm. - * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [norm (float)] + * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)]] */ static void populate_float_vec_to_sq8_with_sum(uint8_t *v, size_t dim, int seed = 1234, float min = -1.0f, float max = 1.0f) { From db1e67189c3024d341d4fe47a2395d62ed51be47 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 1 Jan 2026 16:14:52 +0200 Subject: [PATCH 31/51] Refactor SQ8 distance functions to remove norm computation - Updated comments and documentation to reflect that the SQ8-to-SQ8 distance functions now only utilize precomputed sums, removing references to norms. - Modified function signatures and implementations across various SIMD architectures (AVX512F, NEON, SVE) to align with the new approach. - Adjusted utility functions for populating SQ8 vectors to include metadata for sums and normalization. - Updated unit tests and benchmarks to ensure compatibility with the new SQ8 vector population methods and to validate the correctness of distance calculations. --- src/VecSim/spaces/IP/IP.cpp | 4 +- src/VecSim/spaces/IP/IP.h | 4 +- .../spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h | 2 +- .../spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h | 12 +- .../spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h | 12 +- src/VecSim/spaces/IP/IP_NEON_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 12 +- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 14 +-- src/VecSim/spaces/IP_space.h | 2 +- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 2 +- .../spaces/functions/AVX512F_BW_VL_VNNI.h | 2 +- src/VecSim/spaces/functions/NEON.h | 2 +- src/VecSim/spaces/functions/NEON_DOTPROD.cpp | 2 +- src/VecSim/spaces/functions/NEON_DOTPROD.h | 2 +- src/VecSim/spaces/functions/SVE.cpp | 2 +- src/VecSim/spaces/functions/SVE.h | 2 +- .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 4 +- tests/unit/test_spaces.cpp | 115 +++++++++++++----- tests/utils/tests_utils.h | 56 +++++++-- 20 files changed, 171 insertions(+), 84 deletions(-) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index d1d4e7578..4797a5d5e 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -79,8 +79,8 @@ float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dime return 1.0f - res; } -// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum/norm -// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] +// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 3b0a4aaac..8acb4b963 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -16,8 +16,8 @@ float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensio // pVect1v vector of type fp32 and pVect2v vector of type uint8 float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); -// SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum/norm -// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] +// SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum +// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); // SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h index dd83cc81a..ae13d3527 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h @@ -25,7 +25,7 @@ * * Also uses multiple accumulators for better instruction-level parallelism. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ // Process 16 elements with algebraic optimization diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h index c19f57ada..c2638c384 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h @@ -11,11 +11,11 @@ #include /** - * SQ8-to-SQ8 distance functions using AVX512 VNNI with precomputed sum and norm. + * SQ8-to-SQ8 distance functions using AVX512 VNNI with precomputed sum. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, * where BOTH vectors are uint8 quantized. * - * Uses precomputed sum and norm stored in the vector data, + * Uses precomputed sum stored in the vector data, * eliminating the need to compute them during distance calculation. * * Uses algebraic optimization to leverage integer VNNI instructions: @@ -25,7 +25,7 @@ * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ // Process 64 uint8 elements using VNNI with multiple accumulators for ILP (dot product only) @@ -61,7 +61,7 @@ static inline void SQ8_SQ8_InnerProductStep32(const uint8_t *pVec1, const uint8_ _mm512_dpwssd_epi32(dot_acc, _mm512_cvtepu8_epi16(v1_256), _mm512_cvtepu8_epi16(v2_256)); } -// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm +// Common implementation for inner product between two SQ8 vectors with precomputed sum template // 0..63 float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { const uint8_t *pVec1 = static_cast(pVec1v); @@ -69,7 +69,7 @@ float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dim const uint8_t *pEnd1 = pVec1 + dimension; // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)]] + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; @@ -153,5 +153,5 @@ template // 0..63 float SQ8_SQ8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // Assume vectors are normalized. - return 1.0f - SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + return SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h index ddc66a13e..f395ac547 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h @@ -11,11 +11,11 @@ #include /** - * SQ8-to-SQ8 distance functions using ARM NEON DOTPROD with precomputed sum and norm. + * SQ8-to-SQ8 distance functions using ARM NEON DOTPROD with precomputed sum. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, * where BOTH vectors are uint8 quantized. * - * Uses precomputed sum and norm stored in the vector data, + * Uses precomputed sum stored in the vector data, * eliminating the need to compute them during distance calculation. * * Uses algebraic optimization with DOTPROD instruction: @@ -25,7 +25,7 @@ * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ // Helper function: computes dot product using DOTPROD instruction (no sum computation needed) @@ -43,7 +43,7 @@ SQ8_SQ8_InnerProductStep_NEON_DOTPROD(const uint8_t *&pVec1, const uint8_t *&pVe pVec2 += 16; } -// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm +// Common implementation for inner product between two SQ8 vectors with precomputed sum template // 0..63 float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { @@ -51,7 +51,7 @@ float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void const uint8_t *pVec2 = static_cast(pVec2v); // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)]] + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; @@ -123,5 +123,5 @@ float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pV // Returns 1 - inner_product (assumes vectors are pre-normalized) template // 0..63 float SQ8_SQ8_CosineSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, size_t dimension) { - return 1.0f - SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, dimension); + return SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index 602897b83..612ef875d 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -19,7 +19,7 @@ * * This saves 1 FMA per 4-element step by deferring dequantization to scalar math at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ // Helper function with algebraic optimization diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index f381a15aa..401ab607a 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -11,11 +11,11 @@ #include /** - * SQ8-to-SQ8 distance functions using ARM NEON with precomputed sum and norm. + * SQ8-to-SQ8 distance functions using ARM NEON with precomputed sum. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, * where BOTH vectors are uint8 quantized. * - * Uses precomputed sum and norm stored in the vector data, + * Uses precomputed sum stored in the vector data, * eliminating the need to compute them during distance calculation. * * Uses algebraic optimization: @@ -25,7 +25,7 @@ * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ // Helper function with dot product only (no sum computation needed) @@ -49,7 +49,7 @@ static inline void SQ8_SQ8_InnerProductStep_NEON(const uint8_t *&pVec1, const ui pVec2 += 4; } -// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm +// Common implementation for inner product between two SQ8 vectors with precomputed sum template // 0..15 float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { @@ -57,7 +57,7 @@ float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v const uint8_t *pVec2 = static_cast(pVec2v); // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)]] + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; @@ -130,5 +130,5 @@ float SQ8_SQ8_InnerProductSIMD16_NEON(const void *pVec1v, const void *pVec2v, si // Returns 1 - inner_product (assumes vectors are pre-normalized) template // 0..15 float SQ8_SQ8_CosineSIMD16_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { - return 1.0f - SQ8_SQ8_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); + return SQ8_SQ8_InnerProductSIMD16_NEON(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 82896d002..a97c14e19 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -19,7 +19,7 @@ * * This saves 1 FMA per chunk by deferring dequantization to scalar math at the end. * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ // Helper function to perform inner product step with algebraic optimization diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index d3220d142..b510881dc 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -11,11 +11,11 @@ #include /** - * SQ8-to-SQ8 distance functions using ARM SVE with precomputed sum and norm. + * SQ8-to-SQ8 distance functions using ARM SVE with precomputed sum. * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, * where BOTH vectors are uint8 quantized. * - * Uses precomputed sum and norm stored in the vector data, + * Uses precomputed sum stored in the vector data, * eliminating the need to compute them during distance calculation. * * Uses algebraic optimization with SVE dot product instruction: @@ -25,7 +25,7 @@ * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]] + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ // Helper function to perform inner product step using integer dot product (no sum computation) @@ -44,7 +44,7 @@ static inline void SQ8_SQ8_InnerProductStep_SVE(const uint8_t *pVec1, const uint offset += chunk; } -// Common implementation for inner product between two SQ8 vectors with precomputed sum/norm +// Common implementation for inner product between two SQ8 vectors with precomputed sum template float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { const uint8_t *pVec1 = static_cast(pVec1v); @@ -52,7 +52,7 @@ float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, s size_t offset = 0; // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)]] + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; @@ -129,6 +129,6 @@ float SQ8_SQ8_InnerProductSIMD_SVE(const void *pVec1v, const void *pVec2v, size_ // Returns 1 - inner_product (assumes vectors are pre-normalized) template float SQ8_SQ8_CosineSIMD_SVE(const void *pVec1v, const void *pVec2v, size_t dimension) { - return 1.0f - SQ8_SQ8_InnerProductSIMD_SVE_IMP(pVec1v, pVec2v, - dimension); + // Assume vectors are normalized. + return SQ8_SQ8_InnerProductSIMD_SVE(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index c7c5fc17d..9a03c6a96 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -31,7 +31,7 @@ dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment const void *arch_opt = nullptr); dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 89bcabf11..4041ef41a 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -74,7 +74,7 @@ dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI); diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 0105eab1f..f3127d577 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -24,7 +24,7 @@ dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h index 4e6324cac..1c3dba285 100644 --- a/src/VecSim/spaces/functions/NEON.h +++ b/src/VecSim/spaces/functions/NEON.h @@ -30,7 +30,7 @@ dist_func_t Choose_SQ8_L2_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_IP_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim); -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim); diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp index 8ab510c99..d9ec6da35 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp +++ b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp @@ -53,7 +53,7 @@ dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim) { return ret_dist_func; } -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD); diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.h b/src/VecSim/spaces/functions/NEON_DOTPROD.h index 0487a5b6f..6e98358c5 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.h +++ b/src/VecSim/spaces/functions/NEON_DOTPROD.h @@ -21,7 +21,7 @@ dist_func_t Choose_UINT8_Cosine_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_INT8_L2_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim); -// SQ8-to-SQ8 DOTPROD-optimized distance functions (with precomputed sum/norm) +// SQ8-to-SQ8 DOTPROD-optimized distance functions (with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim); diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index da08009ec..d3f0a757d 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -118,7 +118,7 @@ dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim) { return ret_dist_func; } -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim) { dist_func_t ret_dist_func; diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 3b88573d2..4cce8cfc8 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -33,7 +33,7 @@ dist_func_t Choose_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim); -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim); diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index 61f8a0ac7..cb51efabc 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -28,8 +28,8 @@ class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { // Allocate both vectors with extra space for min and delta v1 = new uint8_t[dim + sizeof(float) * 2]; v2 = new uint8_t[dim + sizeof(float) * 2]; - test_utils::populate_float_vec_to_sq8_with_sum(v1, dim, 123); - test_utils::populate_float_vec_to_sq8_with_sum(v2, dim, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v1, dim, true, 123); + test_utils::populate_float_vec_to_sq8_with_metadata(v2, dim, true, 1234); } void TearDown(const ::benchmark::State &state) { delete[] v1; diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 06ff31b1a..bde3055a7 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -393,7 +393,7 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { // Create SQ8 quantized version of v2 size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true); float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); @@ -2227,7 +2227,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // Create SQ8 quantized version of v2 (with normalization) size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2335,15 +2335,17 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim); - test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 5678); - float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); + float baseline = test_utils::SQ8_SQ8_NotOptimized_InnerProduct(v1_quantized.data(), + v2_quantized.data(), dim); unsigned char alignment = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; + // Checks that the function with the optimzied math equivalence returns the same result. ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; @@ -2356,16 +2358,19 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim); - test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 5678); - float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); + float baseline = + test_utils::SQ8_SQ8_NotOptimized_Cosine(v1_quantized.data(), v2_quantized.data(), dim); unsigned char alignment = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + // Checks that the function with the optimzied math equivalence returns the same result. + // min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2 + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.001) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2380,8 +2385,8 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim, 1234); - test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim, 5678); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 5678); dist_func_t arch_opt_func; float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); @@ -2462,8 +2467,8 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim, 1234); - test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim, 5678); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 5678); dist_func_t arch_opt_func; float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); @@ -2548,12 +2553,12 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v_quantized.data(), dim); + test_utils::populate_float_vec_to_sq8_with_metadata(v_quantized.data(), dim, true); float baseline = SQ8_SQ8_Cosine(v_quantized.data(), v_quantized.data(), dim); // Self-distance for cosine should be close to 0 - ASSERT_NEAR(baseline, 0.0f, 0.02f) << "Self-distance should be ~0 for cosine"; + ASSERT_NEAR(baseline, 0.0f, 0.001f) << "Self-distance should be ~0 for cosine"; #ifdef OPT_SVE2 if (optimization.sve2) { @@ -2611,21 +2616,68 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { // Test symmetry: dist(v1, v2) == dist(v2, v1) TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { size_t dim = 128; - + auto optimization = getCpuOptimizationFeatures(); size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_sum(v1_quantized.data(), dim, 123, -1.0f, 1.0f); - test_utils::populate_float_vec_to_sq8_with_sum(v2_quantized.data(), dim, 123, -1.0f, 1.0f); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 456, -1.0f, + 1.0f); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 123, -1.0f, + 1.0f); unsigned char alignment = 0; - auto ip_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); - auto cosine_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); - - float ip_12 = ip_func(v1_quantized.data(), v2_quantized.data(), dim); - float ip_21 = ip_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_NEAR(ip_12, ip_21, 1e-6f) << "IP should be symmetric"; +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + optimization.asimddp = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + optimization.asimd = 0; + } +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + optimization.avx512f = 0; + } +#endif + auto cosine_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); float cos_12 = cosine_func(v1_quantized.data(), v2_quantized.data(), dim); float cos_21 = cosine_func(v2_quantized.data(), v1_quantized.data(), dim); ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Cosine should be symmetric"; @@ -2640,8 +2692,9 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_zero_quantized(quantized_size); std::vector v_nonzero_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum(v_zero.data(), dim, v_zero_quantized.data()); - test_utils::populate_float_vec_to_sq8_with_sum(v_nonzero_quantized.data(), dim); + test_utils::quantize_float_vec_to_sq8_with_metadata(v_zero.data(), dim, + v_zero_quantized.data()); + test_utils::populate_float_vec_to_sq8_with_metadata(v_nonzero_quantized.data(), dim, true); float baseline = SQ8_SQ8_InnerProduct(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); @@ -2706,8 +2759,10 @@ TEST(SQ8_SQ8_EdgeCases, ConstantVectorTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v_const_quantized(quantized_size); std::vector v_random_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum(v_const.data(), dim, v_const_quantized.data()); - test_utils::populate_float_vec_to_sq8_with_sum(v_random_quantized.data(), dim); + spaces::GetNormalizeFunc()(v_const.data(), dim); + test_utils::quantize_float_vec_to_sq8_with_metadata(v_const.data(), dim, + v_const_quantized.data()); + test_utils::populate_float_vec_to_sq8_with_metadata(v_random_quantized.data(), dim, true); float baseline = SQ8_SQ8_InnerProduct(v_const_quantized.data(), v_random_quantized.data(), dim); #ifdef OPT_SVE2 @@ -2785,8 +2840,8 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); std::vector v1_quantized(quantized_size); std::vector v2_quantized(quantized_size); - test_utils::quantize_float_vec_to_uint8_with_sum(v1.data(), dim, v1_quantized.data()); - test_utils::quantize_float_vec_to_uint8_with_sum(v2.data(), dim, v2_quantized.data()); + test_utils::quantize_float_vec_to_sq8_with_metadata(v1.data(), dim, v1_quantized.data()); + test_utils::quantize_float_vec_to_sq8_with_metadata(v2.data(), dim, v2_quantized.data()); float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); ASSERT_FALSE(std::isnan(baseline)) << "Extreme values IP should not produce NaN"; diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 1d0314626..3ba3d119c 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -102,12 +102,41 @@ static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { quantize_float_vec_to_uint8(vec.data(), dim, v, seed); } +static float SQ8_SQ8_NotOptimized_InnerProduct(const void *pVect1v, const void *pVect2v, + size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); + const auto *pVect2 = static_cast(pVect2v); + + // Extract metadata from the end of vectors (likely already prefetched) + // Get quantization parameters from pVect1 + const float min_val1 = *reinterpret_cast(pVect1 + dimension); + const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); + const float sum1 = *reinterpret_cast(pVect1 + dimension + 2 * sizeof(float)); + + // Get quantization parameters from pVect2 + const float min_val2 = *reinterpret_cast(pVect2 + dimension); + const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + const float sum2 = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + // Compute inner product with dequantization + float res = 0.0f; + for (size_t i = 0; i < dimension; i++) { + res += (pVect1[i] * delta1 + min_val1) * (pVect2[i] * delta2 + min_val2); + } + return 1.0f - res; +} + +static float SQ8_SQ8_NotOptimized_Cosine(const void *pVect1v, const void *pVect2v, + size_t dimension) { + return SQ8_SQ8_NotOptimized_InnerProduct(pVect1v, pVect2v, dimension); +} + /** - * Quantize float vector to SQ8 with precomputed sum and norm. - * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)]] - * where sum = Σv[i] and norm = Σv[i]² (sum of squares of uint8 elements) + * Quantize float vector to SQ8 with precomputed sum and sum_squares. + * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [sum_squares + * (float)] where sum = Σv[i] and norm = Σv[i]² (sum of squares of uint8 elements) */ -static void quantize_float_vec_to_uint8_with_sum(const float *v, size_t dim, uint8_t *qv) { +static void quantize_float_vec_to_sq8_with_metadata(const float *v, size_t dim, uint8_t *qv) { float min_val = v[0]; float max_val = v[0]; for (size_t i = 1; i < dim; i++) { @@ -135,7 +164,7 @@ static void quantize_float_vec_to_uint8_with_sum(const float *v, size_t dim, uin qv[i] = static_cast(std::round(normalized)); } - // Store parameters: [min, delta, sum, norm] + // Store parameters: [min, delta, sum, square_sum] float *params = reinterpret_cast(qv + dim); params[0] = min_val; params[1] = delta; @@ -144,16 +173,19 @@ static void quantize_float_vec_to_uint8_with_sum(const float *v, size_t dim, uin } /** - * Populate a float vector and quantize to SQ8 with precomputed sum and norm. - * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)]] + * Populate a float vector and quantize to SQ8 with precomputed sum and sum_squares. + * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [sum_squares + * (float)] */ -static void populate_float_vec_to_sq8_with_sum(uint8_t *v, size_t dim, int seed = 1234, - float min = -1.0f, float max = 1.0f) { +static void populate_float_vec_to_sq8_with_metadata(uint8_t *v, size_t dim, + bool should_normalize = false, int seed = 1234, + float min = -1.0f, float max = 1.0f) { std::vector vec(dim); populate_float_vec(vec.data(), dim, seed, min, max); - // Normalize vector - spaces::GetNormalizeFunc()(vec.data(), dim); - quantize_float_vec_to_uint8_with_sum(vec.data(), dim, v); + if (should_normalize) { + spaces::GetNormalizeFunc()(vec.data(), dim); + } + quantize_float_vec_to_sq8_with_metadata(vec.data(), dim, v); } template From d5b858729183075101911c4bb71381e2e7621605 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 1 Jan 2026 16:23:37 +0200 Subject: [PATCH 32/51] Update SQ8-to-SQ8 distance function comment to remove norm reference --- src/VecSim/spaces/functions/NEON.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/VecSim/spaces/functions/NEON.cpp b/src/VecSim/spaces/functions/NEON.cpp index 4f3e9eef5..cba3f878f 100644 --- a/src/VecSim/spaces/functions/NEON.cpp +++ b/src/VecSim/spaces/functions/NEON.cpp @@ -100,7 +100,7 @@ dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim) { return ret_dist_func; } -// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum/norm) +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_InnerProductSIMD16_NEON); From 91f48df8f4c6272330394e46972854f7d7fe8e86 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 1 Jan 2026 16:33:07 +0200 Subject: [PATCH 33/51] Refactor cosine similarity functions to remove unnecessary subtraction in AVX2, SSE4, and SVE implementations --- src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h index 4d2927213..2d7136b9e 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h @@ -101,5 +101,5 @@ float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, template // 0..15 float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { // Assume vectors are normalized. - return 1.0f - SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); + return SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 3b0303e6d..8327e337d 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -95,5 +95,5 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { // Assume vectors are normalized. - return 1.0f - SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); + return SQ8_InnerProductSIMD16_AVX2(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index 2bbd9f582..243e147ad 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -105,5 +105,5 @@ float SQ8_InnerProductSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { // Assume vectors are normalized. - return 1.0f - SQ8_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); + return SQ8_InnerProductSIMD16_SSE4(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index a97c14e19..9fdbf7672 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -144,6 +144,5 @@ float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t template float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { // Assume vectors are normalized. - return 1.0f - SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, - dimension); + return SQ8_InnerProductSIMD_SVE(pVect1v, pVect2v, dimension); } From b660111fe22b0095aed087b0fa232b255cbad56a Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 1 Jan 2026 17:39:48 +0200 Subject: [PATCH 34/51] Refactor cosine similarity functions to use specific SIMD implementations for improved clarity and performance --- src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h index 2d7136b9e..c0c830c42 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h @@ -101,5 +101,5 @@ float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, template // 0..15 float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { // Assume vectors are normalized. - return SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); + return SQ8_InnerProductSIMD16_AVX2_FMA(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h index ae13d3527..b51f9fb6f 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h @@ -133,16 +133,13 @@ float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t template // 0..15 float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { - // Calculate inner product using common implementation - float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); - // The inner product similarity is 1 - ip - return 1.0f - ip; + return 1.0f -SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension);; } template // 0..15 float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // Assume vectors are normalized. - return 1.0f - SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); + return SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(pVec1v, pVec2v, dimension); } From 9166caced2c5145db67d8c4abe39f743a2005862 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 09:07:06 +0200 Subject: [PATCH 35/51] Refactor benchmark setup to allocate additional space for sum and sum_squares in SQ8 vector tests --- .../spaces_benchmarks/bm_spaces_sq8.cpp | 2 +- .../spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 8 +++-- tests/unit/test_spaces.cpp | 9 ++++- tests/utils/tests_utils.h | 35 ------------------- 4 files changed, 14 insertions(+), 40 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index 29a4319ad..c95bf8026 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -26,7 +26,7 @@ class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { test_utils::populate_float_vec(v1, dim, 123); // Allocate vector with extra space for min, delta and cosine calculations v2 = new uint8_t[dim + sizeof(float) * 3]; - test_utils::populate_float_vec_to_sq8(v2, dim, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2, dim, 1234); } void TearDown(const ::benchmark::State &state) { delete v1; diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index cb51efabc..d7c731c3f 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -25,9 +25,11 @@ class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { void SetUp(const ::benchmark::State &state) { dim = state.range(0); - // Allocate both vectors with extra space for min and delta - v1 = new uint8_t[dim + sizeof(float) * 2]; - v2 = new uint8_t[dim + sizeof(float) * 2]; + // Allocate both vectors with extra space for min, delta, sum, and sum_squares + // Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] + // [sum_squares (float)] + v1 = new uint8_t[dim + sizeof(float) * 4]; + v2 = new uint8_t[dim + sizeof(float) * 4]; test_utils::populate_float_vec_to_sq8_with_metadata(v1, dim, true, 123); test_utils::populate_float_vec_to_sq8_with_metadata(v2, dim, true, 1234); } diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index bde3055a7..de4d6c58d 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -308,6 +308,7 @@ TEST_F(SpacesTest, uint8_Cosine_no_optimization_func_test) { ASSERT_NEAR(dist, 0.0, 0.000001); } + void common_ip_sq8(bool should_normalize, float expected_dist) { size_t dim = 5; @@ -384,7 +385,6 @@ TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { common_ip_sq8(true, TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { size_t dim = 5; - // Create original vectors std::vector v1_orig(dim); test_utils::populate_float_vec(v1_orig.data(), dim); @@ -398,7 +398,14 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); unsigned char alignment = 0; + #ifdef CPU_FEATURES_ARCH_AARCH64 + // Make sure we don't use any optimization (because there is no size optimization for arm) + auto optimization = getCpuOptimizationFeatures(); + optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; + auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + #else auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr); + #endif ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "No optimization with dim " << dim; diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 3ba3d119c..8f8bab688 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -67,41 +67,6 @@ static void populate_float16_vec(vecsim_types::float16 *v, const size_t dim, int } } -static void quantize_float_vec_to_uint8(float *v, size_t dim, uint8_t *qv, int seed = 1234) { - - float min_val = v[0]; - float max_val = v[0]; - for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, v[i]); - max_val = std::max(max_val, v[i]); - } - // Calculate delta - float delta = (max_val - min_val) / 255.0f; - if (delta == 0) - delta = 1.0f; // Avoid division by zero - // Quantize each value - for (size_t i = 0; i < dim; i++) { - float normalized = (v[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, normalized)); - qv[i] = static_cast(std::round(normalized)); - } - // Store parameters - float *params = reinterpret_cast(qv + dim); - params[0] = min_val; - params[1] = delta; -} - -static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { - - std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed - std::uniform_real_distribution dis(-1.0f, 1.0f); - std::vector vec(dim); - for (size_t i = 0; i < dim; i++) { - vec[i] = dis(gen); - } - quantize_float_vec_to_uint8(vec.data(), dim, v, seed); -} - static float SQ8_SQ8_NotOptimized_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); From f28f4e7e9a33add0203a423e4fb1f4c1c0e2e19b Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 09:10:54 +0200 Subject: [PATCH 36/51] Add CPU feature checks to disable optimizations for AArch64 in SQ8 distance function --- tests/unit/test_spaces.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index de4d6c58d..195e03fe1 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -367,7 +367,14 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { << "SQ8_InnerProduct failed to match expected distance"; unsigned char alignment = 0; + #ifdef CPU_FEATURES_ARCH_AARCH64 + // Make sure we don't use any optimization (because there is no size optimization for arm) + auto optimization = getCpuOptimizationFeatures(); + optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; + auto arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + #else auto arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, nullptr); + #endif ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig, v2_quantized.data(), dim), 0.01) From e50dc45a3149743ec61d01d131be48fd37d69650 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 09:13:18 +0200 Subject: [PATCH 37/51] Add CPU feature checks to disable optimizations for AArch64 in SQ8 distance function tests --- tests/unit/test_spaces.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 195e03fe1..bb0cb8307 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2356,7 +2356,14 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { v2_quantized.data(), dim); unsigned char alignment = 0; + #ifdef CPU_FEATURES_ARCH_AARCH64 + // Make sure we don't use any optimization (because there is no size optimization for arm) + auto optimization = getCpuOptimizationFeatures(); + optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + #else auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + #endif ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; // Checks that the function with the optimzied math equivalence returns the same result. @@ -2379,7 +2386,14 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { test_utils::SQ8_SQ8_NotOptimized_Cosine(v1_quantized.data(), v2_quantized.data(), dim); unsigned char alignment = 0; + #ifdef CPU_FEATURES_ARCH_AARCH64 + // Make sure we don't use any optimization (because there is no size optimization for arm) + auto optimization = getCpuOptimizationFeatures(); + optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + #else auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + #endif ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; // Checks that the function with the optimzied math equivalence returns the same result. From 6bbbc389bbbac37b76ff925e20eafd81cf90b093 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 09:44:08 +0200 Subject: [PATCH 38/51] Fix formatting issues in SQ8 inner product function and clean up conditional compilation in tests --- .../spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h | 2 +- tests/unit/test_spaces.cpp | 25 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h index b51f9fb6f..f821e069f 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h @@ -134,7 +134,7 @@ template // 0..15 float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // The inner product similarity is 1 - ip - return 1.0f -SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension);; + return 1.0f - SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); } template // 0..15 diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index bb0cb8307..cbc55c62f 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -308,7 +308,6 @@ TEST_F(SpacesTest, uint8_Cosine_no_optimization_func_test) { ASSERT_NEAR(dist, 0.0, 0.000001); } - void common_ip_sq8(bool should_normalize, float expected_dist) { size_t dim = 5; @@ -367,14 +366,14 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { << "SQ8_InnerProduct failed to match expected distance"; unsigned char alignment = 0; - #ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef CPU_FEATURES_ARCH_AARCH64 // Make sure we don't use any optimization (because there is no size optimization for arm) auto optimization = getCpuOptimizationFeatures(); optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; auto arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - #else +#else auto arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, nullptr); - #endif +#endif ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig, v2_quantized.data(), dim), 0.01) @@ -405,14 +404,14 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); unsigned char alignment = 0; - #ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef CPU_FEATURES_ARCH_AARCH64 // Make sure we don't use any optimization (because there is no size optimization for arm) auto optimization = getCpuOptimizationFeatures(); optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - #else +#else auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr); - #endif +#endif ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "No optimization with dim " << dim; @@ -2356,14 +2355,14 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { v2_quantized.data(), dim); unsigned char alignment = 0; - #ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef CPU_FEATURES_ARCH_AARCH64 // Make sure we don't use any optimization (because there is no size optimization for arm) auto optimization = getCpuOptimizationFeatures(); optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - #else +#else auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); - #endif +#endif ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; // Checks that the function with the optimzied math equivalence returns the same result. @@ -2386,14 +2385,14 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { test_utils::SQ8_SQ8_NotOptimized_Cosine(v1_quantized.data(), v2_quantized.data(), dim); unsigned char alignment = 0; - #ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef CPU_FEATURES_ARCH_AARCH64 // Make sure we don't use any optimization (because there is no size optimization for arm) auto optimization = getCpuOptimizationFeatures(); optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - #else +#else auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); - #endif +#endif ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; // Checks that the function with the optimzied math equivalence returns the same result. From 66a5f88f542f21b19153eb1e40c27c87b63880c9 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 13:54:47 +0200 Subject: [PATCH 39/51] Enhance SQ8 Inner Product Implementations with Optimized Dot Product Calculations - Refactored inner product calculations for SQ8 vectors using NEON and SVE optimizations. - Integrated UINT8_InnerProductImp for efficient dot product computation in NEON and SVE implementations. - Updated inner product functions to handle 64-element chunks for improved performance. - Adjusted distance function selection logic to ensure optimizations are applied only for dimensions >= 16. - Added tests for zero vectors and constant vectors to validate optimized implementations against baseline results. - Ensured consistency in assertions for symmetry tests across various optimization flags. - Improved code readability and maintainability by removing redundant code and comments. --- src/VecSim/spaces/IP/IP.cpp | 1 - .../spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h | 103 +------- .../spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h | 72 +----- src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h | 93 ++----- src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h | 80 +----- src/VecSim/spaces/IP_space.cpp | 8 +- src/VecSim/spaces/functions/NEON.cpp | 5 +- tests/unit/test_spaces.cpp | 243 +++++++++++++++++- tests/utils/tests_utils.h | 11 +- 9 files changed, 297 insertions(+), 319 deletions(-) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 4797a5d5e..247924d3b 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -61,7 +61,6 @@ float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dime product += pVect1[i] * pVect2[i]; } - // Extract metadata from the end of vectors (likely already prefetched) // Get quantization parameters from pVect1 const float min_val1 = *reinterpret_cast(pVect1 + dimension); const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h index c2638c384..9b4f7e01a 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h @@ -8,6 +8,7 @@ */ #pragma once #include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" #include /** @@ -24,119 +25,39 @@ * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * The dot product is computed using the efficient UINT8_InnerProductImp which uses + * VNNI instructions (_mm512_dpwssd_epi32) for native integer dot product computation. * * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ -// Process 64 uint8 elements using VNNI with multiple accumulators for ILP (dot product only) -static inline void SQ8_SQ8_InnerProductStep64(const uint8_t *pVec1, const uint8_t *pVec2, - __m512i &dot_acc0, __m512i &dot_acc1) { - // Load 64 bytes from each vector - __m512i v1_full = _mm512_loadu_si512(reinterpret_cast(pVec1)); - __m512i v2_full = _mm512_loadu_si512(reinterpret_cast(pVec2)); - - // Extract lower and upper 256-bit halves - __m256i v1_lo = _mm512_castsi512_si256(v1_full); - __m256i v1_hi = _mm512_extracti64x4_epi64(v1_full, 1); - __m256i v2_lo = _mm512_castsi512_si256(v2_full); - __m256i v2_hi = _mm512_extracti64x4_epi64(v2_full, 1); - - // Convert to int16 (zero-extend) and compute dot products using VNNI - // dpwssd: multiply pairs of int16, sum pairs to int32, accumulate - dot_acc0 = - _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), _mm512_cvtepu8_epi16(v2_lo)); - dot_acc1 = - _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), _mm512_cvtepu8_epi16(v2_hi)); -} - -// Process 32 uint8 elements using VNNI (dot product only) -static inline void SQ8_SQ8_InnerProductStep32(const uint8_t *pVec1, const uint8_t *pVec2, - __m512i &dot_acc) { - // Load 32 bytes from each vector - __m256i v1_256 = _mm256_loadu_si256(reinterpret_cast(pVec1)); - __m256i v2_256 = _mm256_loadu_si256(reinterpret_cast(pVec2)); - - // Convert to int16 (zero-extend) and compute dot product using VNNI - dot_acc = - _mm512_dpwssd_epi32(dot_acc, _mm512_cvtepu8_epi16(v1_256), _mm512_cvtepu8_epi16(v2_256)); -} - // Common implementation for inner product between two SQ8 vectors with precomputed sum +// Uses UINT8_InnerProductImp for efficient dot product computation with VNNI template // 0..63 float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { + // Compute raw dot product using efficient UINT8 AVX512 VNNI implementation + // UINT8_InnerProductImp uses _mm512_dpwssd_epi32 for native integer dot product + int dot_product = UINT8_InnerProductImp(pVec1v, pVec2v, dimension); + + // Get dequantization parameters and precomputed values from the end of vectors + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const uint8_t *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); - const uint8_t *pEnd1 = pVec1 + dimension; - // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; const float sum1 = params1[2]; // Precomputed sum of original float elements - // Get dequantization parameters and precomputed values from the end of pVec2 const float *params2 = reinterpret_cast(pVec2 + dimension); const float min2 = params2[0]; const float delta2 = params2[1]; const float sum2 = params2[2]; // Precomputed sum of original float elements - // Multiple accumulators for instruction-level parallelism (dot product only) - __m512i dot_acc0 = _mm512_setzero_si512(); - __m512i dot_acc1 = _mm512_setzero_si512(); - - // Handle residual first (0..63 elements) - if constexpr (residual > 0) { - if constexpr (residual < 32) { - // Handle less than 32 elements with mask - constexpr __mmask32 mask = (1LU << residual) - 1; - __m256i v1_256 = _mm256_maskz_loadu_epi8(mask, pVec1); - __m256i v2_256 = _mm256_maskz_loadu_epi8(mask, pVec2); - - // Convert to int16 and compute dot product - dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_256), - _mm512_cvtepu8_epi16(v2_256)); - } else if constexpr (residual == 32) { - // Exactly 32 elements - SQ8_SQ8_InnerProductStep32(pVec1, pVec2, dot_acc0); - } else { - // 33-63 elements: use masked 64-byte load - constexpr __mmask64 mask = (1LLU << residual) - 1; - __m512i v1_full = _mm512_maskz_loadu_epi8(mask, pVec1); - __m512i v2_full = _mm512_maskz_loadu_epi8(mask, pVec2); - - // Extract halves and compute dot products - __m256i v1_lo = _mm512_castsi512_si256(v1_full); - __m256i v1_hi = _mm512_extracti64x4_epi64(v1_full, 1); - __m256i v2_lo = _mm512_castsi512_si256(v2_full); - __m256i v2_hi = _mm512_extracti64x4_epi64(v2_full, 1); - - dot_acc0 = _mm512_dpwssd_epi32(dot_acc0, _mm512_cvtepu8_epi16(v1_lo), - _mm512_cvtepu8_epi16(v2_lo)); - dot_acc1 = _mm512_dpwssd_epi32(dot_acc1, _mm512_cvtepu8_epi16(v1_hi), - _mm512_cvtepu8_epi16(v2_hi)); - } - pVec1 += residual; - pVec2 += residual; - } - - // Process full 64-byte chunks - while (pVec1 < pEnd1) { - SQ8_SQ8_InnerProductStep64(pVec1, pVec2, dot_acc0, dot_acc1); - pVec1 += 64; - pVec2 += 64; - } - - // Combine dot product accumulators and reduce - __m512i dot_total = _mm512_add_epi32(dot_acc0, dot_acc1); - int64_t dot_product = _mm512_reduce_add_epi32(dot_total); - // Apply the algebraic formula using precomputed sums: // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 - float result = min1 * sum2 + min2 * sum1 + delta1 * delta2 * static_cast(dot_product) - - static_cast(dimension) * min1 * min2; - - return result; + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * static_cast(dot_product) - + static_cast(dimension) * min1 * min2; } // SQ8-to-SQ8 Inner Product distance function diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h index f395ac547..7b2ed8829 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h @@ -8,6 +8,7 @@ */ #pragma once #include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h" #include /** @@ -24,90 +25,39 @@ * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * The dot product is computed using the efficient UINT8_InnerProductImp which uses + * the DOTPROD instruction (vdotq_u32) for native uint8 dot product computation. * * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ -// Helper function: computes dot product using DOTPROD instruction (no sum computation needed) -__attribute__((always_inline)) static inline void -SQ8_SQ8_InnerProductStep_NEON_DOTPROD(const uint8_t *&pVec1, const uint8_t *&pVec2, - uint32x4_t &dot_sum) { - // Load 16 uint8 elements - uint8x16_t v1 = vld1q_u8(pVec1); - uint8x16_t v2 = vld1q_u8(pVec2); - - // Compute dot product using DOTPROD instruction: dot_sum += v1 . v2 - dot_sum = vdotq_u32(dot_sum, v1, v2); - - pVec1 += 16; - pVec2 += 16; -} - // Common implementation for inner product between two SQ8 vectors with precomputed sum +// Uses UINT8_InnerProductImp for efficient dot product computation with DOTPROD template // 0..63 float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { + // Compute raw dot product using efficient UINT8 DOTPROD implementation + // UINT8_InnerProductImp uses vdotq_u32 for native uint8 dot product + float dot_product = UINT8_InnerProductImp(pVec1v, pVec2v, dimension); + + // Get dequantization parameters and precomputed values from the end of vectors + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const uint8_t *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); - // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; const float sum1 = params1[2]; // Precomputed sum of original float elements - // Get dequantization parameters and precomputed values from the end of pVec2 const float *params2 = reinterpret_cast(pVec2 + dimension); const float min2 = params2[0]; const float delta2 = params2[1]; const float sum2 = params2[2]; // Precomputed sum of original float elements - // Calculate number of 64-element chunks - size_t num_of_chunks = (dimension - residual) / 64; - - // Multiple accumulators for ILP (dot product only) - uint32x4_t dot_sum0 = vdupq_n_u32(0); - uint32x4_t dot_sum1 = vdupq_n_u32(0); - uint32x4_t dot_sum2 = vdupq_n_u32(0); - uint32x4_t dot_sum3 = vdupq_n_u32(0); - - // Process 64 elements at a time (4 x 16) in the main loop - for (size_t i = 0; i < num_of_chunks; i++) { - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0); - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1); - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum2); - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum3); - } - - // Handle remaining complete 16-element blocks within residual - if constexpr (residual >= 16) { - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum0); - } - if constexpr (residual >= 32) { - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum1); - } - if constexpr (residual >= 48) { - SQ8_SQ8_InnerProductStep_NEON_DOTPROD(pVec1, pVec2, dot_sum2); - } - - // Combine accumulators - uint32x4_t dot_total = vaddq_u32(vaddq_u32(dot_sum0, dot_sum1), vaddq_u32(dot_sum2, dot_sum3)); - - // Horizontal sum for dot product - uint32_t dot_product = vaddvq_u32(dot_total); - - // Handle remaining scalar elements (0-15) - constexpr unsigned char remaining = residual % 16; - if constexpr (remaining > 0) { - for (unsigned char i = 0; i < remaining; i++) { - dot_product += static_cast(pVec1[i]) * static_cast(pVec2[i]); - } - } - // Apply algebraic formula using precomputed sums: // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1*q2) - dim*min1*min2 - return min1 * sum2 + min2 * sum1 + delta1 * delta2 * static_cast(dot_product) - + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * dot_product - static_cast(dimension) * min1 * min2; } diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h index 401ab607a..8d6cbd650 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -8,6 +8,7 @@ */ #pragma once #include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_NEON_UINT8.h" #include /** @@ -24,40 +25,26 @@ * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * The dot product is computed using the efficient UINT8_InnerProductImp which uses + * native NEON uint8 multiply-accumulate instructions (vmull_u8, vpadalq_u16). * * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ -// Helper function with dot product only (no sum computation needed) -static inline void SQ8_SQ8_InnerProductStep_NEON(const uint8_t *&pVec1, const uint8_t *&pVec2, - float32x4_t &dot_sum) { - // Load 4 uint8 elements from pVec1 and convert to float - uint8x8_t v1_u8 = vld1_u8(pVec1); - uint32x4_t v1_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))); - float32x4_t v1_f = vcvtq_f32_u32(v1_u32); - - // Load 4 uint8 elements from pVec2 and convert to float - uint8x8_t v2_u8 = vld1_u8(pVec2); - uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); - float32x4_t v2_f = vcvtq_f32_u32(v2_u32); - - // Accumulate dot product: dot_sum += v1 * v2 (no dequantization) - dot_sum = vmlaq_f32(dot_sum, v1_f, v2_f); - - // Advance pointers - pVec1 += 4; - pVec2 += 4; -} - // Common implementation for inner product between two SQ8 vectors with precomputed sum -template // 0..15 -float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v, +// Uses UINT8_InnerProductImp for efficient dot product computation +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_NEON_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { - const uint8_t *pVec1 = static_cast(pVec1v); - const uint8_t *pVec2 = static_cast(pVec2v); + // Compute raw dot product using efficient UINT8 implementation + // UINT8_InnerProductImp processes 16 elements at a time using native uint8 instructions + float dot_product = UINT8_InnerProductImp(pVec1v, pVec2v, dimension); // Get dequantization parameters and precomputed values from the end of pVec1 // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; @@ -69,50 +56,6 @@ float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v const float delta2 = params2[1]; const float sum2 = params2[2]; // Precomputed sum of original float elements - // Calculate number of 16-element chunks - size_t num_of_chunks = (dimension - residual) / 16; - - // Multiple accumulators for ILP (dot product only) - float32x4_t dot_sum0 = vdupq_n_f32(0.0f); - float32x4_t dot_sum1 = vdupq_n_f32(0.0f); - float32x4_t dot_sum2 = vdupq_n_f32(0.0f); - float32x4_t dot_sum3 = vdupq_n_f32(0.0f); - - // Process 16 elements at a time in the main loop - for (size_t i = 0; i < num_of_chunks; i++) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum0); - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum1); - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum2); - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum3); - } - - // Handle remaining complete 4-element blocks within residual - if constexpr (residual >= 4) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum0); - } - if constexpr (residual >= 8) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum1); - } - if constexpr (residual >= 12) { - SQ8_SQ8_InnerProductStep_NEON(pVec1, pVec2, dot_sum2); - } - - // Combine dot product accumulators - float32x4_t dot_total = vaddq_f32(vaddq_f32(dot_sum0, dot_sum1), vaddq_f32(dot_sum2, dot_sum3)); - - // Horizontal sum for dot product - float32x2_t dot_halves = vadd_f32(vget_low_f32(dot_total), vget_high_f32(dot_total)); - float32x2_t dot_summed = vpadd_f32(dot_halves, dot_halves); - float dot_product = vget_lane_f32(dot_summed, 0); - - // Handle remaining scalar elements (0-3) - constexpr unsigned char remaining = residual % 4; - if constexpr (remaining > 0) { - for (unsigned char i = 0; i < remaining; i++) { - dot_product += static_cast(pVec1[i]) * static_cast(pVec2[i]); - } - } - // Apply algebraic formula using precomputed sums: // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1*q2) - dim*min1*min2 return min1 * sum2 + min2 * sum1 + delta1 * delta2 * dot_product - @@ -121,14 +64,14 @@ float SQ8_SQ8_InnerProductSIMD16_NEON_IMP(const void *pVec1v, const void *pVec2v // SQ8-to-SQ8 Inner Product distance function // Returns 1 - inner_product (distance form) -template // 0..15 -float SQ8_SQ8_InnerProductSIMD16_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { - return 1.0f - SQ8_SQ8_InnerProductSIMD16_NEON_IMP(pVec1v, pVec2v, dimension); +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductSIMD64_NEON_IMP(pVec1v, pVec2v, dimension); } // SQ8-to-SQ8 Cosine distance function // Returns 1 - inner_product (assumes vectors are pre-normalized) -template // 0..15 -float SQ8_SQ8_CosineSIMD16_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { - return SQ8_SQ8_InnerProductSIMD16_NEON(pVec1v, pVec2v, dimension); +template // 0..63 +float SQ8_SQ8_CosineSIMD64_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { + return SQ8_SQ8_InnerProductSIMD64_NEON(pVec1v, pVec2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h index b510881dc..e0369f5b7 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -8,6 +8,7 @@ */ #pragma once #include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" #include /** @@ -24,96 +25,39 @@ * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 * * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * The dot product is computed using the efficient UINT8_InnerProductImp which uses + * SVE dot product instruction (svdot_u32) for native uint8 dot product computation. * * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] */ -// Helper function to perform inner product step using integer dot product (no sum computation) -static inline void SQ8_SQ8_InnerProductStep_SVE(const uint8_t *pVec1, const uint8_t *pVec2, - size_t &offset, svuint32_t &dot_sum, - const size_t chunk) { - svbool_t pg = svptrue_b8(); - - // Load uint8 vectors - svuint8_t v1_u8 = svld1_u8(pg, pVec1 + offset); - svuint8_t v2_u8 = svld1_u8(pg, pVec2 + offset); - - // Compute dot product using integer svdot instruction - dot_sum = svdot_u32(dot_sum, v1_u8, v2_u8); - - offset += chunk; -} - // Common implementation for inner product between two SQ8 vectors with precomputed sum +// Uses UINT8_InnerProductImp for efficient dot product computation with SVE template float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { + // Compute raw dot product using efficient UINT8 SVE implementation + // UINT8_InnerProductImp uses svdot_u32 for native uint8 dot product + float dot_product = + UINT8_InnerProductImp(pVec1v, pVec2v, dimension); + + // Get dequantization parameters and precomputed values from the end of vectors + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const uint8_t *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); - size_t offset = 0; - // Get dequantization parameters and precomputed values from the end of pVec1 - // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] const float *params1 = reinterpret_cast(pVec1 + dimension); const float min1 = params1[0]; const float delta1 = params1[1]; const float sum1 = params1[2]; // Precomputed sum of original float elements - // Get dequantization parameters and precomputed values from the end of pVec2 const float *params2 = reinterpret_cast(pVec2 + dimension); const float min2 = params2[0]; const float delta2 = params2[1]; const float sum2 = params2[2]; // Precomputed sum of original float elements - // Get the vector length for uint8 elements - const size_t vl = svcntb(); - - // Calculate number of complete 4-chunk groups - size_t number_of_chunks = dimension / (vl * 4); - - // Multiple accumulators for ILP (dot product only) - svuint32_t dot_sum0 = svdup_u32(0); - svuint32_t dot_sum1 = svdup_u32(0); - svuint32_t dot_sum2 = svdup_u32(0); - svuint32_t dot_sum3 = svdup_u32(0); - - for (size_t i = 0; i < number_of_chunks; i++) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, vl); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, vl); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, vl); - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum3, vl); - } - - // Handle remaining steps (0-3 complete chunks) - if constexpr (additional_steps >= 1) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum0, vl); - } - if constexpr (additional_steps >= 2) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum1, vl); - } - if constexpr (additional_steps >= 3) { - SQ8_SQ8_InnerProductStep_SVE(pVec1, pVec2, offset, dot_sum2, vl); - } - - // Handle partial chunk if needed - if constexpr (partial_chunk) { - svbool_t pg = svwhilelt_b8(offset, dimension); - svuint8_t v1_u8 = svld1_u8(pg, pVec1 + offset); - svuint8_t v2_u8 = svld1_u8(pg, pVec2 + offset); - dot_sum3 = svdot_u32(dot_sum3, v1_u8, v2_u8); - } - - // Combine all accumulators - svuint32_t dot_total = svadd_u32_x(svptrue_b32(), dot_sum0, dot_sum1); - dot_total = svadd_u32_x(svptrue_b32(), dot_total, dot_sum2); - dot_total = svadd_u32_x(svptrue_b32(), dot_total, dot_sum3); - - // Horizontal sum to scalar integer - svbool_t pg32 = svptrue_b32(); - uint32_t dot_product = svaddv_u32(pg32, dot_total); - // Apply algebraic formula with float conversion only at the end: // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1*q2) - dim*min1*min2 - return min1 * sum2 + min2 * sum1 + delta1 * delta2 * static_cast(dot_product) - + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * dot_product - static_cast(dimension) * min1 * min2; } diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 7ad23d2a6..34a615695 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -169,12 +169,12 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, } #endif #ifdef OPT_NEON_DOTPROD - if (features.asimddp) { + if (features.asimddp && dim >= 16) { return Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(dim); } #endif #ifdef OPT_NEON - if (features.asimd) { + if (features.asimd && dim >= 16) { return Choose_SQ8_SQ8_IP_implementation_NEON(dim); } #endif @@ -213,12 +213,12 @@ dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignme } #endif #ifdef OPT_NEON_DOTPROD - if (features.asimddp) { + if (features.asimddp && dim >= 16) { return Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(dim); } #endif #ifdef OPT_NEON - if (features.asimd) { + if (features.asimd && dim >= 16) { return Choose_SQ8_SQ8_Cosine_implementation_NEON(dim); } #endif diff --git a/src/VecSim/spaces/functions/NEON.cpp b/src/VecSim/spaces/functions/NEON.cpp index cba3f878f..df181ecad 100644 --- a/src/VecSim/spaces/functions/NEON.cpp +++ b/src/VecSim/spaces/functions/NEON.cpp @@ -101,15 +101,16 @@ dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim) { } // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) +// Uses 64-element chunking to leverage efficient UINT8_InnerProductImp dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_InnerProductSIMD16_NEON); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_NEON); return ret_dist_func; } dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_SQ8_CosineSIMD16_NEON); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_CosineSIMD64_NEON); return ret_dist_func; } diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index cbc55c62f..adaaa4111 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2358,15 +2358,15 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { #ifdef CPU_FEATURES_ARCH_AARCH64 // Make sure we don't use any optimization (because there is no size optimization for arm) auto optimization = getCpuOptimizationFeatures(); - optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; + optimization.sve = optimization.sve2 = 0; auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); #else auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); #endif ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; - // Checks that the function with the optimzied math equivalence returns the same result. - ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + // Checks that the function with the optimized math equivalence returns the same result. + ASSERT_EQ(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } @@ -2395,7 +2395,7 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { #endif ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - // Checks that the function with the optimzied math equivalence returns the same result. + // Checks that the function with the optimized math equivalence returns the same result. // min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2 ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.001) << "No optimization with dim " << dim; @@ -2660,7 +2660,7 @@ TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric"; optimization.sve2 = 0; } #endif @@ -2670,7 +2670,7 @@ TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric"; optimization.sve = 0; } #endif @@ -2680,7 +2680,7 @@ TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric"; optimization.asimddp = 0; } #endif @@ -2690,7 +2690,7 @@ TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric"; optimization.asimd = 0; } #endif @@ -2700,7 +2700,7 @@ TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Optimized cosine should be symmetric"; + ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric"; optimization.avx512f = 0; } #endif @@ -2711,7 +2711,7 @@ TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { } // Test with zero vector -TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { +TEST(SQ8_SQ8_EdgeCases, IPZeroVectorTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; std::vector v_zero(dim, 0.0f); @@ -2777,8 +2777,74 @@ TEST(SQ8_SQ8_EdgeCases, ZeroVectorTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Zero vector IP should match baseline"; } +TEST(SQ8_SQ8_EdgeCases, CosineZeroVectorTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = 128; + std::vector v_zero(dim, 0.0f); + + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v_zero_quantized(quantized_size); + std::vector v_nonzero_quantized(quantized_size); + test_utils::quantize_float_vec_to_sq8_with_metadata(v_zero.data(), dim, + v_zero_quantized.data()); + test_utils::populate_float_vec_to_sq8_with_metadata(v_nonzero_quantized.data(), dim, true); + + float baseline = SQ8_SQ8_Cosine(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.asimddp = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.asimd = 0; + } +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; + optimization.avx512f = 0; + } +#endif + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); + + ASSERT_NEAR(result, baseline, 0.01f) << "Zero vector Cosine should match baseline"; +} + // Test with constant vector (all same values) -TEST(SQ8_SQ8_EdgeCases, ConstantVectorTest) { +TEST(SQ8_SQ8_EdgeCases, IPConstantVectorTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; std::vector v_const(dim, 0.5f); @@ -2850,8 +2916,80 @@ TEST(SQ8_SQ8_EdgeCases, ConstantVectorTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Constant vector IP should match baseline"; } +TEST(SQ8_SQ8_EdgeCases, CosineConstantVectorTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = 128; + std::vector v_const(dim, 0.5f); + + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v_const_quantized(quantized_size); + std::vector v_random_quantized(quantized_size); + spaces::GetNormalizeFunc()(v_const.data(), dim); + test_utils::quantize_float_vec_to_sq8_with_metadata(v_const.data(), dim, + v_const_quantized.data()); + test_utils::populate_float_vec_to_sq8_with_metadata(v_random_quantized.data(), dim, true); + + float baseline = SQ8_SQ8_Cosine(v_const_quantized.data(), v_random_quantized.data(), dim); +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector Cosine should match baseline"; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector Cosine should match baseline"; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector Cosine should match baseline"; + optimization.asimddp = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector Cosine should match baseline"; + optimization.asimd = 0; + } +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized constant vector Cosine should match baseline"; + optimization.avx512f = 0; + } +#endif + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); + + ASSERT_NEAR(result, baseline, 0.01f) << "Constant vector Cosine should match baseline"; +} + // Test with extreme values (-1 and 1 only) -TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { +TEST(SQ8_SQ8_EdgeCases, IPExtremeValuesTest) { + auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; std::vector v1(dim), v2(dim); @@ -2871,9 +3009,7 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { test_utils::quantize_float_vec_to_sq8_with_metadata(v2.data(), dim, v2_quantized.data()); float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); - ASSERT_FALSE(std::isnan(baseline)) << "Extreme values IP should not produce NaN"; - auto optimization = getCpuOptimizationFeatures(); #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; @@ -2926,3 +3062,82 @@ TEST(SQ8_SQ8_EdgeCases, ExtremeValuesTest) { ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values IP should match baseline"; } + +TEST(SQ8_SQ8_EdgeCases, CosineExtremeValuesTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = 128; + std::vector v1(dim), v2(dim); + + // Alternating extreme values + for (size_t i = 0; i < dim; i++) { + v1[i] = (i % 2 == 0) ? 1.0f : -1.0f; + v2[i] = (i % 3 == 0) ? 1.0f : -1.0f; + } + + spaces::GetNormalizeFunc()(v1.data(), dim); + spaces::GetNormalizeFunc()(v2.data(), dim); + + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::quantize_float_vec_to_sq8_with_metadata(v1.data(), dim, v1_quantized.data()); + test_utils::quantize_float_vec_to_sq8_with_metadata(v2.data(), dim, v2_quantized.data()); + + float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized extreme values Cosine should match baseline"; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized extreme values Cosine should match baseline"; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized extreme values Cosine should match baseline"; + optimization.asimddp = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized extreme values Cosine should match baseline"; + optimization.asimd = 0; + } +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) + << "Optimized extreme values Cosine should match baseline"; + optimization.avx512f = 0; + } +#endif + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + + ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values Cosine should match baseline"; +} diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 8f8bab688..7eb4ffe6d 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -67,21 +67,26 @@ static void populate_float16_vec(vecsim_types::float16 *v, const size_t dim, int } } +/* + * SQ8_SQ8 distance function without the algebraic optimizations + * uses the regular dequantization formula: + * IP = Σ((min1 + delta1 * q1_i) * (min2 + delta2 * q2_i)) + * Used for testing the correctness of the optimized functions. + * + */ static float SQ8_SQ8_NotOptimized_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); - // Extract metadata from the end of vectors (likely already prefetched) // Get quantization parameters from pVect1 const float min_val1 = *reinterpret_cast(pVect1 + dimension); const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); - const float sum1 = *reinterpret_cast(pVect1 + dimension + 2 * sizeof(float)); // Get quantization parameters from pVect2 const float min_val2 = *reinterpret_cast(pVect2 + dimension); const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - const float sum2 = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); // Compute inner product with dequantization float res = 0.0f; From d7972e9b9bb84b82aa5e35827aa0c4f6282c912b Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 13:59:55 +0200 Subject: [PATCH 40/51] Fix header guard duplication and update test assertion for floating-point comparison --- src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h | 1 + tests/unit/test_spaces.cpp | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h index 96ff1e8f5..deed0f706 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h @@ -6,6 +6,7 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). */ +#pragma once #include "VecSim/spaces/space_includes.h" static inline void InnerProductStep(uint8_t *&pVect1, uint8_t *&pVect2, __m512i &sum) { diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index adaaa4111..968526f75 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2365,8 +2365,9 @@ TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { #endif ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; - // Checks that the function with the optimized math equivalence returns the same result. - ASSERT_EQ(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim)) + // Checks that the function with the optimized math equivalence returns similar result. + // Use ASSERT_NEAR due to floating-point differences between naive and algebraic formulas. + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.001) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } From a8075bf02dee780491c5a43b4618f1db86040279 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 14:03:32 +0200 Subject: [PATCH 41/51] Add missing pragma once directive in NEON header files --- src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h | 1 + src/VecSim/spaces/IP/IP_NEON_UINT8.h | 1 + 2 files changed, 2 insertions(+) diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h index dde497c46..73682a21a 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h @@ -6,6 +6,7 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). */ +#pragma once #include "VecSim/spaces/space_includes.h" #include diff --git a/src/VecSim/spaces/IP/IP_NEON_UINT8.h b/src/VecSim/spaces/IP/IP_NEON_UINT8.h index 10bb18707..6263eeea4 100644 --- a/src/VecSim/spaces/IP/IP_NEON_UINT8.h +++ b/src/VecSim/spaces/IP/IP_NEON_UINT8.h @@ -6,6 +6,7 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). */ +#pragma once #include "VecSim/spaces/space_includes.h" #include From cddc4970621fa748624007339686cf567f3fd0d7 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 14:54:32 +0200 Subject: [PATCH 42/51] Refactor SQ8 distance functions for improved performance and clarity - Updated inner product functions for NEON, SSE4, and SVE to streamline dequantization and reduce unnecessary calculations. - Consolidated common logic for inner product and cosine calculations across different SIMD implementations. - Enhanced the handling of vector normalization and quantization in unit tests, ensuring consistency in compressed vector sizes. - Adjusted benchmark tests to reflect changes in vector compression and distance function calls. - Corrected include paths for AVX512 implementations to maintain consistency across the codebase. --- src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h | 5 +- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 9 +- ...VNNI_SQ8.h => IP_AVX512F_SQ8_BW_VL_VNNI.h} | 0 src/VecSim/spaces/IP/IP_NEON_SQ8.h | 114 ++++---- src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 8 +- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 124 ++++----- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 2 +- .../spaces_benchmarks/bm_spaces_sq8.cpp | 4 +- tests/unit/test_spaces.cpp | 251 +++++++++++++----- 9 files changed, 314 insertions(+), 203 deletions(-) rename src/VecSim/spaces/IP/{IP_AVX512F_BW_VL_VNNI_SQ8.h => IP_AVX512F_SQ8_BW_VL_VNNI.h} (100%) diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h index c0c830c42..f8333e6e8 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h @@ -100,6 +100,7 @@ float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, template // 0..15 float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Assume vectors are normalized. - return SQ8_InnerProductSIMD16_AVX2_FMA(pVect1v, pVect2v, dimension); + // Calculate inner product using common implementation with normalization + float ip = SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); + return 1.0f - ip; } diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 8327e337d..495ef90a4 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -33,7 +33,7 @@ static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVe } template // 0..15 -float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { const float *pVect1 = static_cast(pVect1v); // pVect2 is a quantized uint8_t vector const uint8_t *pVect2 = static_cast(pVect2v); @@ -89,11 +89,12 @@ float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t template // 0..15 float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); + return 1.0f - SQ8_InnerProductImp(pVect1v, pVect2v, dimension); } template // 0..15 float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Assume vectors are normalized. - return SQ8_InnerProductSIMD16_AVX2(pVect1v, pVect2v, dimension); + // Calculate inner product using common implementation with normalization + float ip = SQ8_InnerProductImp(pVect1v, pVect2v, dimension); + return 1.0f - ip; } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h similarity index 100% rename from src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h rename to src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index 612ef875d..7c3f27e10 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -9,37 +9,27 @@ #include "VecSim/spaces/space_includes.h" #include -/** - * SQ8 distance functions (float32 query vs uint8 stored) for NEON. - * - * Uses algebraic optimization to reduce operations per element: - * - * IP = Σ query[i] * (val[i] * δ + min) - * = δ * Σ(query[i] * val[i]) + min * Σ(query[i]) - * - * This saves 1 FMA per 4-element step by deferring dequantization to scalar math at the end. - * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] - */ - -// Helper function with algebraic optimization -static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, - float32x4_t &dot_sum, float32x4_t &query_sum) { - // Load 4 float elements from query +static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum, + const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { + // Load 4 float elements from pVect1 float32x4_t v1 = vld1q_f32(pVect1); pVect1 += 4; - // Load 4 uint8 elements and convert to float + // Load 4 uint8 elements from pVect2 uint8x8_t v2_u8 = vld1_u8(pVect2); pVect2 += 4; + + // Convert uint8 to uint32 uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); + + // Convert uint32 to float32 float32x4_t v2_f = vcvtq_f32_u32(v2_u32); - // Accumulate query * val (without dequantization) - dot_sum = vmlaq_f32(dot_sum, v1, v2_f); + // Dequantize: (val * delta) + min_val + float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec); - // Accumulate query sum - query_sum = vaddq_f32(query_sum, v1); + // Compute dot product and add to sum + sum = vmlaq_f32(sum, v1, v2_dequant); } template // 0..15 @@ -51,81 +41,70 @@ float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - // Multiple accumulators for instruction-level parallelism - // dot_sum: accumulates query[i] * val[i] - // query_sum: accumulates query[i] - float32x4_t dot_sum0 = vdupq_n_f32(0.0f); - float32x4_t dot_sum1 = vdupq_n_f32(0.0f); - float32x4_t dot_sum2 = vdupq_n_f32(0.0f); - float32x4_t dot_sum3 = vdupq_n_f32(0.0f); - float32x4_t query_sum0 = vdupq_n_f32(0.0f); - float32x4_t query_sum1 = vdupq_n_f32(0.0f); - float32x4_t query_sum2 = vdupq_n_f32(0.0f); - float32x4_t query_sum3 = vdupq_n_f32(0.0f); + // Create broadcast vectors for SIMD operations + float32x4_t min_val_vec = vdupq_n_f32(min_val); + float32x4_t delta_vec = vdupq_n_f32(delta); + + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); const size_t num_of_chunks = dimension / 16; // Process 16 elements at a time in the main loop for (size_t i = 0; i < num_of_chunks; i++) { - InnerProductStep(pVect1, pVect2, dot_sum0, query_sum0); - InnerProductStep(pVect1, pVect2, dot_sum1, query_sum1); - InnerProductStep(pVect1, pVect2, dot_sum2, query_sum2); - InnerProductStep(pVect1, pVect2, dot_sum3, query_sum3); + InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, sum3, min_val_vec, delta_vec); } - // Handle remaining complete 4-element blocks within residual + // Handle remaining complete 4-float blocks within residual if constexpr (residual >= 4) { - InnerProductStep(pVect1, pVect2, dot_sum0, query_sum0); + InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); } if constexpr (residual >= 8) { - InnerProductStep(pVect1, pVect2, dot_sum1, query_sum1); + InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); } if constexpr (residual >= 12) { - InnerProductStep(pVect1, pVect2, dot_sum2, query_sum2); + InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); } - // Handle final residual elements (0-3 elements) with scalar math + // Handle final residual elements (0-3 elements) constexpr size_t final_residual = residual % 4; if constexpr (final_residual > 0) { float32x4_t v1 = vdupq_n_f32(0.0f); - float32x4_t v2_f = vdupq_n_f32(0.0f); + float32x4_t v2_dequant = vdupq_n_f32(0.0f); if constexpr (final_residual >= 1) { v1 = vld1q_lane_f32(pVect1, v1, 0); - float val0 = static_cast(pVect2[0]); - v2_f = vld1q_lane_f32(&val0, v2_f, 0); + float dequant0 = pVect2[0] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0); } if constexpr (final_residual >= 2) { v1 = vld1q_lane_f32(pVect1 + 1, v1, 1); - float val1 = static_cast(pVect2[1]); - v2_f = vld1q_lane_f32(&val1, v2_f, 1); + float dequant1 = pVect2[1] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1); } if constexpr (final_residual >= 3) { v1 = vld1q_lane_f32(pVect1 + 2, v1, 2); - float val2 = static_cast(pVect2[2]); - v2_f = vld1q_lane_f32(&val2, v2_f, 2); + float dequant2 = pVect2[2] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2); } - dot_sum3 = vmlaq_f32(dot_sum3, v1, v2_f); - query_sum3 = vaddq_f32(query_sum3, v1); + sum3 = vmlaq_f32(sum3, v1, v2_dequant); } - // Combine accumulators - float32x4_t dot_total = vaddq_f32(vaddq_f32(dot_sum0, dot_sum1), vaddq_f32(dot_sum2, dot_sum3)); - float32x4_t query_total = - vaddq_f32(vaddq_f32(query_sum0, query_sum1), vaddq_f32(query_sum2, query_sum3)); + // Combine all four sum accumulators + float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)); - // Horizontal sum - float32x2_t dot_halves = vadd_f32(vget_low_f32(dot_total), vget_high_f32(dot_total)); - float32x2_t dot_summed = vpadd_f32(dot_halves, dot_halves); - float dot_product = vget_lane_f32(dot_summed, 0); + // Horizontal sum of the 4 elements in the combined NEON register + float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined)); + float32x2_t summed = vpadd_f32(sum_halves, sum_halves); + float sum = vget_lane_f32(summed, 0); - float32x2_t query_halves = vadd_f32(vget_low_f32(query_total), vget_high_f32(query_total)); - float32x2_t query_summed = vpadd_f32(query_halves, query_halves); - float query_sum = vget_lane_f32(query_summed, 0); - - // Apply algebraic formula: IP = δ * Σ(query*val) + min * Σ(query) - return delta * dot_product + min_val * query_sum; + return sum; } template // 0..15 @@ -135,6 +114,7 @@ float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Assume vectors are normalized. - return 1.0f - SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); + // Compute inner product with dequantization using the common function + const float res = SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); + return 1.0f - res; } diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index 243e147ad..d5dfc4e80 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -104,6 +104,10 @@ float SQ8_InnerProductSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Assume vectors are normalized. - return SQ8_InnerProductSIMD16_SSE4(pVect1v, pVect2v, dimension); + // Compute inner product with dequantization using the common function + // We need to cast away const for the inner product function, but it doesn't modify the vectors + const float res = SQ8_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); + + // For cosine, we need to account for the vector norms + return 1.0f - res; } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 9fdbf7672..825e9c501 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -8,38 +8,31 @@ */ #include "VecSim/spaces/space_includes.h" #include +#include +#include -/** - * SQ8 distance functions (float32 query vs uint8 stored) for SVE. - * - * Uses algebraic optimization to reduce operations per element: - * - * IP = Σ query[i] * (val[i] * δ + min) - * = δ * Σ(query[i] * val[i]) + min * Σ(query[i]) - * - * This saves 1 FMA per chunk by deferring dequantization to scalar math at the end. - * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] - */ - -// Helper function to perform inner product step with algebraic optimization -static inline void InnerProductStep(const float *pVect1, const uint8_t *pVect2, size_t offset, - svfloat32_t &dot_sum, svfloat32_t &query_sum, - const size_t chunk) { +static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, + svfloat32_t &sum, const svfloat32_t &min_val_vec, + const svfloat32_t &delta_vec, const size_t chunk) { svbool_t pg = svptrue_b32(); - // Load float elements from query + // Load float elements from pVect1 svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); - // Load uint8 elements and convert to float - svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); + // Convert uint8 to uint32 + svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // LD1UB: loa + + // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32); - // Accumulate query * val (without dequantization) - dot_sum = svmla_f32_x(pg, dot_sum, v1, v2_f); + // Dequantize: (val * delta) + min_val + svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec); + + // Compute dot product and add to sum + sum = svmla_f32_x(pg, sum, v1, v2_dequant); - // Accumulate query sum - query_sum = svadd_f32_x(pg, query_sum, v1); + // Move to the next set of elements + offset += chunk; } template @@ -49,25 +42,22 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz size_t offset = 0; // Get dequantization parameters from the end of quantized vector - const float min_val = *reinterpret_cast(pVect2 + dimension); - const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + float min = *(float *)(pVect2 + dimension); + float delta = *(float *)(pVect2 + dimension + sizeof(float)); + // Create broadcast vectors for SIMD operations svbool_t pg = svptrue_b32(); + svfloat32_t min_val_vec = svdup_f32(min); + svfloat32_t delta_vec = svdup_f32(delta); // Get the number of 32-bit elements per vector at runtime uint64_t chunk = svcntw(); - // Multiple accumulators for instruction-level parallelism - // dot_sum: accumulates query[i] * val[i] - // query_sum: accumulates query[i] - svfloat32_t dot_sum0 = svdup_f32(0.0f); - svfloat32_t dot_sum1 = svdup_f32(0.0f); - svfloat32_t dot_sum2 = svdup_f32(0.0f); - svfloat32_t dot_sum3 = svdup_f32(0.0f); - svfloat32_t query_sum0 = svdup_f32(0.0f); - svfloat32_t query_sum1 = svdup_f32(0.0f); - svfloat32_t query_sum2 = svdup_f32(0.0f); - svfloat32_t query_sum3 = svdup_f32(0.0f); + // Multiple accumulators to increase instruction-level parallelism + svfloat32_t sum0 = svdup_f32(0.0f); + svfloat32_t sum1 = svdup_f32(0.0f); + svfloat32_t sum2 = svdup_f32(0.0f); + svfloat32_t sum3 = svdup_f32(0.0f); // Handle partial chunk if needed if constexpr (partial_chunk) { @@ -77,20 +67,24 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz svbool_t pg_partial = svwhilelt_b32(static_cast(0), static_cast(remaining)); - // Load query float elements with predicate + // Load float elements from pVect1 with predicate svfloat32_t v1 = svld1_f32(pg_partial, pVect1); - // Load uint8 elements and convert to float - svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset); + // load 8-bit bytes from pVect2+offset and zero-extend each into a 32-bit lane + svuint32_t v2_u32 = svld1ub_u32( + pg_partial, pVect2 + offset); // LD1UB: load 8-bit, zero-extend to 32-bit + // :contentReference[oaicite:0]{index=0} + + // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); - // Accumulate dot product (no dequantization) - dot_sum0 = svmla_f32_z(pg_partial, dot_sum0, v1, v2_f); + // Dequantize: (val * delta) + min_val + svfloat32_t v2_dequant = svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec); - // Accumulate query sum - query_sum0 = svadd_f32_z(pg_partial, query_sum0, v1); + // Compute dot product and add to sum + sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant); - // Move past the partial chunk + // Move pointers past the partial chunk offset += remaining; } } @@ -101,38 +95,32 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - InnerProductStep(pVect1, pVect2, offset, dot_sum0, query_sum0, chunk); - InnerProductStep(pVect1, pVect2, offset + chunk, dot_sum1, query_sum1, chunk); - InnerProductStep(pVect1, pVect2, offset + 2 * chunk, dot_sum2, query_sum2, chunk); - InnerProductStep(pVect1, pVect2, offset + 3 * chunk, dot_sum3, query_sum3, chunk); - offset += chunk_size; + InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk); } // Handle remaining steps (0-3) if constexpr (additional_steps > 0) { - InnerProductStep(pVect1, pVect2, offset, dot_sum0, query_sum0, chunk); - offset += chunk; + InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); } if constexpr (additional_steps > 1) { - InnerProductStep(pVect1, pVect2, offset, dot_sum1, query_sum1, chunk); - offset += chunk; + InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); } if constexpr (additional_steps > 2) { - InnerProductStep(pVect1, pVect2, offset, dot_sum2, query_sum2, chunk); + InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); } // Combine the accumulators - svfloat32_t dot_total = - svadd_f32_x(pg, svadd_f32_x(pg, dot_sum0, dot_sum1), svadd_f32_x(pg, dot_sum2, dot_sum3)); - svfloat32_t query_total = svadd_f32_x(pg, svadd_f32_x(pg, query_sum0, query_sum1), - svadd_f32_x(pg, query_sum2, query_sum3)); + svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); + sum = svadd_f32_z(pg, sum, sum2); + sum = svadd_f32_z(pg, sum, sum3); - // Horizontal sum of all elements - float dot_product = svaddv_f32(pg, dot_total); - float query_sum = svaddv_f32(pg, query_total); + // Horizontal sum of all elements in the vector + float result = svaddv_f32(pg, sum); - // Apply algebraic formula: IP = δ * Σ(query*val) + min * Σ(query) - return delta * dot_product + min_val * query_sum; + return result; } template @@ -143,6 +131,10 @@ float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t template float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Assume vectors are normalized. - return SQ8_InnerProductSIMD_SVE(pVect1v, pVect2v, dimension); + // Compute inner product with dequantization using the common function + const float res = + SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, dimension); + + // For cosine, we need to account for the vector norms + return 1.0f - res; } diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 4041ef41a..256d2eea2 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -14,7 +14,7 @@ #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" -#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h" #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h" diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index c95bf8026..1349a3512 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -26,7 +26,7 @@ class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { test_utils::populate_float_vec(v1, dim, 123); // Allocate vector with extra space for min, delta and cosine calculations v2 = new uint8_t[dim + sizeof(float) * 3]; - test_utils::populate_float_vec_to_sq8_with_metadata(v2, dim, 1234); + test_utils::populate_float_vec_to_sq8(v2, dim, 1234); } void TearDown(const ::benchmark::State &state) { delete v1; @@ -96,4 +96,6 @@ INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, InnerProduct, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, Cosine, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, L2Sqr, 16); +// Naive + BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 968526f75..37dcfdb72 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -319,9 +319,9 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { v2_orig[i] = float(i + 1.5); } - // Create SQ8 quantized version of v2 - // Size: dim (uint8_t) + min_val (float) + delta (float) - size_t quantized_size = dim * sizeof(uint8_t) + 2 * sizeof(float); + // Create SQ8 compressed version of v2 + // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + sum_squares (float) + size_t compressed_size = sizeof(uint8_t) + 4 * sizeof(float); if (should_normalize) { spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); @@ -330,9 +330,13 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { // Find min and max for quantization float min_val = v2_orig[0]; float max_val = v2_orig[0]; + float sum = 0.0f; + float sum_squares = 0.0f; for (size_t i = 1; i < dim; i++) { min_val = std::min(min_val, v2_orig[i]); max_val = std::max(max_val, v2_orig[i]); + sum += v2_orig[i]; + sum_squares += v2_orig[i] * v2_orig[i]; } // Calculate delta @@ -340,15 +344,17 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { if (delta == 0) delta = 1.0f; // Avoid division by zero - std::vector v2_quantized(quantized_size); + std::vector v2_compressed(compressed_size); // Quantize v2 - uint8_t *quant_values = reinterpret_cast(v2_quantized.data()); + uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); float *params = reinterpret_cast(quant_values + dim); // Store parameters params[0] = min_val; params[1] = delta; + params[2] = sum; + params[3] = sum_squares; // Quantize each value for (size_t i = 0; i < dim; i++) { @@ -357,28 +363,11 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { quant_values[i] = static_cast(std::round(quantized)); } - float baseline = - SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_quantized.data(), dim); + float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim); // Since we're comparing identical vectors, the inner product distance should be close to // expected - ASSERT_NEAR(baseline, expected_dist, 0.01) - << "SQ8_InnerProduct failed to match expected distance"; - - unsigned char alignment = 0; -#ifdef CPU_FEATURES_ARCH_AARCH64 - // Make sure we don't use any optimization (because there is no size optimization for arm) - auto optimization = getCpuOptimizationFeatures(); - optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; - auto arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); -#else - auto arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, nullptr); -#endif - ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig, v2_quantized.data(), dim), 0.01) - << "No optimization with dim " << dim; - ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + ASSERT_NEAR(dist, expected_dist, 0.01) << "SQ8_InnerProduct failed to match expected distance"; } /* ======================== Tests SQ8 ========================= */ @@ -390,32 +379,55 @@ TEST_F(SpacesTest, SQ8_ip_no_optimization_func_test) { TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { common_ip_sq8(true, 0.0f); } TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { + // create a vector with extra space for the norm size_t dim = 5; + // Create original vectors - std::vector v1_orig(dim); - test_utils::populate_float_vec(v1_orig.data(), dim); - spaces::GetNormalizeFunc()(v1_orig.data(), dim); + float v1_orig[dim], v2_orig[dim]; + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i + 1.5); + } - // Create SQ8 quantized version of v2 - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true); + // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) + size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + spaces::GetNormalizeFunc()(v1_orig, dim); + // Find min and max for quantization + float min_val = v2_orig[0]; + float max_val = v2_orig[0]; + float sum = 0.0f; + float sum_squares = 0.0f; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, v2_orig[i]); + max_val = std::max(max_val, v2_orig[i]); + sum += v2_orig[i]; + sum_squares += v2_orig[i] * v2_orig[i]; + } + // Calculate delta and inverse norm + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero - float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); + // Compress v2 + std::vector v2_compressed(compressed_size); + uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + float *params = reinterpret_cast(quant_values + dim); - unsigned char alignment = 0; -#ifdef CPU_FEATURES_ARCH_AARCH64 - // Make sure we don't use any optimization (because there is no size optimization for arm) - auto optimization = getCpuOptimizationFeatures(); - optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); -#else - auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr); -#endif - ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) - << "No optimization with dim " << dim; - ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (v2_orig[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + } + // Store parameters + params[0] = min_val; + params[1] = delta; + params[2] = sum; + params[3] = sum_squares; + + + float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim); + ASSERT_NEAR(dist, 0.0f, 0.000001f) << "SQ8_Cosine failed to match expected distance"; } TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // create a vector with extra space for the norm @@ -2224,7 +2236,121 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } -INSTANTIATE_TEST_SUITE_P(SQ8OptFuncs, SQ8SpacesOptimizationTest, +TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + // Create original vectors + std::vector v1_orig(dim); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } + spaces::GetNormalizeFunc()(v1_orig.data(), dim); + + // Create SQ8 compressed version of v2 + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + // print min and delta + float *params = reinterpret_cast(v2_compressed.data() + dim); + + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; + return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); + +// Test different optimizations based on CPU features +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } +#endif +#ifdef OPT_AVX2_FMA + if (optimization.avx2 && optimization.fma3) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2_FMA(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX with dim " << dim; + optimization.fma3 = 0; + } +#endif +#ifdef OPT_AVX2 + if (optimization.avx2) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX with dim " << dim; + optimization.avx2 = 0; + } +#endif +#ifdef OPT_SSE + if (optimization.sse4_1) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SSE4(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SSE with dim " << dim; + optimization.sse4_1 = 0; + } +#endif +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SVE with dim " << dim; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "NEON with dim " << dim; + optimization.asimd = 0; + } +#endif +unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +// Instantiate the test suite with dimensions to test +INSTANTIATE_TEST_SUITE_P(SQ8InnerProductTest, SQ8SpacesOptimizationTest, testing::Range(16UL, 16 * 2UL + 1)); TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { @@ -2233,14 +2359,18 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // Create original vectors std::vector v1_orig(dim); - test_utils::populate_float_vec(v1_orig.data(), dim); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } + // Normalize v1 spaces::GetNormalizeFunc()(v1_orig.data(), dim); + spaces::GetNormalizeFunc()(v2_orig.data(), dim); - // Create SQ8 quantized version of v2 (with normalization) - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true); + // Create SQ8 compressed version of v2 (with normalization) + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2248,7 +2378,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { }; dist_func_t arch_opt_func; - float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); + float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim); #ifdef OPT_SVE2 if (optimization.sve2) { @@ -2256,7 +2386,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE2 with dim " << dim; optimization.sve2 = 0; } @@ -2267,7 +2397,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE with dim " << dim; optimization.sve = 0; } @@ -2278,7 +2408,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "NEON with dim " << dim; optimization.asimd = 0; } @@ -2291,7 +2421,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; optimization.avx512f = 0; } @@ -2302,7 +2432,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; optimization.fma3 = 0; } @@ -2313,18 +2443,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; optimization.avx2 = 0; } #endif + #ifdef OPT_SSE if (optimization.sse4_1) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SSE with dim " << dim; optimization.sse4_1 = 0; } @@ -2334,7 +2465,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim)) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } From 4f0fec753f6945656a98c1c88b629194e64b0998 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 15:07:44 +0200 Subject: [PATCH 43/51] Update SQ8 vector population functions to include metadata and adjust compressed size calculations --- .../spaces_benchmarks/bm_spaces_sq8.cpp | 2 +- tests/unit/test_spaces.cpp | 20 ++++++++----------- tests/utils/tests_utils.h | 1 - 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index 1349a3512..ddf188832 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -26,7 +26,7 @@ class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { test_utils::populate_float_vec(v1, dim, 123); // Allocate vector with extra space for min, delta and cosine calculations v2 = new uint8_t[dim + sizeof(float) * 3]; - test_utils::populate_float_vec_to_sq8(v2, dim, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2, dim, 1234, true); } void TearDown(const ::benchmark::State &state) { delete v1; diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 37dcfdb72..73a3e9245 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -321,7 +321,7 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { // Create SQ8 compressed version of v2 // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + sum_squares (float) - size_t compressed_size = sizeof(uint8_t) + 4 * sizeof(float); + size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); if (should_normalize) { spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); @@ -330,13 +330,11 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { // Find min and max for quantization float min_val = v2_orig[0]; float max_val = v2_orig[0]; - float sum = 0.0f; - float sum_squares = 0.0f; + float sum = v2_orig[0]; for (size_t i = 1; i < dim; i++) { min_val = std::min(min_val, v2_orig[i]); max_val = std::max(max_val, v2_orig[i]); sum += v2_orig[i]; - sum_squares += v2_orig[i] * v2_orig[i]; } // Calculate delta @@ -354,7 +352,6 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { params[0] = min_val; params[1] = delta; params[2] = sum; - params[3] = sum_squares; // Quantize each value for (size_t i = 0; i < dim; i++) { @@ -392,16 +389,15 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); spaces::GetNormalizeFunc()(v1_orig, dim); + spaces::GetNormalizeFunc()(v2_orig, dim); // Find min and max for quantization float min_val = v2_orig[0]; float max_val = v2_orig[0]; - float sum = 0.0f; - float sum_squares = 0.0f; + float sum = v2_orig[0]; for (size_t i = 1; i < dim; i++) { min_val = std::min(min_val, v2_orig[i]); max_val = std::max(max_val, v2_orig[i]); sum += v2_orig[i]; - sum_squares += v2_orig[i] * v2_orig[i]; } // Calculate delta and inverse norm float delta = (max_val - min_val) / 255.0f; @@ -423,11 +419,9 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { params[0] = min_val; params[1] = delta; params[2] = sum; - params[3] = sum_squares; - float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.000001f) << "SQ8_Cosine failed to match expected distance"; + ASSERT_NEAR(dist, 0.0f, 0.001f) << "SQ8_Cosine failed to match expected distance"; } TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // create a vector with extra space for the norm @@ -2340,7 +2334,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { optimization.asimd = 0; } #endif -unsigned char alignment = 0; + + // Test default implementation + unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 7eb4ffe6d..0479f2101 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -127,7 +127,6 @@ static void quantize_float_vec_to_sq8_with_metadata(const float *v, size_t dim, delta = 1.0f; // Avoid division by zero // Quantize each value - for (size_t i = 0; i < dim; i++) { float normalized = (v[i] - min_val) / delta; normalized = std::max(0.0f, std::min(255.0f, normalized)); From 8ab419268e9caf82ffeb04eaa9819d9d2fe3498c Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 15:13:38 +0200 Subject: [PATCH 44/51] Refactor SQ8 inner product functions for improved clarity and performance --- .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h | 133 +++++++----------- 1 file changed, 48 insertions(+), 85 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index f821e069f..64b924084 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -9,137 +9,100 @@ #pragma once #include "VecSim/spaces/space_includes.h" #include +#include -/** - * SQ8 distance functions (float32 query vs uint8 stored) using AVX512. - * - * Uses algebraic optimization to reduce operations per element: - * - * IP = Σ query[i] * (val[i] * δ + min) - * = δ * Σ(query[i] * val[i]) + min * Σ(query[i]) - * - * This saves one FMA per 16 elements by separating: - * - dot_sum: accumulates query[i] * val[i] - * - query_sum: accumulates query[i] - * Then combines at the end: result = δ * dot_sum + min * query_sum - * - * Also uses multiple accumulators for better instruction-level parallelism. - * - * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] - */ - -// Process 16 elements with algebraic optimization -static inline void SQ8_InnerProductStep(const float *pVec1, const uint8_t *pVec2, __m512 &dot_sum, - __m512 &query_sum) { - // Load 16 float elements from query +static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, + const __m512 &min_val_vec, const __m512 &delta_vec) { + // Load 16 float elements from pVec1 __m512 v1 = _mm512_loadu_ps(pVec1); - // Load 16 uint8 elements and convert to float - __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); + // Load 16 uint8 elements from pVec2 and convert to __m512i + __m128i v2_128 = _mm_loadu_si128((__m128i *)pVec2); __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + + // Convert uint8 to float __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - // Accumulate query * val (without dequantization) - dot_sum = _mm512_fmadd_ps(v1, v2_f, dot_sum); + // Dequantize: (val * delta) + min_val + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product and add to sum + sum = _mm512_fmadd_ps(v1, dequantized, sum); - // Accumulate query sum - query_sum = _mm512_add_ps(query_sum, v1); + // Advance pointers + pVec1 += 16; + pVec2 += 16; } // Common implementation for both inner product and cosine similarity template // 0..15 -float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { +float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { const float *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); + const float *pEnd1 = pVec1 + dimension; // Get dequantization parameters from the end of pVec2 const float min_val = *reinterpret_cast(pVec2 + dimension); const float delta = *reinterpret_cast(pVec2 + dimension + sizeof(float)); - // Multiple accumulators for instruction-level parallelism - __m512 dot_sum0 = _mm512_setzero_ps(); - __m512 dot_sum1 = _mm512_setzero_ps(); - __m512 dot_sum2 = _mm512_setzero_ps(); - __m512 dot_sum3 = _mm512_setzero_ps(); - __m512 query_sum0 = _mm512_setzero_ps(); - __m512 query_sum1 = _mm512_setzero_ps(); - __m512 query_sum2 = _mm512_setzero_ps(); - __m512 query_sum3 = _mm512_setzero_ps(); + // Create broadcast vectors for SIMD operations + __m512 min_val_vec = _mm512_set1_ps(min_val); + __m512 delta_vec = _mm512_set1_ps(delta); - size_t offset = 0; + // Initialize sum accumulator + __m512 sum = _mm512_setzero_ps(); // Deal with remainder first if constexpr (residual > 0) { // Handle less than 16 elements __mmask16 mask = (1U << residual) - 1; - // Load masked float elements from query + // Load masked float elements __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); - // Load uint8 elements and convert to float + // Load full uint8 elements - we know that the first 16 elements are safe to load __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - // Masked accumulation (mask already zeroed unused elements in v1) - dot_sum0 = _mm512_mul_ps(v1, v2_f); - query_sum0 = v1; + // Dequantize + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - offset = residual; - } - - // Calculate number of full 64-element chunks (4 x 16) - size_t num_chunks = (dimension - residual) / 64; + // Compute dot product + __m512 product = _mm512_mul_ps(v1, dequantized); - // Process 4 chunks at a time for maximum ILP - for (size_t i = 0; i < num_chunks; i++) { - SQ8_InnerProductStep(pVec1 + offset, pVec2 + offset, dot_sum0, query_sum0); - SQ8_InnerProductStep(pVec1 + offset + 16, pVec2 + offset + 16, dot_sum1, query_sum1); - SQ8_InnerProductStep(pVec1 + offset + 32, pVec2 + offset + 32, dot_sum2, query_sum2); - SQ8_InnerProductStep(pVec1 + offset + 48, pVec2 + offset + 48, dot_sum3, query_sum3); - offset += 64; - } + // Apply mask to product and add to sum + sum = _mm512_fmadd_ps(sum, sum, product); - // Handle remaining 16-element chunks (0-3 remaining) - size_t remaining = (dimension - residual) % 64; - if (remaining >= 16) { - SQ8_InnerProductStep(pVec1 + offset, pVec2 + offset, dot_sum0, query_sum0); - offset += 16; - remaining -= 16; - } - if (remaining >= 16) { - SQ8_InnerProductStep(pVec1 + offset, pVec2 + offset, dot_sum1, query_sum1); - offset += 16; - remaining -= 16; + pVec1 += residual; + pVec2 += residual; } - if (remaining >= 16) { - SQ8_InnerProductStep(pVec1 + offset, pVec2 + offset, dot_sum2, query_sum2); - } - - // Combine accumulators - __m512 dot_total = - _mm512_add_ps(_mm512_add_ps(dot_sum0, dot_sum1), _mm512_add_ps(dot_sum2, dot_sum3)); - __m512 query_total = - _mm512_add_ps(_mm512_add_ps(query_sum0, query_sum1), _mm512_add_ps(query_sum2, query_sum3)); - // Reduce to scalar - float dot_product = _mm512_reduce_add_ps(dot_total); - float query_sum = _mm512_reduce_add_ps(query_total); + // Process remaining full chunks of 16 elements + do { + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + } while (pVec1 < pEnd1); - // Apply algebraic formula: IP = δ * Σ(query*val) + min * Σ(query) - return delta * dot_product + min_val * query_sum; + // Return the raw inner product result + return _mm512_reduce_add_ps(sum); } template // 0..15 float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { + // Calculate inner product using common implementation + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + // The inner product similarity is 1 - ip - return 1.0f - SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); + return 1.0f - ip; } template // 0..15 float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { - // Assume vectors are normalized. - return SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(pVec1v, pVec2v, dimension); + // Calculate inner product using common implementation with normalization + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + + // The cosine similarity is 1 - ip + return 1.0f - ip; } From 8c59cb2097ebc7f727ce61a6fc4847d2eed5acff Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 15:41:37 +0200 Subject: [PATCH 45/51] Rename inner product implementation functions for AVX2 and AVX512 for clarity --- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 6 +- .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h | 6 +- tests/unit/test_spaces.cpp | 72 ++++++++++++++++++- 3 files changed, 76 insertions(+), 8 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 495ef90a4..203e32fad 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -33,7 +33,7 @@ static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVe } template // 0..15 -float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { const float *pVect1 = static_cast(pVect1v); // pVect2 is a quantized uint8_t vector const uint8_t *pVect2 = static_cast(pVect2v); @@ -89,12 +89,12 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen template // 0..15 float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProductImp(pVect1v, pVect2v, dimension); + return 1.0f - SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); } template // 0..15 float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp(pVect1v, pVect2v, dimension); + float ip = SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); return 1.0f - ip; } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index 64b924084..35ea482fa 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -36,7 +36,7 @@ static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVe // Common implementation for both inner product and cosine similarity template // 0..15 -float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { +float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { const float *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); const float *pEnd1 = pVec1 + dimension; @@ -91,7 +91,7 @@ template // 0..15 float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // Calculate inner product using common implementation - float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); // The inner product similarity is 1 - ip return 1.0f - ip; @@ -101,7 +101,7 @@ template // 0..15 float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); // The cosine similarity is 1 - ip return 1.0f - ip; diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 73a3e9245..ed7135b2c 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2769,7 +2769,75 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { } // Test symmetry: dist(v1, v2) == dist(v2, v1) -TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { +TEST(SQ8_SQ8_EdgeCases, IPSymmetryTest) { + size_t dim = 128; + auto optimization = getCpuOptimizationFeatures(); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 456, -1.0f, + 1.0f); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 123, -1.0f, + 1.0f); + + unsigned char alignment = 0; + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; + optimization.asimddp = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; + optimization.asimd = 0; + } +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); + float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; + optimization.avx512f = 0; + } +#endif + auto ip_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); + float ip_12 = ip_func(v1_quantized.data(), v2_quantized.data(), dim); + float ip_21 = ip_func(v2_quantized.data(), v1_quantized.data(), dim); + ASSERT_EQ(ip_12, ip_21) << "IP should be symmetric"; +} +TEST(SQ8_SQ8_EdgeCases, CosineSymmetryTest) { size_t dim = 128; auto optimization = getCpuOptimizationFeatures(); size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); @@ -2835,7 +2903,7 @@ TEST(SQ8_SQ8_EdgeCases, SymmetryTest) { auto cosine_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); float cos_12 = cosine_func(v1_quantized.data(), v2_quantized.data(), dim); float cos_21 = cosine_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_NEAR(cos_12, cos_21, 1e-6f) << "Cosine should be symmetric"; + ASSERT_EQ(cos_12, cos_21) << "Cosine should be symmetric"; } // Test with zero vector From a4ff5d0c670906e0d585f1ea85b025222d7228c2 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 15:49:33 +0200 Subject: [PATCH 46/51] Refactor SQ8 cosine function to utilize inner product function for improved clarity --- src/VecSim/spaces/IP/IP.cpp | 26 +------------- tests/unit/test_spaces.cpp | 67 +++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 25 deletions(-) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 247924d3b..59d5d5fe1 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -81,31 +81,7 @@ float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dime // SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum // Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { - const auto *pVect1 = static_cast(pVect1v); - const auto *pVect2 = static_cast(pVect2v); - - // Compute inner product of quantized values: Σ(q1[i]*q2[i]) - float product = 0; - for (size_t i = 0; i < dimension; i++) { - product += pVect1[i] * pVect2[i]; - } - - // Extract metadata from the end of vectors - // Get quantization parameters from pVect1 - const float min_val1 = *reinterpret_cast(pVect1 + dimension); - const float delta1 = *reinterpret_cast(pVect1 + dimension + sizeof(float)); - const float sum1 = *reinterpret_cast(pVect1 + dimension + 2 * sizeof(float)); - - // Get quantization parameters from pVect2 - const float min_val2 = *reinterpret_cast(pVect2 + dimension); - const float delta2 = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - const float sum2 = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - - // Apply the algebraic formula using precomputed sums: - // IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2 - float res = min_val1 * sum2 + min_val2 * sum1 - - static_cast(dimension) * min_val1 * min_val2 + delta1 * delta2 * product; - return 1.0f - res; + return SQ8_SQ8_InnerProduct(pVect1v, pVect2v, dimension); } float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) { diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index ed7135b2c..c823c16c8 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2701,6 +2701,73 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { INSTANTIATE_TEST_SUITE_P(SQ8_SQ8OptFuncs, SQ8_SQ8_SpacesOptimizationTest, testing::Range(64UL, 64 * 2UL + 1)); +// Test self-distance: distance to itself should be 0 for ip (normalized vectors) +TEST(SQ8_SQ8_EdgeCases, SelfDistanceIP) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = 128; + + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v_quantized.data(), dim, true); + + float baseline = SQ8_SQ8_InnerProduct(v_quantized.data(), v_quantized.data(), dim); + + // Self-distance for inner product should be close to 0 + ASSERT_NEAR(baseline, 0.0f, 0.001f) << "Self-distance should be ~0 for inner product"; + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.asimddp = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.asimd = 0; + } +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.avx512f = 0; + } +#endif + + unsigned char alignment = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(baseline, arch_opt_func(v_quantized.data(), v_quantized.data(), dim)) + << "No optimization self-distance should match baseline"; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + // Test self-distance: distance to itself should be 0 for cosine (normalized vectors) TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { auto optimization = getCpuOptimizationFeatures(); From c22158fece88a56054512ed0ab740bf5d177e154 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 15:56:16 +0200 Subject: [PATCH 47/51] Remove redundant inner product edge case tests for SQ8 distance functions --- tests/unit/test_spaces.cpp | 348 ------------------------------------- 1 file changed, 348 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index c823c16c8..70a36d14e 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2701,73 +2701,6 @@ TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { INSTANTIATE_TEST_SUITE_P(SQ8_SQ8OptFuncs, SQ8_SQ8_SpacesOptimizationTest, testing::Range(64UL, 64 * 2UL + 1)); -// Test self-distance: distance to itself should be 0 for ip (normalized vectors) -TEST(SQ8_SQ8_EdgeCases, SelfDistanceIP) { - auto optimization = getCpuOptimizationFeatures(); - size_t dim = 128; - - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_metadata(v_quantized.data(), dim, true); - - float baseline = SQ8_SQ8_InnerProduct(v_quantized.data(), v_quantized.data(), dim); - - // Self-distance for inner product should be close to 0 - ASSERT_NEAR(baseline, 0.0f, 0.001f) << "Self-distance should be ~0 for inner product"; - -#ifdef OPT_SVE2 - if (optimization.sve2) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; - optimization.sve2 = 0; - } -#endif -#ifdef OPT_SVE - if (optimization.sve) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; - optimization.sve = 0; - } -#endif -#ifdef OPT_NEON_DOTPROD - if (optimization.asimddp) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; - optimization.asimddp = 0; - } -#endif -#ifdef OPT_NEON - if (optimization.asimd) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; - optimization.asimd = 0; - } -#endif -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; - optimization.avx512f = 0; - } -#endif - - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(baseline, arch_opt_func(v_quantized.data(), v_quantized.data(), dim)) - << "No optimization self-distance should match baseline"; - ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; -} - // Test self-distance: distance to itself should be 0 for cosine (normalized vectors) TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { auto optimization = getCpuOptimizationFeatures(); @@ -2836,74 +2769,6 @@ TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) { } // Test symmetry: dist(v1, v2) == dist(v2, v1) -TEST(SQ8_SQ8_EdgeCases, IPSymmetryTest) { - size_t dim = 128; - auto optimization = getCpuOptimizationFeatures(); - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v1_quantized(quantized_size); - std::vector v2_quantized(quantized_size); - test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 456, -1.0f, - 1.0f); - test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 123, -1.0f, - 1.0f); - - unsigned char alignment = 0; - -#ifdef OPT_SVE2 - if (optimization.sve2) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; - optimization.sve2 = 0; - } -#endif -#ifdef OPT_SVE - if (optimization.sve) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; - optimization.sve = 0; - } -#endif -#ifdef OPT_NEON_DOTPROD - if (optimization.asimddp) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; - optimization.asimddp = 0; - } -#endif -#ifdef OPT_NEON - if (optimization.asimd) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; - optimization.asimd = 0; - } -#endif -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_EQ(cos_12, cos_21) << "ip should be symmetric"; - optimization.avx512f = 0; - } -#endif - auto ip_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); - float ip_12 = ip_func(v1_quantized.data(), v2_quantized.data(), dim); - float ip_21 = ip_func(v2_quantized.data(), v1_quantized.data(), dim); - ASSERT_EQ(ip_12, ip_21) << "IP should be symmetric"; -} TEST(SQ8_SQ8_EdgeCases, CosineSymmetryTest) { size_t dim = 128; auto optimization = getCpuOptimizationFeatures(); @@ -2974,72 +2839,6 @@ TEST(SQ8_SQ8_EdgeCases, CosineSymmetryTest) { } // Test with zero vector -TEST(SQ8_SQ8_EdgeCases, IPZeroVectorTest) { - auto optimization = getCpuOptimizationFeatures(); - size_t dim = 128; - std::vector v_zero(dim, 0.0f); - - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v_zero_quantized(quantized_size); - std::vector v_nonzero_quantized(quantized_size); - test_utils::quantize_float_vec_to_sq8_with_metadata(v_zero.data(), dim, - v_zero_quantized.data()); - test_utils::populate_float_vec_to_sq8_with_metadata(v_nonzero_quantized.data(), dim, true); - - float baseline = SQ8_SQ8_InnerProduct(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - -#ifdef OPT_SVE2 - if (optimization.sve2) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; - optimization.sve2 = 0; - } -#endif -#ifdef OPT_SVE - if (optimization.sve) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; - optimization.sve = 0; - } -#endif -#ifdef OPT_NEON_DOTPROD - if (optimization.asimddp) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; - optimization.asimddp = 0; - } -#endif -#ifdef OPT_NEON - if (optimization.asimd) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; - optimization.asimd = 0; - } -#endif -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline"; - optimization.avx512f = 0; - } -#endif - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); - float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim); - - ASSERT_NEAR(result, baseline, 0.01f) << "Zero vector IP should match baseline"; -} - TEST(SQ8_SQ8_EdgeCases, CosineZeroVectorTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; @@ -3107,78 +2906,6 @@ TEST(SQ8_SQ8_EdgeCases, CosineZeroVectorTest) { } // Test with constant vector (all same values) -TEST(SQ8_SQ8_EdgeCases, IPConstantVectorTest) { - auto optimization = getCpuOptimizationFeatures(); - size_t dim = 128; - std::vector v_const(dim, 0.5f); - - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v_const_quantized(quantized_size); - std::vector v_random_quantized(quantized_size); - spaces::GetNormalizeFunc()(v_const.data(), dim); - test_utils::quantize_float_vec_to_sq8_with_metadata(v_const.data(), dim, - v_const_quantized.data()); - test_utils::populate_float_vec_to_sq8_with_metadata(v_random_quantized.data(), dim, true); - - float baseline = SQ8_SQ8_InnerProduct(v_const_quantized.data(), v_random_quantized.data(), dim); -#ifdef OPT_SVE2 - if (optimization.sve2) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) - << "Optimized constant vector IP should match baseline"; - optimization.sve2 = 0; - } -#endif -#ifdef OPT_SVE - if (optimization.sve) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) - << "Optimized constant vector IP should match baseline"; - optimization.sve = 0; - } -#endif -#ifdef OPT_NEON_DOTPROD - if (optimization.asimddp) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) - << "Optimized constant vector IP should match baseline"; - optimization.asimddp = 0; - } -#endif -#ifdef OPT_NEON - if (optimization.asimd) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) - << "Optimized constant vector IP should match baseline"; - optimization.asimd = 0; - } -#endif -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) - << "Optimized constant vector IP should match baseline"; - optimization.avx512f = 0; - } -#endif - - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); - float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim); - - ASSERT_NEAR(result, baseline, 0.01f) << "Constant vector IP should match baseline"; -} - TEST(SQ8_SQ8_EdgeCases, CosineConstantVectorTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; @@ -3251,81 +2978,6 @@ TEST(SQ8_SQ8_EdgeCases, CosineConstantVectorTest) { } // Test with extreme values (-1 and 1 only) -TEST(SQ8_SQ8_EdgeCases, IPExtremeValuesTest) { - auto optimization = getCpuOptimizationFeatures(); - size_t dim = 128; - std::vector v1(dim), v2(dim); - - // Alternating extreme values - for (size_t i = 0; i < dim; i++) { - v1[i] = (i % 2 == 0) ? 1.0f : -1.0f; - v2[i] = (i % 3 == 0) ? 1.0f : -1.0f; - } - - spaces::GetNormalizeFunc()(v1.data(), dim); - spaces::GetNormalizeFunc()(v2.data(), dim); - - size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); - std::vector v1_quantized(quantized_size); - std::vector v2_quantized(quantized_size); - test_utils::quantize_float_vec_to_sq8_with_metadata(v1.data(), dim, v1_quantized.data()); - test_utils::quantize_float_vec_to_sq8_with_metadata(v2.data(), dim, v2_quantized.data()); - - float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); - -#ifdef OPT_SVE2 - if (optimization.sve2) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; - optimization.sve2 = 0; - } -#endif -#ifdef OPT_SVE - if (optimization.sve) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; - optimization.sve = 0; - } -#endif -#ifdef OPT_NEON_DOTPROD - if (optimization.asimddp) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; - optimization.asimddp = 0; - } -#endif -#ifdef OPT_NEON - if (optimization.asimd) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; - optimization.asimd = 0; - } -#endif -#ifdef OPT_AVX512_F_BW_VL_VNNI - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); - float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - ASSERT_NEAR(result, baseline, 0.01f) << "Optimized extreme values IP should match baseline"; - optimization.avx512f = 0; - } -#endif - - unsigned char alignment = 0; - auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); - float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim); - - ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values IP should match baseline"; -} - TEST(SQ8_SQ8_EdgeCases, CosineExtremeValuesTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = 128; From 4c19d9ee92ba05b79c06f9e388b187762de96f50 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 16:07:02 +0200 Subject: [PATCH 48/51] Add SVE2 support to SQ8-to-SQ8 Inner Product distance function --- src/VecSim/spaces/IP_space.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 34a615695..c25f0d043 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -163,6 +163,11 @@ dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); #ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_SQ8_IP_implementation_SVE2(dim); + } +#endif #ifdef OPT_SVE if (features.sve) { return Choose_SQ8_SQ8_IP_implementation_SVE(dim); From 5c22af8d456ae7c5883939ea26d2950dde36fe35 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 16:15:32 +0200 Subject: [PATCH 49/51] Remove SVE2 and other optimizations from SQ8 cosine function test for ARM architecture --- tests/unit/test_spaces.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 70a36d14e..0ae60ee86 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2516,7 +2516,7 @@ TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { #ifdef CPU_FEATURES_ARCH_AARCH64 // Make sure we don't use any optimization (because there is no size optimization for arm) auto optimization = getCpuOptimizationFeatures(); - optimization.sve = optimization.sve2 = optimization.asimddp = optimization.asimd = 0; + optimization.sve = optimization.sve2 = 0; auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); #else auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); From 9e50d7c924996bb945211e508bec0ed8890e325a Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 16:25:53 +0200 Subject: [PATCH 50/51] Update NEON benchmarks to use a vector size of 64 for SQ8-to-SQ8 functions --- tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp index d7c731c3f..13c28ee4e 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -45,8 +45,8 @@ cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; // NEON SQ8-to-SQ8 functions #ifdef OPT_NEON bool neon_supported = opt.asimd; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 16, neon_supported); +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 64, neon_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 64, neon_supported); #endif // NEON // SVE SQ8-to-SQ8 functions #ifdef OPT_SVE From 2e57cf2c40b256f92f489bf680c97a1708f8db4f Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 4 Jan 2026 19:09:41 +0200 Subject: [PATCH 51/51] Increase allocated space for cosine calculations in SQ8 benchmark setup --- tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index ddf188832..f1d9ebd90 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -25,7 +25,7 @@ class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { v1 = new float[dim]; test_utils::populate_float_vec(v1, dim, 123); // Allocate vector with extra space for min, delta and cosine calculations - v2 = new uint8_t[dim + sizeof(float) * 3]; + v2 = new uint8_t[dim + sizeof(float) * 4]; test_utils::populate_float_vec_to_sq8_with_metadata(v2, dim, 1234, true); } void TearDown(const ::benchmark::State &state) {