Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
746bf31
Add SQ8-to-SQ8 distance functions and optimizations
dor-forer Dec 28, 2025
8697a3e
Add SQ8-to-SQ8 benchmark tests and update related scripts
dor-forer Dec 28, 2025
e0ce268
Format
dor-forer Dec 28, 2025
ab6b077
Organizing
dor-forer Dec 28, 2025
931e339
Add full sq8 benchmarks
dor-forer Dec 28, 2025
a56474d
Optimize the sq8 sq8
dor-forer Dec 28, 2025
a25f45c
Optimize SQ8 distance functions for NEON by reducing operations and i…
dor-forer Dec 28, 2025
0ad941e
format
dor-forer Dec 28, 2025
68cd068
Add NEON DOTPROD-optimized distance functions for SQ8-to-SQ8 calculat…
dor-forer Dec 28, 2025
0b4b568
PR
dor-forer Dec 28, 2025
d0fd2e4
Remove NEON DOTPROD-optimized distance functions for INT8, UINT8, and…
dor-forer Dec 28, 2025
9de6163
Fix vector layout documentation by removing inv_norm from comments in…
dor-forer Dec 28, 2025
63a46a1
Remove 'constexpr' from ones vector declaration in NEON inner product…
dor-forer Dec 28, 2025
525f8da
Refactor distance functions to remove inv_norm parameter and update d…
dor-forer Dec 29, 2025
13a477b
Update SQ8 Cosine test to normalize both input vectors and adjust dis…
dor-forer Dec 29, 2025
c18000e
Rename 'compressed' to 'quantized' in SQ8 functions for clarity and c…
dor-forer Dec 29, 2025
bbf810e
Implement SQ8-to-SQ8 distance functions with precomputed sum and norm…
dor-forer Dec 29, 2025
dbbb7d9
Add edge case tests for SQ8-to-SQ8 precomputed cosine distance functions
dor-forer Dec 29, 2025
36ab068
Refactor SQ8 test cases to use CreateSQ8QuantizedVector for vector po…
dor-forer Dec 29, 2025
00617d7
Implement SQ8-to-SQ8 precomputed distance functions using ARM NEON, S…
dor-forer Dec 29, 2025
4331d91
Implement SQ8-to-SQ8 precomputed inner product and cosine functions; …
dor-forer Dec 29, 2025
2e7b30d
Refactor SQ8 distance functions and remove precomputed variants
dor-forer Dec 30, 2025
a111e36
Refactor SQ8 distance functions and tests for improved clarity and co…
dor-forer Dec 30, 2025
d510b8a
Refactor SQ8 benchmarks by removing precomputed variants and updating…
dor-forer Dec 30, 2025
ee26740
format
dor-forer Dec 30, 2025
afe1a4f
Remove serialization benchmark script for HNSW disk serialization
dor-forer Dec 30, 2025
a31f95c
Refactor SQ8 distance functions and tests to remove precomputed norm …
dor-forer Dec 31, 2025
f12ecf4
format
dor-forer Dec 31, 2025
0e36030
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Dec 31, 2025
fdc16c6
Refactor SQ8 distance tests to use compressed vectors and improve nor…
dor-forer Dec 31, 2025
e5f519c
Update vector layout documentation to reflect removal of sum of squar…
dor-forer Dec 31, 2025
db1e671
Refactor SQ8 distance functions to remove norm computation
dor-forer Jan 1, 2026
d5b8587
Update SQ8-to-SQ8 distance function comment to remove norm reference
dor-forer Jan 1, 2026
91f48df
Refactor cosine similarity functions to remove unnecessary subtractio…
dor-forer Jan 1, 2026
b660111
Refactor cosine similarity functions to use specific SIMD implementat…
dor-forer Jan 1, 2026
9166cac
Refactor benchmark setup to allocate additional space for sum and sum…
dor-forer Jan 4, 2026
f28f4e7
Add CPU feature checks to disable optimizations for AArch64 in SQ8 di…
dor-forer Jan 4, 2026
e50dc45
Add CPU feature checks to disable optimizations for AArch64 in SQ8 di…
dor-forer Jan 4, 2026
6bbbc38
Fix formatting issues in SQ8 inner product function and clean up cond…
dor-forer Jan 4, 2026
66a5f88
Enhance SQ8 Inner Product Implementations with Optimized Dot Product …
dor-forer Jan 4, 2026
d7972e9
Fix header guard duplication and update test assertion for floating-p…
dor-forer Jan 4, 2026
a8075bf
Add missing pragma once directive in NEON header files
dor-forer Jan 4, 2026
cddc497
Refactor SQ8 distance functions for improved performance and clarity
dor-forer Jan 4, 2026
4f0fec7
Update SQ8 vector population functions to include metadata and adjust…
dor-forer Jan 4, 2026
8ab4192
Refactor SQ8 inner product functions for improved clarity and perform…
dor-forer Jan 4, 2026
8c59cb2
Rename inner product implementation functions for AVX2 and AVX512 for…
dor-forer Jan 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 64 additions & 7 deletions src/VecSim/spaces/IP/IP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,25 @@ using bfloat16 = vecsim_types::bfloat16;
using float16 = vecsim_types::float16;

float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
float min_val, float delta, float inv_norm) {
float min_val, float delta) {
float res = 0;
for (size_t i = 0; i < dimension; i++) {
float dequantized_V2 = (pVect2v[i] * delta + min_val);
res += pVect1v[i] * dequantized_V2;
}
return res * inv_norm;
return res;
}

float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
const auto *pVect1 = static_cast<const float *>(pVect1v);
const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
// pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply
// it. it is structured as [quantized values (int8_t * dim)][min_val (float)][delta
// (float)][inv_norm (float)] The last two values are used to dequantize the vector.
// (float)] The last two values are used to dequantize the vector.
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
// Compute inner product with dequantization
const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, 1.0f);
const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
return 1.0f - res;
}

Expand All @@ -44,10 +44,67 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
// Get quantization parameters
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
// Compute inner product with dequantization
const float res =
FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm);
const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
return 1.0f - res;
}

// SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum
// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]
// Returns the inner-product *distance*: 1 - IP.
float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const auto *pVect1 = static_cast<const uint8_t *>(pVect1v);
    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);

    // Inner product of the raw quantized values: Σ(q1[i]*q2[i]).
    // Accumulate in a 64-bit integer: each term fits in 16 bits (<= 255*255),
    // so integer accumulation is exact for any dimension, whereas a float
    // accumulator silently drops low-order bits once the partial sum passes
    // 2^24. This also matches the SIMD kernels (VNNI/DOTPROD), which
    // accumulate the quantized dot product in integer registers.
    uint64_t product = 0;
    for (size_t i = 0; i < dimension; i++) {
        product += static_cast<uint32_t>(pVect1[i]) * static_cast<uint32_t>(pVect2[i]);
    }

    // Quantization metadata stored immediately after the uint8 payload of pVect1.
    const float min_val1 = *reinterpret_cast<const float *>(pVect1 + dimension);
    const float delta1 = *reinterpret_cast<const float *>(pVect1 + dimension + sizeof(float));
    const float sum1 = *reinterpret_cast<const float *>(pVect1 + dimension + 2 * sizeof(float));

    // Quantization metadata stored immediately after the uint8 payload of pVect2.
    const float min_val2 = *reinterpret_cast<const float *>(pVect2 + dimension);
    const float delta2 = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
    const float sum2 = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

    // Algebraic expansion using the precomputed per-vector sums
    // (sum = Σ of the original float values):
    // IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2
    float res = min_val1 * sum2 + min_val2 * sum1 -
                static_cast<float>(dimension) * min_val1 * min_val2 +
                delta1 * delta2 * static_cast<float>(product);
    return 1.0f - res;
}

// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum
// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]
// Assumes the original float vectors were normalized before quantization, so the
// inner product equals the cosine similarity; returns 1 - IP as the distance.
float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const auto *pVect1 = static_cast<const uint8_t *>(pVect1v);
    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);

    // Inner product of the raw quantized values: Σ(q1[i]*q2[i]).
    // Accumulate in a 64-bit integer: each term fits in 16 bits (<= 255*255),
    // so integer accumulation is exact for any dimension, whereas a float
    // accumulator silently drops low-order bits once the partial sum passes
    // 2^24. This also matches the SIMD kernels (VNNI/DOTPROD), which
    // accumulate the quantized dot product in integer registers.
    uint64_t product = 0;
    for (size_t i = 0; i < dimension; i++) {
        product += static_cast<uint32_t>(pVect1[i]) * static_cast<uint32_t>(pVect2[i]);
    }

    // Extract metadata from the end of each vector.
    // Quantization parameters of pVect1.
    const float min_val1 = *reinterpret_cast<const float *>(pVect1 + dimension);
    const float delta1 = *reinterpret_cast<const float *>(pVect1 + dimension + sizeof(float));
    const float sum1 = *reinterpret_cast<const float *>(pVect1 + dimension + 2 * sizeof(float));

    // Quantization parameters of pVect2.
    const float min_val2 = *reinterpret_cast<const float *>(pVect2 + dimension);
    const float delta2 = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
    const float sum2 = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

    // Algebraic expansion using the precomputed per-vector sums
    // (sum = Σ of the original float values):
    // IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2
    float res = min_val1 * sum2 + min_val2 * sum1 -
                static_cast<float>(dimension) * min_val1 * min_val2 +
                delta1 * delta2 * static_cast<float>(product);
    return 1.0f - res;
}

Expand Down
8 changes: 8 additions & 0 deletions src/VecSim/spaces/IP/IP.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensio
// pVect1v vector of type fp32 and pVect2v vector of type uint8
float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);

// SQ8-to-SQ8: Both vectors are uint8 quantized with precomputed sum
// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]
float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);

// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized with precomputed sum
// Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]
float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);

float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);

double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
Expand Down
9 changes: 1 addition & 8 deletions src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,7 @@ float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v,

template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
// Get dequantization parameters from the end of quantized vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

// Calculate inner product using common implementation with normalization
float ip = SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);

// For cosine, we need to account for the vector norms
// The inv_norm parameter is stored after min_val and delta in the quantized vector
return 1.0f - ip * inv_norm;
return 1.0f - ip;
}
15 changes: 4 additions & 11 deletions src/VecSim/spaces/IP/IP_AVX2_SQ8.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVe
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float *pVect1 = static_cast<const float *>(pVect1v);
// pVect2 is a quantized uint8_t vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
Expand Down Expand Up @@ -89,19 +89,12 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen

template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
return 1.0f - SQ8_InnerProductImp_AVX2<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
// Get dequantization parameters from the end of quantized vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

// Calculate inner product using common implementation with normalization
float ip = SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);

// For cosine, we need to account for the vector norms
// The inv_norm parameter is stored after min_val and delta in the quantized vector
return 1.0f - ip * inv_norm;
float ip = SQ8_InnerProductImp_AVX2<residual>(pVect1v, pVect2v, dimension);
return 1.0f - ip;
}
78 changes: 78 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h"
#include <immintrin.h>

/**
* SQ8-to-SQ8 distance functions using AVX512 VNNI with precomputed sum.
* These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors,
* where BOTH vectors are uint8 quantized.
*
* Uses precomputed sum stored in the vector data,
* eliminating the need to compute them during distance calculation.
*
* Uses algebraic optimization to leverage integer VNNI instructions:
*
* With sum = Σv[i] (sum of original float values), the formula is:
* IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2
*
* Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]).
* The dot product is computed using the efficient UINT8_InnerProductImp which uses
* VNNI instructions (_mm512_dpwssd_epi32) for native integer dot product computation.
*
* Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]
*/

// Shared inner-product core for two SQ8 vectors carrying a precomputed sum.
// Delegates the integer dot product to the UINT8 AVX512 VNNI kernel.
template <unsigned char residual> // 0..63
float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) {
    // Raw dot product of the quantized bytes, Σ(q1[i]*q2[i]), computed with
    // VNNI instructions (_mm512_dpwssd_epi32) inside UINT8_InnerProductImp.
    const int qdot = UINT8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);

    // Each vector stores three floats right after its uint8 payload:
    // [min (dequantization offset)] [delta (dequantization scale)] [sum of original floats]
    const auto *bytes1 = static_cast<const uint8_t *>(pVec1v);
    const auto *bytes2 = static_cast<const uint8_t *>(pVec2v);
    const float *meta1 = reinterpret_cast<const float *>(bytes1 + dimension);
    const float *meta2 = reinterpret_cast<const float *>(bytes2 + dimension);
    const float min1 = meta1[0], delta1 = meta1[1], sum1 = meta1[2];
    const float min2 = meta2[0], delta2 = meta2[1], sum2 = meta2[2];

    // Algebraic expansion with precomputed sums (sum = Σ of original floats):
    // IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2
    return min1 * sum2 + min2 * sum1 + delta1 * delta2 * static_cast<float>(qdot) -
           static_cast<float>(dimension) * min1 * min2;
}

// SQ8-to-SQ8 Inner Product distance function
// Returns 1 - inner_product (distance form)
template <unsigned char residual> // 0..63
float SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
                                                    size_t dimension) {
    const float ip = SQ8_SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
    return 1.0f - ip;
}

// SQ8-to-SQ8 Cosine distance function
// Returns 1 - (inner_product)
template <unsigned char residual> // 0..63
float SQ8_SQ8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
                                              size_t dimension) {
    // Vectors are assumed pre-normalized, so cosine distance coincides with the
    // inner-product distance; delegate to the IP entry point.
    return SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI<residual>(pVec1v, pVec2v, dimension);
}
1 change: 1 addition & 0 deletions src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"

static inline void InnerProductStep(uint8_t *&pVect1, uint8_t *&pVect2, __m512i &sum) {
Expand Down
11 changes: 3 additions & 8 deletions src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVe

// Common implementation for both inner product and cosine similarity
template <unsigned char residual> // 0..15
float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension,
float inv_norm = 1.0f) {
float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) {
const float *pVec1 = static_cast<const float *>(pVec1v);
const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
const float *pEnd1 = pVec1 + dimension;
Expand Down Expand Up @@ -92,7 +91,7 @@ template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
size_t dimension) {
// Calculate inner product using common implementation
float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
float ip = SQ8_InnerProductImp_AVX512<residual>(pVec1v, pVec2v, dimension);

// The inner product similarity is 1 - ip
return 1.0f - ip;
Expand All @@ -101,12 +100,8 @@ float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *
template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
size_t dimension) {
// Get the inverse norm factor stored after min_val and delta
const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
const float inv_norm = *reinterpret_cast<const float *>(pVec2 + dimension + 2 * sizeof(float));

// Calculate inner product using common implementation with normalization
float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension, inv_norm);
float ip = SQ8_InnerProductImp_AVX512<residual>(pVec1v, pVec2v, dimension);

// The cosine similarity is 1 - ip
return 1.0f - ip;
Expand Down
77 changes: 77 additions & 0 deletions src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h"
#include <arm_neon.h>

/**
* SQ8-to-SQ8 distance functions using ARM NEON DOTPROD with precomputed sum.
* These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors,
* where BOTH vectors are uint8 quantized.
*
* Uses precomputed sum stored in the vector data,
* eliminating the need to compute them during distance calculation.
*
* Uses algebraic optimization with DOTPROD instruction:
*
* With sum = Σv[i] (sum of original float values), the formula is:
* IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2
*
* Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]).
* The dot product is computed using the efficient UINT8_InnerProductImp which uses
* the DOTPROD instruction (vdotq_u32) for native uint8 dot product computation.
*
* Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)]
*/

// Shared inner-product core for two SQ8 vectors carrying a precomputed sum.
// Delegates the quantized dot product to the UINT8 NEON DOTPROD kernel.
template <unsigned char residual> // 0..63
float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void *pVec2v,
                                                  size_t dimension) {
    // Raw dot product of the quantized bytes, Σ(q1[i]*q2[i]), computed with the
    // DOTPROD instruction (vdotq_u32) inside UINT8_InnerProductImp.
    // NOTE(review): result is held as float here, while the AVX512 variant uses
    // int — presumably matching UINT8_InnerProductImp's return type; confirm.
    const float qdot = UINT8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);

    // Each vector stores three floats right after its uint8 payload:
    // [min (dequantization offset)] [delta (dequantization scale)] [sum of original floats]
    const auto *bytes1 = static_cast<const uint8_t *>(pVec1v);
    const auto *bytes2 = static_cast<const uint8_t *>(pVec2v);
    const float *meta1 = reinterpret_cast<const float *>(bytes1 + dimension);
    const float *meta2 = reinterpret_cast<const float *>(bytes2 + dimension);
    const float min1 = meta1[0], delta1 = meta1[1], sum1 = meta1[2];
    const float min2 = meta2[0], delta2 = meta2[1], sum2 = meta2[2];

    // Algebraic expansion with precomputed sums (sum = Σ of original floats):
    // IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2
    return min1 * sum2 + min2 * sum1 + delta1 * delta2 * qdot -
           static_cast<float>(dimension) * min1 * min2;
}

// SQ8-to-SQ8 Inner Product distance function
// Returns 1 - inner_product (distance form)
template <unsigned char residual> // 0..63
float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v,
                                              size_t dimension) {
    const float ip =
        SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP<residual>(pVec1v, pVec2v, dimension);
    return 1.0f - ip;
}

// SQ8-to-SQ8 Cosine distance function
// Returns 1 - inner_product (assumes vectors are pre-normalized)
template <unsigned char residual> // 0..63
float SQ8_SQ8_CosineSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, size_t dimension) {
    // With normalized inputs, cosine distance equals the IP distance; delegate.
    return SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD<residual>(pVec1v, pVec2v, dimension);
}
1 change: 1 addition & 0 deletions src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

Expand Down
10 changes: 1 addition & 9 deletions src/VecSim/spaces/IP/IP_NEON_SQ8.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,15 +114,7 @@ float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size

template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);

// Get quantization parameters
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

// Compute inner product with dequantization using the common function
const float res = SQ8_InnerProductSIMD16_NEON_IMP<residual>(pVect1v, pVect2v, dimension);

// For cosine, we need to account for the vector norms
// The inv_norm parameter is stored after min_val and delta in the quantized vector
return 1.0f - res * inv_norm;
return 1.0f - res;
}
Loading
Loading