From 6019a6edca6a980994f7b570096d3f5fb1e33c85 Mon Sep 17 00:00:00 2001
From: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com>
Date: Thu, 15 Jan 2026 21:47:21 -0800
Subject: [PATCH 1/2] Define kv cache scaling factor as amax / 448

Unified the FP8 and NVFP4 kv cache scaling factor definition so the same
checkpoint can be used for both FP8 and NVFP4 kv cache quantization
deployment.

Signed-off-by: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com>
---
 modelopt/torch/export/unified_export_hf.py | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index b46f2dd70..8dd97b81f 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -56,9 +56,6 @@
     set_expert_quantizer_amax,
 )
 from .model_config import (
-    KV_CACHE_FP8,
-    KV_CACHE_NVFP4,
-    KV_CACHE_NVFP4_AFFINE,
     QUANTIZATION_FP8,
     QUANTIZATION_FP8_PB_REAL,
     QUANTIZATION_FP8_PC_PT,
@@ -647,19 +644,6 @@ def _export_transformers_checkpoint(

     quant_config = get_quant_config(model, is_modelopt_qlora=is_modelopt_qlora)

-    kv_cache_max_bound = 0
-    kv_cache_format = quant_config["quantization"]["kv_cache_quant_algo"]
-
-    cache_bound_mapping = {
-        KV_CACHE_NVFP4: 6 * 448,
-        KV_CACHE_NVFP4_AFFINE: 6 * 448,
-        KV_CACHE_FP8: 448,
-    }
-
-    # Only update kv_cache_max_bound if a quantization is applied.
-    if kv_cache_format != QUANTIZATION_NONE:
-        kv_cache_max_bound = cache_bound_mapping.get(kv_cache_format)
-
     # Process all quantized modules and export weights
     _process_quantized_modules(model, dtype, is_modelopt_qlora)

@@ -669,6 +653,8 @@
     else:
         quantized_state_dict = model.state_dict()

+    # We define kv cache scale as amax / 448 for both FP8 and NVFP4 KV cache quantization.
+    kv_cache_max_bound = 448
     quantized_state_dict = postprocess_state_dict(
         quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora
     )

From 852316f39f5d08fe87992104e4943ef17144bc18 Mon Sep 17 00:00:00 2001
From: Chenjie Luo
Date: Fri, 16 Jan 2026 17:29:12 +0000
Subject: [PATCH 2/2] Fix

Signed-off-by: Chenjie Luo
---
 modelopt/torch/export/unified_export_hf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 8dd97b81f..d2c19b8e1 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -655,6 +655,7 @@ def _export_transformers_checkpoint(

     # We define kv cache scale as amax / 448 for both FP8 and NVFP4 KV cache quantization.
     kv_cache_max_bound = 448
+    kv_cache_format = quant_config["quantization"]["kv_cache_quant_algo"]
     quantized_state_dict = postprocess_state_dict(
         quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora
     )
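
Note (not part of the patches above): a minimal sketch of what the unified definition means for an exported checkpoint, assuming the per-tensor KV-cache scale is simply the calibrated amax divided by 448 (the FP8 E4M3 maximum), as the new kv_cache_max_bound suggests. The helper and tensor names below are illustrative only and are not ModelOpt APIs.

    # Illustrative sketch, not code from this patch: derive the exported
    # KV-cache scaling factor from a calibrated amax under the unified rule
    # scale = amax / 448, where 448 is the FP8 E4M3 max representable value.
    import torch

    KV_CACHE_MAX_BOUND = 448.0  # single bound now used for FP8 and NVFP4 KV cache

    def kv_cache_scale_from_amax(kv_amax: torch.Tensor) -> torch.Tensor:
        """Hypothetical helper: one scale definition serves both KV-cache formats."""
        return kv_amax.float() / KV_CACHE_MAX_BOUND

    # Example: a calibrated |max| of 22.4 exports a scale of 0.05, regardless of
    # whether the checkpoint is later deployed with FP8 or NVFP4 KV cache.
    print(kv_cache_scale_from_amax(torch.tensor(22.4)))  # tensor(0.0500)

Before this change, the removed cache_bound_mapping used a 6 * 448 bound for the NVFP4 formats and 448 for FP8, so the exported scales differed per format; with a single bound of 448, the same checkpoint can be deployed with either FP8 or NVFP4 KV-cache quantization, which is the rationale stated in the first commit message.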