From 6019a6edca6a980994f7b570096d3f5fb1e33c85 Mon Sep 17 00:00:00 2001
From: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com>
Date: Thu, 15 Jan 2026 21:47:21 -0800
Subject: [PATCH 1/2] Define kv cache scaling factor as amax / 448

Unified the FP8 and NVFP4 kv cache scaling factor definition so the same
checkpoint can be used for both FP8 and NVFP4 kv cache quantization
deployment.

Signed-off-by: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com>
---
 modelopt/torch/export/unified_export_hf.py | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index b46f2dd70..8dd97b81f 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -56,9 +56,6 @@
     set_expert_quantizer_amax,
 )
 from .model_config import (
-    KV_CACHE_FP8,
-    KV_CACHE_NVFP4,
-    KV_CACHE_NVFP4_AFFINE,
     QUANTIZATION_FP8,
     QUANTIZATION_FP8_PB_REAL,
     QUANTIZATION_FP8_PC_PT,
@@ -647,19 +644,6 @@ def _export_transformers_checkpoint(

     quant_config = get_quant_config(model, is_modelopt_qlora=is_modelopt_qlora)

-    kv_cache_max_bound = 0
-    kv_cache_format = quant_config["quantization"]["kv_cache_quant_algo"]
-
-    cache_bound_mapping = {
-        KV_CACHE_NVFP4: 6 * 448,
-        KV_CACHE_NVFP4_AFFINE: 6 * 448,
-        KV_CACHE_FP8: 448,
-    }
-
-    # Only update kv_cache_max_bound if a quantization is applied.
-    if kv_cache_format != QUANTIZATION_NONE:
-        kv_cache_max_bound = cache_bound_mapping.get(kv_cache_format)
-
     # Process all quantized modules and export weights
     _process_quantized_modules(model, dtype, is_modelopt_qlora)

@@ -669,6 +653,8 @@
     else:
         quantized_state_dict = model.state_dict()

+    # We define kv cache scale as amax / 448 for both FP8 and NVFP4 KV cache quantization.
+    kv_cache_max_bound = 448
     quantized_state_dict = postprocess_state_dict(
         quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora
     )

From 852316f39f5d08fe87992104e4943ef17144bc18 Mon Sep 17 00:00:00 2001
From: Chenjie Luo
Date: Fri, 16 Jan 2026 17:29:12 +0000
Subject: [PATCH 2/2] Fix

Signed-off-by: Chenjie Luo
---
 modelopt/torch/export/unified_export_hf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 8dd97b81f..d2c19b8e1 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -655,6 +655,7 @@ def _export_transformers_checkpoint(

     # We define kv cache scale as amax / 448 for both FP8 and NVFP4 KV cache quantization.
     kv_cache_max_bound = 448
+    kv_cache_format = quant_config["quantization"]["kv_cache_quant_algo"]
     quantized_state_dict = postprocess_state_dict(
         quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora
     )
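
Note (not part of the patches above): a minimal sketch of what the unified definition means for an exported checkpoint, assuming the per-tensor KV-cache scale is simply the calibrated amax divided by 448 (the FP8 E4M3 maximum), as the new kv_cache_max_bound suggests. The helper and tensor names below are illustrative only and are not ModelOpt APIs.

    # Illustrative sketch, not code from this patch: derive the exported
    # KV-cache scaling factor from a calibrated amax under the unified rule
    # scale = amax / 448, where 448 is the FP8 E4M3 max representable value.
    import torch

    KV_CACHE_MAX_BOUND = 448.0  # single bound now used for FP8 and NVFP4 KV cache

    def kv_cache_scale_from_amax(kv_amax: torch.Tensor) -> torch.Tensor:
        """Hypothetical helper: one scale definition serves both KV-cache formats."""
        return kv_amax.float() / KV_CACHE_MAX_BOUND

    # Example: a calibrated |max| of 22.4 exports a scale of 0.05, regardless of
    # whether the checkpoint is later deployed with FP8 or NVFP4 KV cache.
    print(kv_cache_scale_from_amax(torch.tensor(22.4)))  # tensor(0.0500)

Before this change, the removed cache_bound_mapping used a 6 * 448 bound for the NVFP4 formats and 448 for FP8, so the exported scales differed per format; with a single bound of 448, the same checkpoint can be deployed with either FP8 or NVFP4 KV-cache quantization, which is the rationale stated in the first commit message.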