6 changes: 3 additions & 3 deletions CHANGELOGS.rst
@@ -1,10 +1,10 @@
Change Logs
===========

0.8.12
++++++

0.9.0
+++++

* :pr:`403`: update the serialization of SlidingWindowCache to include parameter sliding_window, patch for sdpa_mask
* :pr:`400`, :pr:`401`, :pr:`402`: improve InputObserver (investigations), add it to the documentation
* :pr:`399`: update CI
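
A minimal sketch (not part of the diff) of the per-layer sliding_window support exercised by the updated tests below; the import paths are assumed from this repository's layout, and transformers>=4.57 is needed for DynamicSlidingWindowLayer:

import torch
import transformers
from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache
from onnx_diagnostic.torch_export_patches import torch_export_patches

# Build a cache mixing layer types; cls_kwargs passes extra constructor
# arguments to each layer, here the sliding_window of the second one.
cache = make_dynamic_cache(
    [
        (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
        (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
    ],
    cls_layers=[
        transformers.cache_utils.DynamicLayer,
        transformers.cache_utils.DynamicSlidingWindowLayer,
    ],
    cls_kwargs=[{}, dict(sliding_window=12)],
)
assert cache.layers[1].sliding_window == 12

# The updated serialization keeps sliding_window through flatten/unflatten.
with torch_export_patches(patch_transformers=True):
    flat, spec = torch.utils._pytree.tree_flatten(cache)
    cache2 = torch.utils._pytree.tree_unflatten(flat, spec)
assert cache2.layers[1].sliding_window == 12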

22 changes: 17 additions & 5 deletions _doc/final/plot_export_gemma3_tiny_input_observer.py
@@ -9,6 +9,7 @@
"""

import pandas
import torch
from onnx_diagnostic import doc
from onnx_diagnostic.helpers import string_type
from onnx_diagnostic.export.api import to_onnx
@@ -25,9 +26,10 @@
pipe = pipeline(
"image-text-to-text",
model=model_id,
device="cuda",
device="cpu",
trust_remote_code=True,
max_new_tokens=3,
dtype=torch.float16,
)
messages = [
{"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
@@ -50,7 +52,9 @@

# %%
# Captures inputs and outputs for the model.
observer = InputObserver()
observer = InputObserver(
missing=dict(pixel_values=torch.empty((0, 3, 896, 896), dtype=torch.float16))
)
with (
register_additional_serialization_functions(patch_transformers=True),
observer(pipe.model),
@@ -76,7 +80,7 @@


filename = "plot_export_gemma3_tiny_input_observer.onnx"
with torch_export_patches(patch_transformers=True):
with torch_export_patches(patch_transformers=True, patch_torch=True, stop_if_static=2):
to_onnx(
pipe.model,
args=(),
@@ -88,8 +92,16 @@

# %%
# Let's measure the discrepancies.
data = observer.check_discrepancies(filename, progress_bar=True)
print(pandas.DataFrame(data))
data = observer.check_discrepancies(filename, progress_bar=True, atol=1e-2, include_io=True)
df = pandas.DataFrame(data)
df.to_excel("plot_export_gemma3_tiny_input_observer.xlsx")
print(df)

# %%
# Let's show the errors.
for row in data:
if not row["SUCCESS"] and "error" in row:
print(row["error"])


# %%
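
For reference, a minimal sketch (not part of the diff) of the observer workflow used above, reduced to a toy model: pixel_values only appears in the first call, so the new missing argument supplies a placeholder for the later iterations. The import path of InputObserver is an assumption based on the test layout in this PR.

import torch
from onnx_diagnostic.investigate import InputObserver  # assumed import path


class TinyModel(torch.nn.Module):
    def forward(self, input_ids=None, pixel_values=None, attention_mask=None):
        return input_ids


model = TinyModel()
observer = InputObserver(missing=dict(pixel_values=torch.empty((0, 3, 896, 896))))
with observer(model):
    # first call: pixel_values is present
    model(
        input_ids=torch.ones((1, 8), dtype=torch.int64),
        pixel_values=torch.ones((1, 3, 896, 896)),
        attention_mask=torch.ones((1, 8), dtype=torch.int64),
    )
    # next call: pixel_values is absent, the observer falls back to the placeholder
    model(
        input_ids=torch.ones((1, 1), dtype=torch.int64),
        attention_mask=torch.ones((1, 9), dtype=torch.int64),
    )

# Dynamic shapes inferred from the captured calls, batch dimension included.
shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True)
print(shapes)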
2 changes: 1 addition & 1 deletion _doc/index.rst
@@ -240,7 +240,7 @@ The function replaces dynamic dimensions defined as strings by
Older versions
==============

* `0.8.12 <../v0.8.12/index.html>`_
* `0.9.0 <../v0.9.0/index.html>`_
* `0.8.11 <../v0.8.11/index.html>`_
* `0.7.16 <../v0.7.16/index.html>`_
* `0.6.3 <../v0.6.3/index.html>`_
22 changes: 22 additions & 0 deletions _unittests/ut_helpers/test_cache_helper.py
@@ -373,6 +373,28 @@ def test_make_dynamic_cache_2_types(self):
)
self.assertEqual(0, max_diff(cache, cache)["abs"])

@requires_transformers("4.57")
def test_make_dynamic_cache_2_types_kwargs(self):
cache = make_dynamic_cache(
[
(torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
(torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
],
cls_layers=[
transformers.cache_utils.DynamicLayer,
transformers.cache_utils.DynamicSlidingWindowLayer,
],
cls_kwargs=[{}, dict(sliding_window=12)],
)
text = self.string_type(cache, with_shape=True)
self.assertEqual(
"DynamicCache(DynamicLayer(T1s4x5x6x7, T1s4x5x6x7), "
"DynamicSlidingWindowLayer(T1s4x5x6x7, T1s4x5x6x7))",
text,
)
self.assertEqual(0, max_diff(cache, cache)["abs"])
self.assertEqual(cache.layers[1].sliding_window, 12)

@requires_transformers("4.57")
def test_unflatten_flatten_mixed_layers(self):
with torch_export_patches(patch_transformers=True):
80 changes: 60 additions & 20 deletions _unittests/ut_investigate/test_input_observer.py
@@ -695,7 +695,7 @@ def forward(self, x, y, z=None, w=None):
exporter="custom",
filename=proto_name,
)
data = observer.check_discrepancies(proto_name, progress_bar=False)
data = observer.check_discrepancies(proto_name, progress_bar=False, include_io=True)
df = pandas.DataFrame(data)
self.assertLess(df["abs"].max(), 1e-5)

@@ -878,25 +878,65 @@ def forward(self, x=None, y=None):
# self.assertEqual(2, len(args))
# self.assertEqual(len([v for v in args.values() if v is not None]), 2)

def test_infer_dynamic_shapes_exception(self):
"""
dict(input_ids:T7s1x282,
pixel_values:T1s1x3x896x896,
attention_mask:T7s1x282,
position_ids:T7s1x282,
token_type_ids:T7s1x282,cache_position:T7s282
)
dict(input_ids:T7s1x1,attention_mask:T7s1x283,position_ids:T7s1x1,
past_key_values:DynamicCache(
DynamicSlidingWindowLayer(T16s1x1x282x32, T16s1x1x282x32),
DynamicLayer(T16s1x1x282x32, T16s1x1x282x32)),
token_type_ids:T7s1x1,cache_position:T7s1)
dict(input_ids:T7s1x1,attention_mask:T7s1x284,position_ids:T7s1x1,
past_key_values:DynamicCache(
DynamicSlidingWindowLayer(T16s1x1x283x32, T16s1x1x283x32),
DynamicLayer(T16s1x1x283x32, T16s1x1x283x32)),
token_type_ids:T7s1x1,cache_position:T7s1)
"""
def test_infer_dynamic_shapes_missing(self):
class Model(torch.nn.Module):
def forward(
self,
input_ids=None,
pixel_values=None,
attention_mask=None,
position_ids=None,
past_key_values=None,
token_type_ids=None,
cache_position=None,
):
return input_ids

inputs = [
dict(
input_ids=torch.ones((1, 282), dtype=torch.int64),
pixel_values=torch.ones((1, 3, 896, 896), dtype=torch.int64),
attention_mask=torch.ones((1, 282), dtype=torch.int64),
position_ids=torch.ones((1, 282), dtype=torch.int64),
token_type_ids=torch.ones((1, 282), dtype=torch.int64),
cache_position=torch.ones((282,), dtype=torch.int64),
),
dict(
input_ids=torch.ones((1, 1), dtype=torch.int64),
attention_mask=torch.ones((1, 283), dtype=torch.int64),
position_ids=torch.ones((1, 1), dtype=torch.int64),
past_key_values=torch.rand((1, 1, 282, 32)),
token_type_ids=torch.ones((1, 1), dtype=torch.int64),
cache_position=torch.ones((1,), dtype=torch.int64),
),
dict(
input_ids=torch.ones((1, 1), dtype=torch.int64),
attention_mask=torch.ones((1, 284), dtype=torch.int64),
position_ids=torch.ones((1, 1), dtype=torch.int64),
past_key_values=torch.rand((1, 1, 283, 32)),
token_type_ids=torch.ones((1, 1), dtype=torch.int64),
cache_position=torch.ones((1,), dtype=torch.int64),
),
]

model = Model()
observer = InputObserver(missing=dict(pixel_values=torch.empty((0, 3, 896, 896))))
with observer(model):
for kwargs in inputs:
model(**kwargs)

shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True)
cst = torch.export.Dim.DYNAMIC
expected = {
"input_ids": {0: cst, 1: cst},
"pixel_values": {0: cst},
"attention_mask": {0: cst, 1: cst},
"position_ids": {0: cst, 1: cst},
"past_key_values": {0: cst, 2: cst},
"token_type_ids": {0: cst, 1: cst},
"cache_position": {0: cst},
}
self.assertEqual(expected, shapes)


if __name__ == "__main__":
89 changes: 89 additions & 0 deletions _unittests/ut_investigate/test_input_observer_transformers.py
@@ -216,6 +216,95 @@ def forward(self, cache):
args["cache"].cross_attention_cache.layers[0].keys.shape, (1, 6, 1500, 64)
)

@requires_transformers("4.57")
def test_infer_dynamic_shapes_missing_pixels(self):
import transformers

class Model(torch.nn.Module):
def forward(
self,
input_ids=None,
pixel_values=None,
attention_mask=None,
position_ids=None,
past_key_values=None,
token_type_ids=None,
cache_position=None,
):
return input_ids

inputs = [
dict(
input_ids=torch.ones((1, 282), dtype=torch.int64),
pixel_values=torch.ones((1, 3, 896, 896), dtype=torch.int64),
attention_mask=torch.ones((1, 282), dtype=torch.int64),
position_ids=torch.ones((1, 282), dtype=torch.int64),
token_type_ids=torch.ones((1, 282), dtype=torch.int64),
cache_position=torch.ones((282,), dtype=torch.int64),
),
dict(
input_ids=torch.ones((1, 1), dtype=torch.int64),
attention_mask=torch.ones((1, 283), dtype=torch.int64),
position_ids=torch.ones((1, 1), dtype=torch.int64),
past_key_values=make_dynamic_cache(
[
(torch.rand((1, 1, 282, 32)), torch.rand((1, 1, 282, 32))),
(torch.rand((1, 1, 282, 32)), torch.rand((1, 1, 282, 32))),
],
cls_layers=[
transformers.cache_utils.DynamicSlidingWindowLayer,
transformers.cache_utils.DynamicLayer,
],
),
token_type_ids=torch.ones((1, 1), dtype=torch.int64),
cache_position=torch.ones((1,), dtype=torch.int64),
),
dict(
input_ids=torch.ones((1, 1), dtype=torch.int64),
attention_mask=torch.ones((1, 284), dtype=torch.int64),
position_ids=torch.ones((1, 1), dtype=torch.int64),
past_key_values=make_dynamic_cache(
[
(torch.rand((1, 1, 283, 32)), torch.rand((1, 1, 283, 32))),
(torch.rand((1, 1, 283, 32)), torch.rand((1, 1, 283, 32))),
],
cls_layers=[
transformers.cache_utils.DynamicSlidingWindowLayer,
transformers.cache_utils.DynamicLayer,
],
),
token_type_ids=torch.ones((1, 1), dtype=torch.int64),
cache_position=torch.ones((1,), dtype=torch.int64),
),
]

model = Model()
observer = InputObserver(missing=dict(pixel_values=torch.empty((0, 3, 896, 896))))
with (
register_additional_serialization_functions(patch_transformers=True),
observer(model),
):
for kwargs in inputs:
model(**kwargs)

shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True)
cst = torch.export.Dim.DYNAMIC
expected = {
"input_ids": {0: cst, 1: cst},
"pixel_values": {0: cst},
"attention_mask": {0: cst, 1: cst},
"position_ids": {0: cst, 1: cst},
"past_key_values": [
{0: cst, 2: cst},
{0: cst, 2: cst},
{0: cst, 2: cst},
{0: cst, 2: cst},
],
"token_type_ids": {0: cst, 1: cst},
"cache_position": {0: cst},
}
self.assertEqual(expected, shapes)


if __name__ == "__main__":
unittest.main(verbosity=2)
@@ -275,13 +275,19 @@ def test_sliding_window_cache_flatten(self):
@unittest.skipIf(make_sliding_window_cache, "transformers<5")
def test_sliding_window_cache_flatten5(self):
cache = make_dynamic_cache(
[(torch.rand((4, 4, 4, 4)), torch.rand((4, 4, 4, 4)))],
[
(torch.rand((4, 4, 4, 4)), torch.rand((4, 4, 4, 4))),
(torch.rand((4, 4, 4, 4)), torch.rand((4, 4, 4, 4))),
],
cls_layers="DynamicSlidingWindowLayer",
cls_kwargs=[dict(sliding_window=11), dict(sliding_window=12)],
)
self.assertEqual(cache.layers[0].sliding_window, 11)
self.assertEqual(cache.layers[1].sliding_window, 12)
with torch_export_patches(patch_transformers=True):
flat, _spec = torch.utils._pytree.tree_flatten(cache)
self.assertEqual(
"#2[T1s4x4x4x4,T1s4x4x4x4]",
"#4[T1s4x4x4x4,T1s4x4x4x4,T1s4x4x4x4,T1s4x4x4x4]",
self.string_type(flat, with_shape=True),
)
cache2 = torch.utils._pytree.tree_unflatten(flat, _spec)
Expand All @@ -292,6 +298,8 @@ def test_sliding_window_cache_flatten5(self):
self.assertEqual(
[type(lay) for lay in cache.layers], [type(lay) for lay in cache2.layers]
)
self.assertEqual(cache2.layers[0].sliding_window, 11)
self.assertEqual(cache2.layers[1].sliding_window, 12)

@ignore_warnings(UserWarning)
@requires_torch("2.7.99")
34 changes: 34 additions & 0 deletions _unittests/ut_torch_export_patches/test_patch_transformers.py
@@ -65,6 +65,40 @@ def test_sdpa_mask_recent_torch(self):
got = patched_sdpa_mask_recent_torch(**kwargs)
self.assertEqualArray(expected, got)

@requires_transformers("4.99")
def test_sdpa_mask_patched(self):
sdpa_mask = transformers.masking_utils.sdpa_mask
patched_sdpa_mask = patch_transformers.patched_sdpa_mask
kwargs = {
"batch_size": 1,
"cache_position": torch.tensor([3], dtype=torch.int64),
"kv_length": 4,
"kv_offset": 0,
"mask_function": transformers.masking_utils.causal_mask_function,
"attention_mask": torch.tensor([[True, True, True, True]]),
"local_size": None,
"allow_is_causal_skip": True,
"allow_is_bidirectional_skip": False,
}
expected = sdpa_mask(**kwargs)
got = patched_sdpa_mask(**kwargs)
self.assertEqual(expected, got)

kwargs = {
"batch_size": 1,
"cache_position": torch.tensor([3], dtype=torch.int64),
"kv_length": 4,
"kv_offset": 0,
"mask_function": transformers.masking_utils.causal_mask_function,
"attention_mask": torch.tensor([[True, True, True, True]]),
"local_size": None,
"allow_is_causal_skip": False,
"allow_is_bidirectional_skip": False,
}
expected = sdpa_mask(**kwargs)
got = patched_sdpa_mask(**kwargs)
self.assertEqualArray(expected, got)

@requires_transformers("4.99")
def test_sdpa_mask_recent_torch_is_running(self):
def _copy_vmap_for_bhqkv(mask_function, bh_indices=True):
2 changes: 1 addition & 1 deletion onnx_diagnostic/__init__.py
@@ -3,5 +3,5 @@
Functions, classes to dig into a model when this one is right, slow, wrong...
"""

__version__ = "0.8.12"
__version__ = "0.9.0"
__author__ = "Xavier Dupré"