Skip to content

Commit 7a6d86d

Browse files
authored
speedup model cpu offload (#136)
* speedup model cpu offload * fix
1 parent e259c49 commit 7a6d86d

File tree

4 files changed

+40
-14
lines changed

4 files changed

+40
-14
lines changed

diffsynth_engine/models/qwen_image/qwen_image_dit.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,7 @@ def forward(
315315

316316
class QwenImageDiT(PreTrainedModel):
317317
converter = QwenImageDiTStateDictConverter()
318+
_supports_parallelization = True
318319

319320
def __init__(
320321
self,
@@ -423,3 +424,6 @@ def from_state_dict(
423424
model.load_state_dict(state_dict, assign=True)
424425
model.to(device=device, dtype=dtype, non_blocking=True)
425426
return model
427+
428+
def get_fsdp_modules(self):
    """Return the names of submodule containers to be wrapped by FSDP.

    The transformer block list is the only component sharded for this model.
    """
    fsdp_module_names = ["transformer_blocks"]
    return fsdp_module_names

diffsynth_engine/pipelines/base.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from PIL import Image
77

88
from diffsynth_engine.configs import BaseConfig, BaseStateDicts
9-
from diffsynth_engine.utils.offload import enable_sequential_cpu_offload
9+
from diffsynth_engine.utils.offload import enable_sequential_cpu_offload, offload_model_to_dict, restore_model_from_dict
1010
from diffsynth_engine.utils.fp8_linear import enable_fp8_autocast
1111
from diffsynth_engine.utils.gguf import load_gguf_checkpoint
1212
from diffsynth_engine.utils import logging
@@ -40,6 +40,7 @@ def __init__(
4040
self.dtype = dtype
4141
self.offload_mode = None
4242
self.model_names = []
43+
self._offload_param_dict = {}
4344

4445
@classmethod
4546
def from_pretrained(cls, model_path_or_config: str | BaseConfig) -> "BasePipeline":
@@ -243,14 +244,13 @@ def _enable_model_cpu_offload(self):
243244
for model_name in self.model_names:
244245
model = getattr(self, model_name)
245246
if model is not None:
246-
model.to("cpu")
247+
self._offload_param_dict[model_name] = offload_model_to_dict(model)
247248
self.offload_mode = "cpu_offload"
248249

249250
def _enable_sequential_cpu_offload(self):
250251
for model_name in self.model_names:
251252
model = getattr(self, model_name)
252253
if model is not None:
253-
model.to("cpu")
254254
enable_sequential_cpu_offload(model, self.device)
255255
self.offload_mode = "sequential_cpu_offload"
256256

@@ -277,20 +277,12 @@ def load_models_to_device(self, load_model_names: List[str] | None = None):
277277
for model_name in self.model_names:
278278
if model_name not in load_model_names:
279279
model = getattr(self, model_name)
280-
if (
281-
model is not None
282-
and (p := next(model.parameters(), None)) is not None
283-
and p.device != torch.device("cpu")
284-
):
285-
model.to("cpu")
280+
if model is not None and (p := next(model.parameters(), None)) is not None and p.device.type != "cpu":
281+
restore_model_from_dict(model, self._offload_param_dict[model_name])
286282
# load the needed models to device
287283
for model_name in load_model_names:
288284
model = getattr(self, model_name)
289-
if (
290-
model is not None
291-
and (p := next(model.parameters(), None)) is not None
292-
and p.device != torch.device(self.device)
293-
):
285+
if model is not None and (p := next(model.parameters(), None)) is not None and p.device.type != self.device:
294286
model.to(self.device)
295287
# free the CUDA cache
296288
empty_cache()

diffsynth_engine/pipelines/wan_video.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -584,4 +584,11 @@ def from_pretrained(cls, model_path_or_config: WanPipelineConfig) -> "WanVideoPi
584584
use_fsdp=config.use_fsdp,
585585
device="cuda",
586586
)
587+
if config.use_torch_compile:
588+
pipe.compile()
587589
return pipe
590+
591+
def compile(self):
    """Compile the DiT module(s) for faster inference.

    The primary DiT is always compiled; the optional second DiT is
    compiled only when present.
    """
    self.dit.compile()
    secondary_dit = self.dit2
    if secondary_dit is not None:
        secondary_dit.compile()

diffsynth_engine/utils/offload.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import torch
22
import torch.nn as nn
3+
from typing import Dict
34

45

56
def enable_sequential_cpu_offload(module: nn.Module, device: str = "cuda"):
7+
module = module.to("cpu")
68
if len(list(module.children())) == 0:
79
if len(list(module.parameters())) > 0 or len(list(module.buffers())) > 0:
810
# leaf module with parameters or buffers
@@ -50,3 +52,24 @@ def _forward_hook(module: nn.Module, input_, output_):
5052
module.register_forward_pre_hook(_forward_pre_hook)
5153
module.register_forward_hook(_forward_hook)
5254
setattr(module, "_cpu_offload_enabled", True)
55+
56+
57+
def offload_model_to_dict(module: nn.Module) -> Dict[str, torch.Tensor]:
    """Move ``module`` to CPU and snapshot its parameter/buffer storages.

    The returned mapping (qualified name -> CPU tensor) holds the exact
    storages now attached to the module, so a later
    ``restore_model_from_dict`` can re-attach them without reallocating.
    When an accelerator is available, storages are moved to pinned
    (page-locked) memory so subsequent host-to-device copies are faster
    and can be asynchronous.

    Args:
        module: Model to offload; it is moved to CPU in place.

    Returns:
        Dict mapping parameter/buffer names to their CPU tensors.
    """
    module = module.to("cpu")
    # pin_memory() requires an accelerator; skip it on CPU-only machines
    # instead of raising, so offload still works there.
    pin = torch.cuda.is_available()
    offload_param_dict: Dict[str, torch.Tensor] = {}
    for name, param in module.named_parameters(recurse=True):
        if pin:
            param.data = param.data.pin_memory()
        offload_param_dict[name] = param.data
    for name, buffer in module.named_buffers(recurse=True):
        if pin:
            buffer.data = buffer.data.pin_memory()
        offload_param_dict[name] = buffer.data
    return offload_param_dict
67+
68+
69+
def restore_model_from_dict(module: nn.Module, offload_param_dict: Dict[str, torch.Tensor]):
    """Re-attach previously offloaded CPU storages to ``module``.

    For every parameter and buffer whose qualified name appears in
    ``offload_param_dict``, point its ``.data`` back at the saved tensor;
    names absent from the mapping are left untouched.
    """
    for named_tensors in (
        module.named_parameters(recurse=True),
        module.named_buffers(recurse=True),
    ):
        for name, tensor in named_tensors:
            saved = offload_param_dict.get(name)
            if saved is not None:
                tensor.data = saved

0 commit comments

Comments
 (0)