huggingface
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos25_predict.py
Lines changed: 12 additions & 11 deletions b/src/diffusers/pipelines/cosmos/pipeline_cosmos25_predict.py
Lines changed: 12 additions & 11 deletions
diff --git a/src/diffusers/schedulers/scheduling_flow_unipc_multistep.py
Lines changed: 37 additions & 35 deletions b/src/diffusers/schedulers/scheduling_flow_unipc_multistep.py
Lines changed: 37 additions & 35 deletions
@@ -118,7 +118,7 @@ def retrieve_latents(
         ... ).frames[0]
         >>> export_to_video(video, "video2world.mp4", fps=16)
 
-        >>> # To produce a single-frame image instead of a world clip, set num_frames=1 and
+        >>> # To produce a single-frame image instead of a world (video) clip, set num_frames=1 and
         >>> # save the first frame: pipe(..., num_frames=1).frames[0][0].
         ```
 """
@@ -201,7 +201,6 @@ def _get_prompt_embeds(
         dtype = dtype or self.text_encoder.dtype
         prompt = [prompt] if isinstance(prompt, str) else prompt
 
-        # Tokenize prompts
         input_ids_batch = []
 
         for sample_idx in range(len(prompt)):
@@ -257,7 +256,7 @@ def _get_prompt_embeds(
 
         return prompt_embeds
 
-    # Copied from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.encode_prompt with num_videos_per_prompt->num_videos_per_prompt
+    # Modified from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.encode_prompt
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
@@ -505,7 +504,6 @@ def __call__(
         ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
-        shift: float = 5.0,
         conditional_frame_timestep: float = 0.1,
     ):
         r"""
@@ -656,13 +654,16 @@ def __call__(
         video = self.video_processor.preprocess_video(video, height, width)
 
         # pad with last frame (for video2world)
-        if video.shape[2] < num_frames:
+        num_frames_out = num_frames
+        if video.shape[2] < num_frames_out:
             assert batch_size == 1, "batch_size must be 1 for padding frames"
-            n_pad_frames = num_frames - num_frames_in
+            n_pad_frames = num_frames_out - num_frames_in
             last_frame = video[0, :, -1:, :, :]  # [C, T==1, H, W]
             pad_frames = last_frame.repeat(1, 1, n_pad_frames, 1, 1)  # [B, C, T, H, W]
             video = torch.cat((video, pad_frames), dim=2)
 
+        assert num_frames_in <= num_frames_out, f"expected ({num_frames_in=}) <= ({num_frames_out=})"
+
         video = video.to(device=device, dtype=vae_dtype)
 
         num_channels_latents = self.transformer.config.in_channels - 1
@@ -686,7 +687,7 @@ def __call__(
         padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)
 
         # Denoising loop
-        self.scheduler.set_timesteps(num_inference_steps, shift=shift, device=device)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps
         self._num_timesteps = len(timesteps)
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
@@ -699,10 +700,10 @@ def __call__(
 
                 self._current_timestep = t.cpu().item()
 
-                # NOTE: sigmas are in [0, 1] in FlowUniPCMultistepScheduler
-                sigma_t = torch.tensor(self.scheduler.sigmas[i]).unsqueeze(0).to(device=device, dtype=transformer_dtype)
+                # NOTE: assumes sigma(t) \in [0, 1]
+                sigma_t = torch.tensor(self.scheduler.sigmas[i].item()).unsqueeze(0).to(device=device, dtype=transformer_dtype)
 
-                in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents  # TODO: could use cond_indicator
+                in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents
                 in_latents = in_latents.to(transformer_dtype)
                 in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator) * sigma_t
                 noise_pred = self.transformer(
@@ -725,7 +726,7 @@ def __call__(
                         padding_mask=padding_mask,
                         return_dict=False,
                     )[0]
-                    # NOTE: replace velocity (noise_pred) with gt_velocity for conditioning inputs only
+                    # NOTE: replace velocity (noise_pred_neg) with gt_velocity for conditioning inputs only
                     noise_pred_neg = gt_velocity + noise_pred_neg * (1 - cond_mask)
                     noise_pred = noise_pred + self.guidance_scale * (noise_pred - noise_pred_neg)
 
 
@@ -1,7 +1,4 @@
-# Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py
-
-import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -11,35 +8,38 @@
 from diffusers.utils import deprecate
 
 
-def _get_karras_sigmas(self, num_steps: int, sigma_max: float, sigma_min: float, rho: int, final_sigmas_type: str):
+def _get_karras_sigmas(num_train_steps: int, num_steps: int, sigma_max: float, sigma_min: float, rho: int, final_sigmas_type: str):
     sigmas = np.arange(num_steps + 1, dtype=np.float32) / num_steps
     min_inv_rho = sigma_min ** (1 / rho)
     max_inv_rho = sigma_max ** (1 / rho)
     sigmas = (max_inv_rho + sigmas * (min_inv_rho - max_inv_rho)) ** rho
     sigmas = sigmas / (1 + sigmas)
 
-    if self.config.final_sigmas_type == "zero":
+    if final_sigmas_type == "zero":
         sigma_last = 0
+    elif final_sigmas_type == "sigma_min":
+        sigma_last = sigmas[-1]
     else:
         raise ValueError(
-            f"`final_sigmas_type` must be 'zero' but got {self.config.final_sigmas_type}"
+            f"`final_sigmas_type` must be 'zero' or 'sigma_min' but got {final_sigmas_type}"
         )
 
-    timesteps = torch.from_numpy(sigmas * self.config.num_train_timesteps).to(torch.int64)
-    sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)  # pyright: ignore
+    timesteps = torch.from_numpy(sigmas * num_train_steps).to(torch.int64)
+    sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
     sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)
     return sigmas, timesteps
 
 
 class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
     """
-    `FlowUniPCMultistepScheduler` is the UniPC algorithm [1] for flow matching [2], but strictly uses the Karras sigmas [3].
+    `FlowUniPCMultistepScheduler` is the UniPC algorithm[1] for flow
+    matching[2], but strictly uses the Karras sigmas [3] (i.e. it follows the EDMEulerScheduler).
+
+    Note this is a simplified version of `UniPCMultistepScheduler`, as it:
+    1. Does not have variance preserving sigmas
+    2. Does not store betas and other variables used by `UniPCMultistepScheduler`
+    3. Assumes prediction_type == "flow_prediction" (this parameter is removed)
 
-    Note this a simplified version of `UniPCMultistepScheduler`, as:
-    1. it does not have variance preserving sigmas
-    2. it does not store betas and other variables used by `UniPCMultistepScheduler`
-    3. it assumes prediction_type == "flow_prediction" (this variable is removed from `FlowUniPCMultistepScheduler`)
-    
     References:
         [1] Wang, Chong, et al. "UniPC: A Unified Predictor-Corrector Framework for Fast Sampling of Diffusion Models" https://arxiv.org/abs/2302.04867
         [2] Lipman, Chen, et al. "Flow matching for generative modeling." https://arxiv.org/abs/2210.02747
@@ -97,7 +97,7 @@ def __init__(
         lower_order_final: bool = True,
         disable_corrector: List[int] = [],
         solver_p: SchedulerMixin = None,
-        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+        final_sigmas_type: Literal["zero", "sigma_min"] = "zero",
         rho: int = 7,
         sigma_max: float = 200.0,
         sigma_min: float = 0.01,
@@ -109,21 +109,15 @@ def __init__(
                 raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
 
         self.predict_x0 = predict_x0
-        self.num_inference_steps = None
         self.disable_corrector = disable_corrector
+        self.solver_p = solver_p
+        self.num_inference_steps = None
 
-        self.sigmas, self.timesteps = _get_karras_sigmas(self, num_train_timesteps, sigma_max, sigma_min, rho, final_sigmas_type)
+        self.sigmas, self.timesteps = _get_karras_sigmas(num_train_timesteps, num_train_timesteps, sigma_max, sigma_min, rho, final_sigmas_type)
         self.sigma_min = self.sigmas[-1].item()
         self.sigma_max = self.sigmas[0].item()
 
-        self.last_sample = None
-        self._step_index = None
-        self._begin_index = None
-        self.model_outputs = [None] * self.config.solver_order
-        self.timestep_list = [None] * self.config.solver_order
-        self.lower_order_nums = 0
-        self.solver_p = self.config.solver_p
-
+        self._reset_state(solver_order)
 
     @property
     def step_index(self):
@@ -169,22 +163,29 @@ def set_timesteps(
         """
         assert sigmas is None, "sigmas are not supported for FlowUniPCMultistepScheduler"
 
-        self.sigmas, self.timesteps = _get_karras_sigmas(self, num_inference_steps, self.config.sigma_max, self.config.sigma_min, self.config.rho, self.config.final_sigmas_type)
+        self.sigmas, self.timesteps = _get_karras_sigmas(self.config.num_train_timesteps, num_inference_steps, self.config.sigma_max, self.config.sigma_min, self.config.rho, self.config.final_sigmas_type)
         self.num_inference_steps = len(self.timesteps)
 
         self.sigma_min = self.sigmas[-1].item()
         self.sigma_max = self.sigmas[0].item()
 
+        self.sigmas = self.sigmas.to(device)
+        self.timesteps = self.timesteps.to(device)
+        self._reset_state()
+
+    def _reset_state(self, solver_order: Optional[int] = None):
+        """
+        Reset the multistep solver state; `solver_order` falls back to `self.config.solver_order` when omitted.
+        """
+        solver_order = solver_order or self.config.solver_order
+        self.model_outputs = [None] * solver_order
+        self.timestep_list = [None] * solver_order
+        self.lower_order_nums = 0
         self.last_sample = None
         self._step_index = None
         self._begin_index = None
-        self.model_outputs = [None] * self.config.solver_order
-        self.timestep_list = [None] * self.config.solver_order
-        self.lower_order_nums = 0
-        self.solver_p = self.config.solver_p
-
-        self.sigmas = self.sigmas.to(device)
-        self.timesteps = self.timesteps.to(device)
+        if self.solver_p and self.num_inference_steps is not None:  # __init__ calls this before any schedule exists
+            self.solver_p.set_timesteps(self.num_inference_steps, device=self.sigmas.device)  # no `device` in scope here
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
@@ -575,6 +576,7 @@ def _init_step_index(self, timestep):
         else:
             self._step_index = self._begin_index
 
+    # Modified from diffusers.schedulers.scheduling_unipc_multistep.UniPCMultistepScheduler.step
     def step(
         self,
         model_output: torch.Tensor,
@@ -638,7 +640,7 @@ def step(
             this_order = self.config.solver_order
 
         self.this_order = min(this_order, self.lower_order_nums + 1)  # warmup for multistep
-        assert self.this_order > 0
+        assert self.this_order > 0, "expected this_order > 0, this could be due to duplicate timesteps"
 
         self.last_sample = sample
         prev_sample = self.multistep_uni_p_bh_update(