simplify inference pipeline

miguelmartin75 · miguelmartin75 · commit df085fae813f · 2025-12-16T23:56:48.000Z
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos25_predict.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos25_predict.py
@@ -506,7 +506,6 @@ def __call__(
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
         shift: float = 5.0,
-        timestep_scale: float = 0.001,
         conditional_frame_timestep: float = 0.1,
     ):
         r"""
@@ -592,7 +591,7 @@ def __call__(
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        # 1. Check inputs. Raise error if not correct
+        # Check inputs. Raise error if not correct
         self.check_inputs(prompt, height, width, prompt_embeds, callback_on_step_end_tensor_inputs)
 
         self._guidance_scale = guidance_scale
@@ -613,15 +612,15 @@ def __call__(
                         )
             self.safety_checker.to("cpu")
 
-        # 2. Define call parameters
+        # Define call parameters
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
         elif prompt is not None and isinstance(prompt, list):
             batch_size = len(prompt)
         else:
             batch_size = prompt_embeds.shape[0]
 
-        # 3. Encode input prompt
+        # Encode input prompt
         (
             prompt_embeds,
             negative_prompt_embeds,
@@ -639,10 +638,6 @@ def __call__(
         vae_dtype = self.vae.dtype
         transformer_dtype = self.transformer.dtype
 
-        # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, shift=shift, device=device, scale=timestep_scale)
-        timesteps = torch.tensor(self.scheduler.timesteps).to(transformer_dtype)
-
         num_frames_in = None
         if image is not None:
             # TODO: handle batch_size > 1
@@ -690,22 +685,26 @@ def __call__(
 
         padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)
 
-        # 6. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        # Denoising loop
+        self.scheduler.set_timesteps(num_inference_steps, shift=shift, device=device)
+        timesteps = self.scheduler.timesteps
         self._num_timesteps = len(timesteps)
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
 
-        gt_velocity = latents - cond_latent
+        gt_velocity = (latents - cond_latent) * cond_mask
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
 
                 self._current_timestep = t.cpu().item()
-                timestep = t.unsqueeze(0)
+
+                # NOTE: equivalent to t / 1000 for FlowUniPCMultistepScheduler (sigmas are in [0, 1], num_train_timesteps=1000)
+                sigma_t = torch.tensor(self.scheduler.sigmas[i]).unsqueeze(0).to(device=device, dtype=transformer_dtype)
 
                 in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents  # TODO: could use cond_indicator
                 in_latents = in_latents.to(transformer_dtype)
-                in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator) * timestep
+                in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator) * sigma_t
                 noise_pred = self.transformer(
                     hidden_states=in_latents,
                     condition_mask=cond_mask,
@@ -714,8 +713,8 @@ def __call__(
                     padding_mask=padding_mask,
                     return_dict=False,
                 )[0]
-                # NOTE: force input video latents for noise_pred by correcting velocity
-                noise_pred = gt_velocity * cond_mask + noise_pred * (1 - cond_mask)
+                # NOTE: replace velocity (noise_pred) with gt_velocity for conditioning inputs only
+                noise_pred = gt_velocity + noise_pred * (1 - cond_mask)
 
                 if self.do_classifier_free_guidance:
                     noise_pred_neg = self.transformer(
@@ -726,8 +725,8 @@ def __call__(
                         padding_mask=padding_mask,
                         return_dict=False,
                     )[0]
-                    # NOTE: force input video latents for noise_pred by correcting velocity
-                    noise_pred_neg = gt_velocity * cond_mask + noise_pred_neg * (1 - cond_mask)
+                    # NOTE: replace velocity (noise_pred) with gt_velocity for conditioning inputs only
+                    noise_pred_neg = gt_velocity + noise_pred_neg * (1 - cond_mask)
                     noise_pred = noise_pred + self.guidance_scale * (noise_pred - noise_pred_neg)
 
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
diff --git a/src/diffusers/schedulers/scheduling_flow_unipc_multistep.py b/src/diffusers/schedulers/scheduling_flow_unipc_multistep.py
@@ -157,7 +157,6 @@ def set_timesteps(
         sigmas: Optional[List[float]] = None,
         mu: Optional[Union[float, None]] = None,
         shift: Optional[Union[float, None]] = None,
-        scale: float = 0.001,
     ):
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
@@ -192,7 +191,10 @@ def set_timesteps(
                     shift = self.config.shift
                 sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)  # pyright: ignore
 
-        if self.config.final_sigmas_type == "zero":
+        if self.config.final_sigmas_type == "sigma_min":
+            # TODO(migmartin): the "sigma_min" branch below raises an error; rewrite this class
+            sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
+        elif self.config.final_sigmas_type == "zero":
             sigma_last = 0
         else:
             raise ValueError(
@@ -203,8 +205,7 @@ def set_timesteps(
         sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)  # pyright: ignore
 
         self.sigmas = torch.from_numpy(sigmas)
-        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.float32)
-        self.timesteps *= scale
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
 
         self.num_inference_steps = len(timesteps)
 
@@ -304,7 +305,6 @@ def convert_model_output(
         sigma = self.sigmas[self.step_index]
         alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
 
-        # print("sigma_t ==>", self.step_index, sigma, sigma_t, alpha_t, sample.shape, model_output.shape)
         if self.predict_x0:
             if self.config.prediction_type == "flow_prediction":
                 sigma_t = self.sigmas[self.step_index]
@@ -317,7 +317,6 @@ def convert_model_output(
 
             if self.config.thresholding:
                 x0_pred = self._threshold_sample(x0_pred)
-            # print("self.config.thresholding", self.config.thresholding)
             return x0_pred
         else:
             if self.config.prediction_type == "flow_prediction":