From b696f63655829732c9fa1ee129b8a155cbe1c9ee Mon Sep 17 00:00:00 2001
From: Mr-Neutr0n <64578610+Mr-Neutr0n@users.noreply.github.com>
Date: Tue, 10 Feb 2026 00:04:56 +0530
Subject: [PATCH 1/2] Add bounds check for AuraFlow positional embedding indices

When the input latent spatial dimensions exceed the positional embedding
grid size, the centered-crop index calculation produces negative or
out-of-range indices. This causes a fatal CUDA assertion error that
destroys the CUDA context for the entire process.

Add an explicit check that raises a clear ValueError instead of silently
producing invalid indices.
---
 .../models/transformers/auraflow_transformer_2d.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py
index e3732662e408..98fa5e3051e9 100644
--- a/src/diffusers/models/transformers/auraflow_transformer_2d.py
+++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py
@@ -76,6 +76,12 @@ def pe_selection_index_based_on_dim(self, h, w):
         h_p, w_p = h // self.patch_size, w // self.patch_size
         h_max, w_max = int(self.pos_embed_max_size**0.5), int(self.pos_embed_max_size**0.5)
 
+        if h_p > h_max or w_p > w_max:
+            raise ValueError(
+                f"Input latent size ({h_p}x{w_p} patches) exceeds the positional embedding grid "
+                f"({h_max}x{w_max}). Use a smaller resolution or increase pos_embed_max_size."
+            )
+
         # Calculate the top-left corner indices for the centered patch grid
         starth = h_max // 2 - h_p // 2
         startw = w_max // 2 - w_p // 2

From d09af3b2300aea4aea55fa045c9b963467807713 Mon Sep 17 00:00:00 2001
From: Mr-Neutr0n <64578610+Mr-Neutr0n@users.noreply.github.com>
Date: Tue, 10 Feb 2026 00:10:55 +0530
Subject: [PATCH 2/2] Fix AudioLDM2Pipeline crash when language_model lacks GenerationMixin

Some AudioLDM2 model repos (e.g. anhnct/audioldm2_gigaspeech) ship a
GPT2Model instead of GPT2LMHeadModel. GPT2Model does not inherit from
GenerationMixin, so calling _get_initial_cache_position() and
_update_model_kwargs_for_generation() on it raises an AttributeError.

This adds a hasattr check for GenerationMixin methods and provides
equivalent inline fallback logic (cache_position initialization,
past_key_values / attention_mask / cache_position updates) so the
pipeline works with both GPT2Model and GPT2LMHeadModel.

Fixes #12630
---
 .../pipelines/audioldm2/pipeline_audioldm2.py | 40 ++++++++++++++-----
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 452fc3c01b27..8bef2dcb7659 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -324,17 +324,28 @@ def generate_language_model(
             `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                 The sequence of generated hidden-states.
         """
-        cache_position_kwargs = {}
-        if is_transformers_version("<", "4.52.1"):
-            cache_position_kwargs["input_ids"] = inputs_embeds
+        max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
+
+        has_generation_mixin = hasattr(self.language_model, "_get_initial_cache_position")
+
+        if has_generation_mixin:
+            cache_position_kwargs = {}
+            if is_transformers_version("<", "4.52.1"):
+                cache_position_kwargs["input_ids"] = inputs_embeds
+            else:
+                cache_position_kwargs["seq_length"] = inputs_embeds.shape[0]
+                cache_position_kwargs["device"] = (
+                    self.language_model.device if getattr(self, "language_model", None) is not None else self.device
+                )
+                cache_position_kwargs["model_kwargs"] = model_kwargs
+            model_kwargs = self.language_model._get_initial_cache_position(**cache_position_kwargs)
         else:
-            cache_position_kwargs["seq_length"] = inputs_embeds.shape[0]
-            cache_position_kwargs["device"] = (
+            # Fallback for models without GenerationMixin (e.g. GPT2Model instead of GPT2LMHeadModel).
+            # Set initial cache_position as a simple arange over the input sequence length.
+            device = (
                 self.language_model.device if getattr(self, "language_model", None) is not None else self.device
             )
-            cache_position_kwargs["model_kwargs"] = model_kwargs
-        max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
-        model_kwargs = self.language_model._get_initial_cache_position(**cache_position_kwargs)
+            model_kwargs["cache_position"] = torch.arange(inputs_embeds.shape[1], device=device)
 
         for _ in range(max_new_tokens):
             # prepare model inputs
@@ -349,7 +360,18 @@ def generate_language_model(
             inputs_embeds = torch.cat([inputs_embeds, next_hidden_states[:, -1:, :]], dim=1)
 
             # Update generated hidden states, model inputs, and length for next step
-            model_kwargs = self.language_model._update_model_kwargs_for_generation(output, model_kwargs)
+            if has_generation_mixin:
+                model_kwargs = self.language_model._update_model_kwargs_for_generation(output, model_kwargs)
+            else:
+                # Fallback: manually update past_key_values, attention_mask, and cache_position.
+                if hasattr(output, "past_key_values") and output.past_key_values is not None:
+                    model_kwargs["past_key_values"] = output.past_key_values
+                if "attention_mask" in model_kwargs and model_kwargs["attention_mask"] is not None:
+                    attention_mask = model_kwargs["attention_mask"]
+                    model_kwargs["attention_mask"] = torch.cat(
+                        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                    )
+                model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
 
         return inputs_embeds[:, -max_new_tokens:, :]