From b696f63655829732c9fa1ee129b8a155cbe1c9ee Mon Sep 17 00:00:00 2001
From: Mr-Neutr0n <64578610+Mr-Neutr0n@users.noreply.github.com>
Date: Tue, 10 Feb 2026 00:04:56 +0530
Subject: [PATCH 1/2] Add bounds check for AuraFlow positional embedding indices

When the input latent spatial dimensions exceed the positional embedding
grid size, the centered-crop index calculation produces negative or
out-of-range indices. This causes a fatal CUDA assertion error that
destroys the CUDA context for the entire process.

Add an explicit check that raises a clear ValueError instead of silently
producing invalid indices.
---
 .../models/transformers/auraflow_transformer_2d.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py
index e3732662e408..98fa5e3051e9 100644
--- a/src/diffusers/models/transformers/auraflow_transformer_2d.py
+++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py
@@ -76,6 +76,12 @@ def pe_selection_index_based_on_dim(self, h, w):
         h_p, w_p = h // self.patch_size, w // self.patch_size
         h_max, w_max = int(self.pos_embed_max_size**0.5), int(self.pos_embed_max_size**0.5)
 
+        if h_p > h_max or w_p > w_max:
+            raise ValueError(
+                f"Input latent size ({h_p}x{w_p} patches) exceeds the positional embedding grid "
+                f"({h_max}x{w_max}). Use a smaller resolution or increase pos_embed_max_size."
+            )
+
         # Calculate the top-left corner indices for the centered patch grid
         starth = h_max // 2 - h_p // 2
         startw = w_max // 2 - w_p // 2

From d09af3b2300aea4aea55fa045c9b963467807713 Mon Sep 17 00:00:00 2001
From: Mr-Neutr0n <64578610+Mr-Neutr0n@users.noreply.github.com>
Date: Tue, 10 Feb 2026 00:10:55 +0530
Subject: [PATCH 2/2] Fix AudioLDM2Pipeline crash when language_model lacks GenerationMixin

Some AudioLDM2 model repos (e.g. anhnct/audioldm2_gigaspeech) ship a
GPT2Model instead of GPT2LMHeadModel. GPT2Model does not inherit from
GenerationMixin, so calling _get_initial_cache_position() and
_update_model_kwargs_for_generation() on it raises an AttributeError.

This adds a hasattr check for GenerationMixin methods and provides
equivalent inline fallback logic (cache_position initialization,
past_key_values / attention_mask / cache_position updates) so the
pipeline works with both GPT2Model and GPT2LMHeadModel.

Fixes #12630
---
 .../pipelines/audioldm2/pipeline_audioldm2.py | 40 ++++++++++++++-----
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 452fc3c01b27..8bef2dcb7659 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -324,17 +324,28 @@ def generate_language_model(
             `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                 The sequence of generated hidden-states.
         """
-        cache_position_kwargs = {}
-        if is_transformers_version("<", "4.52.1"):
-            cache_position_kwargs["input_ids"] = inputs_embeds
+        max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
+
+        has_generation_mixin = hasattr(self.language_model, "_get_initial_cache_position")
+
+        if has_generation_mixin:
+            cache_position_kwargs = {}
+            if is_transformers_version("<", "4.52.1"):
+                cache_position_kwargs["input_ids"] = inputs_embeds
+            else:
+                cache_position_kwargs["seq_length"] = inputs_embeds.shape[0]
+                cache_position_kwargs["device"] = (
+                    self.language_model.device if getattr(self, "language_model", None) is not None else self.device
+                )
+                cache_position_kwargs["model_kwargs"] = model_kwargs
+            model_kwargs = self.language_model._get_initial_cache_position(**cache_position_kwargs)
         else:
-            cache_position_kwargs["seq_length"] = inputs_embeds.shape[0]
-            cache_position_kwargs["device"] = (
+            # Fallback for models without GenerationMixin (e.g. GPT2Model instead of GPT2LMHeadModel).
+            # Set initial cache_position as a simple arange over the input sequence length.
+            device = (
                 self.language_model.device if getattr(self, "language_model", None) is not None else self.device
             )
-            cache_position_kwargs["model_kwargs"] = model_kwargs
-        max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
-        model_kwargs = self.language_model._get_initial_cache_position(**cache_position_kwargs)
+            model_kwargs["cache_position"] = torch.arange(inputs_embeds.shape[1], device=device)
 
         for _ in range(max_new_tokens):
             # prepare model inputs
@@ -349,7 +360,18 @@ def generate_language_model(
             inputs_embeds = torch.cat([inputs_embeds, next_hidden_states[:, -1:, :]], dim=1)
 
             # Update generated hidden states, model inputs, and length for next step
-            model_kwargs = self.language_model._update_model_kwargs_for_generation(output, model_kwargs)
+            if has_generation_mixin:
+                model_kwargs = self.language_model._update_model_kwargs_for_generation(output, model_kwargs)
+            else:
+                # Fallback: manually update past_key_values, attention_mask, and cache_position.
+                if hasattr(output, "past_key_values") and output.past_key_values is not None:
+                    model_kwargs["past_key_values"] = output.past_key_values
+                if "attention_mask" in model_kwargs and model_kwargs["attention_mask"] is not None:
+                    attention_mask = model_kwargs["attention_mask"]
+                    model_kwargs["attention_mask"] = torch.cat(
+                        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                    )
+                model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
 
         return inputs_embeds[:, -max_new_tokens:, :]