From 36e222954544921c8016aa4367307dbe9ea3312d Mon Sep 17 00:00:00 2001
From: chuxij
Date: Sat, 7 Feb 2026 10:53:49 +0000
Subject: [PATCH 1/3] Add ACE-Step pipeline for text-to-music generation

## What does this PR do?

This PR adds support for the ACE-Step pipeline, a text-to-music model that generates high-quality, variable-length stereo music at 48kHz from text prompts and optional lyrics. The implementation includes:

- **AceStepDiTModel**: A Diffusion Transformer (DiT) model that operates in the latent space using flow matching
- **AceStepPipeline**: The main pipeline for text-to-music generation with support for lyrics conditioning
- **AceStepConditionEncoder**: Condition encoder that combines text, lyric, and timbre embeddings
- **Conversion script**: Script to convert ACE-Step checkpoint weights to Diffusers format
- **Comprehensive tests**: Full test coverage for the pipeline and models
- **Documentation**: API documentation for the pipeline and transformer model

## Key Features

- Text-to-music generation with optional lyrics support
- Multi-language lyrics support (English, Chinese, Japanese, Korean, and more)
- Flow matching with custom timestep schedules
- Turbo model variant optimized for 8 inference steps
- Variable-length audio generation (configurable duration)

## Technical Details

ACE-Step comprises three main components:

1. **Oobleck autoencoder (VAE)**: Compresses waveforms into 25Hz latent representations
2. **Qwen3-based text encoder**: Encodes text prompts and lyrics for conditioning
3. **Diffusion Transformer (DiT)**: Operates in the latent space using flow matching

The pipeline supports multiple shift parameters (1.0, 2.0, 3.0) for different timestep schedules, with the turbo model designed for 8 inference steps using `shift=3.0`.
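
## Example Usage

A minimal usage sketch of the new pipeline, mirroring the example added in `docs/source/en/api/pipelines/ace_step.md` (the `ACE-Step/ACE-Step-v1-5-turbo` checkpoint id, call arguments, and output handling come from that example):

```python
import torch
import soundfile as sf
from diffusers import AceStepPipeline

# Load the turbo variant (designed for 8 inference steps with shift=3.0)
pipe = AceStepPipeline.from_pretrained("ACE-Step/ACE-Step-v1-5-turbo", torch_dtype=torch.bfloat16)
pipe = pipe.to("cuda")

# Generate 30 seconds of 48kHz stereo music from a text prompt and optional lyrics
audio = pipe(
    prompt="A beautiful piano piece with soft melodies and gentle rhythm",
    lyrics="[verse]\nSoft notes in the morning light\n[chorus]\nMusic fills the air tonight",
    audio_duration=30.0,
    num_inference_steps=8,
).audios

sf.write("output.wav", audio[0].T.cpu().float().numpy(), 48000)
```
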
## Testing All tests pass successfully: - Model forward pass tests - Pipeline basic functionality tests - Batch processing tests - Latent output tests - Return dict tests Run tests with: ```bash pytest tests/pipelines/ace_step/test_ace_step.py -v ``` ## Code Quality - Code formatted with `make style` - Quality checks passed with `make quality` - All tests passing ## References - Original codebase: [ACE-Step/ACE-Step](https://github.com/ACE-Step/ACE-Step) - Paper: [ACE-Step: A Step Towards Music Generation Foundation Model](https://github.com/ACE-Step/ACE-Step) --- docs/source/en/_toctree.yml | 4 + .../en/api/models/ace_step_transformer.md | 19 + docs/source/en/api/pipelines/ace_step.md | 58 ++ scripts/convert_ace_step_to_diffusers.py | 320 ++++++++ src/diffusers/__init__.py | 6 + src/diffusers/models/__init__.py | 2 + src/diffusers/models/transformers/__init__.py | 1 + .../transformers/ace_step_transformer.py | 714 ++++++++++++++++++ src/diffusers/pipelines/__init__.py | 5 + src/diffusers/pipelines/ace_step/__init__.py | 50 ++ .../pipelines/ace_step/modeling_ace_step.py | 562 ++++++++++++++ .../pipelines/ace_step/pipeline_ace_step.py | 539 +++++++++++++ src/diffusers/utils/dummy_pt_objects.py | 15 + .../dummy_torch_and_transformers_objects.py | 30 + tests/pipelines/ace_step/__init__.py | 0 tests/pipelines/ace_step/test_ace_step.py | 335 ++++++++ 16 files changed, 2660 insertions(+) create mode 100644 docs/source/en/api/models/ace_step_transformer.md create mode 100644 docs/source/en/api/pipelines/ace_step.md create mode 100644 scripts/convert_ace_step_to_diffusers.py create mode 100644 src/diffusers/models/transformers/ace_step_transformer.py create mode 100644 src/diffusers/pipelines/ace_step/__init__.py create mode 100644 src/diffusers/pipelines/ace_step/modeling_ace_step.py create mode 100644 src/diffusers/pipelines/ace_step/pipeline_ace_step.py create mode 100644 tests/pipelines/ace_step/__init__.py create mode 100644 tests/pipelines/ace_step/test_ace_step.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 64a4222845b0..5093c3ee0455 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -318,6 +318,8 @@ title: SparseControlNetModel title: ControlNets - sections: + - local: api/models/ace_step_transformer + title: AceStepDiTModel - local: api/models/allegro_transformer3d title: AllegroTransformer3DModel - local: api/models/aura_flow_transformer2d @@ -472,6 +474,8 @@ - local: api/pipelines/auto_pipeline title: AutoPipeline - sections: + - local: api/pipelines/ace_step + title: ACE-Step - local: api/pipelines/audioldm title: AudioLDM - local: api/pipelines/audioldm2 diff --git a/docs/source/en/api/models/ace_step_transformer.md b/docs/source/en/api/models/ace_step_transformer.md new file mode 100644 index 000000000000..224d1a5777e7 --- /dev/null +++ b/docs/source/en/api/models/ace_step_transformer.md @@ -0,0 +1,19 @@ + + +# AceStepDiTModel + +A Diffusion Transformer (DiT) model for music generation from [ACE-Step](https://github.com/ACE-Step/ACE-Step). + +## AceStepDiTModel + +[[autodoc]] AceStepDiTModel diff --git a/docs/source/en/api/pipelines/ace_step.md b/docs/source/en/api/pipelines/ace_step.md new file mode 100644 index 000000000000..9bf5109aaff4 --- /dev/null +++ b/docs/source/en/api/pipelines/ace_step.md @@ -0,0 +1,58 @@ + + +# ACE-Step + +ACE-Step was proposed in [ACE-Step: A Step Towards Music Generation Foundation Model](https://github.com/ACE-Step/ACE-Step) by the ACE-Step Team. 
It is a text-to-music model that generates high-quality music with lyrics from text prompts.
+
+ACE-Step generates variable-length stereo music at 48kHz from text prompts and optional lyrics. It comprises three components: an Oobleck autoencoder (VAE) that compresses waveforms into 25Hz latent representations, a Qwen3-based text encoder for text and lyric conditioning, and a Diffusion Transformer (DiT) model that operates in the latent space of the autoencoder using flow matching.
+
+The model supports multiple languages for lyrics including English, Chinese, Japanese, Korean, French, German, Spanish, Italian, Portuguese, Russian, and more.
+
+This pipeline was contributed by [ACE-Step Team](https://github.com/ACE-Step). The original codebase can be found at [ACE-Step/ACE-Step](https://github.com/ACE-Step/ACE-Step).
+
+## Tips
+
+When constructing a prompt, keep in mind:
+
+* Descriptive prompt inputs work best; use adjectives to describe the music style, instruments, mood, and tempo.
+* The prompt should describe the overall musical characteristics (e.g., "upbeat pop song with electric guitar and drums").
+* Lyrics should be structured with tags like `[verse]`, `[chorus]`, `[bridge]`, etc.
+
+During inference:
+
+* The turbo model variant is designed for 8 inference steps with `shift=3.0`.
+* The `audio_duration` parameter controls the length of the generated music in seconds.
+* The `vocal_language` parameter should match the language of the lyrics.
+
+```python
+import torch
+import soundfile as sf
+from diffusers import AceStepPipeline
+
+pipe = AceStepPipeline.from_pretrained("ACE-Step/ACE-Step-v1-5-turbo", torch_dtype=torch.bfloat16)
+pipe = pipe.to("cuda")
+
+audio = pipe(
+    prompt="A beautiful piano piece with soft melodies and gentle rhythm",
+    lyrics="[verse]\nSoft notes in the morning light\nDancing through the air so bright\n[chorus]\nMusic fills the air tonight\nEvery note feels just right",
+    audio_duration=30.0,
+    num_inference_steps=8,
+).audios
+
+sf.write("output.wav", audio[0].T.cpu().float().numpy(), 48000)
+```
+
+## AceStepPipeline
+[[autodoc]] AceStepPipeline
+  - all
+  - __call__
diff --git a/scripts/convert_ace_step_to_diffusers.py b/scripts/convert_ace_step_to_diffusers.py
new file mode 100644
index 000000000000..ca391e1f4d97
--- /dev/null
+++ b/scripts/convert_ace_step_to_diffusers.py
@@ -0,0 +1,320 @@
+# Run this script to convert ACE-Step model weights to a diffusers pipeline.
+#
+# Usage:
+#   python scripts/convert_ace_step_to_diffusers.py \
+#     --checkpoint_dir /path/to/ACE-Step-1.5/checkpoints \
+#     --dit_config acestep-v15-turbo \
+#     --output_dir /path/to/output/ACE-Step-v1-5-turbo \
+#     --dtype bf16

+import argparse
+import json
+import os
+import shutil
+
+import torch
+from safetensors.torch import load_file, save_file
+
+
+def convert_ace_step_weights(checkpoint_dir, dit_config, output_dir, dtype_str="bf16"):
+    """
+    Convert ACE-Step checkpoint weights into a Diffusers-compatible pipeline layout.
+
+    The original ACE-Step model stores all weights in a single `model.safetensors` file
+    under `checkpoints/<dit_config>/`. This script splits the weights into separate
+    sub-model directories that can be loaded by `AceStepPipeline.from_pretrained()`.
+
+    Expected input layout:
+        checkpoint_dir/
+            <dit_config>/              # e.g., acestep-v15-turbo
+                config.json
+                model.safetensors
+                silence_latent.pt
+            vae/
+                config.json
+                diffusion_pytorch_model.safetensors
+            Qwen3-Embedding-0.6B/
+                config.json
+                model.safetensors
+                tokenizer.json
+                ...
+ + Output layout: + output_dir/ + model_index.json + transformer/ + config.json + diffusion_pytorch_model.safetensors + condition_encoder/ + config.json + diffusion_pytorch_model.safetensors + vae/ + config.json + diffusion_pytorch_model.safetensors + text_encoder/ + config.json + model.safetensors + ... + tokenizer/ + tokenizer.json + ... + """ + # Resolve paths + dit_dir = os.path.join(checkpoint_dir, dit_config) + vae_dir = os.path.join(checkpoint_dir, "vae") + text_encoder_dir = os.path.join(checkpoint_dir, "Qwen3-Embedding-0.6B") + + # Validate inputs + model_path = os.path.join(dit_dir, "model.safetensors") + config_path = os.path.join(dit_dir, "config.json") + for path, name in [ + (model_path, "model weights"), + (config_path, "config"), + (vae_dir, "VAE"), + (text_encoder_dir, "text encoder"), + ]: + if not os.path.exists(path): + raise FileNotFoundError(f"{name} not found at: {path}") + + # Select dtype + dtype_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16} + if dtype_str not in dtype_map: + raise ValueError(f"Unsupported dtype: {dtype_str}. Choose from {list(dtype_map.keys())}") + target_dtype = dtype_map[dtype_str] + + # Load original config + with open(config_path) as f: + original_config = json.load(f) + + print(f"Loading weights from {model_path}...") + state_dict = load_file(model_path) + print(f" Total keys: {len(state_dict)}") + + # ========================================================================= + # 1. Split weights by prefix + # ========================================================================= + transformer_sd = {} + condition_encoder_sd = {} + other_sd = {} # tokenizer, detokenizer, null_condition_emb + + for key, value in state_dict.items(): + if key.startswith("decoder."): + # Strip "decoder." prefix for the transformer + new_key = key[len("decoder.") :] + # The original model uses nn.Sequential for proj_in/proj_out: + # proj_in = Sequential(Lambda, Conv1d, Lambda) + # proj_out = Sequential(Lambda, ConvTranspose1d, Lambda) + # Only the Conv1d/ConvTranspose1d (index 1) has parameters. + # In diffusers, we use standalone Conv1d/ConvTranspose1d named proj_in_conv/proj_out_conv. + new_key = new_key.replace("proj_in.1.", "proj_in_conv.") + new_key = new_key.replace("proj_out.1.", "proj_out_conv.") + transformer_sd[new_key] = value.to(target_dtype) + elif key.startswith("encoder."): + # Strip "encoder." prefix for the condition encoder + new_key = key[len("encoder.") :] + condition_encoder_sd[new_key] = value.to(target_dtype) + else: + other_sd[key] = value.to(target_dtype) + + print(f" Transformer keys: {len(transformer_sd)}") + print(f" Condition encoder keys: {len(condition_encoder_sd)}") + print(f" Other keys: {len(other_sd)} ({list(other_sd.keys())[:5]}...)") + + # ========================================================================= + # 2. 
Build configs for each sub-model + # ========================================================================= + + # Transformer (DiT) config + transformer_config = { + "_class_name": "AceStepDiTModel", + "_diffusers_version": "0.33.0.dev0", + "hidden_size": original_config["hidden_size"], + "intermediate_size": original_config["intermediate_size"], + "num_hidden_layers": original_config["num_hidden_layers"], + "num_attention_heads": original_config["num_attention_heads"], + "num_key_value_heads": original_config["num_key_value_heads"], + "head_dim": original_config["head_dim"], + "in_channels": original_config["in_channels"], + "audio_acoustic_hidden_dim": original_config["audio_acoustic_hidden_dim"], + "patch_size": original_config["patch_size"], + "max_position_embeddings": original_config["max_position_embeddings"], + "rope_theta": original_config["rope_theta"], + "attention_bias": original_config["attention_bias"], + "attention_dropout": original_config["attention_dropout"], + "rms_norm_eps": original_config["rms_norm_eps"], + "use_sliding_window": original_config["use_sliding_window"], + "sliding_window": original_config["sliding_window"], + "layer_types": original_config["layer_types"], + } + + # Condition encoder config + condition_encoder_config = { + "_class_name": "AceStepConditionEncoder", + "_diffusers_version": "0.33.0.dev0", + "hidden_size": original_config["hidden_size"], + "intermediate_size": original_config["intermediate_size"], + "text_hidden_dim": original_config["text_hidden_dim"], + "timbre_hidden_dim": original_config["timbre_hidden_dim"], + "num_lyric_encoder_hidden_layers": original_config["num_lyric_encoder_hidden_layers"], + "num_timbre_encoder_hidden_layers": original_config["num_timbre_encoder_hidden_layers"], + "num_attention_heads": original_config["num_attention_heads"], + "num_key_value_heads": original_config["num_key_value_heads"], + "head_dim": original_config["head_dim"], + "max_position_embeddings": original_config["max_position_embeddings"], + "rope_theta": original_config["rope_theta"], + "attention_bias": original_config["attention_bias"], + "attention_dropout": original_config["attention_dropout"], + "rms_norm_eps": original_config["rms_norm_eps"], + "use_sliding_window": original_config["use_sliding_window"], + "sliding_window": original_config["sliding_window"], + } + + # Resolve actual tokenizer and text encoder class names for model_index.json + # (AutoTokenizer/AutoModel are not directly loadable by the pipeline loader) + from transformers import AutoConfig + from transformers import AutoModel as _AutoModel + from transformers import AutoTokenizer as _AutoTokenizer + + _tok = _AutoTokenizer.from_pretrained(text_encoder_dir) + tokenizer_class_name = type(_tok).__name__ + del _tok + + _config = AutoConfig.from_pretrained(text_encoder_dir, trust_remote_code=True) + _model_cls = _AutoModel.from_config(_config) + text_encoder_class_name = type(_model_cls).__name__ + del _model_cls, _config + + print(f" Tokenizer class: {tokenizer_class_name}") + print(f" Text encoder class: {text_encoder_class_name}") + + # model_index.json + model_index = { + "_class_name": "AceStepPipeline", + "_diffusers_version": "0.33.0.dev0", + "condition_encoder": ["diffusers", "AceStepConditionEncoder"], + "text_encoder": ["transformers", text_encoder_class_name], + "tokenizer": ["transformers", tokenizer_class_name], + "transformer": ["diffusers", "AceStepDiTModel"], + "vae": ["diffusers", "AutoencoderOobleck"], + } + + # 
========================================================================= + # 3. Save everything + # ========================================================================= + os.makedirs(output_dir, exist_ok=True) + + # Save model_index.json + model_index_path = os.path.join(output_dir, "model_index.json") + with open(model_index_path, "w") as f: + json.dump(model_index, f, indent=2) + print(f"\nSaved model_index.json -> {model_index_path}") + + # Save transformer + transformer_dir = os.path.join(output_dir, "transformer") + os.makedirs(transformer_dir, exist_ok=True) + with open(os.path.join(transformer_dir, "config.json"), "w") as f: + json.dump(transformer_config, f, indent=2) + save_file(transformer_sd, os.path.join(transformer_dir, "diffusion_pytorch_model.safetensors")) + print(f"Saved transformer ({len(transformer_sd)} keys) -> {transformer_dir}") + + # Save condition encoder + condition_encoder_dir = os.path.join(output_dir, "condition_encoder") + os.makedirs(condition_encoder_dir, exist_ok=True) + with open(os.path.join(condition_encoder_dir, "config.json"), "w") as f: + json.dump(condition_encoder_config, f, indent=2) + save_file(condition_encoder_sd, os.path.join(condition_encoder_dir, "diffusion_pytorch_model.safetensors")) + print(f"Saved condition_encoder ({len(condition_encoder_sd)} keys) -> {condition_encoder_dir}") + + # Copy VAE + vae_output_dir = os.path.join(output_dir, "vae") + if os.path.exists(vae_output_dir): + shutil.rmtree(vae_output_dir) + shutil.copytree(vae_dir, vae_output_dir) + print(f"Copied VAE -> {vae_output_dir}") + + # Copy text encoder + text_encoder_output_dir = os.path.join(output_dir, "text_encoder") + if os.path.exists(text_encoder_output_dir): + shutil.rmtree(text_encoder_output_dir) + shutil.copytree(text_encoder_dir, text_encoder_output_dir) + print(f"Copied text_encoder -> {text_encoder_output_dir}") + + # Copy tokenizer (same source as text encoder for Qwen3) + tokenizer_output_dir = os.path.join(output_dir, "tokenizer") + if os.path.exists(tokenizer_output_dir): + shutil.rmtree(tokenizer_output_dir) + # Copy only tokenizer-related files + os.makedirs(tokenizer_output_dir, exist_ok=True) + tokenizer_files = [ + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "vocab.json", + "merges.txt", + "added_tokens.json", + "chat_template.jinja", + ] + for fname in tokenizer_files: + src = os.path.join(text_encoder_dir, fname) + if os.path.exists(src): + shutil.copy2(src, os.path.join(tokenizer_output_dir, fname)) + print(f"Copied tokenizer -> {tokenizer_output_dir}") + + # Copy silence_latent.pt if it exists + silence_latent_src = os.path.join(dit_dir, "silence_latent.pt") + if os.path.exists(silence_latent_src): + shutil.copy2(silence_latent_src, os.path.join(output_dir, "silence_latent.pt")) + print(f"Copied silence_latent.pt -> {output_dir}") + + # Report other keys that were not saved to transformer or condition_encoder + if other_sd: + print(f"\nNote: {len(other_sd)} keys were not included in transformer or condition_encoder:") + for key in sorted(other_sd.keys()): + print(f" {key}") + print("These include tokenizer/detokenizer weights and null_condition_emb.") + print("The null_condition_emb, tokenizer, and detokenizer are part of the original") + print("AceStepConditionGenerationModel but are not needed for the Diffusers pipeline") + print("in text2music mode (they are used for cover/repaint tasks).") + + print(f"\nConversion complete! 
Output saved to: {output_dir}") + print("\nTo load the pipeline:") + print(" from diffusers import AceStepPipeline") + print(f" pipe = AceStepPipeline.from_pretrained('{output_dir}', torch_dtype=torch.bfloat16)") + print(" pipe = pipe.to('cuda')") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert ACE-Step model weights to Diffusers pipeline format") + parser.add_argument( + "--checkpoint_dir", + type=str, + required=True, + help="Path to the ACE-Step checkpoints directory (containing vae/, Qwen3-Embedding-0.6B/, and dit config dirs)", + ) + parser.add_argument( + "--dit_config", + type=str, + default="acestep-v15-turbo", + help="Name of the DiT config directory (default: acestep-v15-turbo)", + ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Path to save the converted Diffusers pipeline", + ) + parser.add_argument( + "--dtype", + type=str, + default="bf16", + choices=["fp32", "fp16", "bf16"], + help="Data type for saved weights (default: bf16)", + ) + + args = parser.parse_args() + convert_ace_step_weights( + checkpoint_dir=args.checkpoint_dir, + dit_config=args.dit_config, + output_dir=args.output_dir, + dtype_str=args.dtype, + ) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d5a9b4f6ba93..fb25924b24db 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -182,6 +182,7 @@ ) _import_structure["models"].extend( [ + "AceStepDiTModel", "AllegroTransformer3DModel", "AsymmetricAutoencoderKL", "AttentionBackendName", @@ -442,6 +443,8 @@ ) _import_structure["pipelines"].extend( [ + "AceStepConditionEncoder", + "AceStepPipeline", "AllegroPipeline", "AltDiffusionImg2ImgPipeline", "AltDiffusionPipeline", @@ -947,6 +950,7 @@ apply_taylorseer_cache, ) from .models import ( + AceStepDiTModel, AllegroTransformer3DModel, AsymmetricAutoencoderKL, AttentionBackendName, @@ -1179,6 +1183,8 @@ ZImageModularPipeline, ) from .pipelines import ( + AceStepConditionEncoder, + AceStepPipeline, AllegroPipeline, AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 4d1db36a7352..41c4517ddf6c 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -73,6 +73,7 @@ _import_structure["controlnets.multicontrolnet_union"] = ["MultiControlNetUnionModel"] _import_structure["embeddings"] = ["ImageProjection"] _import_structure["modeling_utils"] = ["ModelMixin"] + _import_structure["transformers.ace_step_transformer"] = ["AceStepDiTModel"] _import_structure["transformers.auraflow_transformer_2d"] = ["AuraFlowTransformer2DModel"] _import_structure["transformers.cogvideox_transformer_3d"] = ["CogVideoXTransformer3DModel"] _import_structure["transformers.consisid_transformer_3d"] = ["ConsisIDTransformer3DModel"] @@ -193,6 +194,7 @@ from .embeddings import ImageProjection from .modeling_utils import ModelMixin from .transformers import ( + AceStepDiTModel, AllegroTransformer3DModel, AuraFlowTransformer2DModel, BriaFiboTransformer2DModel, diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py index d9d1b27a1e40..bcc20f3dde44 100755 --- a/src/diffusers/models/transformers/__init__.py +++ b/src/diffusers/models/transformers/__init__.py @@ -2,6 +2,7 @@ if is_torch_available(): + from .ace_step_transformer import AceStepDiTModel from .auraflow_transformer_2d import AuraFlowTransformer2DModel from .cogvideox_transformer_3d import CogVideoXTransformer3DModel 
from .consisid_transformer_3d import ConsisIDTransformer3DModel diff --git a/src/diffusers/models/transformers/ace_step_transformer.py b/src/diffusers/models/transformers/ace_step_transformer.py new file mode 100644 index 000000000000..3d7a0e4fa95d --- /dev/null +++ b/src/diffusers/models/transformers/ace_step_transformer.py @@ -0,0 +1,714 @@ +# Copyright 2025 The ACE-Step Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import logging +from ..modeling_outputs import Transformer2DModelOutput +from ..modeling_utils import ModelMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def _create_4d_mask( + seq_len: int, + dtype: torch.dtype, + device: torch.device, + attention_mask: Optional[torch.Tensor] = None, + sliding_window: Optional[int] = None, + is_sliding_window: bool = False, + is_causal: bool = True, +) -> torch.Tensor: + """ + Create a 4D attention mask compatible with SDPA and eager attention. + + Supports causal/bidirectional attention with optional sliding window. + + Returns: + Tensor of shape `[batch, 1, seq_len, seq_len]` with `0.0` for visible positions and `-inf` for masked ones. + """ + indices = torch.arange(seq_len, device=device) + diff = indices.unsqueeze(1) - indices.unsqueeze(0) + valid_mask = torch.ones((seq_len, seq_len), device=device, dtype=torch.bool) + + if is_causal: + valid_mask = valid_mask & (diff >= 0) + + if is_sliding_window and sliding_window is not None: + if is_causal: + valid_mask = valid_mask & (diff <= sliding_window) + else: + valid_mask = valid_mask & (torch.abs(diff) <= sliding_window) + + valid_mask = valid_mask.unsqueeze(0).unsqueeze(0) + + if attention_mask is not None: + padding_mask_4d = attention_mask.view(attention_mask.shape[0], 1, 1, seq_len).to(torch.bool) + valid_mask = valid_mask & padding_mask_4d + + min_dtype = torch.finfo(dtype).min + mask_tensor = torch.full(valid_mask.shape, min_dtype, dtype=dtype, device=device) + mask_tensor.masked_fill_(valid_mask, 0.0) + return mask_tensor + + +def _pack_sequences( + hidden1: torch.Tensor, hidden2: torch.Tensor, mask1: torch.Tensor, mask2: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Pack two sequences by concatenating and sorting valid tokens first. + + Args: + hidden1: First hidden states `[B, L1, D]`. + hidden2: Second hidden states `[B, L2, D]`. + mask1: Mask for first sequence `[B, L1]`. + mask2: Mask for second sequence `[B, L2]`. + + Returns: + Tuple of `(packed_hidden_states, new_mask)` with valid tokens sorted first. 
+ """ + hidden_cat = torch.cat([hidden1, hidden2], dim=1) + mask_cat = torch.cat([mask1, mask2], dim=1) + + B, L, D = hidden_cat.shape + sort_idx = mask_cat.argsort(dim=1, descending=True, stable=True) + hidden_left = torch.gather(hidden_cat, 1, sort_idx.unsqueeze(-1).expand(B, L, D)) + lengths = mask_cat.sum(dim=1) + new_mask = torch.arange(L, dtype=torch.long, device=hidden_cat.device).unsqueeze(0) < lengths.unsqueeze(1) + return hidden_left, new_mask + + +class AceStepRMSNorm(nn.Module): + """RMS Normalization used throughout the ACE-Step model.""" + + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + return self.weight * hidden_states.to(input_dtype) + + +class AceStepRotaryEmbedding(nn.Module): + """Rotary Position Embedding (RoPE) for ACE-Step attention layers.""" + + def __init__(self, dim: int, max_position_embeddings: int = 32768, base: float = 1000000.0): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.float32) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + """Rotate half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def _apply_rotary_pos_emb( + q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + """Apply rotary position embeddings to query and key tensors.""" + q_embed = (q * cos) + (_rotate_half(q) * sin) + k_embed = (k * cos) + (_rotate_half(k) * sin) + return q_embed, k_embed + + +class AceStepMLP(nn.Module): + """MLP (SwiGLU) used in ACE-Step transformer layers.""" + + def __init__(self, hidden_size: int, intermediate_size: int): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) + + +class AceStepTimestepEmbedding(nn.Module): + """ + Timestep embedding module for the ACE-Step diffusion model. + + Converts scalar timestep values into high-dimensional embeddings using sinusoidal positional encoding followed by + MLP layers. Also produces scale-shift parameters for adaptive layer normalization. 
+ """ + + def __init__(self, in_channels: int, time_embed_dim: int, scale: float = 1000.0): + super().__init__() + self.linear_1 = nn.Linear(in_channels, time_embed_dim, bias=True) + self.act1 = nn.SiLU() + self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim, bias=True) + self.in_channels = in_channels + self.act2 = nn.SiLU() + self.time_proj = nn.Linear(time_embed_dim, time_embed_dim * 6) + self.scale = scale + + def _timestep_embedding(self, t: torch.Tensor, dim: int, max_period: int = 10000) -> torch.Tensor: + """Create sinusoidal timestep embeddings.""" + t = t * self.scale + half = dim // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to( + device=t.device + ) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + def forward(self, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + t_freq = self._timestep_embedding(t, self.in_channels) + temb = self.linear_1(t_freq.to(t.dtype)) + temb = self.act1(temb) + temb = self.linear_2(temb) + timestep_proj = self.time_proj(self.act2(temb)).unflatten(1, (6, -1)) + return temb, timestep_proj + + +class AceStepAttention(nn.Module): + """ + Multi-headed attention module for the ACE-Step model. + + Supports self-attention and cross-attention with RMSNorm on query/key and optional sliding window attention. + """ + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + num_key_value_heads: int, + head_dim: int, + attention_bias: bool = False, + attention_dropout: float = 0.0, + is_cross_attention: bool = False, + is_causal: bool = False, + sliding_window: Optional[int] = None, + rms_norm_eps: float = 1e-6, + ): + super().__init__() + self.head_dim = head_dim + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.num_key_value_groups = num_attention_heads // num_key_value_heads + self.scaling = head_dim**-0.5 + self.attention_dropout = attention_dropout + if is_cross_attention: + is_causal = False + self.is_causal = is_causal + self.is_cross_attention = is_cross_attention + self.sliding_window = sliding_window + + self.q_proj = nn.Linear(hidden_size, num_attention_heads * head_dim, bias=attention_bias) + self.k_proj = nn.Linear(hidden_size, num_key_value_heads * head_dim, bias=attention_bias) + self.v_proj = nn.Linear(hidden_size, num_key_value_heads * head_dim, bias=attention_bias) + self.o_proj = nn.Linear(num_attention_heads * head_dim, hidden_size, bias=attention_bias) + self.q_norm = AceStepRMSNorm(head_dim, eps=rms_norm_eps) + self.k_norm = AceStepRMSNorm(head_dim, eps=rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> torch.Tensor: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, self.num_attention_heads, self.head_dim) + + query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(-3, -2) + + is_cross_attention = self.is_cross_attention and encoder_hidden_states is not None + + if is_cross_attention: + kv_input = encoder_hidden_states + kv_shape = (*encoder_hidden_states.shape[:-1], self.num_key_value_heads, self.head_dim) + else: + kv_input = hidden_states + kv_shape = (*input_shape, 
self.num_key_value_heads, self.head_dim) + + key_states = self.k_norm(self.k_proj(kv_input).view(kv_shape)).transpose(-3, -2) + value_states = self.v_proj(kv_input).view(kv_shape).transpose(-3, -2) + + if not is_cross_attention and position_embeddings is not None: + cos, sin = position_embeddings + query_states, key_states = _apply_rotary_pos_emb(query_states, key_states, cos, sin) + + # Repeat KV heads for grouped query attention + if self.num_key_value_groups > 1: + key_states = key_states.repeat_interleave(self.num_key_value_groups, dim=-3) + value_states = value_states.repeat_interleave(self.num_key_value_groups, dim=-3) + + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + scale=self.scaling, + ) + + attn_output = attn_output.transpose(-3, -2).reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output + + +class AceStepEncoderLayer(nn.Module): + """ + Encoder layer for the ACE-Step model. + + Consists of self-attention and MLP (feed-forward) sub-layers with residual connections. + """ + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + num_key_value_heads: int, + head_dim: int, + intermediate_size: int, + attention_bias: bool = False, + attention_dropout: float = 0.0, + rms_norm_eps: float = 1e-6, + sliding_window: Optional[int] = None, + ): + super().__init__() + self.self_attn = AceStepAttention( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + head_dim=head_dim, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + is_cross_attention=False, + is_causal=False, + sliding_window=sliding_window, + rms_norm_eps=rms_norm_eps, + ) + self.input_layernorm = AceStepRMSNorm(hidden_size, eps=rms_norm_eps) + self.post_attention_layernorm = AceStepRMSNorm(hidden_size, eps=rms_norm_eps) + self.mlp = AceStepMLP(hidden_size, intermediate_size) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class AceStepDiTLayer(nn.Module): + """ + DiT (Diffusion Transformer) layer for the ACE-Step model. + + Implements a transformer layer with: + 1. Self-attention with adaptive layer norm (AdaLN) + 2. Cross-attention for conditioning on encoder outputs + 3. Feed-forward MLP with adaptive layer norm + + Uses scale-shift modulation from timestep embeddings for adaptive normalization. 
+ """ + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + num_key_value_heads: int, + head_dim: int, + intermediate_size: int, + attention_bias: bool = False, + attention_dropout: float = 0.0, + rms_norm_eps: float = 1e-6, + sliding_window: Optional[int] = None, + use_cross_attention: bool = True, + ): + super().__init__() + # Self-attention + self.self_attn_norm = AceStepRMSNorm(hidden_size, eps=rms_norm_eps) + self.self_attn = AceStepAttention( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + head_dim=head_dim, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + is_cross_attention=False, + is_causal=False, + sliding_window=sliding_window, + rms_norm_eps=rms_norm_eps, + ) + + # Cross-attention + self.use_cross_attention = use_cross_attention + if self.use_cross_attention: + self.cross_attn_norm = AceStepRMSNorm(hidden_size, eps=rms_norm_eps) + self.cross_attn = AceStepAttention( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + head_dim=head_dim, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + is_cross_attention=True, + rms_norm_eps=rms_norm_eps, + ) + + # Feed-forward MLP + self.mlp_norm = AceStepRMSNorm(hidden_size, eps=rms_norm_eps) + self.mlp = AceStepMLP(hidden_size, intermediate_size) + + # Scale-shift table for adaptive layer norm (6 values) + self.scale_shift_table = nn.Parameter(torch.randn(1, 6, hidden_size) / hidden_size**0.5) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], + temb: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (self.scale_shift_table + temb).chunk( + 6, dim=1 + ) + + # Self-attention with AdaLN + norm_hidden_states = (self.self_attn_norm(hidden_states) * (1 + scale_msa) + shift_msa).type_as(hidden_states) + attn_output = self.self_attn( + hidden_states=norm_hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + ) + hidden_states = (hidden_states + attn_output * gate_msa).type_as(hidden_states) + + # Cross-attention + if self.use_cross_attention and encoder_hidden_states is not None: + norm_hidden_states = self.cross_attn_norm(hidden_states).type_as(hidden_states) + attn_output = self.cross_attn( + hidden_states=norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + ) + hidden_states = hidden_states + attn_output + + # Feed-forward MLP with AdaLN + norm_hidden_states = (self.mlp_norm(hidden_states) * (1 + c_scale_msa) + c_shift_msa).type_as(hidden_states) + ff_output = self.mlp(norm_hidden_states) + hidden_states = (hidden_states + ff_output * c_gate_msa).type_as(hidden_states) + + return hidden_states + + +class AceStepDiTModel(ModelMixin, ConfigMixin): + """ + The Diffusion Transformer (DiT) model for ACE-Step music generation. + + This model generates audio latents conditioned on text, lyrics, and timbre. It uses patch-based processing with + transformer layers, timestep conditioning via AdaLN, and cross-attention to encoder outputs. + + Parameters: + hidden_size (`int`, defaults to 2048): + Dimension of the hidden representations. 
+ intermediate_size (`int`, defaults to 6144): + Dimension of the MLP intermediate representations. + num_hidden_layers (`int`, defaults to 24): + Number of DiT transformer layers. + num_attention_heads (`int`, defaults to 16): + Number of attention heads for query states. + num_key_value_heads (`int`, defaults to 8): + Number of attention heads for key and value states (for grouped query attention). + head_dim (`int`, defaults to 128): + Dimension of each attention head. + in_channels (`int`, defaults to 192): + Number of input channels (context_latents + hidden_states concatenated). + audio_acoustic_hidden_dim (`int`, defaults to 64): + Output dimension of the model (acoustic latent dimension). + patch_size (`int`, defaults to 2): + Patch size for input patchification. + max_position_embeddings (`int`, defaults to 32768): + Maximum sequence length for rotary embeddings. + rope_theta (`float`, defaults to 1000000.0): + Base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`): + Whether to use bias in attention projection layers. + attention_dropout (`float`, defaults to 0.0): + Dropout probability for attention weights. + rms_norm_eps (`float`, defaults to 1e-6): + Epsilon for RMS normalization. + use_sliding_window (`bool`, defaults to `True`): + Whether to use sliding window attention for alternating layers. + sliding_window (`int`, defaults to 128): + Sliding window size for local attention layers. + layer_types (`List[str]`, *optional*): + Attention pattern for each layer. Defaults to alternating `"sliding_attention"` and `"full_attention"`. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + hidden_size: int = 2048, + intermediate_size: int = 6144, + num_hidden_layers: int = 24, + num_attention_heads: int = 16, + num_key_value_heads: int = 8, + head_dim: int = 128, + in_channels: int = 192, + audio_acoustic_hidden_dim: int = 64, + patch_size: int = 2, + max_position_embeddings: int = 32768, + rope_theta: float = 1000000.0, + attention_bias: bool = False, + attention_dropout: float = 0.0, + rms_norm_eps: float = 1e-6, + use_sliding_window: bool = True, + sliding_window: int = 128, + layer_types: Optional[List[str]] = None, + ): + super().__init__() + self.patch_size = patch_size + + # Determine layer types + if layer_types is None: + layer_types = [ + "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(num_hidden_layers) + ] + + # Rotary position embeddings + self.rotary_emb = AceStepRotaryEmbedding( + dim=head_dim, max_position_embeddings=max_position_embeddings, base=rope_theta + ) + + # DiT transformer layers + self.layers = nn.ModuleList( + [ + AceStepDiTLayer( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + head_dim=head_dim, + intermediate_size=intermediate_size, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + rms_norm_eps=rms_norm_eps, + sliding_window=sliding_window if layer_types[i] == "sliding_attention" else None, + use_cross_attention=True, + ) + for i in range(num_hidden_layers) + ] + ) + + # Store layer types for mask selection + self._layer_types = layer_types + + # Input projection (patchify) + self.proj_in_conv = nn.Conv1d( + in_channels=in_channels, + out_channels=hidden_size, + kernel_size=patch_size, + stride=patch_size, + padding=0, + ) + + # Timestep embeddings + self.time_embed = AceStepTimestepEmbedding(in_channels=256, time_embed_dim=hidden_size) + self.time_embed_r = 
AceStepTimestepEmbedding(in_channels=256, time_embed_dim=hidden_size) + + # Condition projection + self.condition_embedder = nn.Linear(hidden_size, hidden_size, bias=True) + + # Output (de-patchify) + self.norm_out = AceStepRMSNorm(hidden_size, eps=rms_norm_eps) + self.proj_out_conv = nn.ConvTranspose1d( + in_channels=hidden_size, + out_channels=audio_acoustic_hidden_dim, + kernel_size=patch_size, + stride=patch_size, + padding=0, + ) + self.scale_shift_table = nn.Parameter(torch.randn(1, 2, hidden_size) / hidden_size**0.5) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + timestep: torch.Tensor, + timestep_r: torch.Tensor, + encoder_hidden_states: torch.Tensor, + context_latents: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[torch.Tensor, Transformer2DModelOutput]: + """ + The [`AceStepDiTModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, seq_len, channels)`): + Noisy latent input for the diffusion process. + timestep (`torch.Tensor` of shape `(batch_size,)`): + Current diffusion timestep `t`. + timestep_r (`torch.Tensor` of shape `(batch_size,)`): + Reference timestep `r` (set equal to `t` for standard inference). + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, encoder_seq_len, hidden_size)`): + Conditioning embeddings from the condition encoder (text + lyrics + timbre). + context_latents (`torch.Tensor` of shape `(batch_size, seq_len, context_dim)`): + Context latents (source latents concatenated with chunk masks). + attention_mask (`torch.Tensor`, *optional*): + Attention mask for the hidden states sequence. + encoder_attention_mask (`torch.Tensor`, *optional*): + Attention mask for the encoder hidden states. + return_dict (`bool`, defaults to `True`): + Whether to return a `Transformer2DModelOutput` or a plain tuple. + + Returns: + `Transformer2DModelOutput` or `tuple`: The predicted velocity field for flow matching. 
+ """ + # Compute timestep embeddings + temb_t, timestep_proj_t = self.time_embed(timestep) + temb_r, timestep_proj_r = self.time_embed_r(timestep - timestep_r) + temb = temb_t + temb_r + timestep_proj = timestep_proj_t + timestep_proj_r + + # Concatenate context latents with hidden states + hidden_states = torch.cat([context_latents, hidden_states], dim=-1) + original_seq_len = hidden_states.shape[1] + + # Pad if sequence length is not divisible by patch_size + pad_length = 0 + if hidden_states.shape[1] % self.patch_size != 0: + pad_length = self.patch_size - (hidden_states.shape[1] % self.patch_size) + hidden_states = F.pad(hidden_states, (0, 0, 0, pad_length), mode="constant", value=0) + + # Patchify: [B, T, C] -> [B, C, T] -> conv -> [B, C', T'] -> [B, T', C'] + hidden_states = self.proj_in_conv(hidden_states.transpose(1, 2)).transpose(1, 2) + + # Project encoder hidden states + encoder_hidden_states = self.condition_embedder(encoder_hidden_states) + + # Position embeddings + seq_len = hidden_states.shape[1] + position_ids = torch.arange(seq_len, device=hidden_states.device).unsqueeze(0) + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # Build attention masks + dtype = hidden_states.dtype + device = hidden_states.device + encoder_seq_len = encoder_hidden_states.shape[1] + + full_attn_mask = _create_4d_mask( + seq_len=seq_len, dtype=dtype, device=device, attention_mask=None, is_causal=False + ) + encoder_4d_mask = _create_4d_mask( + seq_len=max(seq_len, encoder_seq_len), dtype=dtype, device=device, attention_mask=None, is_causal=False + ) + encoder_4d_mask = encoder_4d_mask[:, :, :seq_len, :encoder_seq_len] + + sliding_attn_mask = None + if self.config.use_sliding_window: + sliding_attn_mask = _create_4d_mask( + seq_len=seq_len, + dtype=dtype, + device=device, + attention_mask=None, + sliding_window=self.config.sliding_window, + is_sliding_window=True, + is_causal=False, + ) + + # Process through transformer layers + for i, layer_module in enumerate(self.layers): + layer_type = self._layer_types[i] + if layer_type == "sliding_attention" and sliding_attn_mask is not None: + layer_attn_mask = sliding_attn_mask + else: + layer_attn_mask = full_attn_mask + + if torch.is_grad_enabled() and self.gradient_checkpointing: + hidden_states = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + position_embeddings, + timestep_proj, + layer_attn_mask, + encoder_hidden_states, + encoder_4d_mask, + ) + else: + hidden_states = layer_module( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + temb=timestep_proj, + attention_mask=layer_attn_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_4d_mask, + ) + + # Adaptive output normalization + shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1) + hidden_states = (self.norm_out(hidden_states) * (1 + scale) + shift).type_as(hidden_states) + + # De-patchify: [B, T', C'] -> [B, C', T'] -> deconv -> [B, C, T] -> [B, T, C] + hidden_states = self.proj_out_conv(hidden_states.transpose(1, 2)).transpose(1, 2) + + # Crop to original sequence length + hidden_states = hidden_states[:, :original_seq_len, :] + + if not return_dict: + return (hidden_states,) + + return Transformer2DModelOutput(sample=hidden_states) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 72923cbb5c18..f95122888823 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -118,6 +118,10 @@ 
"VersatileDiffusionTextToImagePipeline", ] ) + _import_structure["ace_step"] = [ + "AceStepConditionEncoder", + "AceStepPipeline", + ] _import_structure["allegro"] = ["AllegroPipeline"] _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"] _import_structure["animatediff"] = [ @@ -579,6 +583,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_torch_and_transformers_objects import * else: + from .ace_step import AceStepConditionEncoder, AceStepPipeline from .allegro import AllegroPipeline from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline from .animatediff import ( diff --git a/src/diffusers/pipelines/ace_step/__init__.py b/src/diffusers/pipelines/ace_step/__init__.py new file mode 100644 index 000000000000..a792420a0a0e --- /dev/null +++ b/src/diffusers/pipelines/ace_step/__init__.py @@ -0,0 +1,50 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["modeling_ace_step"] = ["AceStepConditionEncoder"] + _import_structure["pipeline_ace_step"] = ["AceStepPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + + else: + from .modeling_ace_step import AceStepConditionEncoder + from .pipeline_ace_step import AceStepPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/ace_step/modeling_ace_step.py b/src/diffusers/pipelines/ace_step/modeling_ace_step.py new file mode 100644 index 000000000000..0f3fbd642b19 --- /dev/null +++ b/src/diffusers/pipelines/ace_step/modeling_ace_step.py @@ -0,0 +1,562 @@ +# Copyright 2025 The ACE-Step Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Pipeline-specific models for ACE-Step: ConditionEncoder, LyricEncoder, TimbreEncoder, AudioTokenizer, and +AudioTokenDetokenizer. + +These models are used within the AceStepPipeline to encode conditioning inputs (text, lyrics, timbre) for +cross-attention in the DiT model. 
+""" + +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models.modeling_utils import ModelMixin +from ...models.transformers.ace_step_transformer import ( + AceStepEncoderLayer, + AceStepRMSNorm, + AceStepRotaryEmbedding, + _create_4d_mask, + _pack_sequences, +) +from ...utils import logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class AceStepLyricEncoder(ModelMixin, ConfigMixin): + """ + Encoder for processing lyric text embeddings in the ACE-Step pipeline. + + Encodes lyric text hidden states using a transformer encoder architecture with bidirectional attention. Projects + text embeddings to model hidden size and processes them through multiple encoder layers. + + Parameters: + hidden_size (`int`, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, defaults to 6144): + Dimension of the MLP representations. + text_hidden_dim (`int`, defaults to 1024): + Dimension of the input text embeddings from the text encoder. + num_lyric_encoder_hidden_layers (`int`, defaults to 8): + Number of transformer encoder layers. + num_attention_heads (`int`, defaults to 16): + Number of attention heads. + num_key_value_heads (`int`, defaults to 8): + Number of key/value heads for grouped query attention. + head_dim (`int`, defaults to 128): + Dimension of each attention head. + max_position_embeddings (`int`, defaults to 32768): + Maximum sequence length for rotary embeddings. + rope_theta (`float`, defaults to 1000000.0): + Base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`): + Whether to use bias in attention layers. + attention_dropout (`float`, defaults to 0.0): + Dropout probability for attention weights. + rms_norm_eps (`float`, defaults to 1e-6): + Epsilon for RMS normalization. + use_sliding_window (`bool`, defaults to `True`): + Whether to use sliding window attention. + sliding_window (`int`, defaults to 128): + Sliding window size. + layer_types (`list`, *optional*): + Attention pattern for each layer. 
+ """ + + @register_to_config + def __init__( + self, + hidden_size: int = 2048, + intermediate_size: int = 6144, + text_hidden_dim: int = 1024, + num_lyric_encoder_hidden_layers: int = 8, + num_attention_heads: int = 16, + num_key_value_heads: int = 8, + head_dim: int = 128, + max_position_embeddings: int = 32768, + rope_theta: float = 1000000.0, + attention_bias: bool = False, + attention_dropout: float = 0.0, + rms_norm_eps: float = 1e-6, + use_sliding_window: bool = True, + sliding_window: int = 128, + layer_types: list = None, + ): + super().__init__() + + if layer_types is None: + layer_types = [ + "sliding_attention" if bool((i + 1) % 2) else "full_attention" + for i in range(num_lyric_encoder_hidden_layers) + ] + + self.embed_tokens = nn.Linear(text_hidden_dim, hidden_size) + self.norm = AceStepRMSNorm(hidden_size, eps=rms_norm_eps) + self.rotary_emb = AceStepRotaryEmbedding( + dim=head_dim, max_position_embeddings=max_position_embeddings, base=rope_theta + ) + + self.layers = nn.ModuleList( + [ + AceStepEncoderLayer( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + head_dim=head_dim, + intermediate_size=intermediate_size, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + rms_norm_eps=rms_norm_eps, + sliding_window=sliding_window if layer_types[i] == "sliding_attention" else None, + ) + for i in range(num_lyric_encoder_hidden_layers) + ] + ) + + self._layer_types = layer_types + self._use_sliding_window = use_sliding_window + self._sliding_window = sliding_window + + def forward( + self, + inputs_embeds: torch.FloatTensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + """ + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, seq_len, text_hidden_dim)`): + Lyric text embeddings from the text encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, seq_len)`): + Attention mask for padding (1 for valid, 0 for padding). + + Returns: + `torch.Tensor`: Encoded lyric hidden states of shape `(batch_size, seq_len, hidden_size)`. + """ + inputs_embeds = self.embed_tokens(inputs_embeds) + + seq_len = inputs_embeds.shape[1] + dtype = inputs_embeds.dtype + device = inputs_embeds.device + + position_ids = torch.arange(seq_len, device=device).unsqueeze(0) + position_embeddings = self.rotary_emb(inputs_embeds, position_ids) + + # Build attention masks + full_attn_mask = _create_4d_mask( + seq_len=seq_len, dtype=dtype, device=device, attention_mask=attention_mask, is_causal=False + ) + sliding_attn_mask = None + if self._use_sliding_window: + sliding_attn_mask = _create_4d_mask( + seq_len=seq_len, + dtype=dtype, + device=device, + attention_mask=attention_mask, + sliding_window=self._sliding_window, + is_sliding_window=True, + is_causal=False, + ) + + hidden_states = inputs_embeds + for i, layer_module in enumerate(self.layers): + layer_type = self._layer_types[i] + if layer_type == "sliding_attention" and sliding_attn_mask is not None: + mask = sliding_attn_mask + else: + mask = full_attn_mask + + hidden_states = layer_module( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=mask, + ) + + hidden_states = self.norm(hidden_states) + return hidden_states + + +class AceStepTimbreEncoder(ModelMixin, ConfigMixin): + """ + Encoder for extracting timbre embeddings from reference audio in the ACE-Step pipeline. + + Processes packed reference audio acoustic features to extract timbre representations. 
Outputs are unpacked back to + batch format for use in conditioning. + + Parameters: + hidden_size (`int`, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, defaults to 6144): + Dimension of the MLP representations. + timbre_hidden_dim (`int`, defaults to 64): + Dimension of the input acoustic features. + num_timbre_encoder_hidden_layers (`int`, defaults to 4): + Number of transformer encoder layers. + num_attention_heads (`int`, defaults to 16): + Number of attention heads. + num_key_value_heads (`int`, defaults to 8): + Number of key/value heads. + head_dim (`int`, defaults to 128): + Dimension of each attention head. + max_position_embeddings (`int`, defaults to 32768): + Maximum sequence length for rotary embeddings. + rope_theta (`float`, defaults to 1000000.0): + Base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`): + Whether to use bias in attention layers. + attention_dropout (`float`, defaults to 0.0): + Dropout probability for attention weights. + rms_norm_eps (`float`, defaults to 1e-6): + Epsilon for RMS normalization. + use_sliding_window (`bool`, defaults to `True`): + Whether to use sliding window attention. + sliding_window (`int`, defaults to 128): + Sliding window size. + layer_types (`list`, *optional*): + Attention pattern for each layer. + """ + + @register_to_config + def __init__( + self, + hidden_size: int = 2048, + intermediate_size: int = 6144, + timbre_hidden_dim: int = 64, + num_timbre_encoder_hidden_layers: int = 4, + num_attention_heads: int = 16, + num_key_value_heads: int = 8, + head_dim: int = 128, + max_position_embeddings: int = 32768, + rope_theta: float = 1000000.0, + attention_bias: bool = False, + attention_dropout: float = 0.0, + rms_norm_eps: float = 1e-6, + use_sliding_window: bool = True, + sliding_window: int = 128, + layer_types: list = None, + ): + super().__init__() + + if layer_types is None: + layer_types = [ + "sliding_attention" if bool((i + 1) % 2) else "full_attention" + for i in range(num_timbre_encoder_hidden_layers) + ] + + self.embed_tokens = nn.Linear(timbre_hidden_dim, hidden_size) + self.norm = AceStepRMSNorm(hidden_size, eps=rms_norm_eps) + self.rotary_emb = AceStepRotaryEmbedding( + dim=head_dim, max_position_embeddings=max_position_embeddings, base=rope_theta + ) + self.special_token = nn.Parameter(torch.randn(1, 1, hidden_size)) + + self.layers = nn.ModuleList( + [ + AceStepEncoderLayer( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + head_dim=head_dim, + intermediate_size=intermediate_size, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + rms_norm_eps=rms_norm_eps, + sliding_window=sliding_window if layer_types[i] == "sliding_attention" else None, + ) + for i in range(num_timbre_encoder_hidden_layers) + ] + ) + + self._layer_types = layer_types + self._use_sliding_window = use_sliding_window + self._sliding_window = sliding_window + + @staticmethod + def unpack_timbre_embeddings( + timbre_embs_packed: torch.Tensor, refer_audio_order_mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Unpack packed timbre embeddings into batch format. + + Args: + timbre_embs_packed (`torch.Tensor` of shape `(N, d)`): + Packed timbre embeddings. + refer_audio_order_mask (`torch.Tensor` of shape `(N,)`): + Order mask indicating batch assignment. 
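+
+        For example, with `N = 3` packed embeddings and `refer_audio_order_mask = [0, 0, 1]`, the embeddings
+        unpack to shape `(2, 2, d)` and the returned mask is `[[1, 1], [1, 0]]`.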
+ + Returns: + Tuple of `(unpacked_embeddings, mask)`: + - `unpacked_embeddings` of shape `(B, max_count, d)` + - `mask` of shape `(B, max_count)` + """ + N, d = timbre_embs_packed.shape + device = timbre_embs_packed.device + dtype = timbre_embs_packed.dtype + + B = int(refer_audio_order_mask.max().item() + 1) + counts = torch.bincount(refer_audio_order_mask, minlength=B) + max_count = counts.max().item() + + sorted_indices = torch.argsort(refer_audio_order_mask * N + torch.arange(N, device=device), stable=True) + sorted_batch_ids = refer_audio_order_mask[sorted_indices] + + positions = torch.arange(N, device=device) + batch_starts = torch.cat([torch.tensor([0], device=device), torch.cumsum(counts, dim=0)[:-1]]) + positions_in_sorted = positions - batch_starts[sorted_batch_ids] + + inverse_indices = torch.empty_like(sorted_indices) + inverse_indices[sorted_indices] = torch.arange(N, device=device) + positions_in_batch = positions_in_sorted[inverse_indices] + + indices_2d = refer_audio_order_mask * max_count + positions_in_batch + one_hot = F.one_hot(indices_2d, num_classes=B * max_count).to(dtype) + + timbre_embs_flat = one_hot.t() @ timbre_embs_packed + timbre_embs_unpack = timbre_embs_flat.reshape(B, max_count, d) + + mask_flat = (one_hot.sum(dim=0) > 0).long() + new_mask = mask_flat.reshape(B, max_count) + + return timbre_embs_unpack, new_mask + + def forward( + self, + refer_audio_acoustic_hidden_states_packed: torch.FloatTensor, + refer_audio_order_mask: torch.LongTensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + refer_audio_acoustic_hidden_states_packed (`torch.FloatTensor` of shape `(N, T, timbre_hidden_dim)`): + Packed reference audio acoustic features. + refer_audio_order_mask (`torch.LongTensor` of shape `(N,)`): + Order mask indicating which batch element each packed sequence belongs to. + + Returns: + Tuple of `(timbre_embeddings, timbre_mask)`: + - `timbre_embeddings` of shape `(B, max_refs, hidden_size)` + - `timbre_mask` of shape `(B, max_refs)` + """ + inputs_embeds = self.embed_tokens(refer_audio_acoustic_hidden_states_packed) + + seq_len = inputs_embeds.shape[1] + dtype = inputs_embeds.dtype + device = inputs_embeds.device + + position_ids = torch.arange(seq_len, device=device).unsqueeze(0) + position_embeddings = self.rotary_emb(inputs_embeds, position_ids) + + # Build attention masks + full_attn_mask = _create_4d_mask( + seq_len=seq_len, dtype=dtype, device=device, attention_mask=None, is_causal=False + ) + sliding_attn_mask = None + if self._use_sliding_window: + sliding_attn_mask = _create_4d_mask( + seq_len=seq_len, + dtype=dtype, + device=device, + attention_mask=None, + sliding_window=self._sliding_window, + is_sliding_window=True, + is_causal=False, + ) + + hidden_states = inputs_embeds + for i, layer_module in enumerate(self.layers): + layer_type = self._layer_types[i] + if layer_type == "sliding_attention" and sliding_attn_mask is not None: + mask = sliding_attn_mask + else: + mask = full_attn_mask + + hidden_states = layer_module( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=mask, + ) + + hidden_states = self.norm(hidden_states) + # Extract first token (CLS-like) as timbre embedding + hidden_states = hidden_states[:, 0, :] + timbre_embs_unpack, timbre_embs_mask = self.unpack_timbre_embeddings(hidden_states, refer_audio_order_mask) + return timbre_embs_unpack, timbre_embs_mask + + +class AceStepConditionEncoder(ModelMixin, ConfigMixin): + """ + Condition encoder for the ACE-Step pipeline. 
+ + Encodes multiple conditioning inputs (text, lyrics, timbre) and packs them into a single sequence for + cross-attention in the DiT model. This model handles projection, encoding, and sequence packing. + + Parameters: + hidden_size (`int`, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, defaults to 6144): + Dimension of the MLP representations. + text_hidden_dim (`int`, defaults to 1024): + Dimension of the input text embeddings. + timbre_hidden_dim (`int`, defaults to 64): + Dimension of the input acoustic features. + num_lyric_encoder_hidden_layers (`int`, defaults to 8): + Number of lyric encoder layers. + num_timbre_encoder_hidden_layers (`int`, defaults to 4): + Number of timbre encoder layers. + num_attention_heads (`int`, defaults to 16): + Number of attention heads. + num_key_value_heads (`int`, defaults to 8): + Number of key/value heads. + head_dim (`int`, defaults to 128): + Dimension of each attention head. + max_position_embeddings (`int`, defaults to 32768): + Maximum sequence length for rotary embeddings. + rope_theta (`float`, defaults to 1000000.0): + Base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`): + Whether to use bias in attention layers. + attention_dropout (`float`, defaults to 0.0): + Dropout probability for attention weights. + rms_norm_eps (`float`, defaults to 1e-6): + Epsilon for RMS normalization. + use_sliding_window (`bool`, defaults to `True`): + Whether to use sliding window attention. + sliding_window (`int`, defaults to 128): + Sliding window size. + layer_types (`list`, *optional*): + Attention pattern for each layer. + """ + + @register_to_config + def __init__( + self, + hidden_size: int = 2048, + intermediate_size: int = 6144, + text_hidden_dim: int = 1024, + timbre_hidden_dim: int = 64, + num_lyric_encoder_hidden_layers: int = 8, + num_timbre_encoder_hidden_layers: int = 4, + num_attention_heads: int = 16, + num_key_value_heads: int = 8, + head_dim: int = 128, + max_position_embeddings: int = 32768, + rope_theta: float = 1000000.0, + attention_bias: bool = False, + attention_dropout: float = 0.0, + rms_norm_eps: float = 1e-6, + use_sliding_window: bool = True, + sliding_window: int = 128, + layer_types: list = None, + ): + super().__init__() + + # Text projector + self.text_projector = nn.Linear(text_hidden_dim, hidden_size, bias=False) + + # Lyric encoder + self.lyric_encoder = AceStepLyricEncoder( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + text_hidden_dim=text_hidden_dim, + num_lyric_encoder_hidden_layers=num_lyric_encoder_hidden_layers, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + head_dim=head_dim, + max_position_embeddings=max_position_embeddings, + rope_theta=rope_theta, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + rms_norm_eps=rms_norm_eps, + use_sliding_window=use_sliding_window, + sliding_window=sliding_window, + layer_types=layer_types, + ) + + # Timbre encoder + self.timbre_encoder = AceStepTimbreEncoder( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + timbre_hidden_dim=timbre_hidden_dim, + num_timbre_encoder_hidden_layers=num_timbre_encoder_hidden_layers, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + head_dim=head_dim, + max_position_embeddings=max_position_embeddings, + rope_theta=rope_theta, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + rms_norm_eps=rms_norm_eps, + 
use_sliding_window=use_sliding_window, + sliding_window=sliding_window, + ) + + def forward( + self, + text_hidden_states: torch.FloatTensor, + text_attention_mask: torch.Tensor, + lyric_hidden_states: torch.FloatTensor, + lyric_attention_mask: torch.Tensor, + refer_audio_acoustic_hidden_states_packed: torch.FloatTensor, + refer_audio_order_mask: torch.LongTensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Encode text, lyrics, and timbre into a single packed conditioning sequence. + + Args: + text_hidden_states (`torch.FloatTensor` of shape `(batch_size, text_seq_len, text_hidden_dim)`): + Text embeddings from the text encoder. + text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`): + Attention mask for text. + lyric_hidden_states (`torch.FloatTensor` of shape `(batch_size, lyric_seq_len, text_hidden_dim)`): + Lyric embeddings from the text encoder. + lyric_attention_mask (`torch.Tensor` of shape `(batch_size, lyric_seq_len)`): + Attention mask for lyrics. + refer_audio_acoustic_hidden_states_packed (`torch.FloatTensor` of shape `(N, T, timbre_hidden_dim)`): + Packed reference audio acoustic features. + refer_audio_order_mask (`torch.LongTensor` of shape `(N,)`): + Order mask for reference audio packing. + + Returns: + Tuple of `(encoder_hidden_states, encoder_attention_mask)`: + - `encoder_hidden_states` of shape `(batch_size, total_seq_len, hidden_size)` + - `encoder_attention_mask` of shape `(batch_size, total_seq_len)` + """ + # Project text + text_hidden_states = self.text_projector(text_hidden_states) + + # Encode lyrics + lyric_hidden_states = self.lyric_encoder( + inputs_embeds=lyric_hidden_states, + attention_mask=lyric_attention_mask, + ) + + # Encode timbre + timbre_embs_unpack, timbre_embs_mask = self.timbre_encoder( + refer_audio_acoustic_hidden_states_packed, refer_audio_order_mask + ) + + # Pack sequences: lyrics + timbre, then + text + encoder_hidden_states, encoder_attention_mask = _pack_sequences( + lyric_hidden_states, timbre_embs_unpack, lyric_attention_mask, timbre_embs_mask + ) + encoder_hidden_states, encoder_attention_mask = _pack_sequences( + encoder_hidden_states, text_hidden_states, encoder_attention_mask, text_attention_mask + ) + + return encoder_hidden_states, encoder_attention_mask diff --git a/src/diffusers/pipelines/ace_step/pipeline_ace_step.py b/src/diffusers/pipelines/ace_step/pipeline_ace_step.py new file mode 100644 index 000000000000..8c21dcb80c15 --- /dev/null +++ b/src/diffusers/pipelines/ace_step/pipeline_ace_step.py @@ -0,0 +1,539 @@ +# Copyright 2025 The ACE-Step Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Callable, List, Optional, Tuple, Union + +import torch +from transformers import PreTrainedModel, PreTrainedTokenizerFast + +from ...models import AutoencoderOobleck +from ...models.transformers.ace_step_transformer import AceStepDiTModel +from ...utils import logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline +from .modeling_ace_step import AceStepConditionEncoder + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# SFT prompt template from ACE-Step constants +SFT_GEN_PROMPT = """# Instruction {} + +# Caption {} + +# Metas {}<|endoftext|> +""" + +DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:" + +# Pre-defined timestep schedules for the turbo model (fix_nfe=8) +SHIFT_TIMESTEPS = { + 1.0: [1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125], + 2.0: [ + 1.0, + 0.9333333333333333, + 0.8571428571428571, + 0.7692307692307693, + 0.6666666666666666, + 0.5454545454545454, + 0.4, + 0.2222222222222222, + ], + 3.0: [ + 1.0, + 0.9545454545454546, + 0.9, + 0.8333333333333334, + 0.75, + 0.6428571428571429, + 0.5, + 0.3, + ], +} + +VALID_SHIFTS = [1.0, 2.0, 3.0] + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> import soundfile as sf + >>> from diffusers import AceStepPipeline + + >>> pipe = AceStepPipeline.from_pretrained("ACE-Step/ACE-Step-v1-5-turbo", torch_dtype=torch.bfloat16) + >>> pipe = pipe.to("cuda") + + >>> # Generate music from text + >>> audio = pipe( + ... prompt="A beautiful piano piece with soft melodies", + ... lyrics="[verse]\\nSoft notes in the morning light\\n[chorus]\\nMusic fills the air tonight", + ... audio_duration=30.0, + ... num_inference_steps=8, + ... ).audios + + >>> # Save the generated audio + >>> sf.write("output.wav", audio[0].T.cpu().numpy(), 48000) + ``` +""" + + +class AceStepPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-music generation using ACE-Step. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline uses flow matching with a custom timestep schedule for the diffusion process. The turbo model variant + uses 8 inference steps by default. + + Args: + vae ([`AutoencoderOobleck`]): + Variational Auto-Encoder (VAE) model to encode and decode audio waveforms to and from latent + representations. + text_encoder ([`~transformers.AutoModel`]): + Text encoder model (e.g., Qwen3-Embedding-0.6B) for encoding text prompts and lyrics. + tokenizer ([`~transformers.AutoTokenizer`]): + Tokenizer for the text encoder. + transformer ([`AceStepDiTModel`]): + The Diffusion Transformer (DiT) model for denoising audio latents. + condition_encoder ([`AceStepConditionEncoder`]): + Condition encoder that combines text, lyric, and timbre embeddings for cross-attention. 
+ """ + + model_cpu_offload_seq = "text_encoder->condition_encoder->transformer->vae" + + def __init__( + self, + vae: AutoencoderOobleck, + text_encoder: PreTrainedModel, + tokenizer: PreTrainedTokenizerFast, + transformer: AceStepDiTModel, + condition_encoder: AceStepConditionEncoder, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + transformer=transformer, + condition_encoder=condition_encoder, + ) + + def _format_prompt( + self, + prompt: str, + lyrics: str = "", + vocal_language: str = "en", + audio_duration: float = 60.0, + instruction: str = None, + ) -> Tuple[str, str]: + """ + Format the prompt and lyrics into the expected text encoder input format. + + The text prompt uses the SFT generation template with instruction, caption, and metadata. The lyrics use a + separate format with language header and lyric content, matching the original ACE-Step handler. + + Args: + prompt: Text caption describing the music. + lyrics: Lyric text. + vocal_language: Language code for lyrics. + audio_duration: Duration of the audio in seconds. + instruction: Instruction text for generation. + + Returns: + Tuple of (formatted_text, formatted_lyrics). + """ + if instruction is None: + instruction = DEFAULT_DIT_INSTRUCTION + + # Ensure instruction ends with colon (matching handler.py _format_instruction) + if not instruction.endswith(":"): + instruction = instruction + ":" + + # Build metadata string in the original multi-line format + # Matches handler.py _dict_to_meta_string output + metas_str = f"- bpm: N/A\n- timesignature: N/A\n- keyscale: N/A\n- duration: {int(audio_duration)} seconds\n" + + # Format text prompt using SFT template + formatted_text = SFT_GEN_PROMPT.format(instruction, prompt, metas_str) + + # Format lyrics using the dedicated lyrics format (NOT the SFT template) + # Matches handler.py _format_lyrics + formatted_lyrics = f"# Languages\n{vocal_language}\n\n# Lyric\n{lyrics}<|endoftext|>" + + return formatted_text, formatted_lyrics + + def encode_prompt( + self, + prompt: Union[str, List[str]], + lyrics: Union[str, List[str]], + device: torch.device, + vocal_language: Union[str, List[str]] = "en", + audio_duration: float = 60.0, + instruction: Optional[str] = None, + max_text_length: int = 256, + max_lyric_length: int = 2048, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Encode text prompts and lyrics into embeddings. + + Text prompts are encoded through the full text encoder model to produce contextual hidden states. Lyrics are + only passed through the text encoder's embedding layer (token lookup), since the lyric encoder in the condition + encoder handles the contextual encoding. + + Args: + prompt (`str` or `List[str]`): + Text caption(s) describing the music. + lyrics (`str` or `List[str]`): + Lyric text(s). + device (`torch.device`): + Device for tensors. + vocal_language (`str` or `List[str]`, *optional*, defaults to `"en"`): + Language code(s) for lyrics. + audio_duration (`float`, *optional*, defaults to 60.0): + Duration of the audio in seconds. + instruction (`str`, *optional*): + Instruction text for generation. + max_text_length (`int`, *optional*, defaults to 256): + Maximum token length for text prompts. + max_lyric_length (`int`, *optional*, defaults to 2048): + Maximum token length for lyrics. + + Returns: + Tuple of `(text_hidden_states, text_attention_mask, lyric_hidden_states, lyric_attention_mask)`. 
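+
+        Example (shapes only): for a single prompt/lyrics pair, the returned tensors have shapes
+        `(1, text_seq_len, hidden)`, `(1, text_seq_len)`, `(1, lyric_seq_len, hidden)` and `(1, lyric_seq_len)`,
+        where `hidden` is the text encoder's hidden size and the sequence lengths are capped by
+        `max_text_length` and `max_lyric_length` respectively.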
+ """ + if isinstance(prompt, str): + prompt = [prompt] + if isinstance(lyrics, str): + lyrics = [lyrics] + if isinstance(vocal_language, str): + vocal_language = [vocal_language] * len(prompt) + + batch_size = len(prompt) + + all_text_strs = [] + all_lyric_strs = [] + for i in range(batch_size): + text_str, lyric_str = self._format_prompt( + prompt=prompt[i], + lyrics=lyrics[i], + vocal_language=vocal_language[i], + audio_duration=audio_duration, + instruction=instruction, + ) + all_text_strs.append(text_str) + all_lyric_strs.append(lyric_str) + + # Tokenize text prompts (matching handler.py: padding="longest", max_length=256) + text_inputs = self.tokenizer( + all_text_strs, + padding="longest", + truncation=True, + max_length=max_text_length, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids.to(device) + text_attention_mask = text_inputs.attention_mask.to(device).bool() + + # Tokenize lyrics (matching handler.py: padding="longest", max_length=2048) + lyric_inputs = self.tokenizer( + all_lyric_strs, + padding="longest", + truncation=True, + max_length=max_lyric_length, + return_tensors="pt", + ) + lyric_input_ids = lyric_inputs.input_ids.to(device) + lyric_attention_mask = lyric_inputs.attention_mask.to(device).bool() + + # Encode text through the full text encoder model + with torch.no_grad(): + text_hidden_states = self.text_encoder(input_ids=text_input_ids).last_hidden_state + + # Encode lyrics using only the embedding layer (token lookup) + # The lyric encoder in the condition_encoder handles contextual encoding + with torch.no_grad(): + embed_layer = self.text_encoder.get_input_embeddings() + lyric_hidden_states = embed_layer(lyric_input_ids) + + return text_hidden_states, text_attention_mask, lyric_hidden_states, lyric_attention_mask + + def prepare_latents( + self, + batch_size: int, + audio_duration: float, + dtype: torch.dtype, + device: torch.device, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Prepare initial noise latents for the flow matching process. + + Args: + batch_size: Number of samples to generate. + audio_duration: Duration of audio in seconds. + dtype: Data type for the latents. + device: Device for the latents. + generator: Random number generator(s). + latents: Pre-generated latents. + + Returns: + Noise latents of shape `(batch_size, latent_length, acoustic_dim)`. + """ + # 25 Hz latent rate for ACE-Step + latent_length = int(audio_duration * 25) + acoustic_dim = self.transformer.config.audio_acoustic_hidden_dim + + if latents is not None: + return latents.to(device=device, dtype=dtype) + + shape = (batch_size, latent_length, acoustic_dim) + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + return latents + + def _get_timestep_schedule( + self, + num_inference_steps: int = 8, + shift: float = 3.0, + device: torch.device = None, + dtype: torch.dtype = None, + ) -> torch.Tensor: + """ + Get the timestep schedule for the flow matching process. + + ACE-Step uses a fixed timestep schedule based on the shift parameter. The schedule goes from t=1 (pure noise) + to t=0 (clean data). + + Args: + num_inference_steps: Number of denoising steps. + shift: Shift parameter controlling the timestep distribution (1.0, 2.0, or 3.0). + device: Device for the schedule tensor. + dtype: Data type for the schedule tensor. + + Returns: + Tensor of timestep values. 
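+
+        Example: `num_inference_steps=8` with `shift=3.0` returns (rounded)
+        `[1.0, 0.9545, 0.9, 0.8333, 0.75, 0.6429, 0.5, 0.3]`; larger shifts keep the early steps close to
+        `t=1` (pure noise) and take larger steps toward `t=0`.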
+ """ + # Use pre-defined schedules for known shift values + original_shift = shift + shift = min(VALID_SHIFTS, key=lambda x: abs(x - shift)) + if original_shift != shift: + logger.warning(f"shift={original_shift} not supported, rounded to nearest valid shift={shift}") + + t_schedule_list = SHIFT_TIMESTEPS[shift] + + # Truncate or extend to match num_inference_steps + if num_inference_steps < len(t_schedule_list): + t_schedule_list = t_schedule_list[:num_inference_steps] + elif num_inference_steps > len(t_schedule_list): + # Generate a linear schedule for non-standard step counts + t_schedule_list = [1.0 - i / num_inference_steps for i in range(num_inference_steps)] + + return torch.tensor(t_schedule_list, device=device, dtype=dtype) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + lyrics: Union[str, List[str]] = "", + audio_duration: float = 60.0, + vocal_language: Union[str, List[str]] = "en", + num_inference_steps: int = 8, + guidance_scale: float = 7.0, + shift: float = 3.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + output_type: Optional[str] = "pt", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + instruction: Optional[str] = None, + max_text_length: int = 256, + max_lyric_length: int = 2048, + ): + r""" + The call function to the pipeline for music generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide music generation. Describes the style, genre, instruments, etc. + lyrics (`str` or `List[str]`, *optional*, defaults to `""`): + The lyrics text for the music. Supports structured lyrics with tags like `[verse]`, `[chorus]`, etc. + audio_duration (`float`, *optional*, defaults to 60.0): + Duration of the generated audio in seconds. + vocal_language (`str` or `List[str]`, *optional*, defaults to `"en"`): + Language code for the lyrics (e.g., `"en"`, `"zh"`, `"ja"`). + num_inference_steps (`int`, *optional*, defaults to 8): + The number of denoising steps. The turbo model is designed for 8 steps. + guidance_scale (`float`, *optional*, defaults to 7.0): + Guidance scale for classifier-free guidance. Note: the turbo model may ignore this. + shift (`float`, *optional*, defaults to 3.0): + Shift parameter for the timestep schedule (1.0, 2.0, or 3.0). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A generator to make generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noise latents of shape `(batch_size, latent_length, acoustic_dim)`. + output_type (`str`, *optional*, defaults to `"pt"`): + Output format. `"pt"` for PyTorch tensor, `"np"` for NumPy array, `"latent"` for raw latents. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return an `AudioPipelineOutput` or a plain tuple. + callback (`Callable`, *optional*): + A function called every `callback_steps` steps with `(step, timestep, latents)`. + callback_steps (`int`, *optional*, defaults to 1): + Frequency of the callback function. + instruction (`str`, *optional*): + Custom instruction text for the generation task. + max_text_length (`int`, *optional*, defaults to 256): + Maximum token length for text prompt encoding. + max_lyric_length (`int`, *optional*, defaults to 2048): + Maximum token length for lyrics encoding. 
+ + Examples: + + Returns: + [`~pipelines.AudioPipelineOutput`] or `tuple`: + If `return_dict` is `True`, an `AudioPipelineOutput` is returned, otherwise a tuple with the generated + audio. + """ + # 0. Default values and input validation + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError("Must provide `prompt` as a string or list of strings.") + + device = self._execution_device + dtype = self.transformer.dtype + + # 1. Encode text prompts and lyrics + text_hidden_states, text_attention_mask, lyric_hidden_states, lyric_attention_mask = self.encode_prompt( + prompt=prompt, + lyrics=lyrics, + device=device, + vocal_language=vocal_language, + audio_duration=audio_duration, + instruction=instruction, + max_text_length=max_text_length, + max_lyric_length=max_lyric_length, + ) + + # 2. Prepare latents (noise for flow matching, starts at t=1) + latent_length = int(audio_duration * 25) + acoustic_dim = self.transformer.config.audio_acoustic_hidden_dim + latents = self.prepare_latents( + batch_size=batch_size, + audio_duration=audio_duration, + dtype=dtype, + device=device, + generator=generator, + latents=latents, + ) + + # 3. Prepare reference audio (silence for basic text2music) + # Use a dummy silence reference for timbre encoder + timbre_fix_frame = 750 # Default from config + refer_audio_acoustic = torch.zeros( + batch_size, + timbre_fix_frame, + self.condition_encoder.config.timbre_hidden_dim, + device=device, + dtype=dtype, + ) + refer_audio_order_mask = torch.arange(batch_size, device=device, dtype=torch.long) + + # 4. Encode conditions + encoder_hidden_states, encoder_attention_mask = self.condition_encoder( + text_hidden_states=text_hidden_states, + text_attention_mask=text_attention_mask, + lyric_hidden_states=lyric_hidden_states, + lyric_attention_mask=lyric_attention_mask, + refer_audio_acoustic_hidden_states_packed=refer_audio_acoustic, + refer_audio_order_mask=refer_audio_order_mask, + ) + + # 5. Prepare context latents (silence src_latents + chunk_mask for text2music) + src_latents = torch.zeros(batch_size, latent_length, acoustic_dim, device=device, dtype=dtype) + # chunk_mask = 1 means "generate this region" (all 1s for text2music) + chunk_masks = torch.ones(batch_size, latent_length, acoustic_dim, device=device, dtype=dtype) + context_latents = torch.cat([src_latents, chunk_masks], dim=-1) + + # 6. Get timestep schedule + t_schedule = self._get_timestep_schedule( + num_inference_steps=num_inference_steps, + shift=shift, + device=device, + dtype=dtype, + ) + num_steps = len(t_schedule) + + # 7. 
Denoising loop (flow matching ODE) + xt = latents + with self.progress_bar(total=num_steps) as progress_bar: + for step_idx in range(num_steps): + current_timestep = t_schedule[step_idx].item() + t_curr_tensor = current_timestep * torch.ones((batch_size,), device=device, dtype=dtype) + + # DiT forward pass + model_output = self.transformer( + hidden_states=xt, + timestep=t_curr_tensor, + timestep_r=t_curr_tensor, + encoder_hidden_states=encoder_hidden_states, + context_latents=context_latents, + return_dict=False, + ) + vt = model_output[0] + + # On final step, directly compute x0 + if step_idx == num_steps - 1: + # x0 = xt - vt * t + xt = xt - vt * t_curr_tensor.unsqueeze(-1).unsqueeze(-1) + progress_bar.update() + break + + # Euler ODE step: x_{t-1} = x_t - v_t * dt + next_timestep = t_schedule[step_idx + 1].item() + dt = current_timestep - next_timestep + dt_tensor = dt * torch.ones((batch_size,), device=device, dtype=dtype).unsqueeze(-1).unsqueeze(-1) + xt = xt - vt * dt_tensor + + progress_bar.update() + + if callback is not None and step_idx % callback_steps == 0: + callback(step_idx, t_curr_tensor, xt) + + # 8. Post-processing: decode latents to audio + if output_type == "latent": + if not return_dict: + return (xt,) + return AudioPipelineOutput(audios=xt) + + # Decode latents to audio waveform using VAE + # VAE expects [B, C, T] format, our latents are [B, T, C] + audio_latents = xt.transpose(1, 2) # [B, T, C] -> [B, C, T] + audio = self.vae.decode(audio_latents).sample + + if output_type == "np": + audio = audio.cpu().float().numpy() + + self.maybe_free_model_hooks() + + if not return_dict: + return (audio,) + + return AudioPipelineOutput(audios=audio) diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 6c436161c5a7..da365526396e 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -1601,6 +1601,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class AceStepDiTModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class StableAudioDiTModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 4837faabde90..a891a7efde64 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -362,6 +362,36 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class AceStepConditionEncoder(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AceStepPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", 
"transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class AllegroPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/ace_step/__init__.py b/tests/pipelines/ace_step/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/ace_step/test_ace_step.py b/tests/pipelines/ace_step/test_ace_step.py new file mode 100644 index 000000000000..1c7162e02bfe --- /dev/null +++ b/tests/pipelines/ace_step/test_ace_step.py @@ -0,0 +1,335 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import torch + +from diffusers import AutoencoderOobleck +from diffusers.models.transformers.ace_step_transformer import AceStepDiTModel +from diffusers.pipelines.ace_step import AceStepConditionEncoder, AceStepPipeline + +from ...testing_utils import enable_full_determinism + + +enable_full_determinism() + + +class AceStepDiTModelTests(unittest.TestCase): + """Fast tests for the AceStepDiTModel (DiT transformer).""" + + def get_tiny_config(self): + return { + "hidden_size": 32, + "intermediate_size": 64, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "head_dim": 8, + "in_channels": 24, # audio_acoustic_hidden_dim * 3 (hidden + context_latents) + "audio_acoustic_hidden_dim": 8, + "patch_size": 2, + "max_position_embeddings": 256, + "rope_theta": 10000.0, + "attention_bias": False, + "attention_dropout": 0.0, + "rms_norm_eps": 1e-6, + "use_sliding_window": True, + "sliding_window": 16, + } + + def test_forward_shape(self): + """Test that the DiT model produces output with correct shape.""" + config = self.get_tiny_config() + model = AceStepDiTModel(**config) + model.eval() + + batch_size = 2 + seq_len = 8 + acoustic_dim = config["audio_acoustic_hidden_dim"] + hidden_size = config["hidden_size"] + encoder_seq_len = 10 + + hidden_states = torch.randn(batch_size, seq_len, acoustic_dim) + timestep = torch.rand(batch_size) + timestep_r = torch.rand(batch_size) + encoder_hidden_states = torch.randn(batch_size, encoder_seq_len, hidden_size) + # context_latents = src_latents + chunk_masks, each of dim acoustic_dim + context_latents = torch.randn(batch_size, seq_len, acoustic_dim * 2) + + with torch.no_grad(): + output = model( + hidden_states=hidden_states, + timestep=timestep, + timestep_r=timestep_r, + encoder_hidden_states=encoder_hidden_states, + context_latents=context_latents, + return_dict=False, + ) + + self.assertEqual(output[0].shape, (batch_size, seq_len, acoustic_dim)) + + def test_forward_return_dict(self): + """Test that return_dict=True returns a Transformer2DModelOutput.""" + config = self.get_tiny_config() + model = AceStepDiTModel(**config) + model.eval() + + batch_size = 1 + seq_len = 4 + acoustic_dim = config["audio_acoustic_hidden_dim"] + hidden_size = config["hidden_size"] + + hidden_states = torch.randn(batch_size, seq_len, acoustic_dim) + 
timestep = torch.rand(batch_size) + timestep_r = torch.rand(batch_size) + encoder_hidden_states = torch.randn(batch_size, 6, hidden_size) + context_latents = torch.randn(batch_size, seq_len, acoustic_dim * 2) + + with torch.no_grad(): + output = model( + hidden_states=hidden_states, + timestep=timestep, + timestep_r=timestep_r, + encoder_hidden_states=encoder_hidden_states, + context_latents=context_latents, + return_dict=True, + ) + + self.assertTrue(hasattr(output, "sample")) + self.assertEqual(output.sample.shape, (batch_size, seq_len, acoustic_dim)) + + +class AceStepConditionEncoderTests(unittest.TestCase): + """Fast tests for the AceStepConditionEncoder.""" + + def get_tiny_config(self): + return { + "hidden_size": 32, + "intermediate_size": 64, + "text_hidden_dim": 16, + "timbre_hidden_dim": 8, + "num_lyric_encoder_hidden_layers": 2, + "num_timbre_encoder_hidden_layers": 2, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "head_dim": 8, + "max_position_embeddings": 256, + "rope_theta": 10000.0, + "attention_bias": False, + "attention_dropout": 0.0, + "rms_norm_eps": 1e-6, + "use_sliding_window": False, + "sliding_window": 16, + } + + def test_forward_shape(self): + """Test that the condition encoder produces packed hidden states.""" + config = self.get_tiny_config() + encoder = AceStepConditionEncoder(**config) + encoder.eval() + + batch_size = 2 + text_seq_len = 8 + lyric_seq_len = 12 + text_dim = config["text_hidden_dim"] + timbre_dim = config["timbre_hidden_dim"] + timbre_time = 10 + + text_hidden_states = torch.randn(batch_size, text_seq_len, text_dim) + text_attention_mask = torch.ones(batch_size, text_seq_len) + lyric_hidden_states = torch.randn(batch_size, lyric_seq_len, text_dim) + lyric_attention_mask = torch.ones(batch_size, lyric_seq_len) + + # Packed reference audio: 3 references across 2 batch items + refer_audio = torch.randn(3, timbre_time, timbre_dim) + refer_order_mask = torch.tensor([0, 0, 1], dtype=torch.long) + + with torch.no_grad(): + enc_hidden, enc_mask = encoder( + text_hidden_states=text_hidden_states, + text_attention_mask=text_attention_mask, + lyric_hidden_states=lyric_hidden_states, + lyric_attention_mask=lyric_attention_mask, + refer_audio_acoustic_hidden_states_packed=refer_audio, + refer_audio_order_mask=refer_order_mask, + ) + + # Output should be packed: batch_size x (lyric + timbre + text seq_len) x hidden_size + self.assertEqual(enc_hidden.shape[0], batch_size) + self.assertEqual(enc_hidden.shape[2], config["hidden_size"]) + self.assertEqual(enc_mask.shape[0], batch_size) + self.assertEqual(enc_mask.shape[1], enc_hidden.shape[1]) + + +class AceStepPipelineFastTests(unittest.TestCase): + """Fast end-to-end tests for AceStepPipeline with tiny models.""" + + def get_dummy_components(self): + torch.manual_seed(0) + transformer = AceStepDiTModel( + hidden_size=32, + intermediate_size=64, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=8, + in_channels=24, + audio_acoustic_hidden_dim=8, + patch_size=2, + max_position_embeddings=256, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=16, + ) + + # Use T5 as a small text encoder for testing (d_model=32) + # In production, ACE-Step uses Qwen3-Embedding-0.6B (hidden_size=1024) + from transformers import T5EncoderModel, T5Tokenizer + + torch.manual_seed(0) + t5_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration" + text_encoder = T5EncoderModel.from_pretrained(t5_id) + tokenizer = T5Tokenizer.from_pretrained(t5_id, truncation=True, 
model_max_length=256) + text_hidden_dim = text_encoder.config.d_model # 32 + + torch.manual_seed(0) + condition_encoder = AceStepConditionEncoder( + hidden_size=32, + intermediate_size=64, + text_hidden_dim=text_hidden_dim, + timbre_hidden_dim=8, + num_lyric_encoder_hidden_layers=2, + num_timbre_encoder_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=8, + max_position_embeddings=256, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=16, + ) + + torch.manual_seed(0) + vae = AutoencoderOobleck( + encoder_hidden_size=6, + downsampling_ratios=[1, 2], + decoder_channels=3, + decoder_input_channels=8, + audio_channels=2, + channel_multiples=[2, 4], + sampling_rate=4, + ) + + components = { + "transformer": transformer, + "condition_encoder": condition_encoder, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + } + return components + + def test_ace_step_basic(self): + """Test basic text-to-music generation.""" + device = "cpu" + components = self.get_dummy_components() + pipe = AceStepPipeline(**components) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=device).manual_seed(0) + output = pipe( + prompt="A beautiful piano piece", + lyrics="[verse]\nSoft notes in the morning", + audio_duration=0.4, # Very short for fast test (10 latent frames at 25Hz) + num_inference_steps=2, + generator=generator, + max_text_length=32, + ) + audio = output.audios + self.assertIsNotNone(audio) + self.assertEqual(audio.ndim, 3) # [batch, channels, samples] + + def test_ace_step_batch(self): + """Test batch generation.""" + device = "cpu" + components = self.get_dummy_components() + pipe = AceStepPipeline(**components) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=device).manual_seed(42) + output = pipe( + prompt=["Piano piece", "Guitar solo"], + lyrics=["[verse]\nHello", "[chorus]\nWorld"], + audio_duration=0.4, + num_inference_steps=2, + generator=generator, + max_text_length=32, + ) + audio = output.audios + self.assertIsNotNone(audio) + self.assertEqual(audio.shape[0], 2) # batch size = 2 + + def test_ace_step_latent_output(self): + """Test that output_type='latent' returns latents.""" + device = "cpu" + components = self.get_dummy_components() + pipe = AceStepPipeline(**components) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=device).manual_seed(0) + output = pipe( + prompt="A test prompt", + lyrics="", + audio_duration=0.4, + num_inference_steps=2, + generator=generator, + output_type="latent", + max_text_length=32, + ) + latents = output.audios + self.assertIsNotNone(latents) + # Latent shape: [batch, latent_length, acoustic_dim] + self.assertEqual(latents.ndim, 3) + self.assertEqual(latents.shape[0], 1) + + def test_ace_step_return_dict_false(self): + """Test that return_dict=False returns a tuple.""" + device = "cpu" + components = self.get_dummy_components() + pipe = AceStepPipeline(**components) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=device).manual_seed(0) + output = pipe( + prompt="A test prompt", + lyrics="", + audio_duration=0.4, + num_inference_steps=2, + generator=generator, + return_dict=False, + max_text_length=32, + ) + self.assertIsInstance(output, tuple) + self.assertEqual(len(output), 1) + + +if __name__ == "__main__": + unittest.main() From 
79e5a4d81ef2d628fa1289c94c2a77665a9bd7bf Mon Sep 17 00:00:00 2001 From: chuxij Date: Sat, 7 Feb 2026 12:51:40 +0000 Subject: [PATCH 2/3] Update ACE-Step pipeline: improve tests and documentation - Add gradient checkpointing test for AceStepDiTModel - Add save/load config test for AceStepConditionEncoder - Enhance pipeline tests with PipelineTesterMixin - Update documentation to reflect ACE-Step 1.5 - Add comprehensive transformer model tests - Improve test coverage and code quality --- .../en/api/models/ace_step_transformer.md | 2 +- docs/source/en/api/pipelines/ace_step.md | 10 +- .../transformers/ace_step_transformer.py | 2 +- .../pipelines/ace_step/modeling_ace_step.py | 2 +- .../pipelines/ace_step/pipeline_ace_step.py | 2 +- src/diffusers/utils/dummy_pt_objects.py | 30 +-- .../test_models_transformer_ace_step.py | 118 ++++++++++ tests/pipelines/ace_step/test_ace_step.py | 210 +++++++++++++++++- 8 files changed, 342 insertions(+), 34 deletions(-) create mode 100644 tests/models/transformers/test_models_transformer_ace_step.py diff --git a/docs/source/en/api/models/ace_step_transformer.md b/docs/source/en/api/models/ace_step_transformer.md index 224d1a5777e7..b5c8e748a961 100644 --- a/docs/source/en/api/models/ace_step_transformer.md +++ b/docs/source/en/api/models/ace_step_transformer.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # AceStepDiTModel -A Diffusion Transformer (DiT) model for music generation from [ACE-Step](https://github.com/ACE-Step/ACE-Step). +A Diffusion Transformer (DiT) model for music generation from [ACE-Step 1.5](https://github.com/ACE-Step/ACE-Step-1.5). ## AceStepDiTModel diff --git a/docs/source/en/api/pipelines/ace_step.md b/docs/source/en/api/pipelines/ace_step.md index 9bf5109aaff4..313a1d29b1df 100644 --- a/docs/source/en/api/pipelines/ace_step.md +++ b/docs/source/en/api/pipelines/ace_step.md @@ -10,15 +10,15 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# ACE-Step +# ACE-Step 1.5 -ACE-Step was proposed in [ACE-Step: A Step Towards Music Generation Foundation Model](https://github.com/ACE-Step/ACE-Step) by the ACE-Step Team. It is a text-to-music generation model that generates high-quality music with lyrics from text prompts. +ACE-Step 1.5 was proposed in [ACE-Step: A Step Towards Music Generation Foundation Model](https://arxiv.org/abs/2602.00744) by the ACE-Step Team. It is a highly efficient open-source music foundation model that generates commercial-grade music with lyrics from text prompts. -ACE-Step generates variable-length stereo music at 48kHz from text prompts and optional lyrics. It comprises three components: an Oobleck autoencoder (VAE) that compresses waveforms into 25Hz latent representations, a Qwen3-based text encoder for text and lyric conditioning, and a Diffusion Transformer (DiT) model that operates in the latent space of the autoencoder using flow matching. +ACE-Step 1.5 generates variable-length stereo music at 48kHz (from 10 seconds to 10 minutes) from text prompts and optional lyrics. It comprises three components: an Oobleck autoencoder (VAE) that compresses waveforms into 25Hz latent representations, a Qwen3-based text encoder for text and lyric conditioning, and a Diffusion Transformer (DiT) model that operates in the latent space of the autoencoder using flow matching. 
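+
+The snippet below is a minimal text-to-music sketch mirroring the pipeline's example docstring; it uses the turbo checkpoint `ACE-Step/ACE-Step-v1-5-turbo` referenced there.
+
+```py
+import torch
+import soundfile as sf
+
+from diffusers import AceStepPipeline
+
+pipe = AceStepPipeline.from_pretrained("ACE-Step/ACE-Step-v1-5-turbo", torch_dtype=torch.bfloat16)
+pipe = pipe.to("cuda")
+
+audio = pipe(
+    prompt="A beautiful piano piece with soft melodies",
+    lyrics="[verse]\nSoft notes in the morning light\n[chorus]\nMusic fills the air tonight",
+    audio_duration=30.0,
+    num_inference_steps=8,
+).audios
+
+# `audios` is a (batch, channels, samples) tensor; soundfile expects (samples, channels) in float32
+sf.write("output.wav", audio[0].T.float().cpu().numpy(), 48000)
+```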
-The model supports multiple languages for lyrics including English, Chinese, Japanese, Korean, French, German, Spanish, Italian, Portuguese, Russian, and more. +The model supports 50+ languages for lyrics including English, Chinese, Japanese, Korean, French, German, Spanish, Italian, Portuguese, Russian, and more. It runs locally with less than 4GB of VRAM and generates a full song in under 2 seconds on an A100. -This pipeline was contributed by [ACE-Step Team](https://github.com/ACE-Step). The original codebase can be found at [ACE-Step/ACE-Step](https://github.com/ACE-Step/ACE-Step). +This pipeline was contributed by [ACE-Step Team](https://github.com/ACE-Step). The original codebase can be found at [ACE-Step/ACE-Step-1.5](https://github.com/ACE-Step/ACE-Step-1.5). ## Tips diff --git a/src/diffusers/models/transformers/ace_step_transformer.py b/src/diffusers/models/transformers/ace_step_transformer.py index 3d7a0e4fa95d..477a9a003571 100644 --- a/src/diffusers/models/transformers/ace_step_transformer.py +++ b/src/diffusers/models/transformers/ace_step_transformer.py @@ -453,7 +453,7 @@ def forward( class AceStepDiTModel(ModelMixin, ConfigMixin): """ - The Diffusion Transformer (DiT) model for ACE-Step music generation. + The Diffusion Transformer (DiT) model for ACE-Step 1.5 music generation. This model generates audio latents conditioned on text, lyrics, and timbre. It uses patch-based processing with transformer layers, timestep conditioning via AdaLN, and cross-attention to encoder outputs. diff --git a/src/diffusers/pipelines/ace_step/modeling_ace_step.py b/src/diffusers/pipelines/ace_step/modeling_ace_step.py index 0f3fbd642b19..42c7188914b1 100644 --- a/src/diffusers/pipelines/ace_step/modeling_ace_step.py +++ b/src/diffusers/pipelines/ace_step/modeling_ace_step.py @@ -13,7 +13,7 @@ # limitations under the License. """ -Pipeline-specific models for ACE-Step: ConditionEncoder, LyricEncoder, TimbreEncoder, AudioTokenizer, and +Pipeline-specific models for ACE-Step 1.5: ConditionEncoder, LyricEncoder, TimbreEncoder, AudioTokenizer, and AudioTokenDetokenizer. These models are used within the AceStepPipeline to encode conditioning inputs (text, lyrics, timbre) for diff --git a/src/diffusers/pipelines/ace_step/pipeline_ace_step.py b/src/diffusers/pipelines/ace_step/pipeline_ace_step.py index 8c21dcb80c15..3f4bed999ee7 100644 --- a/src/diffusers/pipelines/ace_step/pipeline_ace_step.py +++ b/src/diffusers/pipelines/ace_step/pipeline_ace_step.py @@ -90,7 +90,7 @@ class AceStepPipeline(DiffusionPipeline): r""" - Pipeline for text-to-music generation using ACE-Step. + Pipeline for text-to-music generation using ACE-Step 1.5. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index da365526396e..d6d9b2364830 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -311,6 +311,21 @@ def apply_taylorseer_cache(*args, **kwargs): requires_backends(apply_taylorseer_cache, ["torch"]) +class AceStepDiTModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class AllegroTransformer3DModel(metaclass=DummyObject): _backends = ["torch"] @@ -1601,21 +1616,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) -class AceStepDiTModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - class StableAudioDiTModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/transformers/test_models_transformer_ace_step.py b/tests/models/transformers/test_models_transformer_ace_step.py new file mode 100644 index 000000000000..366494c03010 --- /dev/null +++ b/tests/models/transformers/test_models_transformer_ace_step.py @@ -0,0 +1,118 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch + +from diffusers.models.transformers.ace_step_transformer import AceStepDiTModel +from diffusers.utils.torch_utils import randn_tensor + +from ...testing_utils import enable_full_determinism, torch_device +from ..testing_utils import ( + BaseModelTesterConfig, + MemoryTesterMixin, + ModelTesterMixin, + TrainingTesterMixin, +) + + +enable_full_determinism() + + +class AceStepDiTModelTesterConfig(BaseModelTesterConfig): + @property + def model_class(self): + return AceStepDiTModel + + @property + def output_shape(self): + # (seq_len, acoustic_dim) + return (8, 8) + + @property + def input_shape(self): + return (8, 8) + + @property + def model_split_percents(self): + return [0.9] + + @property + def main_input_name(self): + return "hidden_states" + + @property + def generator(self): + return torch.Generator("cpu").manual_seed(0) + + def get_init_dict(self): + return { + "hidden_size": 32, + "intermediate_size": 64, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "head_dim": 8, + "in_channels": 24, + "audio_acoustic_hidden_dim": 8, + "patch_size": 2, + "max_position_embeddings": 256, + "rope_theta": 10000.0, + "attention_bias": False, + "attention_dropout": 0.0, + "rms_norm_eps": 1e-6, + "use_sliding_window": False, + "sliding_window": 16, + } + + def get_dummy_inputs(self): + batch_size = 1 + seq_len = 8 + acoustic_dim = 8 + hidden_size = 32 + encoder_seq_len = 10 + + return { + "hidden_states": randn_tensor( + (batch_size, seq_len, acoustic_dim), generator=self.generator, device=torch_device + ), + "timestep": torch.tensor([0.5], device=torch_device).expand(batch_size), + "timestep_r": torch.tensor([0.3], device=torch_device).expand(batch_size), + "encoder_hidden_states": randn_tensor( + (batch_size, encoder_seq_len, hidden_size), generator=self.generator, device=torch_device + ), + "context_latents": randn_tensor( + (batch_size, seq_len, acoustic_dim * 2), generator=self.generator, device=torch_device + ), + } + + +class TestAceStepDiTModel(AceStepDiTModelTesterConfig, ModelTesterMixin): + """Core model tests for AceStepDiTModel.""" + + def _check_dtype_inference_output(self, output, output_loaded, dtype, atol=2e-2, rtol=0): + """Increase tolerance for half-precision inference with tiny random models.""" + super()._check_dtype_inference_output(output, output_loaded, dtype, atol=atol, rtol=rtol) + + +class TestAceStepDiTModelMemory(AceStepDiTModelTesterConfig, MemoryTesterMixin): + """Memory optimization tests for AceStepDiTModel.""" + + pass + + +class TestAceStepDiTModelTraining(AceStepDiTModelTesterConfig, TrainingTesterMixin): + """Training tests for AceStepDiTModel.""" + + pass diff --git a/tests/pipelines/ace_step/test_ace_step.py b/tests/pipelines/ace_step/test_ace_step.py index 1c7162e02bfe..0a72a1de65cb 100644 --- a/tests/pipelines/ace_step/test_ace_step.py +++ b/tests/pipelines/ace_step/test_ace_step.py @@ -17,12 +17,14 @@ import unittest import torch +from transformers import AutoTokenizer, Qwen3Config, Qwen3Model from diffusers import AutoencoderOobleck from diffusers.models.transformers.ace_step_transformer import AceStepDiTModel from diffusers.pipelines.ace_step import AceStepConditionEncoder, AceStepPipeline from ...testing_utils import enable_full_determinism +from ..test_pipelines_common import PipelineTesterMixin enable_full_determinism() @@ -112,6 +114,13 @@ def test_forward_return_dict(self): self.assertTrue(hasattr(output, "sample")) self.assertEqual(output.sample.shape, (batch_size, seq_len, acoustic_dim)) + def 
test_gradient_checkpointing(self): + """Test that gradient checkpointing can be enabled.""" + config = self.get_tiny_config() + model = AceStepDiTModel(**config) + model.enable_gradient_checkpointing() + self.assertTrue(model.gradient_checkpointing) + class AceStepConditionEncoderTests(unittest.TestCase): """Fast tests for the AceStepConditionEncoder.""" @@ -174,10 +183,52 @@ def test_forward_shape(self): self.assertEqual(enc_mask.shape[0], batch_size) self.assertEqual(enc_mask.shape[1], enc_hidden.shape[1]) + def test_save_load_config(self): + """Test that the condition encoder config can be saved and loaded.""" + import tempfile + + config = self.get_tiny_config() + encoder = AceStepConditionEncoder(**config) + + with tempfile.TemporaryDirectory() as tmpdir: + encoder.save_config(tmpdir) + loaded = AceStepConditionEncoder.from_config(tmpdir) + + self.assertEqual(encoder.config.hidden_size, loaded.config.hidden_size) + self.assertEqual(encoder.config.text_hidden_dim, loaded.config.text_hidden_dim) + self.assertEqual(encoder.config.timbre_hidden_dim, loaded.config.timbre_hidden_dim) -class AceStepPipelineFastTests(unittest.TestCase): + +class AceStepPipelineFastTests(PipelineTesterMixin, unittest.TestCase): """Fast end-to-end tests for AceStepPipeline with tiny models.""" + pipeline_class = AceStepPipeline + params = frozenset( + [ + "prompt", + "lyrics", + "audio_duration", + "vocal_language", + "guidance_scale", + "shift", + ] + ) + batch_params = frozenset(["prompt", "lyrics"]) + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "output_type", + "return_dict", + ] + ) + + # ACE-Step uses custom attention, not standard diffusers attention processors + test_attention_slicing = False + test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) transformer = AceStepDiTModel( @@ -196,15 +247,21 @@ def get_dummy_components(self): sliding_window=16, ) - # Use T5 as a small text encoder for testing (d_model=32) - # In production, ACE-Step uses Qwen3-Embedding-0.6B (hidden_size=1024) - from transformers import T5EncoderModel, T5Tokenizer - + # Create a tiny Qwen3Model for testing (matching the real Qwen3-Embedding-0.6B architecture) torch.manual_seed(0) - t5_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration" - text_encoder = T5EncoderModel.from_pretrained(t5_id) - tokenizer = T5Tokenizer.from_pretrained(t5_id, truncation=True, model_max_length=256) - text_hidden_dim = text_encoder.config.d_model # 32 + qwen3_config = Qwen3Config( + hidden_size=32, + intermediate_size=64, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=8, + vocab_size=151936, # Qwen3 vocab size + max_position_embeddings=256, + ) + text_encoder = Qwen3Model(qwen3_config) + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B") + text_hidden_dim = qwen3_config.hidden_size # 32 torch.manual_seed(0) condition_encoder = AceStepConditionEncoder( @@ -243,6 +300,21 @@ def get_dummy_components(self): } return components + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A beautiful piano piece", + "lyrics": "[verse]\nSoft notes in the morning", + "audio_duration": 0.4, # Very short for fast test (10 latent frames at 25Hz) + "num_inference_steps": 2, + "generator": generator, + "max_text_length": 32, + } + return 
inputs + def test_ace_step_basic(self): """Test basic text-to-music generation.""" device = "cpu" @@ -255,7 +327,7 @@ def test_ace_step_basic(self): output = pipe( prompt="A beautiful piano piece", lyrics="[verse]\nSoft notes in the morning", - audio_duration=0.4, # Very short for fast test (10 latent frames at 25Hz) + audio_duration=0.4, num_inference_steps=2, generator=generator, max_text_length=32, @@ -330,6 +402,124 @@ def test_ace_step_return_dict_false(self): self.assertIsInstance(output, tuple) self.assertEqual(len(output), 1) + def test_save_load_local(self, expected_max_difference=7e-3): + # increase tolerance to account for large composite model + super().test_save_load_local(expected_max_difference=expected_max_difference) + + def test_save_load_optional_components(self, expected_max_difference=7e-3): + # increase tolerance to account for large composite model + super().test_save_load_optional_components(expected_max_difference=expected_max_difference) + + def test_inference_batch_single_identical(self, batch_size=3, expected_max_diff=7e-3): + # increase tolerance for audio pipeline + super().test_inference_batch_single_identical(batch_size=batch_size, expected_max_diff=expected_max_diff) + + def test_dict_tuple_outputs_equivalent(self, expected_slice=None, expected_max_difference=7e-3): + # increase tolerance for audio pipeline + super().test_dict_tuple_outputs_equivalent( + expected_slice=expected_slice, expected_max_difference=expected_max_difference + ) + + # ACE-Step does not use num_images_per_prompt + def test_num_images_per_prompt(self): + pass + + # ACE-Step does not use standard schedulers + @unittest.skip("ACE-Step uses built-in flow matching schedule, not diffusers schedulers") + def test_karras_schedulers_shape(self): + pass + + # ACE-Step does not support prompt_embeds directly + @unittest.skip("ACE-Step does not support prompt_embeds / negative_prompt_embeds") + def test_cfg(self): + pass + + def test_float16_inference(self, expected_max_diff=5e-2): + super().test_float16_inference(expected_max_diff=expected_max_diff) + + @unittest.skip("ACE-Step __call__ does not accept prompt_embeds, so encode_prompt isolation test is not applicable") + def test_encode_prompt_works_in_isolation(self): + pass + + @unittest.skip("Sequential CPU offloading produces NaN with tiny random models") + def test_sequential_cpu_offload_forward_pass(self): + pass + + @unittest.skip("Sequential CPU offloading produces NaN with tiny random models") + def test_sequential_offload_forward_pass_twice(self): + pass + + def test_encode_prompt(self): + """Test that encode_prompt returns correct shapes.""" + device = "cpu" + components = self.get_dummy_components() + pipe = AceStepPipeline(**components) + pipe = pipe.to(device) + + text_hidden, text_mask, lyric_hidden, lyric_mask = pipe.encode_prompt( + prompt="A test prompt", + lyrics="[verse]\nHello world", + device=device, + max_text_length=32, + max_lyric_length=64, + ) + + self.assertEqual(text_hidden.ndim, 3) # [batch, seq_len, hidden_dim] + self.assertEqual(text_mask.ndim, 2) # [batch, seq_len] + self.assertEqual(lyric_hidden.ndim, 3) + self.assertEqual(lyric_mask.ndim, 2) + self.assertEqual(text_hidden.shape[0], 1) + self.assertEqual(lyric_hidden.shape[0], 1) + + def test_prepare_latents(self): + """Test that prepare_latents returns correct shapes.""" + device = "cpu" + components = self.get_dummy_components() + pipe = AceStepPipeline(**components) + pipe = pipe.to(device) + + latents = pipe.prepare_latents( + batch_size=2, + 
audio_duration=1.0, + dtype=torch.float32, + device=device, + ) + + # 25 Hz latent rate, 1s duration -> 25 frames + self.assertEqual(latents.shape, (2, 25, 8)) + + def test_timestep_schedule(self): + """Test that the timestep schedule is generated correctly.""" + components = self.get_dummy_components() + pipe = AceStepPipeline(**components) + + # Test standard schedule + schedule = pipe._get_timestep_schedule(num_inference_steps=8, shift=3.0) + self.assertEqual(len(schedule), 8) + self.assertAlmostEqual(schedule[0].item(), 1.0, places=5) + + # Test truncated schedule + schedule = pipe._get_timestep_schedule(num_inference_steps=4, shift=3.0) + self.assertEqual(len(schedule), 4) + + def test_format_prompt(self): + """Test that prompt formatting works correctly.""" + components = self.get_dummy_components() + pipe = AceStepPipeline(**components) + + text, lyrics = pipe._format_prompt( + prompt="A piano piece", + lyrics="[verse]\nHello", + vocal_language="en", + audio_duration=30.0, + ) + + self.assertIn("A piano piece", text) + self.assertIn("30 seconds", text) + self.assertIn("[verse]", lyrics) + self.assertIn("Hello", lyrics) + self.assertIn("en", lyrics) + if __name__ == "__main__": unittest.main() From 8c2db469b7efeed45ec244f66f0d9678f90a9924 Mon Sep 17 00:00:00 2001 From: chuxij Date: Sat, 7 Feb 2026 14:17:51 +0000 Subject: [PATCH 3/3] Enhance ACE-Step pipeline with multi-task support - Add support for multiple task types: text2music, repaint, cover, extract, lego, complete - Add audio normalization and preprocessing utilities - Add tiled encode/decode for handling long audio sequences - Add reference audio support for timbre transfer in cover task - Add repaint functionality for regenerating audio sections - Add metadata handling (BPM, keyscale, timesignature) - Add audio code parsing and chunk mask building utilities - Improve documentation with multi-task usage examples --- .../pipelines/ace_step/pipeline_ace_step.py | 867 ++++++++++++++++-- 1 file changed, 800 insertions(+), 67 deletions(-) diff --git a/src/diffusers/pipelines/ace_step/pipeline_ace_step.py b/src/diffusers/pipelines/ace_step/pipeline_ace_step.py index 3f4bed999ee7..e3334f58d529 100644 --- a/src/diffusers/pipelines/ace_step/pipeline_ace_step.py +++ b/src/diffusers/pipelines/ace_step/pipeline_ace_step.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import math +import re from typing import Callable, List, Optional, Tuple, Union import torch +import torch.nn.functional as F from transformers import PreTrainedModel, PreTrainedTokenizerFast from ...models import AutoencoderOobleck @@ -37,6 +40,25 @@ DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:" +# Task-specific instruction templates (from ACE-Step constants) +TASK_INSTRUCTIONS = { + "text2music": "Fill the audio semantic mask based on the given conditions:", + "repaint": "Repaint the mask area based on the given conditions:", + "cover": "Generate audio semantic tokens based on the given conditions:", + "extract": "Extract the {TRACK_NAME} track from the audio:", + "extract_default": "Extract the track from the audio:", + "lego": "Generate the {TRACK_NAME} track based on the audio context:", + "lego_default": "Generate the track based on the audio context:", + "complete": "Complete the input track with {TRACK_CLASSES}:", + "complete_default": "Complete the input track:", +} + +# Valid task types +TASK_TYPES = ["text2music", "repaint", "cover", "extract", "lego", "complete"] + +# Sample rate used by ACE-Step +SAMPLE_RATE = 48000 + # Pre-defined timestep schedules for the turbo model (fix_nfe=8) SHIFT_TIMESTEPS = { 1.0: [1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125], @@ -74,16 +96,43 @@ >>> pipe = AceStepPipeline.from_pretrained("ACE-Step/ACE-Step-v1-5-turbo", torch_dtype=torch.bfloat16) >>> pipe = pipe.to("cuda") - >>> # Generate music from text + >>> # Text-to-music generation with metadata >>> audio = pipe( ... prompt="A beautiful piano piece with soft melodies", ... lyrics="[verse]\\nSoft notes in the morning light\\n[chorus]\\nMusic fills the air tonight", ... audio_duration=30.0, ... num_inference_steps=8, + ... bpm=120, + ... keyscale="C major", + ... timesignature="4", ... ).audios >>> # Save the generated audio - >>> sf.write("output.wav", audio[0].T.cpu().numpy(), 48000) + >>> sf.write("output.wav", audio[0, 0].cpu().numpy(), 48000) + + >>> # Repaint task: regenerate a section of existing audio + >>> import torchaudio + >>> src_audio, sr = torchaudio.load("input.wav") + >>> src_audio = AceStepPipeline._normalize_audio_to_stereo_48k(src_audio, sr) + >>> audio = pipe( + ... prompt="Epic rock guitar solo", + ... lyrics="", + ... task_type="repaint", + ... src_audio=src_audio, + ... repainting_start=10.0, + ... repainting_end=20.0, + ... ).audios + + >>> # Cover task with reference audio for timbre transfer + >>> ref_audio, sr = torchaudio.load("reference.wav") + >>> ref_audio = AceStepPipeline._normalize_audio_to_stereo_48k(ref_audio, sr) + >>> audio = pipe( + ... prompt="Pop song with bright vocals", + ... lyrics="[verse]\\nHello world", + ... task_type="cover", + ... reference_audio=ref_audio, + ... audio_cover_strength=0.8, + ... ).audios ``` """ @@ -98,6 +147,14 @@ class AceStepPipeline(DiffusionPipeline): The pipeline uses flow matching with a custom timestep schedule for the diffusion process. The turbo model variant uses 8 inference steps by default. + Supported task types: + - `"text2music"`: Generate music from text prompts and lyrics. + - `"cover"`: Generate audio from semantic codes or with timbre transfer from reference audio. + - `"repaint"`: Regenerate a section of existing audio while keeping the rest. + - `"extract"`: Extract a specific track (e.g., vocals, drums) from audio. + - `"lego"`: Generate a specific track based on audio context. + - `"complete"`: Complete an input audio with additional tracks. 
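Reviewer note (not part of the patch): a small illustration of how the task types listed above resolve to instruction text. It assumes the `TASK_INSTRUCTIONS` templates defined earlier in this file, the `_get_task_instruction` static method added further down, and the top-level `AceStepPipeline` export introduced by this PR.

```python
# Sketch only: shows how the instruction templates resolve for a few task types.
from diffusers import AceStepPipeline  # export added in this PR

print(AceStepPipeline._get_task_instruction("text2music"))
# Fill the audio semantic mask based on the given conditions:

print(AceStepPipeline._get_task_instruction("extract", track_name="vocals"))
# Extract the VOCALS track from the audio:

print(AceStepPipeline._get_task_instruction("complete", complete_track_classes=["drums", "bass"]))
# Complete the input track with DRUMS | BASS:
```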
+ Args: vae ([`AutoencoderOobleck`]): Variational Auto-Encoder (VAE) model to encode and decode audio waveforms to and from latent @@ -132,13 +189,85 @@ def __init__( condition_encoder=condition_encoder, ) + @staticmethod + def _get_task_instruction( + task_type: str = "text2music", + track_name: Optional[str] = None, + complete_track_classes: Optional[List[str]] = None, + ) -> str: + """ + Get the instruction text for a specific task type. + + Args: + task_type (`str`, *optional*, defaults to `"text2music"`): + The task type. One of `"text2music"`, `"cover"`, `"repaint"`, `"extract"`, `"lego"`, `"complete"`. + track_name (`str`, *optional*): + Track name for extract/lego tasks (e.g., `"vocals"`, `"drums"`). + complete_track_classes (`List[str]`, *optional*): + Track classes for complete task. + + Returns: + `str`: The instruction text for the task. + """ + if task_type == "extract": + if track_name: + return TASK_INSTRUCTIONS["extract"].format(TRACK_NAME=track_name.upper()) + return TASK_INSTRUCTIONS["extract_default"] + elif task_type == "lego": + if track_name: + return TASK_INSTRUCTIONS["lego"].format(TRACK_NAME=track_name.upper()) + return TASK_INSTRUCTIONS["lego_default"] + elif task_type == "complete": + if complete_track_classes and len(complete_track_classes) > 0: + classes_str = " | ".join(t.upper() for t in complete_track_classes) + return TASK_INSTRUCTIONS["complete"].format(TRACK_CLASSES=classes_str) + return TASK_INSTRUCTIONS["complete_default"] + elif task_type in TASK_INSTRUCTIONS: + return TASK_INSTRUCTIONS[task_type] + return TASK_INSTRUCTIONS["text2music"] + + @staticmethod + def _build_metadata_string( + bpm: Optional[int] = None, + keyscale: Optional[str] = None, + timesignature: Optional[str] = None, + audio_duration: Optional[float] = None, + ) -> str: + """ + Build the metadata string for the SFT prompt template. + + Matches the original ACE-Step handler `_dict_to_meta_string` format. + + Args: + bpm (`int`, *optional*): BPM value. Uses `"N/A"` if `None`. + keyscale (`str`, *optional*): Musical key (e.g., `"C major"`). Uses `"N/A"` if empty. + timesignature (`str`, *optional*): Time signature (e.g., `"4"`). Uses `"N/A"` if empty. + audio_duration (`float`, *optional*): Duration in seconds. + + Returns: + `str`: Formatted metadata string. + """ + bpm_str = str(bpm) if bpm is not None and bpm > 0 else "N/A" + ts_str = timesignature if timesignature and timesignature.strip() else "N/A" + ks_str = keyscale if keyscale and keyscale.strip() else "N/A" + + if audio_duration is not None and audio_duration > 0: + dur_str = f"{int(audio_duration)} seconds" + else: + dur_str = "30 seconds" + + return f"- bpm: {bpm_str}\n- timesignature: {ts_str}\n- keyscale: {ks_str}\n- duration: {dur_str}\n" + def _format_prompt( self, prompt: str, lyrics: str = "", vocal_language: str = "en", audio_duration: float = 60.0, - instruction: str = None, + instruction: Optional[str] = None, + bpm: Optional[int] = None, + keyscale: Optional[str] = None, + timesignature: Optional[str] = None, ) -> Tuple[str, str]: """ Format the prompt and lyrics into the expected text encoder input format. @@ -147,14 +276,17 @@ def _format_prompt( separate format with language header and lyric content, matching the original ACE-Step handler. Args: - prompt: Text caption describing the music. - lyrics: Lyric text. - vocal_language: Language code for lyrics. - audio_duration: Duration of the audio in seconds. - instruction: Instruction text for generation. + prompt (`str`): Text caption describing the music. 
+ lyrics (`str`, *optional*, defaults to `""`): Lyric text. + vocal_language (`str`, *optional*, defaults to `"en"`): Language code for lyrics. + audio_duration (`float`, *optional*, defaults to 60.0): Duration of the audio in seconds. + instruction (`str`, *optional*): Instruction text for generation. + bpm (`int`, *optional*): BPM (beats per minute). + keyscale (`str`, *optional*): Musical key (e.g., `"C major"`). + timesignature (`str`, *optional*): Time signature (e.g., `"4"`). Returns: - Tuple of (formatted_text, formatted_lyrics). + Tuple of `(formatted_text, formatted_lyrics)`. """ if instruction is None: instruction = DEFAULT_DIT_INSTRUCTION @@ -163,9 +295,13 @@ def _format_prompt( if not instruction.endswith(":"): instruction = instruction + ":" - # Build metadata string in the original multi-line format - # Matches handler.py _dict_to_meta_string output - metas_str = f"- bpm: N/A\n- timesignature: N/A\n- keyscale: N/A\n- duration: {int(audio_duration)} seconds\n" + # Build metadata string + metas_str = self._build_metadata_string( + bpm=bpm, + keyscale=keyscale, + timesignature=timesignature, + audio_duration=audio_duration, + ) # Format text prompt using SFT template formatted_text = SFT_GEN_PROMPT.format(instruction, prompt, metas_str) @@ -184,6 +320,9 @@ def encode_prompt( vocal_language: Union[str, List[str]] = "en", audio_duration: float = 60.0, instruction: Optional[str] = None, + bpm: Optional[int] = None, + keyscale: Optional[str] = None, + timesignature: Optional[str] = None, max_text_length: int = 256, max_lyric_length: int = 2048, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: @@ -207,6 +346,12 @@ def encode_prompt( Duration of the audio in seconds. instruction (`str`, *optional*): Instruction text for generation. + bpm (`int`, *optional*): + BPM (beats per minute) for metadata. + keyscale (`str`, *optional*): + Musical key (e.g., `"C major"`). + timesignature (`str`, *optional*): + Time signature (e.g., `"4"` for 4/4). max_text_length (`int`, *optional*, defaults to 256): Maximum token length for text prompts. max_lyric_length (`int`, *optional*, defaults to 2048): @@ -233,6 +378,9 @@ def encode_prompt( vocal_language=vocal_language[i], audio_duration=audio_duration, instruction=instruction, + bpm=bpm, + keyscale=keyscale, + timesignature=timesignature, ) all_text_strs.append(text_str) all_lyric_strs.append(lyric_str) @@ -284,12 +432,12 @@ def prepare_latents( Prepare initial noise latents for the flow matching process. Args: - batch_size: Number of samples to generate. - audio_duration: Duration of audio in seconds. - dtype: Data type for the latents. - device: Device for the latents. - generator: Random number generator(s). - latents: Pre-generated latents. + batch_size (`int`): Number of samples to generate. + audio_duration (`float`): Duration of audio in seconds. + dtype (`torch.dtype`): Data type for the latents. + device (`torch.device`): Device for the latents. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): Random number generator(s). + latents (`torch.Tensor`, *optional*): Pre-generated latents. Returns: Noise latents of shape `(batch_size, latent_length, acoustic_dim)`. @@ -311,6 +459,7 @@ def _get_timestep_schedule( shift: float = 3.0, device: torch.device = None, dtype: torch.dtype = None, + timesteps: Optional[List[float]] = None, ) -> torch.Tensor: """ Get the timestep schedule for the flow matching process. @@ -319,14 +468,22 @@ def _get_timestep_schedule( to t=0 (clean data). 
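Reviewer note (not part of the patch): a minimal sketch of the conventional flow-matching time shift, included only to build intuition for what the `SHIFT_TIMESTEPS` tables encode. The pipeline itself snaps `shift` to the nearest value in `VALID_SHIFTS` and uses the pre-defined table, so this formula is an approximation of the intent, not the implementation.

```python
import torch


def shifted_schedule(num_inference_steps: int = 8, shift: float = 3.0) -> torch.Tensor:
    # Uniform schedule from t=1 (noise) down to t=1/N, then the standard shift mapping.
    t = torch.linspace(1.0, 1.0 / num_inference_steps, num_inference_steps)
    return shift * t / (1.0 + (shift - 1.0) * t)


print(shifted_schedule(8, 1.0))  # identity: 1.000, 0.875, ..., 0.125 (matches SHIFT_TIMESTEPS[1.0])
print(shifted_schedule(8, 3.0))  # values stay closer to t=1, spending more steps in the high-noise regime
```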
Args: - num_inference_steps: Number of denoising steps. - shift: Shift parameter controlling the timestep distribution (1.0, 2.0, or 3.0). - device: Device for the schedule tensor. - dtype: Data type for the schedule tensor. + num_inference_steps (`int`, *optional*, defaults to 8): + Number of denoising steps. + shift (`float`, *optional*, defaults to 3.0): + Shift parameter controlling the timestep distribution (1.0, 2.0, or 3.0). + device (`torch.device`, *optional*): Device for the schedule tensor. + dtype (`torch.dtype`, *optional*): Data type for the schedule tensor. + timesteps (`List[float]`, *optional*): + Custom timestep schedule. If provided, overrides `num_inference_steps` and `shift`. Returns: - Tensor of timestep values. + `torch.Tensor`: Tensor of timestep values. """ + # Use custom timesteps if provided + if timesteps is not None: + return torch.tensor(timesteps, device=device, dtype=dtype) + # Use pre-defined schedules for known shift values original_shift = shift shift = min(VALID_SHIFTS, key=lambda x: abs(x - shift)) @@ -344,6 +501,329 @@ def _get_timestep_schedule( return torch.tensor(t_schedule_list, device=device, dtype=dtype) + @staticmethod + def _normalize_audio_to_stereo_48k(audio: torch.Tensor, sr: int) -> torch.Tensor: + """ + Normalize audio to stereo 48kHz format. + + Args: + audio (`torch.Tensor`): Audio tensor of shape `[channels, samples]` or `[samples]`. + sr (`int`): Original sample rate. + + Returns: + `torch.Tensor`: Normalized audio tensor of shape `[2, samples]` at 48kHz. + """ + if audio.dim() == 1: + audio = audio.unsqueeze(0) + if audio.shape[0] == 1: + audio = torch.cat([audio, audio], dim=0) + audio = audio[:2] + + if sr != SAMPLE_RATE: + try: + import torchaudio + + audio = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(audio) + except ImportError: + # Simple linear resampling fallback + target_len = int(audio.shape[-1] * SAMPLE_RATE / sr) + audio = F.interpolate(audio.unsqueeze(0), size=target_len, mode="linear", align_corners=False)[0] + + audio = torch.clamp(audio, -1.0, 1.0) + return audio + + def _encode_audio_to_latents(self, audio: torch.Tensor, device: torch.device, dtype: torch.dtype) -> torch.Tensor: + """ + Encode audio waveform to VAE latents using tiled encoding for memory efficiency. + + Args: + audio (`torch.Tensor`): Audio tensor of shape `[channels, samples]` or `[batch, channels, samples]`. + device (`torch.device`): Target device. + dtype (`torch.dtype`): Target dtype. + + Returns: + `torch.Tensor`: Latents of shape `[T, D]` or `[batch, T, D]`. + """ + input_was_2d = audio.dim() == 2 + if input_was_2d: + audio = audio.unsqueeze(0) + + # Tiled encode for memory efficiency + audio = audio.to(device=device, dtype=self.vae.dtype) + latents = self._tiled_encode(audio) + + # Transpose: [batch, D, T] -> [batch, T, D] + latents = latents.transpose(1, 2).to(dtype=dtype) + + if input_was_2d: + latents = latents.squeeze(0) + return latents + + def _tiled_encode( + self, + audio: torch.Tensor, + chunk_size: int = 48000 * 30, + overlap: int = 48000 * 2, + ) -> torch.Tensor: + """ + Encode audio to latents using tiling to reduce VRAM usage. + + Args: + audio (`torch.Tensor`): Audio tensor of shape `[batch, channels, samples]`. + chunk_size (`int`, *optional*): Size of audio chunk to process at once (in samples). + overlap (`int`, *optional*): Overlap size in audio samples. + + Returns: + `torch.Tensor`: Latents of shape `[batch, channels, T]`. 
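Reviewer note (not part of the patch): the window/trim arithmetic behind the tiled encode, spelled out with the default `chunk_size` and `overlap` values and the 25 Hz latent rate used elsewhere in this file.

```python
# Sketch only: default tiling parameters for the tiled VAE encode.
SAMPLE_RATE = 48000
chunk_size = SAMPLE_RATE * 30      # 30 s of audio per VAE call
overlap = SAMPLE_RATE * 2          # 2 s of extra context on each side of a chunk
stride = chunk_size - 2 * overlap  # 26 s of "core" audio advanced per step

# At 25 latent frames per second, one latent frame covers 1920 samples, so the
# 2 s overlap added on each side is trimmed back off as 50 latent frames.
downsample_factor = SAMPLE_RATE / 25
print(stride, overlap / downsample_factor)  # 1248000 50.0
```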
+ """ + _B, _C, S = audio.shape + + if S <= chunk_size: + with torch.no_grad(): + return self.vae.encode(audio).latent_dist.sample() + + stride = chunk_size - 2 * overlap + if stride <= 0: + raise ValueError(f"chunk_size {chunk_size} must be > 2 * overlap {overlap}") + + num_steps = math.ceil(S / stride) + encoded_latent_list = [] + downsample_factor = None + + for i in range(num_steps): + core_start = i * stride + core_end = min(core_start + stride, S) + win_start = max(0, core_start - overlap) + win_end = min(S, core_end + overlap) + + audio_chunk = audio[:, :, win_start:win_end] + with torch.no_grad(): + latent_chunk = self.vae.encode(audio_chunk).latent_dist.sample() + + if downsample_factor is None: + downsample_factor = audio_chunk.shape[-1] / latent_chunk.shape[-1] + + added_start = core_start - win_start + trim_start = int(round(added_start / downsample_factor)) + added_end = win_end - core_end + trim_end = int(round(added_end / downsample_factor)) + + latent_len = latent_chunk.shape[-1] + end_idx = latent_len - trim_end if trim_end > 0 else latent_len + encoded_latent_list.append(latent_chunk[:, :, trim_start:end_idx]) + + return torch.cat(encoded_latent_list, dim=-1) + + def _tiled_decode( + self, + latents: torch.Tensor, + chunk_size: int = 512, + overlap: int = 64, + ) -> torch.Tensor: + """ + Decode latents to audio using tiling to reduce VRAM usage. + + Args: + latents (`torch.Tensor`): Latents of shape `[batch, channels, T]`. + chunk_size (`int`, *optional*): Size of latent chunk to process at once. + overlap (`int`, *optional*): Overlap size in latent frames. + + Returns: + `torch.Tensor`: Audio of shape `[batch, channels, samples]`. + """ + _B, _C, T = latents.shape + + if T <= chunk_size: + return self.vae.decode(latents).sample + + stride = chunk_size - 2 * overlap + if stride <= 0: + raise ValueError(f"chunk_size {chunk_size} must be > 2 * overlap {overlap}") + + num_steps = math.ceil(T / stride) + decoded_audio_list = [] + upsample_factor = None + + for i in range(num_steps): + core_start = i * stride + core_end = min(core_start + stride, T) + win_start = max(0, core_start - overlap) + win_end = min(T, core_end + overlap) + + latent_chunk = latents[:, :, win_start:win_end] + decoder_output = self.vae.decode(latent_chunk) + audio_chunk = decoder_output.sample + + if upsample_factor is None: + upsample_factor = audio_chunk.shape[-1] / latent_chunk.shape[-1] + + added_start = core_start - win_start + trim_start = int(round(added_start * upsample_factor)) + added_end = win_end - core_end + trim_end = int(round(added_end * upsample_factor)) + + audio_len = audio_chunk.shape[-1] + end_idx = audio_len - trim_end if trim_end > 0 else audio_len + decoded_audio_list.append(audio_chunk[:, :, trim_start:end_idx]) + + return torch.cat(decoded_audio_list, dim=-1) + + @staticmethod + def _parse_audio_code_string(code_str: str) -> List[int]: + """ + Extract integer audio codes from prompt tokens like `<|audio_code_123|>`. + + Code values are clamped to valid range `[0, 63999]` (codebook size = 64000). + + Args: + code_str (`str`): String containing audio code tokens. + + Returns: + `List[int]`: List of parsed audio code integers. 
+ """ + if not code_str: + return [] + max_audio_code = 63999 + codes = [] + for x in re.findall(r"<\|audio_code_(\d+)\|>", code_str): + code_value = int(x) + codes.append(max(0, min(code_value, max_audio_code))) + return codes + + def _prepare_reference_audio_latents( + self, + reference_audio: torch.Tensor, + batch_size: int, + device: torch.device, + dtype: torch.dtype, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Process reference audio into acoustic latents for the timbre encoder. + + The reference audio is repeated/cropped to 30 seconds (3 segments of 10 seconds each from front, middle, + and back), encoded through the VAE, and then transposed for the timbre encoder. + + Args: + reference_audio (`torch.Tensor`): Reference audio tensor of shape `[channels, samples]` at 48kHz. + batch_size (`int`): Batch size. + device (`torch.device`): Target device. + dtype (`torch.dtype`): Target dtype. + + Returns: + Tuple of `(refer_audio_acoustic, refer_audio_order_mask)`. + """ + target_frames = 30 * SAMPLE_RATE # 30 seconds + + # Repeat if shorter than 30 seconds + if reference_audio.shape[-1] < target_frames: + repeat_times = math.ceil(target_frames / reference_audio.shape[-1]) + reference_audio = reference_audio.repeat(1, repeat_times) + + # Select 3 segments of 10 seconds each + segment_frames = 10 * SAMPLE_RATE + total_frames = reference_audio.shape[-1] + segment_size = total_frames // 3 + + front_audio = reference_audio[:, :segment_frames] + mid_start = segment_size + middle_audio = reference_audio[:, mid_start : mid_start + segment_frames] + back_start = max(total_frames - segment_frames, 0) + back_audio = reference_audio[:, back_start : back_start + segment_frames] + + reference_audio = torch.cat([front_audio, middle_audio, back_audio], dim=-1) + + # Encode through VAE + with torch.no_grad(): + ref_audio_input = reference_audio.unsqueeze(0).to(device=device, dtype=self.vae.dtype) + ref_latents = self.vae.encode(ref_audio_input).latent_dist.sample() + # [1, D, T] -> [1, T, D] + ref_latents = ref_latents.transpose(1, 2).to(dtype=dtype) + + # Repeat for batch + refer_audio_acoustic = ref_latents.expand(batch_size, -1, -1) + refer_audio_order_mask = torch.arange(batch_size, device=device, dtype=torch.long) + return refer_audio_acoustic, refer_audio_order_mask + + def _prepare_src_audio_and_latents( + self, + src_audio: torch.Tensor, + device: torch.device, + dtype: torch.dtype, + batch_size: int, + ) -> Tuple[torch.Tensor, int]: + """ + Encode source audio to latents and compute the latent length. + + Args: + src_audio (`torch.Tensor`): Source audio tensor of shape `[channels, samples]` at 48kHz. + device (`torch.device`): Target device. + dtype (`torch.dtype`): Target dtype. + batch_size (`int`): Batch size. + + Returns: + Tuple of `(src_latents, latent_length)` where `src_latents` has shape `[batch, T, D]`. + """ + with torch.no_grad(): + src_latent = self._encode_audio_to_latents(src_audio, device=device, dtype=dtype) + # src_latent is [T, D] + latent_length = src_latent.shape[0] + src_latents = src_latent.unsqueeze(0).expand(batch_size, -1, -1) + return src_latents, latent_length + + def _build_chunk_mask( + self, + task_type: str, + latent_length: int, + batch_size: int, + device: torch.device, + dtype: torch.dtype, + acoustic_dim: int, + repainting_start: Optional[float] = None, + repainting_end: Optional[float] = None, + has_src_audio: bool = False, + ) -> torch.Tensor: + """ + Build chunk masks for different task types. 
+ + The chunk mask indicates which latent frames should be generated (1) vs kept from source (0). + + Args: + task_type (`str`): Task type. + latent_length (`int`): Length of the latent sequence. + batch_size (`int`): Batch size. + device (`torch.device`): Target device. + dtype (`torch.dtype`): Target dtype. + acoustic_dim (`int`): Acoustic dimension. + repainting_start (`float`, *optional*): Start time in seconds for repaint region. + repainting_end (`float`, *optional*): End time in seconds for repaint region. + has_src_audio (`bool`, *optional*): Whether source audio was provided. + + Returns: + `torch.Tensor`: Chunk mask of shape `[batch, latent_length, acoustic_dim]`. + """ + if task_type in ("repaint", "lego") and has_src_audio: + # Create a mask where the repaint region is 1 (generate) and outside is 0 (keep) + start_latent = int((repainting_start or 0.0) * SAMPLE_RATE / 1920) + if repainting_end is not None and repainting_end > 0: + end_latent = int(repainting_end * SAMPLE_RATE / 1920) + else: + end_latent = latent_length + + start_latent = max(0, min(start_latent, latent_length - 1)) + end_latent = max(start_latent + 1, min(end_latent, latent_length)) + + mask_1d = torch.zeros(latent_length, device=device, dtype=dtype) + mask_1d[start_latent:end_latent] = 1.0 + chunk_mask = mask_1d.unsqueeze(0).unsqueeze(-1).expand(batch_size, -1, acoustic_dim) + elif task_type == "text2music": + # Full generation + chunk_mask = torch.ones(batch_size, latent_length, acoustic_dim, device=device, dtype=dtype) + else: + # cover, extract, complete, lego without src_audio: full generation + chunk_mask = torch.ones(batch_size, latent_length, acoustic_dim, device=device, dtype=dtype) + + return chunk_mask + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -364,6 +844,27 @@ def __call__( instruction: Optional[str] = None, max_text_length: int = 256, max_lyric_length: int = 2048, + # --- Metadata parameters --- + bpm: Optional[int] = None, + keyscale: Optional[str] = None, + timesignature: Optional[str] = None, + # --- Task parameters --- + task_type: str = "text2music", + track_name: Optional[str] = None, + complete_track_classes: Optional[List[str]] = None, + # --- Audio input parameters --- + src_audio: Optional[torch.Tensor] = None, + reference_audio: Optional[torch.Tensor] = None, + audio_codes: Optional[Union[str, List[str]]] = None, + # --- Repaint/lego parameters --- + repainting_start: Optional[float] = None, + repainting_end: Optional[float] = None, + # --- Advanced generation parameters --- + audio_cover_strength: float = 1.0, + cfg_interval_start: float = 0.0, + cfg_interval_end: float = 1.0, + use_tiled_decode: bool = True, + timesteps: Optional[List[float]] = None, ): r""" The call function to the pipeline for music generation. @@ -380,7 +881,7 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 8): The number of denoising steps. The turbo model is designed for 8 steps. guidance_scale (`float`, *optional*, defaults to 7.0): - Guidance scale for classifier-free guidance. Note: the turbo model may ignore this. + Guidance scale for classifier-free guidance. A value of 1.0 disables CFG. shift (`float`, *optional*, defaults to 3.0): Shift parameter for the timestep schedule (1.0, 2.0, or 3.0). generator (`torch.Generator` or `List[torch.Generator]`, *optional*): @@ -396,11 +897,49 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): Frequency of the callback function. 
instruction (`str`, *optional*): - Custom instruction text for the generation task. + Custom instruction text for the generation task. If not provided, it is auto-generated based on + `task_type`. max_text_length (`int`, *optional*, defaults to 256): Maximum token length for text prompt encoding. max_lyric_length (`int`, *optional*, defaults to 2048): Maximum token length for lyrics encoding. + bpm (`int`, *optional*): + BPM (beats per minute) for music metadata. If `None`, the model estimates it. + keyscale (`str`, *optional*): + Musical key (e.g., `"C major"`, `"A minor"`). If `None`, the model estimates it. + timesignature (`str`, *optional*): + Time signature (e.g., `"4"` for 4/4, `"3"` for 3/4). If `None`, the model estimates it. + task_type (`str`, *optional*, defaults to `"text2music"`): + The generation task type. One of `"text2music"`, `"cover"`, `"repaint"`, `"extract"`, `"lego"`, + `"complete"`. + track_name (`str`, *optional*): + Track name for `"extract"` or `"lego"` tasks (e.g., `"vocals"`, `"drums"`). + complete_track_classes (`List[str]`, *optional*): + Track classes for the `"complete"` task. + src_audio (`torch.Tensor`, *optional*): + Source audio tensor of shape `[channels, samples]` at 48kHz for audio-to-audio tasks (repaint, lego, + cover, extract, complete). The audio is encoded through the VAE to produce source latents. + reference_audio (`torch.Tensor`, *optional*): + Reference audio tensor of shape `[channels, samples]` at 48kHz for timbre conditioning. Used to + extract timbre features for style transfer. + audio_codes (`str` or `List[str]`, *optional*): + Audio semantic codes as strings (e.g., `"<|audio_code_123|><|audio_code_456|>..."`). When provided, + the task is automatically switched to `"cover"` mode. + repainting_start (`float`, *optional*): + Start time in seconds for the repaint region (for `"repaint"` and `"lego"` tasks). + repainting_end (`float`, *optional*): + End time in seconds for the repaint region. Use `-1` or `None` for until end. + audio_cover_strength (`float`, *optional*, defaults to 1.0): + Strength of audio cover blending (0.0 to 1.0). When < 1.0, blends cover-conditioned and + text-only-conditioned outputs. Lower values produce more style transfer effect. + cfg_interval_start (`float`, *optional*, defaults to 0.0): + Start ratio (0.0-1.0) of the timestep range where CFG is applied. + cfg_interval_end (`float`, *optional*, defaults to 1.0): + End ratio (0.0-1.0) of the timestep range where CFG is applied. + use_tiled_decode (`bool`, *optional*, defaults to `True`): + Whether to use tiled decoding for memory-efficient VAE decode. + timesteps (`List[float]`, *optional*): + Custom timestep schedule. If provided, overrides `num_inference_steps` and `shift`. 
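Reviewer note (not part of the patch): how the `repainting_start` / `repainting_end` parameters map onto latent frames inside `_build_chunk_mask`, using the constants from this file (48000 samples per second, 1920 samples per latent frame, i.e. 25 frames per second).

```python
# Sketch only: seconds -> 25 Hz latent frame indices for the repaint window.
repainting_start, repainting_end = 10.0, 20.0
start_latent = int(repainting_start * 48000 / 1920)  # 250
end_latent = int(repainting_end * 48000 / 1920)      # 500
# Frames [250, 500) get mask = 1 (regenerate); all other frames keep the source
# latents (mask = 0).
```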
Examples: @@ -419,6 +958,34 @@ def __call__( device = self._execution_device dtype = self.transformer.dtype + acoustic_dim = self.transformer.config.audio_acoustic_hidden_dim + + # Auto-detect task type from audio_codes + if task_type == "text2music" and audio_codes is not None: + has_codes = False + if isinstance(audio_codes, list): + has_codes = any((c or "").strip() for c in audio_codes) + elif isinstance(audio_codes, str): + has_codes = bool(audio_codes.strip()) + if has_codes: + task_type = "cover" + + # Auto-generate instruction based on task_type if not provided + if instruction is None: + instruction = self._get_task_instruction( + task_type=task_type, + track_name=track_name, + complete_track_classes=complete_track_classes, + ) + + # Determine if src_audio provides the duration + has_src_audio = src_audio is not None + if has_src_audio: + src_audio_duration = src_audio.shape[-1] / SAMPLE_RATE + if audio_duration is None or audio_duration <= 0: + audio_duration = src_audio_duration + if audio_duration is None or audio_duration <= 0: + audio_duration = 60.0 # 1. Encode text prompts and lyrics text_hidden_states, text_attention_mask, lyric_hidden_states, lyric_attention_mask = self.encode_prompt( @@ -428,35 +995,63 @@ def __call__( vocal_language=vocal_language, audio_duration=audio_duration, instruction=instruction, + bpm=bpm, + keyscale=keyscale, + timesignature=timesignature, max_text_length=max_text_length, max_lyric_length=max_lyric_length, ) - # 2. Prepare latents (noise for flow matching, starts at t=1) + # 2. Prepare source latents and latent length latent_length = int(audio_duration * 25) - acoustic_dim = self.transformer.config.audio_acoustic_hidden_dim - latents = self.prepare_latents( - batch_size=batch_size, - audio_duration=audio_duration, - dtype=dtype, - device=device, - generator=generator, - latents=latents, - ) - # 3. Prepare reference audio (silence for basic text2music) - # Use a dummy silence reference for timbre encoder - timbre_fix_frame = 750 # Default from config - refer_audio_acoustic = torch.zeros( - batch_size, - timbre_fix_frame, - self.condition_encoder.config.timbre_hidden_dim, - device=device, - dtype=dtype, - ) - refer_audio_order_mask = torch.arange(batch_size, device=device, dtype=torch.long) + if has_src_audio: + src_latents, src_latent_length = self._prepare_src_audio_and_latents( + src_audio=src_audio, device=device, dtype=dtype, batch_size=batch_size + ) + latent_length = src_latent_length + else: + src_latents = torch.zeros(batch_size, latent_length, acoustic_dim, device=device, dtype=dtype) + + # 3. Handle audio_codes: decode to latents for cover task + if audio_codes is not None: + if isinstance(audio_codes, str): + audio_codes = [audio_codes] * batch_size + # Pad/truncate to batch_size + while len(audio_codes) < batch_size: + audio_codes.append(audio_codes[-1] if audio_codes else "") + + # Check if any codes are actually provided + has_any_codes = any((c or "").strip() for c in audio_codes) + if has_any_codes: + # For cover task with audio codes, we don't use src_audio + # The codes define the target latent structure + code_ids_first = self._parse_audio_code_string(audio_codes[0]) + if code_ids_first: + # Estimate latent length from codes: 5Hz codes -> 25Hz latents (5x upsampling) + code_latent_length = len(code_ids_first) * 5 + latent_length = code_latent_length + # Reset src_latents to silence for the new length + src_latents = torch.zeros(batch_size, latent_length, acoustic_dim, device=device, dtype=dtype) + + # 4. 
Prepare reference audio for timbre encoder + if reference_audio is not None: + refer_audio_acoustic, refer_audio_order_mask = self._prepare_reference_audio_latents( + reference_audio=reference_audio, batch_size=batch_size, device=device, dtype=dtype + ) + else: + # Use silence reference + timbre_fix_frame = 750 + refer_audio_acoustic = torch.zeros( + batch_size, + timbre_fix_frame, + self.condition_encoder.config.timbre_hidden_dim, + device=device, + dtype=dtype, + ) + refer_audio_order_mask = torch.arange(batch_size, device=device, dtype=torch.long) - # 4. Encode conditions + # 5. Encode conditions encoder_hidden_states, encoder_attention_mask = self.condition_encoder( text_hidden_states=text_hidden_states, text_attention_mask=text_attention_mask, @@ -466,42 +1061,169 @@ def __call__( refer_audio_order_mask=refer_audio_order_mask, ) - # 5. Prepare context latents (silence src_latents + chunk_mask for text2music) - src_latents = torch.zeros(batch_size, latent_length, acoustic_dim, device=device, dtype=dtype) - # chunk_mask = 1 means "generate this region" (all 1s for text2music) - chunk_masks = torch.ones(batch_size, latent_length, acoustic_dim, device=device, dtype=dtype) - context_latents = torch.cat([src_latents, chunk_masks], dim=-1) + # For audio_cover_strength < 1.0, also encode a non-cover (text2music) condition + non_cover_encoder_hidden_states = None + if audio_cover_strength < 1.0 and task_type == "cover": + text2music_instruction = TASK_INSTRUCTIONS["text2music"] + nc_text_hs, nc_text_mask, nc_lyric_hs, nc_lyric_mask = self.encode_prompt( + prompt=prompt, + lyrics=lyrics, + device=device, + vocal_language=vocal_language, + audio_duration=audio_duration, + instruction=text2music_instruction, + bpm=bpm, + keyscale=keyscale, + timesignature=timesignature, + max_text_length=max_text_length, + max_lyric_length=max_lyric_length, + ) + non_cover_encoder_hidden_states, _ = self.condition_encoder( + text_hidden_states=nc_text_hs, + text_attention_mask=nc_text_mask, + lyric_hidden_states=nc_lyric_hs, + lyric_attention_mask=nc_lyric_mask, + refer_audio_acoustic_hidden_states_packed=refer_audio_acoustic, + refer_audio_order_mask=refer_audio_order_mask, + ) + + # 6. Build chunk mask and context latents + chunk_mask = self._build_chunk_mask( + task_type=task_type, + latent_length=latent_length, + batch_size=batch_size, + device=device, + dtype=dtype, + acoustic_dim=acoustic_dim, + repainting_start=repainting_start, + repainting_end=repainting_end, + has_src_audio=has_src_audio, + ) - # 6. Get timestep schedule + # For repaint task: replace the repaint region in src_latents with zeros + if task_type in ("repaint",) and has_src_audio: + src_latents = src_latents.clone() + # Where chunk_mask is 1, set src_latents to 0 (silence) + src_latents = src_latents * (1.0 - chunk_mask) + + context_latents = torch.cat([src_latents, chunk_mask], dim=-1) + + # 7. Prepare noise latents + latents = self.prepare_latents( + batch_size=batch_size, + audio_duration=latent_length / 25.0, # Use actual latent length + dtype=dtype, + device=device, + generator=generator, + latents=latents, + ) + + # 8. 
Prepare null condition for CFG (if guidance_scale > 1) + do_cfg = guidance_scale > 1.0 + null_encoder_hidden_states = None + if do_cfg: + # Create null (empty) text condition + null_text_hs, null_text_mask, null_lyric_hs, null_lyric_mask = self.encode_prompt( + prompt=[""] * batch_size, + lyrics=[""] * batch_size, + device=device, + vocal_language=vocal_language, + audio_duration=audio_duration, + instruction=instruction, + bpm=bpm, + keyscale=keyscale, + timesignature=timesignature, + max_text_length=max_text_length, + max_lyric_length=max_lyric_length, + ) + null_encoder_hidden_states, _ = self.condition_encoder( + text_hidden_states=null_text_hs, + text_attention_mask=null_text_mask, + lyric_hidden_states=null_lyric_hs, + lyric_attention_mask=null_lyric_mask, + refer_audio_acoustic_hidden_states_packed=refer_audio_acoustic, + refer_audio_order_mask=refer_audio_order_mask, + ) + + # 9. Get timestep schedule t_schedule = self._get_timestep_schedule( num_inference_steps=num_inference_steps, shift=shift, device=device, dtype=dtype, + timesteps=timesteps, ) num_steps = len(t_schedule) - # 7. Denoising loop (flow matching ODE) + # 10. Denoising loop (flow matching ODE) xt = latents with self.progress_bar(total=num_steps) as progress_bar: for step_idx in range(num_steps): current_timestep = t_schedule[step_idx].item() t_curr_tensor = current_timestep * torch.ones((batch_size,), device=device, dtype=dtype) - # DiT forward pass - model_output = self.transformer( - hidden_states=xt, - timestep=t_curr_tensor, - timestep_r=t_curr_tensor, - encoder_hidden_states=encoder_hidden_states, - context_latents=context_latents, - return_dict=False, - ) - vt = model_output[0] + # Determine if CFG should be applied at this timestep + # cfg_interval maps timestep ratio to [cfg_interval_start, cfg_interval_end] + timestep_ratio = 1.0 - current_timestep # t=1 -> ratio=0, t=0 -> ratio=1 + apply_cfg = do_cfg and (cfg_interval_start <= timestep_ratio <= cfg_interval_end) + + if apply_cfg: + # Conditional forward pass + cond_output = self.transformer( + hidden_states=xt, + timestep=t_curr_tensor, + timestep_r=t_curr_tensor, + encoder_hidden_states=encoder_hidden_states, + context_latents=context_latents, + return_dict=False, + ) + vt_cond = cond_output[0] + + # Unconditional forward pass + uncond_output = self.transformer( + hidden_states=xt, + timestep=t_curr_tensor, + timestep_r=t_curr_tensor, + encoder_hidden_states=null_encoder_hidden_states, + context_latents=context_latents, + return_dict=False, + ) + vt_uncond = uncond_output[0] + + # CFG: v = v_uncond + guidance_scale * (v_cond - v_uncond) + vt = vt_uncond + guidance_scale * (vt_cond - vt_uncond) + else: + # Standard forward pass (no CFG) + model_output = self.transformer( + hidden_states=xt, + timestep=t_curr_tensor, + timestep_r=t_curr_tensor, + encoder_hidden_states=encoder_hidden_states, + context_latents=context_latents, + return_dict=False, + ) + vt = model_output[0] + + # Audio cover strength blending for cover tasks + if ( + audio_cover_strength < 1.0 + and non_cover_encoder_hidden_states is not None + and task_type == "cover" + ): + nc_output = self.transformer( + hidden_states=xt, + timestep=t_curr_tensor, + timestep_r=t_curr_tensor, + encoder_hidden_states=non_cover_encoder_hidden_states, + context_latents=context_latents, + return_dict=False, + ) + vt_nc = nc_output[0] + # Blend: strength * cover_vt + (1 - strength) * text2music_vt + vt = audio_cover_strength * vt + (1.0 - audio_cover_strength) * vt_nc # On final step, directly compute x0 if 
step_idx == num_steps - 1: - # x0 = xt - vt * t xt = xt - vt * t_curr_tensor.unsqueeze(-1).unsqueeze(-1) progress_bar.update() break @@ -517,7 +1239,7 @@ def __call__( if callback is not None and step_idx % callback_steps == 0: callback(step_idx, t_curr_tensor, xt) - # 8. Post-processing: decode latents to audio + # 11. Post-processing: decode latents to audio if output_type == "latent": if not return_dict: return (xt,) @@ -526,7 +1248,18 @@ def __call__( # Decode latents to audio waveform using VAE # VAE expects [B, C, T] format, our latents are [B, T, C] audio_latents = xt.transpose(1, 2) # [B, T, C] -> [B, C, T] - audio = self.vae.decode(audio_latents).sample + + if use_tiled_decode: + audio = self._tiled_decode(audio_latents) + else: + audio = self.vae.decode(audio_latents).sample + + # Anti-clipping normalization (from handler.py) + if audio.dtype != torch.float32: + audio = audio.float() + std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0 + std[std < 1.0] = 1.0 + audio = audio / std if output_type == "np": audio = audio.cpu().float().numpy()
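Reviewer note (not part of the patch): the anti-clipping normalization applied above, shown in isolation. Waveforms whose standard deviation times 5 exceeds 1.0 are scaled down; quieter waveforms are left untouched. The tensor shape here is a hypothetical stand-in for the decoded `[batch, channels, samples]` output.

```python
import torch

audio = torch.randn(1, 2, 48000)  # hypothetical decoded waveform, [batch, channels, samples]
scale = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
scale[scale < 1.0] = 1.0  # never amplify quiet audio
audio = audio / scale
```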