xieli committed on
Commit 0dc7005 · 1 Parent(s): d94f450

feat: support int4/int8 quantization when loading

Files changed (5)
  1. README.md +1 -0
  2. app.py +50 -11
  3. model_loader.py +159 -30
  4. requirements.txt +1 -0
  5. tts.py +13 -3
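
Based on the --quantization, --torch-dtype, and --device-map flags added to app.py in this diff, quantized loading would be enabled at launch roughly like this (other arguments omitted; the values are illustrative, not the only valid ones):

python app.py --quantization int4
python app.py --quantization int8 --device-map cuda
python app.py --torch-dtype float16    # full-precision path, no quantization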
README.md CHANGED
@@ -11,3 +11,4 @@ short_description: Try out Step-Audio-EditX
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
app.py CHANGED
@@ -81,7 +81,10 @@ def initialize_models():
         os.path.join(args_global.model_path, "Step-Audio-EditX"),
         encoder,
         model_source=model_source,
-        tts_model_id=args_global.tts_model_id
+        tts_model_id=args_global.tts_model_id,
+        quantization_config=args_global.quantization,
+        torch_dtype=torch_dtype,
+        device_map=args_global.device_map,
     )
     logger.info("✓ StepCommonAudioTTS loaded")
     print("Models initialized inside GPU context.")
@@ -477,26 +480,62 @@ if __name__ == "__main__":
         default=None,
         help="TTS model ID for online loading (if different from model-path)"
     )
+    parser.add_argument(
+        "--quantization",
+        type=str,
+        default=None,
+        choices=["int4", "int8"],
+        help="Enable quantization for the TTS model to reduce memory usage. "
+             "Choices: int4 (online), int8 (online). "
+             "When quantization is enabled, data types are handled automatically by the quantization library."
+    )
+    parser.add_argument(
+        "--torch-dtype",
+        type=str,
+        default="bfloat16",
+        choices=["float16", "bfloat16", "float32"],
+        help="PyTorch data type for model operations. This setting only applies when quantization is disabled. "
+             "When quantization is enabled, data types are managed automatically."
+    )
+    parser.add_argument(
+        "--device-map",
+        type=str,
+        default="cuda",
+        help="Device mapping for model loading (default: cuda)"
+    )
 
     args = parser.parse_args()
 
     # Store args globally for model configuration
     args_global = args
-
     logger.info(f"Configuration loaded:")
-    logger.info(f"Model source: {args.model_source}")
+
+    # Map string arguments to actual types
+    source_mapping = {
+        "auto": ModelSource.AUTO,
+        "local": ModelSource.LOCAL,
+        "modelscope": ModelSource.MODELSCOPE,
+        "huggingface": ModelSource.HUGGINGFACE
+    }
+    model_source = source_mapping[args.model_source]
+
+    # Map torch dtype string to actual torch dtype
+    dtype_mapping = {
+        "float16": torch.float16,
+        "bfloat16": torch.bfloat16,
+        "float32": torch.float32
+    }
+    torch_dtype = dtype_mapping[args.torch_dtype]
+
+    logger.info(f"Loading models with source: {args.model_source}")
     logger.info(f"Model path: {args.model_path}")
     logger.info(f"Tokenizer model ID: {args.tokenizer_model_id}")
+    logger.info(f"Torch dtype: {args.torch_dtype}")
+    logger.info(f"Device map: {args.device_map}")
     if args.tts_model_id:
         logger.info(f"TTS model ID: {args.tts_model_id}")
-
-    # Models will be initialized on first GPU call to avoid ZeroGPU main process errors
-
-    if ZEROGPU_AVAILABLE:
-        logger.info("🎉 ZeroGPU detected - using dynamic GPU duration management!")
-        logger.info("💡 First call: 300s (model loading), subsequent calls: 120s (inference only)")
-    else:
-        logger.info("💻 Running in local mode - models will be loaded on first call")
+    if args.quantization:
+        logger.info(f"🔧 {args.quantization.upper()} quantization enabled")
 
     # Create EditxTab instance
     editx_tab = EditxTab(args)
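
The point of these flags is to cut weight memory at load time. As a rough, illustrative estimate (weights only, ignoring activations, the KV cache, and quantization overhead such as NF4 scales), bfloat16 stores 2 bytes per parameter, int8 about 1 byte, and 4-bit NF4 about 0.5 bytes; the parameter count below is a hypothetical placeholder, not the actual size of Step-Audio-EditX:

# Back-of-the-envelope weight-memory estimate (illustrative only)
params = 3e9  # hypothetical parameter count
for name, bytes_per_param in {"bfloat16": 2, "int8": 1, "int4 (nf4)": 0.5}.items():
    print(f"{name}: ~{params * bytes_per_param / 1e9:.1f} GB")
# bfloat16: ~6.0 GB, int8: ~3.0 GB, int4 (nf4): ~1.5 GB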
model_loader.py CHANGED
@@ -1,17 +1,14 @@
 """
 Unified model loading utility supporting ModelScope, HuggingFace and local path loading
 """
-import importlib
 import os
 import logging
-from pathlib import Path
-import sys
 import threading
-from typing import Union, Optional, Dict, Any
-import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from typing import Optional, Dict, Any, Tuple
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from awq import AutoAWQForCausalLM
 from funasr_detach import AutoModel
-from transformers.models.auto import tokenization_auto, configuration_auto
 
 # Global cache for downloaded models to avoid repeated downloads
 # Key: (model_path, source)
@@ -104,19 +101,71 @@ class UnifiedModelLoader:
         modelscope_patterns = []
         return any(pattern in model_path for pattern in modelscope_patterns)
 
+    def _prepare_quantization_config(self, quantization_config: Optional[str], torch_dtype: Optional[torch.dtype] = None) -> Tuple[Dict[str, Any], bool]:
+        """
+        Prepare quantization configuration for model loading
+
+        Args:
+            quantization_config: Quantization type ('int4', 'int8', 'int4_offline_awq', or None)
+            torch_dtype: PyTorch data type for compute operations
+
+        Returns:
+            Tuple of (quantization parameters dict, should_set_torch_dtype)
+        """
+        if not quantization_config:
+            return {}, True
+
+        quantization_config = quantization_config.lower()
+
+        if quantization_config == "int4_offline_awq":
+            # For pre-quantized AWQ models, no additional quantization needed
+            self.logger.info("🔧 Loading pre-quantized AWQ 4-bit model (offline)")
+            return {}, True  # Load pre-quantized model normally, allow torch_dtype setting
+
+        elif quantization_config == "int8":
+            # Use user-specified torch_dtype for compute, default to bfloat16
+            compute_dtype = torch_dtype if torch_dtype is not None else torch.bfloat16
+            self.logger.info(f"🔧 INT8 quantization: using {compute_dtype} for compute operations")
+
+            bnb_config = BitsAndBytesConfig(
+                load_in_8bit=True,
+                bnb_8bit_compute_dtype=compute_dtype,
+            )
+            return {
+                "quantization_config": bnb_config
+            }, False  # INT8 quantization handles data types automatically, don't set torch_dtype
+        elif quantization_config == "int4":
+            # Use user-specified torch_dtype for compute, default to bfloat16
+            compute_dtype = torch_dtype if torch_dtype is not None else torch.bfloat16
+            self.logger.info(f"🔧 INT4 quantization: using {compute_dtype} for compute operations")
+
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=compute_dtype,
+                bnb_4bit_use_double_quant=True,
+            )
+            return {
+                "quantization_config": bnb_config
+            }, False  # INT4 quantization handles torch_dtype internally, don't set it again
+        else:
+            raise ValueError(f"Unsupported quantization config: {quantization_config}. Supported: 'int4', 'int8', 'int4_offline_awq'")
+
     def load_transformers_model(
         self,
         model_path: str,
         source: str = ModelSource.AUTO,
+        quantization_config: Optional[str] = None,
         **kwargs
-    ) -> tuple:
+    ) -> Tuple:
        """
        Load Transformers model (for StepAudioTTS)

        Args:
            model_path: Model path or ID
            source: Model source, auto means auto-detect
-            **kwargs: Other parameters
+            quantization_config: Quantization configuration ('int4', 'int8', 'int4_offline_awq', or None for no quantization)
+            **kwargs: Other parameters (torch_dtype, device_map, etc.)

        Returns:
            (model, tokenizer) tuple
@@ -125,17 +174,47 @@ class UnifiedModelLoader:
             source = self.detect_model_source(model_path)
 
         self.logger.info(f"Loading Transformers model from {source}: {model_path}")
+        if quantization_config:
+            self.logger.info(f"🔧 {quantization_config.upper()} quantization enabled")
+
+        # Prepare quantization configuration
+        quantization_kwargs, should_set_torch_dtype = self._prepare_quantization_config(quantization_config, kwargs.get("torch_dtype"))
 
         try:
             if source == ModelSource.LOCAL:
                 # Local loading
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    torch_dtype=kwargs.get("torch_dtype"),
-                    device_map=kwargs.get("device_map", "auto"),
-                    trust_remote_code=True,
-                    local_files_only=True
-                )
+                load_kwargs = {
+                    "device_map": kwargs.get("device_map", "auto"),
+                    "trust_remote_code": True,
+                    "local_files_only": True
+                }
+
+                # Add quantization configuration if specified
+                load_kwargs.update(quantization_kwargs)
+
+                # Add torch_dtype based on quantization requirements
+                if should_set_torch_dtype and kwargs.get("torch_dtype") is not None:
+                    load_kwargs["torch_dtype"] = kwargs.get("torch_dtype")
+
+                # Check if using AWQ quantization
+                if quantization_config and quantization_config.lower() == "int4_offline_awq":
+                    # Use AWQ loading for pre-quantized AWQ models
+                    awq_model_path = os.path.join(model_path, "awq_quantized")
+                    if not os.path.exists(awq_model_path):
+                        raise FileNotFoundError(f"AWQ quantized model not found at {awq_model_path}. Please run quantize_model_offline.py first.")
+
+                    self.logger.info(f"🔧 Loading AWQ quantized model from: {awq_model_path}")
+                    model = AutoAWQForCausalLM.from_quantized(
+                        awq_model_path,
+                        device_map=kwargs.get("device_map", "auto"),
+                        trust_remote_code=True
+                    )
+                else:
+                    # Standard loading
+                    model = AutoModelForCausalLM.from_pretrained(
+                        model_path,
+                        **load_kwargs
+                    )
                 tokenizer = AutoTokenizer.from_pretrained(
                     model_path,
                     trust_remote_code=True,
@@ -148,13 +227,38 @@ class UnifiedModelLoader:
                 from modelscope import AutoTokenizer as MSAutoTokenizer
                 model_path = self._cached_snapshot_download(model_path, ModelSource.MODELSCOPE)
 
-                model = MSAutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    torch_dtype=kwargs.get("torch_dtype"),
-                    device_map=kwargs.get("device_map", "auto"),
-                    trust_remote_code=True,
-                    local_files_only=True
-                )
+                load_kwargs = {
+                    "device_map": kwargs.get("device_map", "auto"),
+                    "trust_remote_code": True,
+                    "local_files_only": True
+                }
+
+                # Add quantization configuration if specified
+                load_kwargs.update(quantization_kwargs)
+
+                # Add torch_dtype based on quantization requirements
+                if should_set_torch_dtype and kwargs.get("torch_dtype") is not None:
+                    load_kwargs["torch_dtype"] = kwargs.get("torch_dtype")
+
+                # Check if using AWQ quantization
+                if quantization_config and quantization_config.lower() == "int4_offline_awq":
+                    # Use AWQ loading for pre-quantized AWQ models
+                    awq_model_path = os.path.join(model_path, "awq_quantized")
+                    if not os.path.exists(awq_model_path):
+                        raise FileNotFoundError(f"AWQ quantized model not found at {awq_model_path}. Please run quantize_model_offline.py first.")
+
+                    self.logger.info(f"🔧 Loading AWQ quantized model from: {awq_model_path}")
+                    model = AutoAWQForCausalLM.from_quantized(
+                        awq_model_path,
+                        device_map=kwargs.get("device_map", "auto"),
+                        trust_remote_code=True
+                    )
+                else:
+                    # Standard loading
+                    model = MSAutoModelForCausalLM.from_pretrained(
+                        model_path,
+                        **load_kwargs
+                    )
                 tokenizer = MSAutoTokenizer.from_pretrained(
                     model_path,
                     trust_remote_code=True,
@@ -165,13 +269,38 @@ class UnifiedModelLoader:
                 model_path = self._cached_snapshot_download(model_path, ModelSource.HUGGINGFACE)
 
                 # Load from HuggingFace
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    torch_dtype=kwargs.get("torch_dtype"),
-                    device_map=kwargs.get("device_map", "auto"),
-                    trust_remote_code=True,
-                    local_files_only=True
-                )
+                load_kwargs = {
+                    "device_map": kwargs.get("device_map", "auto"),
+                    "trust_remote_code": True,
+                    "local_files_only": True
+                }
+
+                # Add quantization configuration if specified
+                load_kwargs.update(quantization_kwargs)
+
+                # Add torch_dtype based on quantization requirements
+                if should_set_torch_dtype and kwargs.get("torch_dtype") is not None:
+                    load_kwargs["torch_dtype"] = kwargs.get("torch_dtype")
+
+                # Check if using AWQ quantization
+                if quantization_config and quantization_config.lower() == "int4_offline_awq":
+                    # Use AWQ loading for pre-quantized AWQ models
+                    awq_model_path = os.path.join(model_path, "awq_quantized")
+                    if not os.path.exists(awq_model_path):
+                        raise FileNotFoundError(f"AWQ quantized model not found at {awq_model_path}. Please run quantize_model_offline.py first.")
+
+                    self.logger.info(f"🔧 Loading AWQ quantized model from: {awq_model_path}")
+                    model = AutoAWQForCausalLM.from_quantized(
+                        awq_model_path,
+                        device_map=kwargs.get("device_map", "auto"),
+                        trust_remote_code=True
+                    )
+                else:
+                    # Standard loading
+                    model = AutoModelForCausalLM.from_pretrained(
+                        model_path,
+                        **load_kwargs
+                    )
                 tokenizer = AutoTokenizer.from_pretrained(
                     model_path,
                     trust_remote_code=True,
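
A minimal usage sketch of the extended loader, assuming a UnifiedModelLoader instance named model_loader and the repo's ModelSource enum as used in tts.py; the model ID is a placeholder, and the three-value unpacking mirrors how tts.py consumes the return value:

import torch

# Hypothetical call: on-the-fly NF4 int4 quantization via bitsandbytes
model, tokenizer, resolved_path = model_loader.load_transformers_model(
    "<tts-model-id-or-path>",        # placeholder
    source=ModelSource.AUTO,
    quantization_config="int4",      # "int4", "int8", "int4_offline_awq", or None
    torch_dtype=torch.bfloat16,      # used as the 4-bit compute dtype
    device_map="cuda",
)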
requirements.txt CHANGED
@@ -22,3 +22,4 @@ gradio>=5.16.0
 nvidia-cuda-nvrtc-cu12==12.8.93
 spaces==0.42.1
 matplotlib==3.10.7
+autoawq==0.2.8
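
The new autoawq pin backs the top-level "from awq import AutoAWQForCausalLM" import in model_loader.py; it is only exercised on the int4_offline_awq path, but the package must be installed for the module to import at all. With pip:

pip install autoawq==0.2.8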
tts.py CHANGED
@@ -60,7 +60,10 @@ class StepAudioTTS:
         model_path,
         audio_tokenizer,
         model_source=ModelSource.AUTO,
-        tts_model_id=None
+        tts_model_id=None,
+        quantization_config=None,
+        torch_dtype=torch.bfloat16,
+        device_map="cuda"
     ):
         """
         Initialize StepAudioTTS
@@ -70,6 +73,9 @@ class StepAudioTTS:
             audio_tokenizer: Audio tokenizer for wav2token processing
             model_source: Model source (auto/local/modelscope/huggingface)
             tts_model_id: TTS model ID, if None use model_path
+            quantization_config: Quantization configuration ('int4', 'int8', or None)
+            torch_dtype: PyTorch data type for model weights (default: torch.bfloat16)
+            device_map: Device mapping for model (default: "cuda")
         """
         # Determine model ID or path to load
         if tts_model_id is None:
@@ -87,8 +93,9 @@ class StepAudioTTS:
             self.llm, self.tokenizer, model_path = model_loader.load_transformers_model(
                 tts_model_id,
                 source=model_source,
-                torch_dtype=torch.bfloat16,
-                device_map="cuda"
+                quantization_config=quantization_config,
+                torch_dtype=torch_dtype,
+                device_map=device_map
             )
             logger.info(f"✅ Successfully loaded LLM and tokenizer: {tts_model_id}")
         except Exception as e:
@@ -100,6 +107,9 @@ class StepAudioTTS:
             os.path.join(model_path, "CosyVoice-300M-25Hz")
         )
 
+        # Log that all TTS sub-models have finished loading
+        logger.info("🎤 CosyVoice model loaded successfully")
+
         # Use system prompts from config module
         self.edit_clone_sys_prompt_tpl = AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL
         self.edit_sys_prompt = AUDIO_EDIT_SYSTEM_PROMPT
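
A construction sketch using the extended StepAudioTTS signature; the path is a placeholder and audio_tokenizer stands for the encoder built elsewhere in app.py:

import torch

# Hypothetical instantiation with int8 quantization of the underlying LLM
tts = StepAudioTTS(
    "<path-to-Step-Audio-EditX>",   # placeholder model path
    audio_tokenizer,                # wav2token audio tokenizer from the app
    model_source=ModelSource.AUTO,
    quantization_config="int8",     # "int4", "int8", or None
    torch_dtype=torch.bfloat16,     # compute dtype when quantized, weight dtype otherwise
    device_map="cuda",
)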