IFMedTechdemo committed on
Commit da048ad · verified · 1 Parent(s): c0fa0f9

Update app.py

Files changed (1)
  1. app.py +83 -17
app.py CHANGED
@@ -1,4 +1,3 @@
-
 import os
 import time
 import torch
@@ -18,13 +17,18 @@ from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     TextIteratorStreamer
 )
+from huggingface_hub import snapshot_download
 from qwen_vl_utils import process_vision_info


+
+
 # Suppress the warning about uninitialized weights
 warnings.filterwarnings('ignore', message='Some weights.*were not initialized')


+
+
 # Try importing Qwen3VL if available
 try:
     from transformers import Qwen3VLForConditionalGeneration
@@ -32,18 +36,27 @@ except ImportError:
     Qwen3VLForConditionalGeneration = None


+
+
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+CACHE_DIR = os.getenv("HF_CACHE_DIR", "./models")
+
+


 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


+
+
 print(f"Initial Device: {device}")
 print(f"CUDA Available: {torch.cuda.is_available()}")


+
+
 # Load Chandra-OCR
 try:
     MODEL_ID_V = "datalab-to/chandra"
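Note: CACHE_DIR is new in this commit and feeds the Dots.OCR download below. MAX_INPUT_TOKEN_LENGTH, by contrast, is defined but never applied in the hunks shown; a guard along these lines could enforce it before generation (a hypothetical helper, not part of this commit):

```python
# Hypothetical guard, not part of this commit: keep only the last
# MAX_INPUT_TOKEN_LENGTH tokens of an over-long prompt before generate().
def trim_inputs(inputs, max_len=MAX_INPUT_TOKEN_LENGTH):
    if inputs["input_ids"].shape[-1] > max_len:
        inputs["input_ids"] = inputs["input_ids"][:, -max_len:]
        inputs["attention_mask"] = inputs["attention_mask"][:, -max_len:]
    return inputs
```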
@@ -52,7 +65,8 @@ try:
         model_v = Qwen3VLForConditionalGeneration.from_pretrained(
             MODEL_ID_V,
             trust_remote_code=True,
-            torch_dtype=torch.float16
+            torch_dtype=torch.float16,
+            device_map="auto"
         ).eval()
         print("✓ Chandra-OCR loaded")
     else:
@@ -64,6 +78,8 @@ except Exception as e:
     print(f"✗ Chandra-OCR: Failed to load - {str(e)}")


+
+
 # Load Nanonets-OCR2-3B
 try:
     MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
@@ -71,7 +87,8 @@ try:
     model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_ID_X,
         trust_remote_code=True,
-        torch_dtype=torch.float16
+        torch_dtype=torch.float16,
+        device_map="auto"
     ).eval()
     print("✓ Nanonets-OCR2-3B loaded")
 except Exception as e:
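Note: every loader in this commit swaps a bare torch_dtype=torch.float16 for the torch_dtype + device_map="auto" pair. device_map="auto" requires the accelerate package and lets from_pretrained() place (and, if necessary, shard) the fp16 weights across the available GPU/CPU at load time, which is why the explicit .to(device) calls disappear later in the diff. A minimal standalone sketch of the pattern, reusing the Nanonets checkpoint from above:

```python
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID = "nanonets/Nanonets-OCR2-3B"

# device_map="auto" needs `pip install accelerate`; weights are placed on
# the available device(s) inside from_pretrained(), so no later .to() call.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
).eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
```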
@@ -80,14 +97,31 @@ except Exception as e:
     print(f"✗ Nanonets-OCR2-3B: Failed to load - {str(e)}")


-# Load Dots.OCR - will be moved to GPU when needed
+
+
+# Load Dots.OCR - UPDATED with snapshot_download and device_map="auto"
 try:
-    MODEL_PATH_D = "strangervisionhf/dots.ocr-base-fix"
-    processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
+    MODEL_ID_D = "rednote-hilab/dots.ocr"
+    model_path_d = os.path.join(CACHE_DIR, "dots-ocr-local")
+
+    # Download and cache model locally
+    snapshot_download(
+        repo_id=MODEL_ID_D,
+        local_dir=model_path_d,
+        local_dir_use_symlinks=False,  # Avoid symlink issues on HF Spaces
+        allow_patterns=["*.json", "*.bin", "*.safetensors", "*.txt"]
+    )
+
+    processor_d = AutoProcessor.from_pretrained(
+        model_path_d,
+        trust_remote_code=True
+    )
+
     model_d = AutoModelForCausalLM.from_pretrained(
-        MODEL_PATH_D,
+        model_path_d,
         attn_implementation="flash_attention_2",
         torch_dtype=torch.bfloat16,
+        device_map="auto",  # Better memory management
         trust_remote_code=True
     ).eval()
     print("✓ Dots.OCR loaded")
@@ -95,6 +129,10 @@ except Exception as e:
     model_d = None
     processor_d = None
     print(f"✗ Dots.OCR: Failed to load - {str(e)}")
+    import traceback
+    traceback.print_exc()
+
+


 # Load olmOCR-2-7B-1025
@@ -104,7 +142,8 @@ try:
     model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_ID_M,
         trust_remote_code=True,
-        torch_dtype=torch.float16
+        torch_dtype=torch.float16,
+        device_map="auto"
     ).eval()
     print("✓ olmOCR-2-7B-1025 loaded")
 except Exception as e:
@@ -113,6 +152,8 @@ except Exception as e:
     print(f"✗ olmOCR-2-7B-1025: Failed to load - {str(e)}")


+
+
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int, temperature: float, top_p: float,
@@ -120,10 +161,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     """
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
-
     This function is decorated with @spaces.GPU to ensure it runs on GPU
     when available in Hugging Face Spaces.
-
     Args:
         model_name: Name of the OCR model to use
         text: Prompt text for the model
@@ -133,48 +172,52 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         top_p: Nucleus sampling parameter
         top_k: Top-k sampling parameter
         repetition_penalty: Penalty for repeating tokens
-
     Yields:
         tuple: (raw_text, markdown_text)
     """
     # Device will be cuda when @spaces.GPU decorator activates
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+
     # Select model and processor based on model_name
     if model_name == "olmOCR-2-7B-1025":
         if model_m is None:
             yield "olmOCR-2-7B-1025 is not available.", "olmOCR-2-7B-1025 is not available."
             return
         processor = processor_m
-        model = model_m.to(device)
+        model = model_m
     elif model_name == "Nanonets-OCR2-3B":
         if model_x is None:
             yield "Nanonets-OCR2-3B is not available.", "Nanonets-OCR2-3B is not available."
             return
         processor = processor_x
-        model = model_x.to(device)
+        model = model_x
     elif model_name == "Chandra-OCR":
         if model_v is None:
             yield "Chandra-OCR is not available.", "Chandra-OCR is not available."
             return
         processor = processor_v
-        model = model_v.to(device)
+        model = model_v
     elif model_name == "Dots.OCR":
         if model_d is None:
             yield "Dots.OCR is not available.", "Dots.OCR is not available."
             return
         processor = processor_d
-        model = model_d.to(device)
+        model = model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return


+
+
     if image is None:
         yield "Please upload an image.", "Please upload an image."
         return


+
+
     try:
         # Prepare messages in chat format
         messages = [{
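Note: the branches above now bind the module-level models directly instead of calling .to(device). This is the required counterpart of device_map="auto": an Accelerate-dispatched model is already placed (possibly across several devices), and recent transformers versions raise an error if it is moved again with .to(). The same dispatch could be written as a lookup table; a sketch assuming the (model, processor) globals from the loading section (the helper itself is illustrative, not part of this commit):

```python
# Illustrative helper, not part of this commit.
MODEL_REGISTRY = {
    "olmOCR-2-7B-1025": (model_m, processor_m),
    "Nanonets-OCR2-3B": (model_x, processor_x),
    "Chandra-OCR":      (model_v, processor_v),
    "Dots.OCR":         (model_d, processor_d),
}

def resolve_model(name):
    model, processor = MODEL_REGISTRY.get(name, (None, None))
    if model is None:
        return None, None  # caller yields the "... is not available." message
    # No .to(device): device_map="auto" already placed the weights.
    return model, processor
```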
@@ -185,6 +228,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             ]
         }]

+
         # Apply chat template with fallback
         try:
             prompt_full = processor.apply_chat_template(
@@ -198,6 +242,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             prompt_full = f"{text}"


+
+
         # Process inputs
         inputs = processor(
             text=[prompt_full],
@@ -207,6 +253,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         ).to(device)


+
+
         # Setup streaming generation
         streamer = TextIteratorStreamer(
             processor.tokenizer if hasattr(processor, 'tokenizer') else processor,
@@ -214,6 +262,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             skip_special_tokens=True
         )

+
         generation_kwargs = {
             **inputs,
             "streamer": streamer,
@@ -225,10 +274,12 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             "repetition_penalty": repetition_penalty,
         }

+
         # Start generation in separate thread
         thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()

+
         # Stream the results
         buffer = ""
         for new_text in streamer:
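Note: generation itself is the standard transformers streaming pattern: model.generate() blocks, so it runs on a worker Thread and feeds a TextIteratorStreamer that the function body drains as an iterator, yielding partial output to Gradio as it arrives. Condensed to its essentials (assuming model, processor, and inputs as prepared above; skip_prompt keeps the echoed prompt out of the stream):

```python
from threading import Thread
from transformers import TextIteratorStreamer

tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() runs in the worker thread; iterating the streamer in the main
# thread yields decoded text chunks until generation finishes.
thread = Thread(target=model.generate,
                kwargs={**inputs, "streamer": streamer, "max_new_tokens": 256})
thread.start()

buffer = ""
for piece in streamer:
    buffer += piece   # yield buffer to the UI here
thread.join()
```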
@@ -237,9 +288,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             time.sleep(0.01)
             yield buffer, buffer

+
         # Ensure thread completes
         thread.join()

+
     except Exception as e:
         error_msg = f"Error during generation: {str(e)}"
         print(f"Full error: {e}")
@@ -248,10 +301,13 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield error_msg, error_msg


+
+
 # Example usage for Gradio interface
 if __name__ == "__main__":
     import gradio as gr

+
     # Determine available models
     available_models = []
     if model_m is not None:
@@ -267,16 +323,20 @@ if __name__ == "__main__":
         available_models.append("Dots.OCR")
         print(" Added: Dots.OCR")

+
     if not available_models:
         print("ERROR: No models were loaded successfully!")
         exit(1)

+
     print(f"\n✓ Available models for dropdown: {', '.join(available_models)}")

+
     with gr.Blocks(title="Multi-Model OCR") as demo:
         gr.Markdown("# 🔍 Multi-Model OCR Application")
         gr.Markdown("Upload an image and select a model to extract text. Models run on GPU via Hugging Face Spaces.")

+
         with gr.Row():
             with gr.Column():
                 model_selector = gr.Dropdown(
@@ -291,6 +351,7 @@ if __name__ == "__main__":
                     lines=2
                 )

+
                 with gr.Accordion("Advanced Settings", open=False):
                     max_tokens = gr.Slider(
                         minimum=1,
@@ -328,20 +389,24 @@ if __name__ == "__main__":
                         label="Repetition Penalty"
                     )

+
                 submit_btn = gr.Button("Extract Text", variant="primary")

+
             with gr.Column():
                 output_text = gr.Textbox(label="Extracted Text", lines=20)
                 output_markdown = gr.Markdown(label="Formatted Output")

+
         gr.Markdown("""
         ### Available Models:
         - **olmOCR-2-7B-1025**: Allen AI's OCR model
         - **Nanonets-OCR2-3B**: Nanonets OCR model
         - **Chandra-OCR**: Datalab OCR model
-        - **Dots.OCR**: Stranger Vision OCR model
+        - **Dots.OCR**: Stranger Vision OCR model (Updated)
         """)

+
         submit_btn.click(
             fn=generate_image,
             inputs=[
@@ -357,5 +422,6 @@ if __name__ == "__main__":
             outputs=[output_text, output_markdown]
         )

+
     # Launch with share=True for Hugging Face Spaces
-    demo.launch(share=True)
+    demo.launch(share=True)
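Note: generate_image is a generator, and Gradio streams a generator callback automatically: each yielded (raw, markdown) pair updates the two bound outputs, so no extra streaming setup is needed beyond yielding. A minimal self-contained version of the wiring (hypothetical demo, not from this commit); also, on Hugging Face Spaces share=True is generally unnecessary, since Spaces already serves a public URL, and recent Gradio versions ignore it there with a warning:

```python
import gradio as gr

def stream_demo(prompt):
    # Generator callback: every yield pushes an update to both outputs.
    buffer = ""
    for word in prompt.split():
        buffer += word + " "
        yield buffer, buffer

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Prompt")
    out_text = gr.Textbox(label="Raw")
    out_md = gr.Markdown(label="Formatted")
    gr.Button("Go").click(fn=stream_demo, inputs=[inp], outputs=[out_text, out_md])

demo.launch()
```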
 
 