Spaces:

Souvik3333
/

Nanonets-ocr-s

Runtime error

App Files Files Community

Souvik3333 committed on Jun 13

Commit

50060d5

verified ·

1 Parent(s): c36e3a0

Create app.py

Browse files

Files changed (1) hide show

app.py +173 -0

app.py ADDED Viewed

	@@ -0,0 +1,173 @@

+import gradio as gr
+from PIL import Image
+from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
+import torch
+import spaces
+model_path = "nanonets/Nanonets-OCR-s"
+# Load the model once at import time so every request reuses the same weights.
+print("Loading Nanonets OCR model...")
+try:
+    # Prefer FlashAttention-2; it depends on the optional `flash-attn` package.
+    model = AutoModelForImageTextToText.from_pretrained(
+        model_path,
+        torch_dtype="auto",
+        device_map="auto",
+        attn_implementation="flash_attention_2",
+    )
+except (ImportError, ValueError) as flash_attn_error:
+    # NOTE(review): transformers is expected to raise ImportError when
+    # flash-attn is missing and ValueError when it cannot be used (e.g. no
+    # CUDA) - confirm against the pinned transformers version. Fall back to
+    # PyTorch SDPA instead of crashing at startup; an unrelated failure will
+    # simply be raised again by the retry below.
+    print(
+        f"FlashAttention-2 unavailable ({flash_attn_error}); "
+        "falling back to SDPA attention."
+    )
+    model = AutoModelForImageTextToText.from_pretrained(
+        model_path,
+        torch_dtype="auto",
+        device_map="auto",
+        attn_implementation="sdpa",
+    )
+# Inference only: switch off dropout and other train-time behaviour.
+model.eval()
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+processor = AutoProcessor.from_pretrained(model_path)
+print("Model loaded successfully!")
+def process_tags(content: str) -> str:
+    """Escape the model's semantic tags so they are displayed as literal text.
+
+    The opening and closing forms of ``img``, ``watermark``, ``page_number``
+    and ``signature`` have their angle brackets replaced with ``&lt;`` and
+    ``&gt;``. All other text, including other tags, is returned unchanged.
+    """
+    for tag_name in ("img", "watermark", "page_number", "signature"):
+        for slash in ("", "/"):
+            literal = "<" + slash + tag_name + ">"
+            escaped = "&lt;" + slash + tag_name + "&gt;"
+            content = content.replace(literal, escaped)
+    return content
+@spaces.GPU()
+def ocr_image_gradio(image, max_tokens=4096):
+    """Run the Nanonets OCR model on one image for the Gradio interface.
+
+    Args:
+        image: A PIL image, or any object accepted by ``Image.fromarray``.
+            ``None`` means nothing has been uploaded.
+        max_tokens: Upper bound on the number of newly generated tokens.
+            Coerced to ``int`` because a UI slider may deliver a float.
+
+    Returns:
+        The decoded model output after ``process_tags`` has escaped the
+        semantic tags, or a human-readable message when no image was given
+        or processing failed.
+    """
+    if image is None:
+        return "Please upload an image."
+    try:
+        prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
+        # Accept non-PIL input as well by converting it to a PIL image.
+        if not isinstance(image, Image.Image):
+            image = Image.fromarray(image)
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ]},
+        ]
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
+        inputs = inputs.to(model.device)
+        with torch.no_grad():
+            output_ids = model.generate(**inputs, max_new_tokens=int(max_tokens), do_sample=False)
+        # Keep only the tokens that follow each prompt: slice off the first
+        # len(prompt) ids from every generated sequence.
+        completion_ids = [
+            sequence_ids[len(prompt_ids):]
+            for prompt_ids, sequence_ids in zip(inputs.input_ids, output_ids)
+        ]
+        decoded = processor.batch_decode(completion_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+        return process_tags(decoded[0])
+    except Exception as e:
+        # UI boundary: report the failure in the output panel instead of raising.
+        return f"Error processing image: {str(e)}"
+# Create Gradio interface
+# Layout: HTML header, then a row with the inputs (image, token slider, button)
+# on the left and the Markdown output on the right, then an "About" accordion.
+with gr.Blocks(title="Nanonets OCR Demo") as demo:
+    # Styled HTML header with links to the model card, release blog and GitHub repository
+    gr.HTML("""
+    <div class="title" style="text-align: center">
+        <h1>🔍 Nanonets OCR - Document Text Extraction</h1>
+        <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
+            A model for transforming documents into structured markdown with intelligent content recognition and semantic tagging
+        </p>
+        <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
+            <a href="https://huggingface.co/nanonets/Nanonets-OCR-s" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
+                📚 Hugging Face Model
+            </a>
+            <a href="https://nanonets.com/research/nanonets-ocr-s/" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
+                📝 Release Blog
+            </a>
+            <a href="https://github.com/NanoNets/docext" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
+                💻 GitHub Repository
+            </a>
+        </div>
+    </div>
+    """)
+    with gr.Row():
+        # Left column: the inputs passed to ocr_image_gradio.
+        with gr.Column(scale=1):
+            image_input = gr.Image(
+                label="Upload Document Image",
+                type="pil",
+                height=400
+            )
+            # Supplies the max_tokens argument of ocr_image_gradio.
+            max_tokens_slider = gr.Slider(
+                minimum=1024,
+                maximum=8192,
+                value=4096,
+                step=512,
+                label="Max Tokens",
+                info="Maximum number of tokens to generate"
+            )
+            extract_btn = gr.Button("Extract Text", variant="primary", size="lg")
+        # Right column: the model output rendered as Markdown.
+        with gr.Column(scale=2):
+            output_text = gr.Markdown(
+                label="Formatted model prediction",
+                # Delimiter pairs treated as LaTeX in the rendered output.
+                latex_delimiters=[
+                    {"left": "$$", "right": "$$", "display": True},
+                    {"left": "$", "right": "$", "display": False},
+                    {
+                        "left": "\\begin{align*}",
+                        "right": "\\end{align*}",
+                        "display": True,
+                    },
+                ],
+                line_breaks=True,
+                show_copy_button=True,
+            )
+    # Event handlers
+    # The same OCR callback is wired to both the button click and the image
+    # component's change event.
+    extract_btn.click(
+        fn=ocr_image_gradio,
+        inputs=[image_input, max_tokens_slider],
+        outputs=output_text,
+        show_progress=True
+    )
+    image_input.change(
+        fn=ocr_image_gradio,
+        inputs=[image_input, max_tokens_slider],
+        outputs=output_text,
+        show_progress=True
+    )
+    # Add model information section
+    with gr.Accordion("About Nanonets-OCR-s", open=False):
+        gr.Markdown("""
+        ## Nanonets-OCR-s
+        Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction.
+        It transforms documents into structured markdown with intelligent content recognition and semantic tagging, making it ideal
+        for downstream processing by Large Language Models (LLMs).
+        ### Key Features
+        - **LaTeX Equation Recognition**: Automatically converts mathematical equations and formulas into properly formatted LaTeX syntax.
+          It distinguishes between inline ($...$) and display ($$...$$) equations.
+        - **Intelligent Image Description**: Describes images within documents using structured `<img>` tags, making them digestible
+          for LLM processing. It can describe various image types, including logos, charts, graphs and so on, detailing their content,
+          style, and context.
+        - **Signature Detection & Isolation**: Identifies and isolates signatures from other text, outputting them within a `<signature>` tag.
+          This is crucial for processing legal and business documents.
+        - **Watermark Extraction**: Detects and extracts watermark text from documents, placing it within a `<watermark>` tag.
+        - **Smart Checkbox Handling**: Converts form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒)
+          for consistent and reliable processing.
+        - **Complex Table Extraction**: Accurately extracts complex tables from documents and converts them into both markdown
+          and HTML table formats.
+        """)
+# Launch the app (with request queuing enabled) only when run as a script.
+if __name__ == "__main__":
+    demo.queue().launch()