Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from PIL import Image | |
| from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText | |
| import torch | |
| import spaces | |
# Hugging Face Hub id of the OCR checkpoint; the model, processor and
# tokenizer are all resolved from this one identifier.
model_path = "nanonets/Nanonets-OCR-s"

# Everything below runs once at import time so that every request reuses the
# same loaded weights instead of reloading them per call.
print("Loading Nanonets OCR model...")
# NOTE(review): attn_implementation="flash_attention_2" needs the optional
# flash-attn package and a compatible CUDA GPU at load time -- confirm the
# deployment provides both, otherwise from_pretrained raises here.
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",
).eval()  # inference only: disable dropout and other train-time behaviour
processor = AutoProcessor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
print("Model loaded successfully!")
def process_tags(content: str) -> str:
    """Escape the model's semantic tags so they show up as literal text.

    The OCR output marks images, watermarks, page numbers and signatures with
    ``<img>``, ``<watermark>``, ``<page_number>`` and ``<signature>`` tags.
    The result is displayed in a Markdown component, where raw ``<tag>``
    markup would be interpreted as HTML instead of being shown to the user.
    Only these four tags are escaped; any other markup (for example HTML
    tables) is passed through untouched.

    Args:
        content: Raw text decoded from the model.

    Returns:
        ``content`` with the opening and closing forms of the four semantic
        tags replaced by their HTML-entity-escaped equivalents.
    """
    # Local import keeps this fix self-contained; html is in the stdlib.
    import html

    for tag in ("img", "watermark", "page_number", "signature"):
        for markup in (f"<{tag}>", f"</{tag}>"):
            # Replacing a tag with itself is a no-op; it has to be turned
            # into its entity-escaped form to survive rendering.
            content = content.replace(markup, html.escape(markup, quote=False))
    return content
def ocr_image_gradio(image, max_tokens=4096):
    """Run the Nanonets OCR model on one image for the Gradio interface.

    Args:
        image: A PIL image, or array data accepted by ``Image.fromarray``.
            ``None`` means nothing has been uploaded yet.
        max_tokens: Upper bound on the number of newly generated tokens.
            Coerced to ``int`` because a UI slider may deliver a float.

    Returns:
        The decoded model output after ``process_tags``; a prompt to upload
        an image when ``image`` is ``None``; or an ``"Error processing
        image: ..."`` message if anything fails. Failures are reported as
        text rather than raised so the UI always has something to display.
    """
    if image is None:
        return "Please upload an image."

    try:
        # Adjacent literals concatenate into a single-line prompt.
        prompt = (
            "Extract the text from the above document as if you were reading it naturally. "
            "Return the tables in html format. "
            "Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. "
            "Page numbers should be wrapped in brackets. "
            "Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. "
            "Prefer using ☐ and ☑ for check boxes."
        )

        # Convert to a PIL image if the caller handed over raw array data.
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            },
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(
            text=[text], images=[image], padding=True, return_tensors="pt"
        )
        inputs = inputs.to(model.device)

        with torch.no_grad():
            output_ids = model.generate(
                **inputs, max_new_tokens=int(max_tokens), do_sample=False
            )

        # generate() returns prompt + completion; slice the prompt off each row.
        generated_ids = [
            sequence[len(prompt_ids):]
            for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        return process_tags(output_text[0])
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the request.
        return f"Error processing image: {e}"
# Static page content lives in module-level constants so the layout code
# below stays short enough to read its nesting at a glance.
# NOTE(review): the emoji were restored from mis-encoded text. The laptop
# glyph and the checkbox symbols follow from the garbled byte pattern; the
# other three header emoji are best guesses -- confirm against the original.
HEADER_HTML = """
<div class="title" style="text-align: center">
    <h1>🔍 Nanonets OCR - Document Text Extraction</h1>
    <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
        A model for transforming documents into structured markdown with intelligent content recognition and semantic tagging
    </p>
    <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
        <a href="https://huggingface.co/nanonets/Nanonets-OCR-s" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
            📚 Hugging Face Model
        </a>
        <a href="https://nanonets.com/research/nanonets-ocr-s/" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
            📝 Release Blog
        </a>
        <a href="https://github.com/NanoNets/docext" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
            💻 GitHub Repository
        </a>
    </div>
</div>
"""

# Kept flush-left on purpose: leading whitespace is significant in Markdown.
ABOUT_MARKDOWN = """
## Nanonets-OCR-s
Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction.
It transforms documents into structured markdown with intelligent content recognition and semantic tagging, making it ideal
for downstream processing by Large Language Models (LLMs).
### Key Features
- **LaTeX Equation Recognition**: Automatically converts mathematical equations and formulas into properly formatted LaTeX syntax.
It distinguishes between inline ($...$) and display ($$...$$) equations.
- **Intelligent Image Description**: Describes images within documents using structured `<img>` tags, making them digestible
for LLM processing. It can describe various image types, including logos, charts, graphs and so on, detailing their content,
style, and context.
- **Signature Detection & Isolation**: Identifies and isolates signatures from other text, outputting them within a `<signature>` tag.
This is crucial for processing legal and business documents.
- **Watermark Extraction**: Detects and extracts watermark text from documents, placing it within a `<watermark>` tag.
- **Smart Checkbox Handling**: Converts form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒)
for consistent and reliable processing.
- **Complex Table Extraction**: Accurately extracts complex tables from documents and converts them into both markdown
and HTML table formats.
"""

# Create Gradio interface
with gr.Blocks(title="Nanonets OCR Demo") as demo:
    # Styled header with links to the model card, release blog and repository.
    gr.HTML(HEADER_HTML)

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="Upload Document Image",
                type="pil",
                height=400,
            )
            max_tokens_slider = gr.Slider(
                minimum=1024,
                maximum=8192,
                value=4096,
                step=512,
                label="Max Tokens",
                info="Maximum number of tokens to generate",
            )
            extract_btn = gr.Button("Extract Text", variant="primary", size="lg")

        # Right column: rendered model output, with LaTeX delimiters enabled
        # so recognised equations are typeset.
        with gr.Column(scale=2):
            output_text = gr.Markdown(
                label="Formatted model prediction",
                latex_delimiters=[
                    {"left": "$$", "right": "$$", "display": True},
                    {"left": "$", "right": "$", "display": False},
                    {
                        "left": "\\begin{align*}",
                        "right": "\\end{align*}",
                        "display": True,
                    },
                ],
                line_breaks=True,
                show_copy_button=True,
            )

    # Event handlers: OCR runs on an explicit button click and also whenever
    # the image input changes.
    extract_btn.click(
        fn=ocr_image_gradio,
        inputs=[image_input, max_tokens_slider],
        outputs=output_text,
        show_progress=True,
    )
    image_input.change(
        fn=ocr_image_gradio,
        inputs=[image_input, max_tokens_slider],
        outputs=output_text,
        show_progress=True,
    )

    # Collapsible model information section.
    with gr.Accordion("About Nanonets-OCR-s", open=False):
        gr.Markdown(ABOUT_MARKDOWN)
if __name__ == "__main__":
    # Script entry point: turn on request queuing, then start the web server.
    queued_demo = demo.queue()
    queued_demo.launch()