"""Module dealing specifically with loading files into Document objects. Contains the `load_file` function to load text, PDF, and markdown files. Uses Docling for advanced PDF parsing with OCR support for scanned PDFs. Falls back to PyMuPDF if Docling is not available. Supports multimodal document loading with automatic image extraction from PDFs. ## For testing: - Run this file from `server` folder as: - `python -m llm_system.utils.loader` """ import os from typing import List, Optional, Dict, Any from pathlib import Path from datetime import datetime from dataclasses import dataclass, field import uuid from langchain_core.documents import Document from langchain_community.document_loaders import TextLoader, PyMuPDFLoader from langchain_community.document_loaders import UnstructuredMarkdownLoader import fitz # PyMuPDF from PIL import Image from logger import get_logger log = get_logger(name="doc_loader") # Try to import Docling for advanced PDF parsing try: from docling.document_converter import DocumentConverter DOCLING_AVAILABLE = True log.info("✅ Docling library available - will use for PDF parsing with OCR support") except ImportError: DOCLING_AVAILABLE = False log.warning("⚠️ Docling library not available - will fallback to PyMuPDF for PDFs") # Import config for multimodal settings try: from llm_system.config import EXTRACT_IMAGES_FROM_PDF, IMAGE_OUTPUT_DIR except ImportError: # Fallback defaults if config not available EXTRACT_IMAGES_FROM_PDF = True IMAGE_OUTPUT_DIR = "server/user_uploads/extracted_images" @dataclass class ImageContent: """Represents an image extracted from a document. Attributes: image_id: Unique identifier for the image image_path: Path to where the image is stored on disk description: Text description of the image (optional) page_number: Page number where image was found position: Position on page (e.g., "top", "center", "bottom") metadata: Additional metadata (size, format, source PDF, etc.) 
""" image_id: str image_path: Path description: str = "" page_number: int = 0 position: str = "" metadata: Dict[str, Any] = field(default_factory=dict) def __post_init__(self): """Ensure image_path is a Path object.""" if isinstance(self.image_path, str): self.image_path = Path(self.image_path) def extract_images_from_pdf(pdf_path: str, output_dir: str = None, user_id: str = "") -> List[ImageContent]: """Extract images from a PDF file and save them to disk. Attempts to use Docling's advanced image extraction first, falls back to PyMuPDF for faster extraction. Args: pdf_path: Path to the PDF file output_dir: Directory to save extracted images (default: IMAGE_OUTPUT_DIR) user_id: User ID for organizing images Returns: List of ImageContent objects with paths and metadata """ if not EXTRACT_IMAGES_FROM_PDF: log.debug("Image extraction disabled in config") return [] if output_dir is None: output_dir = IMAGE_OUTPUT_DIR images = [] pdf_name = Path(pdf_path).stem try: # Sanitize directory name (remove special characters) pdf_name_safe = "".join(c if c.isalnum() or c in ('-', '_') else '_' for c in pdf_name) # Create user-specific output directory user_image_dir = Path(output_dir) / user_id / pdf_name_safe user_image_dir.mkdir(parents=True, exist_ok=True) log.info(f"Created image output directory: {user_image_dir}") # Try Docling first for advanced image understanding docling_images = [] if DOCLING_AVAILABLE: log.info(f"🔍 Attempting to extract images using Docling...") try: converter = DocumentConverter() docling_doc = converter.convert(pdf_path) doc = docling_doc.document # Docling stores images in various ways depending on PDF structure # Try to access pictures from the document if hasattr(doc, 'body') and hasattr(doc.body, 'blocks'): log.debug(f"Scanning {len(doc.body.blocks)} Docling blocks for pictures...") for block_idx, block in enumerate(doc.body.blocks): block_type = type(block).__name__ log.debug(f"Block {block_idx}: {block_type}") # Check for picture blocks if 
'Picture' in block_type: try: # Docling picture blocks may have image data if hasattr(block, 'image') and block.image is not None: image_id = f"img_docling_{block_idx:03d}_{uuid.uuid4().hex[:8]}" image_filename = f"{image_id}.png" image_path = user_image_dir / image_filename # Save the image block.image.save(str(image_path), format='PNG') log.info(f"✅ Extracted image via Docling: {image_path}") # Get page number page_num = 0 if hasattr(block, 'page_number'): page_num = block.page_number # Create ImageContent image_content = ImageContent( image_id=image_id, image_path=image_path, page_number=page_num + 1, position="middle", metadata={ "source_pdf": pdf_name, "extracted_at": datetime.now().isoformat(), "format": "PNG", "extractor": "docling", "size": (block.image.width, block.image.height) if hasattr(block.image, 'width') else (0, 0), } ) docling_images.append(image_content) except Exception as e: log.debug(f"Could not extract Docling picture block {block_idx}: {e}") continue if docling_images: log.info(f"✅ Docling extracted {len(docling_images)} images") images.extend(docling_images) return images else: log.debug("Docling found no extractable picture blocks, falling back to PyMuPDF") except Exception as e: log.warning(f"⚠️ Docling image extraction failed: {e}, falling back to PyMuPDF") # Fallback to PyMuPDF for faster extraction log.info(f"📕 Extracting images using PyMuPDF...") pdf_document = fitz.open(pdf_path) log.info(f"Opened PDF with {pdf_document.page_count} pages") for page_num in range(pdf_document.page_count): page = pdf_document[page_num] image_list = page.get_images(full=True) if not image_list: log.debug(f"No images found on page {page_num}") continue log.info(f"Found {len(image_list)} images on page {page_num}") for img_index, img in enumerate(image_list): try: xref = img[0] pix = fitz.Pixmap(pdf_document, xref) # Convert CMYK to RGB if needed if pix.n - pix.alpha < 4: pix = fitz.Pixmap(fitz.csRGB, pix) # Generate image filename image_id = 
f"img_{page_num:03d}_{img_index:02d}_{uuid.uuid4().hex[:8]}" image_filename = f"{image_id}.png" image_path = user_image_dir / image_filename # Save image pix.save(str(image_path)) log.info(f"✅ Saved image: {image_path}") # Create ImageContent object image_content = ImageContent( image_id=image_id, image_path=image_path, page_number=page_num + 1, # 1-indexed for humans position="middle", # Can be enhanced with actual position metadata={ "source_pdf": pdf_name, "extracted_at": datetime.now().isoformat(), "format": "PNG", "extractor": "pymupdf", "size": (pix.width, pix.height), } ) images.append(image_content) except Exception as e: log.warning(f"Failed to extract image {img_index} on page {page_num}: {e}") continue pdf_document.close() log.info(f"✅ Extracted {len(images)} images from PDF") except Exception as e: log.error(f"❌ Error extracting images from PDF: {e}") import traceback log.error(traceback.format_exc()) return images def load_file(user_id: str, file_path: str) -> tuple[bool, List[Document], str]: """Load a file and return its content as a list of Document objects. Usually one document per page. For PDFs, automatically extracts images and attaches them to metadata. Args: user_id (str): The ID of the user who is loading the file. file_path (str): The absolute path to the file to be loaded. Returns: tuple[bool, List[Document], str]: A tuple containing: - bool: True if the file was loaded successfully, False otherwise. - List[Document]: A list of Document objects containing the file's content. - str: Message indicating the result of the loading operation. """ log.info(f"🔍 load_file() starting - file_path: {file_path}, user_id: {user_id}") file_extension = file_path.split('.')[-1].lower() log.info(f"📋 File extension detected: {file_extension}") if file_extension not in ['txt', 'pdf', "md"]: log.error(f"❌ Unsupported file type: {file_extension}.") return False, [], f"Unsupported file type: {file_extension}. Supported types are: txt, pdf." 
if file_path.endswith('.txt'): log.info(f"📄 Loading as TXT file") loader = TextLoader(file_path, encoding='utf-8') elif file_path.endswith('.md'): log.info(f"📝 Loading as Markdown file") loader = UnstructuredMarkdownLoader(file_path) else: # Use Docling for PDFs if available (better OCR support for scanned PDFs) file_content = None use_docling = DOCLING_AVAILABLE if use_docling: log.info(f"📕 Loading PDF using Docling (with OCR support for scanned PDFs)") try: converter = DocumentConverter() docling_doc = converter.convert(file_path) # Convert Docling output to LangChain Documents # Docling preserves structure better than PyMuPDF markdown_text = docling_doc.document.export_to_markdown() # Create a single document with all content file_content = [ Document( page_content=markdown_text, metadata={ "source": os.path.basename(file_path), "file_path": file_path, "loader": "docling" } ) ] log.info(f"✅ Docling successfully parsed PDF: {len(markdown_text)} chars extracted") except Exception as e: log.warning(f"⚠️ Docling parsing failed: {e}, falling back to PyMuPDF") file_content = None use_docling = False if not use_docling: # Fallback to PyMuPDF if Docling not available or failed log.info(f"📕 Loading as PDF file using PyMuPDFLoader") loader = PyMuPDFLoader(file_path, extract_images=False) # Load the file and return the documents if file_content is None: # If we didn't get content from Docling, use the loader (PyMuPDF, TextLoader, etc.) 
log.info(f"⏳ Executing loader.load()...") try: file_content = loader.load() log.info(f"✅ loader.load() completed, got {len(file_content)} pages/documents") except Exception as e: log.error(f"❌ loader.load() failed with exception: {e}") import traceback log.error(f"Traceback: {traceback.format_exc()}") return False, [], f"Error loading file: {e}" # Extract images from PDF if applicable extracted_images = [] if file_path.endswith('.pdf'): log.info(f"🖼️ Extracting images from PDF...") extracted_images = extract_images_from_pdf(file_path, user_id=user_id) log.info(f"Found {len(extracted_images)} images") # Add user metadata to each doc and attach images for doc in file_content: doc.metadata['user_id'] = user_id # Attach extracted images to the document if extracted_images: # Convert ImageContent objects to serializable format doc.metadata['images'] = [ { 'image_id': img.image_id, 'image_path': str(img.image_path), 'page_number': img.page_number, 'position': img.position, 'metadata': img.metadata } for img in extracted_images ] log.info(f"Attached {len(extracted_images)} images to document metadata") # Since i am exposing the retrieved docs to UI # Hide full server file path if its there: if 'file_path' in doc.metadata: doc.metadata['file_path'] = os.path.basename(doc.metadata['file_path']) if 'source' in doc.metadata: # If it is not local file, keep source as is: if "www." in doc.metadata['source'] or "http" in doc.metadata['source']: continue # If it is local file, keep only the file name: else: doc.metadata['source'] = os.path.basename(doc.metadata['source']) if not file_content: log.error(f"No content found in the file: {file_path}") return True, [], f"No content found in the file: {file_path}" log.info(f"Loaded {len(file_content)} documents from {file_path} for user {user_id} (with {len(extracted_images)} images).") return True, file_content, f"Loaded {len(file_content)} documents with {len(extracted_images)} images." 
if __name__ == "__main__":
    # Manual smoke test: load one file and print a summary of the result.
    # Pass the file to load as the first command-line argument, e.g.
    #   python -m llm_system.utils.loader path/to/file.pdf
    # (.txt and .md files work as well). Without an argument the original
    # sample PDF path is used.
    import sys

    default_sample = "/Users/neetikasaxena/Documents/sanchit/sample_code/chat-with-your-data/test_data/resume_sanchit_imo_health.pdf"
    sample_path = sys.argv[1] if len(sys.argv) > 1 else default_sample

    print(os.getcwd())

    try:
        status, docs, message = load_file(
            user_id="test_user",
            file_path=sample_path,
        )
        print(status)
        print(message)
        print(len(docs))

        # Show at most the first three documents.
        for doc in docs[:3]:
            print("\n")
            print(repr(doc))
    except Exception as e:
        print(f"Error loading file: {e}")