File size: 16,500 Bytes
4aec76b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
"""Module dealing specifically with loading files into Document objects.
Contains the `load_file` function to load text, PDF, and markdown files.
Uses Docling for advanced PDF parsing with OCR support for scanned PDFs.
Falls back to PyMuPDF if Docling is not available.

Supports multimodal document loading with automatic image extraction from PDFs.

## For testing:
- Run this file from `server` folder as:
- `python -m llm_system.utils.loader`
"""

import os
from typing import List, Optional, Dict, Any
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass, field
import uuid

from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
import fitz  # PyMuPDF
from PIL import Image

from logger import get_logger
log = get_logger(name="doc_loader")

# Try to import Docling for advanced PDF parsing.
# Docling is an optional dependency: DOCLING_AVAILABLE records whether the
# import succeeded so the PDF code paths in this module can choose between
# Docling and the PyMuPDF fallback at call time. Only ImportError is handled;
# any other failure raised while importing Docling propagates.
try:
    from docling.document_converter import DocumentConverter
    DOCLING_AVAILABLE = True
    log.info("βœ… Docling library available - will use for PDF parsing with OCR support")
except ImportError:
    # DocumentConverter stays undefined here, so it must only be referenced
    # after checking DOCLING_AVAILABLE.
    DOCLING_AVAILABLE = False
    log.warning("⚠️ Docling library not available - will fallback to PyMuPDF for PDFs")

# Import config for multimodal settings.
# EXTRACT_IMAGES_FROM_PDF switches PDF image extraction on or off and
# IMAGE_OUTPUT_DIR is the default root directory for extracted images.
try:
    from llm_system.config import EXTRACT_IMAGES_FROM_PDF, IMAGE_OUTPUT_DIR
except ImportError:
    # Fallback defaults if config not available: extraction stays enabled and
    # images go to a relative path (resolved against the working directory).
    EXTRACT_IMAGES_FROM_PDF = True
    IMAGE_OUTPUT_DIR = "server/user_uploads/extracted_images"


@dataclass
class ImageContent:
    """Record describing a single image pulled out of a document.

    Attributes:
        image_id: Unique identifier assigned to the image.
        image_path: Location of the saved image on disk; a plain string is
            converted to a ``Path`` after initialization.
        description: Optional free-text description of the image.
        page_number: Page on which the image was found.
        position: Where on the page the image sits (e.g., "top", "center", "bottom").
        metadata: Extra details about the image (size, format, source PDF, etc.).
    """
    image_id: str
    image_path: Path
    description: str = ""
    page_number: int = 0
    position: str = ""
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Normalize a string ``image_path`` to ``Path``; other values are kept as given."""
        raw_path = self.image_path
        if not isinstance(raw_path, str):
            return
        self.image_path = Path(raw_path)


def _extract_images_with_docling(pdf_path: str, pdf_name: str, image_dir: Path) -> List[ImageContent]:
    """Save each Docling picture block that carries image data as a PNG in ``image_dir``.

    Args:
        pdf_path: Path to the PDF handed to Docling's ``DocumentConverter``.
        pdf_name: PDF file stem, recorded as ``source_pdf`` in each image's metadata.
        image_dir: Existing directory the PNG files are written to.

    Returns:
        One ImageContent per saved picture; empty when no usable picture block is found.

    Raises:
        Exception: Whatever Docling raises while converting the document. A failure
            on an individual picture block is logged at debug level and skipped.
    """
    extracted: List[ImageContent] = []

    converter = DocumentConverter()
    doc = converter.convert(pdf_path).document

    # NOTE(review): this relies on the converted document exposing `body.blocks`.
    # When the installed Docling version does not, nothing is extracted here and
    # the caller falls back to PyMuPDF - confirm against the Docling version in use.
    if not (hasattr(doc, 'body') and hasattr(doc.body, 'blocks')):
        return extracted

    log.debug(f"Scanning {len(doc.body.blocks)} Docling blocks for pictures...")
    for block_idx, block in enumerate(doc.body.blocks):
        block_type = type(block).__name__
        log.debug(f"Block {block_idx}: {block_type}")

        # Only picture blocks can carry image data
        if 'Picture' not in block_type:
            continue

        try:
            picture = getattr(block, 'image', None)
            if picture is None:
                continue

            image_id = f"img_docling_{block_idx:03d}_{uuid.uuid4().hex[:8]}"
            image_path = image_dir / f"{image_id}.png"

            picture.save(str(image_path), format='PNG')
            log.info(f"βœ… Extracted image via Docling: {image_path}")

            # Blocks without a page number are reported as page 1 (0 + 1)
            page_num = getattr(block, 'page_number', 0)

            extracted.append(
                ImageContent(
                    image_id=image_id,
                    image_path=image_path,
                    page_number=page_num + 1,
                    position="middle",
                    metadata={
                        "source_pdf": pdf_name,
                        "extracted_at": datetime.now().isoformat(),
                        "format": "PNG",
                        "extractor": "docling",
                        "size": (picture.width, picture.height) if hasattr(picture, 'width') else (0, 0),
                    },
                )
            )
        except Exception as e:
            log.debug(f"Could not extract Docling picture block {block_idx}: {e}")
            continue

    return extracted


def _extract_images_with_pymupdf(pdf_path: str, pdf_name: str, image_dir: Path, images: List[ImageContent]) -> None:
    """Save every embedded image PyMuPDF reports, page by page, appending to ``images``.

    Results are appended to the caller's list rather than returned so that images
    saved before an unexpected page-level failure are still reported to the caller.

    Args:
        pdf_path: Path to the PDF file to open with PyMuPDF.
        pdf_name: PDF file stem, recorded as ``source_pdf`` in each image's metadata.
        image_dir: Existing directory the PNG files are written to.
        images: List that receives one ImageContent per saved image.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        log.info(f"Opened PDF with {pdf_document.page_count} pages")

        for page_num in range(pdf_document.page_count):
            image_list = pdf_document[page_num].get_images(full=True)

            if not image_list:
                log.debug(f"No images found on page {page_num}")
                continue

            log.info(f"Found {len(image_list)} images on page {page_num}")

            for img_index, img in enumerate(image_list):
                try:
                    xref = img[0]
                    pix = fitz.Pixmap(pdf_document, xref)

                    # `n - alpha` is the number of colour components: 3 for RGB.
                    # Anything else is converted so the PNG writer always gets
                    # RGB data. The previous `< 4` test converted gray/RGB but
                    # skipped CMYK, the one case that cannot be saved as PNG.
                    if pix.n - pix.alpha != 3:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    image_id = f"img_{page_num:03d}_{img_index:02d}_{uuid.uuid4().hex[:8]}"
                    image_path = image_dir / f"{image_id}.png"

                    pix.save(str(image_path))
                    log.info(f"βœ… Saved image: {image_path}")

                    images.append(
                        ImageContent(
                            image_id=image_id,
                            image_path=image_path,
                            page_number=page_num + 1,  # 1-indexed for humans
                            position="middle",  # Can be enhanced with actual position
                            metadata={
                                "source_pdf": pdf_name,
                                "extracted_at": datetime.now().isoformat(),
                                "format": "PNG",
                                "extractor": "pymupdf",
                                "size": (pix.width, pix.height),
                            },
                        )
                    )
                except Exception as e:
                    log.warning(f"Failed to extract image {img_index} on page {page_num}: {e}")
                    continue
    finally:
        # Release the file handle even when a page-level call raises
        pdf_document.close()


def extract_images_from_pdf(pdf_path: str, output_dir: Optional[str] = None, user_id: str = "") -> List[ImageContent]:
    """Extract images from a PDF file and save them to disk as PNG files.

    Docling is tried first when it is installed; if it fails or yields no
    images, every embedded image is extracted with PyMuPDF instead. Images are
    written to ``<output_dir>/<user_id>/<sanitized pdf stem>/``.

    Args:
        pdf_path: Path to the PDF file.
        output_dir: Directory to save extracted images (default: IMAGE_OUTPUT_DIR).
        user_id: User ID used as a sub-directory for organizing images.

    Returns:
        List of ImageContent objects with paths and metadata. The list is empty
        when extraction is disabled in config and may be partial or empty when
        an error occurs: errors are logged, never raised.
    """
    if not EXTRACT_IMAGES_FROM_PDF:
        log.debug("Image extraction disabled in config")
        return []

    if output_dir is None:
        output_dir = IMAGE_OUTPUT_DIR

    images: List[ImageContent] = []
    pdf_name = Path(pdf_path).stem

    try:
        # Sanitize directory name (remove special characters)
        pdf_name_safe = "".join(c if c.isalnum() or c in ('-', '_') else '_' for c in pdf_name)

        # Create user-specific output directory.
        # NOTE(review): user_id is used as a path component without sanitizing;
        # confirm callers only pass trusted identifiers (no separators or "..").
        user_image_dir = Path(output_dir) / user_id / pdf_name_safe
        user_image_dir.mkdir(parents=True, exist_ok=True)
        log.info(f"Created image output directory: {user_image_dir}")

        # Try Docling first for advanced image understanding
        if DOCLING_AVAILABLE:
            log.info("πŸ” Attempting to extract images using Docling...")
            try:
                docling_images = _extract_images_with_docling(pdf_path, pdf_name, user_image_dir)
            except Exception as e:
                log.warning(f"⚠️ Docling image extraction failed: {e}, falling back to PyMuPDF")
            else:
                if docling_images:
                    log.info(f"βœ… Docling extracted {len(docling_images)} images")
                    images.extend(docling_images)
                    return images
                log.debug("Docling found no extractable picture blocks, falling back to PyMuPDF")

        # Fallback to PyMuPDF for faster extraction
        log.info("πŸ“• Extracting images using PyMuPDF...")
        _extract_images_with_pymupdf(pdf_path, pdf_name, user_image_dir, images)
        log.info(f"βœ… Extracted {len(images)} images from PDF")

    except Exception as e:
        # Best-effort: report the failure and return whatever was saved so far
        log.error(f"❌ Error extracting images from PDF: {e}")
        import traceback
        log.error(traceback.format_exc())

    return images


def load_file(user_id: str, file_path: str) -> tuple[bool, List[Document], str]:
    """Load a file and return its content as a list of Document objects.

    Text and Markdown files go through their LangChain loaders. PDFs are parsed
    with Docling when it is available (producing a single Document holding the
    whole file as Markdown) and with PyMuPDFLoader otherwise or when Docling
    fails. For PDFs, images are also extracted and attached to the metadata of
    every returned Document.

    The file type is decided by the lower-cased text after the last ``.`` in
    ``file_path``, so ``report.PDF`` and ``notes.TXT`` are handled like their
    lower-case equivalents.

    Args:
        user_id (str): The ID of the user who is loading the file.
        file_path (str): The absolute path to the file to be loaded.

    Returns:
        tuple[bool, List[Document], str]: A tuple containing:
            - bool: False for an unsupported file type or a loader error,
              True otherwise (including when the file yields no content).
            - List[Document]: A list of Document objects containing the file's content.
            - str: Message indicating the result of the loading operation.
    """

    log.info(f"πŸ” load_file() starting - file_path: {file_path}, user_id: {user_id}")
    file_extension = file_path.split('.')[-1].lower()
    log.info(f"πŸ“‹ File extension detected: {file_extension}")

    if file_extension not in ('txt', 'pdf', 'md'):
        log.error(f"❌ Unsupported file type: {file_extension}.")
        return False, [], f"Unsupported file type: {file_extension}. Supported types are: txt, pdf, md."

    # Branch on the normalized extension everywhere so validation, loader
    # selection and image extraction cannot disagree about the file type.
    is_pdf = file_extension == 'pdf'

    # Must be bound for every file type: it is checked below to decide whether
    # a loader still has to run (only the Docling path fills it in directly).
    file_content = None
    loader = None

    if file_extension == 'txt':
        log.info("πŸ“„ Loading as TXT file")
        loader = TextLoader(file_path, encoding='utf-8')

    elif file_extension == 'md':
        log.info("πŸ“ Loading as Markdown file")
        loader = UnstructuredMarkdownLoader(file_path)

    else:
        # Use Docling for PDFs if available (better OCR support for scanned PDFs)
        if DOCLING_AVAILABLE:
            log.info("πŸ“• Loading PDF using Docling (with OCR support for scanned PDFs)")
            try:
                converter = DocumentConverter()
                docling_doc = converter.convert(file_path)

                # Convert Docling output to LangChain Documents
                # Docling preserves structure better than PyMuPDF
                markdown_text = docling_doc.document.export_to_markdown()

                # Create a single document with all content
                file_content = [
                    Document(
                        page_content=markdown_text,
                        metadata={
                            "source": os.path.basename(file_path),
                            "file_path": file_path,
                            "loader": "docling"
                        }
                    )
                ]
                log.info(f"βœ… Docling successfully parsed PDF: {len(markdown_text)} chars extracted")
            except Exception as e:
                log.warning(f"⚠️ Docling parsing failed: {e}, falling back to PyMuPDF")
                file_content = None

        if file_content is None:
            # Fallback to PyMuPDF if Docling not available or failed
            log.info("πŸ“• Loading as PDF file using PyMuPDFLoader")
            loader = PyMuPDFLoader(file_path, extract_images=False)

    # Load the file and return the documents
    if file_content is None:
        # If we didn't get content from Docling, use the loader (PyMuPDF, TextLoader, etc.)
        log.info("⏳ Executing loader.load()...")
        try:
            file_content = loader.load()
            log.info(f"βœ… loader.load() completed, got {len(file_content)} pages/documents")
        except Exception as e:
            log.error(f"❌ loader.load() failed with exception: {e}")
            import traceback
            log.error(f"Traceback: {traceback.format_exc()}")
            return False, [], f"Error loading file: {e}"

    # Extract images from PDF if applicable
    extracted_images = []
    if is_pdf:
        log.info("πŸ–ΌοΈ  Extracting images from PDF...")
        extracted_images = extract_images_from_pdf(file_path, user_id=user_id)
        log.info(f"Found {len(extracted_images)} images")

    # Add user metadata to each doc and attach images
    for doc in file_content:
        doc.metadata['user_id'] = user_id

        # Attach extracted images to the document
        if extracted_images:
            # Convert ImageContent objects to serializable format
            # (every document receives the full list of the PDF's images)
            doc.metadata['images'] = [
                {
                    'image_id': img.image_id,
                    'image_path': str(img.image_path),
                    'page_number': img.page_number,
                    'position': img.position,
                    'metadata': img.metadata
                }
                for img in extracted_images
            ]
            log.info(f"Attached {len(extracted_images)} images to document metadata")

        # The retrieved docs are exposed to the UI, so hide the full server
        # file path if it is there:
        if 'file_path' in doc.metadata:
            doc.metadata['file_path'] = os.path.basename(doc.metadata['file_path'])

        if 'source' in doc.metadata:
            source = doc.metadata['source']
            # A source that looks like a URL is kept as is; a local file is
            # reduced to its file name.
            if "www." not in source and "http" not in source:
                doc.metadata['source'] = os.path.basename(source)

    if not file_content:
        log.error(f"No content found in the file: {file_path}")
        return True, [], f"No content found in the file: {file_path}"

    log.info(f"Loaded {len(file_content)} documents from {file_path} for user {user_id} (with {len(extracted_images)} images).")
    return True, file_content, f"Loaded {len(file_content)} documents with {len(extracted_images)} images."


if __name__ == "__main__":
    # Manual smoke test: load one sample file and print the outcome together
    # with the first few documents.
    print(os.getcwd())

    sample_path = "/Users/neetikasaxena/Documents/sanchit/sample_code/chat-with-your-data/test_data/resume_sanchit_imo_health.pdf"
    # Other sample inputs that can be swapped in:
    #   "../../../GenAI/Data/speech.txt"
    #   "../../../GenAI/Data/speech.md"

    try:
        status, docs, message = load_file(user_id="test_user", file_path=sample_path)

        # Status flag, result message and document count, one per line
        print(status, message, len(docs), sep="\n")

        # Show at most the first three documents, each preceded by blank lines
        for doc in docs[:3]:
            print("\n")
            print(repr(doc))

    except Exception as e:
        print(f"Error loading file: {e}")