| """ |
| Base IO Classes for Document Intelligence |
| |
| Abstract interfaces for document loading and page rendering. |
| """ |
|
|
| from abc import ABC, abstractmethod |
| from dataclasses import dataclass, field |
| from enum import Enum |
| from pathlib import Path |
| from typing import Any, Dict, Iterator, List, Optional, Tuple, Union |
|
|
| import numpy as np |
| from PIL import Image |
|
|
|
|
class DocumentFormat(str, Enum):
    """Kinds of document this module distinguishes between.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value (for example ``DocumentFormat.PDF == "pdf"``).
    """

    PDF = "pdf"
    IMAGE = "image"
    TIFF_MULTIPAGE = "tiff_multipage"
    UNKNOWN = "unknown"

    @classmethod
    def from_path(cls, path: Union[str, Path]) -> "DocumentFormat":
        """Guess the document format from a file name's extension.

        Only the final suffix of ``path`` is inspected, case-insensitively;
        the file itself is never opened.

        Args:
            path: File name or path of the document.

        Returns:
            The member matching the extension, or ``UNKNOWN`` when the
            extension is missing or not recognised.
        """
        extension = Path(path).suffix.lower()
        formats_by_extension = {
            ".pdf": cls.PDF,
            ".jpg": cls.IMAGE,
            ".jpeg": cls.IMAGE,
            ".png": cls.IMAGE,
            ".bmp": cls.IMAGE,
            ".gif": cls.IMAGE,
            ".webp": cls.IMAGE,
            # Every TIFF maps to TIFF_MULTIPAGE; the page count is not
            # examined here.
            ".tif": cls.TIFF_MULTIPAGE,
            ".tiff": cls.TIFF_MULTIPAGE,
        }
        return formats_by_extension.get(extension, cls.UNKNOWN)
|
|
|
|
@dataclass
class PageInfo:
    """Geometry and content flags recorded for one page of a document.

    Only the page number and the pixel dimensions are required; every other
    field has a default.
    """

    # Required fields.
    page_number: int
    width_pixels: int
    height_pixels: int

    # Size in points; None when not available.
    width_points: Optional[float] = None
    height_points: Optional[float] = None

    # Resolution and orientation.
    dpi: int = 72
    rotation: int = 0  # NOTE(review): presumably degrees -- confirm with loaders

    # Content flags, both off unless a loader sets them.
    has_text: bool = False
    has_images: bool = False
|
|
|
|
@dataclass
class DocumentInfo:
    """Descriptive record for a document that has been loaded.

    ``path``, ``format`` and ``num_pages`` are required; the remaining fields
    default to "nothing known" values (``None``, ``0``, ``False`` or an empty
    list).
    """

    # Identity of the document.
    path: Path
    format: DocumentFormat
    num_pages: int
    pages: List[PageInfo] = field(default_factory=list)

    # Descriptive metadata; None when not available.
    title: Optional[str] = None
    author: Optional[str] = None
    subject: Optional[str] = None
    creator: Optional[str] = None
    creation_date: Optional[str] = None
    modification_date: Optional[str] = None

    # File-level properties.
    file_size_bytes: int = 0
    is_encrypted: bool = False
    is_digitally_signed: bool = False

    # Content characteristics.
    has_text_layer: bool = False
    is_scanned: bool = False
    has_forms: bool = False
    has_annotations: bool = False

    @property
    def doc_id(self) -> str:
        """Return a 16-character hexadecimal identifier for the document.

        The identifier is the first 16 hex digits of the SHA-256 digest of
        the file name (not the full path), the file size in bytes and the
        page count, joined with underscores. Documents that agree on all
        three therefore share an identifier.
        """
        import hashlib

        fingerprint = "_".join(
            (self.path.name, str(self.file_size_bytes), str(self.num_pages))
        )
        digest = hashlib.sha256(fingerprint.encode())
        return digest.hexdigest()[:16]
|
|
|
|
@dataclass
class RenderOptions:
    """Settings passed to a renderer when turning a page into an image.

    Every field has a default, so ``RenderOptions()`` is a valid
    configuration.
    """

    # Output resolution in dots per inch.
    dpi: int = 200

    # Colour handling: a mode name and a background colour as a tuple of ints.
    color_mode: str = "RGB"
    background_color: Tuple[int, ...] = (255, 255, 255)

    # Rendering switches, all enabled by default.
    antialias: bool = True
    include_annotations: bool = True
    include_forms: bool = True
|
|
|
|
class DocumentLoader(ABC):
    """
    Interface for objects that open a document and report its metadata.

    Instances work as context managers: leaving the ``with`` block calls
    ``close()``.
    """

    @abstractmethod
    def load(self, path: Union[str, Path]) -> DocumentInfo:
        """
        Open the document at ``path`` and collect its metadata.

        Args:
            path: Location of the document file

        Returns:
            DocumentInfo describing the document
        """
        ...

    @abstractmethod
    def close(self) -> None:
        """Close the document and free whatever resources are held."""
        ...

    @abstractmethod
    def is_loaded(self) -> bool:
        """Report whether a document is currently open."""
        ...

    @property
    @abstractmethod
    def info(self) -> Optional[DocumentInfo]:
        """Metadata of the currently loaded document (may be None)."""
        ...

    def __enter__(self):
        # Entering the context only hands back the loader itself.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always release resources; returning False lets any exception
        # raised inside the ``with`` block propagate.
        self.close()
        return False
|
|
|
|
class PageRenderer(ABC):
    """
    Abstract base class for page rendering.

    Converts document pages to images for processing. Subclasses must
    implement ``render_page``; ``render_pages`` and ``render_region`` are
    built on top of it.
    """

    @abstractmethod
    def render_page(
        self,
        page_number: int,
        options: Optional[RenderOptions] = None
    ) -> np.ndarray:
        """
        Render a single page to an image.

        Args:
            page_number: 1-indexed page number
            options: Rendering options

        Returns:
            Page image as numpy array (H, W, C)
        """
        pass

    def render_pages(
        self,
        page_numbers: Optional[List[int]] = None,
        options: Optional[RenderOptions] = None
    ) -> Iterator[Tuple[int, np.ndarray]]:
        """
        Render multiple pages lazily.

        Args:
            page_numbers: List of 1-indexed page numbers (None = all pages;
                the base class cannot enumerate pages, so subclasses must
                override this method to support None)
            options: Rendering options

        Returns:
            Iterator of (page_number, image_array) tuples. Each page is
            rendered only when the iterator reaches it.

        Raises:
            NotImplementedError: If page_numbers is None. Raised at call
                time rather than on first iteration.
        """
        # This method is deliberately not a generator function: a ``yield``
        # in this body would postpone the check below until the first
        # ``next()`` call, hiding the error from the caller.
        if page_numbers is None:
            raise NotImplementedError("Subclass must provide page iteration")

        return (
            (page_num, self.render_page(page_num, options))
            for page_num in page_numbers
        )

    def render_region(
        self,
        page_number: int,
        region: Tuple[float, float, float, float],
        options: Optional[RenderOptions] = None,
        normalized: bool = True
    ) -> np.ndarray:
        """
        Render a specific region of a page.

        The whole page is rendered with ``render_page`` and then cropped.

        Args:
            page_number: 1-indexed page number
            region: (x_min, y_min, x_max, y_max) coordinates
            options: Rendering options
            normalized: Whether coordinates are normalized (0-1); otherwise
                they are taken as pixel coordinates

        Returns:
            Region image as numpy array, obtained by slicing the rendered
            page. Coordinates are truncated to integers and clamped to the
            page bounds, so an inverted or fully out-of-bounds region gives
            an empty array.

        Raises:
            ValueError: If region does not contain exactly four values
                (raised before the page is rendered).
        """
        # Unpack before rendering so a malformed region fails fast instead
        # of after a full-page render.
        x_min, y_min, x_max, y_max = region

        full_page = self.render_page(page_number, options)
        h, w = full_page.shape[:2]

        if normalized:
            # Scale fractions of the page size to pixel indices.
            x_min, x_max = int(x_min * w), int(x_max * w)
            y_min, y_max = int(y_min * h), int(y_max * h)
        else:
            x_min, y_min = int(x_min), int(y_min)
            x_max, y_max = int(x_max), int(y_max)

        # Clamp to the image so negative values cannot wrap around as
        # negative slice indices.
        x_min = max(0, min(x_min, w))
        x_max = max(0, min(x_max, w))
        y_min = max(0, min(y_min, h))
        y_max = max(0, min(y_max, h))

        return full_page[y_min:y_max, x_min:x_max]
|
|
|
|
class DocumentProcessor(ABC):
    """
    Pairs a document loader with a page renderer.

    Subclasses implement ``process`` to load a document and render its
    pages using the two collaborators stored on the instance.
    """

    def __init__(self, loader: DocumentLoader, renderer: PageRenderer):
        # Keep both collaborators as public attributes for subclasses.
        self.loader, self.renderer = loader, renderer

    @abstractmethod
    def process(
        self,
        path: Union[str, Path],
        options: Optional[RenderOptions] = None,
        page_range: Optional[Tuple[int, int]] = None
    ) -> Iterator[Tuple[int, np.ndarray, PageInfo]]:
        """
        Load a document and render its pages.

        Args:
            path: Location of the document
            options: Rendering options
            page_range: Optional (start, end) pages to process
                (1-indexed, inclusive)

        Yields:
            Tuples of (page_number, image, page_info)
        """
        ...
|
|