| """ |
| Evidence Builder for Document Grounding |
| |
| Creates evidence references for extracted information. |
| Handles image cropping and base64 encoding. |
| """ |
|
|
| import base64 |
| import io |
| from typing import List, Optional, Dict, Any, Tuple |
| from pydantic import BaseModel, Field |
| import numpy as np |
| from PIL import Image |
| from loguru import logger |
|
|
| from ..schemas.core import ( |
| BoundingBox, |
| DocumentChunk, |
| EvidenceRef, |
| OCRRegion, |
| ) |
|
|
|
|
class GroundingConfig(BaseModel):
    """Configuration for grounding and evidence generation."""

    # --- Visual evidence (cropped page images) ---
    include_images: bool = Field(
        default=True,
        description="Include cropped images in evidence"
    )
    crop_padding: int = Field(
        default=10,
        ge=0,
        description="Padding around crop regions in pixels"
    )
    max_image_size: int = Field(
        default=512,
        ge=64,
        description="Maximum dimension for cropped images"
    )
    # Encoding settings consumed by encode_image_base64; quality is only
    # honored when image_format is JPEG.
    image_format: str = Field(
        default="PNG",
        description="Image format for encoding (PNG/JPEG)"
    )
    image_quality: int = Field(
        default=85,
        ge=1,
        le=100,
        description="JPEG quality if using JPEG format"
    )

    # --- Text snippets ---
    max_snippet_length: int = Field(
        default=200,
        ge=50,
        description="Maximum length of text snippets"
    )
    include_context: bool = Field(
        default=True,
        description="Include surrounding context in snippets"
    )
|
|
|
|
def crop_region_image(
    image: np.ndarray,
    bbox: BoundingBox,
    padding: int = 10,
    max_size: Optional[int] = None,
) -> np.ndarray:
    """
    Crop a region from an image, with optional padding and downscaling.

    Args:
        image: Source image (RGB, HWC format)
        bbox: Bounding box to crop (pixel coordinates)
        padding: Padding around the crop, in pixels
        max_size: Maximum dimension (will resize, aspect-preserving,
            if either side of the crop is larger)

    Returns:
        Cropped image as numpy array

    Raises:
        ValueError: If the padded bounding box does not intersect the
            image (would produce an empty crop).
    """
    height, width = image.shape[:2]

    # Clamp the padded box to the image bounds.
    x1 = max(0, int(bbox.x_min) - padding)
    y1 = max(0, int(bbox.y_min) - padding)
    x2 = min(width, int(bbox.x_max) + padding)
    y2 = min(height, int(bbox.y_max) + padding)

    # Guard against boxes that lie entirely outside the image: an empty
    # slice would silently propagate and later fail during PIL encoding
    # with a far less actionable error.
    if x2 <= x1 or y2 <= y1:
        raise ValueError(
            f"Bounding box ({bbox.x_min}, {bbox.y_min}, {bbox.x_max}, "
            f"{bbox.y_max}) produces an empty crop for a "
            f"{width}x{height} image"
        )

    cropped = image[y1:y2, x1:x2]

    # Downscale only when needed; thumbnail() preserves aspect ratio.
    if max_size and max(cropped.shape[:2]) > max_size:
        pil_img = Image.fromarray(cropped)
        pil_img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
        cropped = np.array(pil_img)

    return cropped
|
|
|
|
def encode_image_base64(
    image: np.ndarray,
    format: str = "PNG",
    quality: int = 85,
) -> str:
    """
    Serialize an image array into a base64-encoded string.

    Args:
        image: Image as numpy array
        format: Image format (PNG/JPEG)
        quality: JPEG quality if applicable

    Returns:
        Base64-encoded string
    """
    pic = Image.fromarray(image)

    # Normalize to RGB so both encoders accept the image.
    if pic.mode != "RGB":
        pic = pic.convert("RGB")

    # Render the compressed bytes into an in-memory buffer.
    sink = io.BytesIO()
    if format.upper() == "JPEG":
        pic.save(sink, format="JPEG", quality=quality)
    else:
        pic.save(sink, format="PNG")

    return base64.b64encode(sink.getvalue()).decode("utf-8")
|
|
|
|
def create_evidence_ref(
    chunk: DocumentChunk,
    source_type: str = "text",
    snippet: Optional[str] = None,
    confidence: float = 1.0,
    image: Optional[np.ndarray] = None,
    config: Optional[GroundingConfig] = None,
) -> EvidenceRef:
    """
    Create an evidence reference from a document chunk.

    Args:
        chunk: Source chunk
        source_type: Type of source (text/table/figure)
        snippet: Optional specific snippet (defaults to truncated chunk text)
        confidence: Confidence score
        image: Optional page image for cropping
        config: Grounding configuration (fresh defaults when omitted)

    Returns:
        EvidenceRef instance
    """
    cfg = config or GroundingConfig()

    # Default snippet: chunk text cut to the configured limit, with an
    # ellipsis marking any truncation.
    if snippet is None:
        limit = cfg.max_snippet_length
        snippet = chunk.text[:limit]
        if len(chunk.text) > limit:
            snippet += "..."

    evidence = EvidenceRef(
        chunk_id=chunk.chunk_id,
        page=chunk.page,
        bbox=chunk.bbox,
        source_type=source_type,
        snippet=snippet,
        confidence=confidence,
    )

    # Visual evidence is best-effort: a failed crop/encode must never
    # abort evidence creation.
    if cfg.include_images and image is not None:
        try:
            region = crop_region_image(
                image,
                chunk.bbox,
                padding=cfg.crop_padding,
                max_size=cfg.max_image_size,
            )
            evidence.image_base64 = encode_image_base64(
                region,
                format=cfg.image_format,
                quality=cfg.image_quality,
            )
        except Exception as e:
            logger.warning(f"Failed to crop evidence image: {e}")

    return evidence
|
|
|
|
class EvidenceBuilder:
    """
    Builder for creating evidence references.

    Handles:
    - Evidence from chunks
    - Evidence from OCR regions
    - Evidence aggregation
    - Image cropping and encoding
    """

    def __init__(self, config: Optional[GroundingConfig] = None):
        """Initialize evidence builder with an optional configuration."""
        self.config = config or GroundingConfig()

    def from_chunk(
        self,
        chunk: DocumentChunk,
        image: Optional[np.ndarray] = None,
        additional_context: Optional[str] = None,
    ) -> EvidenceRef:
        """
        Create evidence reference from a chunk.

        Args:
            chunk: Source chunk
            image: Optional page image for visual evidence
            additional_context: Optional context prepended to the snippet

        Returns:
            EvidenceRef
        """
        source_type = chunk.chunk_type.value

        # Truncate to the configured limit first; the context prefix is
        # added afterwards, so the final snippet may exceed the limit by
        # the prefix length.
        snippet = chunk.text[:self.config.max_snippet_length]
        if additional_context:
            snippet = f"{additional_context}\n{snippet}"
        if len(chunk.text) > self.config.max_snippet_length:
            snippet += "..."

        return create_evidence_ref(
            chunk=chunk,
            source_type=source_type,
            snippet=snippet,
            confidence=chunk.confidence,
            image=image,
            config=self.config,
        )

    def from_ocr_region(
        self,
        region: OCRRegion,
        chunk_id: str,
        document_id: str,
        image: Optional[np.ndarray] = None,
    ) -> EvidenceRef:
        """
        Create evidence reference from an OCR region.

        Args:
            region: OCR region
            chunk_id: ID to assign
            document_id: Parent document ID
            image: Optional page image

        Returns:
            EvidenceRef
        """
        # ChunkType is only needed by this adapter; DocumentChunk is
        # already imported at module level.
        from ..schemas.core import ChunkType

        # Wrap the OCR region in a transient chunk so the chunk-based
        # pipeline (snippeting, cropping) can be reused.
        chunk = DocumentChunk(
            chunk_id=chunk_id,
            chunk_type=ChunkType.TEXT,
            text=region.text,
            bbox=region.bbox,
            page=region.page,
            document_id=document_id,
            source_path=None,
            sequence_index=0,
            confidence=region.confidence,
        )

        return self.from_chunk(chunk, image)

    def aggregate_evidence(
        self,
        evidence_list: List[EvidenceRef],
        combine_snippets: bool = True,
    ) -> List[EvidenceRef]:
        """
        Aggregate and deduplicate evidence references.

        Evidence is grouped by chunk_id; within a group the
        highest-confidence reference wins, optionally with the group's
        snippets merged into it.

        Args:
            evidence_list: List of evidence references
            combine_snippets: Whether to combine snippets from same chunk

        Returns:
            Deduplicated evidence list, sorted in reading order
            (page, then top-to-bottom, then left-to-right)
        """
        if not evidence_list:
            return []

        # Group evidence by source chunk.
        by_chunk: Dict[str, List[EvidenceRef]] = {}
        for ev in evidence_list:
            by_chunk.setdefault(ev.chunk_id, []).append(ev)

        result = []
        for evidences in by_chunk.values():
            if len(evidences) == 1:
                result.append(evidences[0])
                continue

            # Keep the highest-confidence reference for the chunk.
            best = max(evidences, key=lambda e: e.confidence)
            if combine_snippets:
                # Deduplicate while preserving first-seen order.
                # Using set() here made the combined snippet order
                # nondeterministic across runs (string hash
                # randomization).
                unique = list(dict.fromkeys(e.snippet for e in evidences))
                combined = " ... ".join(unique[:3])
                best = EvidenceRef(
                    chunk_id=best.chunk_id,
                    page=best.page,
                    bbox=best.bbox,
                    source_type=best.source_type,
                    snippet=combined[:self.config.max_snippet_length],
                    confidence=best.confidence,
                    image_base64=best.image_base64,
                )
            result.append(best)

        # Reading order: page, then top-to-bottom, then left-to-right.
        result.sort(key=lambda e: (e.page, e.bbox.y_min, e.bbox.x_min))

        return result

    def create_grounding_context(
        self,
        evidence_list: List[EvidenceRef],
        include_images: bool = False,
    ) -> str:
        """
        Create a text context from evidence for LLM prompting.

        Args:
            evidence_list: Evidence references
            include_images: Whether to include image markers

        Returns:
            Formatted context string (empty if no evidence)
        """
        if not evidence_list:
            return ""

        lines = ["Evidence from document:"]
        for i, ev in enumerate(evidence_list, 1):
            # ev.page is 0-based here; displayed 1-based for readers.
            lines.append(
                f"\n[{i}] Page {ev.page + 1}, {ev.source_type} "
                f"(confidence: {ev.confidence:.2f}):"
            )
            lines.append(f' "{ev.snippet}"')

            if include_images and ev.image_base64:
                lines.append(" [Image available]")

        return "\n".join(lines)
|