Spaces:

MHamdan
/

SPARKNET

Sleeping

App Files Files Community

SPARKNET / src /document /ocr /paddle_ocr.py

MHamdan

Initial commit: SPARKNET framework

d520909 2 months ago

raw

history blame contribute delete

6.99 kB

	"""
	PaddleOCR Engine

	High-accuracy OCR using PaddleOCR.
	Supports detection, recognition, and angle classification.
	"""

	import time
	from typing import List, Optional, Tuple
	import numpy as np
	from loguru import logger

	from .base import OCREngine, OCRConfig, OCRResult
	from ..schemas.core import BoundingBox, OCRRegion

	# Try to import PaddleOCR
	try:
	from paddleocr import PaddleOCR
	HAS_PADDLEOCR = True
	except ImportError:
	HAS_PADDLEOCR = False
	logger.warning(
	"PaddleOCR not installed. Install with: "
	"pip install paddleocr paddlepaddle-gpu (or paddlepaddle for CPU)"
	)


	class PaddleOCREngine(OCREngine):
	    """
	    OCR engine using PaddleOCR.

	    Features:
	    - High accuracy text detection and recognition
	    - Multi-language support
	    - GPU acceleration
	    - Angle classification for rotated text

	    The PaddleOCR model is created lazily, either by an explicit call to
	    ``initialize()`` or by the first call to ``recognize()``.
	    """

	    # Language code mapping (PaddleOCR uses different codes)
	    LANGUAGE_MAP = {
	        "en": "en",
	        "ch": "ch",
	        "chinese_cht": "chinese_cht",
	        "fr": "french",
	        "german": "german",
	        "es": "es",
	        "it": "it",
	        "pt": "pt",
	        "ru": "ru",
	        "japan": "japan",
	        "korean": "korean",
	        "ar": "ar",
	        "hi": "hi",
	        "latin": "latin",
	    }

	    def __init__(self, config: Optional[OCRConfig] = None):
	        """
	        Create the engine without loading any model.

	        Args:
	            config: OCR configuration forwarded to the base ``OCREngine``.
	        """
	        super().__init__(config)
	        # Populated by initialize(); stays None until then.
	        self._ocr: Optional[PaddleOCR] = None

	    def initialize(self):
	        """
	        Build the PaddleOCR model from ``self.config``.

	        Only the first entry of ``config.languages`` is used. A language
	        that is missing from ``LANGUAGE_MAP`` falls back to ``"en"`` and a
	        warning is logged. Calling this method again after a successful
	        initialization is a no-op.

	        Raises:
	            RuntimeError: If the ``paddleocr`` package could not be imported.
	            Exception: Any error raised by the PaddleOCR constructor is
	                logged and re-raised unchanged.
	        """
	        if not HAS_PADDLEOCR:
	            raise RuntimeError(
	                "PaddleOCR not installed. Install with: "
	                "pip install paddleocr paddlepaddle-gpu"
	            )

	        if self._initialized:
	            return

	        logger.info("Initializing PaddleOCR engine...")

	        # Map language codes
	        lang = self.config.languages[0] if self.config.languages else "en"
	        paddle_lang = self.LANGUAGE_MAP.get(lang)
	        if paddle_lang is None:
	            # Make the fallback visible instead of silently switching language.
	            logger.warning(
	                f"Language '{lang}' is not supported by PaddleOCREngine; "
	                "falling back to 'en'"
	            )
	            paddle_lang = "en"

	        try:
	            self._ocr = PaddleOCR(
	                use_angle_cls=self.config.use_angle_cls,
	                lang=paddle_lang,
	                use_gpu=self.config.use_gpu,
	                gpu_mem=500,  # GPU memory limit in MB
	                det_db_thresh=self.config.det_db_thresh,
	                det_db_box_thresh=self.config.det_db_box_thresh,
	                rec_batch_num=self.config.rec_batch_num,
	                drop_score=self.config.drop_score,
	                show_log=False,  # Suppress verbose logging
	            )
	            self._initialized = True
	            logger.info(f"PaddleOCR initialized (lang={paddle_lang}, gpu={self.config.use_gpu})")

	        except Exception as e:
	            logger.error(f"Failed to initialize PaddleOCR: {e}")
	            raise

	    def recognize(
	        self,
	        image: np.ndarray,
	        page_number: int = 0,
	    ) -> OCRResult:
	        """
	        Perform OCR on an image using PaddleOCR.

	        The engine is initialized on demand. Detections whose confidence is
	        below ``config.min_confidence`` are dropped. Errors raised while
	        running or parsing the OCR are not propagated: they are logged and
	        reported through ``success=False`` and ``error`` on the result.

	        Args:
	            image: Image as numpy array (RGB, HWC format)
	            page_number: Page number for multi-page documents

	        Returns:
	            OCRResult with recognized text and regions
	        """
	        if not self._initialized:
	            self.initialize()

	        # perf_counter is monotonic, so the measured duration cannot be
	        # distorted by wall-clock adjustments.
	        start_time = time.perf_counter()

	        try:
	            # Run OCR
	            results = self._ocr.ocr(image, cls=self.config.use_angle_cls)

	            # Results format: [[[box], (text, confidence)], ...]
	            # An empty/None first entry means nothing was detected.
	            lines = results[0] if results and results[0] else []

	            regions = []
	            all_texts = []
	            confidences = []

	            for idx, line in enumerate(lines):
	                if line is None:
	                    continue

	                box_points = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
	                text, confidence = line[1]

	                # Skip low confidence results
	                if confidence < self.config.min_confidence:
	                    continue

	                # Store a plain float so the region and the average agree.
	                confidence = float(confidence)

	                regions.append(
	                    OCRRegion(
	                        text=text,
	                        confidence=confidence,
	                        bbox=self._polygon_to_bbox(box_points, image.shape[:2]),
	                        polygon=[(float(p[0]), float(p[1])) for p in box_points],
	                        page=page_number,
	                        line_id=idx,  # index in the raw output, skipped lines included
	                        engine="paddleocr",
	                    )
	                )
	                all_texts.append(text)
	                confidences.append(confidence)

	            processing_time = (time.perf_counter() - start_time) * 1000

	            return OCRResult(
	                regions=regions,
	                full_text="\n".join(all_texts),
	                confidence_avg=sum(confidences) / len(confidences) if confidences else 0.0,
	                processing_time_ms=processing_time,
	                engine="paddleocr",
	                success=True,
	            )

	        except Exception as e:
	            # Deliberate best-effort: report the failure in the result
	            # instead of raising to the caller.
	            logger.error(f"PaddleOCR recognition failed: {e}")
	            return OCRResult(
	                regions=[],
	                full_text="",
	                confidence_avg=0.0,
	                processing_time_ms=(time.perf_counter() - start_time) * 1000,
	                engine="paddleocr",
	                success=False,
	                error=str(e),
	            )

	    def _polygon_to_bbox(
	        self,
	        points: List[List[float]],
	        image_shape: Tuple[int, int],
	    ) -> BoundingBox:
	        """
	        Convert polygon points to an axis-aligned bounding box.

	        Every edge is clamped into the image, so a polygon that lies partly
	        or entirely outside the image can never yield a box whose maximum
	        is smaller than its minimum.

	        Args:
	            points: Polygon vertices as ``[x, y]`` pairs.
	            image_shape: ``(height, width)`` of the source image.

	        Returns:
	            BoundingBox in absolute (non-normalized) coordinates.
	        """
	        height, width = image_shape

	        x_coords = [p[0] for p in points]
	        y_coords = [p[1] for p in points]

	        def clamp(value, upper):
	            # Restrict value to the closed range [0, upper].
	            return min(upper, max(0, value))

	        return BoundingBox(
	            x_min=clamp(min(x_coords), width),
	            y_min=clamp(min(y_coords), height),
	            x_max=clamp(max(x_coords), width),
	            y_max=clamp(max(y_coords), height),
	            normalized=False,
	            page_width=width,
	            page_height=height,
	        )

	    def get_supported_languages(self) -> List[str]:
	        """Return list of supported language codes (keys of ``LANGUAGE_MAP``)."""
	        return list(self.LANGUAGE_MAP)

	    def recognize_with_structure(
	        self,
	        image: np.ndarray,
	        page_number: int = 0,
	    ) -> Tuple[OCRResult, Optional[dict]]:
	        """
	        Perform OCR with structure analysis (tables, layout).

	        Structure analysis is not implemented yet, so the second element
	        of the returned tuple is always ``None``.

	        Args:
	            image: Image as numpy array
	            page_number: Page number

	        Returns:
	            Tuple of (OCRResult, structure_info)
	        """
	        # First do regular OCR
	        ocr_result = self.recognize(image, page_number)

	        # PaddleOCR can also do table structure recognition
	        # This would require ppstructure which we can add later
	        structure_info = None

	        return ocr_result, structure_info