"""PaddleOCR wrapper. PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load), so we keep a process-global instance behind a lazy accessor. The wrapper exposes a small, stable surface so the rest of the pipeline does not depend directly on paddleocr's evolving API. """ from __future__ import annotations from dataclasses import dataclass from threading import Lock from typing import TYPE_CHECKING import numpy as np from ocr_sprint.config import get_settings from ocr_sprint.pipeline.ingest import NDArrayU8 from ocr_sprint.utils.logging import get_logger if TYPE_CHECKING: from paddleocr import PaddleOCR _logger = get_logger(__name__) _lock = Lock() _instance: PaddleOCR | None = None @dataclass(frozen=True) class OCRLine: """One recognized line with its bounding polygon and confidence.""" text: str confidence: float box: tuple[tuple[float, float], ...] # 4 (x, y) corner points @dataclass(frozen=True) class OCRPage: """OCR output for a single page.""" lines: list[OCRLine] @property def text(self) -> str: """Reconstruct page text by concatenating lines (order = paddle's output order).""" return "\n".join(line.text for line in self.lines) @property def mean_confidence(self) -> float: if not self.lines: return 0.0 return float(np.mean([line.confidence for line in self.lines])) def _build_paddleocr() -> PaddleOCR: from paddleocr import PaddleOCR s = get_settings() kwargs: dict[str, object] = { "lang": s.ocr_lang, "use_angle_cls": True, "use_gpu": s.ocr_use_gpu, "show_log": False, } if s.ocr_det_model_dir: kwargs["det_model_dir"] = s.ocr_det_model_dir if s.ocr_rec_model_dir: kwargs["rec_model_dir"] = s.ocr_rec_model_dir if s.ocr_cls_model_dir: kwargs["cls_model_dir"] = s.ocr_cls_model_dir _logger.info("paddleocr.init", lang=s.ocr_lang, use_gpu=s.ocr_use_gpu) return PaddleOCR(**kwargs) def get_ocr() -> PaddleOCR: """Lazy, thread-safe singleton accessor for the PaddleOCR engine.""" global _instance if _instance is None: with _lock: if _instance is None: _instance = _build_paddleocr() return _instance def run_ocr(image: NDArrayU8) -> OCRPage: """Run OCR on a single BGR image and return a structured page result.""" engine = get_ocr() raw = engine.ocr(image, cls=True) # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image. if not raw or raw[0] is None: return OCRPage(lines=[]) page_raw = raw[0] lines: list[OCRLine] = [] for item in page_raw: if not item or len(item) < 2: continue box_raw, text_conf = item[0], item[1] text, conf = text_conf[0], float(text_conf[1]) try: box = tuple((float(p[0]), float(p[1])) for p in box_raw) except (TypeError, ValueError, IndexError): continue lines.append(OCRLine(text=text, confidence=conf, box=box)) return OCRPage(lines=lines)