# OCR-SPRIN-SERVICE/src/ocr_sprint/pipeline/ocr.py

"""PaddleOCR wrapper.

PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load),
so we keep a process-global instance behind a lazy accessor.

The wrapper exposes a small, stable surface so the rest of the pipeline does
not depend directly on paddleocr's evolving API.
"""

from __future__ import annotations

from dataclasses import dataclass
from threading import Lock
from typing import TYPE_CHECKING

import numpy as np

from ocr_sprint.config import get_settings
from ocr_sprint.pipeline.ingest import NDArrayU8
from ocr_sprint.utils.logging import get_logger

if TYPE_CHECKING:
    from paddleocr import PaddleOCR

# Logger obtained for this module's name.
_logger = get_logger(__name__)
# Held while get_ocr() constructs the shared engine for the first time.
_lock = Lock()
# Process-global PaddleOCR engine; remains None until get_ocr() builds it.
_instance: PaddleOCR | None = None


@dataclass(frozen=True)
class OCRLine:
    """One recognized line with its bounding polygon and confidence.

    Declared as a frozen dataclass, so fields cannot be reassigned after
    construction.
    """

    # Recognized text content of the line.
    text: str
    # Recognition confidence as reported by the OCR engine.
    confidence: float
    # Bounding polygon as (x, y) float pairs — expected to be 4 corner points.
    box: tuple[tuple[float, float], ...]  # 4 (x, y) corner points


@dataclass(frozen=True)
class OCRPage:
    """Structured OCR result for one page: the recognized lines, in order."""

    lines: list[OCRLine]

    @property
    def text(self) -> str:
        """Return the page text: every line's text joined by newlines.

        Lines appear in the order they are stored, i.e. the order in which
        the OCR engine emitted them.
        """
        fragments = [entry.text for entry in self.lines]
        return "\n".join(fragments)

    @property
    def mean_confidence(self) -> float:
        """Return the arithmetic mean of line confidences, or 0.0 if empty."""
        scores = [entry.confidence for entry in self.lines]
        # Guard the empty case explicitly: the mean of nothing is undefined.
        return float(np.mean(scores)) if scores else 0.0


def _build_paddleocr() -> PaddleOCR:
    """Create a new PaddleOCR engine configured from the application settings.

    The ``paddleocr`` package is imported inside the function rather than at
    module import time. Language and GPU usage come from settings; angle
    classification is always enabled and paddle's own logging is turned off.
    Custom detection / recognition / classification model directories are
    passed through only when the corresponding setting is truthy.
    """
    from paddleocr import PaddleOCR

    settings = get_settings()
    options: dict[str, object] = {
        "lang": settings.ocr_lang,
        "use_angle_cls": True,
        "use_gpu": settings.ocr_use_gpu,
        "show_log": False,
    }
    # Optional model-directory overrides, in the order they are forwarded.
    model_dirs = {
        "det_model_dir": settings.ocr_det_model_dir,
        "rec_model_dir": settings.ocr_rec_model_dir,
        "cls_model_dir": settings.ocr_cls_model_dir,
    }
    options.update({key: path for key, path in model_dirs.items() if path})
    _logger.info(
        "paddleocr.init",
        lang=settings.ocr_lang,
        use_gpu=settings.ocr_use_gpu,
    )
    return PaddleOCR(**options)


def get_ocr() -> PaddleOCR:
    """Return the shared PaddleOCR engine, building it on first use.

    The common case (engine already built) returns without taking the lock.
    Otherwise the lock is acquired and the check repeated, so that only one
    caller runs the expensive construction.
    """
    global _instance
    if _instance is not None:
        return _instance
    with _lock:
        # Re-check under the lock: another thread may have built the engine
        # between the unlocked check above and acquiring the lock.
        if _instance is None:
            _instance = _build_paddleocr()
        return _instance


def run_ocr(image: NDArrayU8) -> OCRPage:
    """Run OCR on a single BGR image and return a structured page result.

    Args:
        image: The page image to recognize (BGR channel order).

    Returns:
        An ``OCRPage`` holding one ``OCRLine`` per well-formed entry in the
        engine's output, in the engine's output order. The page is empty when
        the engine returns nothing for the image.

    Entries that cannot be parsed (missing parts, non-numeric confidence or
    coordinates) are skipped rather than failing the whole page.
    """
    engine = get_ocr()
    raw = engine.ocr(image, cls=True)
    # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image.
    # An empty result or a None first element is treated as "no lines".
    if not raw or raw[0] is None:
        return OCRPage(lines=[])
    lines: list[OCRLine] = []
    for item in raw[0]:
        if not item or len(item) < 2:
            continue
        box_raw, text_conf = item[0], item[1]
        try:
            # Parse text/confidence inside the same guard as the box so a
            # malformed (text, conf) pair skips this entry instead of
            # aborting the page.
            text, conf = text_conf[0], float(text_conf[1])
            box = tuple((float(p[0]), float(p[1])) for p in box_raw)
        except (TypeError, ValueError, IndexError):
            continue
        lines.append(OCRLine(text=text, confidence=conf, box=box))
    return OCRPage(lines=lines)