Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
107 lines
3.1 KiB
Python
107 lines
3.1 KiB
Python
"""PaddleOCR wrapper.
|
|
|
|
PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load),
|
|
so we keep a process-global instance behind a lazy accessor.
|
|
|
|
The wrapper exposes a small, stable surface so the rest of the pipeline does
|
|
not depend directly on paddleocr's evolving API.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from threading import Lock
|
|
from typing import TYPE_CHECKING
|
|
|
|
import numpy as np
|
|
|
|
from ocr_sprint.config import get_settings
|
|
from ocr_sprint.pipeline.ingest import NDArrayU8
|
|
from ocr_sprint.utils.logging import get_logger
|
|
|
|
if TYPE_CHECKING:
|
|
from paddleocr import PaddleOCR
|
|
|
|
# Module-level logger, named after this module.
_logger = get_logger(__name__)
# Held by get_ocr() while the shared engine is being created.
_lock = Lock()
# Process-global PaddleOCR engine; stays None until get_ocr() first builds it.
_instance: PaddleOCR | None = None
|
|
|
|
|
|
@dataclass(frozen=True)
class OCRLine:
    """A single recognized line of text.

    Instances are immutable (frozen dataclass): fields cannot be reassigned
    after construction.
    """

    # Recognized text of the line.
    text: str
    # Confidence score attached to the recognized text.
    confidence: float
    # Bounding polygon as (x, y) corner points; four corners per line.
    box: tuple[tuple[float, float], ...]
|
|
|
|
|
|
@dataclass(frozen=True)
class OCRPage:
    """Recognition result for one page: an ordered list of OCR lines."""

    # Recognized lines, kept in the order they were supplied.
    lines: list[OCRLine]

    @property
    def text(self) -> str:
        """Return the page text: each line's text joined by newlines, in stored order."""
        fragments = [entry.text for entry in self.lines]
        return "\n".join(fragments)

    @property
    def mean_confidence(self) -> float:
        """Return the mean of the line confidences, or 0.0 for a page with no lines."""
        if self.lines:
            scores = [entry.confidence for entry in self.lines]
            return float(np.mean(scores))
        # Empty page: report 0.0 rather than averaging an empty list.
        return 0.0
|
|
|
|
|
|
def _build_paddleocr() -> PaddleOCR:
    """Construct a PaddleOCR engine configured from the application settings.

    ``paddleocr`` is imported inside the function, so the package is only
    loaded when an engine is actually built. Language and GPU usage come from
    the settings; angle classification is always enabled and PaddleOCR's own
    logging is turned off. Custom detection / recognition / classification
    model directories are forwarded only when the matching setting is truthy.

    Returns:
        A newly constructed ``PaddleOCR`` instance.
    """
    from paddleocr import PaddleOCR

    cfg = get_settings()
    options: dict[str, object] = {
        "lang": cfg.ocr_lang,
        "use_angle_cls": True,
        "use_gpu": cfg.ocr_use_gpu,
        "show_log": False,
    }

    # Optional model-directory overrides; unset (falsy) entries are left out
    # so PaddleOCR uses its own defaults for them.
    model_dirs = (
        ("det_model_dir", cfg.ocr_det_model_dir),
        ("rec_model_dir", cfg.ocr_rec_model_dir),
        ("cls_model_dir", cfg.ocr_cls_model_dir),
    )
    options.update({key: path for key, path in model_dirs if path})

    _logger.info("paddleocr.init", lang=cfg.ocr_lang, use_gpu=cfg.ocr_use_gpu)
    return PaddleOCR(**options)
|
|
|
|
|
|
def get_ocr() -> PaddleOCR:
    """Return the shared PaddleOCR engine, building it on first use.

    The module-level instance is checked once without the lock (fast path for
    every call after initialization) and again while holding the lock, so
    only one caller ends up constructing the engine.
    """
    global _instance

    # Fast path: engine already built, no locking needed.
    if _instance is not None:
        return _instance

    with _lock:
        # Re-check under the lock: another thread may have built the engine
        # between the unlocked check above and acquiring the lock.
        if _instance is None:
            _instance = _build_paddleocr()
        return _instance
|
|
|
|
|
|
def run_ocr(image: NDArrayU8) -> OCRPage:
    """Run OCR on a single BGR image and return a structured page result.

    Args:
        image: The image to recognize, passed unchanged to the PaddleOCR engine.

    Returns:
        An ``OCRPage`` holding one ``OCRLine`` per well-formed entry in the
        engine output, in the engine's output order. An empty result, or a
        ``None`` page, yields a page with no lines.

    Entries that cannot be parsed are skipped, not raised on. This covers a
    missing or malformed ``(text, conf)`` pair, a non-numeric confidence, and
    a malformed box, so one bad entry never discards the rest of the page.
    """
    engine = get_ocr()
    raw = engine.ocr(image, cls=True)

    # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image.
    if not raw or raw[0] is None:
        return OCRPage(lines=[])

    lines: list[OCRLine] = []
    for item in raw[0]:
        if not item or len(item) < 2:
            continue
        box_raw, text_conf = item[0], item[1]
        try:
            # Parse text/confidence inside the same guard as the box, so a
            # malformed pair is skipped exactly like a malformed polygon.
            text, conf = text_conf[0], float(text_conf[1])
            box = tuple((float(p[0]), float(p[1])) for p in box_raw)
        except (TypeError, ValueError, IndexError):
            continue
        lines.append(OCRLine(text=text, confidence=conf, box=box))
    return OCRPage(lines=lines)
|