Files
OCR-SPRIN-SERVICE/src/ocr_sprint/pipeline/ocr.py
Devin AI ca0c0a0428 Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew,
  denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based
  header extraction (nomor sprint, tanggal, satuan, perihal, dasar),
  signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess,
  ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit,
  Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and
  6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00

107 lines
3.1 KiB
Python

"""PaddleOCR wrapper.
PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load),
so we keep a process-global instance behind a lazy accessor.
The wrapper exposes a small, stable surface so the rest of the pipeline does
not depend directly on paddleocr's evolving API.
"""
from __future__ import annotations
from dataclasses import dataclass
from threading import Lock
from typing import TYPE_CHECKING
import numpy as np
from ocr_sprint.config import get_settings
from ocr_sprint.pipeline.ingest import NDArrayU8
from ocr_sprint.utils.logging import get_logger
if TYPE_CHECKING:
from paddleocr import PaddleOCR
# Module-level logger, named after this module.
_logger = get_logger(__name__)
# Serializes first-time construction of the shared engine in get_ocr().
_lock = Lock()
# Process-global PaddleOCR engine; stays None until get_ocr() builds it.
_instance: PaddleOCR | None = None
@dataclass(frozen=True)
class OCRLine:
    """A single recognized text line: its text, score, and bounding polygon."""

    text: str
    confidence: float
    # Corner points of the bounding polygon as (x, y) pairs — 4 per line.
    box: tuple[tuple[float, float], ...]
@dataclass(frozen=True)
class OCRPage:
    """Recognition result for one page, holding its lines in stored order."""

    lines: list[OCRLine]

    @property
    def text(self) -> str:
        """Return the page text: each line's text joined by newlines, in ``lines`` order."""
        line_texts = [entry.text for entry in self.lines]
        return "\n".join(line_texts)

    @property
    def mean_confidence(self) -> float:
        """Return the mean of the line confidences, or 0.0 when the page has no lines."""
        scores = [entry.confidence for entry in self.lines]
        return float(np.mean(scores)) if self.lines else 0.0
def _build_paddleocr() -> PaddleOCR:
    """Construct a PaddleOCR engine configured from the application settings.

    paddleocr is imported here, at call time, so that importing this module
    does not import paddleocr itself. Custom detection / recognition /
    classification model directories are forwarded only when the matching
    setting is truthy.
    """
    from paddleocr import PaddleOCR

    settings = get_settings()
    engine_options: dict[str, object] = {
        "lang": settings.ocr_lang,
        "use_angle_cls": True,
        "use_gpu": settings.ocr_use_gpu,
        "show_log": False,
    }

    # Optional overrides, in the order they are added to the options.
    model_dir_overrides = (
        ("det_model_dir", settings.ocr_det_model_dir),
        ("rec_model_dir", settings.ocr_rec_model_dir),
        ("cls_model_dir", settings.ocr_cls_model_dir),
    )
    for option_name, model_dir in model_dir_overrides:
        if model_dir:
            engine_options[option_name] = model_dir

    _logger.info("paddleocr.init", lang=settings.ocr_lang, use_gpu=settings.ocr_use_gpu)
    return PaddleOCR(**engine_options)
def get_ocr() -> PaddleOCR:
    """Return the shared PaddleOCR engine, building it on first use.

    The engine is created at most once per process: callers that find it
    already built return immediately, and construction happens under the
    module lock.
    """
    global _instance
    # Fast path: the engine already exists, so the lock is not taken.
    if _instance is not None:
        return _instance
    with _lock:
        # Re-check under the lock: another thread may have built the engine
        # between the unlocked check above and acquiring the lock.
        if _instance is None:
            _instance = _build_paddleocr()
        return _instance
def run_ocr(image: NDArrayU8) -> OCRPage:
    """Run OCR on a single BGR image and return a structured page result.

    Args:
        image: Image array passed unchanged to the engine's ``ocr`` call.

    Returns:
        An ``OCRPage`` with one ``OCRLine`` per well-formed recognition entry,
        in the order the engine returned them. The page is empty when the
        engine returns nothing (or ``None``) for the image.

    Malformed entries — a missing or non-numeric confidence, or a box that
    cannot be read as (x, y) pairs — are skipped individually so that one bad
    entry does not discard the rest of the page.
    """
    raw = get_ocr().ocr(image, cls=True)
    # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image.
    if not raw or raw[0] is None:
        return OCRPage(lines=[])

    lines: list[OCRLine] = []
    for item in raw[0]:
        if not item or len(item) < 2:
            continue
        box_raw, text_conf = item[0], item[1]
        # Parse text, confidence and box under the same guard: previously only
        # the box was protected, so a malformed (text, conf) pair raised and
        # aborted the whole page instead of being skipped like a bad box.
        try:
            text, conf = text_conf[0], float(text_conf[1])
            box = tuple((float(p[0]), float(p[1])) for p in box_raw)
        except (TypeError, ValueError, IndexError):
            continue
        lines.append(OCRLine(text=text, confidence=conf, box=box))
    return OCRPage(lines=lines)