Phase 1 MVP: synchronous OCR + regex header extraction

Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00
commit ca0c0a0428
45 changed files with 2457 additions and 0 deletions
--- a/src/ocr_sprint/pipeline/ocr.py
+++ b/src/ocr_sprint/pipeline/ocr.py
@@ -0,0 +1,106 @@
+"""PaddleOCR wrapper.
+
+PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load),
+so we keep a process-global instance behind a lazy accessor.
+
+The wrapper exposes a small, stable surface so the rest of the pipeline does
+not depend directly on paddleocr's evolving API.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from threading import Lock
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from ocr_sprint.config import get_settings
+from ocr_sprint.pipeline.ingest import NDArrayU8
+from ocr_sprint.utils.logging import get_logger
+
+if TYPE_CHECKING:
+    from paddleocr import PaddleOCR
+
+_logger = get_logger(__name__)
+_lock = Lock()
+_instance: PaddleOCR | None = None
+
+
+@dataclass(frozen=True)
+class OCRLine:
+    """One recognized line with its bounding polygon and confidence.
+
+    Instances are immutable (frozen dataclass).
+
+    Attributes:
+        text: The recognized string for this line.
+        confidence: Score the engine paired with ``text``.
+        box: Corner points of the line's bounding polygon, each an ``(x, y)``
+            pair of floats.
+    """
+
+    text: str
+    confidence: float
+    box: tuple[tuple[float, float], ...]  # 4 (x, y) corner points
+
+
+@dataclass(frozen=True)
+class OCRPage:
+    """Structured OCR result for one page: the recognized lines in stored order."""
+
+    lines: list[OCRLine]
+
+    @property
+    def text(self) -> str:
+        """Return the page text: each line's text joined by newlines, in stored order."""
+        pieces = [entry.text for entry in self.lines]
+        return "\n".join(pieces)
+
+    @property
+    def mean_confidence(self) -> float:
+        """Return the mean of the line confidences, or ``0.0`` when there are no lines."""
+        if self.lines:
+            scores = [entry.confidence for entry in self.lines]
+            return float(np.mean(scores))
+        return 0.0
+
+
+def _build_paddleocr() -> PaddleOCR:
+    """Construct a PaddleOCR engine configured from the application settings.
+
+    ``paddleocr`` is imported inside the function, so the package is loaded
+    only when an engine is actually built. The detection, recognition and
+    classifier model directories are forwarded only when the corresponding
+    setting is truthy.
+    """
+    from paddleocr import PaddleOCR
+
+    settings = get_settings()
+    base_options: dict[str, object] = {
+        "lang": settings.ocr_lang,
+        "use_angle_cls": True,
+        "use_gpu": settings.ocr_use_gpu,
+        "show_log": False,
+    }
+    # Optional custom model locations; falsy values are dropped so the
+    # constructor receives no keyword for them.
+    model_dirs: dict[str, object] = {
+        "det_model_dir": settings.ocr_det_model_dir,
+        "rec_model_dir": settings.ocr_rec_model_dir,
+        "cls_model_dir": settings.ocr_cls_model_dir,
+    }
+    options: dict[str, object] = {
+        **base_options,
+        **{key: path for key, path in model_dirs.items() if path},
+    }
+    _logger.info("paddleocr.init", lang=settings.ocr_lang, use_gpu=settings.ocr_use_gpu)
+    return PaddleOCR(**options)
+
+
+def get_ocr() -> PaddleOCR:
+    """Return the process-wide PaddleOCR engine, building it on first use.
+
+    The module-level instance is checked once without the lock and, if still
+    unset, again while holding it, so the engine is built by a single caller.
+    """
+    global _instance
+    if _instance is not None:
+        return _instance
+    with _lock:
+        # Re-check under the lock: another thread may have built the engine
+        # while this one was waiting to acquire it.
+        if _instance is None:
+            _instance = _build_paddleocr()
+        return _instance
+
+
+def run_ocr(image: NDArrayU8) -> OCRPage:
+    """Run OCR on a single BGR image and return a structured page result.
+
+    Args:
+        image: Image array passed unchanged to the engine's ``ocr`` call.
+
+    Returns:
+        An ``OCRPage`` holding one ``OCRLine`` per well-formed entry of the
+        engine output, in the engine's order. The page is empty when the
+        engine returns nothing for the image.
+
+    Entries whose box or ``(text, confidence)`` pair cannot be parsed are
+    skipped rather than failing the whole page.
+    """
+    raw = get_ocr().ocr(image, cls=True)
+    # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image.
+    if not raw or raw[0] is None:
+        return OCRPage(lines=[])
+    lines: list[OCRLine] = []
+    for item in raw[0]:
+        if not item or len(item) < 2:
+            continue
+        box_raw, text_conf = item[0], item[1]
+        try:
+            # Parse text/confidence under the same guard as the box so that a
+            # missing, short or non-numeric pair skips only this entry.
+            text, conf = text_conf[0], float(text_conf[1])
+            box = tuple((float(p[0]), float(p[1])) for p in box_raw)
+        except (TypeError, ValueError, IndexError):
+            continue
+        lines.append(OCRLine(text=text, confidence=conf, box=box))
+    return OCRPage(lines=lines)