Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (synchronous upload).
- Pydantic v2 schemas for documents, extraction results, and personnel.
- Pipeline: PDF/image ingest (PyMuPDF); preprocessing (resize, deskew, denoise, optional adaptive threshold); PaddleOCR wrapper; regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar); signatory NRP; master-pangkat validation; confidence scoring and routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and the API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
106
src/ocr_sprint/pipeline/ocr.py
Normal file
106
src/ocr_sprint/pipeline/ocr.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""PaddleOCR wrapper.
|
||||
|
||||
PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load),
|
||||
so we keep a process-global instance behind a lazy accessor.
|
||||
|
||||
The wrapper exposes a small, stable surface so the rest of the pipeline does
|
||||
not depend directly on paddleocr's evolving API.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from threading import Lock
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ocr_sprint.config import get_settings
|
||||
from ocr_sprint.pipeline.ingest import NDArrayU8
|
||||
from ocr_sprint.utils.logging import get_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
# Serializes creation of the shared engine so it is built at most once.
_lock = Lock()
# Process-global PaddleOCR engine; remains None until get_ocr() first builds it.
_instance: PaddleOCR | None = None
# Logger for this module.
_logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class OCRLine:
    """A single line of recognized text, with its confidence and bounding polygon.

    Instances are immutable (frozen dataclass).
    """

    # Recognized text of the line.
    text: str
    # Recognition confidence reported for this line.
    confidence: float
    # Bounding polygon as (x, y) corner points; four corners are expected.
    box: tuple[tuple[float, float], ...]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class OCRPage:
    """Recognition result for one page: the lines found on it."""

    # Recognized lines, kept in the order they were supplied.
    lines: list[OCRLine]

    @property
    def text(self) -> str:
        """Return the page text: each line's text joined by newlines, in stored order."""
        texts = [entry.text for entry in self.lines]
        return "\n".join(texts)

    @property
    def mean_confidence(self) -> float:
        """Return the mean of the line confidences, or ``0.0`` for a page with no lines."""
        scores = [entry.confidence for entry in self.lines]
        return float(np.mean(scores)) if scores else 0.0
|
||||
|
||||
|
||||
def _build_paddleocr() -> PaddleOCR:
    """Construct a PaddleOCR engine configured from the application settings.

    ``paddleocr`` is imported inside the function, so importing this module
    does not import it.

    Returns:
        A new ``PaddleOCR`` instance built with the language and GPU flag from
        the settings, angle classification enabled, and paddle's own logging
        switched off.
    """
    from paddleocr import PaddleOCR

    settings = get_settings()
    options: dict[str, object] = {
        "lang": settings.ocr_lang,
        "use_angle_cls": True,
        "use_gpu": settings.ocr_use_gpu,
        "show_log": False,
    }
    # Model directory overrides are forwarded only when the setting is non-empty.
    optional_dirs = (
        ("det_model_dir", settings.ocr_det_model_dir),
        ("rec_model_dir", settings.ocr_rec_model_dir),
        ("cls_model_dir", settings.ocr_cls_model_dir),
    )
    options.update({key: path for key, path in optional_dirs if path})

    _logger.info("paddleocr.init", lang=settings.ocr_lang, use_gpu=settings.ocr_use_gpu)
    return PaddleOCR(**options)
|
||||
|
||||
|
||||
def get_ocr() -> PaddleOCR:
    """Return the shared PaddleOCR engine, building it on first use.

    The engine is stored in the module-level ``_instance``. Creation happens
    under ``_lock`` and the ``None`` check is repeated once the lock is held,
    so the engine is built only once even if several threads arrive together.
    """
    global _instance
    # Fast path: once the engine exists no locking is needed.
    if _instance is not None:
        return _instance
    with _lock:
        # Another thread may have built the engine while we waited for the lock.
        if _instance is None:
            _instance = _build_paddleocr()
        return _instance
|
||||
|
||||
|
||||
def run_ocr(image: NDArrayU8) -> OCRPage:
    """Run OCR on a single BGR image and return a structured page result.

    Args:
        image: Page image to recognize; passed unchanged to the engine
            returned by ``get_ocr()``.

    Returns:
        An ``OCRPage`` with one ``OCRLine`` per well-formed entry, in the order
        the engine produced them. The page is empty when the engine reports
        nothing for the image. Malformed entries are skipped rather than
        failing the whole page.
    """
    raw = get_ocr().ocr(image, cls=True)
    # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image.
    if not raw or raw[0] is None:
        return OCRPage(lines=[])

    lines: list[OCRLine] = []
    skipped = 0
    for item in raw[0]:
        if not item or len(item) < 2:
            skipped += 1
            continue
        # Parse the entire entry under one guard: a bad (text, conf) pair must
        # be skipped just like a bad box, not abort recognition of the page.
        try:
            box_raw, text_conf = item[0], item[1]
            text, conf = text_conf[0], float(text_conf[1])
            box = tuple((float(p[0]), float(p[1])) for p in box_raw)
        except (TypeError, ValueError, IndexError):
            skipped += 1
            continue
        lines.append(OCRLine(text=text, confidence=conf, box=box))

    if skipped:
        # Make dropped entries visible instead of discarding them silently.
        _logger.warning("paddleocr.malformed_items", skipped=skipped, kept=len(lines))
    return OCRPage(lines=lines)
|
||||
Reference in New Issue
Block a user