Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:

- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
37
tests/unit/test_preprocess.py
Normal file
37
tests/unit/test_preprocess.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""Smoke tests for the preprocessing pipeline."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
|
||||
|
||||
|
||||
def test_preprocess_returns_bgr_uint8(blank_bgr_image: np.ndarray) -> None:
    """Check the array layout returned by ``preprocess`` with no explicit config.

    The result for the ``blank_bgr_image`` fixture must be a ``uint8`` array
    with three dimensions whose last axis has length 3.
    """
    processed = preprocess(blank_bgr_image)

    # Checked in this order so a wrong rank is reported before the channel
    # axis is indexed.
    assert processed.dtype == np.uint8
    assert processed.ndim == 3
    assert processed.shape[2] == 3
|
||||
|
||||
|
||||
def test_preprocess_resizes_to_max_side() -> None:
    """An image larger than ``max_side`` comes back with its longer side equal to it."""
    limit = 1000
    # All-white 4000x3000 image: both sides exceed the limit.
    oversized = np.full((4000, 3000, 3), 255, dtype=np.uint8)
    config = PreprocessConfig(max_side=limit, denoise=False, deskew=False)

    resized = preprocess(oversized, config)

    assert max(resized.shape[:2]) == limit
|
||||
|
||||
|
||||
def test_preprocess_does_not_upscale_small_images() -> None:
    """An image already smaller than ``max_side`` keeps its height and width."""
    original_size = (400, 300)
    # All-white image well below the configured 2200-pixel limit.
    tiny = np.full((*original_size, 3), 255, dtype=np.uint8)
    config = PreprocessConfig(max_side=2200, denoise=False, deskew=False)

    result = preprocess(tiny, config)

    assert result.shape[:2] == original_size
|
||||
|
||||
|
||||
def test_adaptive_threshold_produces_binary_image() -> None:
    """With ``adaptive_threshold=True`` the output contains only 0 and 255.

    The input is random noise so that the image holds many distinct
    intensities before thresholding.
    """
    # Seeded generator: the noise image is identical on every run, so a
    # failure can be reproduced instead of depending on global RNG state.
    rng = np.random.default_rng(0)
    img = rng.integers(0, 256, size=(200, 200, 3), dtype=np.uint8)
    cfg = PreprocessConfig(denoise=False, deskew=False, adaptive_threshold=True)

    out = preprocess(img, cfg)

    # adaptive threshold should leave only 0s and 255s
    unique = np.unique(out)
    assert set(unique.tolist()).issubset({0, 255})
|
||||
Reference in New Issue
Block a user