Files
OCR-SPRIN-SERVICE/tests/unit/test_ingest.py
Devin AI ca0c0a0428 Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew,
  denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based
  header extraction (nomor sprint, tanggal, satuan, perihal, dasar),
  signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess,
  ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit,
  Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and
  6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00

51 lines
1.3 KiB
Python

"""Tests for source detection + image ingest."""
from __future__ import annotations
import io
import numpy as np
from PIL import Image
from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image
from ocr_sprint.schemas.document import SourceKind
def _png_bytes() -> bytes:
    """Return the PNG-encoded bytes of a blank white RGB image.

    The image is created with size ``(100, 80)`` (Pillow's ``(width, height)``
    order) and encoded entirely in memory, so the fixture never touches disk.
    """
    img = Image.new("RGB", (100, 80), color="white")
    # getvalue() must be read before the buffer is closed by the ``with`` exit.
    with io.BytesIO() as buf:
        img.save(buf, format="PNG")
        return buf.getvalue()
def _jpeg_bytes() -> bytes:
    """Return the JPEG-encoded bytes of a blank white RGB image.

    The image is created with size ``(100, 80)`` (Pillow's ``(width, height)``
    order) and encoded entirely in memory, so the fixture never touches disk.
    """
    img = Image.new("RGB", (100, 80), color="white")
    # getvalue() must be read before the buffer is closed by the ``with`` exit.
    with io.BytesIO() as buf:
        img.save(buf, format="JPEG")
        return buf.getvalue()
def test_detect_pdf() -> None:
    """Bytes that begin with a ``%PDF-1.7`` header line are reported as PDF."""
    payload = b"%PDF-1.7\n..."
    assert detect_source_kind(payload) == SourceKind.PDF
def test_detect_png() -> None:
    """A PNG produced by Pillow is reported as an image source."""
    payload = _png_bytes()
    assert detect_source_kind(payload) == SourceKind.IMAGE
def test_detect_jpeg() -> None:
    """A JPEG produced by Pillow is reported as an image source."""
    payload = _jpeg_bytes()
    assert detect_source_kind(payload) == SourceKind.IMAGE
def test_detect_unknown() -> None:
    """Bytes that are neither a PDF nor an image are reported as unknown."""
    payload = b"garbage"
    assert detect_source_kind(payload) == SourceKind.UNKNOWN
def test_ingest_image_returns_one_page() -> None:
    """Ingesting a single PNG yields exactly one page holding a uint8 array.

    Checks the page count, the zero-based page index, and that the decoded
    image is a ``numpy.ndarray`` of dtype ``uint8`` with the expected shape.
    """
    pages = ingest_image(_png_bytes())
    assert len(pages) == 1

    page = pages[0]
    assert page.page_index == 0
    assert isinstance(page.image, np.ndarray)
    assert page.image.dtype == np.uint8
    # The fixture is 100 wide by 80 high; the array is (height, width, channels).
    assert page.image.shape == (80, 100, 3)