Implements the foundation of the OCR Sprint service:

- FastAPI app with /api/v1/health and /api/v1/documents (sync upload).
- Pydantic v2 schemas for documents, extraction results, and personnel.
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
51 lines
1.3 KiB
Python
51 lines
1.3 KiB
Python
"""Tests for source detection + image ingest."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
|
|
import numpy as np
|
|
from PIL import Image
|
|
|
|
from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image
|
|
from ocr_sprint.schemas.document import SourceKind
|
|
|
|
|
|
def _png_bytes() -> bytes:
    """Return the PNG encoding of a blank white 100x80 RGB image.

    The image is built in memory with Pillow and serialised into a
    ``BytesIO`` buffer, so no file is touched on disk.
    """
    canvas = Image.new("RGB", (100, 80), color="white")
    with io.BytesIO() as sink:
        canvas.save(sink, format="PNG")
        # Read the payload before the buffer is closed on exiting the block.
        return sink.getvalue()
|
|
|
|
|
|
def _jpeg_bytes() -> bytes:
    """Return the JPEG encoding of a blank white 100x80 RGB image.

    The image is built in memory with Pillow and serialised into a
    ``BytesIO`` buffer, so no file is touched on disk.
    """
    canvas = Image.new("RGB", (100, 80), color="white")
    with io.BytesIO() as sink:
        canvas.save(sink, format="JPEG")
        # Read the payload before the buffer is closed on exiting the block.
        return sink.getvalue()
|
|
|
|
|
|
def test_detect_pdf() -> None:
    """Bytes that begin with a ``%PDF-1.7`` header are classified as PDF."""
    payload = b"%PDF-1.7\n..."
    detected = detect_source_kind(payload)
    assert detected == SourceKind.PDF
|
|
|
|
|
|
def test_detect_png() -> None:
    """A PNG payload produced by ``_png_bytes`` is classified as an image."""
    payload = _png_bytes()
    detected = detect_source_kind(payload)
    assert detected == SourceKind.IMAGE
|
|
|
|
|
|
def test_detect_jpeg() -> None:
    """A JPEG payload produced by ``_jpeg_bytes`` is classified as an image."""
    payload = _jpeg_bytes()
    detected = detect_source_kind(payload)
    assert detected == SourceKind.IMAGE
|
|
|
|
|
|
def test_detect_unknown() -> None:
    """The byte string ``b"garbage"`` is classified as an unknown source."""
    payload = b"garbage"
    detected = detect_source_kind(payload)
    assert detected == SourceKind.UNKNOWN
|
|
|
|
|
|
def test_ingest_image_returns_one_page() -> None:
    """Ingesting the 100x80 PNG fixture yields exactly one page.

    The page is expected to carry index 0 and a ``uint8`` NumPy array of
    shape ``(80, 100, 3)``.
    """
    result = ingest_image(_png_bytes())
    assert len(result) == 1

    first_page = result[0]
    assert first_page.page_index == 0

    pixels = first_page.image
    assert isinstance(pixels, np.ndarray)
    assert pixels.dtype == np.uint8
    assert pixels.shape == (80, 100, 3)
|