Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
43
tests/conftest.py
Normal file
43
tests/conftest.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Shared pytest fixtures."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def blank_bgr_image() -> np.ndarray:
    """Provide an all-white uint8 array of shape (600, 800, 3).

    Meant as a trivial input image for preprocessing smoke tests.
    """
    height, width, channels = 600, 800, 3
    canvas = np.full((height, width, channels), 255, dtype=np.uint8)
    return canvas
|
||||
|
||||
|
||||
@pytest.fixture
def sample_sprint_text() -> str:
    """Return synthetic OCR-style text of a sprint letter for the regex extractor tests.

    The text is assembled from newline-terminated fragments, one per line of
    the letter; blank lines are bare newline fragments.
    """
    fragments = (
        "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n",
        "DAERAH JAWA BARAT\n",
        "RESOR BANDUNG\n",
        "\n",
        "SURAT PERINTAH\n",
        "Nomor : Sprin/123/IV/2025/Reskrim\n",
        "\n",
        "DASAR :\n",
        "1. Undang-Undang Nomor 2 Tahun 2002 tentang Kepolisian Negara Republik Indonesia.\n",
        "2. Peraturan Kapolri Nomor 6 Tahun 2017 tentang Susunan Organisasi.\n",
        "3. Laporan Polisi Nomor LP/123/IV/2025/Reskrim tanggal 20 April 2025.\n",
        "\n",
        "DIPERINTAHKAN :\n",
        "Kepada : 1. Nama anggota tersebut di bawah ini.\n",
        "\n",
        "Untuk : Melaksanakan penyelidikan tindak pidana.\n",
        "\n",
        "PERIHAL : Pelaksanaan penyelidikan kasus pencurian.\n",
        "\n",
        "Bandung, 21 April 2025\n",
        "KEPALA KEPOLISIAN RESOR BANDUNG\n",
        "\n",
        "Drs. BUDI SANTOSO\n",
        "AKBP NRP 12345678\n",
    )
    return "".join(fragments)
|
||||
0
tests/unit/__init__.py
Normal file
0
tests/unit/__init__.py
Normal file
87
tests/unit/test_api.py
Normal file
87
tests/unit/test_api.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""API tests with the OCR engine mocked.
|
||||
|
||||
These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator
|
||||
so we can exercise the FastAPI surface without the heavy ML init cost.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from ocr_sprint.main import create_app
|
||||
from ocr_sprint.pipeline import orchestrator as orch_module
|
||||
from ocr_sprint.pipeline.orchestrator import PipelineOutput
|
||||
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
||||
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields
|
||||
|
||||
|
||||
@pytest.fixture
def client() -> TestClient:
    """Build a new application via ``create_app`` and wrap it in a ``TestClient``."""
    app = create_app()
    return TestClient(app)
|
||||
|
||||
|
||||
def test_health_endpoint(client: TestClient) -> None:
    """GET /api/v1/health answers 200 with a JSON body whose ``status`` is "ok"."""
    reply = client.get("/api/v1/health")
    assert reply.status_code == 200
    payload = reply.json()
    assert payload["status"] == "ok"
|
||||
|
||||
|
||||
def test_documents_rejects_empty_upload(client: TestClient) -> None:
    """Posting a zero-byte ``empty.pdf`` to /api/v1/documents yields HTTP 400."""
    upload = {"file": ("empty.pdf", b"", "application/pdf")}
    reply = client.post("/api/v1/documents", files=upload)
    assert reply.status_code == 400
|
||||
|
||||
|
||||
def test_documents_rejects_unknown_format(client: TestClient) -> None:
    """Posting bytes in an unknown format to /api/v1/documents yields HTTP 400.

    The payload is sent as ``x.bin`` with an ``application/octet-stream``
    content type. No patching is involved, so only the ``client`` fixture
    is requested.
    """
    response = client.post(
        "/api/v1/documents",
        files={"file": ("x.bin", b"random garbage bytes here", "application/octet-stream")},
    )
    assert response.status_code == 400
|
||||
|
||||
|
||||
def test_documents_returns_pipeline_output(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """POST /api/v1/documents is expected to mirror a stubbed pipeline result.

    ``run_pipeline`` is replaced on both the orchestrator module and the
    documents route module with a stub that returns a canned
    ``PipelineOutput``; the assertions then compare the JSON response with
    the canned values.
    """
    nomor = "Sprin/1/I/2025"
    score = 0.97
    header = HeaderFields(
        nomor_sprint=nomor,
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
    )
    extraction = ExtractionResult(header=header, confidence=score)
    canned = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=score,
        result=extraction,
    )

    def _fake_run(_content: bytes) -> PipelineOutput:
        # Ignores the uploaded bytes and always hands back the canned output.
        return canned

    monkeypatch.setattr(orch_module, "run_pipeline", _fake_run)
    from ocr_sprint.api.routes import documents as docs_module

    # Also patch the symbol *imported into* the routes module.
    monkeypatch.setattr(docs_module, "run_pipeline", _fake_run)

    upload = {"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")}
    reply = client.post("/api/v1/documents", files=upload)
    assert reply.status_code == 200

    payload = reply.json()
    assert payload["status"] == "completed"
    assert payload["confidence"] == 0.97
    assert payload["data"]["header"]["nomor_sprint"] == "Sprin/1/I/2025"
|
||||
46
tests/unit/test_confidence.py
Normal file
46
tests/unit/test_confidence.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""Tests for confidence scoring + routing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
||||
from ocr_sprint.schemas.document import DocumentStatus
|
||||
from ocr_sprint.schemas.extraction import ReviewFlag
|
||||
|
||||
|
||||
def test_no_flags_returns_blend_of_ocr_only() -> None:
    """With an empty flag list, an OCR confidence of 0.9 scores 0.94 (within 1e-6)."""
    # Expected blend: OCR weight 0.6 * 0.9 + validation weight 0.4 * 1.0 = 0.94
    expected = 0.94
    actual = compute_confidence(0.9, [])
    assert abs(actual - expected) < 1e-6
|
||||
|
||||
|
||||
def test_flags_reduce_score() -> None:
    """A single ``MISSING_FIELD`` flag gives a strictly lower score than no flags."""
    ocr_confidence = 0.9
    unflagged = compute_confidence(ocr_confidence, [])
    flagged = compute_confidence(ocr_confidence, [ReviewFlag.MISSING_FIELD])
    assert flagged < unflagged
|
||||
|
||||
|
||||
def test_score_is_clamped() -> None:
    """A zero OCR confidence combined with six flags still scores within [0.0, 1.0]."""
    flags = [
        ReviewFlag.MISSING_FIELD,
        ReviewFlag.LOW_OCR_CONFIDENCE,
        ReviewFlag.PERSONNEL_COUNT_MISMATCH,
        ReviewFlag.INVALID_NRP,
        ReviewFlag.UNKNOWN_PANGKAT,
        ReviewFlag.DATE_PARSE_FAILED,
    ]
    worst_case = compute_confidence(0.0, flags)
    assert 0.0 <= worst_case <= 1.0
|
||||
|
||||
|
||||
def test_route_high_confidence() -> None:
    """A score of 0.97 is routed to ``DocumentStatus.COMPLETED``."""
    outcome = route(0.97)
    assert outcome == DocumentStatus.COMPLETED
|
||||
|
||||
|
||||
def test_route_mid_goes_to_review() -> None:
    """A score of 0.88 is routed to ``DocumentStatus.NEEDS_REVIEW``."""
    outcome = route(0.88)
    assert outcome == DocumentStatus.NEEDS_REVIEW
|
||||
|
||||
|
||||
def test_route_low_goes_to_review() -> None:
    """A score of 0.40 is routed to ``DocumentStatus.NEEDS_REVIEW``."""
    outcome = route(0.40)
    assert outcome == DocumentStatus.NEEDS_REVIEW
|
||||
50
tests/unit/test_ingest.py
Normal file
50
tests/unit/test_ingest.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""Tests for source detection + image ingest."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image
|
||||
from ocr_sprint.schemas.document import SourceKind
|
||||
|
||||
|
||||
def _png_bytes() -> bytes:
    """Encode a white 100x80 RGB image as PNG and return the encoded bytes."""
    picture = Image.new("RGB", (100, 80), color="white")
    with io.BytesIO() as sink:
        picture.save(sink, format="PNG")
        return sink.getvalue()
|
||||
|
||||
|
||||
def _jpeg_bytes() -> bytes:
    """Encode a white 100x80 RGB image as JPEG and return the encoded bytes."""
    picture = Image.new("RGB", (100, 80), color="white")
    with io.BytesIO() as sink:
        picture.save(sink, format="JPEG")
        return sink.getvalue()
|
||||
|
||||
|
||||
def test_detect_pdf() -> None:
    """Bytes beginning with ``%PDF-1.7`` are classified as ``SourceKind.PDF``."""
    detected = detect_source_kind(b"%PDF-1.7\n...")
    assert detected == SourceKind.PDF
|
||||
|
||||
|
||||
def test_detect_png() -> None:
    """PNG-encoded bytes are classified as ``SourceKind.IMAGE``."""
    detected = detect_source_kind(_png_bytes())
    assert detected == SourceKind.IMAGE
|
||||
|
||||
|
||||
def test_detect_jpeg() -> None:
    """JPEG-encoded bytes are classified as ``SourceKind.IMAGE``."""
    detected = detect_source_kind(_jpeg_bytes())
    assert detected == SourceKind.IMAGE
|
||||
|
||||
|
||||
def test_detect_unknown() -> None:
    """The bytes ``b"garbage"`` are classified as ``SourceKind.UNKNOWN``."""
    detected = detect_source_kind(b"garbage")
    assert detected == SourceKind.UNKNOWN
|
||||
|
||||
|
||||
def test_ingest_image_returns_one_page() -> None:
    """Ingesting a 100x80 PNG gives one page, index 0, holding an (80, 100, 3) uint8 array."""
    result = ingest_image(_png_bytes())
    assert len(result) == 1

    first = result[0]
    assert first.page_index == 0

    pixels = first.image
    assert isinstance(pixels, np.ndarray)
    assert pixels.dtype == np.uint8
    assert pixels.shape == (80, 100, 3)
|
||||
37
tests/unit/test_preprocess.py
Normal file
37
tests/unit/test_preprocess.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""Smoke tests for the preprocessing pipeline."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
|
||||
|
||||
|
||||
def test_preprocess_returns_bgr_uint8(blank_bgr_image: np.ndarray) -> None:
    """Default preprocessing returns a 3-dimensional uint8 array with 3 channels."""
    processed = preprocess(blank_bgr_image)
    assert processed.dtype == np.uint8
    assert processed.ndim == 3
    channel_count = processed.shape[2]
    assert channel_count == 3
|
||||
|
||||
|
||||
def test_preprocess_resizes_to_max_side() -> None:
    """A 4000x3000 input with ``max_side=1000`` comes back with its longest side at 1000."""
    oversized = np.full((4000, 3000, 3), 255, dtype=np.uint8)
    settings = PreprocessConfig(max_side=1000, denoise=False, deskew=False)
    processed = preprocess(oversized, settings)
    longest_side = max(processed.shape[:2])
    assert longest_side == 1000
|
||||
|
||||
|
||||
def test_preprocess_does_not_upscale_small_images() -> None:
    """A 400x300 input with ``max_side=2200`` keeps its original height and width."""
    tiny = np.full((400, 300, 3), 255, dtype=np.uint8)
    settings = PreprocessConfig(max_side=2200, denoise=False, deskew=False)
    processed = preprocess(tiny, settings)
    height_width = processed.shape[:2]
    assert height_width == (400, 300)
|
||||
|
||||
|
||||
def test_adaptive_threshold_produces_binary_image() -> None:
    """With ``adaptive_threshold=True`` the output holds no values other than 0 and 255.

    The noise input is drawn from a fixed-seed generator so that a failing
    run can be reproduced with exactly the same pixels.
    """
    rng = np.random.default_rng(0)
    img = rng.integers(0, 256, size=(200, 200, 3), dtype=np.uint8)
    cfg = PreprocessConfig(denoise=False, deskew=False, adaptive_threshold=True)
    out = preprocess(img, cfg)
    # adaptive threshold should leave only 0s and 255s
    unique = np.unique(out)
    assert set(unique.tolist()).issubset({0, 255})
|
||||
112
tests/unit/test_regex_rules.py
Normal file
112
tests/unit/test_regex_rules.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Tests for regex-based header extraction."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
|
||||
import pytest
|
||||
|
||||
from ocr_sprint.pipeline.extract.regex_rules import (
|
||||
extract_header,
|
||||
find_dasar_list,
|
||||
find_nomor_sprint,
|
||||
find_perihal,
|
||||
find_satuan,
|
||||
find_signatory,
|
||||
find_tanggal,
|
||||
)
|
||||
|
||||
|
||||
class TestNomorSprint:
    """Checks for ``find_nomor_sprint`` on texts with and without a sprint number."""

    @pytest.mark.parametrize(
        ("text", "needle"),
        [
            ("Nomor : Sprin/123/IV/2025/Reskrim", "123"),
            ("Nomor: SPRIN / 7 / I / 2024", "7"),
            ("...Sprin-345-X-2024-Sat Intelkam...", "345"),
        ],
    )
    def test_finds_nomor(self, text: str, needle: str) -> None:
        """The match exists, contains ``needle`` and starts with "SPRIN" in any case."""
        found = find_nomor_sprint(text)
        assert found is not None
        assert needle in found
        uppercased = found.upper()
        assert uppercased.startswith("SPRIN")

    def test_returns_none_when_absent(self) -> None:
        """Text without a sprint number yields ``None``."""
        found = find_nomor_sprint("no nomor here, just some text")
        assert found is None
|
||||
|
||||
|
||||
class TestTanggal:
    """Checks for ``find_tanggal`` date parsing."""

    def test_basic_date(self) -> None:
        """"21 April 2025" after a place name parses to 2025-04-21."""
        parsed = find_tanggal("Bandung, 21 April 2025")
        assert parsed == date(2025, 4, 21)

    def test_with_dashes(self) -> None:
        """Day, month and year separated by spaced dashes still parse."""
        parsed = find_tanggal("Tanggal 1 - Desember - 2024")
        assert parsed == date(2024, 12, 1)

    def test_invalid_month(self) -> None:
        """An unrecognised month word yields ``None``."""
        parsed = find_tanggal("21 Foo 2025")
        assert parsed is None

    def test_no_date_present(self) -> None:
        """Text without any date yields ``None``."""
        parsed = find_tanggal("nothing here")
        assert parsed is None
|
||||
|
||||
|
||||
class TestSatuan:
    """Checks for ``find_satuan`` on issuing-unit header lines."""

    def test_polres(self) -> None:
        """A "KEPOLISIAN RESOR BANDUNG" line yields a value containing "RESOR BANDUNG"."""
        unit = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya")
        assert unit is not None
        uppercased = unit.upper()
        assert "RESOR BANDUNG" in uppercased

    def test_polri_pusat(self) -> None:
        """The national police header line yields a non-``None`` value."""
        unit = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
        assert unit is not None
|
||||
|
||||
|
||||
class TestPerihal:
    """Checks for ``find_perihal``."""

    def test_extracts_perihal_line(self) -> None:
        """The text after "PERIHAL :" on its line is returned without the label."""
        document = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore"
        subject = find_perihal(document)
        assert subject == "Pelaksanaan penyelidikan kasus."

    def test_returns_none_when_absent(self) -> None:
        """Text without a perihal field yields ``None``."""
        subject = find_perihal("no perihal field")
        assert subject is None
|
||||
|
||||
|
||||
class TestDasar:
    """Checks for ``find_dasar_list``."""

    def test_numbered_list(self) -> None:
        """Two numbered entries under "DASAR :" are returned without their numbering."""
        fragments = [
            "DASAR :\n",
            "1. UU No 2 Tahun 2002.\n",
            "2. Peraturan Kapolri Nomor 6.\n",
            "\n",
            "DIPERINTAHKAN :\n",
            "Kepada : ...\n",
        ]
        entries = find_dasar_list("".join(fragments))
        assert len(entries) == 2
        first, second = entries[0], entries[1]
        assert first.startswith("UU No 2")
        assert second.startswith("Peraturan Kapolri")

    def test_empty_when_section_missing(self) -> None:
        """Text without a DASAR section yields an empty list."""
        entries = find_dasar_list("no dasar section")
        assert entries == []
|
||||
|
||||
|
||||
class TestSignatory:
    """Checks for ``find_signatory`` NRP extraction."""

    def test_extracts_last_nrp(self) -> None:
        """Of several 8-digit numbers in the text, the one after the final "NRP." is reported."""
        document = "Some 12345678 NRP earlier 87654321\nNRP. 11223344"
        signatory = find_signatory(document)
        assert signatory.nrp == "11223344"

    def test_no_nrp(self) -> None:
        """Text without an NRP number gives a signatory whose ``nrp`` is ``None``."""
        signatory = find_signatory("no NRP here")
        assert signatory.nrp is None
|
||||
|
||||
|
||||
class TestExtractHeader:
    """End-to-end check of ``extract_header`` on the shared synthetic document."""

    def test_full_synthetic_doc(self, sample_sprint_text: str) -> None:
        """Number, date, issuing unit, subject and three dasar items are all extracted."""
        fields = extract_header(sample_sprint_text)

        nomor = fields.nomor_sprint
        assert nomor is not None
        assert "Sprin" in nomor

        assert fields.tanggal == date(2025, 4, 21)

        unit = fields.satuan_penerbit
        assert unit is not None
        assert "KEPOLISIAN" in unit.upper()

        subject = fields.perihal
        assert subject is not None
        assert "penyelidikan" in subject.lower()

        assert len(fields.dasar) == 3
|
||||
108
tests/unit/test_validators.py
Normal file
108
tests/unit/test_validators.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""Tests for the validation layer."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
|
||||
import pytest
|
||||
|
||||
from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat
|
||||
from ocr_sprint.pipeline.extract.validators import (
|
||||
validate_extraction,
|
||||
validate_header,
|
||||
validate_nrp,
|
||||
validate_personnel_entry,
|
||||
)
|
||||
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
|
||||
class TestNRP:
    """Checks for ``validate_nrp``."""

    @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"])
    def test_valid_8_digits(self, nrp: str) -> None:
        """Eight-digit strings are accepted."""
        verdict = validate_nrp(nrp)
        assert verdict is True

    @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None])
    def test_invalid(self, nrp: str | None) -> None:
        """Wrong lengths, letters, the empty string and ``None`` are rejected."""
        verdict = validate_nrp(nrp)
        assert verdict is False
|
||||
|
||||
|
||||
class TestPangkat:
    """Checks for ``normalize_pangkat`` and ``is_valid_pangkat``."""

    @pytest.mark.parametrize(
        ("input_str", "expected"),
        [
            ("AKP", "AKP"),
            ("akp", "AKP"),
            ("AKP.", "AKP"),
            ("AKBP", "AKBP"),
            ("Brigjen Pol", "BRIGJEN POL"),
            ("BRIGJEN", "BRIGJEN POL"),
            ("Kombespol", "KOMBES POL"),
            ("BRIPDA", "BRIPDA"),
        ],
    )
    def test_normalizes_known_ranks(self, input_str: str, expected: str) -> None:
        """Each listed spelling normalizes to the paired expected rank string."""
        normalized = normalize_pangkat(input_str)
        assert normalized == expected

    def test_unknown_returns_none(self) -> None:
        """"Sersan Mayor" normalizes to ``None`` and is reported as not valid."""
        unknown_rank = "Sersan Mayor"
        assert normalize_pangkat(unknown_rank) is None
        assert is_valid_pangkat(unknown_rank) is False
|
||||
|
||||
|
||||
class TestPersonnelValidator:
    """Checks for ``validate_personnel_entry``."""

    def test_clean_entry_no_flags(self) -> None:
        """An entry with rank "AKP" and an 8-digit NRP produces no flags."""
        person = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test")
        flags = validate_personnel_entry(person)
        assert flags == []

    def test_invalid_nrp_flagged(self) -> None:
        """A 3-digit NRP produces ``INVALID_NRP``."""
        person = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test")
        flags = validate_personnel_entry(person)
        assert ReviewFlag.INVALID_NRP in flags

    def test_unknown_pangkat_flagged(self) -> None:
        """The rank "Sersan Mayor" produces ``UNKNOWN_PANGKAT``."""
        person = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        flags = validate_personnel_entry(person)
        assert ReviewFlag.UNKNOWN_PANGKAT in flags
|
||||
|
||||
|
||||
class TestHeaderValidator:
    """Checks for ``validate_header``."""

    def test_complete_header_no_flags(self) -> None:
        """A header with number, date and issuing unit produces no flags."""
        complete = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="POLRES BANDUNG",
        )
        flags = validate_header(complete)
        assert flags == []

    def test_missing_nomor_flagged(self) -> None:
        """A header built with only a date produces ``MISSING_FIELD``."""
        date_only = HeaderFields(tanggal=date(2025, 1, 1))
        flags = validate_header(date_only)
        assert ReviewFlag.MISSING_FIELD in flags

    def test_missing_date_flagged(self) -> None:
        """A header built with only a sprint number produces ``DATE_PARSE_FAILED``."""
        nomor_only = HeaderFields(nomor_sprint="Sprin/1/I/2025")
        flags = validate_header(nomor_only)
        assert ReviewFlag.DATE_PARSE_FAILED in flags
|
||||
|
||||
|
||||
class TestFullValidation:
    """Checks for ``validate_extraction`` over a whole extraction result."""

    def test_personnel_count_mismatch(self) -> None:
        """One extracted entry against an expected count of two is flagged."""
        header = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
        )
        lone_entry = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A")
        extraction = ExtractionResult(header=header, personel=[lone_entry])
        raised = validate_extraction(extraction, expected_personnel_count=2)
        assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in raised

    def test_flags_are_deduped(self) -> None:
        """No flag is repeated even when several entries share the same problems."""
        empty_header = HeaderFields()  # missing both nomor and tanggal
        entries = [
            PersonnelEntry(nrp="123", pangkat="X"),
            PersonnelEntry(nrp="456", pangkat="Y"),
        ]
        extraction = ExtractionResult(header=empty_header, personel=entries)
        raised = validate_extraction(extraction)
        # each flag type should appear at most once
        distinct = set(raised)
        assert len(raised) == len(distinct)
|
||||
Reference in New Issue
Block a user