Phase 1 MVP: synchronous OCR + regex header extraction

Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00
commit ca0c0a0428
45 changed files with 2457 additions and 0 deletions
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,43 @@
+"""Shared pytest fixtures."""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+
+@pytest.fixture
+def blank_bgr_image() -> np.ndarray:
+    """All-white uint8 image of shape (600, 800, 3) for preprocessing smoke tests."""
+    canvas = np.empty((600, 800, 3), dtype=np.uint8)
+    canvas.fill(255)
+    return canvas
+
+
+@pytest.fixture
+def sample_sprint_text() -> str:
+    """Synthetic OCR output of a complete sprint letter, used by the regex extractor tests."""
+    # Each entry is one OCR line including its trailing newline; empty
+    # lines of the document are the bare "\n" entries.
+    ocr_lines = (
+        "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n",
+        "DAERAH JAWA BARAT\n",
+        "RESOR BANDUNG\n",
+        "\n",
+        "SURAT PERINTAH\n",
+        "Nomor : Sprin/123/IV/2025/Reskrim\n",
+        "\n",
+        "DASAR :\n",
+        "1. Undang-Undang Nomor 2 Tahun 2002 tentang Kepolisian Negara Republik Indonesia.\n",
+        "2. Peraturan Kapolri Nomor 6 Tahun 2017 tentang Susunan Organisasi.\n",
+        "3. Laporan Polisi Nomor LP/123/IV/2025/Reskrim tanggal 20 April 2025.\n",
+        "\n",
+        "DIPERINTAHKAN :\n",
+        "Kepada : 1. Nama anggota tersebut di bawah ini.\n",
+        "\n",
+        "Untuk : Melaksanakan penyelidikan tindak pidana.\n",
+        "\n",
+        "PERIHAL : Pelaksanaan penyelidikan kasus pencurian.\n",
+        "\n",
+        "Bandung, 21 April 2025\n",
+        "KEPALA KEPOLISIAN RESOR BANDUNG\n",
+        "\n",
+        "Drs. BUDI SANTOSO\n",
+        "AKBP NRP 12345678\n",
+    )
+    return "".join(ocr_lines)
--- a/tests/unit/__init__.py
+++ b/tests/unit/__init__.py
--- a/tests/unit/test_api.py
+++ b/tests/unit/test_api.py
@@ -0,0 +1,87 @@
+"""API tests with the OCR engine mocked.
+
+These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator
+so we can exercise the FastAPI surface without the heavy ML init cost.
+"""
+
+from __future__ import annotations
+
+from datetime import date
+
+import pytest
+from fastapi.testclient import TestClient
+
+from ocr_sprint.main import create_app
+from ocr_sprint.pipeline import orchestrator as orch_module
+from ocr_sprint.pipeline.orchestrator import PipelineOutput
+from ocr_sprint.schemas.document import DocumentStatus, SourceKind
+from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields
+
+
+@pytest.fixture
+def client() -> TestClient:
+    """Provide a TestClient wrapped around a freshly created application."""
+    app = create_app()
+    return TestClient(app)
+
+
+def test_health_endpoint(client: TestClient) -> None:
+    """GET /api/v1/health answers 200 with a JSON status of "ok"."""
+    resp = client.get("/api/v1/health")
+    assert resp.status_code == 200
+    payload = resp.json()
+    assert payload["status"] == "ok"
+
+
+def test_documents_rejects_empty_upload(client: TestClient) -> None:
+    """Uploading a zero-byte file is answered with HTTP 400."""
+    empty_upload = {"file": ("empty.pdf", b"", "application/pdf")}
+    resp = client.post("/api/v1/documents", files=empty_upload)
+    assert resp.status_code == 400
+
+
+def test_documents_rejects_unknown_format(client: TestClient) -> None:
+    """Uploading bytes that are neither a PDF nor an image is answered with HTTP 400.
+
+    No monkeypatching is involved: the request goes through the real upload
+    route, so the test only needs the ``client`` fixture.
+    """
+    response = client.post(
+        "/api/v1/documents",
+        files={"file": ("x.bin", b"random garbage bytes here", "application/octet-stream")},
+    )
+    assert response.status_code == 400
+
+
+def test_documents_returns_pipeline_output(
+    client: TestClient,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """A successful upload echoes the stubbed pipeline's status, confidence and header."""
+    from ocr_sprint.api.routes import documents as docs_module
+
+    header = HeaderFields(
+        nomor_sprint="Sprin/1/I/2025",
+        tanggal=date(2025, 1, 1),
+        satuan_penerbit="POLRES TEST",
+    )
+    stub_output = PipelineOutput(
+        source_kind=SourceKind.PDF,
+        status=DocumentStatus.COMPLETED,
+        confidence=0.97,
+        result=ExtractionResult(header=header, confidence=0.97),
+    )
+
+    def _stub_pipeline(_content: bytes) -> PipelineOutput:
+        return stub_output
+
+    # Swap out `run_pipeline` on the orchestrator module and on the routes
+    # module's own attribute of the same name, so neither reference reaches
+    # the real pipeline.
+    for module in (orch_module, docs_module):
+        monkeypatch.setattr(module, "run_pipeline", _stub_pipeline)
+
+    upload = {"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")}
+    response = client.post("/api/v1/documents", files=upload)
+
+    assert response.status_code == 200
+    body = response.json()
+    assert body["status"] == "completed"
+    assert body["confidence"] == 0.97
+    assert body["data"]["header"]["nomor_sprint"] == "Sprin/1/I/2025"
--- a/tests/unit/test_confidence.py
+++ b/tests/unit/test_confidence.py
@@ -0,0 +1,46 @@
+"""Tests for confidence scoring + routing."""
+
+from __future__ import annotations
+
+from ocr_sprint.pipeline.confidence import compute_confidence, route
+from ocr_sprint.schemas.document import DocumentStatus
+from ocr_sprint.schemas.extraction import ReviewFlag
+
+
+def test_no_flags_returns_blend_of_ocr_only() -> None:
+    """With no review flags, an OCR confidence of 0.9 is expected to score 0.94."""
+    # Expected blend: OCR weight 0.6 * 0.9 + validation weight 0.4 * 1.0 = 0.94
+    expected = 0.94
+    actual = compute_confidence(0.9, [])
+    assert abs(actual - expected) < 1e-6
+
+
+def test_flags_reduce_score() -> None:
+    """Adding a MISSING_FIELD flag yields a strictly lower score than no flags."""
+    unflagged = compute_confidence(0.9, [])
+    flagged = compute_confidence(0.9, [ReviewFlag.MISSING_FIELD])
+    assert flagged < unflagged
+
+
+def test_score_is_clamped() -> None:
+    """Zero OCR confidence plus six distinct flags still scores within [0, 1]."""
+    flags = [
+        ReviewFlag.MISSING_FIELD,
+        ReviewFlag.LOW_OCR_CONFIDENCE,
+        ReviewFlag.PERSONNEL_COUNT_MISMATCH,
+        ReviewFlag.INVALID_NRP,
+        ReviewFlag.UNKNOWN_PANGKAT,
+        ReviewFlag.DATE_PARSE_FAILED,
+    ]
+    worst_case = compute_confidence(0.0, flags)
+    assert 0.0 <= worst_case <= 1.0
+
+
+def test_route_high_confidence() -> None:
+    """A score of 0.97 is routed to COMPLETED."""
+    status = route(0.97)
+    assert status == DocumentStatus.COMPLETED
+
+
+def test_route_mid_goes_to_review() -> None:
+    """A score of 0.88 is routed to NEEDS_REVIEW."""
+    status = route(0.88)
+    assert status == DocumentStatus.NEEDS_REVIEW
+
+
+def test_route_low_goes_to_review() -> None:
+    """A score of 0.40 is routed to NEEDS_REVIEW."""
+    status = route(0.40)
+    assert status == DocumentStatus.NEEDS_REVIEW
--- a/tests/unit/test_ingest.py
+++ b/tests/unit/test_ingest.py
@@ -0,0 +1,50 @@
+"""Tests for source detection + image ingest."""
+
+from __future__ import annotations
+
+import io
+
+import numpy as np
+from PIL import Image
+
+from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image
+from ocr_sprint.schemas.document import SourceKind
+
+
+def _png_bytes() -> bytes:
+    """Encode a white 100x80 RGB image as PNG and return the raw bytes."""
+    with io.BytesIO() as stream:
+        Image.new("RGB", (100, 80), color="white").save(stream, format="PNG")
+        return stream.getvalue()
+
+
+def _jpeg_bytes() -> bytes:
+    """Encode a white 100x80 RGB image as JPEG and return the raw bytes."""
+    with io.BytesIO() as stream:
+        Image.new("RGB", (100, 80), color="white").save(stream, format="JPEG")
+        return stream.getvalue()
+
+
+def test_detect_pdf() -> None:
+    """A payload beginning with ``%PDF-1.7`` is classified as PDF."""
+    kind = detect_source_kind(b"%PDF-1.7\n...")
+    assert kind == SourceKind.PDF
+
+
+def test_detect_png() -> None:
+    """PNG-encoded bytes are classified as IMAGE."""
+    kind = detect_source_kind(_png_bytes())
+    assert kind == SourceKind.IMAGE
+
+
+def test_detect_jpeg() -> None:
+    """JPEG-encoded bytes are classified as IMAGE."""
+    kind = detect_source_kind(_jpeg_bytes())
+    assert kind == SourceKind.IMAGE
+
+
+def test_detect_unknown() -> None:
+    """The payload ``b"garbage"`` is classified as UNKNOWN."""
+    kind = detect_source_kind(b"garbage")
+    assert kind == SourceKind.UNKNOWN
+
+
+def test_ingest_image_returns_one_page() -> None:
+    """Ingesting a 100x80 PNG yields one page, index 0, with an (80, 100, 3) uint8 array."""
+    pages = ingest_image(_png_bytes())
+    assert len(pages) == 1
+    page = pages[0]
+    assert page.page_index == 0
+    assert isinstance(page.image, np.ndarray)
+    assert page.image.dtype == np.uint8
+    # Array shape is (height, width, channels), hence 80 before 100.
+    assert page.image.shape == (80, 100, 3)
--- a/tests/unit/test_preprocess.py
+++ b/tests/unit/test_preprocess.py
@@ -0,0 +1,37 @@
+"""Smoke tests for the preprocessing pipeline."""
+
+from __future__ import annotations
+
+import numpy as np
+
+from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
+
+
+def test_preprocess_returns_bgr_uint8(blank_bgr_image: np.ndarray) -> None:
+    """Preprocessing with the default config returns a 3-channel uint8 array."""
+    result = preprocess(blank_bgr_image)
+    assert result.dtype == np.uint8
+    assert result.ndim == 3
+    channels = result.shape[2]
+    assert channels == 3
+
+
+def test_preprocess_resizes_to_max_side() -> None:
+    """A 4000x3000 image is shrunk so that its longest side equals max_side=1000."""
+    oversized = np.full((4000, 3000, 3), 255, dtype=np.uint8)
+    config = PreprocessConfig(max_side=1000, denoise=False, deskew=False)
+    result = preprocess(oversized, config)
+    longest_side = max(result.shape[:2])
+    assert longest_side == 1000
+
+
+def test_preprocess_does_not_upscale_small_images() -> None:
+    """A 400x300 image keeps its size when max_side (2200) is larger than both sides."""
+    undersized = np.full((400, 300, 3), 255, dtype=np.uint8)
+    config = PreprocessConfig(max_side=2200, denoise=False, deskew=False)
+    result = preprocess(undersized, config)
+    height_width = result.shape[:2]
+    assert height_width == (400, 300)
+
+
+def test_adaptive_threshold_produces_binary_image() -> None:
+    """Adaptive thresholding of a noisy image must leave only 0 and 255 pixel values.
+
+    The noise comes from a fixed-seed generator, so every run sees the same
+    input and any failure can be reproduced exactly.
+    """
+    rng = np.random.default_rng(seed=0)
+    img = rng.integers(0, 256, size=(200, 200, 3), dtype=np.uint8)
+    cfg = PreprocessConfig(denoise=False, deskew=False, adaptive_threshold=True)
+    out = preprocess(img, cfg)
+    # adaptive threshold should leave only 0s and 255s
+    unique = np.unique(out)
+    assert set(unique.tolist()).issubset({0, 255})
--- a/tests/unit/test_regex_rules.py
+++ b/tests/unit/test_regex_rules.py
@@ -0,0 +1,112 @@
+"""Tests for regex-based header extraction."""
+
+from __future__ import annotations
+
+from datetime import date
+
+import pytest
+
+from ocr_sprint.pipeline.extract.regex_rules import (
+    extract_header,
+    find_dasar_list,
+    find_nomor_sprint,
+    find_perihal,
+    find_satuan,
+    find_signatory,
+    find_tanggal,
+)
+
+
+class TestNomorSprint:
+    """Cases for find_nomor_sprint with slash, spaced-slash and dash separators."""
+
+    @pytest.mark.parametrize(
+        ("text", "needle"),
+        [
+            ("Nomor : Sprin/123/IV/2025/Reskrim", "123"),
+            ("Nomor: SPRIN / 7 / I / 2024", "7"),
+            ("...Sprin-345-X-2024-Sat Intelkam...", "345"),
+        ],
+    )
+    def test_finds_nomor(self, text: str, needle: str) -> None:
+        """The match exists, contains the running number and starts with "SPRIN" (any case)."""
+        found = find_nomor_sprint(text)
+        assert found is not None
+        assert needle in found
+        assert found.upper().startswith("SPRIN")
+
+    def test_returns_none_when_absent(self) -> None:
+        """Text without a Sprin number yields None."""
+        found = find_nomor_sprint("no nomor here, just some text")
+        assert found is None
+
+
+class TestTanggal:
+    """Cases for find_tanggal."""
+
+    def test_basic_date(self) -> None:
+        """A "day month year" date following a place name is parsed."""
+        parsed = find_tanggal("Bandung, 21 April 2025")
+        assert parsed == date(2025, 4, 21)
+
+    def test_with_dashes(self) -> None:
+        """Space-padded dashes between the date parts are accepted."""
+        parsed = find_tanggal("Tanggal 1 - Desember - 2024")
+        assert parsed == date(2024, 12, 1)
+
+    def test_invalid_month(self) -> None:
+        """An unrecognised month word yields None."""
+        parsed = find_tanggal("21 Foo 2025")
+        assert parsed is None
+
+    def test_no_date_present(self) -> None:
+        """Text without any date yields None."""
+        parsed = find_tanggal("nothing here")
+        assert parsed is None
+
+
+class TestSatuan:
+    """Cases for find_satuan."""
+
+    def test_polres(self) -> None:
+        """A "KEPOLISIAN RESOR BANDUNG" line is found; the check ignores letter case."""
+        satuan = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya")
+        assert satuan is not None
+        assert "RESOR BANDUNG" in satuan.upper()
+
+    def test_polri_pusat(self) -> None:
+        """The national heading on its own still produces a match."""
+        satuan = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
+        assert satuan is not None
+
+
+class TestPerihal:
+    """Cases for find_perihal."""
+
+    def test_extracts_perihal_line(self) -> None:
+        """Only the text after "PERIHAL :" on that line is returned."""
+        ocr_text = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore"
+        perihal = find_perihal(ocr_text)
+        assert perihal == "Pelaksanaan penyelidikan kasus."
+
+    def test_returns_none_when_absent(self) -> None:
+        """Text without a PERIHAL label yields None."""
+        perihal = find_perihal("no perihal field")
+        assert perihal is None
+
+
+class TestDasar:
+    """Cases for find_dasar_list."""
+
+    def test_numbered_list(self) -> None:
+        """Two numbered DASAR items are returned in order, without their "N." prefixes."""
+        ocr_lines = [
+            "DASAR :\n",
+            "1. UU No 2 Tahun 2002.\n",
+            "2. Peraturan Kapolri Nomor 6.\n",
+            "\n",
+            "DIPERINTAHKAN :\n",
+            "Kepada : ...\n",
+        ]
+        entries = find_dasar_list("".join(ocr_lines))
+        assert len(entries) == 2
+        first, second = entries[0], entries[1]
+        assert first.startswith("UU No 2")
+        assert second.startswith("Peraturan Kapolri")
+
+    def test_empty_when_section_missing(self) -> None:
+        """Text without a DASAR section yields an empty list."""
+        entries = find_dasar_list("no dasar section")
+        assert entries == []
+
+
+class TestSignatory:
+    """Cases for find_signatory."""
+
+    def test_extracts_last_nrp(self) -> None:
+        """Of several 8-digit numbers in the text, the one after the final "NRP." is taken."""
+        ocr_text = "Some 12345678 NRP earlier 87654321\nNRP. 11223344"
+        signatory = find_signatory(ocr_text)
+        assert signatory.nrp == "11223344"
+
+    def test_no_nrp(self) -> None:
+        """Text without any NRP number leaves the signatory's nrp as None."""
+        signatory = find_signatory("no NRP here")
+        assert signatory.nrp is None
+
+
+class TestExtractHeader:
+    """End-to-end case for extract_header over the shared synthetic document."""
+
+    def test_full_synthetic_doc(self, sample_sprint_text: str) -> None:
+        """Number, date, unit, subject and the three DASAR items are all extracted."""
+        header = extract_header(sample_sprint_text)
+
+        nomor = header.nomor_sprint
+        assert nomor is not None
+        assert "Sprin" in nomor
+
+        assert header.tanggal == date(2025, 4, 21)
+
+        satuan = header.satuan_penerbit
+        assert satuan is not None
+        assert "KEPOLISIAN" in satuan.upper()
+
+        perihal = header.perihal
+        assert perihal is not None
+        assert "penyelidikan" in perihal.lower()
+
+        assert len(header.dasar) == 3
--- a/tests/unit/test_validators.py
+++ b/tests/unit/test_validators.py
@@ -0,0 +1,108 @@
+"""Tests for the validation layer."""
+
+from __future__ import annotations
+
+from datetime import date
+
+import pytest
+
+from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat
+from ocr_sprint.pipeline.extract.validators import (
+    validate_extraction,
+    validate_header,
+    validate_nrp,
+    validate_personnel_entry,
+)
+from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+
+class TestNRP:
+    """Cases for validate_nrp."""
+
+    @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"])
+    def test_valid_8_digits(self, nrp: str) -> None:
+        """Strings of exactly eight digits are accepted."""
+        verdict = validate_nrp(nrp)
+        assert verdict is True
+
+    @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None])
+    def test_invalid(self, nrp: str | None) -> None:
+        """Wrong length, non-digit, empty and None inputs are all rejected."""
+        verdict = validate_nrp(nrp)
+        assert verdict is False
+
+
+class TestPangkat:
+    """Cases for normalize_pangkat and is_valid_pangkat."""
+
+    @pytest.mark.parametrize(
+        ("input_str", "expected"),
+        [
+            ("AKP", "AKP"),
+            ("akp", "AKP"),
+            ("AKP.", "AKP"),
+            ("AKBP", "AKBP"),
+            ("Brigjen Pol", "BRIGJEN POL"),
+            ("BRIGJEN", "BRIGJEN POL"),
+            ("Kombespol", "KOMBES POL"),
+            ("BRIPDA", "BRIPDA"),
+        ],
+    )
+    def test_normalizes_known_ranks(self, input_str: str, expected: str) -> None:
+        """Case, trailing-dot and abbreviation variants map to the expected spelling."""
+        normalized = normalize_pangkat(input_str)
+        assert normalized == expected
+
+    def test_unknown_returns_none(self) -> None:
+        """"Sersan Mayor" normalizes to None and is reported as invalid."""
+        unknown_rank = "Sersan Mayor"
+        assert normalize_pangkat(unknown_rank) is None
+        assert is_valid_pangkat(unknown_rank) is False
+
+
+class TestPersonnelValidator:
+    """Cases for validate_personnel_entry."""
+
+    def test_clean_entry_no_flags(self) -> None:
+        """An entry with rank AKP and an eight-digit NRP produces no flags."""
+        person = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test")
+        flags = validate_personnel_entry(person)
+        assert flags == []
+
+    def test_invalid_nrp_flagged(self) -> None:
+        """A three-digit NRP raises INVALID_NRP."""
+        person = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test")
+        flags = validate_personnel_entry(person)
+        assert ReviewFlag.INVALID_NRP in flags
+
+    def test_unknown_pangkat_flagged(self) -> None:
+        """The rank "Sersan Mayor" raises UNKNOWN_PANGKAT."""
+        person = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
+        flags = validate_personnel_entry(person)
+        assert ReviewFlag.UNKNOWN_PANGKAT in flags
+
+
+class TestHeaderValidator:
+    """Cases for validate_header."""
+
+    def test_complete_header_no_flags(self) -> None:
+        """A header with number, date and unit produces no flags."""
+        complete = HeaderFields(
+            nomor_sprint="Sprin/1/I/2025",
+            tanggal=date(2025, 1, 1),
+            satuan_penerbit="POLRES BANDUNG",
+        )
+        flags = validate_header(complete)
+        assert flags == []
+
+    def test_missing_nomor_flagged(self) -> None:
+        """A header without nomor_sprint raises MISSING_FIELD."""
+        without_nomor = HeaderFields(tanggal=date(2025, 1, 1))
+        flags = validate_header(without_nomor)
+        assert ReviewFlag.MISSING_FIELD in flags
+
+    def test_missing_date_flagged(self) -> None:
+        """A header without tanggal raises DATE_PARSE_FAILED."""
+        without_date = HeaderFields(nomor_sprint="Sprin/1/I/2025")
+        flags = validate_header(without_date)
+        assert ReviewFlag.DATE_PARSE_FAILED in flags
+
+
+class TestFullValidation:
+    """Cases for validate_extraction over a whole ExtractionResult."""
+
+    def test_personnel_count_mismatch(self) -> None:
+        """One extracted person against an expected count of two is flagged."""
+        header = HeaderFields(
+            nomor_sprint="Sprin/1/I/2025",
+            tanggal=date(2025, 1, 1),
+        )
+        lone_person = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A")
+        extraction = ExtractionResult(header=header, personel=[lone_person])
+        flags = validate_extraction(extraction, expected_personnel_count=2)
+        assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in flags
+
+    def test_flags_are_deduped(self) -> None:
+        """Two entries failing in the same way must not repeat any flag in the result."""
+        # The empty header is missing both nomor and tanggal.
+        empty_header = HeaderFields()
+        bad_people = [
+            PersonnelEntry(nrp="123", pangkat="X"),
+            PersonnelEntry(nrp="456", pangkat="Y"),
+        ]
+        extraction = ExtractionResult(header=empty_header, personel=bad_people)
+        flags = validate_extraction(extraction)
+        # each flag type should appear at most once
+        distinct = set(flags)
+        assert len(flags) == len(distinct)