Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:

- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
108
tests/unit/test_validators.py
Normal file
108
tests/unit/test_validators.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""Tests for the validation layer."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
|
||||
import pytest
|
||||
|
||||
from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat
|
||||
from ocr_sprint.pipeline.extract.validators import (
|
||||
validate_extraction,
|
||||
validate_header,
|
||||
validate_nrp,
|
||||
validate_personnel_entry,
|
||||
)
|
||||
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
|
||||
class TestNRP:
    """Exercise ``validate_nrp`` with accepted and rejected inputs."""

    @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"])
    def test_valid_8_digits(self, nrp: str) -> None:
        """Each listed eight-digit string must validate as ``True``."""
        outcome = validate_nrp(nrp)
        assert outcome is True

    @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None])
    def test_invalid(self, nrp: str | None) -> None:
        """Wrong-length, non-numeric, empty and ``None`` inputs must validate as ``False``."""
        outcome = validate_nrp(nrp)
        assert outcome is False
|
||||
|
||||
|
||||
class TestPangkat:
    """Exercise rank (pangkat) normalization and membership checks."""

    @pytest.mark.parametrize(
        ("input_str", "expected"),
        [
            ("AKP", "AKP"),
            ("akp", "AKP"),
            ("AKP.", "AKP"),
            ("AKBP", "AKBP"),
            ("Brigjen Pol", "BRIGJEN POL"),
            ("BRIGJEN", "BRIGJEN POL"),
            ("Kombespol", "KOMBES POL"),
            ("BRIPDA", "BRIPDA"),
        ],
    )
    def test_normalizes_known_ranks(self, input_str: str, expected: str) -> None:
        """Each raw spelling must normalize to the paired expected form."""
        normalized = normalize_pangkat(input_str)
        assert normalized == expected

    def test_unknown_returns_none(self) -> None:
        """A rank outside the master list normalizes to ``None`` and is not valid."""
        unknown_rank = "Sersan Mayor"
        assert normalize_pangkat(unknown_rank) is None
        assert is_valid_pangkat(unknown_rank) is False
|
||||
|
||||
|
||||
class TestPersonnelValidator:
    """Exercise ``validate_personnel_entry`` on single personnel rows."""

    def test_clean_entry_no_flags(self) -> None:
        """An entry with rank ``AKP`` and an eight-digit NRP yields no flags."""
        person = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test")
        raised = validate_personnel_entry(person)
        assert raised == []

    def test_invalid_nrp_flagged(self) -> None:
        """A three-digit NRP must raise ``INVALID_NRP``."""
        person = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test")
        raised = validate_personnel_entry(person)
        assert ReviewFlag.INVALID_NRP in raised

    def test_unknown_pangkat_flagged(self) -> None:
        """The rank ``Sersan Mayor`` must raise ``UNKNOWN_PANGKAT``."""
        person = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        raised = validate_personnel_entry(person)
        assert ReviewFlag.UNKNOWN_PANGKAT in raised
|
||||
|
||||
|
||||
class TestHeaderValidator:
    """Exercise ``validate_header`` on complete and incomplete headers."""

    def test_complete_header_no_flags(self) -> None:
        """A header with number, date and issuing unit yields no flags."""
        fields = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="POLRES BANDUNG",
        )
        raised = validate_header(fields)
        assert raised == []

    def test_missing_nomor_flagged(self) -> None:
        """A header built with only a date must raise ``MISSING_FIELD``."""
        fields = HeaderFields(tanggal=date(2025, 1, 1))
        raised = validate_header(fields)
        assert ReviewFlag.MISSING_FIELD in raised

    def test_missing_date_flagged(self) -> None:
        """A header built with only a sprint number must raise ``DATE_PARSE_FAILED``."""
        fields = HeaderFields(nomor_sprint="Sprin/1/I/2025")
        raised = validate_header(fields)
        assert ReviewFlag.DATE_PARSE_FAILED in raised
|
||||
|
||||
|
||||
class TestFullValidation:
    """Exercise ``validate_extraction`` on whole extraction results."""

    def test_personnel_count_mismatch(self) -> None:
        """One extracted person against an expected count of two must be flagged."""
        header = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
        )
        roster = [
            PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A"),
        ]
        extraction = ExtractionResult(header=header, personel=roster)

        raised = validate_extraction(extraction, expected_personnel_count=2)

        assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in raised

    def test_flags_are_deduped(self) -> None:
        """The returned flag list must not contain the same flag twice."""
        # Header is built empty, so both nomor and tanggal are missing.
        empty_header = HeaderFields()
        roster = [
            PersonnelEntry(nrp="123", pangkat="X"),
            PersonnelEntry(nrp="456", pangkat="Y"),
        ]
        extraction = ExtractionResult(header=empty_header, personel=roster)

        raised = validate_extraction(extraction)

        # Each flag type should appear at most once.
        # NOTE(review): this also passes when no flags are returned at all;
        # consider additionally asserting the expected flags — confirm against
        # validate_extraction before tightening.
        unique_flags = set(raised)
        assert len(raised) == len(unique_flags)
|
||||
Reference in New Issue
Block a user