This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and
adds two related hardening changes (items 5-6):
1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
INDONESIA' letterhead line instead of the most-specific issuing unit
(e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
scan for each level independently and return the deepest available.
2. find_dasar_list dropped numbered items when OCR put the marker on
its own line ("1.\n Undang-Undang ..."). Refactored into
_collect_numbered_section that buffers a bare-number line and uses
the next non-empty line as the body. Also reused for the new
find_untuk_list which extracts the previously-empty 'untuk' bullets.
3. find_perihal returned None for documents that use 'Pertimbangan'
(very common in Polres-level sprint), forcing the LLM to guess.
Added a regex fallback that picks up the first line under a
'Pertimbangan' label so we keep extraction deterministic.
4. Personnel rows were emitted with only nama populated when
PP-Structure detected a table but the column mapper degraded.
Added a text-based fallback (extract_personnel_from_text) that
scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when
the PP-Structure result has fewer than 30% rank/NRP-bearing rows.
When the fallback is used, the new PERSONNEL_TEXT_FALLBACK flag is raised so the document is surfaced for review.
5. Validation now flags rows with neither pangkat nor nrp as
INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
even when individual nrp/pangkat checks pass on empty values.
6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).
Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
123 lines
4.5 KiB
Python
123 lines
4.5 KiB
Python
"""Tests for the validation layer."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
|
|
import pytest
|
|
|
|
from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat
|
|
from ocr_sprint.pipeline.extract.validators import (
|
|
validate_extraction,
|
|
validate_header,
|
|
validate_nrp,
|
|
validate_personnel_entry,
|
|
)
|
|
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag
|
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
|
|
|
|
|
class TestNRP:
    """Checks ``validate_nrp`` against well-formed and malformed NRP inputs."""

    @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"])
    def test_valid_8_digits(self, nrp: str) -> None:
        """Every parametrized eight-digit string must validate as ``True``."""
        outcome = validate_nrp(nrp)
        assert outcome is True

    @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None])
    def test_invalid(self, nrp: str | None) -> None:
        """Seven digits, nine digits, letters, empty string and ``None`` must all fail."""
        outcome = validate_nrp(nrp)
        assert outcome is False
|
|
|
|
|
|
class TestPangkat:
    """Checks rank-string normalisation and the rank validity lookup."""

    @pytest.mark.parametrize(
        ("input_str", "expected"),
        [
            ("AKP", "AKP"),
            ("akp", "AKP"),
            ("AKP.", "AKP"),
            ("AKBP", "AKBP"),
            ("Brigjen Pol", "BRIGJEN POL"),
            ("BRIGJEN", "BRIGJEN POL"),
            ("Kombespol", "KOMBES POL"),
            ("BRIPDA", "BRIPDA"),
        ],
    )
    def test_normalizes_known_ranks(self, input_str: str, expected: str) -> None:
        """Each raw spelling must normalise to the paired canonical rank string."""
        canonical = normalize_pangkat(input_str)
        assert canonical == expected

    def test_unknown_returns_none(self) -> None:
        """A rank string that is not recognised normalises to ``None`` and is invalid."""
        unknown_rank = "Sersan Mayor"
        assert normalize_pangkat(unknown_rank) is None
        assert is_valid_pangkat(unknown_rank) is False
|
|
|
|
|
|
class TestPersonnelValidator:
    """Checks the review flags ``validate_personnel_entry`` returns per row."""

    def test_clean_entry_no_flags(self) -> None:
        """A row with a known rank and an eight-digit NRP yields an empty flag list."""
        row = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test")
        raised = validate_personnel_entry(row)
        assert raised == []

    def test_invalid_nrp_flagged(self) -> None:
        """A three-digit NRP must raise ``INVALID_NRP``."""
        row = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test")
        raised = validate_personnel_entry(row)
        assert ReviewFlag.INVALID_NRP in raised

    def test_unknown_pangkat_flagged(self) -> None:
        """An unrecognised rank string must raise ``UNKNOWN_PANGKAT``."""
        row = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        raised = validate_personnel_entry(row)
        assert ReviewFlag.UNKNOWN_PANGKAT in raised

    def test_row_with_only_name_is_flagged_incomplete(self) -> None:
        """A name-only row must raise ``INCOMPLETE_PERSONNEL_ROW``.

        A row that captured only ``nama`` (no pangkat AND no nrp) is the
        signature of mis-aligned table extraction, so it has to be flagged
        for the operator to route the document to needs_review.
        """
        name_only_row = PersonnelEntry(nama="LEAKED FROM SOMEWHERE")
        raised = validate_personnel_entry(name_only_row)
        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in raised

    def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None:
        """A row with a rank but no NRP must not raise ``INCOMPLETE_PERSONNEL_ROW``.

        Having pangkat without NRP is suboptimal but still identifies a
        rank, so the structural-incompleteness flag is not expected.
        """
        rank_only_row = PersonnelEntry(pangkat="AKP", nama="Test")
        raised = validate_personnel_entry(rank_only_row)
        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in raised
|
|
|
|
|
|
class TestHeaderValidator:
    """Checks the review flags ``validate_header`` returns for header fields."""

    def test_complete_header_no_flags(self) -> None:
        """A header with number, date and issuing unit yields an empty flag list."""
        complete = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="POLRES BANDUNG",
        )
        raised = validate_header(complete)
        assert raised == []

    def test_missing_nomor_flagged(self) -> None:
        """A header built without ``nomor_sprint`` must raise ``MISSING_FIELD``."""
        without_number = HeaderFields(tanggal=date(2025, 1, 1))
        raised = validate_header(without_number)
        assert ReviewFlag.MISSING_FIELD in raised

    def test_missing_date_flagged(self) -> None:
        """A header built without ``tanggal`` must raise ``DATE_PARSE_FAILED``."""
        without_date = HeaderFields(nomor_sprint="Sprin/1/I/2025")
        raised = validate_header(without_date)
        assert ReviewFlag.DATE_PARSE_FAILED in raised
|
|
|
|
|
|
class TestFullValidation:
    """Checks ``validate_extraction`` on whole extraction results."""

    def test_personnel_count_mismatch(self) -> None:
        """One extracted row against an expected count of two must raise a mismatch flag."""
        header = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
        )
        single_row = [PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A")]
        extraction = ExtractionResult(header=header, personel=single_row)

        raised = validate_extraction(extraction, expected_personnel_count=2)

        assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in raised

    def test_flags_are_deduped(self) -> None:
        """The returned flag collection must not contain the same flag twice."""
        # The header is built with no nomor and no tanggal, and both rows
        # carry a three-digit NRP and a single-letter pangkat.
        empty_header = HeaderFields()
        similar_rows = [
            PersonnelEntry(nrp="123", pangkat="X"),
            PersonnelEntry(nrp="456", pangkat="Y"),
        ]
        extraction = ExtractionResult(header=empty_header, personel=similar_rows)

        raised = validate_extraction(extraction)

        # each flag type should appear at most once
        distinct = set(raised)
        assert len(raised) == len(distinct)
|