"""Tests for the validation layer."""

from __future__ import annotations

from datetime import date

import pytest

from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat
from ocr_sprint.pipeline.extract.validators import (
    validate_extraction,
    validate_header,
    validate_nrp,
    validate_personnel_entry,
)
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag
from ocr_sprint.schemas.personnel import PersonnelEntry


class TestNRP:
    """Checks for ``validate_nrp``."""

    @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"])
    def test_valid_8_digits(self, nrp: str) -> None:
        """Eight-digit strings are reported as valid."""
        verdict = validate_nrp(nrp)
        assert verdict is True

    @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None])
    def test_invalid(self, nrp: str | None) -> None:
        """Too short, too long, non-numeric, empty and ``None`` are rejected."""
        verdict = validate_nrp(nrp)
        assert verdict is False


class TestPangkat:
    """Checks for ``normalize_pangkat`` and ``is_valid_pangkat``."""

    @pytest.mark.parametrize(
        ("input_str", "expected"),
        [
            ("AKP", "AKP"),
            ("akp", "AKP"),
            ("AKP.", "AKP"),
            ("AKBP", "AKBP"),
            ("Brigjen Pol", "BRIGJEN POL"),
            ("BRIGJEN", "BRIGJEN POL"),
            ("Kombespol", "KOMBES POL"),
            ("BRIPDA", "BRIPDA"),
        ],
    )
    def test_normalizes_known_ranks(self, input_str: str, expected: str) -> None:
        """Each listed spelling of a rank maps to its expected canonical form."""
        normalized = normalize_pangkat(input_str)
        assert normalized == expected

    def test_unknown_returns_none(self) -> None:
        """An unlisted rank normalizes to ``None`` and is reported as invalid."""
        unknown_rank = "Sersan Mayor"
        assert normalize_pangkat(unknown_rank) is None
        assert is_valid_pangkat(unknown_rank) is False


class TestPersonnelValidator:
    """Checks for ``validate_personnel_entry``."""

    def test_clean_entry_no_flags(self) -> None:
        """An entry with a known rank and an 8-digit NRP yields no flags."""
        clean = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test")
        flags = validate_personnel_entry(clean)
        assert flags == []

    def test_invalid_nrp_flagged(self) -> None:
        """A three-digit NRP raises the INVALID_NRP flag."""
        short_nrp = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test")
        flags = validate_personnel_entry(short_nrp)
        assert ReviewFlag.INVALID_NRP in flags

    def test_unknown_pangkat_flagged(self) -> None:
        """An unlisted rank raises the UNKNOWN_PANGKAT flag."""
        odd_rank = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        flags = validate_personnel_entry(odd_rank)
        assert ReviewFlag.UNKNOWN_PANGKAT in flags

    def test_row_with_only_name_is_flagged_incomplete(self) -> None:
        """A row carrying only ``nama`` raises INCOMPLETE_PERSONNEL_ROW."""
        # Only `nama` was captured (neither pangkat nor nrp): this is the
        # signature of mis-aligned table extraction. It must be flagged so
        # the operator routes the document to needs_review.
        name_only = PersonnelEntry(nama="LEAKED FROM SOMEWHERE")
        flags = validate_personnel_entry(name_only)
        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in flags

    def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None:
        """A row with a rank but no NRP is not treated as structurally incomplete."""
        # A pangkat without an NRP is suboptimal, yet it still identifies a
        # rank, so the structural-incompleteness flag is not raised.
        rank_without_nrp = PersonnelEntry(pangkat="AKP", nama="Test")
        flags = validate_personnel_entry(rank_without_nrp)
        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in flags


class TestHeaderValidator:
    """Checks for ``validate_header``."""

    def test_complete_header_no_flags(self) -> None:
        """A header with nomor_sprint, tanggal and satuan_penerbit yields no flags."""
        complete = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="POLRES BANDUNG",
        )
        flags = validate_header(complete)
        assert flags == []

    def test_missing_nomor_flagged(self) -> None:
        """A header without nomor_sprint raises MISSING_FIELD."""
        no_number = HeaderFields(tanggal=date(2025, 1, 1))
        flags = validate_header(no_number)
        assert ReviewFlag.MISSING_FIELD in flags

    def test_missing_date_flagged(self) -> None:
        """A header without tanggal raises DATE_PARSE_FAILED."""
        no_date = HeaderFields(nomor_sprint="Sprin/1/I/2025")
        flags = validate_header(no_date)
        assert ReviewFlag.DATE_PARSE_FAILED in flags


class TestFullValidation:
    """Checks for ``validate_extraction`` on whole extraction results."""

    def test_personnel_count_mismatch(self) -> None:
        """One extracted person against an expected count of two is flagged."""
        header = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
        )
        people = [
            PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A"),
        ]
        result = ExtractionResult(header=header, personel=people)

        flags = validate_extraction(result, expected_personnel_count=2)

        assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in flags

    def test_flags_are_deduped(self) -> None:
        """The flags returned for a multi-problem document contain no duplicates."""
        empty_header = HeaderFields()  # missing both nomor and tanggal
        people = [
            PersonnelEntry(nrp="123", pangkat="X"),
            PersonnelEntry(nrp="456", pangkat="Y"),
        ]
        result = ExtractionResult(header=empty_header, personel=people)

        flags = validate_extraction(result)

        # Each flag type should appear at most once.
        distinct_flags = set(flags)
        assert len(flags) == len(distinct_flags)