"""Tests for the personnel-row mapper.""" from __future__ import annotations import pytest from ocr_sprint.pipeline.extract.personnel import ( _classify_header_cell, _split_pangkat_nrp, _split_pangkat_nrp_nama, detect_header_row, extract_personnel, is_personnel_table, map_row, ) from ocr_sprint.pipeline.table import DetectedTable # ---------- header detection ---------- class TestClassifyHeaderCell: @pytest.mark.parametrize( ("text", "expected"), [ ("No", "no"), ("NO.", "no"), ("Nomor", "no"), ("Pangkat", "pangkat"), ("NRP", "nrp"), ("Pangkat / NRP", "pangkat_nrp"), ("PANGKAT/NRP", "pangkat_nrp"), ("Pangkat / NRP / Nama", "pangkat_nrp_nama"), ("PANGKAT/NRP/NAMA", "pangkat_nrp_nama"), ("Pangkat, NRP, Nama", "pangkat_nrp_nama"), ("Nama", "nama"), ("Nama Lengkap", "nama"), ("Jabatan dalam Dinas", "jabatan_dinas"), ("Jabatan dalam Sprint", "jabatan_sprint"), ("Keterangan", "keterangan"), ], ) def test_known_header(self, text: str, expected: str) -> None: assert _classify_header_cell(text) == expected def test_substring_match_prefers_longest_synonym(self) -> None: # 'pangkat' is a shorter prefix of 'pangkat / nrp / nama'. Without # length-sorted iteration we'd misclassify combined headers as plain # 'pangkat' and downstream map_row would drop every row. assert _classify_header_cell("Pangkat / NRP / Nama Personel") == "pangkat_nrp_nama" assert _classify_header_cell("Pangkat / NRP Polri") == "pangkat_nrp" def test_unknown_header(self) -> None: assert _classify_header_cell("Random Text") is None assert _classify_header_cell("") is None class TestDetectHeaderRow: def test_detects_first_row_as_header(self) -> None: table = DetectedTable( cells=[ ["No", "Pangkat", "NRP", "Nama"], ["1", "AKP", "87010101", "Budi"], ] ) result = detect_header_row(table) assert result is not None idx, mapping = result assert idx == 0 assert mapping == ["no", "pangkat", "nrp", "nama"] def test_detects_second_row_when_first_is_title(self) -> None: table = DetectedTable( cells=[ ["DAFTAR PERSONEL"], # title row, not a header ["No", "Pangkat / NRP", "Nama", "Jabatan dalam Dinas"], ["1", "AKP 87010101", "Budi", "Kanit"], ] ) result = detect_header_row(table) assert result is not None idx, _ = result assert idx == 1 def test_returns_none_when_no_header_found(self) -> None: table = DetectedTable(cells=[["foo", "bar"], ["baz", "qux"]]) assert detect_header_row(table) is None # ---------- combined-cell splitting ---------- class TestSplitPangkatNrp: @pytest.mark.parametrize( ("text", "expected"), [ ("AKP 87010101", ("AKP", "87010101")), ("IPDA / 92030404", ("IPDA", "92030404")), ("BRIPKA98050505", ("BRIPKA", "98050505")), ("KOMPOL 88123456", ("KOMPOL", "88123456")), ], ) def test_known_combos(self, text: str, expected: tuple[str, str]) -> None: assert _split_pangkat_nrp(text) == expected def test_returns_none_when_no_nrp(self) -> None: pangkat, nrp = _split_pangkat_nrp("AKP") assert pangkat == "AKP" assert nrp is None class TestSplitPangkatNrpNama: def test_three_way_split(self) -> None: pangkat, nrp, nama = _split_pangkat_nrp_nama("AKP 87010101 Budi Santoso") assert pangkat == "AKP" assert nrp == "87010101" assert nama == "Budi Santoso" @pytest.mark.parametrize( ("text", "expected_pangkat", "expected_name"), [ # multi-word ranks must be matched as contiguous token sequences, # otherwise tokens like 'POL' would leak into the name. ("KOMBES POL 88123456 John Doe", "KOMBES POL", "John Doe"), ("BRIGJEN POL 99887766 Jane Doe", "BRIGJEN POL", "Jane Doe"), ("IRJEN POL 77665544 Ahmad Hidayat", "IRJEN POL", "Ahmad Hidayat"), ("JENDERAL POL 11223344 Sari Wulandari", "JENDERAL POL", "Sari Wulandari"), ], ) def test_multi_word_ranks(self, text: str, expected_pangkat: str, expected_name: str) -> None: pangkat, _nrp, nama = _split_pangkat_nrp_nama(text) assert pangkat == expected_pangkat assert nama == expected_name def test_unknown_rank_returns_none_pangkat(self) -> None: pangkat, nrp, nama = _split_pangkat_nrp_nama("Foobar 87010101 Budi Santoso") assert pangkat is None assert nrp == "87010101" # name keeps the unknown rank token; validators will flag the row. assert nama == "Foobar Budi Santoso" # ---------- row mapping ---------- class TestMapRow: def test_split_columns_polres_layout(self) -> None: mapping = ["no", "pangkat", "nrp", "nama", "jabatan_dinas", "jabatan_sprint"] row = ["1", "AKP", "87010101", "Budi Santoso", "Kanit Reskrim", "Ketua Tim"] entry = map_row(row, mapping) assert entry is not None assert entry.no == 1 assert entry.pangkat == "AKP" assert entry.nrp == "87010101" assert entry.nama == "Budi Santoso" assert entry.jabatan_dinas == "Kanit Reskrim" assert entry.jabatan_sprint == "Ketua Tim" def test_combined_pangkat_nrp_nama_cell(self) -> None: mapping = ["no", "pangkat_nrp_nama", "jabatan_dinas", "jabatan_sprint"] row = ["1", "AKP 87010101 Budi Santoso", "Kanit Reskrim", "Ketua Tim"] entry = map_row(row, mapping) assert entry is not None assert entry.no == 1 assert entry.pangkat == "AKP" assert entry.nrp == "87010101" assert entry.nama == "Budi Santoso" assert entry.jabatan_dinas == "Kanit Reskrim" assert entry.jabatan_sprint == "Ketua Tim" def test_combined_pangkat_nrp_cell(self) -> None: mapping = ["no", "pangkat_nrp", "nama", "jabatan_dinas"] row = ["1", "AKP 87010101", "Budi Santoso", "Kanit Reskrim"] entry = map_row(row, mapping) assert entry is not None assert entry.pangkat == "AKP" assert entry.nrp == "87010101" assert entry.nama == "Budi Santoso" def test_skips_row_without_nama_or_nrp(self) -> None: mapping = ["no", "pangkat"] row = ["", ""] assert map_row(row, mapping) is None def test_unknown_pangkat_kept_verbatim(self) -> None: mapping = ["no", "pangkat", "nrp", "nama"] row = ["1", "Foobar", "87010101", "Budi"] entry = map_row(row, mapping) assert entry is not None # unknown pangkat is preserved so the validation layer can flag it assert entry.pangkat == "Foobar" # ---------- end-to-end extraction ---------- class TestExtractPersonnel: def test_full_table_with_header(self) -> None: table = DetectedTable( cells=[ [ "No", "Pangkat / NRP", "Nama", "Jabatan dalam Dinas", "Jabatan dalam Sprint", ], ["1", "AKP 87010101", "Budi Santoso", "Kanit Reskrim", "Ketua Tim"], ["2", "IPDA 92030404", "Sari Wulandari", "Banit Reskrim", "Anggota"], ["3", "BRIPKA 98050505", "Ahmad Hidayat", "Banit Reskrim", "Anggota"], ] ) entries = extract_personnel([table]) assert len(entries) == 3 assert entries[0].nama == "Budi Santoso" assert entries[0].nrp == "87010101" assert entries[1].pangkat == "IPDA" assert entries[2].pangkat == "BRIPKA" def test_full_table_with_triple_combined_header(self) -> None: # Regression test for header misclassification: 'Pangkat / NRP / Nama' # used to be classified as 'pangkat' due to substring matching, which # silently dropped every personnel row. table = DetectedTable( cells=[ ["No", "Pangkat / NRP / Nama", "Jabatan dalam Sprint"], ["1", "AKP 87010101 Budi Santoso", "Ketua Tim"], ["2", "IPDA 92030404 Sari Wulandari", "Anggota"], ] ) entries = extract_personnel([table]) assert len(entries) == 2 assert entries[0].pangkat == "AKP" assert entries[0].nrp == "87010101" assert entries[0].nama == "Budi Santoso" assert entries[1].nama == "Sari Wulandari" def test_skips_non_personnel_table(self) -> None: table = DetectedTable( cells=[["Tahun", "Anggaran"], ["2024", "100M"]], ) assert extract_personnel([table]) == [] def test_concatenates_multiple_personnel_tables(self) -> None: t1 = DetectedTable( cells=[ ["No", "Pangkat", "NRP", "Nama"], ["1", "AKP", "87010101", "Budi"], ] ) t2 = DetectedTable( cells=[ ["No", "Pangkat", "NRP", "Nama"], ["1", "IPDA", "92030404", "Sari"], ] ) entries = extract_personnel([t1, t2]) assert len(entries) == 2 assert entries[0].nama == "Budi" assert entries[1].nama == "Sari" class TestIsPersonnelTable: def test_matches_with_pangkat_and_nama(self) -> None: table = DetectedTable( cells=[["No", "Pangkat", "NRP", "Nama"], ["1", "AKP", "87010101", "X"]] ) assert is_personnel_table(table) is True def test_rejects_unrelated_table(self) -> None: table = DetectedTable(cells=[["A", "B"], ["1", "2"]]) assert is_personnel_table(table) is False def test_rejects_id_only_table_without_name_column(self) -> None: # 'Pangkat / NRP' carries id but no name; without a name signal # this should not be classified as a personnel table. table = DetectedTable( cells=[ ["No", "Pangkat / NRP", "Jabatan"], ["1", "AKP 87010101", "Kanit Reskrim"], ] ) assert is_personnel_table(table) is False def test_accepts_pangkat_nrp_when_separate_nama_present(self) -> None: table = DetectedTable( cells=[ ["No", "Pangkat / NRP", "Nama"], ["1", "AKP 87010101", "Budi"], ] ) assert is_personnel_table(table) is True def test_accepts_pangkat_nrp_nama_combined(self) -> None: table = DetectedTable( cells=[ ["No", "Pangkat / NRP / Nama", "Jabatan"], ["1", "AKP 87010101 Budi", "Kanit"], ] ) assert is_personnel_table(table) is True