"""Tests for the text-based personnel fallback extractor. Driven by the real Polres Cimahi sprint document where PP-Structure produced 24 rows with only ``nama`` populated. The fallback should recover at least the rank + NRP for every row. """ from __future__ import annotations from ocr_sprint.pipeline.extract.personnel_text import ( extract_personnel_from_text, is_low_quality, ) from ocr_sprint.schemas.personnel import PersonnelEntry _CIMAHI_FIXTURE = """\ DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024 NO NAMA PANGKAT / NRP JABATAN KET BAUR SKCK SAT 1. SRI WAHYUNI AIPTU / 75070328 INTELKAM POLRES CIMAHI BA PELAKSANA SKCK 2. CITRA DWI PUTRI R BRIPTU / 95070659 SAT INTELKAM POLRES CIMAHI BA PELAKSANA SKCK 3. AGUNG LUKMAN AL BRIPTU / 99030245 SAT INTELKAM POLRES CIMAHI BA POLSEK 8. ARIEF SYAHRUL ZAMAN BRIGPOL /96030446 MARGAASIH """ class TestExtractPersonnelFromText: def test_extracts_rank_nrp_and_name(self) -> None: rows = extract_personnel_from_text(_CIMAHI_FIXTURE) assert len(rows) == 4 first = rows[0] assert first.pangkat == "AIPTU" assert first.nrp == "75070328" assert first.nama == "SRI WAHYUNI" def test_normalizes_brigpol_to_brigadir(self) -> None: rows = extract_personnel_from_text(_CIMAHI_FIXTURE) last = rows[-1] # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'. assert last.pangkat == "BRIGADIR" assert last.nrp == "96030446" assert last.nama == "ARIEF SYAHRUL ZAMAN" def test_skips_header_lines_as_names(self) -> None: # No row should ever have a column-header word as nama. rows = extract_personnel_from_text(_CIMAHI_FIXTURE) names = [r.nama for r in rows] for blocked in {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}: assert blocked not in names def test_jabatan_collected_from_following_lines(self) -> None: rows = extract_personnel_from_text(_CIMAHI_FIXTURE) assert rows[0].jabatan_dinas is not None assert "INTELKAM" in rows[0].jabatan_dinas def test_empty_text_returns_empty(self) -> None: assert extract_personnel_from_text("") == [] def test_text_without_rank_nrp_pattern_returns_empty(self) -> None: text = "Just a paragraph with no rank or NRP at all.\nAnother line." assert extract_personnel_from_text(text) == [] def test_ignores_isolated_8digit_number_without_rank(self) -> None: # NRP without a recognised rank token must not produce a row. text = "Some line\n12345678\nanother line" assert extract_personnel_from_text(text) == [] def test_rejects_unknown_rank_with_8digit_number(self) -> None: # A "rank-shaped" word that isn't in the master list must not yield a row. text = "Some line\nFAKERANK / 12345678\nanother line" assert extract_personnel_from_text(text) == [] class TestIsLowQuality: def test_empty_list_is_low_quality(self) -> None: assert is_low_quality([]) is True def test_all_rows_with_only_name_is_low_quality(self) -> None: rows = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)] assert is_low_quality(rows) is True def test_majority_with_rank_nrp_is_high_quality(self) -> None: rows = [ PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}") for i in range(10) ] assert is_low_quality(rows) is False def test_borderline_30_percent_threshold(self) -> None: # 3 useful out of 10 = exactly 0.3, treated as not-low-quality. useful = [ PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}") for i in range(3) ] useless = [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)] assert is_low_quality(useful + useless) is False