Fix personnel extraction + header bugs on real Polres Cimahi sprint

This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two related hardening changes (items 5-6): 1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' letterhead line instead of the most-specific issuing unit (e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to scan for each level independently and return the deepest available. 2. find_dasar_list dropped numbered items when OCR put the marker on its own line ("1.\n Undang-Undang ..."). Refactored into _collect_numbered_section, which buffers a bare-number line and uses the next non-empty line as the body. It is also reused by the new find_untuk_list, which extracts the previously-empty 'untuk' bullets. 3. find_perihal returned None for documents that use 'Pertimbangan' (very common in Polres-level sprints), forcing the LLM to guess. Added a regex fallback that picks up the first line under a 'Pertimbangan' label so we keep extraction deterministic. 4. Personnel rows were emitted with only nama populated when PP-Structure detected a table but the column mapper degraded. Added a text-based fallback (extract_personnel_from_text) that scans raw OCR for <rank> + <8-digit NRP> patterns. It is triggered when fewer than 30% of the PP-Structure rows carry a rank/NRP. When the fallback runs, the document is marked for review by raising the new PERSONNEL_TEXT_FALLBACK flag. 5. Validation now flags rows with neither pangkat nor nrp as INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review even when individual nrp/pangkat checks pass on empty values. 6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans). Tests: 229 (was 203) — 26 new tests covering the regex fixes, text-based personnel extractor, low-quality detector, validator behaviour, and orchestrator wiring of the fallback path. Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-26 05:35:42 +00:00
parent dce77e80e1
commit 58a2bf2648
11 changed files with 747 additions and 39 deletions
--- a/tests/unit/test_orchestrator_llm.py
+++ b/tests/unit/test_orchestrator_llm.py
@@ -169,3 +169,92 @@ def test_orchestrator_marks_unavailable_when_llm_returns_none(
    out = run_pipeline(b"%PDF-1.4\n%fake")
    assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
    assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
+
+
+def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """When PP-Structure produces low-quality rows (e.g. only ``nama`` filled),
+    the orchestrator must run the text fallback against the raw OCR text and
+    raise the ``personnel_text_fallback`` flag.
+    """
+    monkeypatch.setenv("LLM_ENABLED", "false")
+    from ocr_sprint.config import get_settings
+
+    get_settings.cache_clear()
+
+    raw_text = (
+        "DAFTAR PERSONIL\n"
+        "1.\n"
+        "SRI WAHYUNI\n"
+        "AIPTU / 75070328\n"
+        "INTELKAM POLRES CIMAHI\n"
+        "2.\n"
+        "AGUNG LUKMAN\n"
+        "BRIPTU / 99030245\n"
+        "SAT INTELKAM\n"
+    )
+
+    # PP-Structure 'succeeded' but emitted name-only rows (the bug we saw on
+    # the real Polres Cimahi document).
+    from ocr_sprint.schemas.personnel import PersonnelEntry
+
+    pp_structure_low_quality = [
+        PersonnelEntry(nama="SRI WAHYUNI"),
+        PersonnelEntry(nama="AGUNG LUKMAN"),
+    ]
+    _stub_pipeline_stages(
+        monkeypatch,
+        raw_text=raw_text,
+        regex_header=HeaderFields(
+            nomor_sprint="Sprin/1/I/2025",
+            tanggal=date(2025, 1, 1),
+            satuan_penerbit="Polres Cimahi",
+            perihal="ok",
+            dasar=["UU 2/2002"],
+        ),
+    )
+    # Override extract_personnel to return the broken PP-Structure rows.
+    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: pp_structure_low_quality)
+
+    out = run_pipeline(b"%PDF-1.4\n%fake")
+    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in out.result.review_flags
+    # Fallback rows must carry pangkat + nrp (the whole point of the path).
+    assert all(r.pangkat and r.nrp for r in out.result.personel)
+    assert {r.pangkat for r in out.result.personel} == {"AIPTU", "BRIPTU"}
+
+
+def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Healthy PP-Structure output (rank+nrp present on most rows) must NOT
+    be replaced by the text fallback.
+    """
+    monkeypatch.setenv("LLM_ENABLED", "false")
+    from ocr_sprint.config import get_settings
+
+    get_settings.cache_clear()
+
+    from ocr_sprint.schemas.personnel import PersonnelEntry
+
+    healthy = [
+        PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"),
+        PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"),
+        PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"),
+    ]
+    _stub_pipeline_stages(
+        monkeypatch,
+        raw_text="ignored — should not be parsed",
+        regex_header=HeaderFields(
+            nomor_sprint="Sprin/1/I/2025",
+            tanggal=date(2025, 1, 1),
+            satuan_penerbit="Polres X",
+            perihal="ok",
+            dasar=["UU 2/2002"],
+        ),
+    )
+    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: healthy)
+
+    out = run_pipeline(b"%PDF-1.4\n%fake")
+    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in out.result.review_flags
+    assert [r.nrp for r in out.result.personel] == ["11111111", "22222222", "33333333"]
--- a/tests/unit/test_personnel_text_fallback.py
+++ b/tests/unit/test_personnel_text_fallback.py
@@ -0,0 +1,118 @@
+"""Tests for the text-based personnel fallback extractor.
+
+Driven by the real Polres Cimahi sprint document where PP-Structure
+produced 24 rows with only ``nama`` populated. The fallback should
+recover at least the rank + NRP for every row.
+"""
+
+from __future__ import annotations
+
+from ocr_sprint.pipeline.extract.personnel_text import (
+    extract_personnel_from_text,
+    is_low_quality,
+)
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+_CIMAHI_FIXTURE = """\
+DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
+NO
+NAMA
+PANGKAT / NRP
+JABATAN
+KET
+BAUR SKCK SAT
+1.
+SRI WAHYUNI
+AIPTU / 75070328
+INTELKAM POLRES
+CIMAHI
+BA PELAKSANA SKCK
+2.
+CITRA DWI PUTRI R
+BRIPTU / 95070659
+ SAT INTELKAM
+POLRES CIMAHI
+BA PELAKSANA SKCK
+3.
+AGUNG LUKMAN AL
+BRIPTU / 99030245
+SAT INTELKAM
+POLRES CIMAHI
+BA POLSEK
+8.
+ARIEF SYAHRUL ZAMAN
+BRIGPOL /96030446
+MARGAASIH
+"""
+
+
+class TestExtractPersonnelFromText:
+    def test_extracts_rank_nrp_and_name(self) -> None:
+        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
+        assert len(rows) == 4
+        first = rows[0]
+        assert first.pangkat == "AIPTU"
+        assert first.nrp == "75070328"
+        assert first.nama == "SRI WAHYUNI"
+
+    def test_normalizes_brigpol_to_brigadir(self) -> None:
+        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
+        last = rows[-1]
+        # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'.
+        assert last.pangkat == "BRIGADIR"
+        assert last.nrp == "96030446"
+        assert last.nama == "ARIEF SYAHRUL ZAMAN"
+
+    def test_skips_header_lines_as_names(self) -> None:
+        # No row should ever have a column-header word as nama.
+        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
+        names = [r.nama for r in rows]
+        for blocked in {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}:
+            assert blocked not in names
+
+    def test_jabatan_collected_from_following_lines(self) -> None:
+        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
+        assert rows[0].jabatan_dinas is not None
+        assert "INTELKAM" in rows[0].jabatan_dinas
+
+    def test_empty_text_returns_empty(self) -> None:
+        assert extract_personnel_from_text("") == []
+
+    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
+        text = "Just a paragraph with no rank or NRP at all.\nAnother line."
+        assert extract_personnel_from_text(text) == []
+
+    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
+        # NRP without a recognised rank token must not produce a row.
+        text = "Some line\n12345678\nanother line"
+        assert extract_personnel_from_text(text) == []
+
+    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
+        # A "rank-shaped" word that isn't in the master list must not yield a row.
+        text = "Some line\nFAKERANK / 12345678\nanother line"
+        assert extract_personnel_from_text(text) == []
+
+
+class TestIsLowQuality:
+    def test_empty_list_is_low_quality(self) -> None:
+        assert is_low_quality([]) is True
+
+    def test_all_rows_with_only_name_is_low_quality(self) -> None:
+        rows = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)]
+        assert is_low_quality(rows) is True
+
+    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
+        rows = [
+            PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
+            for i in range(10)
+        ]
+        assert is_low_quality(rows) is False
+
+    def test_borderline_30_percent_threshold(self) -> None:
+        # 3 useful out of 10 = exactly 0.3, treated as not-low-quality.
+        useful = [
+            PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
+            for i in range(3)
+        ]
+        useless = [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)]
+        assert is_low_quality(useful + useless) is False
--- a/tests/unit/test_regex_rules.py
+++ b/tests/unit/test_regex_rules.py
@@ -14,6 +14,7 @@ from ocr_sprint.pipeline.extract.regex_rules import (
    find_satuan,
    find_signatory,
    find_tanggal,
+    find_untuk_list,
 )


@@ -60,6 +61,36 @@ class TestSatuan:
        result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
        assert result is not None

+    def test_prefers_resor_over_negara_when_both_present(self) -> None:
+        # The Polri letterhead lists units hierarchically; the issuing unit
+        # is the deepest level, not the topmost generic "NEGARA" line.
+        text = (
+            "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
+            "DAERAH JAWA BARAT\n"
+            "RESOR CIMAHI\n"
+            "SURAT PERINTAH\n"
+        )
+        result = find_satuan(text)
+        assert result == "KEPOLISIAN RESOR CIMAHI"
+
+    def test_prefers_sektor_over_resor(self) -> None:
+        text = (
+            "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
+            "DAERAH JAWA BARAT\n"
+            "RESOR CIMAHI\n"
+            "SEKTOR PADALARANG\n"
+        )
+        result = find_satuan(text)
+        assert result == "KEPOLISIAN SEKTOR PADALARANG"
+
+    def test_handles_daerah_only(self) -> None:
+        text = "KEPOLISIAN NEGARA REPUBLIK INDONESIA\nDAERAH JAWA BARAT\n"
+        result = find_satuan(text)
+        assert result == "KEPOLISIAN DAERAH JAWA BARAT"
+
+    def test_returns_none_when_no_letterhead(self) -> None:
+        assert find_satuan("no police letterhead here") is None
+

 class TestPerihal:
    def test_extracts_perihal_line(self) -> None:
@@ -69,6 +100,25 @@ class TestPerihal:
    def test_returns_none_when_absent(self) -> None:
        assert find_perihal("no perihal field") is None

+    def test_falls_back_to_pertimbangan_block(self) -> None:
+        # Many Polres-level sprints use "Pertimbangan" instead of "Perihal".
+        # The fallback should pick up the first non-empty line under it.
+        text = (
+            "Pertimbangan\n"
+            "Bahwa dalam rangka mendukung kepentingan Dinas Polres Cimahi.\n"
+            "DASAR :\n"
+            "1. ...\n"
+        )
+        result = find_perihal(text)
+        assert result is not None
+        assert result.startswith("Bahwa dalam rangka mendukung")
+
+    def test_perihal_wins_over_pertimbangan_when_both_present(self) -> None:
+        # If the document has both a Perihal label AND a Pertimbangan
+        # paragraph, the explicit Perihal wins.
+        text = "Pertimbangan\nSome pertimbangan content.\nPERIHAL : The actual perihal.\n"
+        assert find_perihal(text) == "The actual perihal."
+

 class TestDasar:
    def test_numbered_list(self) -> None:
@@ -88,6 +138,57 @@ class TestDasar:
    def test_empty_when_section_missing(self) -> None:
        assert find_dasar_list("no dasar section") == []

+    def test_handles_bare_number_lines_split_by_ocr(self) -> None:
+        # OCR sometimes places the number marker on its own line and the
+        # body on the next non-empty line. The collector must merge them
+        # rather than dropping the body or appending it to the previous
+        # item (which the old implementation did).
+        text = (
+            "Dasar\n"
+            ":\n"
+            "1.\n"
+            " Undang - Undang Nomor 2 tahun 2002 tentang Kepolisian;\n"
+            "2. Peraturan Pemerintah Republik Indonesia No. 76 tahun 2020;\n"
+            "3.\n"
+            "Keterangan Catatan Kepolisian (SKCK);\n"
+            "4.\n"
+            "Pelayanan dilingkungan Badan Intelijen Keamanan Polri.\n"
+            "5. DIPA Petikan Satker Polres Cimahi.\n"
+            "DIPERINTAHKAN\n"
+        )
+        items = find_dasar_list(text)
+        assert len(items) == 5
+        assert items[0].startswith("Undang - Undang")
+        assert items[2].startswith("Keterangan Catatan")
+        assert items[3].startswith("Pelayanan dilingkungan")
+        assert items[4].startswith("DIPA")
+
+
+class TestUntuk:
+    def test_extracts_numbered_untuk_bullets(self) -> None:
+        text = (
+            "DIPERINTAHKAN\n"
+            "Kepada\n"
+            "Untuk\n"
+            "1.\n"
+            "melaksanakan tugas A;\n"
+            "2.\n"
+            "melaksanakan tugas B;\n"
+            "Selesai.\n"
+        )
+        items = find_untuk_list(text)
+        assert len(items) == 2
+        assert items[0] == "melaksanakan tugas A;"
+        assert items[1] == "melaksanakan tugas B;"
+
+    def test_returns_empty_when_section_missing(self) -> None:
+        assert find_untuk_list("no untuk section") == []
+
+    def test_stops_at_dikeluarkan(self) -> None:
+        text = "Untuk\n1. tugas A;\nDikeluarkan di Cimahi\n2. should not be captured\n"
+        items = find_untuk_list(text)
+        assert items == ["tugas A;"]
+

 class TestSignatory:
    def test_extracts_last_nrp(self) -> None:
--- a/tests/unit/test_validators.py
+++ b/tests/unit/test_validators.py
@@ -62,6 +62,20 @@ class TestPersonnelValidator:
        entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry)

+    def test_row_with_only_name_is_flagged_incomplete(self) -> None:
+        # A row that captured only `nama` (no pangkat AND no nrp) is the
+        # signature of mis-aligned table extraction. Must be flagged so
+        # the operator routes the document to needs_review.
+        entry = PersonnelEntry(nama="LEAKED FROM SOMEWHERE")
+        flags = validate_personnel_entry(entry)
+        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in flags
+
+    def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None:
+        # Having pangkat without NRP is suboptimal but still identifies a
+        # rank, so we don't raise the structural-incompleteness flag.
+        entry = PersonnelEntry(pangkat="AKP", nama="Test")
+        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in validate_personnel_entry(entry)
+

 class TestHeaderValidator:
    def test_complete_header_no_flags(self) -> None: