Files
OCR-SPRIN-SERVICE/tests/unit/test_personnel_text_fallback.py
Devin AI 58a2bf2648 Fix personnel extraction + header bugs on real Polres Cimahi sprint
This fixes 4 bugs (items 1-4) found on a real Polres Cimahi SPRIN PDF and
adds two related hardening changes (items 5-6):

1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
   INDONESIA' letterhead line instead of the most-specific issuing unit
   (e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
   scan for each level independently and return the deepest available.

2. find_dasar_list dropped numbered items when OCR put the marker on
   its own line ("1.\n Undang-Undang ..."). Refactored into
   _collect_numbered_section that buffers a bare-number line and uses
   the next non-empty line as the body. Also reused for the new
   find_untuk_list which extracts the previously-empty 'untuk' bullets.

3. find_perihal returned None for documents that use 'Pertimbangan'
   (very common in Polres-level sprint), forcing the LLM to guess.
   Added a regex fallback that picks up the first line under a
   'Pertimbangan' label so we keep extraction deterministic.

4. Personnel rows were emitted with only nama populated when
   PP-Structure detected a table but the column mapper degraded.
   Added a text-based fallback (extract_personnel_from_text) that
   scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when
   the PP-Structure result has fewer than 30% rank/NRP-bearing rows.
   Surfaced for review via the new PERSONNEL_TEXT_FALLBACK flag.

5. Validation now flags rows with neither pangkat nor nrp as
   INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
   even when individual nrp/pangkat checks pass on empty values.

6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).

Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-26 05:35:42 +00:00

119 lines
3.8 KiB
Python

"""Tests for the text-based personnel fallback extractor.
Driven by the real Polres Cimahi sprint document where PP-Structure
produced 24 rows with only ``nama`` populated. The fallback should
recover at least the rank + NRP for every row.
"""
from __future__ import annotations
from ocr_sprint.pipeline.extract.personnel_text import (
extract_personnel_from_text,
is_low_quality,
)
from ocr_sprint.schemas.personnel import PersonnelEntry
# OCR-style text used as input by the extractor tests in this module.
# Layout: a title line, the column-header words (NO / NAMA / PANGKAT / NRP /
# JABATAN / KET) on separate lines, then four personnel records numbered
# 1, 2, 3 and 8. Each record has a bare "<n>." marker line, a name line and a
# "<RANK> / <8-digit NRP>" line, surrounded by unit/position text.
# The last record is written "BRIGPOL /96030446" (no space after the slash).
_CIMAHI_FIXTURE = """\
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
NO
NAMA
PANGKAT / NRP
JABATAN
KET
BAUR SKCK SAT
1.
SRI WAHYUNI
AIPTU / 75070328
INTELKAM POLRES
CIMAHI
BA PELAKSANA SKCK
2.
CITRA DWI PUTRI R
BRIPTU / 95070659
SAT INTELKAM
POLRES CIMAHI
BA PELAKSANA SKCK
3.
AGUNG LUKMAN AL
BRIPTU / 99030245
SAT INTELKAM
POLRES CIMAHI
BA POLSEK
8.
ARIEF SYAHRUL ZAMAN
BRIGPOL /96030446
MARGAASIH
"""
class TestExtractPersonnelFromText:
    """Exercise ``extract_personnel_from_text`` on the fixture and on edge inputs."""

    def test_extracts_rank_nrp_and_name(self) -> None:
        """The fixture yields four rows; the first has rank, NRP and name set."""
        entries = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert len(entries) == 4

        head = entries[0]
        assert head.pangkat == "AIPTU"
        assert head.nrp == "75070328"
        assert head.nama == "SRI WAHYUNI"

    def test_normalizes_brigpol_to_brigadir(self) -> None:
        """The final fixture row is reported with the canonical rank."""
        tail = extract_personnel_from_text(_CIMAHI_FIXTURE)[-1]

        # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'.
        assert tail.pangkat == "BRIGADIR"
        assert tail.nrp == "96030446"
        assert tail.nama == "ARIEF SYAHRUL ZAMAN"

    def test_skips_header_lines_as_names(self) -> None:
        """No row may carry a column-header word as its ``nama``."""
        header_words = {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}
        extracted_names = {
            entry.nama for entry in extract_personnel_from_text(_CIMAHI_FIXTURE)
        }
        # An empty intersection means none of the header words leaked through.
        assert not (header_words & extracted_names)

    def test_jabatan_collected_from_following_lines(self) -> None:
        """The first row's ``jabatan_dinas`` is populated and mentions INTELKAM."""
        position = extract_personnel_from_text(_CIMAHI_FIXTURE)[0].jabatan_dinas
        assert position is not None
        assert "INTELKAM" in position

    def test_empty_text_returns_empty(self) -> None:
        """An empty string produces an empty list."""
        outcome = extract_personnel_from_text("")
        assert outcome == []

    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
        """Plain prose with no rank/NRP pair produces an empty list."""
        prose = "Just a paragraph with no rank or NRP at all.\nAnother line."
        outcome = extract_personnel_from_text(prose)
        assert outcome == []

    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
        """A lone 8-digit number is not enough to emit a row."""
        # NRP without a recognised rank token must not produce a row.
        lone_number = "Some line\n12345678\nanother line"
        outcome = extract_personnel_from_text(lone_number)
        assert outcome == []

    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
        """An unrecognised rank-like word next to an NRP is not enough either."""
        # A "rank-shaped" word that isn't in the master list must not yield a row.
        bogus_rank = "Some line\nFAKERANK / 12345678\nanother line"
        outcome = extract_personnel_from_text(bogus_rank)
        assert outcome == []
class TestIsLowQuality:
    """Exercise ``is_low_quality`` on empty, name-only, complete and mixed rows."""

    def test_empty_list_is_low_quality(self) -> None:
        """An empty row list is reported as low quality."""
        verdict = is_low_quality([])
        assert verdict is True

    def test_all_rows_with_only_name_is_low_quality(self) -> None:
        """Ten rows that only carry ``nama`` are reported as low quality."""
        name_only = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)]
        verdict = is_low_quality(name_only)
        assert verdict is True

    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
        """Ten rows that all carry rank and NRP are not low quality."""
        complete = [
            PersonnelEntry(
                nama=f"NAMA {i}",
                pangkat="AIPTU",
                nrp=f"{10000000 + i:08d}",
            )
            for i in range(10)
        ]
        verdict = is_low_quality(complete)
        assert verdict is False

    def test_borderline_30_percent_threshold(self) -> None:
        """Three complete rows out of ten sit on the threshold and still pass."""
        # 3 useful out of 10 = exactly 0.3, treated as not-low-quality.
        with_rank_and_nrp = [
            PersonnelEntry(
                nama=f"NAMA {i}",
                pangkat="AIPTU",
                nrp=f"{10000000 + i:08d}",
            )
            for i in range(3)
        ]
        name_only = [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)]
        mixed = [*with_rank_and_nrp, *name_only]
        assert is_low_quality(mixed) is False