Fix personnel extraction + header bugs on real Polres Cimahi sprint
This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF, plus two related hardening changes (items 5 and 6):
1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
INDONESIA' letterhead line instead of the most-specific issuing unit
(e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
scan for each level independently and return the deepest available.
2. find_dasar_list dropped numbered items when OCR put the marker on
its own line ("1.\n Undang-Undang ..."). Refactored into
_collect_numbered_section that buffers a bare-number line and uses
the next non-empty line as the body. Also reused for the new
find_untuk_list which extracts the previously-empty 'untuk' bullets.
3. find_perihal returned None for documents that use 'Pertimbangan'
(very common in Polres-level sprint), forcing the LLM to guess.
Added a regex fallback that picks up the first line under a
'Pertimbangan' label so we keep extraction deterministic.
4. Personnel rows were emitted with only nama populated when
PP-Structure detected a table but the column mapper degraded.
Added a text-based fallback (extract_personnel_from_text) that
scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when
the PP-Structure result has fewer than 30% rank/NRP-bearing rows.
When the fallback is used, the new PERSONNEL_TEXT_FALLBACK flag is raised so the result gets reviewed.
5. Validation now flags rows with neither pangkat nor nrp as
INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
even when individual nrp/pangkat checks pass on empty values.
6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).
Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
118
tests/unit/test_personnel_text_fallback.py
Normal file
118
tests/unit/test_personnel_text_fallback.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Tests for the text-based personnel fallback extractor.
|
||||
|
||||
Driven by the real Polres Cimahi sprint document where PP-Structure
|
||||
produced 24 rows with only ``nama`` populated. The fallback should
|
||||
recover at least the rank + NRP for every row.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ocr_sprint.pipeline.extract.personnel_text import (
|
||||
extract_personnel_from_text,
|
||||
is_low_quality,
|
||||
)
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
# Sample text fed to extract_personnel_from_text by the tests in this file.
# It contains a table title, the column-header words (NO, NAMA,
# PANGKAT / NRP, JABATAN, KET) and four numbered personnel rows (1, 2, 3
# and 8), each spread over several lines: name, "<rank> / <NRP>", then
# position lines. The last row's rank line is "BRIGPOL /96030446", i.e. the
# BRIGPOL variant with no space after the slash.
_CIMAHI_FIXTURE = """\
|
||||
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
|
||||
NO
|
||||
NAMA
|
||||
PANGKAT / NRP
|
||||
JABATAN
|
||||
KET
|
||||
BAUR SKCK SAT
|
||||
1.
|
||||
SRI WAHYUNI
|
||||
AIPTU / 75070328
|
||||
INTELKAM POLRES
|
||||
CIMAHI
|
||||
BA PELAKSANA SKCK
|
||||
2.
|
||||
CITRA DWI PUTRI R
|
||||
BRIPTU / 95070659
|
||||
SAT INTELKAM
|
||||
POLRES CIMAHI
|
||||
BA PELAKSANA SKCK
|
||||
3.
|
||||
AGUNG LUKMAN AL
|
||||
BRIPTU / 99030245
|
||||
SAT INTELKAM
|
||||
POLRES CIMAHI
|
||||
BA POLSEK
|
||||
8.
|
||||
ARIEF SYAHRUL ZAMAN
|
||||
BRIGPOL /96030446
|
||||
MARGAASIH
|
||||
"""
|
||||
|
||||
|
||||
class TestExtractPersonnelFromText:
    """Checks ``extract_personnel_from_text`` on the Cimahi fixture and on edge inputs."""

    def test_extracts_rank_nrp_and_name(self) -> None:
        """The fixture yields four rows; the first has rank, NRP and name."""
        entries = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert len(entries) == 4
        head = entries[0]
        assert head.pangkat == "AIPTU"
        assert head.nrp == "75070328"
        assert head.nama == "SRI WAHYUNI"

    def test_normalizes_brigpol_to_brigadir(self) -> None:
        """The final fixture row reports the canonical rank, not the raw variant."""
        tail = extract_personnel_from_text(_CIMAHI_FIXTURE)[-1]
        # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'.
        assert tail.pangkat == "BRIGADIR"
        assert tail.nrp == "96030446"
        assert tail.nama == "ARIEF SYAHRUL ZAMAN"

    def test_skips_header_lines_as_names(self) -> None:
        """Column-header words never end up in the ``nama`` field."""
        header_words = {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}
        extracted_names = [
            entry.nama for entry in extract_personnel_from_text(_CIMAHI_FIXTURE)
        ]
        # No row should ever have a column-header word as nama.
        for word in header_words:
            assert word not in extracted_names

    def test_jabatan_collected_from_following_lines(self) -> None:
        """The first row's position text is populated and mentions INTELKAM."""
        jabatan = extract_personnel_from_text(_CIMAHI_FIXTURE)[0].jabatan_dinas
        assert jabatan is not None
        assert "INTELKAM" in jabatan

    def test_empty_text_returns_empty(self) -> None:
        """An empty string produces an empty result list."""
        assert extract_personnel_from_text("") == []

    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
        """Plain prose with neither rank nor NRP produces no rows."""
        result = extract_personnel_from_text(
            "Just a paragraph with no rank or NRP at all.\nAnother line."
        )
        assert result == []

    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
        """A bare 8-digit number is not enough to create a row."""
        # NRP without a recognised rank token must not produce a row.
        result = extract_personnel_from_text("Some line\n12345678\nanother line")
        assert result == []

    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
        """An unrecognised rank word next to an 8-digit number is rejected."""
        # A "rank-shaped" word that isn't in the master list must not yield a row.
        result = extract_personnel_from_text(
            "Some line\nFAKERANK / 12345678\nanother line"
        )
        assert result == []
|
||||
|
||||
|
||||
class TestIsLowQuality:
    """Checks the ``is_low_quality`` verdict for different mixes of personnel rows."""

    def test_empty_list_is_low_quality(self) -> None:
        """No rows at all counts as low quality."""
        verdict = is_low_quality([])
        assert verdict is True

    def test_all_rows_with_only_name_is_low_quality(self) -> None:
        """Ten rows carrying nothing but a name count as low quality."""
        name_only = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)]
        verdict = is_low_quality(name_only)
        assert verdict is True

    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
        """Ten rows that all carry rank and NRP are not low quality."""
        complete = [
            PersonnelEntry(
                nama=f"NAMA {i}",
                pangkat="AIPTU",
                nrp=f"{10000000 + i:08d}",
            )
            for i in range(10)
        ]
        verdict = is_low_quality(complete)
        assert verdict is False

    def test_borderline_30_percent_threshold(self) -> None:
        """Three complete rows out of ten sits on the threshold and passes."""
        # 3 useful out of 10 = exactly 0.3, treated as not-low-quality.
        complete = [
            PersonnelEntry(
                nama=f"NAMA {i}",
                pangkat="AIPTU",
                nrp=f"{10000000 + i:08d}",
            )
            for i in range(3)
        ]
        name_only = [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)]
        verdict = is_low_quality(complete + name_only)
        assert verdict is False
|
||||
Reference in New Issue
Block a user