This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two related hardening changes (items 5-6):
1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
INDONESIA' letterhead line instead of the most-specific issuing unit
(e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
scan for each level independently and return the deepest available.
2. find_dasar_list dropped numbered items when OCR put the marker on
its own line ("1.\n Undang-Undang ..."). Refactored into
_collect_numbered_section that buffers a bare-number line and uses
the next non-empty line as the body. Also reused for the new
find_untuk_list which extracts the previously-empty 'untuk' bullets.
3. find_perihal returned None for documents that use 'Pertimbangan'
(very common in Polres-level sprint), forcing the LLM to guess.
Added a regex fallback that picks up the first line under a
'Pertimbangan' label so we keep extraction deterministic.
4. Personnel rows were emitted with only nama populated when
PP-Structure detected a table but the column mapper degraded.
Added a text-based fallback (extract_personnel_from_text) that
scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when
the PP-Structure result has fewer than 30% rank/NRP-bearing rows.
Use of the fallback is surfaced for review by raising the new PERSONNEL_TEXT_FALLBACK flag.
5. Validation now flags rows with neither pangkat nor nrp as
INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
even when individual nrp/pangkat checks pass on empty values.
6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).
Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
119 lines
3.8 KiB
Python
119 lines
3.8 KiB
Python
"""Tests for the text-based personnel fallback extractor.
|
|
|
|
Driven by the real Polres Cimahi sprint document where PP-Structure
|
|
produced 24 rows with only ``nama`` populated. The fallback should
|
|
recover at least the rank + NRP for every row.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from ocr_sprint.pipeline.extract.personnel_text import (
|
|
extract_personnel_from_text,
|
|
is_low_quality,
|
|
)
|
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
|
|
|
# OCR-style text fixture with one table cell per line: a title line, the
# column headers, then numbered rows ("1.", "2.", "3.", "8.") where the name
# line is immediately followed by a "<RANK> / <NRP>" line and then position
# text. Row "8." deliberately uses the 'BRIGPOL /<nrp>' spelling (no space
# after the slash). The leading backslash keeps the string from starting with
# an empty line.
_CIMAHI_FIXTURE = """\
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
NO
NAMA
PANGKAT / NRP
JABATAN
KET
BAUR SKCK SAT
1.
SRI WAHYUNI
AIPTU / 75070328
INTELKAM POLRES
CIMAHI
BA PELAKSANA SKCK
2.
CITRA DWI PUTRI R
BRIPTU / 95070659
SAT INTELKAM
POLRES CIMAHI
BA PELAKSANA SKCK
3.
AGUNG LUKMAN AL
BRIPTU / 99030245
SAT INTELKAM
POLRES CIMAHI
BA POLSEK
8.
ARIEF SYAHRUL ZAMAN
BRIGPOL /96030446
MARGAASIH
"""
|
|
|
|
|
|
class TestExtractPersonnelFromText:
    """Exercise ``extract_personnel_from_text`` on the Cimahi fixture and on edge inputs."""

    def test_extracts_rank_nrp_and_name(self) -> None:
        """The fixture yields four rows; the first one has rank, NRP and name."""
        entries = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert len(entries) == 4
        head = entries[0]
        assert head.pangkat == "AIPTU"
        assert head.nrp == "75070328"
        assert head.nama == "SRI WAHYUNI"

    def test_normalizes_brigpol_to_brigadir(self) -> None:
        """The final fixture row is written 'BRIGPOL' and must come back as 'BRIGADIR'."""
        tail = extract_personnel_from_text(_CIMAHI_FIXTURE)[-1]
        # The source text spells the rank 'BRIGPOL' (no space); the canonical
        # form expected in the result is 'BRIGADIR'.
        assert tail.pangkat == "BRIGADIR"
        assert tail.nrp == "96030446"
        assert tail.nama == "ARIEF SYAHRUL ZAMAN"

    def test_skips_header_lines_as_names(self) -> None:
        """None of the column-header words may appear as a row's ``nama``."""
        header_words = {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}
        extracted_names = [
            entry.nama for entry in extract_personnel_from_text(_CIMAHI_FIXTURE)
        ]
        # Exact-match check: a header word must never be an entire name value.
        assert all(word not in extracted_names for word in header_words)

    def test_jabatan_collected_from_following_lines(self) -> None:
        """The first row's ``jabatan_dinas`` is populated and mentions INTELKAM."""
        first_row = extract_personnel_from_text(_CIMAHI_FIXTURE)[0]
        assert first_row.jabatan_dinas is not None
        assert "INTELKAM" in first_row.jabatan_dinas

    def test_empty_text_returns_empty(self) -> None:
        """An empty input string produces an empty list."""
        result = extract_personnel_from_text("")
        assert result == []

    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
        """Plain prose without any rank/NRP pair produces no rows."""
        prose = "Just a paragraph with no rank or NRP at all.\nAnother line."
        result = extract_personnel_from_text(prose)
        assert result == []

    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
        """A bare 8-digit number with no recognised rank token produces no rows."""
        bare_number = "Some line\n12345678\nanother line"
        result = extract_personnel_from_text(bare_number)
        assert result == []

    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
        """A rank-shaped word that is not a known rank must not yield a row."""
        bogus_rank = "Some line\nFAKERANK / 12345678\nanother line"
        result = extract_personnel_from_text(bogus_rank)
        assert result == []
|
|
|
|
|
|
class TestIsLowQuality:
    """Exercise ``is_low_quality`` on empty, name-only, complete and borderline row lists."""

    @staticmethod
    def _complete_row(idx: int) -> PersonnelEntry:
        """Build a row that has a name, the rank 'AIPTU' and an 8-digit NRP."""
        return PersonnelEntry(
            nama=f"NAMA {idx}",
            pangkat="AIPTU",
            nrp=f"{10000000 + idx:08d}",
        )

    def test_empty_list_is_low_quality(self) -> None:
        """An empty row list is reported as low quality."""
        no_rows: list[PersonnelEntry] = []
        assert is_low_quality(no_rows) is True

    def test_all_rows_with_only_name_is_low_quality(self) -> None:
        """Ten rows carrying only ``nama`` are reported as low quality."""
        name_only = []
        for idx in range(10):
            name_only.append(PersonnelEntry(nama=f"NAMA {idx}"))
        assert is_low_quality(name_only) is True

    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
        """Ten rows that all carry rank and NRP are not low quality."""
        complete = []
        for idx in range(10):
            complete.append(self._complete_row(idx))
        assert is_low_quality(complete) is False

    def test_borderline_30_percent_threshold(self) -> None:
        """Exactly 3 complete rows out of 10 (ratio 0.3) is not low quality."""
        # The first three rows carry rank + NRP; the remaining seven only a name.
        mixed = [
            self._complete_row(idx) if idx < 3 else PersonnelEntry(nama=f"NAMA {idx}")
            for idx in range(10)
        ]
        assert is_low_quality(mixed) is False
|