Devin Review correctly flagged that the bare "NO" and "KET" entries in the blocklist would silently drop common Indonesian names (KETUT, NOVA, NOOR, NORMAN, NOVIANTI, ...) because the check used startswith rather than a word boundary. Replaced the per-prefix loop with a single compiled regex anchored at ^ with a trailing \b, which still matches column headers like "NO" or "KET" on their own line but no longer rejects "NOOR HIDAYAT" or "KETUT WARDANA". Also fixes the same bug in _following_jabatan. Added two regression tests covering both directions: names starting with the offending tokens are kept, bare column headers still rejected. Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
150 lines
5.0 KiB
Python
150 lines
5.0 KiB
Python
"""Tests for the text-based personnel fallback extractor.

Driven by the real Polres Cimahi sprint document where PP-Structure
produced 24 rows with only ``nama`` populated. The fallback should
recover at least the rank + NRP for every row.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from ocr_sprint.pipeline.extract.personnel_text import (
|
|
extract_personnel_from_text,
|
|
is_low_quality,
|
|
)
|
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
|
|
|
# Plain-text sample of the Polres Cimahi personnel table used by the tests
# in this module: a title line, the column-header words, then numbered rows
# whose name, "RANK / NRP" and unit/position text each sit on their own line.
# The text is kept verbatim -- including the missing space in
# "BRIGPOL /96030446" and the jump from row 3 to row 8 -- and must not
# contain any extra separator lines, since every line is fed to the extractor.
_CIMAHI_FIXTURE = """\
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
NO
NAMA
PANGKAT / NRP
JABATAN
KET
BAUR SKCK SAT
1.
SRI WAHYUNI
AIPTU / 75070328
INTELKAM POLRES
CIMAHI
BA PELAKSANA SKCK
2.
CITRA DWI PUTRI R
BRIPTU / 95070659
SAT INTELKAM
POLRES CIMAHI
BA PELAKSANA SKCK
3.
AGUNG LUKMAN AL
BRIPTU / 99030245
SAT INTELKAM
POLRES CIMAHI
BA POLSEK
8.
ARIEF SYAHRUL ZAMAN
BRIGPOL /96030446
MARGAASIH
"""
|
|
|
|
|
|
class TestExtractPersonnelFromText:
    """Tests for ``extract_personnel_from_text`` on plain OCR text."""

    def test_extracts_rank_nrp_and_name(self) -> None:
        """The fixture yields four rows; the first carries rank, NRP and name."""
        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert len(rows) == 4
        first = rows[0]
        assert first.pangkat == "AIPTU"
        assert first.nrp == "75070328"
        assert first.nama == "SRI WAHYUNI"

    def test_normalizes_brigpol_to_brigadir(self) -> None:
        """The last fixture row, written 'BRIGPOL /96030446', is normalised."""
        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
        last = rows[-1]
        # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'.
        assert last.pangkat == "BRIGADIR"
        assert last.nrp == "96030446"
        assert last.nama == "ARIEF SYAHRUL ZAMAN"

    def test_skips_header_lines_as_names(self) -> None:
        """No extracted ``nama`` may be one of the table's header words."""
        # No row should ever have a column-header word as nama.
        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
        names = {r.nama for r in rows}
        # "NO" is included alongside the other header words: it is a header
        # line in the fixture and one of the two tokens the name-prefix
        # regression tests below are about.
        header_tokens = {"NO", "NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}
        leaked = header_tokens & names
        assert not leaked, f"header tokens extracted as names: {sorted(leaked)}"

    def test_jabatan_collected_from_following_lines(self) -> None:
        """The first row's ``jabatan_dinas`` is populated and mentions INTELKAM."""
        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert rows[0].jabatan_dinas is not None
        assert "INTELKAM" in rows[0].jabatan_dinas

    def test_empty_text_returns_empty(self) -> None:
        """An empty string produces an empty list."""
        assert extract_personnel_from_text("") == []

    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
        """Ordinary prose with no rank or NRP produces no rows."""
        text = "Just a paragraph with no rank or NRP at all.\nAnother line."
        assert extract_personnel_from_text(text) == []

    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
        """A lone 8-digit number is not enough to create a row."""
        # NRP without a recognised rank token must not produce a row.
        text = "Some line\n12345678\nanother line"
        assert extract_personnel_from_text(text) == []

    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
        """An unrecognised rank word followed by 8 digits is rejected."""
        # A "rank-shaped" word that isn't in the master list must not yield a row.
        text = "Some line\nFAKERANK / 12345678\nanother line"
        assert extract_personnel_from_text(text) == []

    def test_does_not_drop_indonesian_names_starting_with_no_or_ket(self) -> None:
        """Names that merely start with 'NO' or 'KET' are kept, in order."""
        # Regression: 'NO' / 'KET' are legitimate column header tokens but
        # also prefix common Indonesian names (KETUT, NOVA, NOOR). The
        # blocklist must use word boundaries, not a raw startswith check.
        text = (
            "DAFTAR PERSONIL\n"
            "1.\n"
            "KETUT WARDANA\n"
            "AIPTU / 11111111\n"
            "JABATAN A\n"
            "2.\n"
            "NOVA SARI\n"
            "BRIPTU / 22222222\n"
            "JABATAN B\n"
            "3.\n"
            "NOOR HIDAYAT\n"
            "BRIPDA / 33333333\n"
            "JABATAN C\n"
        )
        rows = extract_personnel_from_text(text)
        names = [r.nama for r in rows]
        assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"]

    def test_still_blocks_bare_column_header_tokens(self) -> None:
        """Bare header lines are still rejected; only the real name survives."""
        # Word-boundary fix must still reject the actual column-header
        # rows that motivated the blocklist in the first place.
        text = "NO\nNAMA\nPANGKAT / NRP\nJABATAN\nKET\n1.\nREAL NAME\nAIPTU / 12345678\n"
        rows = extract_personnel_from_text(text)
        assert len(rows) == 1
        assert rows[0].nama == "REAL NAME"
|
|
|
|
|
|
class TestIsLowQuality:
    """Tests for ``is_low_quality`` over lists of ``PersonnelEntry`` rows."""

    @staticmethod
    def _name_only(i: int) -> PersonnelEntry:
        """Build an entry that has only ``nama`` set."""
        return PersonnelEntry(nama=f"NAMA {i}")

    @staticmethod
    def _with_rank_and_nrp(i: int) -> PersonnelEntry:
        """Build an entry with name, rank and a distinct 8-digit NRP."""
        return PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")

    def test_empty_list_is_low_quality(self) -> None:
        """No rows at all counts as low quality."""
        verdict = is_low_quality([])
        assert verdict is True

    def test_all_rows_with_only_name_is_low_quality(self) -> None:
        """Ten name-only rows count as low quality."""
        name_only_rows = [self._name_only(i) for i in range(10)]
        verdict = is_low_quality(name_only_rows)
        assert verdict is True

    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
        """Ten rows that all carry rank and NRP are not low quality."""
        complete_rows = [self._with_rank_and_nrp(i) for i in range(10)]
        verdict = is_low_quality(complete_rows)
        assert verdict is False

    def test_borderline_30_percent_threshold(self) -> None:
        """Three complete rows out of ten sit on the threshold and pass."""
        # 3 useful out of 10 = exactly 0.3, treated as not-low-quality.
        mixed_rows = [self._with_rank_and_nrp(i) for i in range(3)]
        mixed_rows.extend(self._name_only(i + 3) for i in range(7))
        verdict = is_low_quality(mixed_rows)
        assert verdict is False
|