# Files
# OCR-SPRIN-SERVICE/tests/unit/test_personnel_text_fallback.py
#
# 325 lines
# 13 KiB
# Python

"""Tests for the text-based personnel fallback extractor.
Driven by the real Polres Cimahi sprint document where PP-Structure
produced 24 rows with only ``nama`` populated. The fallback should
recover at least the rank + NRP for every row.
"""
from __future__ import annotations
from ocr_sprint.pipeline.extract.personnel_text import (
extract_personnel_from_ocr_lines,
extract_personnel_from_text,
is_low_quality,
)
from ocr_sprint.pipeline.ocr import OCRLine
from ocr_sprint.schemas.personnel import PersonnelEntry
def _ocr_line(text: str, x: float, y: float, w: float = 80, h: float = 15) -> OCRLine:
    """Build a synthetic ``OCRLine`` with a rectangular box for geometry tests.

    The box lists four corners in this order: ``(x, y)``, ``(x + w, y)``,
    ``(x + w, y + h)``, ``(x, y + h)``. Confidence is fixed at ``1.0``.
    """
    right_edge = x + w
    far_edge = y + h
    corners = (
        (x, y),
        (right_edge, y),
        (right_edge, far_edge),
        (x, far_edge),
    )
    return OCRLine(text=text, confidence=1.0, box=corners)
# Plain-text OCR fixture for the text-extractor tests: a title line, the
# column headers (NO / NAMA / PANGKAT / NRP / JABATAN / KET) and four
# personnel rows numbered 1, 2, 3 and 8. Row 8 uses the "BRIGPOL /96030446"
# spelling (no space after the slash) that the rank-normalisation test
# relies on. Keep the text verbatim; the tests assert on exact values.
_CIMAHI_FIXTURE = """\
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
NO
NAMA
PANGKAT / NRP
JABATAN
KET
BAUR SKCK SAT
1.
SRI WAHYUNI
AIPTU / 75070328
INTELKAM POLRES
CIMAHI
BA PELAKSANA SKCK
2.
CITRA DWI PUTRI R
BRIPTU / 95070659
SAT INTELKAM
POLRES CIMAHI
BA PELAKSANA SKCK
3.
AGUNG LUKMAN AL
BRIPTU / 99030245
SAT INTELKAM
POLRES CIMAHI
BA POLSEK
8.
ARIEF SYAHRUL ZAMAN
BRIGPOL /96030446
MARGAASIH
"""
class TestExtractPersonnelFromText:
    """Exercise ``extract_personnel_from_text`` on plain OCR text.

    Covers rank + NRP rows from the Cimahi fixture, several rows collapsed
    onto one OCR line, sprint layouts without an NRP column, and the
    column-header blocklist applied to candidate names.
    """

    def test_extracts_rank_nrp_and_name(self) -> None:
        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
        # The fixture holds four numbered personnel rows (1, 2, 3 and 8).
        assert len(rows) == 4
        first = rows[0]
        assert first.pangkat == "AIPTU"
        assert first.nrp == "75070328"
        assert first.nama == "SRI WAHYUNI"

    def test_normalizes_brigpol_to_brigadir(self) -> None:
        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
        last = rows[-1]
        # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'.
        assert last.pangkat == "BRIGADIR"
        assert last.nrp == "96030446"
        assert last.nama == "ARIEF SYAHRUL ZAMAN"

    def test_skips_header_lines_as_names(self) -> None:
        # No row should ever have a column-header word as nama.
        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
        # Guard against a vacuous pass: with zero rows the blocklist check
        # below would succeed without exercising anything.
        assert rows
        names = {r.nama for r in rows}
        leaked = {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"} & names
        assert not leaked

    def test_jabatan_collected_from_following_lines(self) -> None:
        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert rows[0].jabatan_dinas is not None
        assert "INTELKAM" in rows[0].jabatan_dinas

    def test_empty_text_returns_empty(self) -> None:
        assert extract_personnel_from_text("") == []

    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
        text = "Just a paragraph with no rank or NRP at all.\nAnother line."
        assert extract_personnel_from_text(text) == []

    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
        # NRP without a recognised rank token must not produce a row.
        text = "Some line\n12345678\nanother line"
        assert extract_personnel_from_text(text) == []

    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
        # A "rank-shaped" word that isn't in the master list must not yield a row.
        text = "Some line\nFAKERANK / 12345678\nanother line"
        assert extract_personnel_from_text(text) == []

    def test_does_not_drop_indonesian_names_starting_with_no_or_ket(self) -> None:
        # Regression: 'NO' / 'KET' are legitimate column header tokens but
        # also prefix common Indonesian names (KETUT, NOVA, NOOR). The
        # blocklist must use word boundaries, not a raw startswith check.
        text = (
            "DAFTAR PERSONIL\n"
            "1.\n"
            "KETUT WARDANA\n"
            "AIPTU / 11111111\n"
            "JABATAN A\n"
            "2.\n"
            "NOVA SARI\n"
            "BRIPTU / 22222222\n"
            "JABATAN B\n"
            "3.\n"
            "NOOR HIDAYAT\n"
            "BRIPDA / 33333333\n"
            "JABATAN C\n"
        )
        rows = extract_personnel_from_text(text)
        names = [r.nama for r in rows]
        assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"]

    def test_extracts_multiple_rows_when_collapsed_to_one_line(self) -> None:
        # Polres Banjar regression: when PaddleOCR merges several table
        # rows onto a single OCR line, every rank+NRP pair on that line
        # must still produce a separate row. Previously per-line
        # ``re.search`` returned only the first match.
        text = (
            "DAFTAR NAMA INSTRUKTUR\n"
            "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS "
            "INSTRUKTUR LAT PRA OPS "
            "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 "
            "KASAT RESKRIM SDA "
            "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 "
            "KASATINTELKAM POLRES BANJAR SDA\n"
        )
        rows = extract_personnel_from_text(text)
        assert len(rows) == 3
        assert [r.pangkat for r in rows] == ["KOMPOL", "AKP", "AKP"]
        assert [r.nrp for r in rows] == ["70100418", "77020049", "84011113"]
        assert rows[0].nama == "CUCU JUHANA, A.K.S."
        assert rows[1].nama is not None and "HERU SAMSUL BAHRI" in rows[1].nama
        assert rows[2].nama is not None and "YAYAN SOPIANA" in rows[2].nama

    def test_extracts_multiple_rows_when_split_across_lines(self) -> None:
        # Variant of the squished case where the same rows are spread over
        # several OCR lines, so no single line carries more than one
        # rank+NRP pair. Verifies that the finditer-based path doesn't
        # regress this layout.
        text = (
            "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS\n"
            "INSTRUKTUR LAT PRA OPS\n"
            "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 KASAT RESKRIM\n"
            "SDA\n"
            "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 KASATINTELKAM\n"
            "POLRES BANJAR SDA\n"
        )
        rows = extract_personnel_from_text(text)
        # Explicit count first, mirroring the collapsed-line variant, so a
        # wrong row count fails before the per-field comparisons.
        assert len(rows) == 3
        assert [r.pangkat for r in rows] == ["KOMPOL", "AKP", "AKP"]
        assert [r.nrp for r in rows] == ["70100418", "77020049", "84011113"]
        assert rows[0].nama == "CUCU JUHANA, A.K.S."

    def test_extracts_rows_when_sprint_has_no_nrp_column(self) -> None:
        # Polda Kalbar Akpol-panitia regression: sprint formats without
        # an NRP column (panitia, undangan templates) must still extract
        # rows via the rank-only Pass 3 path. Names span multiple OCR
        # lines (narrow column), and the multi-token rank "KOMBES POL"
        # is split across two lines.
        text = (
            "DAFTAR NAMA PANITIA\n"
            "NO\nNAMA\nPANGKAT\nJABATAN\nSTRUKTURAL\nDALAM SPRIN\nKET\n"
            "1\nF. GUNTUR\nSUNOTO, S.I.K.,\nM.H.\n"
            "KOMBES\nPOL\n"
            "KARO SDM\nPOLDA KALBAR\nKETUA\nPELAKSANA\n"
            "2\nJUDA TRISNO\nTAMPUBOLON,\nS.H., S.I.K., M.H.\n"
            "AKBP\n"
            "KABAGDALPERS\nRO SDM\nPOLDA KALBAR\nSEKRETARIS\n"
            "3\nPRAYITNO, S.H.,\nM.H.\n"
            "KOMPOL\n"
            "KASUBBAG DIAPERS\nANGGOTA\n"
        )
        rows = extract_personnel_from_text(text)
        assert len(rows) == 3
        assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "KOMPOL"]
        # All Pass 3 rows have nrp=None by design.
        assert all(r.nrp is None for r in rows)
        assert rows[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
        assert rows[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
        assert rows[2].nama == "PRAYITNO, S.H., M.H."
        assert rows[0].jabatan_dinas is not None and "KARO SDM" in rows[0].jabatan_dinas

    def test_pass3_does_not_run_when_pass1_succeeds(self) -> None:
        # If a sprint has NRPs (Pass 1 succeeds), Pass 3 must not fire
        # and produce duplicate/contaminating rows.
        text = (
            "1\nSRI WAHYUNI\nAIPTU / 75070328\nBAUR SKCK\n"
            "2\nCITRA DWI PUTRI\nBRIPTU / 95070659\nBA PELAKSANA\n"
        )
        rows = extract_personnel_from_text(text)
        assert len(rows) == 2
        assert all(r.nrp is not None for r in rows)

    def test_still_blocks_bare_column_header_tokens(self) -> None:
        # Word-boundary fix must still reject the actual column-header
        # rows that motivated the blocklist in the first place.
        text = "NO\nNAMA\nPANGKAT / NRP\nJABATAN\nKET\n1.\nREAL NAME\nAIPTU / 12345678\n"
        rows = extract_personnel_from_text(text)
        assert len(rows) == 1
        assert rows[0].nama == "REAL NAME"
class TestExtractPersonnelFromOcrLines:
    """Column-aware Pass 3 — Polda Kalbar Akpol-panitia regression.

    Verifies that bounding-box geometry preserves column boundaries on
    dense tables where text-only Pass 3 bleeds adjacent columns into
    nama/jabatan.
    """

    def _kalbar_lines(self) -> list[OCRLine]:
        """Return synthetic OCR boxes for three table rows (numbered 1, 2, 9)."""
        # Stylised Polda Kalbar layout: NO | NAMA | PANGKAT | STRUKTURAL | SPRIN
        # X columns: 10, 100, 250, 380, 520. Each row may have multi-line cells.
        return [
            # Row 1 — KOMBES POL spans two stacked OCR boxes
            _ocr_line("1", 10, 100),
            _ocr_line("F. GUNTUR", 100, 100),
            _ocr_line("SUNOTO, S.I.K.,", 100, 120),
            _ocr_line("M.H.", 100, 140),
            _ocr_line("KOMBES", 250, 100),
            _ocr_line("POL", 250, 120),
            _ocr_line("KARO SDM", 380, 100),
            _ocr_line("POLDA KALBAR", 380, 120),
            _ocr_line("KETUA", 520, 100),
            _ocr_line("PELAKSANA", 520, 120),
            # Row 2
            _ocr_line("2", 10, 200),
            _ocr_line("JUDA TRISNO", 100, 200),
            _ocr_line("TAMPUBOLON,", 100, 220),
            _ocr_line("S.H., S.I.K., M.H.", 100, 240),
            _ocr_line("AKBP", 250, 200),
            _ocr_line("KABAGDALPERS", 380, 200),
            _ocr_line("RO SDM", 380, 220),
            _ocr_line("POLDA KALBAR", 380, 240),
            _ocr_line("SEKRETARIS", 520, 200),
            # Row 9 — PNS PENATA TK I (multi-token rank stacked)
            _ocr_line("9", 10, 500),
            _ocr_line("FITRIANSYAH,", 100, 500),
            _ocr_line("S.E.", 100, 520),
            _ocr_line("PENATA", 250, 500),
            _ocr_line("TK I", 250, 520),
            _ocr_line("KAURKEU", 380, 500),
            _ocr_line("RO SDM", 380, 520),
            _ocr_line("POLDA KALBAR", 380, 540),
            _ocr_line("BENDAHARA", 520, 500),
        ]

    def test_extracts_three_rows(self) -> None:
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert len(rows) == 3
        assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "PENATA TK I"]

    def test_nama_is_assembled_only_from_nama_column(self) -> None:
        # Each row's nama must contain *all* its multi-line fragments
        # and *only* its multi-line fragments — no bleed from struktural.
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert rows[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
        assert rows[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
        assert rows[2].nama == "FITRIANSYAH, S.E."

    def test_jabatan_split_into_struktural_and_sprint(self) -> None:
        # The geometric column boundary must split STRUKTURAL (jabatan_dinas)
        # from DALAM SPRIN (jabatan_sprint).
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert rows[0].jabatan_dinas == "KARO SDM POLDA KALBAR"
        assert rows[0].jabatan_sprint == "KETUA PELAKSANA"
        assert rows[1].jabatan_dinas == "KABAGDALPERS RO SDM POLDA KALBAR"
        assert rows[1].jabatan_sprint == "SEKRETARIS"

    def test_returns_empty_when_no_rank_anchors(self) -> None:
        lines = [
            _ocr_line("DAFTAR NAMA", 100, 50),
            _ocr_line("HEADER", 100, 100),
        ]
        assert extract_personnel_from_ocr_lines(lines) == []

    def test_returns_empty_for_empty_input(self) -> None:
        assert extract_personnel_from_ocr_lines([]) == []

    def test_no_row_bleed_between_consecutive_rows(self) -> None:
        # Row 1's trailing name fragments ("SUNOTO, S.I.K.," at y=120 and
        # "M.H." at y=140) sit at or below its rank boxes ("KOMBES" at
        # y=100, "POL" at y=120) but still belong to row 1. None of them
        # may leak into row 2's nama, which should start with "JUDA TRISNO".
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert rows[1].nama is not None
        assert rows[1].nama.startswith("JUDA TRISNO")
        assert "GUNTUR" not in rows[1].nama
        assert "SUNOTO" not in rows[1].nama
        # Row 2's own name legitimately ends with "M.H."; a second
        # occurrence would be row 1's lowest fragment leaking in.
        assert rows[1].nama.count("M.H.") == 1
class TestIsLowQuality:
    """Checks ``is_low_quality`` on empty, name-only, complete and mixed row lists."""

    def test_empty_list_is_low_quality(self) -> None:
        """An empty row list is reported as low quality."""
        no_rows = []
        assert is_low_quality(no_rows) is True

    def test_all_rows_with_only_name_is_low_quality(self) -> None:
        """Ten rows carrying only ``nama`` are reported as low quality."""
        name_only = []
        for idx in range(10):
            name_only.append(PersonnelEntry(nama=f"NAMA {idx}"))
        assert is_low_quality(name_only) is True

    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
        """Ten rows that all carry rank and NRP are not low quality."""
        complete = []
        for idx in range(10):
            entry = PersonnelEntry(
                nama=f"NAMA {idx}",
                pangkat="AIPTU",
                nrp=f"{10000000 + idx:08d}",
            )
            complete.append(entry)
        assert is_low_quality(complete) is False

    def test_borderline_30_percent_threshold(self) -> None:
        """Three complete rows out of ten sit on the threshold and pass."""
        # 3 useful out of 10 = exactly 0.3, treated as not-low-quality.
        mixed = []
        for idx in range(10):
            if idx < 3:
                # Leading three rows carry rank and NRP.
                entry = PersonnelEntry(
                    nama=f"NAMA {idx}",
                    pangkat="AIPTU",
                    nrp=f"{10000000 + idx:08d}",
                )
            else:
                # Remaining seven rows only carry a name.
                entry = PersonnelEntry(nama=f"NAMA {idx}")
            mixed.append(entry)
        assert is_low_quality(mixed) is False