"""Tests for the text-based personnel fallback extractor.

Driven by the real Polres Cimahi sprint document where PP-Structure
produced 24 rows with only ``nama`` populated. The fallback should
recover at least the rank + NRP for every row.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from ocr_sprint.pipeline.extract.personnel_text import (
|
|
extract_personnel_from_ocr_lines,
|
|
extract_personnel_from_text,
|
|
is_low_quality,
|
|
)
|
|
from ocr_sprint.pipeline.ocr import OCRLine
|
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
|
|
|
|
|
def _ocr_line(text: str, x: float, y: float, w: float = 80, h: float = 15) -> OCRLine:
    """Build an ``OCRLine`` with confidence 1.0 and an axis-aligned rectangular box.

    ``(x, y)`` is the first corner of the box; ``w`` and ``h`` extend it along
    x and y. Corners are listed starting at ``(x, y)``, then ``(x + w, y)``,
    ``(x + w, y + h)`` and finally ``(x, y + h)``.
    """
    right, bottom = x + w, y + h
    corners = ((x, y), (right, y), (right, bottom), (x, bottom))
    return OCRLine(text=text, confidence=1.0, box=corners)
|
|
|
|
# Excerpt of the Polres Cimahi personnel listing, one OCR line per text line.
# It contains the title, the column-header tokens, three "RANK / NRP" rows and
# one row whose rank is written "BRIGPOL" with no space after the slash.
# The leading backslash keeps the literal from starting with an empty line.
_CIMAHI_FIXTURE = """\
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
NO
NAMA
PANGKAT / NRP
JABATAN
KET
BAUR SKCK SAT
1.
SRI WAHYUNI
AIPTU / 75070328
INTELKAM POLRES
CIMAHI
BA PELAKSANA SKCK
2.
CITRA DWI PUTRI R
BRIPTU / 95070659
SAT INTELKAM
POLRES CIMAHI
BA PELAKSANA SKCK
3.
AGUNG LUKMAN AL
BRIPTU / 99030245
SAT INTELKAM
POLRES CIMAHI
BA POLSEK
8.
ARIEF SYAHRUL ZAMAN
BRIGPOL /96030446
MARGAASIH
"""
|
|
|
|
|
|
class TestExtractPersonnelFromText:
    """Behaviour of ``extract_personnel_from_text`` on plain OCR text."""

    def test_extracts_rank_nrp_and_name(self) -> None:
        """The Cimahi excerpt yields four rows; the first has rank, NRP and name."""
        entries = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert len(entries) == 4
        head = entries[0]
        assert head.pangkat == "AIPTU"
        assert head.nrp == "75070328"
        assert head.nama == "SRI WAHYUNI"

    def test_normalizes_brigpol_to_brigadir(self) -> None:
        """'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'."""
        tail = extract_personnel_from_text(_CIMAHI_FIXTURE)[-1]
        assert tail.pangkat == "BRIGADIR"
        assert tail.nrp == "96030446"
        assert tail.nama == "ARIEF SYAHRUL ZAMAN"

    def test_skips_header_lines_as_names(self) -> None:
        """No row should ever have a column-header word as nama."""
        header_words = {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}
        extracted_names = [
            entry.nama for entry in extract_personnel_from_text(_CIMAHI_FIXTURE)
        ]
        assert header_words.isdisjoint(extracted_names)

    def test_jabatan_collected_from_following_lines(self) -> None:
        """The first row's jabatan_dinas is populated and mentions INTELKAM."""
        jabatan = extract_personnel_from_text(_CIMAHI_FIXTURE)[0].jabatan_dinas
        assert jabatan is not None
        assert "INTELKAM" in jabatan

    def test_empty_text_returns_empty(self) -> None:
        """Empty input produces an empty list."""
        extracted = extract_personnel_from_text("")
        assert extracted == []

    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
        """Free-form prose with neither rank nor NRP produces no rows."""
        prose = "Just a paragraph with no rank or NRP at all.\nAnother line."
        extracted = extract_personnel_from_text(prose)
        assert extracted == []

    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
        """NRP without a recognised rank token must not produce a row."""
        lone_number = "Some line\n12345678\nanother line"
        extracted = extract_personnel_from_text(lone_number)
        assert extracted == []

    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
        """A "rank-shaped" word that isn't in the master list must not yield a row."""
        bogus_rank = "Some line\nFAKERANK / 12345678\nanother line"
        extracted = extract_personnel_from_text(bogus_rank)
        assert extracted == []

    def test_does_not_drop_indonesian_names_starting_with_no_or_ket(self) -> None:
        """Names that merely start with a header token must survive.

        Regression: 'NO' / 'KET' are legitimate column header tokens but
        also prefix common Indonesian names (KETUT, NOVA, NOOR). The
        blocklist must use word boundaries, not a raw startswith check.
        """
        document = (
            "DAFTAR PERSONIL\n"
            "1.\n"
            "KETUT WARDANA\n"
            "AIPTU / 11111111\n"
            "JABATAN A\n"
            "2.\n"
            "NOVA SARI\n"
            "BRIPTU / 22222222\n"
            "JABATAN B\n"
            "3.\n"
            "NOOR HIDAYAT\n"
            "BRIPDA / 33333333\n"
            "JABATAN C\n"
        )
        extracted_names = [entry.nama for entry in extract_personnel_from_text(document)]
        assert extracted_names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"]

    def test_extracts_multiple_rows_when_collapsed_to_one_line(self) -> None:
        """Several rank+NRP pairs on one OCR line each become their own row.

        Polres Banjar regression: when PaddleOCR merges several table
        rows onto a single OCR line, every rank+NRP pair on that line
        must still produce a separate row. Previously per-line
        ``re.search`` returned only the first match.
        """
        document = (
            "DAFTAR NAMA INSTRUKTUR\n"
            "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS "
            "INSTRUKTUR LAT PRA OPS "
            "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 "
            "KASAT RESKRIM SDA "
            "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 "
            "KASATINTELKAM POLRES BANJAR SDA\n"
        )
        entries = extract_personnel_from_text(document)
        assert len(entries) == 3
        assert [entry.pangkat for entry in entries] == ["KOMPOL", "AKP", "AKP"]
        assert [entry.nrp for entry in entries] == ["70100418", "77020049", "84011113"]
        assert entries[0].nama == "CUCU JUHANA, A.K.S."
        second_nama = entries[1].nama
        assert second_nama is not None
        assert "HERU SAMSUL BAHRI" in second_nama
        third_nama = entries[2].nama
        assert third_nama is not None
        assert "YAYAN SOPIANA" in third_nama

    def test_extracts_multiple_rows_when_split_across_lines(self) -> None:
        """Same rows as the collapsed case, but one table row per OCR line.

        Here no single line carries more than one rank+NRP pair; the test
        verifies the finditer-based path doesn't regress this layout.
        """
        document = (
            "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS\n"
            "INSTRUKTUR LAT PRA OPS\n"
            "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 KASAT RESKRIM\n"
            "SDA\n"
            "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 KASATINTELKAM\n"
            "POLRES BANJAR SDA\n"
        )
        entries = extract_personnel_from_text(document)
        assert [entry.pangkat for entry in entries] == ["KOMPOL", "AKP", "AKP"]
        assert [entry.nrp for entry in entries] == ["70100418", "77020049", "84011113"]
        assert entries[0].nama == "CUCU JUHANA, A.K.S."

    def test_extracts_rows_when_sprint_has_no_nrp_column(self) -> None:
        """Rank-only documents still yield rows, with ``nrp`` left as None.

        Polda Kalbar Akpol-panitia regression: sprint formats without
        an NRP column (panitia, undangan templates) must still extract
        rows via the rank-only Pass 3 path. Names span multiple OCR
        lines (narrow column), and the multi-token rank "KOMBES POL"
        is split across two lines.
        """
        document = (
            "DAFTAR NAMA PANITIA\n"
            "NO\nNAMA\nPANGKAT\nJABATAN\nSTRUKTURAL\nDALAM SPRIN\nKET\n"
            "1\nF. GUNTUR\nSUNOTO, S.I.K.,\nM.H.\n"
            "KOMBES\nPOL\n"
            "KARO SDM\nPOLDA KALBAR\nKETUA\nPELAKSANA\n"
            "2\nJUDA TRISNO\nTAMPUBOLON,\nS.H., S.I.K., M.H.\n"
            "AKBP\n"
            "KABAGDALPERS\nRO SDM\nPOLDA KALBAR\nSEKRETARIS\n"
            "3\nPRAYITNO, S.H.,\nM.H.\n"
            "KOMPOL\n"
            "KASUBBAG DIAPERS\nANGGOTA\n"
        )
        entries = extract_personnel_from_text(document)
        assert len(entries) == 3
        assert [entry.pangkat for entry in entries] == ["KOMBES POL", "AKBP", "KOMPOL"]
        # All Pass 3 rows have nrp=None by design.
        assert not any(entry.nrp is not None for entry in entries)
        assert entries[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
        assert entries[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
        assert entries[2].nama == "PRAYITNO, S.H., M.H."
        first_jabatan = entries[0].jabatan_dinas
        assert first_jabatan is not None
        assert "KARO SDM" in first_jabatan

    def test_pass3_does_not_run_when_pass1_succeeds(self) -> None:
        """With NRPs present, exactly the NRP-bearing rows come back.

        If a sprint has NRPs (Pass 1 succeeds), Pass 3 must not fire
        and produce duplicate/contaminating rows.
        """
        document = (
            "1\nSRI WAHYUNI\nAIPTU / 75070328\nBAUR SKCK\n"
            "2\nCITRA DWI PUTRI\nBRIPTU / 95070659\nBA PELAKSANA\n"
        )
        entries = extract_personnel_from_text(document)
        assert len(entries) == 2
        assert not any(entry.nrp is None for entry in entries)

    def test_still_blocks_bare_column_header_tokens(self) -> None:
        """Bare header rows stay rejected after the word-boundary fix.

        Word-boundary fix must still reject the actual column-header
        rows that motivated the blocklist in the first place.
        """
        document = "NO\nNAMA\nPANGKAT / NRP\nJABATAN\nKET\n1.\nREAL NAME\nAIPTU / 12345678\n"
        entries = extract_personnel_from_text(document)
        assert len(entries) == 1
        assert entries[0].nama == "REAL NAME"
|
|
|
|
|
|
class TestExtractPersonnelFromOcrLines:
    """Column-aware Pass 3 — Polda Kalbar Akpol-panitia regression.

    Verifies that bounding-box geometry preserves column boundaries on
    dense tables where text-only Pass 3 bleeds adjacent columns into
    nama/jabatan.
    """

    def _kalbar_lines(self) -> list[OCRLine]:
        """Return a stylised Polda Kalbar table as positioned OCR lines.

        Layout: NO | NAMA | PANGKAT | STRUKTURAL | SPRIN, at x = 10, 100,
        250, 380 and 520. Cells that wrap are stacked 20 units apart in y.
        """
        no_x, nama_x, pangkat_x, struktural_x, sprin_x = 10, 100, 250, 380, 520
        cells = [
            # Row 1 — KOMBES POL spans two stacked OCR boxes
            ("1", no_x, 100),
            ("F. GUNTUR", nama_x, 100),
            ("SUNOTO, S.I.K.,", nama_x, 120),
            ("M.H.", nama_x, 140),
            ("KOMBES", pangkat_x, 100),
            ("POL", pangkat_x, 120),
            ("KARO SDM", struktural_x, 100),
            ("POLDA KALBAR", struktural_x, 120),
            ("KETUA", sprin_x, 100),
            ("PELAKSANA", sprin_x, 120),
            # Row 2
            ("2", no_x, 200),
            ("JUDA TRISNO", nama_x, 200),
            ("TAMPUBOLON,", nama_x, 220),
            ("S.H., S.I.K., M.H.", nama_x, 240),
            ("AKBP", pangkat_x, 200),
            ("KABAGDALPERS", struktural_x, 200),
            ("RO SDM", struktural_x, 220),
            ("POLDA KALBAR", struktural_x, 240),
            ("SEKRETARIS", sprin_x, 200),
            # Row 9 — PNS PENATA TK I (multi-token rank stacked)
            ("9", no_x, 500),
            ("FITRIANSYAH,", nama_x, 500),
            ("S.E.", nama_x, 520),
            ("PENATA", pangkat_x, 500),
            ("TK I", pangkat_x, 520),
            ("KAURKEU", struktural_x, 500),
            ("RO SDM", struktural_x, 520),
            ("POLDA KALBAR", struktural_x, 540),
            ("BENDAHARA", sprin_x, 500),
        ]
        return [_ocr_line(text, x, y) for text, x, y in cells]

    def test_extracts_three_rows(self) -> None:
        """One row per numbered table row, with stacked rank tokens joined."""
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert len(entries) == 3
        ranks = [entry.pangkat for entry in entries]
        assert ranks == ["KOMBES POL", "AKBP", "PENATA TK I"]

    def test_nama_is_assembled_only_from_nama_column(self) -> None:
        """Each nama holds all of its own fragments and nothing else.

        Each row's nama must contain *all* its multi-line fragments
        and *only* its multi-line fragments — no bleed from struktural.
        """
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert entries[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
        assert entries[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
        assert entries[2].nama == "FITRIANSYAH, S.E."

    def test_jabatan_split_into_struktural_and_sprint(self) -> None:
        """STRUKTURAL and DALAM SPRIN land in separate fields.

        The geometric column boundary must split STRUKTURAL (jabatan_dinas)
        from DALAM SPRIN (jabatan_sprint).
        """
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        first = entries[0]
        assert first.jabatan_dinas == "KARO SDM POLDA KALBAR"
        assert first.jabatan_sprint == "KETUA PELAKSANA"
        second = entries[1]
        assert second.jabatan_dinas == "KABAGDALPERS RO SDM POLDA KALBAR"
        assert second.jabatan_sprint == "SEKRETARIS"

    def test_returns_empty_when_no_rank_anchors(self) -> None:
        """Lines containing no rank token produce no rows."""
        headers_only = [
            _ocr_line("DAFTAR NAMA", 100, 50),
            _ocr_line("HEADER", 100, 100),
        ]
        extracted = extract_personnel_from_ocr_lines(headers_only)
        assert extracted == []

    def test_returns_empty_for_empty_input(self) -> None:
        """An empty list of OCR lines produces an empty result."""
        extracted = extract_personnel_from_ocr_lines([])
        assert extracted == []

    def test_no_row_bleed_between_consecutive_rows(self) -> None:
        """Row 1's name fragments must not leak into row 2.

        Row 1's trailing name fragments ("SUNOTO, S.I.K.," and "M.H.") sit
        BELOW its first rank line but inside row 1's visual span. They must
        NOT leak into row 2's nama, which should start with "JUDA TRISNO".
        """
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        second_nama = entries[1].nama
        assert second_nama is not None
        assert second_nama.startswith("JUDA TRISNO")
        assert "GUNTUR" not in second_nama
        assert "SUNOTO" not in second_nama
|
|
|
|
|
|
class TestIsLowQuality:
    """Checks of ``is_low_quality`` against lists of ``PersonnelEntry`` rows."""

    def test_empty_list_is_low_quality(self) -> None:
        """No rows at all counts as low quality."""
        verdict = is_low_quality([])
        assert verdict is True

    def test_all_rows_with_only_name_is_low_quality(self) -> None:
        """Ten rows carrying only ``nama`` count as low quality."""
        name_only = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)]
        verdict = is_low_quality(name_only)
        assert verdict is True

    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
        """Ten rows that all carry rank and NRP are not low quality."""
        complete = []
        for i in range(10):
            complete.append(
                PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
            )
        verdict = is_low_quality(complete)
        assert verdict is False

    def test_borderline_30_percent_threshold(self) -> None:
        """3 useful out of 10 = exactly 0.3, treated as not-low-quality."""
        with_rank_nrp = [
            PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
            for i in range(3)
        ]
        name_only = [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)]
        mixed = with_rank_nrp + name_only
        assert is_low_quality(mixed) is False
|