feat: implement robust personnel data extraction pipeline with text-based fallback and coordinate-aware processing

This commit is contained in:
Adriankf59
2026-04-26 17:16:47 +07:00
parent dbcf480130
commit 002821ca07
20 changed files with 3326 additions and 20 deletions

View File

@@ -0,0 +1,75 @@
"""Tests for OCR layout reordering.
PaddleOCR emits text boxes in detection order, not visual reading order.
On dense table layouts (Polda Kalbar Akpol-panitia regression) this
interleaves columns within a row and breaks every downstream extractor
that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds
reading order from the bounding-box geometry.
"""
from __future__ import annotations
from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout
def _box(x: float, y: float, w: float = 30, h: float = 15):
return ((x, y), (x + w, y), (x + w, y + h), (x, y + h))
def _make(text: str, x: float, y: float) -> OCRLine:
    """Build a confidence-1.0 ``OCRLine`` whose default-sized box is anchored at (x, y)."""
    corners = _box(x, y)
    return OCRLine(text=text, confidence=1.0, box=corners)
class TestSortLinesByLayout:
    """Checks that ``sort_lines_by_layout`` rebuilds row-first reading order."""

    @staticmethod
    def _sorted_texts(lines):
        """Return the texts of ``lines`` in the order ``sort_lines_by_layout`` yields them."""
        return [entry.text for entry in sort_lines_by_layout(lines)]

    def test_empty_returns_empty(self) -> None:
        no_lines = []
        assert sort_lines_by_layout(no_lines) == []

    def test_already_sorted_is_stable(self) -> None:
        in_reading_order = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)]
        assert self._sorted_texts(in_reading_order) == ["A", "B", "C"]

    def test_reorders_column_first_detection_to_row_first(self) -> None:
        # A 2-row, 3-col table whose cells arrive column-by-column
        # (B, then A, then C) instead of row-by-row.
        column_first = [
            _make("B1", 50, 10),
            _make("B2", 50, 30),
            _make("A1", 10, 10),
            _make("A2", 10, 30),
            _make("C1", 90, 10),
            _make("C2", 90, 30),
        ]
        row_first = ["A1", "B1", "C1", "A2", "B2", "C2"]
        assert self._sorted_texts(column_first) == row_first

    def test_groups_slightly_misaligned_cells_into_one_band(self) -> None:
        # Boxes belonging to one visual row are rarely perfectly
        # y-aligned in real OCR output; they must still share a band.
        jittered_row = [
            _make("LEFT", 10, 10),
            _make("MID", 50, 12),  # 2px below LEFT — same row visually
            _make("RIGHT", 90, 11),
        ]
        assert self._sorted_texts(jittered_row) == ["LEFT", "MID", "RIGHT"]

    def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None:
        # A y gap larger than ~½ line-height must start a new band
        # rather than collapsing both rows together.
        two_rows = [
            _make("ROW1A", 10, 10),
            _make("ROW1B", 50, 10),
            _make("ROW2A", 10, 30),  # gap of 20 vs height 15 → new band
            _make("ROW2B", 50, 30),
        ]
        assert self._sorted_texts(two_rows) == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"]

    def test_ocrpage_text_uses_sorted_order(self) -> None:
        detection_order = [
            _make("RIGHT", 90, 10),
            _make("LEFT", 10, 10),
            _make("BOTTOM", 10, 30),
        ]
        assert OCRPage(lines=detection_order).text == "LEFT\nRIGHT\nBOTTOM"

View File

@@ -8,11 +8,18 @@ recover at least the rank + NRP for every row.
from __future__ import annotations
from ocr_sprint.pipeline.extract.personnel_text import (
extract_personnel_from_ocr_lines,
extract_personnel_from_text,
is_low_quality,
)
from ocr_sprint.pipeline.ocr import OCRLine
from ocr_sprint.schemas.personnel import PersonnelEntry
def _ocr_line(text: str, x: float, y: float, w: float = 80, h: float = 15) -> OCRLine:
    """Build a confidence-1.0 ``OCRLine`` with a ``w`` x ``h`` box anchored at (x, y)."""
    right, bottom = x + w, y + h
    corners = ((x, y), (right, y), (right, bottom), (x, bottom))
    return OCRLine(text=text, confidence=1.0, box=corners)
_CIMAHI_FIXTURE = """\
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
NO
@@ -115,6 +122,86 @@ class TestExtractPersonnelFromText:
names = [r.nama for r in rows]
assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"]
def test_extracts_multiple_rows_when_collapsed_to_one_line(self) -> None:
    # Polres Banjar regression: PaddleOCR can merge several table rows
    # onto one OCR line. Every rank+NRP pair on that line must still
    # yield its own row; a per-line ``re.search`` used to return only
    # the first match.
    merged_rows = (
        "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS "
        "INSTRUKTUR LAT PRA OPS "
        "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 "
        "KASAT RESKRIM SDA "
        "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 "
        "KASATINTELKAM POLRES BANJAR SDA\n"
    )
    text = "DAFTAR NAMA INSTRUKTUR\n" + merged_rows

    rows = extract_personnel_from_text(text)

    assert len(rows) == 3
    assert [r.pangkat for r in rows] == ["KOMPOL", "AKP", "AKP"]
    assert [r.nrp for r in rows] == ["70100418", "77020049", "84011113"]

    first, second, third = rows
    assert first.nama == "CUCU JUHANA, A.K.S."
    assert second.nama is not None
    assert "HERU SAMSUL BAHRI" in second.nama
    assert third.nama is not None
    assert "YAYAN SOPIANA" in third.nama
def test_extracts_multiple_rows_when_split_across_lines(self) -> None:
    # Variant of the squished case: here each rank+NRP pair sits on its
    # own OCR line, with the wrapped remainder of the row on the line
    # that follows. No line carries more than one rank+NRP pair; this
    # verifies the finditer-based path doesn't regress this layout.
    ocr_output = [
        "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS\n",
        "INSTRUKTUR LAT PRA OPS\n",
        "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 KASAT RESKRIM\n",
        "SDA\n",
        "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 KASATINTELKAM\n",
        "POLRES BANJAR SDA\n",
    ]

    rows = extract_personnel_from_text("".join(ocr_output))

    rank_and_nrp = [(r.pangkat, r.nrp) for r in rows]
    assert rank_and_nrp == [
        ("KOMPOL", "70100418"),
        ("AKP", "77020049"),
        ("AKP", "84011113"),
    ]
    assert rows[0].nama == "CUCU JUHANA, A.K.S."
def test_extracts_rows_when_sprint_has_no_nrp_column(self) -> None:
    # Polda Kalbar Akpol-panitia regression: sprint formats that have no
    # NRP column (panitia, undangan templates) must still produce rows
    # through the rank-only Pass 3 path. The narrow name column makes
    # names span several OCR lines, and the multi-token rank
    # "KOMBES POL" arrives split across two lines.
    fragments = [
        "DAFTAR NAMA PANITIA\n",
        "NO\nNAMA\nPANGKAT\nJABATAN\nSTRUKTURAL\nDALAM SPRIN\nKET\n",
        "1\nF. GUNTUR\nSUNOTO, S.I.K.,\nM.H.\n",
        "KOMBES\nPOL\n",
        "KARO SDM\nPOLDA KALBAR\nKETUA\nPELAKSANA\n",
        "2\nJUDA TRISNO\nTAMPUBOLON,\nS.H., S.I.K., M.H.\n",
        "AKBP\n",
        "KABAGDALPERS\nRO SDM\nPOLDA KALBAR\nSEKRETARIS\n",
        "3\nPRAYITNO, S.H.,\nM.H.\n",
        "KOMPOL\n",
        "KASUBBAG DIAPERS\nANGGOTA\n",
    ]

    rows = extract_personnel_from_text("".join(fragments))

    assert len(rows) == 3
    assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "KOMPOL"]
    # Pass 3 rows carry nrp=None by design.
    assert not any(r.nrp is not None for r in rows)
    assert [r.nama for r in rows] == [
        "F. GUNTUR SUNOTO, S.I.K., M.H.",
        "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H.",
        "PRAYITNO, S.H., M.H.",
    ]
    first_dinas = rows[0].jabatan_dinas
    assert first_dinas is not None
    assert "KARO SDM" in first_dinas
def test_pass3_does_not_run_when_pass1_succeeds(self) -> None:
    # When the sprint carries NRPs, Pass 1 succeeds and Pass 3 must stay
    # out of the way — otherwise it would add duplicate or
    # contaminating rows.
    sprint_with_nrps = "".join(
        [
            "1\nSRI WAHYUNI\nAIPTU / 75070328\nBAUR SKCK\n",
            "2\nCITRA DWI PUTRI\nBRIPTU / 95070659\nBA PELAKSANA\n",
        ]
    )

    rows = extract_personnel_from_text(sprint_with_nrps)

    assert len(rows) == 2
    assert not any(r.nrp is None for r in rows)
def test_still_blocks_bare_column_header_tokens(self) -> None:
# Word-boundary fix must still reject the actual column-header
# rows that motivated the blocklist in the first place.
@@ -124,6 +211,94 @@ class TestExtractPersonnelFromText:
assert rows[0].nama == "REAL NAME"
class TestExtractPersonnelFromOcrLines:
    """Column-aware Pass 3 — Polda Kalbar Akpol-panitia regression.

    Checks that bounding-box geometry keeps column boundaries intact on
    dense tables, where the text-only Pass 3 bleeds neighbouring columns
    into nama/jabatan.
    """

    def _kalbar_lines(self) -> list[OCRLine]:
        # Stylised Polda Kalbar layout: NO | NAMA | PANGKAT | STRUKTURAL | SPRIN.
        # Each constant is the x position of one column; a cell may be
        # stacked over several OCR boxes (same x, increasing y).
        col_no, col_nama, col_pangkat, col_struktural, col_sprin = 10, 100, 250, 380, 520
        cells = [
            # Row 1 — KOMBES POL spans two stacked OCR boxes
            ("1", col_no, 100),
            ("F. GUNTUR", col_nama, 100),
            ("SUNOTO, S.I.K.,", col_nama, 120),
            ("M.H.", col_nama, 140),
            ("KOMBES", col_pangkat, 100),
            ("POL", col_pangkat, 120),
            ("KARO SDM", col_struktural, 100),
            ("POLDA KALBAR", col_struktural, 120),
            ("KETUA", col_sprin, 100),
            ("PELAKSANA", col_sprin, 120),
            # Row 2
            ("2", col_no, 200),
            ("JUDA TRISNO", col_nama, 200),
            ("TAMPUBOLON,", col_nama, 220),
            ("S.H., S.I.K., M.H.", col_nama, 240),
            ("AKBP", col_pangkat, 200),
            ("KABAGDALPERS", col_struktural, 200),
            ("RO SDM", col_struktural, 220),
            ("POLDA KALBAR", col_struktural, 240),
            ("SEKRETARIS", col_sprin, 200),
            # Row 9 — PNS PENATA TK I (multi-token rank stacked)
            ("9", col_no, 500),
            ("FITRIANSYAH,", col_nama, 500),
            ("S.E.", col_nama, 520),
            ("PENATA", col_pangkat, 500),
            ("TK I", col_pangkat, 520),
            ("KAURKEU", col_struktural, 500),
            ("RO SDM", col_struktural, 520),
            ("POLDA KALBAR", col_struktural, 540),
            ("BENDAHARA", col_sprin, 500),
        ]
        return [_ocr_line(text, x, y) for text, x, y in cells]

    def test_extracts_three_rows(self) -> None:
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert len(rows) == 3
        ranks = [r.pangkat for r in rows]
        assert ranks == ["KOMBES POL", "AKBP", "PENATA TK I"]

    def test_nama_is_assembled_only_from_nama_column(self) -> None:
        # A row's nama must hold *all* of its stacked fragments and
        # *nothing else* — no bleed from the struktural column.
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        expected_names = [
            "F. GUNTUR SUNOTO, S.I.K., M.H.",
            "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H.",
            "FITRIANSYAH, S.E.",
        ]
        for idx, want in enumerate(expected_names):
            assert rows[idx].nama == want

    def test_jabatan_split_into_struktural_and_sprint(self) -> None:
        # The geometric column boundary must separate STRUKTURAL
        # (jabatan_dinas) from DALAM SPRIN (jabatan_sprint).
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        first = rows[0]
        assert first.jabatan_dinas == "KARO SDM POLDA KALBAR"
        assert first.jabatan_sprint == "KETUA PELAKSANA"
        second = rows[1]
        assert second.jabatan_dinas == "KABAGDALPERS RO SDM POLDA KALBAR"
        assert second.jabatan_sprint == "SEKRETARIS"

    def test_returns_empty_when_no_rank_anchors(self) -> None:
        rankless = [
            _ocr_line("DAFTAR NAMA", 100, 50),
            _ocr_line("HEADER", 100, 100),
        ]
        assert extract_personnel_from_ocr_lines(rankless) == []

    def test_returns_empty_for_empty_input(self) -> None:
        assert extract_personnel_from_ocr_lines([]) == []

    def test_no_row_bleed_between_consecutive_rows(self) -> None:
        # Row 1's trailing name fragments ("SUNOTO, S.I.K.," and "M.H.")
        # sit BELOW its first rank line but inside row 1's visual span.
        # No part of row 1's name may leak into row 2's nama, which
        # should start with "JUDA TRISNO".
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        second_nama = rows[1].nama
        assert second_nama is not None
        assert second_nama.startswith("JUDA TRISNO")
        for leaked in ("GUNTUR", "SUNOTO"):
            assert leaked not in second_nama
class TestIsLowQuality:
def test_empty_list_is_low_quality(self) -> None:
assert is_low_quality([]) is True