feat: implement robust personnel data extraction pipeline with text-based fallback and coordinate-aware processing
This commit is contained in:
75
tests/unit/test_ocr_layout.py
Normal file
75
tests/unit/test_ocr_layout.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""Tests for OCR layout reordering.

PaddleOCR emits text boxes in detection order, not visual reading order.
On dense table layouts (Polda Kalbar Akpol-panitia regression) this
interleaves columns within a row and breaks every downstream extractor
that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds
reading order from the bounding-box geometry.
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout
|
||||
|
||||
|
||||
def _box(x: float, y: float, w: float = 30, h: float = 15):
|
||||
return ((x, y), (x + w, y), (x + w, y + h), (x, y + h))
|
||||
|
||||
|
||||
def _make(text: str, x: float, y: float) -> OCRLine:
    """Build an ``OCRLine`` with confidence 1.0 whose box starts at ``(x, y)``.

    The box comes from ``_box`` with its default width and height, so every
    line created here has the same size and only its position varies.
    """
    return OCRLine(text=text, confidence=1.0, box=_box(x, y))
|
||||
|
||||
|
||||
class TestSortLinesByLayout:
    """Checks that ``sort_lines_by_layout`` yields row-by-row reading order."""

    def test_empty_returns_empty(self) -> None:
        """An empty input list yields an empty result."""
        assert sort_lines_by_layout([]) == []

    def test_already_sorted_is_stable(self) -> None:
        """Lines already in reading order come back in the same order."""
        lines = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)]
        assert [ln.text for ln in sort_lines_by_layout(lines)] == ["A", "B", "C"]

    def test_reorders_column_first_detection_to_row_first(self) -> None:
        """Column-first detection order is rewritten to row-first order."""
        # Simulate a 2-row, 3-col table where Paddle returned cells
        # column-first instead of row-first.
        lines = [
            _make("B1", 50, 10),
            _make("B2", 50, 30),
            _make("A1", 10, 10),
            _make("A2", 10, 30),
            _make("C1", 90, 10),
            _make("C2", 90, 30),
        ]
        result = [ln.text for ln in sort_lines_by_layout(lines)]
        assert result == ["A1", "B1", "C1", "A2", "B2", "C2"]

    def test_groups_slightly_misaligned_cells_into_one_band(self) -> None:
        """Boxes whose y differs by a pixel or two stay in one row."""
        # Real OCR boxes for a single visual row are rarely perfectly
        # y-aligned; we still want them grouped.
        lines = [
            _make("LEFT", 10, 10),
            _make("MID", 50, 12),  # 2px below LEFT — same row visually
            _make("RIGHT", 90, 11),
        ]
        result = [ln.text for ln in sort_lines_by_layout(lines)]
        assert result == ["LEFT", "MID", "RIGHT"]

    def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None:
        """Boxes 20 units apart vertically (height 15) form separate rows."""
        # Lines with a y gap larger than ~½ line-height must NOT collapse
        # into the same band.
        lines = [
            _make("ROW1A", 10, 10),
            _make("ROW1B", 50, 10),
            _make("ROW2A", 10, 30),  # gap of 20 vs height 15 → new band
            _make("ROW2B", 50, 30),
        ]
        result = [ln.text for ln in sort_lines_by_layout(lines)]
        assert result == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"]

    def test_ocrpage_text_uses_sorted_order(self) -> None:
        """``OCRPage.text`` joins lines in layout order, not input order."""
        lines = [
            _make("RIGHT", 90, 10),
            _make("LEFT", 10, 10),
            _make("BOTTOM", 10, 30),
        ]
        page = OCRPage(lines=lines)
        assert page.text == "LEFT\nRIGHT\nBOTTOM"
|
||||
@@ -8,11 +8,18 @@ recover at least the rank + NRP for every row.
|
||||
from __future__ import annotations
|
||||
|
||||
from ocr_sprint.pipeline.extract.personnel_text import (
|
||||
extract_personnel_from_ocr_lines,
|
||||
extract_personnel_from_text,
|
||||
is_low_quality,
|
||||
)
|
||||
from ocr_sprint.pipeline.ocr import OCRLine
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
|
||||
def _ocr_line(text: str, x: float, y: float, w: float = 80, h: float = 15) -> OCRLine:
    """Build an ``OCRLine`` with confidence 1.0 and an axis-aligned box.

    The box corners are listed starting at ``(x, y)`` and proceeding to
    ``(x + w, y)``, ``(x + w, y + h)`` and ``(x, y + h)``.

    Args:
        text: Recognised text for the line.
        x: X coordinate of the first corner.
        y: Y coordinate of the first corner.
        w: Box width (defaults to 80).
        h: Box height (defaults to 15).
    """
    right = x + w
    bottom = y + h
    corners = ((x, y), (right, y), (right, bottom), (x, bottom))
    return OCRLine(text=text, confidence=1.0, box=corners)
|
||||
|
||||
_CIMAHI_FIXTURE = """\
|
||||
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
|
||||
NO
|
||||
@@ -115,6 +122,86 @@ class TestExtractPersonnelFromText:
|
||||
names = [r.nama for r in rows]
|
||||
assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"]
|
||||
|
||||
def test_extracts_multiple_rows_when_collapsed_to_one_line(self) -> None:
    """Every rank+NRP pair on a single merged OCR line yields its own row."""
    # Polres Banjar regression: when PaddleOCR merges several table
    # rows onto a single OCR line, every rank+NRP pair on that line
    # must still produce a separate row. Previously per-line
    # ``re.search`` returned only the first match.
    text = (
        "DAFTAR NAMA INSTRUKTUR\n"
        "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS "
        "INSTRUKTUR LAT PRA OPS "
        "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 "
        "KASAT RESKRIM SDA "
        "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 "
        "KASATINTELKAM POLRES BANJAR SDA\n"
    )
    rows = extract_personnel_from_text(text)
    assert len(rows) == 3
    assert [r.pangkat for r in rows] == ["KOMPOL", "AKP", "AKP"]
    assert [r.nrp for r in rows] == ["70100418", "77020049", "84011113"]
    assert rows[0].nama == "CUCU JUHANA, A.K.S."
    # Rows 2 and 3 have the rank fused onto the name's trailing title, so
    # only the leading part of the name is pinned here.
    assert rows[1].nama is not None and "HERU SAMSUL BAHRI" in rows[1].nama
    assert rows[2].nama is not None and "YAYAN SOPIANA" in rows[2].nama
|
||||
|
||||
def test_extracts_multiple_rows_when_split_across_lines(self) -> None:
    """One rank+NRP pair per OCR line still yields one row per pair."""
    # Variant of the squished case where OCR keeps each table row's
    # rank+NRP pair on its own line, so no two pairs ever share a line.
    # Verifies the finditer-based path doesn't regress this layout.
    text = (
        "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS\n"
        "INSTRUKTUR LAT PRA OPS\n"
        "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 KASAT RESKRIM\n"
        "SDA\n"
        "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 KASATINTELKAM\n"
        "POLRES BANJAR SDA\n"
    )
    rows = extract_personnel_from_text(text)
    assert [r.pangkat for r in rows] == ["KOMPOL", "AKP", "AKP"]
    assert [r.nrp for r in rows] == ["70100418", "77020049", "84011113"]
    assert rows[0].nama == "CUCU JUHANA, A.K.S."
|
||||
|
||||
def test_extracts_rows_when_sprint_has_no_nrp_column(self) -> None:
    """Rank-only input (no NRP anywhere) still produces one row per rank."""
    # Polda Kalbar Akpol-panitia regression: sprint formats without
    # an NRP column (panitia, undangan templates) must still extract
    # rows via the rank-only Pass 3 path. Names span multiple OCR
    # lines (narrow column), and the multi-token rank "KOMBES POL"
    # is split across two lines.
    text = (
        "DAFTAR NAMA PANITIA\n"
        "NO\nNAMA\nPANGKAT\nJABATAN\nSTRUKTURAL\nDALAM SPRIN\nKET\n"
        "1\nF. GUNTUR\nSUNOTO, S.I.K.,\nM.H.\n"
        "KOMBES\nPOL\n"
        "KARO SDM\nPOLDA KALBAR\nKETUA\nPELAKSANA\n"
        "2\nJUDA TRISNO\nTAMPUBOLON,\nS.H., S.I.K., M.H.\n"
        "AKBP\n"
        "KABAGDALPERS\nRO SDM\nPOLDA KALBAR\nSEKRETARIS\n"
        "3\nPRAYITNO, S.H.,\nM.H.\n"
        "KOMPOL\n"
        "KASUBBAG DIAPERS\nANGGOTA\n"
    )
    rows = extract_personnel_from_text(text)
    assert len(rows) == 3
    assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "KOMPOL"]
    # All Pass 3 rows have nrp=None by design.
    assert all(r.nrp is None for r in rows)
    # Multi-line name fragments are joined with single spaces.
    assert rows[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
    assert rows[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
    assert rows[2].nama == "PRAYITNO, S.H., M.H."
    assert rows[0].jabatan_dinas is not None and "KARO SDM" in rows[0].jabatan_dinas
|
||||
|
||||
def test_pass3_does_not_run_when_pass1_succeeds(self) -> None:
    """Input with NRPs yields exactly one row per person, each with an NRP."""
    # If a sprint has NRPs (Pass 1 succeeds), Pass 3 must not fire
    # and produce duplicate/contaminating rows.
    text = (
        "1\nSRI WAHYUNI\nAIPTU / 75070328\nBAUR SKCK\n"
        "2\nCITRA DWI PUTRI\nBRIPTU / 95070659\nBA PELAKSANA\n"
    )
    rows = extract_personnel_from_text(text)
    assert len(rows) == 2
    assert all(r.nrp is not None for r in rows)
|
||||
|
||||
def test_still_blocks_bare_column_header_tokens(self) -> None:
|
||||
# Word-boundary fix must still reject the actual column-header
|
||||
# rows that motivated the blocklist in the first place.
|
||||
@@ -124,6 +211,94 @@ class TestExtractPersonnelFromText:
|
||||
assert rows[0].nama == "REAL NAME"
|
||||
|
||||
|
||||
class TestExtractPersonnelFromOcrLines:
    """Column-aware Pass 3 — Polda Kalbar Akpol-panitia regression.

    Verifies that bounding-box geometry preserves column boundaries on
    dense tables where text-only Pass 3 bleeds adjacent columns into
    nama/jabatan.
    """

    def _kalbar_lines(self) -> list[OCRLine]:
        """Return OCR lines for a three-row, five-column panitia table."""
        # Stylised Polda Kalbar layout: NO | NAMA | PANGKAT | STRUKTURAL | SPRIN
        # X columns: 10, 100, 250, 380, 520. Each row may have multi-line cells.
        return [
            # Row 1 — KOMBES POL spans two stacked OCR boxes
            _ocr_line("1", 10, 100),
            _ocr_line("F. GUNTUR", 100, 100),
            _ocr_line("SUNOTO, S.I.K.,", 100, 120),
            _ocr_line("M.H.", 100, 140),
            _ocr_line("KOMBES", 250, 100),
            _ocr_line("POL", 250, 120),
            _ocr_line("KARO SDM", 380, 100),
            _ocr_line("POLDA KALBAR", 380, 120),
            _ocr_line("KETUA", 520, 100),
            _ocr_line("PELAKSANA", 520, 120),
            # Row 2
            _ocr_line("2", 10, 200),
            _ocr_line("JUDA TRISNO", 100, 200),
            _ocr_line("TAMPUBOLON,", 100, 220),
            _ocr_line("S.H., S.I.K., M.H.", 100, 240),
            _ocr_line("AKBP", 250, 200),
            _ocr_line("KABAGDALPERS", 380, 200),
            _ocr_line("RO SDM", 380, 220),
            _ocr_line("POLDA KALBAR", 380, 240),
            _ocr_line("SEKRETARIS", 520, 200),
            # Row 9 — PNS PENATA TK I (multi-token rank stacked)
            _ocr_line("9", 10, 500),
            _ocr_line("FITRIANSYAH,", 100, 500),
            _ocr_line("S.E.", 100, 520),
            _ocr_line("PENATA", 250, 500),
            _ocr_line("TK I", 250, 520),
            _ocr_line("KAURKEU", 380, 500),
            _ocr_line("RO SDM", 380, 520),
            _ocr_line("POLDA KALBAR", 380, 540),
            _ocr_line("BENDAHARA", 520, 500),
        ]

    def test_extracts_three_rows(self) -> None:
        """One row per rank anchor, with stacked rank tokens joined."""
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert len(rows) == 3
        assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "PENATA TK I"]

    def test_nama_is_assembled_only_from_nama_column(self) -> None:
        """Names are built from the NAMA column fragments and nothing else."""
        # Each row's nama must contain *all* its multi-line fragments
        # and *only* its multi-line fragments — no bleed from struktural.
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert rows[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
        assert rows[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
        assert rows[2].nama == "FITRIANSYAH, S.E."

    def test_jabatan_split_into_struktural_and_sprint(self) -> None:
        """STRUKTURAL and DALAM SPRIN cells land in separate fields."""
        # The geometric column boundary must split STRUKTURAL (jabatan_dinas)
        # from DALAM SPRIN (jabatan_sprint).
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert rows[0].jabatan_dinas == "KARO SDM POLDA KALBAR"
        assert rows[0].jabatan_sprint == "KETUA PELAKSANA"
        assert rows[1].jabatan_dinas == "KABAGDALPERS RO SDM POLDA KALBAR"
        assert rows[1].jabatan_sprint == "SEKRETARIS"

    def test_returns_empty_when_no_rank_anchors(self) -> None:
        """Lines containing no rank token produce no rows."""
        lines = [
            _ocr_line("DAFTAR NAMA", 100, 50),
            _ocr_line("HEADER", 100, 100),
        ]
        assert extract_personnel_from_ocr_lines(lines) == []

    def test_returns_empty_for_empty_input(self) -> None:
        """An empty line list produces no rows."""
        assert extract_personnel_from_ocr_lines([]) == []

    def test_no_row_bleed_between_consecutive_rows(self) -> None:
        """Row 1's name fragments never appear in row 2's name."""
        # Row 1's trailing name fragments ("SUNOTO, S.I.K.," at y=120 and
        # "M.H." at y=140) sit BELOW the row's first rank box (y=100) but
        # still inside row 1's visual span. None of row 1's name may leak
        # into row 2's nama, which should start with "JUDA TRISNO".
        # "M.H." is not asserted on because row 2's own name ends with it.
        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert rows[1].nama is not None
        assert rows[1].nama.startswith("JUDA TRISNO")
        assert "GUNTUR" not in rows[1].nama
        assert "SUNOTO" not in rows[1].nama
|
||||
|
||||
|
||||
class TestIsLowQuality:
|
||||
def test_empty_list_is_low_quality(self) -> None:
|
||||
assert is_low_quality([]) is True
|
||||
|
||||
Reference in New Issue
Block a user