feat: implement robust personnel data extraction pipeline with text-based fallback and coordinate-aware processing
This commit is contained in:
75
tests/unit/test_ocr_layout.py
Normal file
75
tests/unit/test_ocr_layout.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""Tests for OCR layout reordering.

PaddleOCR emits text boxes in detection order, not visual reading order.
On dense table layouts (Polda Kalbar Akpol-panitia regression) this
interleaves columns within a row and breaks every downstream extractor
that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds
reading order from the bounding-box geometry.
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout
|
||||
|
||||
|
||||
def _box(x: float, y: float, w: float = 30, h: float = 15):
|
||||
return ((x, y), (x + w, y), (x + w, y + h), (x, y + h))
|
||||
|
||||
|
||||
def _make(text: str, x: float, y: float) -> OCRLine:
    """Build an ``OCRLine`` fixture with the given text at position ``(x, y)``.

    The line gets a fixed confidence of 1.0 and a default-sized box from
    ``_box`` so that tests only have to vary the text and the position.

    Args:
        text: Text content of the line.
        x: Horizontal coordinate passed to ``_box``.
        y: Vertical coordinate passed to ``_box``.

    Returns:
        The constructed ``OCRLine``.
    """
    return OCRLine(text=text, confidence=1.0, box=_box(x, y))
|
||||
|
||||
|
||||
class TestSortLinesByLayout:
    """Checks that ``sort_lines_by_layout`` restores visual reading order.

    Fixtures are built with ``_make``; each box is 30 wide and 15 high
    (the ``_box`` defaults), so only the (x, y) positions vary between cases.
    """

    def test_empty_returns_empty(self) -> None:
        """An empty input list yields an empty list."""
        assert sort_lines_by_layout([]) == []

    def test_already_sorted_is_stable(self) -> None:
        """Lines already in row-first order come back in the same order."""
        lines = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)]
        assert [ln.text for ln in sort_lines_by_layout(lines)] == ["A", "B", "C"]

    def test_reorders_column_first_detection_to_row_first(self) -> None:
        """Cells supplied column-by-column are returned row-by-row."""
        # Simulate a 2-row, 3-col table where Paddle returned cells
        # column-first instead of row-first.
        lines = [
            _make("B1", 50, 10),
            _make("B2", 50, 30),
            _make("A1", 10, 10),
            _make("A2", 10, 30),
            _make("C1", 90, 10),
            _make("C2", 90, 30),
        ]
        result = [ln.text for ln in sort_lines_by_layout(lines)]
        assert result == ["A1", "B1", "C1", "A2", "B2", "C2"]

    def test_groups_slightly_misaligned_cells_into_one_band(self) -> None:
        """Boxes whose y differs by a pixel or two are ordered left to right."""
        # Real OCR boxes for a single visual row are rarely perfectly
        # y-aligned; we still want them grouped.
        lines = [
            _make("LEFT", 10, 10),
            _make("MID", 50, 12),  # 2px below LEFT — same row visually
            _make("RIGHT", 90, 11),
        ]
        result = [ln.text for ln in sort_lines_by_layout(lines)]
        assert result == ["LEFT", "MID", "RIGHT"]

    def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None:
        """Rows 20 units apart (box height 15) stay in separate bands."""
        # Lines with a y gap larger than ~½ line-height must NOT collapse
        # into the same band.
        lines = [
            _make("ROW1A", 10, 10),
            _make("ROW1B", 50, 10),
            _make("ROW2A", 10, 30),  # gap of 20 vs height 15 → new band
            _make("ROW2B", 50, 30),
        ]
        result = [ln.text for ln in sort_lines_by_layout(lines)]
        assert result == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"]

    def test_ocrpage_text_uses_sorted_order(self) -> None:
        """``OCRPage.text`` joins lines with newlines in layout order."""
        lines = [
            _make("RIGHT", 90, 10),
            _make("LEFT", 10, 10),
            _make("BOTTOM", 10, 30),
        ]
        page = OCRPage(lines=lines)
        assert page.text == "LEFT\nRIGHT\nBOTTOM"
|
||||
Reference in New Issue
Block a user