76 lines
2.8 KiB
Python
76 lines
2.8 KiB
Python
"""Tests for OCR layout reordering.

PaddleOCR emits text boxes in detection order, not visual reading order.
On dense table layouts (Polda Kalbar Akpol-panitia regression) this
interleaves columns within a row and breaks every downstream extractor
that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds
reading order from the bounding-box geometry.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout
|
|
|
|
|
|
def _box(x: float, y: float, w: float = 30, h: float = 15):
|
|
return ((x, y), (x + w, y), (x + w, y + h), (x, y + h))
|
|
|
|
|
|
def _make(text: str, x: float, y: float) -> OCRLine:
    """Create an ``OCRLine`` fixture positioned at ``(x, y)``.

    The line gets a confidence of 1.0 and a box built by ``_box`` with its
    default width and height, so tests only need to specify the text and
    the position of the first corner.
    """
    corners = _box(x, y)
    return OCRLine(text=text, confidence=1.0, box=corners)
|
|
|
|
|
|
class TestSortLinesByLayout:
    """Checks that ``sort_lines_by_layout`` restores visual reading order."""

    def test_empty_returns_empty(self) -> None:
        """An empty list of lines sorts to an empty list."""
        ordered = sort_lines_by_layout([])
        assert ordered == []

    def test_already_sorted_is_stable(self) -> None:
        """Input that is already in reading order comes back unchanged."""
        cells = [("A", 10, 10), ("B", 50, 10), ("C", 10, 30)]
        lines = [_make(label, x, y) for label, x, y in cells]
        ordered = sort_lines_by_layout(lines)
        assert [line.text for line in ordered] == ["A", "B", "C"]

    def test_reorders_column_first_detection_to_row_first(self) -> None:
        """Column-first detection order is rebuilt into row-first order."""
        # Simulate a 2-row, 3-col table where Paddle returned cells
        # column-first instead of row-first.
        cells = [
            ("B1", 50, 10),
            ("B2", 50, 30),
            ("A1", 10, 10),
            ("A2", 10, 30),
            ("C1", 90, 10),
            ("C2", 90, 30),
        ]
        lines = [_make(label, x, y) for label, x, y in cells]
        ordered = sort_lines_by_layout(lines)
        result = [line.text for line in ordered]
        assert result == ["A1", "B1", "C1", "A2", "B2", "C2"]

    def test_groups_slightly_misaligned_cells_into_one_band(self) -> None:
        """Boxes whose y differs by a couple of pixels stay in one row."""
        # Real OCR boxes for a single visual row are rarely perfectly
        # y-aligned; we still want them grouped.
        cells = [
            ("LEFT", 10, 10),
            ("MID", 50, 12),  # 2px below LEFT — same row visually
            ("RIGHT", 90, 11),
        ]
        lines = [_make(label, x, y) for label, x, y in cells]
        ordered = sort_lines_by_layout(lines)
        result = [line.text for line in ordered]
        assert result == ["LEFT", "MID", "RIGHT"]

    def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None:
        """Rows that are clearly apart vertically are kept as separate bands."""
        # Lines with a y gap larger than ~½ line-height must NOT collapse
        # into the same band.
        cells = [
            ("ROW1A", 10, 10),
            ("ROW1B", 50, 10),
            ("ROW2A", 10, 30),  # gap of 20 vs height 15 → new band
            ("ROW2B", 50, 30),
        ]
        lines = [_make(label, x, y) for label, x, y in cells]
        ordered = sort_lines_by_layout(lines)
        result = [line.text for line in ordered]
        assert result == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"]

    def test_ocrpage_text_uses_sorted_order(self) -> None:
        """``OCRPage.text`` joins the lines in layout order, not input order."""
        cells = [
            ("RIGHT", 90, 10),
            ("LEFT", 10, 10),
            ("BOTTOM", 10, 30),
        ]
        lines = [_make(label, x, y) for label, x, y in cells]
        page = OCRPage(lines=lines)
        assert page.text == "LEFT\nRIGHT\nBOTTOM"
|