# File: OCR-SPRIN-SERVICE/tests/unit/test_ocr_layout.py
# Listing metadata: 76 lines, 2.8 KiB, Python
"""Tests for OCR layout reordering.

PaddleOCR emits text boxes in detection order, not visual reading order.
On dense table layouts (Polda Kalbar Akpol-panitia regression) this
interleaves columns within a row and breaks every downstream extractor
that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds
reading order from the bounding-box geometry.
"""
from __future__ import annotations
from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout
def _box(
    x: float, y: float, w: float = 30, h: float = 15
) -> tuple[tuple[float, float], ...]:
    """Return the four corners of an axis-aligned rectangle.

    Args:
        x: X coordinate of the first corner.
        y: Y coordinate of the first corner.
        w: Rectangle width, added to ``x`` for the far-side corners.
        h: Rectangle height, added to ``y`` for the far-side corners.

    Returns:
        A 4-tuple of ``(x, y)`` points in the order
        ``(x, y)``, ``(x + w, y)``, ``(x + w, y + h)``, ``(x, y + h)``.
    """
    return ((x, y), (x + w, y), (x + w, y + h), (x, y + h))
def _make(text: str, x: float, y: float) -> OCRLine:
    """Build an ``OCRLine`` fixture for the layout tests.

    Args:
        text: Text to store on the line.
        x: X coordinate passed to ``_box`` as the box's first corner.
        y: Y coordinate passed to ``_box`` as the box's first corner.

    Returns:
        An ``OCRLine`` with ``confidence=1.0`` and a box built by
        ``_box(x, y)`` using that helper's default width and height.
    """
    return OCRLine(text=text, confidence=1.0, box=_box(x, y))
class TestSortLinesByLayout:
    """Checks the ordering produced by ``sort_lines_by_layout``.

    Fixtures are built with ``_make``. Most cases compare the ``text``
    sequence of the returned lines; the last case checks ``OCRPage.text``.
    """

    def test_empty_returns_empty(self) -> None:
        """An empty input list yields an empty list."""
        assert sort_lines_by_layout([]) == []

    def test_already_sorted_is_stable(self) -> None:
        """Input already in row-then-column order comes back unchanged."""
        lines = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)]
        assert [ln.text for ln in sort_lines_by_layout(lines)] == ["A", "B", "C"]

    def test_reorders_column_first_detection_to_row_first(self) -> None:
        """Cells supplied column-first are returned row-first."""
        # Simulate a 2-row, 3-col table where Paddle returned cells
        # column-first instead of row-first.
        lines = [
            _make("B1", 50, 10),
            _make("B2", 50, 30),
            _make("A1", 10, 10),
            _make("A2", 10, 30),
            _make("C1", 90, 10),
            _make("C2", 90, 30),
        ]
        result = [ln.text for ln in sort_lines_by_layout(lines)]
        assert result == ["A1", "B1", "C1", "A2", "B2", "C2"]

    def test_groups_slightly_misaligned_cells_into_one_band(self) -> None:
        """Boxes whose tops differ by 1-2px are ordered left to right."""
        # Real OCR boxes for a single visual row are rarely perfectly
        # y-aligned; we still want them grouped.
        lines = [
            _make("LEFT", 10, 10),
            _make("MID", 50, 12),  # 2px below LEFT — same row visually
            _make("RIGHT", 90, 11),
        ]
        result = [ln.text for ln in sort_lines_by_layout(lines)]
        assert result == ["LEFT", "MID", "RIGHT"]

    def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None:
        """Boxes whose tops are 20px apart stay in separate rows."""
        # Lines with a y gap larger than ~½ line-height must NOT collapse
        # into the same band.
        lines = [
            _make("ROW1A", 10, 10),
            _make("ROW1B", 50, 10),
            _make("ROW2A", 10, 30),  # tops 20px apart vs box height 15 → new band
            _make("ROW2B", 50, 30),
        ]
        result = [ln.text for ln in sort_lines_by_layout(lines)]
        assert result == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"]

    def test_ocrpage_text_uses_sorted_order(self) -> None:
        """``OCRPage.text`` joins line texts with newlines in layout order."""
        lines = [
            _make("RIGHT", 90, 10),
            _make("LEFT", 10, 10),
            _make("BOTTOM", 10, 30),
        ]
        page = OCRPage(lines=lines)
        assert page.text == "LEFT\nRIGHT\nBOTTOM"