"""Tests for OCR layout reordering. PaddleOCR emits text boxes in detection order, not visual reading order. On dense table layouts (Polda Kalbar Akpol-panitia regression) this interleaves columns within a row and breaks every downstream extractor that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds reading order from the bounding-box geometry. """ from __future__ import annotations from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout def _box(x: float, y: float, w: float = 30, h: float = 15): return ((x, y), (x + w, y), (x + w, y + h), (x, y + h)) def _make(text: str, x: float, y: float) -> OCRLine: return OCRLine(text=text, confidence=1.0, box=_box(x, y)) class TestSortLinesByLayout: def test_empty_returns_empty(self) -> None: assert sort_lines_by_layout([]) == [] def test_already_sorted_is_stable(self) -> None: lines = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)] assert [ln.text for ln in sort_lines_by_layout(lines)] == ["A", "B", "C"] def test_reorders_column_first_detection_to_row_first(self) -> None: # Simulate a 2-row, 3-col table where Paddle returned cells # column-first instead of row-first. lines = [ _make("B1", 50, 10), _make("B2", 50, 30), _make("A1", 10, 10), _make("A2", 10, 30), _make("C1", 90, 10), _make("C2", 90, 30), ] result = [ln.text for ln in sort_lines_by_layout(lines)] assert result == ["A1", "B1", "C1", "A2", "B2", "C2"] def test_groups_slightly_misaligned_cells_into_one_band(self) -> None: # Real OCR boxes for a single visual row are rarely perfectly # y-aligned; we still want them grouped. lines = [ _make("LEFT", 10, 10), _make("MID", 50, 12), # 2px below LEFT — same row visually _make("RIGHT", 90, 11), ] result = [ln.text for ln in sort_lines_by_layout(lines)] assert result == ["LEFT", "MID", "RIGHT"] def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None: # Lines with a y gap larger than ~½ line-height must NOT collapse # into the same band. lines = [ _make("ROW1A", 10, 10), _make("ROW1B", 50, 10), _make("ROW2A", 10, 30), # gap of 20 vs height 15 → new band _make("ROW2B", 50, 30), ] result = [ln.text for ln in sort_lines_by_layout(lines)] assert result == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"] def test_ocrpage_text_uses_sorted_order(self) -> None: lines = [ _make("RIGHT", 90, 10), _make("LEFT", 10, 10), _make("BOTTOM", 10, 30), ] page = OCRPage(lines=lines) assert page.text == "LEFT\nRIGHT\nBOTTOM"