"""Tests for the PP-Structure table parsing helpers (no paddleocr required)."""

from __future__ import annotations

import sys
from types import ModuleType, SimpleNamespace

import pytest

from ocr_sprint.pipeline import table as table_module
from ocr_sprint.pipeline.table import (
    DetectedTable,
    extract_tables_from_pp_result,
    parse_table_html,
)

# NOTE(review): the HTML fixtures in this module were reconstructed from the
# asserted cell values after their markup had been stripped from the file.
# Confirm them against the original fixtures / parse_table_html if available.


class TestParseTableHtml:
    """parse_table_html turns table markup into a list of rows of cell text."""

    def test_simple_grid(self) -> None:
        """A plain <td> grid is returned row by row, cell by cell."""
        html_str = """
        <table>
          <tr><td>No</td><td>Pangkat</td><td>NRP</td><td>Nama</td></tr>
          <tr><td>1</td><td>AKP</td><td>87010101</td><td>Budi Santoso</td></tr>
          <tr><td>2</td><td>IPDA</td><td>92030404</td><td>Sari Wulandari</td></tr>
        </table>
        """
        rows = parse_table_html(html_str)
        assert rows == [
            ["No", "Pangkat", "NRP", "Nama"],
            ["1", "AKP", "87010101", "Budi Santoso"],
            ["2", "IPDA", "92030404", "Sari Wulandari"],
        ]

    def test_handles_th_and_entities_and_inline_tags(self) -> None:
        """Header cells, character references and inline markup yield text."""
        # NOTE(review): "&#47;" (the "/" character) and the <b> wrapper are
        # stand-ins for the lost original entity and inline tag — confirm.
        html_str = (
            "<html><body>"
            "<table>"
            "<tr><th>Pangkat &#47; NRP</th><th>Nama</th></tr>"
            "<tr><td><b>AKP 87010101</b></td><td>Budi Santoso</td></tr>"
            "</table>"
            "</body></html>"
        )
        rows = parse_table_html(html_str)
        assert rows[0] == ["Pangkat / NRP", "Nama"]
        assert rows[1] == ["AKP 87010101", "Budi Santoso"]

    def test_empty_table_returns_empty_list(self) -> None:
        """A table without rows and an empty string both give no rows."""
        assert parse_table_html("<table></table>") == []
        assert parse_table_html("") == []


class TestExtractTablesFromPpResult:
    """extract_tables_from_pp_result keeps only usable table regions."""

    def test_filters_table_regions_and_parses_html(self) -> None:
        """Only the table region with non-empty html produces a table."""
        pp_result = [
            {"type": "text", "res": [{"text": "ignore me", "confidence": 0.9}]},
            {
                "type": "table",
                "res": {
                    "html": "<table><tr><td>A</td><td>B</td></tr></table>",
                    "cell_bbox": [],
                },
            },
            {
                "type": "table",
                "res": {"html": ""},  # empty html → ignored
            },
            {
                "type": "figure",
                "res": [],
            },
        ]
        tables = extract_tables_from_pp_result(pp_result)
        assert len(tables) == 1
        assert tables[0].cells == [["A", "B"]]

    def test_no_tables_returns_empty_list(self) -> None:
        """A result with no table regions gives an empty list."""
        pp_result = [{"type": "text", "res": [{"text": "x"}]}]
        assert extract_tables_from_pp_result(pp_result) == []


class TestDetectedTable:
    """Dimension properties of DetectedTable."""

    def test_dimensions(self) -> None:
        """With ragged rows, n_cols reports the widest row (3 here)."""
        table = DetectedTable(cells=[["a", "b", "c"], ["d", "e"]])
        assert table.n_rows == 2
        assert table.n_cols == 3

    def test_zero_rows(self) -> None:
        """A default-constructed table has zero rows and zero columns."""
        table = DetectedTable()
        assert table.n_rows == 0
        assert table.n_cols == 0


class TestPpStructureInit:
    """Engine construction via table_module._build_pp_structure."""

    def test_gpu_init_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A failing GPU init is retried with use_gpu=False."""
        calls: list[dict[str, object]] = []

        class FakePPStructure:
            """Stand-in engine that records its kwargs and rejects GPU mode."""

            def __init__(self, **kwargs: object) -> None:
                calls.append(kwargs)
                if kwargs["use_gpu"]:
                    raise RuntimeError("gpu init failed")

        # Install a fake ``paddleocr`` module so the real package is not needed.
        fake_paddleocr = ModuleType("paddleocr")
        fake_paddleocr.PPStructure = FakePPStructure
        monkeypatch.setitem(sys.modules, "paddleocr", fake_paddleocr)
        monkeypatch.setattr(
            table_module,
            "get_settings",
            lambda: SimpleNamespace(ocr_lang="latin", ocr_use_gpu=True),
        )

        engine = table_module._build_pp_structure()

        assert isinstance(engine, FakePPStructure)
        # First attempt uses the GPU; the second is the CPU retry.
        assert calls == [
            {"lang": "en", "use_gpu": True, "layout": True, "show_log": False},
            {"lang": "en", "use_gpu": False, "layout": True, "show_log": False},
        ]


@pytest.fixture
def sample_personnel_table() -> DetectedTable:
    """Header + three personnel rows in a typical Polres-level format."""
    cells = [
        ["No", "Pangkat / NRP", "Nama", "Jabatan dalam Dinas", "Jabatan dalam Sprint"],
        ["1", "AKP 87010101", "Budi Santoso", "Kanit Reskrim", "Ketua Tim"],
        ["2", "IPDA 92030404", "Sari Wulandari", "Banit Reskrim", "Anggota"],
        ["3", "BRIPKA 98050505", "Ahmad Hidayat", "Banit Reskrim", "Anggota"],
    ]
    return DetectedTable(cells=cells)