"""Tests for the PP-Structure table parsing helpers (no paddleocr required).""" from __future__ import annotations import pytest from ocr_sprint.pipeline.table import ( DetectedTable, extract_tables_from_pp_result, parse_table_html, ) class TestParseTableHtml: def test_simple_grid(self) -> None: html_str = """
NoPangkatNRPNama
1AKP87010101Budi Santoso
2IPDA92030404Sari Wulandari
""" rows = parse_table_html(html_str) assert rows == [ ["No", "Pangkat", "NRP", "Nama"], ["1", "AKP", "87010101", "Budi Santoso"], ["2", "IPDA", "92030404", "Sari Wulandari"], ] def test_handles_th_and_entities_and_inline_tags(self) -> None: html_str = ( "" "
Pangkat / NRPNama
AKP 87010101Budi Santoso
" ) rows = parse_table_html(html_str) assert rows[0] == ["Pangkat / NRP", "Nama"] assert rows[1] == ["AKP 87010101", "Budi Santoso"] def test_empty_table_returns_empty_list(self) -> None: assert parse_table_html("
") == [] assert parse_table_html("") == [] class TestExtractTablesFromPpResult: def test_filters_table_regions_and_parses_html(self) -> None: pp_result = [ {"type": "text", "res": [{"text": "ignore me", "confidence": 0.9}]}, { "type": "table", "res": { "html": "
AB
", "cell_bbox": [], }, }, { "type": "table", "res": {"html": ""}, # empty html → ignored }, { "type": "figure", "res": [], }, ] tables = extract_tables_from_pp_result(pp_result) assert len(tables) == 1 assert tables[0].cells == [["A", "B"]] def test_no_tables_returns_empty_list(self) -> None: pp_result = [{"type": "text", "res": [{"text": "x"}]}] assert extract_tables_from_pp_result(pp_result) == [] class TestDetectedTable: def test_dimensions(self) -> None: table = DetectedTable(cells=[["a", "b", "c"], ["d", "e"]]) assert table.n_rows == 2 assert table.n_cols == 3 def test_zero_rows(self) -> None: table = DetectedTable() assert table.n_rows == 0 assert table.n_cols == 0 @pytest.fixture def sample_personnel_table() -> DetectedTable: """Header + three personnel rows in a typical Polres-level format.""" cells = [ ["No", "Pangkat / NRP", "Nama", "Jabatan dalam Dinas", "Jabatan dalam Sprint"], ["1", "AKP 87010101", "Budi Santoso", "Kanit Reskrim", "Ketua Tim"], ["2", "IPDA 92030404", "Sari Wulandari", "Banit Reskrim", "Anggota"], ["3", "BRIPKA 98050505", "Ahmad Hidayat", "Banit Reskrim", "Anggota"], ] return DetectedTable(cells=cells)