"""Tests for the PP-Structure table parsing helpers (no paddleocr required)."""
from __future__ import annotations
import pytest
from ocr_sprint.pipeline.table import (
DetectedTable,
extract_tables_from_pp_result,
parse_table_html,
)
class TestParseTableHtml:
    """Checks that ``parse_table_html`` turns table HTML into rows of cell text."""

    def test_simple_grid(self) -> None:
        """A ``<td>``-only grid yields one list of cell strings per ``<tr>``."""
        html_str = """
        <table>
          <tr><td>No</td><td>Pangkat</td><td>NRP</td><td>Nama</td></tr>
          <tr><td>1</td><td>AKP</td><td>87010101</td><td>Budi Santoso</td></tr>
          <tr><td>2</td><td>IPDA</td><td>92030404</td><td>Sari Wulandari</td></tr>
        </table>
        """
        rows = parse_table_html(html_str)
        assert rows == [
            ["No", "Pangkat", "NRP", "Nama"],
            ["1", "AKP", "87010101", "Budi Santoso"],
            ["2", "IPDA", "92030404", "Sari Wulandari"],
        ]

    def test_handles_th_and_entities_and_inline_tags(self) -> None:
        """Header cells, character references and inline markup reduce to plain text."""
        # NOTE(review): the markup below is a reconstruction. ``&#47;`` (numeric
        # reference for "/") and the ``<b>`` wrapper were picked so the expected
        # text is unambiguous; confirm they match what parse_table_html is
        # meant to support.
        html_str = (
            "<table>"
            "<tr><th>Pangkat &#47; NRP</th><th>Nama</th></tr>"
            "<tr><td><b>AKP 87010101</b></td><td>Budi Santoso</td></tr>"
            "</table>"
        )
        rows = parse_table_html(html_str)
        assert rows[0] == ["Pangkat / NRP", "Nama"]
        assert rows[1] == ["AKP 87010101", "Budi Santoso"]

    def test_empty_table_returns_empty_list(self) -> None:
        """Both an empty string and a table with no rows produce ``[]``."""
        assert parse_table_html("") == []
        # Distinct from the empty-string case: well-formed markup, zero rows.
        assert parse_table_html("<table></table>") == []
class TestExtractTablesFromPpResult:
    """Checks that ``extract_tables_from_pp_result`` picks tables out of a region list."""

    def test_filters_table_regions_and_parses_html(self) -> None:
        """Only ``"table"`` regions with non-empty HTML become parsed tables."""
        pp_result = [
            {"type": "text", "res": [{"text": "ignore me", "confidence": 0.9}]},
            {
                "type": "table",
                "res": {
                    # The only region expected to survive: one row, two cells.
                    "html": "<table><tr><td>A</td><td>B</td></tr></table>",
                    "cell_bbox": [],
                },
            },
            {
                "type": "table",
                "res": {"html": ""},  # empty html → ignored
            },
            {
                "type": "figure",
                "res": [],
            },
        ]
        tables = extract_tables_from_pp_result(pp_result)
        assert len(tables) == 1
        assert tables[0].cells == [["A", "B"]]

    def test_no_tables_returns_empty_list(self) -> None:
        """A result containing no ``"table"`` regions yields an empty list."""
        pp_result = [{"type": "text", "res": [{"text": "x"}]}]
        assert extract_tables_from_pp_result(pp_result) == []
class TestDetectedTable:
    """Checks the ``n_rows`` / ``n_cols`` values reported by ``DetectedTable``."""

    def test_dimensions(self) -> None:
        """Two ragged rows (3 cells, then 2) report 2 rows and 3 columns."""
        ragged_cells = [["a", "b", "c"], ["d", "e"]]
        detected = DetectedTable(cells=ragged_cells)
        assert (detected.n_rows, detected.n_cols) == (2, 3)

    def test_zero_rows(self) -> None:
        """A table built with no arguments reports zero rows and zero columns."""
        detected = DetectedTable()
        assert (detected.n_rows, detected.n_cols) == (0, 0)
@pytest.fixture
def sample_personnel_table() -> DetectedTable:
    """Provide a table with one header row and three personnel rows.

    The column layout follows a typical Polres-level format.
    """
    header = ["No", "Pangkat / NRP", "Nama", "Jabatan dalam Dinas", "Jabatan dalam Sprint"]
    personnel = [
        ["1", "AKP 87010101", "Budi Santoso", "Kanit Reskrim", "Ketua Tim"],
        ["2", "IPDA 92030404", "Sari Wulandari", "Banit Reskrim", "Anggota"],
        ["3", "BRIPKA 98050505", "Ahmad Hidayat", "Banit Reskrim", "Anggota"],
    ]
    # Header first, then the data rows in their original order.
    return DetectedTable(cells=[header, *personnel])