Files
OCR-SPRIN-SERVICE/tests/unit/test_table.py

127 lines
4.3 KiB
Python

"""Tests for the PP-Structure table parsing helpers (no paddleocr required)."""
from __future__ import annotations
import sys
from types import ModuleType, SimpleNamespace
import pytest
from ocr_sprint.pipeline import table as table_module
from ocr_sprint.pipeline.table import (
DetectedTable,
extract_tables_from_pp_result,
parse_table_html,
)
class TestParseTableHtml:
    """Checks that ``parse_table_html`` turns table markup into rows of cell text."""

    def test_simple_grid(self) -> None:
        """A ``<td>``-only table inside ``<html><body>`` yields one string list per row."""
        markup = (
            "\n"
            "<html><body><table>\n"
            "<tr><td>No</td><td>Pangkat</td><td>NRP</td><td>Nama</td></tr>\n"
            "<tr><td>1</td><td>AKP</td><td>87010101</td><td>Budi Santoso</td></tr>\n"
            "<tr><td>2</td><td>IPDA</td><td>92030404</td><td>Sari Wulandari</td></tr>\n"
            "</table></body></html>\n"
        )
        expected_grid = [
            ["No", "Pangkat", "NRP", "Nama"],
            ["1", "AKP", "87010101", "Budi Santoso"],
            ["2", "IPDA", "92030404", "Sari Wulandari"],
        ]

        assert parse_table_html(markup) == expected_grid

    def test_handles_th_and_entities_and_inline_tags(self) -> None:
        """``<th>`` cells, ``&nbsp;`` entities and a nested ``<b>`` tag all reduce to plain text."""
        header_row = "<tr><th>Pangkat&nbsp;/ NRP</th><th>Nama</th></tr>"
        data_row = "<tr><td>AKP <b>87010101</b></td><td>Budi&nbsp;Santoso</td></tr>"
        markup = f"<table>{header_row}{data_row}</table>"

        parsed = parse_table_html(markup)

        # Checked by index so that any additional rows would not affect the outcome.
        assert parsed[0] == ["Pangkat / NRP", "Nama"]
        assert parsed[1] == ["AKP 87010101", "Budi Santoso"]

    def test_empty_table_returns_empty_list(self) -> None:
        """Both a row-less ``<table>`` and an empty string parse to ``[]``."""
        for blank_input in ("<table></table>", ""):
            assert parse_table_html(blank_input) == []
class TestExtractTablesFromPpResult:
    """Exercises ``extract_tables_from_pp_result`` on hand-built region lists."""

    def test_filters_table_regions_and_parses_html(self) -> None:
        """Of four mixed regions, only the table with non-empty HTML produces a result."""
        text_region = {"type": "text", "res": [{"text": "ignore me", "confidence": 0.9}]}
        populated_table = {
            "type": "table",
            "res": {
                "html": "<table><tr><td>A</td><td>B</td></tr></table>",
                "cell_bbox": [],
            },
        }
        blank_table = {"type": "table", "res": {"html": ""}}  # empty html → ignored
        figure_region = {"type": "figure", "res": []}

        detected = extract_tables_from_pp_result(
            [text_region, populated_table, blank_table, figure_region]
        )

        assert len(detected) == 1
        assert detected[0].cells == [["A", "B"]]

    def test_no_tables_returns_empty_list(self) -> None:
        """A result holding only a text region yields an empty list."""
        only_text = [{"type": "text", "res": [{"text": "x"}]}]

        detected = extract_tables_from_pp_result(only_text)

        assert detected == []
class TestDetectedTable:
    """Checks the ``n_rows`` / ``n_cols`` values reported by ``DetectedTable``."""

    def test_dimensions(self) -> None:
        """Rows of 3 and 2 cells are expected to report 2 rows and 3 columns.

        The first row is also the longest one, so this case does not tell apart
        "width of the first row" from "width of the widest row".
        """
        ragged_cells = [["a", "b", "c"], ["d", "e"]]
        detected = DetectedTable(cells=ragged_cells)

        assert (detected.n_rows, detected.n_cols) == (2, 3)

    def test_zero_rows(self) -> None:
        """A table built with no arguments reports zero rows and zero columns."""
        detected = DetectedTable()

        assert (detected.n_rows, detected.n_cols) == (0, 0)
class TestPpStructureInit:
    """Covers ``table_module._build_pp_structure`` using a stubbed ``paddleocr`` module."""

    def test_gpu_init_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A failed ``use_gpu=True`` construction is followed by a ``use_gpu=False`` one.

        The settings stub requests GPU with ``ocr_lang="latin"``; the recorded
        constructor kwargs are expected to carry ``lang="en"`` on both attempts.
        """
        seen_kwargs: list[dict[str, object]] = []

        class FakePPStructure:
            """Stand-in engine that records its kwargs and rejects GPU requests."""

            def __init__(self, **kwargs: object) -> None:
                seen_kwargs.append(kwargs)
                if kwargs["use_gpu"]:
                    raise RuntimeError("gpu init failed")

        def fake_get_settings() -> SimpleNamespace:
            return SimpleNamespace(ocr_lang="latin", ocr_use_gpu=True)

        # Register the stub under the real package name so that an
        # ``import paddleocr`` resolves to it without the dependency installed.
        stub_module = ModuleType("paddleocr")
        stub_module.PPStructure = FakePPStructure
        monkeypatch.setitem(sys.modules, "paddleocr", stub_module)
        monkeypatch.setattr(table_module, "get_settings", fake_get_settings)

        engine = table_module._build_pp_structure()

        assert isinstance(engine, FakePPStructure)
        # First attempt with GPU, second without; all other kwargs are identical.
        assert seen_kwargs == [
            {"lang": "en", "use_gpu": gpu_flag, "layout": True, "show_log": False}
            for gpu_flag in (True, False)
        ]
@pytest.fixture
def sample_personnel_table() -> DetectedTable:
    """Provide a table with a header row and three personnel rows (typical Polres-level layout)."""
    header = ["No", "Pangkat / NRP", "Nama", "Jabatan dalam Dinas", "Jabatan dalam Sprint"]
    personnel_rows = [
        ["1", "AKP 87010101", "Budi Santoso", "Kanit Reskrim", "Ketua Tim"],
        ["2", "IPDA 92030404", "Sari Wulandari", "Banit Reskrim", "Anggota"],
        ["3", "BRIPKA 98050505", "Ahmad Hidayat", "Banit Reskrim", "Anggota"],
    ]
    return DetectedTable(cells=[header, *personnel_rows])