# File listing metadata: 127 lines, 4.3 KiB, Python.
"""Tests for the PP-Structure table parsing helpers (no paddleocr required)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
from types import ModuleType, SimpleNamespace
|
|
|
|
import pytest
|
|
|
|
from ocr_sprint.pipeline import table as table_module
|
|
from ocr_sprint.pipeline.table import (
|
|
DetectedTable,
|
|
extract_tables_from_pp_result,
|
|
parse_table_html,
|
|
)
|
|
|
|
|
|
class TestParseTableHtml:
    """Checks on ``parse_table_html`` using small inline HTML snippets."""

    def test_simple_grid(self) -> None:
        """A plain ``<td>`` grid comes back as one list of cell strings per row."""
        html_str = """
        <html><body><table>
          <tr><td>No</td><td>Pangkat</td><td>NRP</td><td>Nama</td></tr>
          <tr><td>1</td><td>AKP</td><td>87010101</td><td>Budi Santoso</td></tr>
          <tr><td>2</td><td>IPDA</td><td>92030404</td><td>Sari Wulandari</td></tr>
        </table></body></html>
        """
        rows = parse_table_html(html_str)
        assert rows == [
            ["No", "Pangkat", "NRP", "Nama"],
            ["1", "AKP", "87010101", "Budi Santoso"],
            ["2", "IPDA", "92030404", "Sari Wulandari"],
        ]

    def test_handles_th_and_entities_and_inline_tags(self) -> None:
        """``<th>`` cells are returned like data cells; inline ``<b>`` markup is dropped."""
        # NOTE(review): the test name mentions entities, but the input below
        # contains no HTML entity -- confirm whether one was lost from the
        # fixture string or the name is simply broader than the test.
        html_str = (
            "<table><tr><th>Pangkat / NRP</th><th>Nama</th></tr>"
            "<tr><td>AKP <b>87010101</b></td><td>Budi Santoso</td></tr></table>"
        )
        rows = parse_table_html(html_str)
        assert rows[0] == ["Pangkat / NRP", "Nama"]
        # The bold tag is removed while its text stays part of the cell.
        assert rows[1] == ["AKP 87010101", "Budi Santoso"]

    def test_empty_table_returns_empty_list(self) -> None:
        """A table without rows and an empty string both give ``[]``."""
        assert parse_table_html("<table></table>") == []
        assert parse_table_html("") == []
|
|
|
|
|
|
class TestExtractTablesFromPpResult:
    """Checks on ``extract_tables_from_pp_result`` with hand-built region dicts."""

    def test_filters_table_regions_and_parses_html(self) -> None:
        """Only the ``"table"`` region carrying non-empty HTML yields a table."""
        pp_result = [
            {"type": "text", "res": [{"text": "ignore me", "confidence": 0.9}]},
            {
                "type": "table",
                "res": {
                    "html": "<table><tr><td>A</td><td>B</td></tr></table>",
                    "cell_bbox": [],
                },
            },
            {
                "type": "table",
                "res": {"html": ""},  # empty html → ignored
            },
            {
                "type": "figure",
                "res": [],
            },
        ]
        tables = extract_tables_from_pp_result(pp_result)
        # Four regions go in; the text, figure and empty-html table regions
        # must not contribute a result.
        assert len(tables) == 1
        assert tables[0].cells == [["A", "B"]]

    def test_no_tables_returns_empty_list(self) -> None:
        """A result holding only a text region gives an empty list."""
        pp_result = [{"type": "text", "res": [{"text": "x"}]}]
        assert extract_tables_from_pp_result(pp_result) == []
|
|
|
|
|
|
class TestDetectedTable:
    """Checks on the ``n_rows`` / ``n_cols`` values reported by ``DetectedTable``."""

    def test_dimensions(self) -> None:
        """Two ragged rows (widths 3 and 2) report 2 rows and 3 columns."""
        # NOTE(review): this input cannot tell "width of the first row" from
        # "width of the widest row"; both would give 3 here.
        table = DetectedTable(cells=[["a", "b", "c"], ["d", "e"]])
        assert table.n_rows == 2
        assert table.n_cols == 3

    def test_zero_rows(self) -> None:
        """A table built with no arguments reports zero rows and zero columns."""
        table = DetectedTable()
        assert table.n_rows == 0
        assert table.n_cols == 0
|
|
|
|
|
|
class TestPpStructureInit:
    """Checks on ``table_module._build_pp_structure`` with a stubbed ``paddleocr``."""

    def test_gpu_init_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A failing GPU construction is followed by a second attempt without GPU."""
        # Keyword arguments of every constructor call, in call order.
        calls: list[dict[str, object]] = []

        class FakePPStructure:
            """Stand-in engine that records its kwargs and rejects ``use_gpu=True``."""

            def __init__(self, **kwargs: object) -> None:
                calls.append(kwargs)
                if kwargs["use_gpu"]:
                    raise RuntimeError("gpu init failed")

        # Register a fake ``paddleocr`` module so that importing it resolves
        # to the stub instead of the real package.
        fake_paddleocr = ModuleType("paddleocr")
        fake_paddleocr.PPStructure = FakePPStructure
        monkeypatch.setitem(sys.modules, "paddleocr", fake_paddleocr)
        monkeypatch.setattr(
            table_module,
            "get_settings",
            lambda: SimpleNamespace(ocr_lang="latin", ocr_use_gpu=True),
        )

        engine = table_module._build_pp_structure()

        assert isinstance(engine, FakePPStructure)
        # NOTE(review): settings say ocr_lang="latin" while both calls are
        # expected to receive lang="en" -- presumably the table module maps
        # the language code; confirm against its implementation.
        assert calls == [
            {"lang": "en", "use_gpu": True, "layout": True, "show_log": False},
            {"lang": "en", "use_gpu": False, "layout": True, "show_log": False},
        ]
|
|
|
|
|
|
@pytest.fixture
def sample_personnel_table() -> DetectedTable:
    """Header + three personnel rows in a typical Polres-level format.

    Returns:
        A ``DetectedTable`` whose ``cells`` hold one header row followed by
        three data rows of five columns each.
    """
    cells = [
        ["No", "Pangkat / NRP", "Nama", "Jabatan dalam Dinas", "Jabatan dalam Sprint"],
        ["1", "AKP 87010101", "Budi Santoso", "Kanit Reskrim", "Ketua Tim"],
        ["2", "IPDA 92030404", "Sari Wulandari", "Banit Reskrim", "Anggota"],
        ["3", "BRIPKA 98050505", "Ahmad Hidayat", "Banit Reskrim", "Anggota"],
    ]
    return DetectedTable(cells=cells)
|