# OCR-SPRIN-SERVICE/tests/unit/test_table.py

"""Tests for the PP-Structure table parsing helpers (no paddleocr required)."""

from __future__ import annotations

import pytest

from ocr_sprint.pipeline.table import (
    DetectedTable,
    extract_tables_from_pp_result,
    parse_table_html,
)


class TestParseTableHtml:
    """Checks for ``parse_table_html`` on small hand-written HTML snippets."""

    def test_simple_grid(self) -> None:
        """A plain ``<td>`` grid is expected back as one list of strings per row."""
        markup = """
        <html><body><table>
          <tr><td>No</td><td>Pangkat</td><td>NRP</td><td>Nama</td></tr>
          <tr><td>1</td><td>AKP</td><td>87010101</td><td>Budi Santoso</td></tr>
          <tr><td>2</td><td>IPDA</td><td>92030404</td><td>Sari Wulandari</td></tr>
        </table></body></html>
        """
        expected_grid = [
            ["No", "Pangkat", "NRP", "Nama"],
            ["1", "AKP", "87010101", "Budi Santoso"],
            ["2", "IPDA", "92030404", "Sari Wulandari"],
        ]
        assert parse_table_html(markup) == expected_grid

    def test_handles_th_and_entities_and_inline_tags(self) -> None:
        """``<th>`` cells, ``&nbsp;`` entities and inline ``<b>`` yield plain text."""
        header_markup = "<tr><th>Pangkat&nbsp;/ NRP</th><th>Nama</th></tr>"
        body_markup = "<tr><td>AKP <b>87010101</b></td><td>Budi&nbsp;Santoso</td></tr>"
        parsed = parse_table_html(f"<table>{header_markup}{body_markup}</table>")

        expected_rows = (
            ["Pangkat / NRP", "Nama"],
            ["AKP 87010101", "Budi Santoso"],
        )
        # Compare row by row (header first) rather than the whole list at once.
        for position, expected_row in enumerate(expected_rows):
            assert parsed[position] == expected_row

    def test_empty_table_returns_empty_list(self) -> None:
        """Both a table without rows and an empty string give an empty list."""
        for markup in ("<table></table>", ""):
            assert parse_table_html(markup) == []


class TestExtractTablesFromPpResult:
    """Checks for ``extract_tables_from_pp_result`` on hand-built region lists."""

    def test_filters_table_regions_and_parses_html(self) -> None:
        """Only the table region carrying non-empty HTML is expected to be returned."""
        text_region = {"type": "text", "res": [{"text": "ignore me", "confidence": 0.9}]}
        populated_table = {
            "type": "table",
            "res": {
                "html": "<table><tr><td>A</td><td>B</td></tr></table>",
                "cell_bbox": [],
            },
        }
        # empty html → ignored
        blank_table = {"type": "table", "res": {"html": ""}}
        figure_region = {"type": "figure", "res": []}

        tables = extract_tables_from_pp_result(
            [text_region, populated_table, blank_table, figure_region]
        )

        assert len(tables) == 1
        assert tables[0].cells == [["A", "B"]]

    def test_no_tables_returns_empty_list(self) -> None:
        """A result holding only a text region gives an empty list."""
        only_text = [{"type": "text", "res": [{"text": "x"}]}]
        extracted = extract_tables_from_pp_result(only_text)
        assert extracted == []


class TestDetectedTable:
    """Checks for the ``n_rows`` / ``n_cols`` values reported by ``DetectedTable``."""

    def test_dimensions(self) -> None:
        """Rows of length 3 and 2 are expected to report 2 rows and 3 columns."""
        ragged = DetectedTable(cells=[["a", "b", "c"], ["d", "e"]])
        assert (ragged.n_rows, ragged.n_cols) == (2, 3)

    def test_zero_rows(self) -> None:
        """A table built with no arguments reports zero rows and zero columns."""
        blank = DetectedTable()
        assert (blank.n_rows, blank.n_cols) == (0, 0)


@pytest.fixture
def sample_personnel_table() -> DetectedTable:
    """Build a five-column table: one header row followed by three personnel rows.

    The layout follows a typical Polres-level format.
    """
    header_row = [
        "No",
        "Pangkat / NRP",
        "Nama",
        "Jabatan dalam Dinas",
        "Jabatan dalam Sprint",
    ]
    personnel_rows = [
        ["1", "AKP 87010101", "Budi Santoso", "Kanit Reskrim", "Ketua Tim"],
        ["2", "IPDA 92030404", "Sari Wulandari", "Banit Reskrim", "Anggota"],
        ["3", "BRIPKA 98050505", "Ahmad Hidayat", "Banit Reskrim", "Anggota"],
    ]
    return DetectedTable(cells=[header_row, *personnel_rows])