# OCR-SPRIN-SERVICE/tests/unit/test_api_ground_truth.py

"""HTTP tests for the ground-truth export endpoints."""

from __future__ import annotations

import json
from datetime import date

import pytest
from fastapi.testclient import TestClient

from ocr_sprint.main import create_app
from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import PipelineOutput
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
from ocr_sprint.schemas.extraction import (
    ExtractionResult,
    HeaderFields,
    PersonnelEntry,
)


@pytest.fixture
def client() -> TestClient:
    """Build a new application and wrap it in a ``TestClient``."""
    app = create_app()
    return TestClient(app)


@pytest.fixture
def fake_pipeline(monkeypatch: pytest.MonkeyPatch) -> PipelineOutput:
    """Stub out ``run_pipeline`` so uploads yield a fixed, completed extraction.

    The stub ignores the uploaded bytes and always returns the same
    ``PipelineOutput``. It is installed under the name ``run_pipeline`` on the
    orchestrator module, the documents route module and the worker tasks
    module; ``monkeypatch`` undoes the patches at teardown.

    Returns:
        The ``PipelineOutput`` instance the stub returns.
    """
    # Import every patch target *before* patching anything. If one of these
    # modules were imported for the first time after the orchestrator had been
    # patched, a ``from ... import run_pipeline`` inside it would bind the stub,
    # and monkeypatch would then "restore" the stub (not the real function) at
    # teardown, leaking it into later tests.
    from ocr_sprint.api.routes import documents as docs_module
    from ocr_sprint.worker import tasks as tasks_module

    result = ExtractionResult(
        header=HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="POLRES TEST",
        ),
        personel=[
            PersonnelEntry(pangkat="AIPDA", nrp="77060000", nama="BUDI", jabatan="ANGGOTA"),
        ],
        confidence=0.9,
    )
    output = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=0.9,
        result=result,
    )

    def _fake_run(_content: bytes) -> PipelineOutput:
        return output

    # Each module is patched by attribute name, so call sites that looked the
    # function up through their own module namespace also get the stub.
    for module in (orch_module, docs_module, tasks_module):
        monkeypatch.setattr(module, "run_pipeline", _fake_run)
    return output


def _create_and_approve(client: TestClient, *, correction_value: str | None = None) -> str:
    """Upload a fake PDF synchronously, optionally correct it, then approve it.

    Args:
        client: Test client used to issue the requests.
        correction_value: When not ``None``, a single correction setting
            ``header.perihal`` to this value is PATCHed before approval.

    Returns:
        The ``job_id`` from the upload response, converted to ``str``.
    """
    post = client.post(
        "/api/v1/documents?sync=true",
        files={"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
    )
    assert post.status_code == 200, post.text
    jid = str(post.json()["job_id"])
    if correction_value is not None:
        patched = client.patch(
            f"/api/v1/documents/{jid}",
            json={"corrections": [{"path": "header.perihal", "value": correction_value}]},
        )
        # Include the body so a rejected correction explains itself in the failure output.
        assert patched.status_code == 200, patched.text
    approved = client.post(f"/api/v1/documents/{jid}/approve")
    assert approved.status_code == 200, approved.text
    return jid


def test_stats_empty_dataset(client: TestClient) -> None:
    """With no jobs created, the stats endpoint reports zero counts and no fields."""
    response = client.get("/api/v1/ground-truth/stats")
    assert response.status_code == 200

    stats = response.json()
    expected = {
        "total_jobs": 0,
        "approved_jobs": 0,
        "total_corrections": 0,
        "top_corrected_fields": [],
    }
    # Compare only the keys under test; the payload may carry additional ones.
    observed = {key: stats[key] for key in expected}
    assert observed == expected


def test_stats_rolls_up_counts(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """Stats aggregate three approved jobs, two of which carry one correction each."""
    # Two corrected jobs, then a pristine one approved without any correction.
    for perihal in ("Penyelidikan-1", "Penyelidikan-2", None):
        _create_and_approve(client, correction_value=perihal)

    response = client.get("/api/v1/ground-truth/stats")
    assert response.status_code == 200

    stats = response.json()
    assert stats["total_jobs"] == 3
    assert stats["approved_jobs"] == 3
    assert stats["total_corrections"] == 2
    assert stats["jobs_with_corrections"] == 2

    most_corrected = stats["top_corrected_fields"][0]
    assert most_corrected["field_path"] == "header.perihal"
    assert most_corrected["count"] == 2


def test_export_streams_jsonl(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """The export responds with NDJSON holding one approved sample per job."""
    _create_and_approve(client, correction_value="Penyelidikan")
    _create_and_approve(client, correction_value=None)

    response = client.get("/api/v1/ground-truth/export")
    assert response.status_code == 200
    assert response.headers["content-type"].startswith("application/x-ndjson")

    # Blank lines are not records; drop them before counting.
    rows = [raw for raw in response.text.splitlines() if raw.strip()]
    assert len(rows) == 2

    for sample in map(json.loads, rows):
        assert sample["approved"] is True
        for required_key in ("initial_result", "final_result"):
            assert required_key in sample


def test_export_approved_only_default(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """Unapproved jobs shouldn't appear in the default export."""
    # One approved, one just completed (no approve call).
    _create_and_approve(client, correction_value=None)
    pending = client.post(
        "/api/v1/documents?sync=true",
        files={"file": ("y.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
    )
    # A failed upload would still leave exactly one exported line, so the
    # default-export assertion below would pass for the wrong reason.
    assert pending.status_code == 200, pending.text

    resp = client.get("/api/v1/ground-truth/export")
    # Guard against counting the lines of an error body as export records.
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 1

    # Toggle approved_only=false to include both.
    resp = client.get("/api/v1/ground-truth/export?approved_only=false")
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 2


def test_export_has_corrections_filter(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """``has_corrections`` selects either the corrected or the pristine job."""
    _create_and_approve(client, correction_value="Penyelidikan")
    _create_and_approve(client, correction_value=None)

    resp = client.get("/api/v1/ground-truth/export?has_corrections=true")
    # Check the status first so an error response is reported as such rather
    # than as a KeyError or a line-count mismatch further down.
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 1
    assert json.loads(lines[0])["corrections"][0]["new_value"] == "Penyelidikan"

    resp = client.get("/api/v1/ground-truth/export?has_corrections=false")
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 1
    assert json.loads(lines[0])["corrections"] == []


def test_export_respects_limit(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """``limit=2`` caps the export at two records even though five jobs qualify."""
    for _ in range(5):
        _create_and_approve(client, correction_value=None)
    resp = client.get("/api/v1/ground-truth/export?limit=2")
    # Make a rejected query fail on its status, not on an unrelated line count.
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 2