# OCR-SPRIN-SERVICE/tests/unit/test_api.py

"""API tests with the OCR engine mocked.

These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator
so we can exercise the FastAPI surface without the heavy ML init cost.
"""

from __future__ import annotations

from datetime import date

import pytest
from fastapi.testclient import TestClient

from ocr_sprint.main import create_app
from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import PipelineOutput
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields


@pytest.fixture
def client() -> TestClient:
    """Provide a ``TestClient`` bound to a newly created application.

    The fixture is function-scoped (pytest's default), so ``create_app()`` is
    called once for every test that requests it.
    """
    app = create_app()
    return TestClient(app)


def test_health_endpoint(client: TestClient) -> None:
    """GET on the health route yields 200 and a JSON ``status`` of ``"ok"``."""
    reply = client.get("/api/v1/health")
    assert reply.status_code == 200

    payload = reply.json()
    assert payload["status"] == "ok"


def test_documents_rejects_empty_upload(client: TestClient) -> None:
    """Posting a zero-byte file to the documents route is answered with 400."""
    # (filename, content, content type) — the content is deliberately empty.
    empty_upload = ("empty.pdf", b"", "application/pdf")

    reply = client.post("/api/v1/documents", files={"file": empty_upload})

    assert reply.status_code == 400


def test_documents_rejects_unknown_format(client: TestClient) -> None:
    """Posting bytes that are not a recognised document format returns 400.

    Only the ``client`` fixture is requested: this test patches nothing, so
    it does not ask for ``monkeypatch``.
    """
    response = client.post(
        "/api/v1/documents",
        files={"file": ("x.bin", b"random garbage bytes here", "application/octet-stream")},
    )
    assert response.status_code == 400


def test_documents_returns_pipeline_output(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A successful upload returns the stubbed pipeline output as JSON.

    ``run_pipeline`` is replaced with a stub that hands back a canned
    ``PipelineOutput``; the response body is then compared with that data.
    """
    header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
    )
    canned_output = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=0.97,
        result=ExtractionResult(header=header, confidence=0.97),
    )

    def _stub_pipeline(_content: bytes) -> PipelineOutput:
        # The uploaded bytes are ignored; every call yields the canned output.
        return canned_output

    # Replace the function on the orchestrator module itself ...
    monkeypatch.setattr(orch_module, "run_pipeline", _stub_pipeline)
    from ocr_sprint.api.routes import documents as docs_module

    # ... and the symbol *imported into* the routes module, which is a
    # separate name binding and is not affected by the patch above.
    monkeypatch.setattr(docs_module, "run_pipeline", _stub_pipeline)

    upload = ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")
    reply = client.post("/api/v1/documents", files={"file": upload})
    assert reply.status_code == 200

    payload = reply.json()
    assert payload["status"] == "completed"
    assert payload["confidence"] == 0.97
    assert payload["data"]["header"]["nomor_sprint"] == "Sprin/1/I/2025"