OCR-SPRIN-SERVICE/tests/unit/test_api_hitl.py

"""End-to-end HTTP tests for the HITL endpoints.

We re-use the ``fake_pipeline`` style from ``test_api.py`` so we don't pay
the PaddleOCR init cost; the orchestrator is monkey-patched to return a
synthetic ``ExtractionResult``.
"""

from __future__ import annotations

from datetime import date

import pytest
from fastapi.testclient import TestClient

from ocr_sprint.main import create_app
from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import PipelineOutput
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
from ocr_sprint.schemas.extraction import (
    ExtractionResult,
    HeaderFields,
    PersonnelEntry,
    ReviewFlag,
)


@pytest.fixture
def client() -> TestClient:
    """Provide a ``TestClient`` wrapping a freshly created application."""
    app = create_app()
    return TestClient(app)


@pytest.fixture
def fake_pipeline(monkeypatch: pytest.MonkeyPatch) -> PipelineOutput:
    """Swap ``run_pipeline`` for a stub that returns a canned output.

    The canned output is a ``NEEDS_REVIEW`` PDF result whose header has no
    ``perihal`` and which carries the ``MISSING_FIELD`` review flag. The
    stub is installed on the orchestrator module, the documents route
    module and the worker tasks module.

    Returns:
        The ``PipelineOutput`` instance that every stubbed call hands back.
    """
    score = 0.7
    header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
        # Deliberately left empty so a PATCH has something to fill in.
        perihal=None,
    )
    officer = PersonnelEntry(
        pangkat="AIPDA",
        nrp="77060000",
        nama="BUDI",
        jabatan="ANGGOTA",
    )
    extraction = ExtractionResult(
        header=header,
        personel=[officer],
        review_flags=[ReviewFlag.MISSING_FIELD],
        confidence=score,
    )
    canned = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.NEEDS_REVIEW,
        confidence=score,
        result=extraction,
    )

    def _canned_run(_content: bytes) -> PipelineOutput:
        # The uploaded bytes are ignored; every call yields the same object.
        return canned

    # Import/patch order is kept interleaved: each module is patched right
    # after it is imported, orchestrator first.
    monkeypatch.setattr(orch_module, "run_pipeline", _canned_run)

    from ocr_sprint.api.routes import documents as docs_module

    monkeypatch.setattr(docs_module, "run_pipeline", _canned_run)

    from ocr_sprint.worker import tasks as tasks_module

    monkeypatch.setattr(tasks_module, "run_pipeline", _canned_run)
    return canned


def _create_job(client: TestClient) -> str:
    """Upload a stub PDF synchronously and return the resulting job id.

    Asserts that the upload answered 200 and that the job's status is
    ``needs_review`` before returning.
    """
    upload = {"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")}
    response = client.post("/api/v1/documents?sync=true", files=upload)
    assert response.status_code == 200, response.text
    payload = response.json()
    assert payload["status"] == "needs_review"
    return str(payload["job_id"])


def test_patch_applies_correction_and_clears_missing_field(
    client: TestClient,
    fake_pipeline: PipelineOutput,
) -> None:
    """A PATCH on ``header.perihal`` stores the value and drops ``missing_field``."""
    job_id = _create_job(client)
    correction = {
        "path": "header.perihal",
        "value": "Penyelidikan kasus X",
        "reason": "LLM missed it",
    }
    response = client.patch(
        f"/api/v1/documents/{job_id}",
        json={"corrections": [correction]},
        headers={"X-User-Id": "reviewer-a"},
    )
    assert response.status_code == 200, response.text
    payload = response.json()
    assert payload["data"]["header"]["perihal"] == "Penyelidikan kasus X"
    # Required-field coverage is re-evaluated on every edit, and the fake
    # pipeline already has its required header fields filled, so the
    # ``missing_field`` flag is expected to be cleared once any correction
    # lands.
    assert "missing_field" not in payload["review_flags"]


def test_patch_returns_400_for_unknown_path(
    client: TestClient,
    fake_pipeline: PipelineOutput,
) -> None:
    """A correction aimed at an unrecognised path is answered with 400."""
    job_id = _create_job(client)
    bad_correction = {"path": "bogus.field", "value": "x"}
    response = client.patch(
        f"/api/v1/documents/{job_id}",
        json={"corrections": [bad_correction]},
    )
    assert response.status_code == 400


def test_patch_is_atomic_on_partial_failure(
    client: TestClient,
    fake_pipeline: PipelineOutput,
) -> None:
    """A batch containing one invalid correction leaves the document unchanged."""
    job_id = _create_job(client)
    document_url = f"/api/v1/documents/{job_id}"
    batch = [
        {"path": "header.perihal", "value": "OK"},
        {"path": "bogus.root", "value": "X"},
    ]
    rejected = client.patch(document_url, json={"corrections": batch})
    assert rejected.status_code == 400

    # The valid first correction must have been rolled back with the batch.
    stored = client.get(document_url).json()
    assert stored["data"]["header"]["perihal"] is None


def test_history_returns_corrections_in_order(
    client: TestClient,
    fake_pipeline: PipelineOutput,
) -> None:
    """History lists each correction in the order it was applied.

    Two reviewers edit ``header.perihal`` one after the other; the history
    endpoint must report both events with the right attribution, and the
    second event's ``old_value`` must be the first edit's value.
    """
    job_id = _create_job(client)
    document_url = f"/api/v1/documents/{job_id}"
    edits = [("first", "reviewer-a"), ("second", "reviewer-b")]
    for value, reviewer in edits:
        patched = client.patch(
            document_url,
            json={"corrections": [{"path": "header.perihal", "value": value}]},
            headers={"X-User-Id": reviewer},
        )
        # Fail here, with the response body, if a setup edit is rejected,
        # rather than later as an unexplained history mismatch.
        assert patched.status_code == 200, patched.text

    history = client.get(f"{document_url}/history")
    assert history.status_code == 200
    events = history.json()
    assert [e["new_value"] for e in events] == ["first", "second"]
    assert [e["corrected_by"] for e in events] == ["reviewer-a", "reviewer-b"]
    # old_value of the second event should reflect the first edit.
    assert events[1]["old_value"] == "first"


def test_history_returns_empty_list_for_untouched_job(
    client: TestClient,
    fake_pipeline: PipelineOutput,
) -> None:
    """A job that was never corrected has an empty history list."""
    job_id = _create_job(client)
    response = client.get(f"/api/v1/documents/{job_id}/history")
    assert response.status_code == 200
    events = response.json()
    assert events == []


def test_history_returns_404_for_unknown_job(client: TestClient) -> None:
    """Requesting history for a job id that was never created yields 404."""
    unknown_job = "00000000-0000-0000-0000-000000000000"
    response = client.get(f"/api/v1/documents/{unknown_job}/history")
    assert response.status_code == 404


def test_approve_locks_subsequent_patches(
    client: TestClient,
    fake_pipeline: PipelineOutput,
) -> None:
    """Approving a job records the reviewer and blocks later corrections."""
    job_id = _create_job(client)
    document_url = f"/api/v1/documents/{job_id}"

    approval = client.post(
        f"{document_url}/approve",
        headers={"X-User-Id": "reviewer-a"},
    )
    assert approval.status_code == 200, approval.text
    receipt = approval.json()
    assert receipt["approved"] is True
    assert receipt["reviewed_by"] == "reviewer-a"
    assert receipt["reviewed_at"]  # timestamp must be non-empty

    # A fresh GET must show the job as approved.
    fetched = client.get(document_url)
    document = fetched.json()
    assert document["approved"] is True

    # Once approved, a further PATCH is refused with 409.
    late_edit = client.patch(
        document_url,
        json={"corrections": [{"path": "header.perihal", "value": "X"}]},
    )
    assert late_edit.status_code == 409


def test_approve_is_idempotent(
    client: TestClient,
    fake_pipeline: PipelineOutput,
) -> None:
    """A second approve succeeds but keeps the first reviewer and timestamp."""

    def _naive(timestamp: str) -> str:
        # Strip a trailing "Z" and anything from the first "+" onwards so
        # only the naive date/time components are compared.
        return timestamp.rstrip("Z").split("+")[0]

    job_id = _create_job(client)
    approve_url = f"/api/v1/documents/{job_id}/approve"
    first = client.post(approve_url, headers={"X-User-Id": "reviewer-a"})
    second = client.post(approve_url, headers={"X-User-Id": "reviewer-b"})
    assert first.status_code == 200
    assert second.status_code == 200

    # The repeat approval must leave the attribution untouched. SQLite
    # drops tzinfo on roundtrip, which changes Pydantic's serialization
    # between the two calls, hence the naive comparison.
    original = first.json()
    repeat = second.json()
    assert repeat["reviewed_by"] == "reviewer-a"
    assert _naive(repeat["reviewed_at"]) == _naive(original["reviewed_at"])


def test_patch_requires_at_least_one_correction(
    client: TestClient,
    fake_pipeline: PipelineOutput,
) -> None:
    """An empty ``corrections`` list is rejected with 422."""
    job_id = _create_job(client)
    empty_payload = {"corrections": []}
    response = client.patch(f"/api/v1/documents/{job_id}", json=empty_payload)
    # 422 comes from the Pydantic min_length=1 constraint being violated.
    assert response.status_code == 422


def test_patch_missing_job_returns_404(client: TestClient) -> None:
    """PATCHing a job id that was never created yields 404."""
    unknown_job = "00000000-0000-0000-0000-000000000000"
    correction = {"path": "header.perihal", "value": "X"}
    response = client.patch(
        f"/api/v1/documents/{unknown_job}",
        json={"corrections": [correction]},
    )
    assert response.status_code == 404