"""End-to-end HTTP tests for the HITL endpoints. We re-use the ``fake_pipeline`` style from ``test_api.py`` so we don't pay the PaddleOCR init cost; the orchestrator is monkey-patched to return a synthetic ``ExtractionResult``. """ from __future__ import annotations from datetime import date import pytest from fastapi.testclient import TestClient from ocr_sprint.main import create_app from ocr_sprint.pipeline import orchestrator as orch_module from ocr_sprint.pipeline.orchestrator import PipelineOutput from ocr_sprint.schemas.document import DocumentStatus, SourceKind from ocr_sprint.schemas.extraction import ( ExtractionResult, HeaderFields, PersonnelEntry, ReviewFlag, ) @pytest.fixture def client() -> TestClient: return TestClient(create_app()) @pytest.fixture def fake_pipeline(monkeypatch: pytest.MonkeyPatch) -> PipelineOutput: result = ExtractionResult( header=HeaderFields( nomor_sprint="Sprin/1/I/2025", tanggal=date(2025, 1, 1), satuan_penerbit="POLRES TEST", perihal=None, # intentional gap so a PATCH can fill it ), personel=[ PersonnelEntry(pangkat="AIPDA", nrp="77060000", nama="BUDI", jabatan="ANGGOTA"), ], review_flags=[ReviewFlag.MISSING_FIELD], confidence=0.7, ) output = PipelineOutput( source_kind=SourceKind.PDF, status=DocumentStatus.NEEDS_REVIEW, confidence=0.7, result=result, ) def _fake_run(_content: bytes) -> PipelineOutput: return output monkeypatch.setattr(orch_module, "run_pipeline", _fake_run) from ocr_sprint.api.routes import documents as docs_module monkeypatch.setattr(docs_module, "run_pipeline", _fake_run) from ocr_sprint.worker import tasks as tasks_module monkeypatch.setattr(tasks_module, "run_pipeline", _fake_run) return output def _create_job(client: TestClient) -> str: post = client.post( "/api/v1/documents?sync=true", files={"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")}, ) assert post.status_code == 200, post.text body = post.json() assert body["status"] == "needs_review" return str(body["job_id"]) def test_patch_applies_correction_and_clears_missing_field( client: TestClient, fake_pipeline: PipelineOutput, ) -> None: job_id = _create_job(client) patched = client.patch( f"/api/v1/documents/{job_id}", json={ "corrections": [ { "path": "header.perihal", "value": "Penyelidikan kasus X", "reason": "LLM missed it", } ] }, headers={"X-User-Id": "reviewer-a"}, ) assert patched.status_code == 200, patched.text body = patched.json() assert body["data"]["header"]["perihal"] == "Penyelidikan kasus X" # The fake pipeline has both required header fields filled, so the # ``missing_field`` flag is auto-cleared as soon as any correction # lands (the policy re-evaluates required-field coverage on every # edit). assert "missing_field" not in body["review_flags"] def test_patch_returns_400_for_unknown_path( client: TestClient, fake_pipeline: PipelineOutput, ) -> None: job_id = _create_job(client) resp = client.patch( f"/api/v1/documents/{job_id}", json={"corrections": [{"path": "bogus.field", "value": "x"}]}, ) assert resp.status_code == 400 def test_patch_is_atomic_on_partial_failure( client: TestClient, fake_pipeline: PipelineOutput, ) -> None: job_id = _create_job(client) resp = client.patch( f"/api/v1/documents/{job_id}", json={ "corrections": [ {"path": "header.perihal", "value": "OK"}, {"path": "bogus.root", "value": "X"}, ] }, ) assert resp.status_code == 400 # The first correction must not have persisted. got = client.get(f"/api/v1/documents/{job_id}") assert got.json()["data"]["header"]["perihal"] is None def test_history_returns_corrections_in_order( client: TestClient, fake_pipeline: PipelineOutput, ) -> None: job_id = _create_job(client) client.patch( f"/api/v1/documents/{job_id}", json={"corrections": [{"path": "header.perihal", "value": "first"}]}, headers={"X-User-Id": "reviewer-a"}, ) client.patch( f"/api/v1/documents/{job_id}", json={"corrections": [{"path": "header.perihal", "value": "second"}]}, headers={"X-User-Id": "reviewer-b"}, ) history = client.get(f"/api/v1/documents/{job_id}/history") assert history.status_code == 200 events = history.json() assert [e["new_value"] for e in events] == ["first", "second"] assert [e["corrected_by"] for e in events] == ["reviewer-a", "reviewer-b"] # old_value of the second event should reflect the first edit. assert events[1]["old_value"] == "first" def test_history_returns_empty_list_for_untouched_job( client: TestClient, fake_pipeline: PipelineOutput, ) -> None: job_id = _create_job(client) history = client.get(f"/api/v1/documents/{job_id}/history") assert history.status_code == 200 assert history.json() == [] def test_history_returns_404_for_unknown_job(client: TestClient) -> None: resp = client.get("/api/v1/documents/00000000-0000-0000-0000-000000000000/history") assert resp.status_code == 404 def test_approve_locks_subsequent_patches( client: TestClient, fake_pipeline: PipelineOutput, ) -> None: job_id = _create_job(client) approved = client.post( f"/api/v1/documents/{job_id}/approve", headers={"X-User-Id": "reviewer-a"}, ) assert approved.status_code == 200, approved.text body = approved.json() assert body["approved"] is True assert body["reviewed_by"] == "reviewer-a" assert body["reviewed_at"] # non-empty timestamp # GET reflects the approval state. got = client.get(f"/api/v1/documents/{job_id}").json() assert got["approved"] is True # PATCH after approve must be rejected with 409. patched = client.patch( f"/api/v1/documents/{job_id}", json={"corrections": [{"path": "header.perihal", "value": "X"}]}, ) assert patched.status_code == 409 def test_approve_is_idempotent( client: TestClient, fake_pipeline: PipelineOutput, ) -> None: job_id = _create_job(client) first = client.post( f"/api/v1/documents/{job_id}/approve", headers={"X-User-Id": "reviewer-a"}, ) second = client.post( f"/api/v1/documents/{job_id}/approve", headers={"X-User-Id": "reviewer-b"}, ) assert first.status_code == 200 assert second.status_code == 200 # Second approve must NOT change the attribution. (SQLite drops tzinfo # on roundtrip, which changes Pydantic's serialization between the two # calls; compare the naive components.) assert second.json()["reviewed_by"] == "reviewer-a" assert ( second.json()["reviewed_at"].rstrip("Z").split("+")[0] == (first.json()["reviewed_at"].rstrip("Z").split("+")[0]) ) def test_patch_requires_at_least_one_correction( client: TestClient, fake_pipeline: PipelineOutput, ) -> None: job_id = _create_job(client) resp = client.patch( f"/api/v1/documents/{job_id}", json={"corrections": []}, ) assert resp.status_code == 422 # Pydantic min_length=1 violation def test_patch_missing_job_returns_404(client: TestClient) -> None: resp = client.patch( "/api/v1/documents/00000000-0000-0000-0000-000000000000", json={"corrections": [{"path": "header.perihal", "value": "X"}]}, ) assert resp.status_code == 404