"""HTTP tests for the ground-truth export endpoints.""" from __future__ import annotations import json from datetime import date import pytest from fastapi.testclient import TestClient from ocr_sprint.main import create_app from ocr_sprint.pipeline import orchestrator as orch_module from ocr_sprint.pipeline.orchestrator import PipelineOutput from ocr_sprint.schemas.document import DocumentStatus, SourceKind from ocr_sprint.schemas.extraction import ( ExtractionResult, HeaderFields, PersonnelEntry, ) @pytest.fixture def client() -> TestClient: return TestClient(create_app()) @pytest.fixture def fake_pipeline(monkeypatch: pytest.MonkeyPatch) -> PipelineOutput: result = ExtractionResult( header=HeaderFields( nomor_sprint="Sprin/1/I/2025", tanggal=date(2025, 1, 1), satuan_penerbit="POLRES TEST", ), personel=[ PersonnelEntry(pangkat="AIPDA", nrp="77060000", nama="BUDI", jabatan="ANGGOTA"), ], confidence=0.9, ) output = PipelineOutput( source_kind=SourceKind.PDF, status=DocumentStatus.COMPLETED, confidence=0.9, result=result, ) def _fake_run(_content: bytes) -> PipelineOutput: return output monkeypatch.setattr(orch_module, "run_pipeline", _fake_run) from ocr_sprint.api.routes import documents as docs_module monkeypatch.setattr(docs_module, "run_pipeline", _fake_run) from ocr_sprint.worker import tasks as tasks_module monkeypatch.setattr(tasks_module, "run_pipeline", _fake_run) return output def _create_and_approve(client: TestClient, *, correction_value: str | None = None) -> str: post = client.post( "/api/v1/documents?sync=true", files={"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")}, ) assert post.status_code == 200, post.text jid = str(post.json()["job_id"]) if correction_value is not None: patched = client.patch( f"/api/v1/documents/{jid}", json={"corrections": [{"path": "header.perihal", "value": correction_value}]}, ) assert patched.status_code == 200 approved = client.post(f"/api/v1/documents/{jid}/approve") assert approved.status_code == 200 return jid 
def _nonblank_lines(text: str) -> list[str]:
    """Return the lines of *text* that contain non-whitespace characters."""
    return [line for line in text.splitlines() if line.strip()]


def _export_lines(client: TestClient, query: str = "") -> list[str]:
    """GET the export endpoint and return its non-blank body lines.

    Args:
        client: Test client used for the request.
        query: Raw query string including the leading ``?``, or ``""``.

    Returns:
        The non-blank lines of the response body.
    """
    resp = client.get(f"/api/v1/ground-truth/export{query}")
    # Fail here with the body rather than later on a line-count mismatch.
    assert resp.status_code == 200, resp.text
    return _nonblank_lines(resp.text)


def test_stats_empty_dataset(client: TestClient) -> None:
    """With no jobs created, every stats counter is zero."""
    resp = client.get("/api/v1/ground-truth/stats")
    assert resp.status_code == 200
    body = resp.json()
    assert body["total_jobs"] == 0
    assert body["approved_jobs"] == 0
    assert body["total_corrections"] == 0
    assert body["top_corrected_fields"] == []


def test_stats_rolls_up_counts(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """Stats aggregate job, approval and correction counts across jobs."""
    _create_and_approve(client, correction_value="Penyelidikan-1")
    _create_and_approve(client, correction_value="Penyelidikan-2")
    _create_and_approve(client, correction_value=None)  # pristine

    resp = client.get("/api/v1/ground-truth/stats")
    assert resp.status_code == 200
    body = resp.json()
    assert body["total_jobs"] == 3
    assert body["approved_jobs"] == 3
    assert body["total_corrections"] == 2
    assert body["jobs_with_corrections"] == 2
    assert body["top_corrected_fields"][0]["field_path"] == "header.perihal"
    assert body["top_corrected_fields"][0]["count"] == 2


def test_export_streams_jsonl(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """The export is served as NDJSON with one JSON object per approved job."""
    _create_and_approve(client, correction_value="Penyelidikan")
    _create_and_approve(client, correction_value=None)

    resp = client.get("/api/v1/ground-truth/export")
    assert resp.status_code == 200
    assert resp.headers["content-type"].startswith("application/x-ndjson")

    lines = _nonblank_lines(resp.text)
    assert len(lines) == 2
    parsed = [json.loads(line) for line in lines]
    for sample in parsed:
        assert sample["approved"] is True
        assert "initial_result" in sample
        assert "final_result" in sample


def test_export_approved_only_default(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """Unapproved jobs shouldn't appear in the default export."""
    # One approved, one just completed (no approve call).
    _create_and_approve(client, correction_value=None)
    pending = client.post(
        "/api/v1/documents?sync=true",
        files={"file": ("y.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
    )
    # Without this check a failed upload would only show up as a wrong
    # line count in the approved_only=false assertion below.
    assert pending.status_code == 200, pending.text

    assert len(_export_lines(client)) == 1

    # Toggle approved_only=false to include both.
    assert len(_export_lines(client, "?approved_only=false")) == 2


def test_export_has_corrections_filter(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """``has_corrections`` selects jobs with or without corrections."""
    _create_and_approve(client, correction_value="Penyelidikan")
    _create_and_approve(client, correction_value=None)

    lines = _export_lines(client, "?has_corrections=true")
    assert len(lines) == 1
    assert json.loads(lines[0])["corrections"][0]["new_value"] == "Penyelidikan"

    lines = _export_lines(client, "?has_corrections=false")
    assert len(lines) == 1
    assert json.loads(lines[0])["corrections"] == []


def test_export_respects_limit(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """``limit`` caps the number of exported samples."""
    for _ in range(5):
        _create_and_approve(client, correction_value=None)

    assert len(_export_lines(client, "?limit=2")) == 2