Files
OCR-SPRIN-SERVICE/tests/unit/test_api_ground_truth.py
Devin AI 6003d96a94 Phase 7: ground-truth export (JSONL + stats) + CLI tool
- GET /api/v1/ground-truth/export  streaming JSONL (approved_only,
  since, until, has_corrections, limit)
- GET /api/v1/ground-truth/stats   total / approved / corrections
  counts + top-N most-corrected field paths
- python -m ocr_sprint.tools.export_ground_truth  operator CLI with
  the same filters + optional --print-stats
- Ground-truth sample reconstructs the pipeline's original output by
  replaying job_corrections in reverse
- docs/ground-truth-format.md    schema + fine-tuning guidance
- 17 new tests (service replay, endpoint filters, CLI)
- 201 total tests passing, ruff / mypy --strict clean

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 20:24:40 +00:00

159 lines
5.7 KiB
Python

"""HTTP tests for the ground-truth export endpoints."""
from __future__ import annotations
import json
from datetime import date
import pytest
from fastapi.testclient import TestClient
from ocr_sprint.main import create_app
from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import PipelineOutput
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
from ocr_sprint.schemas.extraction import (
ExtractionResult,
HeaderFields,
PersonnelEntry,
)
@pytest.fixture
def client() -> TestClient:
    """Provide a ``TestClient`` bound to a newly created application."""
    app = create_app()
    return TestClient(app)
@pytest.fixture
def fake_pipeline(monkeypatch: pytest.MonkeyPatch) -> PipelineOutput:
    """Swap ``run_pipeline`` for a stub that always returns one canned output.

    The stub is installed under the name ``run_pipeline`` on the orchestrator
    module, the ``documents`` route module and the worker ``tasks`` module.
    The canned ``PipelineOutput`` is returned so tests can refer to it.
    """
    sprint_header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
    )
    officer = PersonnelEntry(
        pangkat="AIPDA",
        nrp="77060000",
        nama="BUDI",
        jabatan="ANGGOTA",
    )
    extraction = ExtractionResult(
        header=sprint_header,
        personel=[officer],
        confidence=0.9,
    )
    canned = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=0.9,
        result=extraction,
    )

    def _fake_run(_content: bytes) -> PipelineOutput:
        # The uploaded bytes are ignored; every call yields the same output.
        return canned

    monkeypatch.setattr(orch_module, "run_pipeline", _fake_run)

    # Local imports: these modules are resolved only after the orchestrator
    # has been patched, and each one is patched right after it is imported.
    from ocr_sprint.api.routes import documents as docs_module

    monkeypatch.setattr(docs_module, "run_pipeline", _fake_run)

    from ocr_sprint.worker import tasks as tasks_module

    monkeypatch.setattr(tasks_module, "run_pipeline", _fake_run)

    return canned
def _create_and_approve(client: TestClient, *, correction_value: str | None = None) -> str:
    """Upload a fake PDF synchronously, optionally correct it, then approve it.

    When ``correction_value`` is not ``None``, a single correction for
    ``header.perihal`` is PATCHed onto the job before the approve call.
    Every request is asserted to answer with HTTP 200.

    Returns:
        The job id from the upload response, as a string.
    """
    upload = client.post(
        "/api/v1/documents?sync=true",
        files={"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
    )
    assert upload.status_code == 200, upload.text

    job_id = str(upload.json()["job_id"])
    job_url = f"/api/v1/documents/{job_id}"

    if correction_value is not None:
        correction = {"path": "header.perihal", "value": correction_value}
        patch_resp = client.patch(job_url, json={"corrections": [correction]})
        assert patch_resp.status_code == 200

    approve_resp = client.post(f"{job_url}/approve")
    assert approve_resp.status_code == 200
    return job_id
def test_stats_empty_dataset(client: TestClient) -> None:
    """With no jobs created, stats report zero counts and no corrected fields."""
    response = client.get("/api/v1/ground-truth/stats")
    assert response.status_code == 200

    stats = response.json()
    for counter in ("total_jobs", "approved_jobs", "total_corrections"):
        assert stats[counter] == 0
    assert stats["top_corrected_fields"] == []
def test_stats_rolls_up_counts(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """Three approved jobs, two of them corrected, roll up into the stats."""
    # Two jobs get one correction each; the final ``None`` leaves a pristine job.
    for value in ("Penyelidikan-1", "Penyelidikan-2", None):
        _create_and_approve(client, correction_value=value)

    response = client.get("/api/v1/ground-truth/stats")
    assert response.status_code == 200

    stats = response.json()
    expected_counts = {
        "total_jobs": 3,
        "approved_jobs": 3,
        "total_corrections": 2,
        "jobs_with_corrections": 2,
    }
    for key, expected in expected_counts.items():
        assert stats[key] == expected

    # Both corrections target the same path, so it leads the ranking.
    most_corrected = stats["top_corrected_fields"][0]
    assert most_corrected["field_path"] == "header.perihal"
    assert most_corrected["count"] == 2
def test_export_streams_jsonl(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """The export answers with NDJSON: one JSON sample per approved job."""
    _create_and_approve(client, correction_value="Penyelidikan")
    _create_and_approve(client, correction_value=None)

    response = client.get("/api/v1/ground-truth/export")
    assert response.status_code == 200
    content_type = response.headers["content-type"]
    assert content_type.startswith("application/x-ndjson")

    # Ignore blank lines so only real records are counted.
    records = [row for row in response.text.splitlines() if row.strip()]
    assert len(records) == 2

    samples = list(map(json.loads, records))
    for sample in samples:
        assert sample["approved"] is True
        assert "initial_result" in sample
        assert "final_result" in sample
def test_export_approved_only_default(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """Unapproved jobs shouldn't appear in the default export.

    One job is created and approved, a second is only uploaded. The default
    export must contain just the approved one, while ``approved_only=false``
    must contain both.
    """
    # One approved, one just completed (no approve call).
    _create_and_approve(client, correction_value=None)
    unapproved = client.post(
        "/api/v1/documents?sync=true",
        files={"file": ("y.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
    )
    # Fail right here (with the response body) if the upload was rejected;
    # otherwise the first count below would pass without any unapproved job
    # existing and the problem would only surface as a confusing mismatch.
    assert unapproved.status_code == 200, unapproved.text

    resp = client.get("/api/v1/ground-truth/export")
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 1

    # Toggle approved_only=false to include both.
    resp = client.get("/api/v1/ground-truth/export?approved_only=false")
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 2
def test_export_has_corrections_filter(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """``has_corrections`` splits the export into corrected and pristine jobs.

    One corrected and one uncorrected job are approved; each value of the
    filter must return exactly the matching one.
    """
    _create_and_approve(client, correction_value="Penyelidikan")
    _create_and_approve(client, correction_value=None)

    resp = client.get("/api/v1/ground-truth/export?has_corrections=true")
    # Check the status first so an error response is reported as such rather
    # than as a line-count or JSON-decoding failure further down.
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 1
    assert json.loads(lines[0])["corrections"][0]["new_value"] == "Penyelidikan"

    resp = client.get("/api/v1/ground-truth/export?has_corrections=false")
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 1
    assert json.loads(lines[0])["corrections"] == []
def test_export_respects_limit(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """``limit=2`` caps the export at two lines even with five approved jobs."""
    for _ in range(5):
        _create_and_approve(client, correction_value=None)

    resp = client.get("/api/v1/ground-truth/export?limit=2")
    # Check the status first so an error response is not misreported as a
    # wrong number of exported lines.
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 2