Phase 7: ground-truth export (JSONL + stats) + CLI tool
- GET /api/v1/ground-truth/export: streaming JSONL (approved_only, since, until, has_corrections, limit)
- GET /api/v1/ground-truth/stats: total / approved / corrections counts + top-N most-corrected field paths
- python -m ocr_sprint.tools.export_ground_truth: operator CLI with the same filters + optional --print-stats
- Ground-truth sample reconstructs the pipeline's original output by replaying job_corrections in reverse
- docs/ground-truth-format.md: schema + fine-tuning guidance
- 17 new tests (service replay, endpoint filters, CLI)
- 201 total tests passing, ruff / mypy --strict clean

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
158
tests/unit/test_api_ground_truth.py
Normal file
158
tests/unit/test_api_ground_truth.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""HTTP tests for the ground-truth export endpoints."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import date
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from ocr_sprint.main import create_app
|
||||
from ocr_sprint.pipeline import orchestrator as orch_module
|
||||
from ocr_sprint.pipeline.orchestrator import PipelineOutput
|
||||
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
||||
from ocr_sprint.schemas.extraction import (
|
||||
ExtractionResult,
|
||||
HeaderFields,
|
||||
PersonnelEntry,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def client() -> TestClient:
    """Provide a ``TestClient`` wrapping a newly created application instance."""
    app = create_app()
    return TestClient(app)
|
||||
|
||||
|
||||
@pytest.fixture
def fake_pipeline(monkeypatch: pytest.MonkeyPatch) -> PipelineOutput:
    """Swap ``run_pipeline`` for a stub that always returns one canned output.

    The stub is installed on the orchestrator module, the documents route
    module, and the worker tasks module, so each of those ``run_pipeline``
    attributes resolves to the same fake for the duration of the test.

    Returns:
        The canned ``PipelineOutput`` that the stub hands back on every call.
    """
    header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
    )
    officer = PersonnelEntry(pangkat="AIPDA", nrp="77060000", nama="BUDI", jabatan="ANGGOTA")
    extraction = ExtractionResult(header=header, personel=[officer], confidence=0.9)
    canned = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=0.9,
        result=extraction,
    )

    def _fake_run(_content: bytes) -> PipelineOutput:
        # The uploaded bytes are ignored; every call yields the same object.
        return canned

    monkeypatch.setattr(orch_module, "run_pipeline", _fake_run)

    # NOTE(review): these imports are function-local in the original; the
    # reason is not visible from this file, so the placement is preserved.
    from ocr_sprint.api.routes import documents as docs_module

    monkeypatch.setattr(docs_module, "run_pipeline", _fake_run)

    from ocr_sprint.worker import tasks as tasks_module

    monkeypatch.setattr(tasks_module, "run_pipeline", _fake_run)

    return canned
|
||||
|
||||
|
||||
def _create_and_approve(client: TestClient, *, correction_value: str | None = None) -> str:
    """Upload a fake PDF synchronously, optionally correct it, then approve it.

    Args:
        client: Test client used to issue the HTTP requests.
        correction_value: When not ``None``, a single correction setting
            ``header.perihal`` to this value is PATCHed before approval.
            When ``None``, the job is approved without any correction.

    Returns:
        The ``job_id`` from the upload response, converted to ``str``.
    """
    post = client.post(
        "/api/v1/documents?sync=true",
        files={"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
    )
    assert post.status_code == 200, post.text
    jid = str(post.json()["job_id"])

    if correction_value is not None:
        patched = client.patch(
            f"/api/v1/documents/{jid}",
            json={"corrections": [{"path": "header.perihal", "value": correction_value}]},
        )
        # Include the response body so a failing setup step is diagnosable,
        # matching the upload assertion above.
        assert patched.status_code == 200, patched.text

    approved = client.post(f"/api/v1/documents/{jid}/approve")
    assert approved.status_code == 200, approved.text
    return jid
|
||||
|
||||
|
||||
def test_stats_empty_dataset(client: TestClient) -> None:
    """Stats report zero counts and no top fields when this test creates no jobs."""
    response = client.get("/api/v1/ground-truth/stats")
    assert response.status_code == 200

    stats = response.json()
    for counter in ("total_jobs", "approved_jobs", "total_corrections"):
        assert stats[counter] == 0
    assert stats["top_corrected_fields"] == []
|
||||
|
||||
|
||||
def test_stats_rolls_up_counts(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """Stats aggregate job, approval and correction counts across three jobs."""
    # Two corrected jobs followed by one pristine job (no correction).
    for value in ("Penyelidikan-1", "Penyelidikan-2", None):
        _create_and_approve(client, correction_value=value)

    response = client.get("/api/v1/ground-truth/stats")
    assert response.status_code == 200

    stats = response.json()
    assert stats["total_jobs"] == 3
    assert stats["approved_jobs"] == 3
    assert stats["total_corrections"] == 2
    assert stats["jobs_with_corrections"] == 2

    # Both corrections target the same path, so it leads the ranking.
    assert stats["top_corrected_fields"][0]["field_path"] == "header.perihal"
    assert stats["top_corrected_fields"][0]["count"] == 2
|
||||
|
||||
|
||||
def test_export_streams_jsonl(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """The export returns one NDJSON record per approved job with both result snapshots."""
    _create_and_approve(client, correction_value="Penyelidikan")
    _create_and_approve(client, correction_value=None)

    response = client.get("/api/v1/ground-truth/export")
    assert response.status_code == 200
    assert response.headers["content-type"].startswith("application/x-ndjson")

    # Keep only lines that still have content after stripping whitespace.
    rows = list(filter(str.strip, response.text.splitlines()))
    assert len(rows) == 2

    records = list(map(json.loads, rows))
    for record in records:
        assert record["approved"] is True
        assert "initial_result" in record
        assert "final_result" in record
|
||||
|
||||
|
||||
def test_export_approved_only_default(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """Unapproved jobs shouldn't appear in the default export."""
    # One approved, one just completed (no approve call).
    _create_and_approve(client, correction_value=None)
    unapproved = client.post(
        "/api/v1/documents?sync=true",
        files={"file": ("y.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
    )
    # Fail at the setup step, not at the line counts below, if the second
    # upload was rejected.
    assert unapproved.status_code == 200, unapproved.text

    resp = client.get("/api/v1/ground-truth/export")
    # Without this check a one-line error body would satisfy the count below.
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 1

    # Toggle approved_only=false to include both.
    resp = client.get("/api/v1/ground-truth/export?approved_only=false")
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 2
|
||||
|
||||
|
||||
def test_export_has_corrections_filter(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """``has_corrections`` selects only corrected (true) or only pristine (false) jobs."""
    _create_and_approve(client, correction_value="Penyelidikan")
    _create_and_approve(client, correction_value=None)

    resp = client.get("/api/v1/ground-truth/export?has_corrections=true")
    # Surface the response body on failure instead of a KeyError further down.
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 1
    assert json.loads(lines[0])["corrections"][0]["new_value"] == "Penyelidikan"

    resp = client.get("/api/v1/ground-truth/export?has_corrections=false")
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 1
    assert json.loads(lines[0])["corrections"] == []
|
||||
|
||||
|
||||
def test_export_respects_limit(client: TestClient, fake_pipeline: PipelineOutput) -> None:
    """``limit=2`` caps the export at two records even though five jobs are approved."""
    for _ in range(5):
        _create_and_approve(client, correction_value=None)

    resp = client.get("/api/v1/ground-truth/export?limit=2")
    # Distinguish a rejected request from a wrong record count.
    assert resp.status_code == 200, resp.text
    lines = [line for line in resp.text.splitlines() if line.strip()]
    assert len(lines) == 2
|
||||
Reference in New Issue
Block a user