Phase 7: ground-truth export (JSONL + stats) + CLI tool

- GET /api/v1/ground-truth/export: streaming JSONL (approved_only, since, until, has_corrections, limit)
- GET /api/v1/ground-truth/stats: total / approved / corrections counts + top-N most-corrected field paths
- python -m ocr_sprint.tools.export_ground_truth: operator CLI with the same filters + optional --print-stats
- Ground-truth sample reconstructs the pipeline's original output by replaying job_corrections in reverse
- docs/ground-truth-format.md: schema + fine-tuning guidance
- 17 new tests (service replay, endpoint filters, CLI)
- 201 total tests passing, ruff / mypy --strict clean

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 20:24:40 +00:00
parent 9457fa3c55
commit 6003d96a94
11 changed files with 1148 additions and 1 deletions
--- a/tests/unit/test_api_ground_truth.py
+++ b/tests/unit/test_api_ground_truth.py
@@ -0,0 +1,158 @@
+"""HTTP tests for the ground-truth export endpoints."""
+
+from __future__ import annotations
+
+import json
+from datetime import date
+
+import pytest
+from fastapi.testclient import TestClient
+
+from ocr_sprint.main import create_app
+from ocr_sprint.pipeline import orchestrator as orch_module
+from ocr_sprint.pipeline.orchestrator import PipelineOutput
+from ocr_sprint.schemas.document import DocumentStatus, SourceKind
+from ocr_sprint.schemas.extraction import (
+    ExtractionResult,
+    HeaderFields,
+    PersonnelEntry,
+)
+
+
+@pytest.fixture
+def client() -> TestClient:
+    """Return a ``TestClient`` wrapping a freshly created application."""
+    app = create_app()
+    return TestClient(app)
+
+
+@pytest.fixture
+def fake_pipeline(monkeypatch: pytest.MonkeyPatch) -> PipelineOutput:
+    """Replace ``run_pipeline`` with a stub that returns a fixed completed output.
+
+    The stub is installed under the ``run_pipeline`` attribute of the
+    orchestrator module, the ``documents`` route module and the worker
+    ``tasks`` module; ``monkeypatch`` undoes all three at teardown.
+
+    Returns:
+        The canned ``PipelineOutput`` that every stubbed call returns.
+    """
+    # Import every patch target *before* patching anything. If one of these
+    # modules were imported for the first time after the orchestrator had been
+    # patched, and it binds ``run_pipeline`` with a from-import (presumably it
+    # does -- verify against the modules), it would capture the stub as its
+    # "original" and monkeypatch would restore the stub instead of the real
+    # function on teardown.
+    from ocr_sprint.api.routes import documents as docs_module
+    from ocr_sprint.worker import tasks as tasks_module
+
+    result = ExtractionResult(
+        header=HeaderFields(
+            nomor_sprint="Sprin/1/I/2025",
+            tanggal=date(2025, 1, 1),
+            satuan_penerbit="POLRES TEST",
+        ),
+        personel=[
+            PersonnelEntry(pangkat="AIPDA", nrp="77060000", nama="BUDI", jabatan="ANGGOTA"),
+        ],
+        confidence=0.9,
+    )
+    output = PipelineOutput(
+        source_kind=SourceKind.PDF,
+        status=DocumentStatus.COMPLETED,
+        confidence=0.9,
+        result=result,
+    )
+
+    def _fake_run(_content: bytes) -> PipelineOutput:
+        # The uploaded bytes are ignored; every call yields the same output.
+        return output
+
+    # Same patch order as before: orchestrator, route module, worker module.
+    for module in (orch_module, docs_module, tasks_module):
+        monkeypatch.setattr(module, "run_pipeline", _fake_run)
+    return output
+
+
+def _create_and_approve(client: TestClient, *, correction_value: str | None = None) -> str:
+    """Upload a document synchronously, optionally correct it, then approve it.
+
+    Args:
+        client: Test client used for all three requests.
+        correction_value: When not ``None``, a single correction setting
+            ``header.perihal`` to this value is PATCHed before approval.
+
+    Returns:
+        The ``job_id`` from the upload response, as a string.
+    """
+    post = client.post(
+        "/api/v1/documents?sync=true",
+        files={"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
+    )
+    assert post.status_code == 200, post.text
+    jid = str(post.json()["job_id"])
+    if correction_value is not None:
+        patched = client.patch(
+            f"/api/v1/documents/{jid}",
+            json={"corrections": [{"path": "header.perihal", "value": correction_value}]},
+        )
+        # Include the response body on failure, as the upload assertion does.
+        assert patched.status_code == 200, patched.text
+    approved = client.post(f"/api/v1/documents/{jid}/approve")
+    assert approved.status_code == 200, approved.text
+    return jid
+
+
+def test_stats_empty_dataset(client: TestClient) -> None:
+    """When the test creates no jobs, stats counters are zero and the top list is empty."""
+    response = client.get("/api/v1/ground-truth/stats")
+    assert response.status_code == 200
+    stats = response.json()
+    for counter in ("total_jobs", "approved_jobs", "total_corrections"):
+        assert stats[counter] == 0
+    assert stats["top_corrected_fields"] == []
+
+
+def test_stats_rolls_up_counts(client: TestClient, fake_pipeline: PipelineOutput) -> None:
+    """Three approved jobs, two of them corrected, are reflected in the stats."""
+    # Two corrected jobs followed by one pristine (uncorrected) job.
+    for correction in ("Penyelidikan-1", "Penyelidikan-2", None):
+        _create_and_approve(client, correction_value=correction)
+
+    response = client.get("/api/v1/ground-truth/stats")
+    assert response.status_code == 200
+    stats = response.json()
+    expected_counts = {
+        "total_jobs": 3,
+        "approved_jobs": 3,
+        "total_corrections": 2,
+        "jobs_with_corrections": 2,
+    }
+    for key, expected in expected_counts.items():
+        assert stats[key] == expected
+    # Both corrections target the same path, so it is the first entry.
+    top_field = stats["top_corrected_fields"][0]
+    assert top_field["field_path"] == "header.perihal"
+    assert top_field["count"] == 2
+
+
+def test_export_streams_jsonl(client: TestClient, fake_pipeline: PipelineOutput) -> None:
+    """The export returns one NDJSON record per approved job."""
+    _create_and_approve(client, correction_value="Penyelidikan")
+    _create_and_approve(client, correction_value=None)
+
+    response = client.get("/api/v1/ground-truth/export")
+    assert response.status_code == 200
+    content_type = response.headers["content-type"]
+    assert content_type.startswith("application/x-ndjson")
+    # Ignore blank lines so only actual records are counted.
+    records = list(filter(str.strip, response.text.splitlines()))
+    assert len(records) == 2
+    for record in map(json.loads, records):
+        assert record["approved"] is True
+        for key in ("initial_result", "final_result"):
+            assert key in record
+
+
+def test_export_approved_only_default(client: TestClient, fake_pipeline: PipelineOutput) -> None:
+    """Unapproved jobs shouldn't appear in the default export."""
+    # One approved, one just completed (no approve call).
+    _create_and_approve(client, correction_value=None)
+    unapproved = client.post(
+        "/api/v1/documents?sync=true",
+        files={"file": ("y.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
+    )
+    # Without this check a failed upload would let the default-export assertion
+    # below pass without any unapproved job existing to be filtered out.
+    assert unapproved.status_code == 200, unapproved.text
+
+    resp = client.get("/api/v1/ground-truth/export")
+    assert resp.status_code == 200, resp.text
+    lines = [line for line in resp.text.splitlines() if line.strip()]
+    assert len(lines) == 1
+
+    # Toggle approved_only=false to include both.
+    resp = client.get("/api/v1/ground-truth/export?approved_only=false")
+    assert resp.status_code == 200, resp.text
+    lines = [line for line in resp.text.splitlines() if line.strip()]
+    assert len(lines) == 2
+
+
+def test_export_has_corrections_filter(client: TestClient, fake_pipeline: PipelineOutput) -> None:
+    """``has_corrections`` selects only corrected or only uncorrected jobs."""
+    _create_and_approve(client, correction_value="Penyelidikan")
+    _create_and_approve(client, correction_value=None)
+
+    resp = client.get("/api/v1/ground-truth/export?has_corrections=true")
+    # Check the status first so an error response is reported as such instead
+    # of surfacing as a confusing line-count or key failure.
+    assert resp.status_code == 200, resp.text
+    lines = [line for line in resp.text.splitlines() if line.strip()]
+    assert len(lines) == 1
+    assert json.loads(lines[0])["corrections"][0]["new_value"] == "Penyelidikan"
+
+    resp = client.get("/api/v1/ground-truth/export?has_corrections=false")
+    assert resp.status_code == 200, resp.text
+    lines = [line for line in resp.text.splitlines() if line.strip()]
+    assert len(lines) == 1
+    assert json.loads(lines[0])["corrections"] == []
+
+
+def test_export_respects_limit(client: TestClient, fake_pipeline: PipelineOutput) -> None:
+    """``limit=2`` caps the export at two records even with five approved jobs."""
+    for _ in range(5):
+        _create_and_approve(client, correction_value=None)
+    resp = client.get("/api/v1/ground-truth/export?limit=2")
+    # Check the status first so an error response is not misread as a
+    # line-count mismatch.
+    assert resp.status_code == 200, resp.text
+    lines = [line for line in resp.text.splitlines() if line.strip()]
+    assert len(lines) == 2