Phase 7: ground-truth export (JSONL + stats) + CLI tool

- GET /api/v1/ground-truth/export: streaming JSONL (approved_only, since, until, has_corrections, limit)
- GET /api/v1/ground-truth/stats: total / approved / corrections counts + top-N most-corrected field paths
- python -m ocr_sprint.tools.export_ground_truth: operator CLI with the same filters + optional --print-stats
- Ground-truth sample reconstructs the pipeline's original output by replaying job_corrections in reverse
- docs/ground-truth-format.md: schema + fine-tuning guidance
- 17 new tests (service replay, endpoint filters, CLI)
- 201 total tests passing, ruff / mypy --strict clean

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 20:24:40 +00:00
parent 9457fa3c55
commit 6003d96a94
11 changed files with 1148 additions and 1 deletions
--- a/tests/unit/test_cli_export_ground_truth.py
+++ b/tests/unit/test_cli_export_ground_truth.py
@@ -0,0 +1,80 @@
+"""Smoke tests for the ``ocr_sprint.tools.export_ground_truth`` CLI."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from uuid import uuid4
+
+import pytest
+
+from ocr_sprint.db.base import Base, get_engine, session_scope
+from ocr_sprint.db.repositories import JobRepository
+from ocr_sprint.schemas.document import DocumentStatus, SourceKind
+from ocr_sprint.tools.export_ground_truth import main as export_main
+
+
+@pytest.fixture
+def db_ready() -> None:
+    Base.metadata.create_all(bind=get_engine())
+
+
+def _seed_two_approved_jobs() -> None:
+    for _ in range(2):
+        jid = uuid4()
+        with session_scope() as session:
+            JobRepository(session).create(
+                job_id=jid, filename="x.pdf", source_kind=SourceKind.PDF, blob_key="k"
+            )
+        with session_scope() as session:
+            JobRepository(session).mark_completed(
+                jid,
+                status=DocumentStatus.COMPLETED,
+                confidence=0.9,
+                result={"header": {"nomor_sprint": "SPR/1/2025"}},
+                review_flags=[],
+            )
+        with session_scope() as session:
+            JobRepository(session).approve(jid, reviewed_by="rev")
+
+
+def test_cli_writes_expected_number_of_lines(
+    db_ready: None, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    _seed_two_approved_jobs()
+    out = tmp_path / "corpus.jsonl"
+
+    exit_code = export_main(["--out", str(out)])
+    assert exit_code == 0
+    lines = [line for line in out.read_text().splitlines() if line.strip()]
+    assert len(lines) == 2
+    for line in lines:
+        parsed = json.loads(line)
+        assert parsed["approved"] is True
+
+    stderr = capsys.readouterr().err
+    assert "wrote 2 sample(s)" in stderr
+
+
+def test_cli_respects_limit(db_ready: None, tmp_path: Path) -> None:
+    _seed_two_approved_jobs()
+    out = tmp_path / "corpus.jsonl"
+    exit_code = export_main(["--out", str(out), "--limit", "1"])
+    assert exit_code == 0
+    lines = [line for line in out.read_text().splitlines() if line.strip()]
+    assert len(lines) == 1
+
+
+def test_cli_print_stats_emits_json_to_stderr(
+    db_ready: None, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    _seed_two_approved_jobs()
+    out = tmp_path / "corpus.jsonl"
+    exit_code = export_main(["--out", str(out), "--print-stats"])
+    assert exit_code == 0
+    stderr = capsys.readouterr().err
+    # Validate the JSON prologue (after the "wrote N" line).
+    json_start = stderr.index("{")
+    stats = json.loads(stderr[json_start:])
+    assert stats["total_jobs"] == 2
+    assert stats["approved_jobs"] == 2