Phase 7: ground-truth export (JSONL + stats) + CLI tool

- GET /api/v1/ground-truth/export: streaming JSONL (approved_only, since, until, has_corrections, limit)
- GET /api/v1/ground-truth/stats: total / approved / corrections counts + top-N most-corrected field paths
- python -m ocr_sprint.tools.export_ground_truth: operator CLI with the same filters + optional --print-stats
- Ground-truth sample reconstructs the pipeline's original output by replaying job_corrections in reverse
- docs/ground-truth-format.md: schema + fine-tuning guidance
- 17 new tests (service replay, endpoint filters, CLI)
- 201 total tests passing, ruff / mypy --strict clean

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 20:24:40 +00:00
parent 9457fa3c55
commit 6003d96a94
11 changed files with 1148 additions and 1 deletions
--- a/src/ocr_sprint/api/routes/ground_truth.py
+++ b/src/ocr_sprint/api/routes/ground_truth.py
@@ -0,0 +1,102 @@
+"""Ground-truth export + statistics endpoints (Phase 7).
+
+Two endpoints, both auth'd by the existing ``X-API-Key`` dependency:
+
+* ``GET /ground-truth/export`` — streams JSONL of approved (or filtered)
+  samples for downstream fine-tuning pipelines.
+* ``GET /ground-truth/stats``  — returns aggregate counts + top-corrected
+  field paths so operators know when/where fine-tuning will pay off.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from datetime import datetime
+from typing import Annotated
+
+from fastapi import APIRouter, Depends, Query
+from fastapi.responses import StreamingResponse
+from sqlalchemy.orm import Session
+
+from ocr_sprint.api.deps.auth import require_api_key
+from ocr_sprint.api.deps.db import get_session
+from ocr_sprint.ground_truth import (
+    GroundTruthFilters,
+    ground_truth_stats,
+    iter_ground_truth_samples,
+    serialize_sample_to_jsonl,
+)
+from ocr_sprint.schemas.ground_truth import GroundTruthStats
+
+router = APIRouter(  # router for the ground-truth export and stats endpoints
+    prefix="/ground-truth",  # path prefix shared by every route registered on this router
+    tags=["ground-truth"],
+    dependencies=[Depends(require_api_key)],  # declared router-wide, so it covers both endpoints
+)
+
+
+@router.get(
+    "/export",
+    response_class=StreamingResponse,
+    responses={
+        200: {
+            "content": {"application/x-ndjson": {}},
+            "description": "Newline-delimited JSON stream of training samples.",
+        }
+    },
+)
+def export_ground_truth(
+    session: Annotated[Session, Depends(get_session)],
+    since: Annotated[
+        datetime | None,
+        Query(description="Only include jobs created at or after this ISO timestamp."),
+    ] = None,
+    until: Annotated[
+        datetime | None,
+        Query(description="Only include jobs created at or before this ISO timestamp."),
+    ] = None,
+    approved_only: Annotated[
+        bool,
+        Query(description="Only export approved jobs (default true)."),
+    ] = True,
+    has_corrections: Annotated[
+        bool | None,
+        Query(
+            description=(
+                "Optional: true = only jobs that had at least one correction, "
+                "false = only pristine (no-correction) jobs."
+            )
+        ),
+    ] = None,
+    limit: Annotated[
+        int | None,
+        Query(ge=1, le=100_000, description="Maximum rows to emit."),
+    ] = None,
+) -> StreamingResponse:
+    filters = GroundTruthFilters(
+        since=since,
+        until=until,
+        approved_only=approved_only,
+        has_corrections=has_corrections,
+        limit=limit,
+    )
+
+    def _stream() -> Iterator[bytes]:
+        for sample in iter_ground_truth_samples(session, filters):
+            yield serialize_sample_to_jsonl(sample).encode("utf-8")
+
+    return StreamingResponse(_stream(), media_type="application/x-ndjson")
+
+
+@router.get(
+    "/stats",
+    response_model=GroundTruthStats,
+)
+def get_stats(
+    session: Annotated[Session, Depends(get_session)],
+    top_n: Annotated[
+        int,
+        Query(ge=1, le=100, description="How many top-corrected field paths to return."),
+    ] = 10,
+) -> GroundTruthStats:
+    return ground_truth_stats(session, top_n=top_n)