Phase 7: ground-truth export (JSONL + stats) + CLI tool

- GET /api/v1/ground-truth/export  streaming JSONL (approved_only,
  since, until, has_corrections, limit)
- GET /api/v1/ground-truth/stats   total / approved / corrections
  counts + top-N most-corrected field paths
- python -m ocr_sprint.tools.export_ground_truth  operator CLI with
  the same filters + optional --print-stats
- Each ground-truth sample reconstructs the pipeline's original output
  by replaying job_corrections in reverse
- docs/ground-truth-format.md    schema + fine-tuning guidance
- 17 new tests (service replay, endpoint filters, CLI)
- 201 total tests passing, ruff / mypy --strict clean

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
Devin AI
2026-04-25 20:24:40 +00:00
parent 9457fa3c55
commit 6003d96a94
11 changed files with 1148 additions and 1 deletions

View File

@@ -0,0 +1,102 @@
"""Ground-truth export + statistics endpoints (Phase 7).
Two endpoints, both auth'd by the existing ``X-API-Key`` dependency:
* ``GET /ground-truth/export`` — streams JSONL of approved (or filtered)
samples for downstream fine-tuning pipelines.
* ``GET /ground-truth/stats`` — returns aggregate counts + top-corrected
field paths so operators know when/where fine-tuning will pay off.
"""
from __future__ import annotations
from collections.abc import Iterator
from datetime import datetime
from typing import Annotated
from fastapi import APIRouter, Depends, Query
from fastapi.responses import StreamingResponse
from sqlalchemy.orm import Session
from ocr_sprint.api.deps.auth import require_api_key
from ocr_sprint.api.deps.db import get_session
from ocr_sprint.ground_truth import (
GroundTruthFilters,
ground_truth_stats,
iter_ground_truth_samples,
serialize_sample_to_jsonl,
)
from ocr_sprint.schemas.ground_truth import GroundTruthStats
# The API-key dependency is attached at router level, so every route
# registered on this router is guarded without repeating it per endpoint.
router = APIRouter(
    dependencies=[Depends(require_api_key)],
    prefix="/ground-truth",
    tags=["ground-truth"],
)
@router.get(
    "/export",
    response_class=StreamingResponse,
    responses={
        200: {
            "content": {"application/x-ndjson": {}},
            "description": "Newline-delimited JSON stream of training samples.",
        }
    },
)
def export_ground_truth(
    session: Annotated[Session, Depends(get_session)],
    since: Annotated[
        datetime | None,
        Query(description="Only include jobs created at or after this ISO timestamp."),
    ] = None,
    until: Annotated[
        datetime | None,
        Query(description="Only include jobs created at or before this ISO timestamp."),
    ] = None,
    approved_only: Annotated[
        bool,
        Query(description="Only export approved jobs (default true)."),
    ] = True,
    has_corrections: Annotated[
        bool | None,
        Query(
            description=(
                "Optional: true = only jobs that had at least one correction, "
                "false = only pristine (no-correction) jobs."
            )
        ),
    ] = None,
    limit: Annotated[
        int | None,
        Query(ge=1, le=100_000, description="Maximum rows to emit."),
    ] = None,
) -> StreamingResponse:
    """Stream ground-truth samples as an ``application/x-ndjson`` response.

    The query parameters are bundled into a ``GroundTruthFilters`` object and
    handed to ``iter_ground_truth_samples``; every sample it yields is
    serialized with ``serialize_sample_to_jsonl`` and sent UTF-8 encoded.

    Returns:
        A ``StreamingResponse`` whose body is produced lazily, one encoded
        sample per chunk.
    """
    # Built eagerly so that any failure while constructing the filters
    # surfaces before a response has been started.
    export_filters = GroundTruthFilters(
        since=since,
        until=until,
        approved_only=approved_only,
        has_corrections=has_corrections,
        limit=limit,
    )

    # NOTE(review): this generator only runs while the response body is being
    # sent, i.e. after this function has returned, so ``session`` must still
    # be usable at that point — confirm get_session's teardown timing under
    # the installed FastAPI version.
    def _encoded_lines() -> Iterator[bytes]:
        samples = iter_ground_truth_samples(session, export_filters)
        # Presumably each serialized sample already ends with a newline —
        # verify against serialize_sample_to_jsonl.
        yield from (
            serialize_sample_to_jsonl(item).encode("utf-8") for item in samples
        )

    return StreamingResponse(_encoded_lines(), media_type="application/x-ndjson")
@router.get(
    "/stats",
    response_model=GroundTruthStats,
)
def get_stats(
    session: Annotated[Session, Depends(get_session)],
    top_n: Annotated[
        int,
        Query(ge=1, le=100, description="How many top-corrected field paths to return."),
    ] = 10,
) -> GroundTruthStats:
    """Return aggregate ground-truth statistics.

    Delegates to ``ground_truth_stats`` with the request's database session
    and the requested ``top_n`` (validated to the range 1-100, default 10).
    """
    stats: GroundTruthStats = ground_truth_stats(session, top_n=top_n)
    return stats