Phase 7: ground-truth export (JSONL + stats) + CLI tool
- GET /api/v1/ground-truth/export: streaming JSONL (approved_only, since, until, has_corrections, limit)
- GET /api/v1/ground-truth/stats: total / approved / corrections counts + top-N most-corrected field paths
- python -m ocr_sprint.tools.export_ground_truth: operator CLI with the same filters + optional --print-stats
- Ground-truth sample reconstructs the pipeline's original output by replaying job_corrections in reverse
- docs/ground-truth-format.md: schema + fine-tuning guidance
- 17 new tests (service replay, endpoint filters, CLI)
- 201 total tests passing, ruff / mypy --strict clean

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
102
src/ocr_sprint/api/routes/ground_truth.py
Normal file
102
src/ocr_sprint/api/routes/ground_truth.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Ground-truth export + statistics endpoints (Phase 7).
|
||||
|
||||
Two endpoints, both auth'd by the existing ``X-API-Key`` dependency:
|
||||
|
||||
* ``GET /ground-truth/export`` — streams JSONL of approved (or filtered)
|
||||
samples for downstream fine-tuning pipelines.
|
||||
* ``GET /ground-truth/stats`` — returns aggregate counts + top-corrected
|
||||
field paths so operators know when/where fine-tuning will pay off.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from typing import Annotated
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ocr_sprint.api.deps.auth import require_api_key
|
||||
from ocr_sprint.api.deps.db import get_session
|
||||
from ocr_sprint.ground_truth import (
|
||||
GroundTruthFilters,
|
||||
ground_truth_stats,
|
||||
iter_ground_truth_samples,
|
||||
serialize_sample_to_jsonl,
|
||||
)
|
||||
from ocr_sprint.schemas.ground_truth import GroundTruthStats
|
||||
|
||||
# Router for every ground-truth endpoint in this module. The API-key
# dependency is attached at router level, so it applies to each route
# registered on ``router`` without being repeated per endpoint.
router = APIRouter(
    dependencies=[Depends(require_api_key)],
    tags=["ground-truth"],
    prefix="/ground-truth",
)
|
||||
|
||||
|
||||
# GET /ground-truth/export
#
# Builds a ``GroundTruthFilters`` from the query parameters and streams the
# samples produced by ``iter_ground_truth_samples`` back to the client with
# media type ``application/x-ndjson``. Each sample is rendered with
# ``serialize_sample_to_jsonl`` and UTF-8 encoded, one chunk per sample.
#
# This description is a comment rather than a docstring on purpose: FastAPI
# publishes an endpoint's docstring as its OpenAPI description, and the
# generated schema should stay as it was.
@router.get(
    "/export",
    response_class=StreamingResponse,
    responses={
        200: {
            "content": {"application/x-ndjson": {}},
            "description": "Newline-delimited JSON stream of training samples.",
        }
    },
)
def export_ground_truth(
    session: Annotated[Session, Depends(get_session)],
    since: Annotated[
        datetime | None,
        Query(description="Only include jobs created at or after this ISO timestamp."),
    ] = None,
    until: Annotated[
        datetime | None,
        Query(description="Only include jobs created at or before this ISO timestamp."),
    ] = None,
    approved_only: Annotated[
        bool,
        Query(description="Only export approved jobs (default true)."),
    ] = True,
    has_corrections: Annotated[
        bool | None,
        Query(
            description=(
                "Optional: true = only jobs that had at least one correction, "
                "false = only pristine (no-correction) jobs."
            )
        ),
    ] = None,
    limit: Annotated[
        int | None,
        Query(ge=1, le=100_000, description="Maximum rows to emit."),
    ] = None,
) -> StreamingResponse:
    # Every query parameter is forwarded unchanged; the filtering itself is
    # done by whatever consumes the GroundTruthFilters object.
    filters = GroundTruthFilters(
        approved_only=approved_only,
        has_corrections=has_corrections,
        since=since,
        until=until,
        limit=limit,
    )

    def _encoded_lines() -> Iterator[bytes]:
        # Deliberately a generator function: the sample iterator is not
        # requested until the response body is first pulled.
        # NOTE(review): that means ``session`` is used while the body is
        # being sent -- confirm get_session's teardown runs after streaming
        # finishes on the FastAPI version in use.
        samples = iter_ground_truth_samples(session, filters)
        yield from (
            serialize_sample_to_jsonl(item).encode("utf-8") for item in samples
        )

    body = _encoded_lines()
    return StreamingResponse(body, media_type="application/x-ndjson")
|
||||
|
||||
|
||||
# GET /ground-truth/stats
#
# Delegates to ``ground_truth_stats`` with the request's DB session and the
# requested ``top_n`` (validated to 1..100, default 10) and returns its
# result, which FastAPI serialises using the ``GroundTruthStats`` model.
#
# Written as a comment rather than a docstring so the endpoint's OpenAPI
# description is left as it was.
@router.get("/stats", response_model=GroundTruthStats)
def get_stats(
    session: Annotated[Session, Depends(get_session)],
    top_n: Annotated[
        int,
        Query(ge=1, le=100, description="How many top-corrected field paths to return."),
    ] = 10,
) -> GroundTruthStats:
    stats = ground_truth_stats(session, top_n=top_n)
    return stats
|
||||
Reference in New Issue
Block a user