Phase 7: ground-truth export (JSONL + stats) + CLI tool

- GET /api/v1/ground-truth/export: streaming JSONL (approved_only, since, until, has_corrections, limit)
- GET /api/v1/ground-truth/stats: total / approved / corrections counts + top-N most-corrected field paths
- python -m ocr_sprint.tools.export_ground_truth: operator CLI with the same filters + optional --print-stats
- Ground-truth sample reconstructs the pipeline's original output by replaying job_corrections in reverse
- docs/ground-truth-format.md: schema + fine-tuning guidance
- 17 new tests (service replay, endpoint filters, CLI)
- 201 total tests passing, ruff / mypy --strict clean

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 20:24:40 +00:00
parent 9457fa3c55
commit 6003d96a94
11 changed files with 1148 additions and 1 deletions
--- a/src/ocr_sprint/schemas/ground_truth.py
+++ b/src/ocr_sprint/schemas/ground_truth.py
@@ -0,0 +1,76 @@
+"""Schemas for the Phase 7 ground-truth export.
+
+Each ``GroundTruthSample`` represents one training-ready example:
+
+* ``initial_*`` snapshots the pipeline's original (pre-HITL) output,
+  reconstructed by replaying the audit trail in reverse.
+* ``final_*`` is the current ``result`` on the ``jobs`` row — the
+  reviewer-approved answer once ``approved`` is true.
+* ``corrections`` is the raw audit trail so downstream fine-tuning can
+  see *what* was changed, *why* (free-text reason), and by whom.
+
+JSONL is emitted — one sample per line — so the file can be mmapped,
+streamed, or piped straight into an HF ``datasets.load_dataset("json",
+...)`` call.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any
+from uuid import UUID
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class GroundTruthCorrection(BaseModel):
+    """One exported row of the ``job_corrections`` audit trail (a single field edit)."""
+
+    field_path: str  # path of the field that was corrected
+    old_value: Any | None = None  # value before the correction; None when absent
+    new_value: Any | None = None  # value after the correction; None when absent
+    corrected_by: str | None = None  # who made the correction, if recorded
+    reason: str | None = None  # free-text justification, if one was given
+    corrected_at: datetime  # required; NOTE(review): timezone handling not visible here
+
+
+class GroundTruthSample(BaseModel):
+    """One training sample; each instance is written as a single JSONL line."""
+
+    model_config = ConfigDict(populate_by_name=True)  # NOTE(review): no aliases declared here
+
+    job_id: UUID
+    filename: str
+    source_kind: str
+    approved: bool = False  # defaults to False, i.e. not reviewer-approved
+    reviewed_by: str | None = None
+    reviewed_at: datetime | None = None
+    created_at: datetime
+    # ``initial_*`` is the pipeline's pre-HITL answer, rebuilt from the audit
+    # trail. ``final_*`` is the job's current result; check ``approved`` before trusting it.
+    initial_result: dict[str, Any] | None = None
+    final_result: dict[str, Any] | None = None
+    corrections: list[GroundTruthCorrection] = Field(default_factory=list)  # raw audit trail
+    review_flags: list[str] = Field(default_factory=list)
+    confidence: float | None = None  # NOTE(review): scale/range not visible here — confirm
+
+
+class FieldCorrectionCount(BaseModel):  # one entry of GroundTruthStats.top_corrected_fields
+    field_path: str  # the corrected field's path
+    count: int  # presumably the number of corrections recorded for that path — confirm
+
+
+class GroundTruthStats(BaseModel):
+    """Dataset health report (job and correction counts) for ``GET /ground-truth/stats``."""
+
+    total_jobs: int
+    completed_jobs: int  # NOTE(review): this and the next two look like per-status tallies
+    needs_review_jobs: int
+    failed_jobs: int
+    approved_jobs: int
+    total_corrections: int  # presumably counts ``job_corrections`` rows — confirm
+    jobs_with_corrections: int  # presumably jobs with at least one correction — confirm
+    # Field paths corrected most often, in descending order. Operators
+    # use this to decide which fields to tackle first with prompt tweaks
+    # or fine-tune data collection.
+    top_corrected_fields: list[FieldCorrectionCount] = Field(default_factory=list)