Phase 7: ground-truth export (JSONL + stats) + CLI tool
- GET /api/v1/ground-truth/export: streaming JSONL (filters: approved_only, since, until, has_corrections, limit)
- GET /api/v1/ground-truth/stats: total / approved / corrections counts + top-N most-corrected field paths
- python -m ocr_sprint.tools.export_ground_truth: operator CLI with the same filters + optional --print-stats
- Ground-truth sample reconstructs the pipeline's original output by replaying job_corrections in reverse
- docs/ground-truth-format.md: schema + fine-tuning guidance
- 17 new tests (service replay, endpoint filters, CLI)
- 201 total tests passing, ruff / mypy --strict clean

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
76
src/ocr_sprint/schemas/ground_truth.py
Normal file
76
src/ocr_sprint/schemas/ground_truth.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""Schemas for the Phase 7 ground-truth export.
|
||||
|
||||
Each ``GroundTruthSample`` represents one training-ready example:
|
||||
|
||||
* ``initial_*`` snapshots the pipeline's original (pre-HITL) output,
|
||||
reconstructed by replaying the audit trail in reverse.
|
||||
* ``final_*`` is the current ``result`` on the ``jobs`` row — the
|
||||
reviewer-approved answer.
|
||||
* ``corrections`` is the raw audit trail so downstream fine-tuning can
|
||||
see *what* was changed, *why* (free-text reason), and by whom.
|
||||
|
||||
JSONL is emitted — one sample per line — so the file can be mmapped,
|
||||
streamed, or piped straight into an HF ``datasets.load_dataset("json",
|
||||
...)`` call.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class GroundTruthCorrection(BaseModel):
    """A single exported entry from the ``job_corrections`` audit trail.

    Only ``field_path`` and ``corrected_at`` are required; every other
    attribute defaults to ``None`` when it is not supplied.
    """

    # Which field was corrected, plus its value before and after the edit.
    field_path: str
    old_value: Any | None = None
    new_value: Any | None = None

    # Who made the correction and their free-text justification, if any.
    corrected_by: str | None = None
    reason: str | None = None

    # Timestamp of the correction.
    corrected_at: datetime
|
||||
|
||||
|
||||
class GroundTruthSample(BaseModel):
    """A training example, serialised as exactly one line of JSONL output."""

    # NOTE(review): no field aliases are declared in this block, so
    # ``populate_by_name`` has no visible effect here — confirm whether
    # aliases are expected to be added later.
    model_config = ConfigDict(populate_by_name=True)

    # --- Job identity and review metadata -------------------------------
    job_id: UUID
    filename: str
    source_kind: str
    approved: bool = False
    reviewed_by: str | None = None
    reviewed_at: datetime | None = None
    created_at: datetime

    # --- Before / after payloads -----------------------------------------
    # ``initial_result`` is the pipeline's pre-HITL output, rebuilt from the
    # audit trail; ``final_result`` is the reviewer-approved version.
    initial_result: dict[str, Any] | None = None
    final_result: dict[str, Any] | None = None

    # --- Audit trail and quality signals ---------------------------------
    # Both lists default to a fresh empty list per instance.
    corrections: list[GroundTruthCorrection] = Field(default_factory=list)
    review_flags: list[str] = Field(default_factory=list)
    confidence: float | None = None
|
||||
|
||||
|
||||
class FieldCorrectionCount(BaseModel):
    """Pairs a field path with the number of corrections counted for it."""

    field_path: str
    count: int
|
||||
|
||||
|
||||
class GroundTruthStats(BaseModel):
    """Dataset health summary returned by ``GET /ground-truth/stats``."""

    # Job counters; every one of these is required.
    total_jobs: int
    completed_jobs: int
    needs_review_jobs: int
    failed_jobs: int
    approved_jobs: int

    # Correction counters.
    total_corrections: int
    jobs_with_corrections: int

    # Field paths that were corrected most often, in descending order.
    # Operators read this to decide which fields to address first with
    # prompt tweaks or targeted fine-tune data collection. Defaults to a
    # fresh empty list when omitted.
    top_corrected_fields: list[FieldCorrectionCount] = Field(default_factory=list)
|
||||
Reference in New Issue
Block a user