- GET /api/v1/ground-truth/export streaming JSONL (approved_only, since, until, has_corrections, limit) - GET /api/v1/ground-truth/stats total / approved / corrections counts + top-N most-corrected field paths - python -m ocr_sprint.tools.export_ground_truth operator CLI with the same filters + optional --print-stats - Ground-truth sample reconstructs the pipeline's original output by replaying job_corrections in reverse - docs/ground-truth-format.md schema + fine-tuning guidance - 17 new tests (service replay, endpoint filters, CLI) - 201 total tests passing, ruff / mypy --strict clean Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
77 lines
2.4 KiB
Python
77 lines
2.4 KiB
Python
"""Schemas for the Phase 7 ground-truth export.
|
|
|
|
Each ``GroundTruthSample`` represents one training-ready example:

* ``initial_*`` snapshots the pipeline's original (pre-HITL) output,
  reconstructed by replaying the audit trail in reverse.
* ``final_*`` is the current ``result`` on the ``jobs`` row — the
  reviewer-approved answer.
* ``corrections`` is the raw audit trail so downstream fine-tuning can
  see *what* was changed, *why* (free-text reason), and by whom.

JSONL is emitted — one sample per line — so the file can be mmapped,
streamed, or piped straight into an HF ``datasets.load_dataset("json",
...)`` call.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
from typing import Any
|
|
from uuid import UUID
|
|
|
|
from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
|
|
class GroundTruthCorrection(BaseModel):
    """A single exported entry from the ``job_corrections`` audit trail.

    Only ``field_path`` and ``corrected_at`` are required; every other
    attribute defaults to ``None``.
    """

    # Identifies which field of the job result was corrected.
    # NOTE(review): the path syntax is not defined in this module —
    # confirm against the code that writes ``job_corrections``.
    field_path: str

    # Presumably the value before / after the correction. Typed ``Any``
    # because the shape depends on the field that was corrected.
    old_value: Any | None = None
    new_value: Any | None = None

    # Who made the correction, and the free-text reason they gave.
    corrected_by: str | None = None
    reason: str | None = None

    # Timestamp attached to the correction.
    # NOTE(review): timezone handling is not visible here — confirm.
    corrected_at: datetime
|
|
|
|
|
|
class GroundTruthSample(BaseModel):
    """A single training example, serialised as one line of JSONL.

    Pairs the pipeline's original answer (``initial_result``) with the
    current, reviewer-approved answer (``final_result``) and carries the
    list of corrections alongside them.
    """

    # Lets input be populated by field name even where an alias exists.
    # No aliases are declared on this class itself.
    model_config = ConfigDict(populate_by_name=True)

    # --- Job identity -------------------------------------------------
    job_id: UUID
    filename: str
    # NOTE(review): the set of valid ``source_kind`` values is not
    # defined in this module — confirm against the job model.
    source_kind: str

    # --- Review state -------------------------------------------------
    approved: bool = False
    reviewed_by: str | None = None
    reviewed_at: datetime | None = None
    created_at: datetime

    # --- Payload ------------------------------------------------------
    # ``initial_result`` is the pre-HITL pipeline output, rebuilt from the
    # audit trail; ``final_result`` is the version the reviewer approved.
    initial_result: dict[str, Any] | None = None
    final_result: dict[str, Any] | None = None

    # Audit-trail entries for this job; empty when nothing was corrected.
    corrections: list[GroundTruthCorrection] = Field(default_factory=list)

    # NOTE(review): the meaning of the individual flags and the scale of
    # ``confidence`` are not established here — confirm with the pipeline.
    review_flags: list[str] = Field(default_factory=list)
    confidence: float | None = None
|
|
|
|
|
|
class FieldCorrectionCount(BaseModel):
    """A field path paired with a count.

    Used as the element type of ``GroundTruthStats.top_corrected_fields``.
    """

    # The field path being counted.
    field_path: str

    # Presumably the number of corrections recorded against
    # ``field_path`` — confirm against the stats query.
    count: int
|
|
|
|
|
|
class GroundTruthStats(BaseModel):
    """Dataset health summary returned by ``GET /ground-truth/stats``.

    All counters are required; only ``top_corrected_fields`` has a
    default (an empty list).
    """

    # --- Job counters -------------------------------------------------
    # NOTE(review): the counter names suggest a breakdown by job status
    # plus an approval count, but whether they overlap is not visible
    # here — confirm against the stats service.
    total_jobs: int
    completed_jobs: int
    needs_review_jobs: int
    failed_jobs: int
    approved_jobs: int

    # --- Correction counters ------------------------------------------
    total_corrections: int
    jobs_with_corrections: int

    # Field paths with the most corrections, in descending order. This
    # tells operators which fields to address first, whether through
    # prompt tweaks or by collecting more fine-tuning data.
    top_corrected_fields: list[FieldCorrectionCount] = Field(default_factory=list)
|