OCR-SPRIN-SERVICE/src/ocr_sprint/schemas/ground_truth.py

"""Schemas for the Phase 7 ground-truth export.

Each ``GroundTruthSample`` represents one training-ready example:

* ``initial_*`` snapshots the pipeline's original (pre-HITL) output,
  reconstructed by replaying the audit trail in reverse.
* ``final_*`` is the current ``result`` on the ``jobs`` row — the
  reviewer-approved answer.
* ``corrections`` is the raw audit trail so downstream fine-tuning can
  see *what* was changed, *why* (free-text reason), and by whom.

JSONL is emitted — one sample per line — so the file can be mmapped,
streamed, or piped straight into an HF ``datasets.load_dataset("json",
...)`` call.
"""

from __future__ import annotations

from datetime import datetime
from typing import Any
from uuid import UUID

from pydantic import BaseModel, ConfigDict, Field


class GroundTruthCorrection(BaseModel):
    """One row of the ``job_corrections`` audit trail, as exported.

    Records a single reviewer edit: which field was changed, its value
    before and after, who changed it, and why. Only ``field_path`` and
    ``corrected_at`` are required; every other field defaults to ``None``.
    """

    # Identifies the corrected field within the job result.
    # NOTE(review): the path syntax (dotted, JSON pointer, ...) is not
    # defined in this module — confirm against whatever writes the rows.
    field_path: str
    # Value before and after the correction. Typed ``Any`` so no shape is
    # enforced; ``None`` is also the default when a value is not supplied.
    old_value: Any | None = None
    new_value: Any | None = None
    # Who made the correction. Presumably a reviewer identifier — the
    # format is not constrained here beyond being a string.
    corrected_by: str | None = None
    # Free-text explanation of why the value was changed.
    reason: str | None = None
    # When the correction was made (required).
    # NOTE(review): no timezone handling is visible here — confirm
    # whether values are expected to be timezone-aware.
    corrected_at: datetime


class GroundTruthSample(BaseModel):
    """One training sample written as a single JSONL line.

    Pairs the pipeline's original output (``initial_result``) with the
    reviewer-approved output (``final_result``) for one job, together with
    the list of corrections and review metadata. ``job_id``, ``filename``,
    ``source_kind`` and ``created_at`` are required; all other fields have
    defaults.
    """

    # Lets fields be populated by their attribute name even when an alias
    # is declared. No aliases are declared on the fields below, so this
    # currently has no visible effect within this class.
    model_config = ConfigDict(populate_by_name=True)

    # Identity of the job this sample was exported from.
    job_id: UUID
    filename: str
    # NOTE(review): the set of valid ``source_kind`` values is not defined
    # in this module — confirm against the producer.
    source_kind: str
    # Review state; defaults describe a job that has not been approved.
    approved: bool = False
    reviewed_by: str | None = None
    reviewed_at: datetime | None = None
    created_at: datetime
    # ``initial_*`` is the pipeline's pre-HITL answer, reconstructed from
    # the audit trail. ``final_*`` is the reviewer-approved version.
    initial_result: dict[str, Any] | None = None
    final_result: dict[str, Any] | None = None
    # Audit-trail entries for this job; a fresh empty list per instance
    # when none are supplied.
    corrections: list[GroundTruthCorrection] = Field(default_factory=list)
    # NOTE(review): the meaning and vocabulary of the flag strings are not
    # defined here — confirm against the pipeline that sets them.
    review_flags: list[str] = Field(default_factory=list)
    # NOTE(review): presumably a pipeline confidence score; the range is
    # not validated by this model — confirm the expected scale.
    confidence: float | None = None


class FieldCorrectionCount(BaseModel):
    """A field path paired with a correction count.

    Used as the element type of ``GroundTruthStats.top_corrected_fields``.
    """

    # Path of the corrected field; both attributes are required.
    field_path: str
    # Number of corrections attributed to ``field_path``. Not constrained
    # to be non-negative by this model.
    count: int


class GroundTruthStats(BaseModel):
    """High-level dataset health report surfaced by ``GET /ground-truth/stats``.

    All integer counters are required; ``top_corrected_fields`` defaults to
    an empty list. The model performs no cross-field validation (e.g. it
    does not check that the per-status counts sum to ``total_jobs``).
    """

    # Job counters.
    # NOTE(review): presumably ``completed`` / ``needs_review`` / ``failed``
    # are per-status counts and ``approved_jobs`` counts reviewer-approved
    # jobs; the status definitions live outside this module — confirm.
    total_jobs: int
    completed_jobs: int
    needs_review_jobs: int
    failed_jobs: int
    approved_jobs: int
    # Correction counters: the overall number of corrections, and the
    # number of jobs that have corrections.
    total_corrections: int
    jobs_with_corrections: int
    # Most-corrected field paths (descending). Operators use this to
    # prioritise which fields to target with prompt tweaks or fine-tune
    # data collection first.
    # The descending order must come from the producer: nothing in this
    # model sorts or validates the list.
    top_corrected_fields: list[FieldCorrectionCount] = Field(default_factory=list)