"""Smoke tests for the ``ocr_sprint.tools.export_ground_truth`` CLI."""

from __future__ import annotations

import json
from pathlib import Path
from uuid import uuid4

import pytest

from ocr_sprint.db.base import Base, get_engine, session_scope
from ocr_sprint.db.repositories import JobRepository
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
from ocr_sprint.tools.export_ground_truth import main as export_main


@pytest.fixture
def db_ready() -> None:
    """Create every table registered on ``Base`` in the engine from ``get_engine()``.

    NOTE(review): this fixture has no teardown, while the tests below assert
    exact job counts after seeding. Presumably per-test database isolation is
    provided elsewhere (e.g. a ``conftest.py``) -- confirm before reusing it.
    """
    Base.metadata.create_all(bind=get_engine())


def _seed_two_approved_jobs() -> None:
    """Insert two jobs and walk each through create -> completed -> approved.

    Each step runs inside its own ``session_scope()`` block, so the steps are
    applied through three separate sessions per job.
    """
    for _ in range(2):
        jid = uuid4()
        with session_scope() as session:
            JobRepository(session).create(
                job_id=jid,
                filename="x.pdf",
                source_kind=SourceKind.PDF,
                blob_key="k",
            )
        with session_scope() as session:
            JobRepository(session).mark_completed(
                jid,
                status=DocumentStatus.COMPLETED,
                confidence=0.9,
                result={"header": {"nomor_sprint": "SPR/1/2025"}},
                review_flags=[],
            )
        with session_scope() as session:
            JobRepository(session).approve(jid, reviewed_by="rev")


def _read_nonblank_lines(path: Path) -> list[str]:
    """Return the lines of *path* that contain non-whitespace characters.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the locale encoding of the machine running the tests.
    """
    text = path.read_text(encoding="utf-8")
    return [line for line in text.splitlines() if line.strip()]


def test_cli_writes_expected_number_of_lines(
    db_ready: None, tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """The export writes one JSON line per approved job and reports the count."""
    _seed_two_approved_jobs()
    out = tmp_path / "corpus.jsonl"

    exit_code = export_main(["--out", str(out)])

    assert exit_code == 0
    lines = _read_nonblank_lines(out)
    assert len(lines) == 2
    for line in lines:
        parsed = json.loads(line)
        assert parsed["approved"] is True
    stderr = capsys.readouterr().err
    assert "wrote 2 sample(s)" in stderr


def test_cli_respects_limit(db_ready: None, tmp_path: Path) -> None:
    """``--limit 1`` caps the export at a single line even with two jobs seeded."""
    _seed_two_approved_jobs()
    out = tmp_path / "corpus.jsonl"

    exit_code = export_main(["--out", str(out), "--limit", "1"])

    assert exit_code == 0
    assert len(_read_nonblank_lines(out)) == 1


def test_cli_print_stats_emits_json_to_stderr(
    db_ready: None, tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """``--print-stats`` emits a JSON object with job totals on stderr."""
    _seed_two_approved_jobs()
    out = tmp_path / "corpus.jsonl"

    exit_code = export_main(["--out", str(out), "--print-stats"])

    assert exit_code == 0
    stderr = capsys.readouterr().err
    # The stats JSON comes after the "wrote N" line, so parse from the first
    # opening brace to the end of the captured stderr.
    json_start = stderr.find("{")
    assert json_start != -1, f"no JSON object found on stderr: {stderr!r}"
    stats = json.loads(stderr[json_start:])
    assert stats["total_jobs"] == 2
    assert stats["approved_jobs"] == 2