"""Tests for source detection + image ingest.""" from __future__ import annotations import io import numpy as np from PIL import Image from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image from ocr_sprint.schemas.document import SourceKind def _png_bytes() -> bytes: img = Image.new("RGB", (100, 80), color="white") buf = io.BytesIO() img.save(buf, format="PNG") return buf.getvalue() def _jpeg_bytes() -> bytes: img = Image.new("RGB", (100, 80), color="white") buf = io.BytesIO() img.save(buf, format="JPEG") return buf.getvalue() def test_detect_pdf() -> None: assert detect_source_kind(b"%PDF-1.7\n...") == SourceKind.PDF def test_detect_png() -> None: assert detect_source_kind(_png_bytes()) == SourceKind.IMAGE def test_detect_jpeg() -> None: assert detect_source_kind(_jpeg_bytes()) == SourceKind.IMAGE def test_detect_unknown() -> None: assert detect_source_kind(b"garbage") == SourceKind.UNKNOWN def test_ingest_image_returns_one_page() -> None: pages = ingest_image(_png_bytes()) assert len(pages) == 1 assert pages[0].page_index == 0 assert isinstance(pages[0].image, np.ndarray) assert pages[0].image.dtype == np.uint8 assert pages[0].image.shape == (80, 100, 3)