Files
OCR-SPRIN-SERVICE/tests/unit/test_document_detect.py
Devin AI d0e1835cc1 Phase 2: document detection + perspective correction + shadow removal
Adds OpenCV-based phone-photo handling that runs before the standard
preprocessing pipeline for IMAGE source kinds (PDF renders are flat by
construction and skip this stage).

Pipeline additions in src/ocr_sprint/pipeline/document_detect.py:
- _find_document_quad: Canny + dilate + contour search, picks the
  largest convex 4-point polygon above a configurable area threshold;
  fails gracefully and returns None when no usable quad is found.
- _four_point_warp: orders corners (TL/TR/BR/BL via sum/diff trick)
  and runs cv2.getPerspectiveTransform + warpPerspective.
- _remove_shadow: per-channel background division (dilate, median
  blur, 255 - absdiff, then normalize) for uneven phone-shot lighting.
- detect_and_correct: top-level entrypoint with graceful fallback
  to the original image when detection fails.

Wired into the synchronous orchestrator: only enabled for IMAGE
sources, skipped for PDF. New settings:
- preprocess_detect_document (default: true)
- preprocess_remove_shadow (default: true)
- preprocess_min_quad_area_fraction (default: 0.20)

Tests: 9 new unit tests covering corner ordering, quad detection on
synthetic skewed documents, perspective warp output sanity, shadow
removal shape preservation, full-pipeline behavior, and graceful
fallback when detection fails. 70 tests total, all green.

ML-based dewarping (DewarpNet) and DocTR detector are deferred to a
future phase per the roadmap; the existing API is structured so they
can be added as alternative backends behind DocumentDetectConfig.

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 15:06:58 +00:00

129 lines
4.6 KiB
Python

"""Tests for Phase 2 document detection + perspective correction."""
from __future__ import annotations
import cv2
import numpy as np
import pytest
from ocr_sprint.pipeline.document_detect import (
DocumentDetectConfig,
_find_document_quad,
_four_point_warp,
_order_corners,
_remove_shadow,
detect_and_correct,
)
def _synthetic_skewed_doc(
    canvas_size: tuple[int, int] = (900, 700),
    page_size: tuple[int, int] = (500, 380),
    skew: tuple[int, int, int, int] = (40, -20, -30, 25),
) -> np.ndarray:
    """Render a near-white quadrilateral "page" on a dark canvas.

    Gives the detection tests a deterministic stand-in for a phone photo, so
    no real image fixtures are needed.

    Args:
        canvas_size: ``(height, width)`` of the returned image, in pixels.
        page_size: ``(width, height)`` of the un-skewed page, which is
            centred on the canvas.
        skew: One pixel offset per corner, in TL, TR, BR, BL order. The
            offset is added to the corner's x coordinate; for TL and BR it is
            also added to y, for TR and BL it is subtracted from y.

    Returns:
        A BGR ``uint8`` image of shape ``(height, width, 3)``.
    """
    canvas_h, canvas_w = canvas_size
    page_w, page_h = page_size
    d_tl, d_tr, d_br, d_bl = skew

    # Edges of the un-skewed page rectangle, centred on the canvas.
    mid_x, mid_y = canvas_w // 2, canvas_h // 2
    half_w, half_h = page_w // 2, page_h // 2
    left, right = mid_x - half_w, mid_x + half_w
    top, bottom = mid_y - half_h, mid_y + half_h

    # Nudge every corner by its own offset to fake perspective distortion.
    corners = np.array(
        [
            [left + d_tl, top + d_tl],  # TL
            [right + d_tr, top - d_tr],  # TR
            [right + d_br, bottom + d_br],  # BR
            [left + d_bl, bottom - d_bl],  # BL
        ],
        dtype=np.int32,
    )

    canvas = np.full((canvas_h, canvas_w, 3), 30, dtype=np.uint8)  # dark grey
    cv2.fillPoly(canvas, [corners], color=(245, 245, 245))
    # A dark stroke through the centre stands in for a line of text, so the
    # page is not a featureless white blob.
    cv2.line(
        canvas,
        (mid_x - 100, mid_y),
        (mid_x + 100, mid_y),
        color=(20, 20, 20),
        thickness=4,
    )
    return canvas
class TestOrderCorners:
    """Checks that ``_order_corners`` returns points as TL, TR, BR, BL."""

    def test_canonical_order(self) -> None:
        """Shuffled corners come back in TL, TR, BR, BL order.

        All four positions are asserted so that a swap of the two
        off-diagonal corners (TR/BL) cannot slip through unnoticed.
        """
        # input pts unordered: BR, TL, TR, BL
        pts = np.array([[100, 100], [10, 10], [110, 8], [9, 95]], dtype=np.float32)
        ordered = _order_corners(pts)
        # TL has the smallest x + y sum
        assert tuple(ordered[0]) == pytest.approx((10.0, 10.0))
        # TR has the smallest y - x difference
        assert tuple(ordered[1]) == pytest.approx((110.0, 8.0))
        # BR has the largest x + y sum
        assert tuple(ordered[2]) == pytest.approx((100.0, 100.0))
        # BL has the largest y - x difference
        assert tuple(ordered[3]) == pytest.approx((9.0, 95.0))
class TestFindDocumentQuad:
    """Checks for the contour-based quad finder ``_find_document_quad``."""

    def test_finds_quad_in_synthetic_image(self) -> None:
        """A clearly visible skewed page yields four 2-D corner points."""
        photo = _synthetic_skewed_doc()
        found = _find_document_quad(photo, min_area_fraction=0.10)
        assert found is not None
        assert found.shape == (4, 2)

    def test_returns_none_for_flat_image(self) -> None:
        """A uniformly coloured image has no page outline to detect."""
        uniform = np.full((400, 400, 3), 200, dtype=np.uint8)
        result = _find_document_quad(uniform)
        assert result is None

    def test_returns_none_for_tiny_image(self) -> None:
        """A very small uniform image is also reported as "no quad"."""
        thumbnail = np.full((20, 20, 3), 200, dtype=np.uint8)
        result = _find_document_quad(thumbnail)
        assert result is None
class TestFourPointWarp:
    """Sanity checks for the perspective warp ``_four_point_warp``."""

    def test_warp_produces_axis_aligned_rectangle(self) -> None:
        """Warping the detected quad yields a smaller, mostly-white crop."""
        canvas = _synthetic_skewed_doc()
        corners = _find_document_quad(canvas, min_area_fraction=0.10)
        assert corners is not None

        flattened = _four_point_warp(canvas, corners)
        canvas_h, canvas_w = canvas.shape[0], canvas.shape[1]
        out_h, out_w = flattened.shape[0], flattened.shape[1]

        # The warp keeps only the page, so it must be smaller than the canvas.
        assert out_h < canvas_h
        assert out_w < canvas_w

        # Most pixels of the crop should be near-white page, not background.
        luminance = cv2.cvtColor(flattened, cv2.COLOR_BGR2GRAY)
        is_bright = luminance > 200
        bright_fraction = float(is_bright.mean())
        assert bright_fraction > 0.7
class TestRemoveShadow:
    """Checks that ``_remove_shadow`` preserves the image container format."""

    def test_output_shape_matches_input(self) -> None:
        """Shadow removal keeps the input's shape and ``uint8`` dtype."""
        # Seeded generator: the noise image is identical on every run, so any
        # failure can be reproduced exactly.
        rng = np.random.default_rng(0)
        img = rng.integers(0, 256, size=(200, 300, 3), dtype=np.uint8)
        out = _remove_shadow(img)
        assert out.shape == img.shape
        assert out.dtype == np.uint8
class TestDetectAndCorrect:
    """End-to-end checks for the ``detect_and_correct`` entrypoint."""

    def test_full_pipeline_on_skewed_doc(self) -> None:
        """Detection plus shadow removal returns a crop of the canvas."""
        img = _synthetic_skewed_doc()
        out = detect_and_correct(
            img,
            DocumentDetectConfig(
                detect_document=True,
                remove_shadow=True,
                min_area_fraction=0.1,
            ),
        )
        # the output is the cropped+flattened page, so it must be smaller
        # than the canvas on both axes, not just in height
        assert out.shape[0] < img.shape[0]
        assert out.shape[1] < img.shape[1]

    def test_disabled_returns_input_unchanged(self) -> None:
        """With both stages switched off the image passes through untouched."""
        img = _synthetic_skewed_doc()
        out = detect_and_correct(
            img,
            DocumentDetectConfig(detect_document=False, remove_shadow=False),
        )
        assert out.shape == img.shape
        # disabled both flags → bytes-identical
        assert np.array_equal(out, img)

    def test_failed_detection_falls_back_to_input(self) -> None:
        """When no quad is found the original image is returned as-is."""
        # flat grey image with no contour-detectable document
        flat = np.full((300, 300, 3), 128, dtype=np.uint8)
        out = detect_and_correct(
            flat,
            DocumentDetectConfig(
                detect_document=True,
                remove_shadow=False,
                min_area_fraction=0.5,
            ),
        )
        # detection failed, so the original shape is preserved
        assert out.shape == flat.shape
        # shadow removal is off too, so the pixels must be untouched as well;
        # a shape-only check would accept a same-sized but altered image
        assert np.array_equal(out, flat)