from __future__ import annotations

from pathlib import Path
from typing import Callable

import fitz
import pytesseract
from PIL import Image


def get_total_pages(pdf_path: Path) -> int:
    """Return the total number of pages in the PDF at *pdf_path*."""
    with fitz.open(pdf_path) as document:
        return document.page_count


def ocr_pdf_range(
    pdf_path: Path,
    start_page: int,
    end_page: int,
    *,
    lang: str,
    dpi: int,
    rotate: int = 0,
    on_progress: Callable[[int, int, int], None] | None = None,
) -> list[tuple[int, str]]:
    """Run OCR over an inclusive, 1-based page range of a PDF.

    Each page is rendered to an RGB bitmap, optionally rotated, and passed
    to Tesseract via ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: First page to process (1-based, inclusive).
        end_page: Last page to process (1-based, inclusive). When it is
            smaller than ``start_page`` the range is empty and ``[]`` is
            returned.
        lang: Language string forwarded to ``pytesseract.image_to_string``.
        dpi: Rendering resolution; the page is scaled by ``dpi / 72``.
        rotate: When non-zero, the rendered image is rotated by ``-rotate``
            degrees (with ``expand=True``) before OCR.
        on_progress: Optional callback invoked before each page is rendered
            as ``on_progress(index, total_in_range, page_number)``, where
            ``index`` counts from 1 within the range.

    Returns:
        A list of ``(page_number, text)`` tuples in page order.

    Raises:
        ValueError: If a non-empty range is not within
            ``1..document.page_count``.
        RuntimeError: If rotating or recognising a page fails; the original
            exception is chained as ``__cause__``.
    """
    scale = dpi / 72.0
    matrix = fitz.Matrix(scale, scale)
    total_in_range = end_page - start_page + 1
    results: list[tuple[int, str]] = []

    with fitz.open(pdf_path) as document:
        # Reject out-of-document ranges up front: load_page() accepts
        # negative indices, so start_page < 1 would otherwise silently OCR
        # a page counted from the end of the document.
        if total_in_range > 0 and (
            start_page < 1 or end_page > document.page_count
        ):
            raise ValueError(
                f"Page range {start_page}-{end_page} is outside the document "
                f"(valid pages: 1-{document.page_count})"
            )

        for index, page_number in enumerate(
            range(start_page, end_page + 1), start=1
        ):
            if on_progress is not None:
                on_progress(index, total_in_range, page_number)

            page = document.load_page(page_number - 1)
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            image = Image.frombytes(
                "RGB",
                (pixmap.width, pixmap.height),
                pixmap.samples,
            )

            # Reset per page so the cleanup below never touches an unbound
            # name or a stale image left over from the previous iteration.
            rotated_image = None
            try:
                if rotate:
                    rotated_image = image.rotate(-rotate, expand=True)
                else:
                    rotated_image = image
                text = pytesseract.image_to_string(rotated_image, lang=lang)
            except Exception as exc:
                raise RuntimeError(
                    f"OCR failed on page {page_number}"
                ) from exc
            finally:
                # Close the rotated copy only when one was actually created;
                # the source image is always released.
                if rotated_image is not None and rotated_image is not image:
                    rotated_image.close()
                image.close()

            results.append((page_number, text))

    return results