"""Command-line tool: OCR a page range of a PDF and save the text to a file."""

from __future__ import annotations

import argparse
import shutil
import sys
from pathlib import Path

import pytesseract

from pdf_ocr import get_total_pages, ocr_pdf_range


def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the OCR command-line interface.

    Returns:
        A parser with the required ``--input``, ``--start``, ``--end`` and
        ``--output`` options and the optional ``--lang``, ``--dpi`` and
        ``--rotate`` options.
    """
    parser = argparse.ArgumentParser(
        description="Recognize text from selected PDF pages and save to a text file.",
    )
    parser.add_argument(
        "--input",
        required=True,
        type=Path,
        help="Path to source PDF file.",
    )
    parser.add_argument(
        "--start",
        required=True,
        type=int,
        help="Start page (1-based, inclusive).",
    )
    parser.add_argument(
        "--end",
        required=True,
        type=int,
        help="End page (1-based, inclusive).",
    )
    parser.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Path to output TXT file.",
    )
    parser.add_argument(
        "--lang",
        default="rus+eng",
        help="Tesseract language(s), example: rus+eng.",
    )
    parser.add_argument(
        "--dpi",
        default=300,
        type=int,
        help="Render DPI before OCR. Default: 300.",
    )
    parser.add_argument(
        "--rotate",
        default=0,
        type=int,
        help="Rotate page image before OCR. Allowed: 0, 90, 180, 270, -90, -180, -270.",
    )
    return parser


def validate_args(args: argparse.Namespace) -> int:
    """Validate parsed command-line arguments against the input document.

    Checks run in a fixed order (input file, page range, DPI, rotation, then
    the document's page count) so the first problem found is the one reported.

    Args:
        args: Namespace produced by the parser from ``build_parser``.

    Returns:
        The total page count reported by ``get_total_pages`` for the input.

    Raises:
        ValueError: If any argument is missing, malformed or out of range.
    """
    input_path: Path = args.input
    # is_file() is already False for a missing path, so one check covers both
    # "does not exist" and "exists but is not a regular file".
    if not input_path.is_file():
        raise ValueError(f"Input file not found: {input_path}")
    if input_path.suffix.lower() != ".pdf":
        raise ValueError(f"Input file must have .pdf extension: {input_path}")

    if args.start < 1:
        raise ValueError("--start must be >= 1.")
    if args.end < args.start:
        raise ValueError("--end must be >= --start.")
    if args.dpi < 72:
        raise ValueError("--dpi must be >= 72.")

    allowed_rotations = {-270, -180, -90, 0, 90, 180, 270}
    if args.rotate not in allowed_rotations:
        raise ValueError("--rotate must be one of: 0, 90, 180, 270, -90, -180, -270.")

    # Only open the document once the cheap checks have passed.
    total_pages = get_total_pages(input_path)
    if args.end > total_pages:
        raise ValueError(
            f"--end ({args.end}) is out of range. Document has {total_pages} pages.",
        )
    return total_pages


def ensure_tesseract_available() -> None:
    """Verify that the Tesseract binary can be found and queried.

    Raises:
        RuntimeError: If ``tesseract`` is not on ``PATH``, or if pytesseract
            fails to obtain its version.
    """
    if shutil.which("tesseract") is None:
        raise RuntimeError(
            "Tesseract is not installed or not in PATH. "
            "Install it, for example: sudo apt-get install tesseract-ocr",
        )
    try:
        pytesseract.get_tesseract_version()
    except Exception as exc:
        # Any failure here means the binary is unusable; report it uniformly
        # and keep the original exception as the cause.
        raise RuntimeError("Cannot use Tesseract binary from current environment.") from exc


def write_output(output_path: Path, pages: list[tuple[int, str]]) -> None:
    """Write recognized pages to a UTF-8 text file, one titled section per page.

    Missing parent directories are created. An existing file is overwritten.

    Args:
        output_path: Destination text file.
        pages: ``(page_number, text)`` pairs, written in the order given.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as output_file:
        for page_number, text in pages:
            output_file.write(f"=== Page {page_number} ===\n")
            # Drop trailing whitespace so every section ends with exactly one
            # blank line.
            output_file.write(text.rstrip())
            output_file.write("\n\n")


def main(argv: list[str] | None = None) -> int:
    """Run the OCR command-line workflow.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which keeps a plain ``main()`` call working.

    Returns:
        ``0`` on success, ``1`` if validation, OCR or writing the output fails.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    def report_progress(index, total, page) -> None:
        print(f"[{index}/{total}] OCR page {page}", flush=True)

    try:
        total_pages = validate_args(args)
        ensure_tesseract_available()

        print(
            f"Running OCR for pages {args.start}-{args.end} of {total_pages} "
            f"from {args.input} with rotate={args.rotate}...",
            flush=True,
        )
        pages = ocr_pdf_range(
            args.input,
            args.start,
            args.end,
            lang=args.lang,
            dpi=args.dpi,
            rotate=args.rotate,
            on_progress=report_progress,
        )
        write_output(args.output, pages)
    except (ValueError, RuntimeError, OSError) as exc:
        # OSError covers file-system failures (e.g. an unwritable output path)
        # so they are reported as a clean error instead of a traceback.
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    print(f"Saved OCR text to: {args.output}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())