# pdf-reader/main.py

from __future__ import annotations

import argparse
import shutil
import sys
from pathlib import Path

import pytesseract

from pdf_ocr import get_total_pages, ocr_pdf_range


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the OCR tool.

    Returns:
        A parser with the required ``--input`` and ``--output`` path options
        and the optional ``--start``, ``--end``, ``--lang``, ``--dpi`` and
        ``--rotate`` options.
    """
    cli = argparse.ArgumentParser(
        description="Recognize text from selected PDF pages and save to a text file.",
    )

    # (flag, add_argument keyword settings), registered in this order so the
    # generated help lists the options exactly as declared here.
    option_specs = (
        ("--input", {"required": True, "type": Path, "help": "Path to source PDF file."}),
        ("--start", {"type": int, "help": "Start page (1-based, inclusive). Default: 1."}),
        ("--end", {"type": int, "help": "End page (1-based, inclusive). Default: last page."}),
        ("--output", {"required": True, "type": Path, "help": "Path to output TXT file."}),
        ("--lang", {"default": "rus+eng", "help": "Tesseract language(s), example: rus+eng."}),
        ("--dpi", {"default": 300, "type": int, "help": "Render DPI before OCR. Default: 300."}),
        (
            "--rotate",
            {
                "default": 0,
                "type": int,
                "help": "Rotate page image before OCR. Allowed: 0, 90, 180, 270, -90, -180, -270.",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    return cli


def validate_args(args: argparse.Namespace) -> int:
    """Validate parsed CLI arguments and fill in the default page range.

    ``args.start`` and ``args.end`` are updated in place: a missing start
    becomes 1 and a missing end becomes the document's page count.

    Args:
        args: Parsed arguments; the ``input``, ``start``, ``end``, ``dpi``
            and ``rotate`` attributes are read.

    Returns:
        The total number of pages reported by ``get_total_pages`` for the
        input file.

    Raises:
        ValueError: If the input is not an existing ``.pdf`` file, the page
            range is empty or outside the document, ``dpi`` is below 72, or
            ``rotate`` is not a supported angle.
    """
    input_path: Path = args.input
    # is_file() is already False for a missing path, so it covers both the
    # "does not exist" and the "is not a regular file" cases.
    if not input_path.is_file():
        raise ValueError(f"Input file not found: {input_path}")

    if input_path.suffix.lower() != ".pdf":
        raise ValueError(f"Input file must have .pdf extension: {input_path}")

    total_pages = get_total_pages(input_path)

    if args.start is None:
        args.start = 1
    if args.end is None:
        args.end = total_pages

    if args.start < 1:
        raise ValueError("--start must be >= 1.")

    # Checked before the start/end comparison: when --end is omitted it
    # defaults to total_pages, so a too-large --start would otherwise be
    # reported as a problem with --end, which the user never supplied.
    if args.start > total_pages:
        raise ValueError(
            f"--start ({args.start}) is out of range. Document has {total_pages} pages.",
        )

    if args.end < args.start:
        raise ValueError("--end must be >= --start.")

    if args.dpi < 72:
        raise ValueError("--dpi must be >= 72.")

    allowed_rotations = {-270, -180, -90, 0, 90, 180, 270}
    if args.rotate not in allowed_rotations:
        raise ValueError("--rotate must be one of: 0, 90, 180, 270, -90, -180, -270.")

    if args.end > total_pages:
        raise ValueError(
            f"--end ({args.end}) is out of range. Document has {total_pages} pages.",
        )

    return total_pages


def ensure_tesseract_available() -> None:
    """Check that a usable Tesseract executable is reachable.

    Raises:
        RuntimeError: If no ``tesseract`` executable is found on PATH, or if
            asking pytesseract for the Tesseract version fails for any
            reason (the original error is chained as the cause).
    """
    located_binary = shutil.which("tesseract")
    if located_binary is None:
        raise RuntimeError(
            "Tesseract is not installed or not in PATH. Install it, for example: sudo apt-get install tesseract-ocr",
        )

    # Being on PATH is not enough; make sure pytesseract can actually query it.
    try:
        pytesseract.get_tesseract_version()
    except Exception as probe_error:
        raise RuntimeError("Cannot use Tesseract binary from current environment.") from probe_error


def write_output(output_path: Path, pages: list[tuple[int, str]]) -> None:
    """Write recognized page texts to a UTF-8 text file.

    Missing parent directories are created. Each page is written as a
    ``=== Page N ===`` header line, the page text with trailing whitespace
    stripped, and a blank separator line. An existing file is overwritten.

    Args:
        output_path: Destination text file.
        pages: ``(page_number, text)`` pairs, written in the given order.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    sections = (
        f"=== Page {number} ===\n{body.rstrip()}\n\n"
        for number, body in pages
    )
    with output_path.open("w", encoding="utf-8") as handle:
        handle.writelines(sections)


def main() -> int:
    """Run the OCR command-line workflow.

    Parses the command line, validates the arguments, checks that Tesseract
    is usable, runs OCR over the requested page range while printing
    per-page progress, and writes the recognized text to the output file.

    Returns:
        0 on success. 1 if a ``ValueError``, ``RuntimeError`` or ``OSError``
        is raised during validation, OCR or writing; the error message is
        printed to stderr first.
    """
    args = build_parser().parse_args()

    def report_progress(index, total, page):
        # Flush so progress shows up immediately even when stdout is piped.
        print(f"[{index}/{total}] OCR page {page}", flush=True)

    try:
        total_pages = validate_args(args)
        ensure_tesseract_available()

        print(
            (
                f"Running OCR for pages {args.start}-{args.end} of {total_pages} "
                f"from {args.input} with rotate={args.rotate}..."
            ),
            flush=True,
        )

        pages = ocr_pdf_range(
            args.input,
            args.start,
            args.end,
            lang=args.lang,
            dpi=args.dpi,
            rotate=args.rotate,
            on_progress=report_progress,
        )
        write_output(args.output, pages)
    except (ValueError, RuntimeError, OSError) as exc:
        # OSError covers filesystem failures (e.g. an unwritable output
        # location) so they are reported like other errors, not as a traceback.
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    print(f"Saved OCR text to: {args.output}")
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())