# Files
#
# 132 lines
# 4.1 KiB
# Python

from __future__ import annotations
import argparse
import shutil
import sys
from pathlib import Path
import pytesseract
from pdf_ocr import get_total_pages, ocr_pdf_range
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the PDF OCR tool.

    Returns:
        A parser with the required ``--input`` and ``--output`` paths and the
        optional ``--start``, ``--end``, ``--lang``, ``--dpi`` and ``--rotate``
        options.
    """
    cli = argparse.ArgumentParser(
        description="Recognize text from selected PDF pages and save to a text file.",
    )
    # Declared as data so every option is visible at a glance; the order here
    # is the order in which the options are registered (and shown in --help).
    option_specs = (
        ("--input", {"required": True, "type": Path, "help": "Path to source PDF file."}),
        ("--start", {"type": int, "help": "Start page (1-based, inclusive). Default: 1."}),
        ("--end", {"type": int, "help": "End page (1-based, inclusive). Default: last page."}),
        ("--output", {"required": True, "type": Path, "help": "Path to output TXT file."}),
        ("--lang", {"default": "rus+eng", "help": "Tesseract language(s), example: rus+eng."}),
        ("--dpi", {"default": 300, "type": int, "help": "Render DPI before OCR. Default: 300."}),
        (
            "--rotate",
            {
                "default": 0,
                "type": int,
                "help": "Rotate page image before OCR. Allowed: 0, 90, 180, 270, -90, -180, -270.",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli
def validate_args(args: argparse.Namespace) -> int:
    """Validate parsed CLI arguments and resolve the page range in place.

    Checks that need only the arguments themselves run first, so an invalid
    ``--dpi``, ``--rotate``, ``--start`` or ``--end`` is reported without
    opening the PDF. ``args.start`` and ``args.end`` are then filled in when
    omitted (1 and the last page respectively) and checked against the
    document's page count.

    Args:
        args: Parsed arguments providing ``input``, ``start``, ``end``,
            ``dpi`` and ``rotate``. ``start`` and ``end`` are updated in place.

    Returns:
        The total number of pages as reported by ``get_total_pages``.

    Raises:
        ValueError: If the input is not an existing ``.pdf`` file, or if
            ``dpi``, ``rotate``, ``start`` or ``end`` is out of range.
    """
    input_path: Path = args.input
    # is_file() is already False for a missing path, so exists() is redundant.
    if not input_path.is_file():
        raise ValueError(f"Input file not found: {input_path}")
    if input_path.suffix.lower() != ".pdf":
        raise ValueError(f"Input file must have .pdf extension: {input_path}")

    # Argument-only checks come before the PDF is opened (fail fast).
    if args.dpi < 72:
        raise ValueError("--dpi must be >= 72.")
    allowed_rotations = {-270, -180, -90, 0, 90, 180, 270}
    if args.rotate not in allowed_rotations:
        raise ValueError("--rotate must be one of: 0, 90, 180, 270, -90, -180, -270.")

    if args.start is None:
        args.start = 1
    if args.start < 1:
        raise ValueError("--start must be >= 1.")
    # Only compare against --end when the user actually supplied it; the
    # default end is checked against the page count below instead.
    if args.end is not None and args.end < args.start:
        raise ValueError("--end must be >= --start.")

    total_pages = get_total_pages(input_path)
    if total_pages < 1:
        raise ValueError(f"Document has no pages: {input_path}")
    if args.start > total_pages:
        # Without this check an omitted --end would default below --start and
        # the user would be told that --end is wrong.
        raise ValueError(
            f"--start ({args.start}) is out of range. Document has {total_pages} pages.",
        )
    if args.end is None:
        args.end = total_pages
    elif args.end > total_pages:
        raise ValueError(
            f"--end ({args.end}) is out of range. Document has {total_pages} pages.",
        )
    return total_pages
def ensure_tesseract_available() -> None:
    """Verify that the Tesseract binary can be found and queried.

    Raises:
        RuntimeError: If no ``tesseract`` executable is found on ``PATH``, or
            if asking pytesseract for the Tesseract version fails.
    """
    binary_location = shutil.which("tesseract")
    if binary_location is None:
        install_hint = (
            "Tesseract is not installed or not in PATH. Install it, for example: sudo apt-get install tesseract-ocr"
        )
        raise RuntimeError(install_hint)

    # The version query is used purely as a smoke test; its result is discarded.
    try:
        pytesseract.get_tesseract_version()
    except Exception as exc:
        raise RuntimeError("Cannot use Tesseract binary from current environment.") from exc
def write_output(output_path: Path, pages: list[tuple[int, str]]) -> None:
    """Write recognized page texts to a UTF-8 text file.

    Each page is emitted as a ``=== Page N ===`` header line, the page text
    with trailing whitespace removed, and a blank separator line. Missing
    parent directories of ``output_path`` are created first.

    Args:
        output_path: Destination file; overwritten if it already exists.
        pages: ``(page_number, text)`` pairs, written in the given order.
    """
    destination_dir = output_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as sink:
        sink.writelines(
            f"=== Page {number} ===\n{body.rstrip()}\n\n" for number, body in pages
        )
def main() -> int:
    """Run the OCR command-line workflow.

    Parses the command line, validates the arguments, checks that Tesseract
    is usable, runs OCR over the requested page range while printing one
    progress line per callback, and writes the result to the output file.

    Returns:
        0 on success; 1 if validation, the Tesseract check, OCR or writing
        the output fails with ``ValueError``, ``RuntimeError`` or ``OSError``
        (the message is printed to stderr).
    """
    args = build_parser().parse_args()

    def report_progress(index: int, total: int, page: int) -> None:
        # Flushed so progress is visible immediately even when stdout is piped.
        print(f"[{index}/{total}] OCR page {page}", flush=True)

    try:
        total_pages = validate_args(args)
        ensure_tesseract_available()
        print(
            (
                f"Running OCR for pages {args.start}-{args.end} of {total_pages} "
                f"from {args.input} with rotate={args.rotate}..."
            ),
            flush=True,
        )
        pages = ocr_pdf_range(
            args.input,
            args.start,
            args.end,
            lang=args.lang,
            dpi=args.dpi,
            rotate=args.rotate,
            on_progress=report_progress,
        )
        write_output(args.output, pages)
    except (ValueError, RuntimeError, OSError) as exc:
        # OSError covers file-system failures (e.g. an unwritable output
        # path) so they are reported like other errors, not as a traceback.
        print(f"Error: {exc}", file=sys.stderr)
        return 1
    print(f"Saved OCR text to: {args.output}")
    return 0
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())