#!/usr/bin/env python3 """ PDF to Excel Table Extractor A CLI tool to extract tables from PDF files and save them to Excel format. """ import argparse import shutil import sys from pathlib import Path # Check for required dependencies before importing them def check_dependencies(): """Check if all required dependencies are installed.""" missing_deps = [] try: import camelot except ImportError: missing_deps.append("camelot-py[cv]") try: import pandas except ImportError: missing_deps.append("pandas") try: import openpyxl except ImportError: missing_deps.append("openpyxl") try: import cv2 except ImportError as e: if "libGL" in str(e): print("Error: Missing system library 'libGL.so.1'.") print("Install it with: sudo apt-get install libgl1") sys.exit(1) missing_deps.append("opencv-python") if missing_deps: print("Error: Missing required Python packages:") for dep in missing_deps: print(f" - {dep}") print(f"\nInstall them with: pip install {' '.join(missing_deps)}") sys.exit(1) # Check for Ghostscript if not shutil.which("gs") and not shutil.which("gswin64c") and not shutil.which("gswin32c"): print("Error: Ghostscript is not installed or not in PATH.") print("\nInstall Ghostscript:") print(" Ubuntu/Debian: sudo apt-get install ghostscript") print(" Fedora/RHEL: sudo dnf install ghostscript") print(" macOS: brew install ghostscript") print(" Windows: https://www.ghostscript.com/releases/gsdnld.html") sys.exit(1) check_dependencies() import camelot import pandas as pd def extract_tables_from_pdf(pdf_path: str, pages: str = "all") -> list: """ Extract tables from a PDF file. Args: pdf_path: Path to the PDF file pages: Page numbers to extract from (default: "all") Can be "all", "1", "1,2,3", or "1-3" Returns: List of pandas DataFrames containing the extracted tables """ try: # Try lattice method first (works better for tables with visible borders) tables = camelot.read_pdf(pdf_path, pages=pages, flavor="lattice") if len(tables) == 0: # Fall back to stream method (works for tables without visible borders) print("No tables found with lattice method, trying stream method...") tables = camelot.read_pdf(pdf_path, pages=pages, flavor="stream") return tables except FileNotFoundError: print(f"Error: PDF file '{pdf_path}' not found.") sys.exit(1) except PermissionError: print(f"Error: Permission denied when accessing '{pdf_path}'.") sys.exit(1) except Exception as e: error_msg = str(e).lower() if "ghostscript" in error_msg: print("Error: Ghostscript error occurred.") print("Make sure Ghostscript is properly installed and accessible.") print(f"Details: {e}") elif "password" in error_msg or "encrypted" in error_msg: print("Error: The PDF appears to be password-protected or encrypted.") print("Please provide an unencrypted PDF file.") elif "invalid" in error_msg and "page" in error_msg: print(f"Error: Invalid page specification '{pages}'.") print("Use 'all', a single page number (e.g., '1'), ") print("a comma-separated list (e.g., '1,2,3'), or a range (e.g., '1-5').") elif "no tables" in error_msg: print("Error: No tables could be detected in the PDF.") print("The PDF might contain images of tables rather than actual table data.") else: print(f"Error extracting tables: {e}") sys.exit(1) def save_tables_to_excel(tables, output_path: str, separate_sheets: bool = True) -> None: """ Save extracted tables to an Excel file. Args: tables: List of camelot Table objects output_path: Path to the output Excel file separate_sheets: If True, save each table to a separate sheet """ if len(tables) == 0: print("No tables found in the PDF.") print("Tips:") print(" - The PDF might contain images of tables rather than actual table data") print(" - Try specifying different pages with the --pages option") sys.exit(1) # Check if output directory exists output_dir = Path(output_path).parent if output_dir and not output_dir.exists(): try: output_dir.mkdir(parents=True, exist_ok=True) print(f"Created output directory: {output_dir}") except PermissionError: print(f"Error: Permission denied when creating directory '{output_dir}'.") sys.exit(1) except Exception as e: print(f"Error creating output directory: {e}") sys.exit(1) # Check if output file already exists and is writable output_file = Path(output_path) if output_file.exists(): try: # Test if we can write to the file with open(output_file, 'a'): pass except PermissionError: print(f"Error: Cannot write to '{output_path}'. File may be open in another program.") sys.exit(1) try: with pd.ExcelWriter(output_path, engine="openpyxl") as writer: if separate_sheets: for i, table in enumerate(tables): df = table.df # Use first row as header if len(df) > 0: df.columns = df.iloc[0] df = df[1:] df.to_excel(writer, sheet_name=f"Table_{i+1}", index=False) print(f"Table {i+1}: {len(df)} rows extracted (Page {table.page})") else: # Combine all tables into one sheet all_dfs = [] for table in tables: df = table.df if len(df) > 0: df.columns = df.iloc[0] df = df[1:] all_dfs.append(df) if all_dfs: combined_df = pd.concat(all_dfs, ignore_index=True) combined_df.to_excel(writer, sheet_name="Combined_Tables", index=False) print(f"Combined {len(tables)} tables into one sheet with {len(combined_df)} total rows") else: print("Warning: All tables were empty.") except PermissionError: print(f"Error: Permission denied when writing to '{output_path}'.") print("Make sure the file is not open in another program.") sys.exit(1) except Exception as e: print(f"Error saving Excel file: {e}") sys.exit(1) def validate_pages_arg(pages: str) -> bool: """ Validate the pages argument format. Args: pages: Pages specification string Returns: True if valid, False otherwise """ if pages.lower() == "all": return True # Check for valid formats: "1", "1,2,3", "1-5", "1,3-5,7" import re pattern = r'^(\d+(-\d+)?)(,\d+(-\d+)?)*$' return bool(re.match(pattern, pages.replace(" ", ""))) def main(): parser = argparse.ArgumentParser( description="Extract tables from PDF files and save to Excel format.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: %(prog)s input.pdf output.xlsx %(prog)s input.pdf output.xlsx --pages 1,2,3 %(prog)s input.pdf output.xlsx --pages 1-5 %(prog)s input.pdf output.xlsx --combine """ ) parser.add_argument( "input_pdf", help="Path to the input PDF file" ) parser.add_argument( "output_excel", help="Path to the output Excel file" ) parser.add_argument( "-p", "--pages", default="all", help="Pages to extract tables from (default: all). " "Can be 'all', '1', '1,2,3', or '1-5'" ) parser.add_argument( "-c", "--combine", action="store_true", help="Combine all tables into a single sheet instead of separate sheets" ) args = parser.parse_args() # Validate input file input_path = Path(args.input_pdf) if not input_path.exists(): print(f"Error: Input file '{args.input_pdf}' does not exist.") sys.exit(1) if not input_path.is_file(): print(f"Error: '{args.input_pdf}' is not a file.") sys.exit(1) if not input_path.suffix.lower() == ".pdf": print(f"Warning: Input file may not be a PDF (extension: {input_path.suffix})") # Check if input file is readable try: with open(input_path, 'rb') as f: # Read first few bytes to check if it's a valid PDF header = f.read(5) if header != b'%PDF-': print(f"Warning: File does not appear to be a valid PDF (missing PDF header).") except PermissionError: print(f"Error: Permission denied when reading '{args.input_pdf}'.") sys.exit(1) except Exception as e: print(f"Error reading input file: {e}") sys.exit(1) # Validate pages argument if not validate_pages_arg(args.pages): print(f"Error: Invalid page specification '{args.pages}'.") print("Valid formats:") print(" 'all' - Extract from all pages") print(" '1' - Extract from page 1") print(" '1,2,3' - Extract from pages 1, 2, and 3") print(" '1-5' - Extract from pages 1 through 5") print(" '1,3-5,7' - Extract from pages 1, 3-5, and 7") sys.exit(1) # Ensure output has .xlsx extension output_path = args.output_excel if not output_path.lower().endswith(".xlsx"): output_path += ".xlsx" print(f"Extracting tables from: {args.input_pdf}") print(f"Pages: {args.pages}") # Extract tables tables = extract_tables_from_pdf(args.input_pdf, args.pages) print(f"Found {len(tables)} table(s)") # Save to Excel save_tables_to_excel(tables, output_path, separate_sheets=not args.combine) print(f"Successfully saved to: {output_path}") if __name__ == "__main__": try: main() except KeyboardInterrupt: print("\nOperation cancelled by user.") sys.exit(130) except Exception as e: print(f"Unexpected error: {e}") print("If this problem persists, please report it with the full error message.") sys.exit(1)