added merge pdfs, fixed jpgstopdf arg issues,

This commit is contained in:
klein panic
2024-11-01 17:27:04 -04:00
parent 242841c44b
commit 8d09cfb89f
8096 changed files with 9895 additions and 1426612 deletions

View File

@@ -4,10 +4,29 @@ import sys
import os
def pdf_to_markdown(pdf_path, output_md_path):
# Check if the input PDF file exists
if not os.path.isfile(pdf_path):
print(f"Error: The input PDF file '{pdf_path}' does not exist.")
sys.exit(1)
# Check if the output directory is writable
output_dir = os.path.dirname(output_md_path)
if output_dir and not os.access(output_dir, os.W_OK):
print(f"Error: The output directory '{output_dir}' is not writable.")
sys.exit(1)
try:
# Open the PDF document
pdf_document = fitz.open(pdf_path)
if pdf_document.page_count == 0:
print("Error: The PDF file contains no pages.")
pdf_document.close()
sys.exit(1)
# Initialize the Markdown content
md_content = "# PDF Content\n\n"
# Extract text from each page
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
text = page.get_text()
@@ -15,19 +34,27 @@ def pdf_to_markdown(pdf_path, output_md_path):
pdf_document.close()
with open(output_md_path, 'w') as file:
# Write the Markdown content to the output file
with open(output_md_path, 'w', encoding='utf-8') as file:
file.write(md_content)
print(f"Markdown file created at {output_md_path}")
print(f"Markdown file created at '{output_md_path}'")
except FileNotFoundError:
print(f"Error: The input PDF file '{pdf_path}' was not found.")
sys.exit(1)
except PermissionError:
print(f"Error: Permission denied when writing to '{output_md_path}'.")
sys.exit(1)
except Exception as e:
print(f"Error reading PDF content: {e}")
print(f"An unexpected error occurred: {e}")
sys.exit(1)
if len(sys.argv) != 3:
print("Usage: pdftomd <input_pdf_path> <output_md_path>")
sys.exit(1)
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: pdftomd <input_pdf_path> <output_md_path>")
sys.exit(1)
input_path = sys.argv[1]
output_path = sys.argv[2]
input_path = sys.argv[1]
output_path = sys.argv[2]
pdf_to_markdown(input_path, output_path)
pdf_to_markdown(input_path, output_path)