Files
convertions/convertions-env/lib/python3.11/site-packages/pymupdf/__main__.py
2024-09-29 01:45:31 -04:00

1147 lines
40 KiB
Python

# -----------------------------------------------------------------------------
# Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
# Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a
# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
# maintained and developed by Artifex Software, Inc. https://artifex.com.
# -----------------------------------------------------------------------------
import argparse
import bisect
import os
import sys
import statistics
from typing import Dict, List, Set
from . import pymupdf
def mycenter(x):
    """Return *x*, surrounded by single blanks, centered in a 75-column line of dashes."""
    label = " %s " % x
    return label.center(75, "-")
def recoverpix(doc, item):
    """Return the image belonging to an entry of a page's image list.

    Args:
        doc: the document containing the image.
        item: sequence whose first element is the image xref and whose
            second element is the xref of its /SMask (0 if there is none).

    Returns:
        The result of ``doc.extract_image`` if there is no /SMask, otherwise
        a pixmap (with the /SMask applied as alpha channel where supported).
    """
    xref, smask = item[0], item[1]
    if smask == 0:
        # no soft mask: the stored image can be handed out directly
        return doc.extract_image(xref)

    def to_rgb_if_four_components(pixmap):
        # pixmaps with a 4-component colorspace (presumably CMYK) are
        # converted to RGB, everything else is returned untouched
        if pixmap.colorspace.n == 4:
            return pymupdf.Pixmap(pymupdf.csRGB, pixmap)
        return pixmap

    # the alpha channel must be rebuilt from the soft mask
    base = pymupdf.Pixmap(doc, xref)
    mask = pymupdf.Pixmap(doc, smask)

    # The mask is usable only if both pixmaps cover the same rectangle,
    # neither has an alpha channel, and the mask has 1 byte per pixel.
    usable = (
        base.irect == mask.irect
        and base.alpha == mask.alpha == 0
        and mask.n == 1
    )
    if not usable:
        pymupdf.message("Warning: unsupported /SMask %i for %i:" % (smask, xref))
        pymupdf.message(mask)
        mask = None
        return to_rgb_if_four_components(base)  # image without the mask

    combined = pymupdf.Pixmap(base)  # copy of the base image to receive alpha
    combined.set_alpha(mask.samples)  # mask samples become the alpha values
    base = mask = None  # drop the temporary pixmaps
    return to_rgb_if_four_components(combined)
def open_file(filename, password, show=False, pdf=True):
    """Open and, if required, authenticate a document.

    Args:
        filename: path of the document to open.
        password: password to use if the document needs one (may be None).
        show: if True, report whether authentication was as owner or user.
        pdf: if True, terminate the program for non-PDF documents.

    Returns:
        The opened (and authenticated) document.

    Exits the program (``sys.exit``) if a PDF is required but not given, if a
    needed password is missing, or if authentication fails.
    """
    doc = pymupdf.open(filename)
    if not doc.is_pdf and pdf is True:
        sys.exit("this command supports PDF files only")
    if not doc.needs_pass:
        return doc
    if not password:
        sys.exit("'%s' requires a password" % doc.name)
    rc = doc.authenticate(password)
    if not rc:
        sys.exit("authentication unsuccessful")
    if show is True:
        # Parentheses are essential: "%" binds tighter than the conditional
        # expression, so without them a user login would print just "user".
        pymupdf.message("authenticated as %s" % ("owner" if rc > 2 else "user"))
    return doc
def print_dict(item):
    """Print a Python dictionary, one "key: value" line per entry.

    Keys are right-justified to a common width (longest key plus one) so the
    colons line up. An empty dictionary prints nothing.

    Args:
        item: dictionary with string keys.
    """
    # default=0 keeps an empty dictionary from raising ValueError in max()
    width = max((len(key) for key in item), default=0) + 1
    for key, value in item.items():
        pymupdf.message("%s: %s" % (key.rjust(width), value))
def print_xref(doc, xref):
    """Print an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size, taken from the object's /Length entry;
    "unknown" is printed if /Length is missing or is an indirect reference.

    Args:
        doc: the PDF document.
        xref: (int) cross reference number of the object.
    """
    pymupdf.message("%i 0 obj" % xref)
    xref_str = doc.xref_object(xref)
    pymupdf.message(xref_str)
    if doc.xref_is_stream(xref):
        tokens = xref_str.split()
        try:
            idx = tokens.index("/Length") + 1
            size = tokens[idx]
            # An indirect reference reads "/Length <num> <gen> R". After
            # splitting on whitespace these are separate tokens, so look at
            # the token two positions further instead of the size token itself.
            if tokens[idx + 2 : idx + 3] == ["R"]:
                size = "unknown"
        except (ValueError, IndexError):  # no /Length, or nothing after it
            size = "unknown"
        pymupdf.message("stream\n...%s bytes" % size)
        pymupdf.message("endstream")
    pymupdf.message("endobj")
def get_list(rlist, limit, what="page"):
    """Expand a page / xref specification string into a list of integers.

    The specification is a comma-separated list of single numbers and
    ranges "a-b"; the letter "N" stands for ``limit - 1``. Blanks are
    ignored. A range with a > b is expanded in descending order.

    Args
    ----
    rlist: (str) the specification, e.g. "1,5-7,N"
    limit: exclusive upper bound; valid numbers are 1 .. limit - 1
    what: a string to be used in error messages

    Returns
    -------
    A list of integers representing the specification. The program is
    terminated via sys.exit() if an item is malformed or out of bounds.
    """
    spec = rlist.replace("N", str(limit - 1)).replace(" ", "")
    numbers = []
    for position, token in enumerate(spec.split(","), start=1):
        if token.isdecimal():  # a single integer
            value = int(token)
            if not 1 <= value < limit:
                sys.exit("bad %s specification at item %i" % (what, position))
            numbers.append(value)
            continue

        # anything else must be a range "first-last" of two integers
        range_error = "bad %s range specification at item %i" % (what, position)
        try:
            first, last = (int(part) for part in token.split("-"))
        except Exception:
            sys.exit(range_error)
        if not (1 <= first < limit and 1 <= last < limit):
            sys.exit(range_error)

        step = 1 if first <= last else -1  # walk up or down, ends inclusive
        numbers.extend(range(first, last + step, step))
    return numbers
def show(args):
    """Print general information about a PDF plus any requested details.

    Always prints a summary line (name, page and object counts, file size,
    format, encryption) and notes about form fields and embedded files.
    Depending on the arguments, the catalog, metadata, selected objects,
    selected pages and the trailer are printed in that order.
    """
    doc = open_file(args.input, args.password, True)

    # file size, shown in KB or, for larger files, in MB
    kilobytes = os.path.getsize(args.input) / 1024
    if kilobytes > 1000:
        size, unit = kilobytes / 1024, "MB"
    else:
        size, unit = kilobytes, "KB"
    size = round(size, 1)

    meta = doc.metadata  # pylint: disable=no-member
    summary = "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s" % (
        args.input,
        doc.page_count,
        doc.xref_length() - 1,
        size,
        unit,
        meta["format"],
        meta["encryption"],
    )
    pymupdf.message(summary)

    field_count = doc.is_form_pdf
    if field_count > 0:
        sigflags = doc.get_sigflags()
        negation = "" if sigflags == 3 else "not "
        pymupdf.message(
            "document contains %i root form fields and is %ssigned"
            % (field_count, negation)
        )
    embedded_count = doc.embfile_count()
    if embedded_count > 0:
        pymupdf.message("document contains %i embedded files" % embedded_count)
    pymupdf.message()

    if args.catalog:
        pymupdf.message(mycenter("PDF catalog"))
        print_xref(doc, doc.pdf_catalog())
        pymupdf.message()

    if args.metadata:
        pymupdf.message(mycenter("PDF metadata"))
        print_dict(doc.metadata)  # pylint: disable=no-member
        pymupdf.message()

    if args.xrefs:
        pymupdf.message(mycenter("object information"))
        for number in get_list(args.xrefs, doc.xref_length(), what="xref"):
            print_xref(doc, number)
            pymupdf.message()

    if args.pages:
        pymupdf.message(mycenter("page information"))
        for pno in get_list(args.pages, doc.page_count + 1):
            page_xref = doc.page_xref(pno - 1)  # page numbers are 1-based here
            pymupdf.message("Page %i:" % pno)
            print_xref(doc, page_xref)
            pymupdf.message()

    if args.trailer:
        pymupdf.message(mycenter("PDF trailer"))
        pymupdf.message(doc.pdf_trailer())
        pymupdf.message()

    doc.close()
def clean(args):
    """Optimize a PDF, or create a sub-PDF if pages are given.

    Without ``args.pages`` the input document itself is saved to
    ``args.output`` with the requested options. With ``args.pages`` a new
    document is built from the selected pages (in the given order) and saved
    with the same options. The input document is closed in both cases.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # the position in this tuple is the integer passed as 'encryption'
    encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
        args.encryption
    )
    # both save paths use exactly the same options
    save_options = dict(
        garbage=args.garbage,
        deflate=args.compress,
        pretty=args.pretty,
        clean=args.sanitize,
        ascii=args.ascii,
        linear=args.linear,
        encryption=encrypt,
        owner_pw=args.owner,
        user_pw=args.user,
        permissions=args.permission,
    )

    if not args.pages:  # simple cleaning
        doc.save(args.output, **save_options)
        doc.close()
        return

    # create sub document from page numbers
    pages = get_list(args.pages, doc.page_count + 1)
    outdoc = pymupdf.open()
    for pno in pages:
        n = pno - 1  # specification is 1-based, insert_pdf is 0-based
        outdoc.insert_pdf(doc, from_page=n, to_page=n)
    outdoc.save(args.output, **save_options)
    doc.close()
    outdoc.close()
    return
def doc_join(args):
    """Join pages from several PDF documents into one output PDF.

    Each entry of ``args.input`` has the form 'filename[,password[,pages]]';
    everything after the second comma is treated as the page specification.
    Without a page specification all pages of that input are taken.
    """
    target = pymupdf.open()  # the PDF being assembled
    for spec in args.input:  # one input PDF per entry
        parts = spec.split(",")
        password = parts[1] if len(parts) > 1 else None
        src = open_file(parts[0], password, pdf=True)
        page_spec = ",".join(parts[2:])  # re-join the 'pages' part
        if page_spec:
            wanted = get_list(page_spec, src.page_count + 1)
        else:  # nothing specified: every page
            wanted = range(1, src.page_count + 1)
        for pno in wanted:
            # copy page by page so the requested order is honored
            target.insert_pdf(src, from_page=pno - 1, to_page=pno - 1)
        src.close()
    target.save(args.output, garbage=4, deflate=True)
    target.close()
def embedded_copy(args):
    """Copy embedded files from a source PDF into the input PDF.

    All entries of the source are copied unless ``args.name`` restricts the
    selection. The program exits if a requested name is missing in the
    source, if there is nothing to copy, or if a name already exists in the
    receiving PDF. The result is saved incrementally unless a different
    output file is given.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # no separate output file means: update the input file itself
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    src = open_file(args.source, args.pwdsource)
    names = set(args.name) if args.name else set()
    src_names = set(src.embfile_names())
    if not names:  # no restriction: take everything the source offers
        names = src_names
    elif not names <= src_names:
        sys.exit("not all names are contained in source")
    if not names:
        sys.exit("nothing to copy")

    intersect = names & set(doc.embfile_names())  # names already in target
    if intersect:
        sys.exit("following names already exist in receiving PDF: %s" % str(intersect))

    for entry in names:
        info = src.embfile_info(entry)
        content = src.embfile_get(entry)
        doc.embfile_add(
            entry,
            content,
            filename=info["filename"],
            ufilename=info["ufilename"],
            desc=info["desc"],
        )
        pymupdf.message("copied entry '%s' from '%s'" % (entry, src.name))
    src.close()

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
def embedded_del(args):
    """Delete an embedded file entry from a PDF.

    The change is saved incrementally unless a different output file is
    given. The program exits if the entry does not exist or if an
    incremental save is needed but not possible.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # no separate output file means: update the input file itself
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    # exception type(s) treated as "entry not found"; only ValueError is
    # caught when running with MuPDF versions before 1.24
    not_found_errors = (ValueError, pymupdf.mupdf.FzErrorBase)
    if pymupdf.mupdf_version_tuple < (1, 24):
        not_found_errors = ValueError

    try:
        doc.embfile_del(args.name)
    except not_found_errors as e:  # pylint: disable=catching-non-exception
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=1)
    doc.close()
def embedded_get(args):
    """Write the contents of an embedded file to disk.

    The data is stored under ``args.output`` if given, otherwise under the
    filename recorded in the entry. The program exits if the entry does not
    exist.
    """
    doc = open_file(args.input, args.password, pdf=True)

    # exception type(s) treated as "entry not found"; only ValueError is
    # caught when running with MuPDF versions before 1.24
    not_found_errors = (ValueError, pymupdf.mupdf.FzErrorBase)
    if pymupdf.mupdf_version_tuple < (1, 24):
        not_found_errors = ValueError

    try:
        content = doc.embfile_get(args.name)
        info = doc.embfile_info(args.name)
    except not_found_errors as e:  # pylint: disable=catching-non-exception
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    target = args.output or info["filename"]
    with open(target, "wb") as outfile:
        outfile.write(content)
    pymupdf.message("saved entry '%s' as '%s'" % (args.name, target))
    doc.close()
def embedded_add(args):
    """Insert a new embedded file into a PDF.

    The content is read from ``args.path``; that path is also stored as the
    entry's filename and unicode filename, and as its description unless
    ``args.desc`` is given. The change is saved incrementally unless a
    different output file is given.

    The program exits if an entry named ``args.name`` already exists, if
    ``args.path`` is not an existing file, or if an incremental save is
    needed but not possible.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # no separate output file means: update the input file itself; the same
    # test is used for the pre-check and for the final save
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    # Look the name up instead of probing with a delete under a blanket
    # 'except Exception', which would also hide unrelated failures.
    if args.name in doc.embfile_names():
        sys.exit("entry '%s' already exists" % args.name)

    if not os.path.isfile(args.path):
        sys.exit("no such file '%s'" % args.path)
    with open(args.path, "rb") as f:
        stream = f.read()

    filename = args.path
    desc = args.desc if args.desc else filename
    doc.embfile_add(
        args.name, stream, filename=filename, ufilename=filename, desc=desc
    )

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
def embedded_upd(args):
    """Update contents and / or metadata of an embedded file.

    Only values actually supplied are passed on; everything else is handed
    to ``embfile_upd`` as None. If no unicode filename is given, a given
    filename is used for it as well. The change is saved incrementally
    unless a different output file is given.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # no separate output file means: update the input file itself
    in_place = args.output is None or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_info(args.name)  # only checks that the entry exists
    except Exception:
        sys.exit("no such embedded file '%s'" % args.name)

    # NOTE: a given path that is not an existing file is silently ignored,
    # i.e. the entry's content then stays unchanged.
    stream = None
    if args.path is not None and os.path.isfile(args.path):
        with open(args.path, "rb") as datafile:
            stream = datafile.read()

    filename = args.filename or None
    ufilename = args.ufilename or args.filename or None
    desc = args.desc or None

    doc.embfile_upd(
        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
    )

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
def embedded_list(args):
    """List embedded files of a PDF.

    With ``args.name`` only that entry is reported (in detail); the program
    exits if it does not exist. Otherwise all entry names are listed, with
    their details if ``args.detail`` is set. The document is closed on every
    regular return path.
    """
    doc = open_file(args.input, args.password, pdf=True)
    names = doc.embfile_names()

    if args.name is not None:  # report a single entry only
        if args.name not in names:
            sys.exit("no such embedded file '%s'" % args.name)
        pymupdf.message()
        pymupdf.message(
            "printing 1 of %i embedded file%s:"
            % (len(names), "s" if len(names) > 1 else "")
        )
        pymupdf.message()
        print_dict(doc.embfile_info(args.name))
        pymupdf.message()
        doc.close()
        return

    if not names:
        pymupdf.message("'%s' contains no embedded files" % doc.name)
        doc.close()
        return

    if len(names) > 1:
        msg = "'%s' contains the following %i embedded files" % (doc.name, len(names))
    else:
        msg = "'%s' contains the following embedded file" % doc.name
    pymupdf.message(msg)
    pymupdf.message()

    for name in names:
        if not args.detail:
            pymupdf.message(name)
            continue
        # fetch the entry's information once and print it
        print_dict(doc.embfile_info(name))
        pymupdf.message()
    doc.close()
def extract_objects(args):
    """Extract images and / or fonts from a PDF.

    Fonts and images of the selected pages (all pages by default) are
    written to ``args.output`` (current directory by default). Every xref is
    handled only once, even if it is used on several pages. Fonts without
    an extractable font file are skipped and not counted as saved.

    The program exits if neither fonts nor images are requested or if the
    output directory does not exist.
    """
    if not args.fonts and not args.images:
        sys.exit("neither fonts nor images requested")
    doc = open_file(args.input, args.password, pdf=True)

    if args.pages:
        pages = get_list(args.pages, doc.page_count + 1)
    else:
        pages = range(1, doc.page_count + 1)

    out_dir = args.output if args.output else os.path.abspath(os.curdir)
    if not os.path.isdir(out_dir):
        sys.exit("output directory %s does not exist" % out_dir)

    font_xrefs = set()  # fonts already looked at
    image_xrefs = set()  # already saved images
    saved_fonts = 0  # fonts actually written to disk

    for pno in pages:
        if args.fonts:
            for item in doc.get_page_fonts(pno - 1):
                xref = item[0]
                if xref in font_xrefs:
                    continue
                font_xrefs.add(xref)
                fontname, ext, _, buffer = doc.extract_font(xref)
                if ext == "n/a" or not buffer:
                    continue  # nothing extractable for this font
                outname = os.path.join(
                    out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
                )
                with open(outname, "wb") as outfile:
                    outfile.write(buffer)
                saved_fonts += 1
                buffer = None

        if args.images:
            for item in doc.get_page_images(pno - 1):
                xref = item[0]
                if xref in image_xrefs:
                    continue
                image_xrefs.add(xref)
                pix = recoverpix(doc, item)
                if isinstance(pix, dict):
                    # image was extracted in its stored format
                    ext = pix["ext"]
                    imgdata = pix["image"]
                    outname = os.path.join(out_dir, "img-%i.%s" % (xref, ext))
                    with open(outname, "wb") as outfile:
                        outfile.write(imgdata)
                else:
                    # a pixmap: save as PNG, converting to RGB if the
                    # colorspace has 4 or more components
                    outname = os.path.join(out_dir, "img-%i.png" % xref)
                    pix2 = (
                        pix
                        if pix.colorspace.n < 4
                        else pymupdf.Pixmap(pymupdf.csRGB, pix)
                    )
                    pix2.save(outname)

    if args.fonts:
        # report only fonts that were really written, not skipped ones
        pymupdf.message("saved %i fonts to '%s'" % (saved_fonts, out_dir))
    if args.images:
        pymupdf.message("saved %i images to '%s'" % (len(image_xrefs), out_dir))
    doc.close()
def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the plain text of a page, followed by an end-of-page marker.

    Args:
        page: object providing ``get_text``.
        textout: binary writable receiving UTF-8 encoded text.
        GRID, fontsize: not used by this mode.
        noformfeed: end the page with a linefeed instead of a formfeed.
        skip_empty: write nothing at all for a page without text.
        flags: passed through to ``page.get_text``.
    """
    page_end = b"\n" if noformfeed else b"\f"
    content = page.get_text("text", flags=flags)
    if content:
        textout.write(content.encode("utf8", errors="surrogatepass"))
    elif skip_empty:
        return  # empty page is suppressed completely
    textout.write(page_end)
def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text blocks of a page, sorted by their coordinates.

    Blocks are ordered by item 3 (bottom y-coordinate of the block tuple),
    then item 0 (left x-coordinate); item 4 of each block is its text.

    Args:
        page: object providing ``get_text``.
        textout: binary writable receiving UTF-8 encoded text.
        GRID, fontsize: not used by this mode.
        noformfeed: end the page with a linefeed instead of a formfeed.
        skip_empty: write nothing at all for a page without blocks.
        flags: passed through to ``page.get_text``.
    """
    page_end = b"\n" if noformfeed else b"\f"
    blocks = page.get_text("blocks", flags=flags)
    if blocks == []:
        if skip_empty:
            return  # empty page is suppressed completely
    else:
        blocks.sort(key=lambda blk: (blk[3], blk[0]))
        for blk in blocks:
            textout.write(blk[4].encode("utf8", errors="surrogatepass"))
    textout.write(page_end)
def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text of a page, approximating its visual layout.

    Single characters are taken from the page's "rawdict" text extraction,
    grouped into output lines by their rounded y-origin, and positioned
    horizontally by inserting spaces according to a page-wide character
    width ('slot'). Empty output lines are inserted to reflect vertical
    distances between text lines.

    Args:
        page: page object; must provide ``get_text`` and ``rect``.
        textout: binary writable receiving UTF-8 encoded text.
        GRID: rows whose y-coordinates differ by less than this are merged.
        fontsize: spans with a font size not larger than this are ignored.
        noformfeed: end the page with a linefeed instead of a formfeed.
        skip_empty: write nothing at all for a page without characters.
        flags: passed through to ``page.get_text``.
    """
    eop = b"\n" if noformfeed else bytes([12])

    # --------------------------------------------------------------------
    def find_line_index(values: List[int], value: int) -> int:
        """Find the right row coordinate.

        Args:
            values: (list) y-coordinates of rows, sorted ascending.
            value: (int) lookup for this value (y-origin of char).
        Returns:
            y-coordinate of appropriate line for value, i.e. the largest
            entry of 'values' that is not greater than 'value'.
        Raises:
            RuntimeError: if 'value' is smaller than every entry.
        """
        i = bisect.bisect_right(values, value)
        if i:
            return values[i - 1]
        raise RuntimeError("Line for %g not found in %s" % (value, values))

    # --------------------------------------------------------------------
    def curate_rows(rows: Set[int], GRID) -> List:
        """Return the sorted row coordinates, dropping those too close together.

        A coordinate is kept only if it is at least GRID larger than the
        previously kept one. 'rows' must not be empty.
        """
        rows = list(rows)
        rows.sort()  # sort ascending
        nrows = [rows[0]]
        for h in rows[1:]:
            if h >= nrows[-1] + GRID:  # only keep significant differences
                nrows.append(h)
        return nrows  # curated list of line bottom coordinates

    def process_blocks(blocks: List[Dict], page: pymupdf.Page):
        """Collect character information from the "rawdict" blocks.

        Returns:
            chars: list of tuples (char, x-origin, rounded y-origin, width)
            rows: set of the rounded y-origins of all collected chars
            left: smallest x-origin of any non-space char (initially page width)
            right: largest right bbox coordinate of any char (initially 0)
            rowheight: smallest bbox height of any considered line
                (initially page height)
        """
        rows = set()
        page_width = page.rect.width
        page_height = page.rect.height
        rowheight = page_height
        left = page_width
        right = 0
        chars = []
        for block in blocks:
            for line in block["lines"]:
                if line["dir"] != (1, 0):  # ignore non-horizontal text
                    continue
                x0, y0, x1, y1 = line["bbox"]
                if y1 < 0 or y0 > page.rect.height:  # ignore if outside CropBox
                    continue
                # update row height: keep the smallest line height seen
                height = y1 - y0

                if rowheight > height:
                    rowheight = height
                for span in line["spans"]:
                    if span["size"] <= fontsize:  # font too small: skip span
                        continue
                    for c in span["chars"]:
                        x0, _, x1, _ = c["bbox"]
                        cwidth = x1 - x0
                        ox, oy = c["origin"]
                        oy = int(round(oy))
                        rows.add(oy)
                        ch = c["c"]
                        if left > ox and ch != " ":
                            left = ox  # update left coordinate
                        if right < x1:
                            right = x1  # update right coordinate
                        # handle ligatures: a zero-width char on the same
                        # row as its predecessor is merged into it
                        if cwidth == 0 and chars != []:  # potential ligature
                            old_ch, old_ox, old_oy, old_cwidth = chars[-1]
                            if old_oy == oy:  # ligature
                                if old_ch != chr(0xFB00):  # previous "ff" char lig?
                                    lig = joinligature(old_ch + ch)  # no
                                # convert to one of the 3-char ligatures:
                                elif ch == "i":
                                    lig = chr(0xFB03)  # "ffi"
                                elif ch == "l":
                                    lig = chr(0xFB04)  # "ffl"
                                else:  # something wrong, leave old char in place
                                    lig = old_ch
                                chars[-1] = (lig, old_ox, old_oy, old_cwidth)
                                continue
                        chars.append((ch, ox, oy, cwidth))  # all chars on page
        return chars, rows, left, right, rowheight

    def joinligature(lig: str) -> str:
        """Return ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00). Strings without a known
            ligature are returned unchanged.
        """

        if lig == "ff":
            return chr(0xFB00)
        elif lig == "fi":
            return chr(0xFB01)
        elif lig == "fl":
            return chr(0xFB02)
        elif lig == "ffi":
            return chr(0xFB03)
        elif lig == "ffl":
            return chr(0xFB04)
        elif lig == "ft":
            return chr(0xFB05)
        elif lig == "st":
            return chr(0xFB06)
        return lig

    # --------------------------------------------------------------------
    def make_textline(left, slot, minslot, lchars):
        """Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) width corresponding to one character position
                of the output (computed by the enclosing function).
            minslot: (float) min width for the characters in this line.
            lchars: (list[tuple]) characters of this line, sorted by
                x-origin; items are (char, x-origin, y-origin, width).
        Returns:
            text: (str) text string for this line, trailing blanks removed
        Raises:
            RuntimeError: if minslot is not larger than pymupdf.EPSILON.
        """
        text = ""  # we output this
        old_char = ""
        old_x1 = 0  # end coordinate of last char
        old_ox = 0  # x-origin of last char
        if minslot <= pymupdf.EPSILON:
            raise RuntimeError("program error: minslot too small = %g" % minslot)

        for c in lchars:  # loop over characters
            char, ox, _, cwidth = c
            ox = ox - left  # its (relative) start coordinate
            x1 = ox + cwidth  # ending coordinate

            # eliminate overprint effect
            if old_char == char and ox - old_ox <= cwidth * 0.2:
                continue

            # omit spaces overlapping previous char
            # NOTE(review): divides by cwidth - a zero-width space reaching
            # this line would raise ZeroDivisionError.
            if char == " " and (old_x1 - ox) / cwidth > 0.8:
                continue

            old_char = char
            # close enough to previous?
            if ox < old_x1 + minslot:  # assume char adjacent to previous
                text += char  # append to output
                old_x1 = x1  # new end coord
                old_ox = ox  # new origin.x
                continue

            # else next char starts after some gap:
            # fill in right number of spaces, so char is positioned
            # in the right slot of the line
            if char == " ":  # rest relevant for non-space only
                continue
            delta = int(ox / slot) - len(text)
            if ox > old_x1 and delta > 1:
                text += " " * delta
            # now append char
            text += char
            old_x1 = x1  # new end coordinate
            old_ox = ox  # new origin
        return text.rstrip()

    # extract page text by single characters ("rawdict")
    blocks = page.get_text("rawdict", flags=flags)["blocks"]
    chars, rows, left, right, rowheight = process_blocks(blocks, page)

    if chars == []:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    # compute list of line coordinates - ignoring small (GRID) differences
    rows = curate_rows(rows, GRID)

    # sort all chars by x-coordinates, so every line will receive char info,
    # sorted from left to right.
    chars.sort(key=lambda c: c[1])

    # populate the lines with their char info
    lines = {}  # key: y-coordinate of the line, value: char list
    for c in chars:
        _, _, oy, _ = c
        y = find_line_index(rows, oy)  # y-coord of the right line
        lchars = lines.get(y, [])  # read line chars so far
        lchars.append(c)  # append this char
        lines[y] = lchars  # write back to line

    # ensure line coordinates are ascending
    keys = list(lines.keys())
    keys.sort()

    # -------------------------------------------------------------------------
    # Compute "char resolution" for the page: the char width corresponding to
    # 1 text char position on output - call it 'slot'.
    # For each line, compute median of its char widths. The minimum across all
    # lines is 'slot'.
    # The minimum char width of each line is used to determine if spaces must
    # be inserted in between two characters.
    # Lines with fewer than 2 chars do not influence 'slot' and get a minimum
    # char width of 1.
    # -------------------------------------------------------------------------
    slot = right - left
    minslots = {}
    for k in keys:
        lchars = lines[k]
        ccount = len(lchars)
        if ccount < 2:
            minslots[k] = 1
            continue
        widths = [c[3] for c in lchars]
        widths.sort()
        this_slot = statistics.median(widths)  # take median value
        if this_slot < slot:
            slot = this_slot
        minslots[k] = widths[0]

    # compute line advance in text output
    # NOTE(review): 'rowheight' occurs in numerator and denominator, so the
    # result equals (rows[-1] - rows[0]) / len(rows) * 1.2; a rowheight of 0
    # would raise ZeroDivisionError here.
    rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2
    rowpos = rows[0]  # first line positioned here
    textout.write(b"\n")
    for k in keys:  # walk through the lines
        while rowpos < k:  # honor distance between lines
            textout.write(b"\n")
            rowpos += rowheight
        text = make_textline(left, slot, minslots[k], lines[k])
        textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
        rowpos = k + rowheight

    textout.write(eop)  # write formfeed
def gettext(args):
    """Extract the text of selected pages into a file.

    The output file defaults to the document's name with its extension
    replaced by ".txt". Each page is written by the function belonging to
    ``args.mode`` ("simple", "blocks" or "layout").
    """
    doc = open_file(args.input, args.password, pdf=False)
    pagel = get_list(args.pages, doc.page_count + 1)

    output = args.output
    if output is None:
        output = os.path.splitext(doc.name)[0] + ".txt"

    with open(output, "wb") as textout:
        # start with ligatures and whitespace preserved, then toggle the
        # flag bits according to the options given
        flags = pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_PRESERVE_WHITESPACE
        toggles = (
            (args.convert_white, pymupdf.TEXT_PRESERVE_WHITESPACE),
            (args.noligatures, pymupdf.TEXT_PRESERVE_LIGATURES),
            (args.extra_spaces, pymupdf.TEXT_INHIBIT_SPACES),
        )
        for requested, bit in toggles:
            if requested:
                flags ^= bit

        writers = {
            "simple": page_simple,
            "blocks": page_blocksort,
            "layout": page_layout,
        }
        for pno in pagel:
            write_page = writers[args.mode]
            write_page(
                doc[pno - 1],  # page numbers are 1-based here
                textout,
                args.grid,
                args.fontsize,
                args.noformfeed,
                args.skip_empty,
                flags=flags,
            )
def _internal(args):
    """Emit one fixed text via pymupdf.message() and one via pymupdf.log().

    *args* is accepted for the common command signature but not used.
    """
    message_text = 'This is from PyMuPDF message().'
    log_text = 'This is from PyMuPDF log().'
    pymupdf.message(message_text)
    pymupdf.log(log_text)
def main():
    """Define command configurations, parse the command line and run.

    Every subcommand gets its own sub-parser whose ``func`` default names
    the function implementing it. If no subcommand was selected, the top
    level help is printed instead.
    """
    parser = argparse.ArgumentParser(
        prog="pymupdf",
        description=mycenter("Basic PyMuPDF Functions"),
    )
    subps = parser.add_subparsers(
        title="Subcommands", help="Enter 'command -h' for subcommand specific help"
    )

    # -------------------------------------------------------------------------
    # 'show' command
    # -------------------------------------------------------------------------
    ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
    ps_show.add_argument("input", type=str, help="PDF filename")
    ps_show.add_argument("-password", help="password")
    ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
    ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
    ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
    ps_show.add_argument(
        "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
    )
    ps_show.add_argument(
        "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
    )
    ps_show.set_defaults(func=show)

    # -------------------------------------------------------------------------
    # 'clean' command
    # -------------------------------------------------------------------------
    ps_clean = subps.add_parser(
        "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
    )
    ps_clean.add_argument("input", type=str, help="PDF filename")
    ps_clean.add_argument("output", type=str, help="output PDF filename")
    ps_clean.add_argument("-password", help="password")

    ps_clean.add_argument(
        "-encryption",
        help="encryption method",
        choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
        default="none",
    )

    ps_clean.add_argument("-owner", type=str, help="owner password")
    ps_clean.add_argument("-user", type=str, help="user password")

    ps_clean.add_argument(
        "-garbage",
        type=int,
        help="garbage collection level",
        choices=range(5),
        default=0,
    )

    ps_clean.add_argument(
        "-compress",
        action="store_true",
        default=False,
        help="compress (deflate) output",
    )

    ps_clean.add_argument(
        "-ascii", action="store_true", default=False, help="ASCII encode binary data"
    )

    ps_clean.add_argument(
        "-linear",
        action="store_true",
        default=False,
        help="format for fast web display",
    )

    ps_clean.add_argument(
        "-permission", type=int, default=-1, help="integer with permission levels"
    )

    ps_clean.add_argument(
        "-sanitize",
        action="store_true",
        default=False,
        help="sanitize / clean contents",
    )
    ps_clean.add_argument(
        "-pretty", action="store_true", default=False, help="prettify PDF structure"
    )
    ps_clean.add_argument(
        "-pages", help="output selected pages, format: 1,5-7,50-N"
    )
    ps_clean.set_defaults(func=clean)

    # -------------------------------------------------------------------------
    # 'join' command
    # -------------------------------------------------------------------------
    ps_join = subps.add_parser(
        "join",
        description=mycenter("join PDF documents"),
        epilog="specify each input as 'filename[,password[,pages]]'",
    )
    ps_join.add_argument("input", nargs="*", help="input filenames")
    ps_join.add_argument("-output", required=True, help="output filename")
    ps_join.set_defaults(func=doc_join)

    # -------------------------------------------------------------------------
    # 'extract' command
    # -------------------------------------------------------------------------
    ps_extract = subps.add_parser(
        "extract", description=mycenter("extract images and fonts to disk")
    )
    ps_extract.add_argument("input", type=str, help="PDF filename")
    ps_extract.add_argument("-images", action="store_true", help="extract images")
    ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
    ps_extract.add_argument(
        "-output", help="folder to receive output, defaults to current"
    )
    ps_extract.add_argument("-password", help="password")
    ps_extract.add_argument(
        "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
    )
    ps_extract.set_defaults(func=extract_objects)

    # -------------------------------------------------------------------------
    # 'embed-info' command
    # -------------------------------------------------------------------------
    ps_embed_info = subps.add_parser(
        "embed-info", description=mycenter("list embedded files")
    )
    ps_embed_info.add_argument("input", help="PDF filename")
    ps_embed_info.add_argument("-name", help="if given, report only this one")
    ps_embed_info.add_argument(
        "-detail", action="store_true", help="detail information"
    )
    ps_embed_info.add_argument("-password", help="password")
    ps_embed_info.set_defaults(func=embedded_list)

    # -------------------------------------------------------------------------
    # 'embed-add' command
    # -------------------------------------------------------------------------
    ps_embed_add = subps.add_parser(
        "embed-add", description=mycenter("add embedded file")
    )
    ps_embed_add.add_argument("input", help="PDF filename")
    ps_embed_add.add_argument("-password", help="password")
    ps_embed_add.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_add.add_argument("-name", required=True, help="name of new entry")
    ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
    ps_embed_add.add_argument("-desc", help="description of new entry")
    ps_embed_add.set_defaults(func=embedded_add)

    # -------------------------------------------------------------------------
    # 'embed-del' command
    # -------------------------------------------------------------------------
    ps_embed_del = subps.add_parser(
        "embed-del", description=mycenter("delete embedded file")
    )
    ps_embed_del.add_argument("input", help="PDF filename")
    ps_embed_del.add_argument("-password", help="password")
    ps_embed_del.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
    ps_embed_del.set_defaults(func=embedded_del)

    # -------------------------------------------------------------------------
    # 'embed-upd' command
    # -------------------------------------------------------------------------
    ps_embed_upd = subps.add_parser(
        "embed-upd",
        description=mycenter("update embedded file"),
        epilog="except '-name' all parameters are optional",
    )
    ps_embed_upd.add_argument("input", help="PDF filename")
    ps_embed_upd.add_argument("-name", required=True, help="name of entry")
    ps_embed_upd.add_argument("-password", help="password")
    ps_embed_upd.add_argument(
        "-output", help="Output PDF filename, incremental save if none"
    )
    ps_embed_upd.add_argument("-path", help="path to new data for entry")
    ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
    ps_embed_upd.add_argument(
        "-ufilename", help="new unicode filename to store in entry"
    )
    ps_embed_upd.add_argument("-desc", help="new description to store in entry")
    ps_embed_upd.set_defaults(func=embedded_upd)

    # -------------------------------------------------------------------------
    # 'embed-extract' command
    # -------------------------------------------------------------------------
    ps_embed_extract = subps.add_parser(
        "embed-extract", description=mycenter("extract embedded file to disk")
    )
    ps_embed_extract.add_argument("input", type=str, help="PDF filename")
    ps_embed_extract.add_argument("-name", required=True, help="name of entry")
    ps_embed_extract.add_argument("-password", help="password")
    ps_embed_extract.add_argument(
        "-output", help="output filename, default is stored name"
    )
    ps_embed_extract.set_defaults(func=embedded_get)

    # -------------------------------------------------------------------------
    # 'embed-copy' command
    # -------------------------------------------------------------------------
    ps_embed_copy = subps.add_parser(
        "embed-copy", description=mycenter("copy embedded files between PDFs")
    )
    ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
    ps_embed_copy.add_argument("-password", help="password of input")
    ps_embed_copy.add_argument(
        "-output", help="output PDF, incremental save to 'input' if omitted"
    )
    ps_embed_copy.add_argument(
        "-source", required=True, help="copy embedded files from here"
    )
    ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
    ps_embed_copy.add_argument(
        "-name", nargs="*", help="restrict copy to these entries"
    )
    ps_embed_copy.set_defaults(func=embedded_copy)

    # -------------------------------------------------------------------------
    # 'gettext' command
    # -------------------------------------------------------------------------
    ps_gettext = subps.add_parser(
        "gettext", description=mycenter("extract text in various formatting modes")
    )
    ps_gettext.add_argument("input", type=str, help="input document filename")
    ps_gettext.add_argument("-password", help="password for input document")
    ps_gettext.add_argument(
        "-mode",
        type=str,
        help="mode: simple, block sort, or layout (default)",
        choices=("simple", "blocks", "layout"),
        default="layout",
    )
    ps_gettext.add_argument(
        "-pages",
        type=str,
        help="select pages, format: 1,5-7,50-N",
        default="1-N",
    )
    ps_gettext.add_argument(
        "-noligatures",
        action="store_true",
        help="expand ligature characters (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-convert-white",
        action="store_true",
        help="convert whitespace characters to white (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-extra-spaces",
        action="store_true",
        help="fill gaps with spaces (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-noformfeed",
        action="store_true",
        help="write linefeeds, no formfeeds (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-skip-empty",
        action="store_true",
        help="suppress pages with no text (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-output",
        help="store text in this file (default inputfilename.txt)",
    )
    ps_gettext.add_argument(
        "-grid",
        type=float,
        help="merge lines if closer than this (default 2)",
        default=2,
    )
    ps_gettext.add_argument(
        "-fontsize",
        type=float,
        help="only include text with a larger fontsize (default 3)",
        default=3,
    )
    ps_gettext.set_defaults(func=gettext)

    # -------------------------------------------------------------------------
    # 'internal' command (implemented by _internal())
    # -------------------------------------------------------------------------
    ps_internal = subps.add_parser(
        "internal", description=mycenter("internal testing")
    )
    ps_internal.set_defaults(func=_internal)

    # -------------------------------------------------------------------------
    # start program
    # -------------------------------------------------------------------------
    args = parser.parse_args()  # create parameter arguments class
    if not hasattr(args, "func"):  # no function selected
        parser.print_help()  # so print top level help
    else:
        args.func(args)  # execute requested command
# Run the command line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()