# -----------------------------------------------------------------------------
# Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
# Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a
# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
# maintained and developed by Artifex Software, Inc. https://artifex.com.
# -----------------------------------------------------------------------------
import argparse
import bisect
import os
import sys
import statistics
from typing import Dict, List, Set

from . import pymupdf


def mycenter(x):
    """Return ``x`` surrounded by blanks and centered in 75 '-' characters."""
    return (" %s " % x).center(75, "-")


def recoverpix(doc, item):
    """Return image for a given XREF.

    Args:
        doc: the document containing the image.
        item: sequence whose first entry is the image xref and whose second
            entry is the xref of its /SMask (0 if there is none).
    Returns:
        The result of ``doc.extract_image()`` if there is no /SMask,
        otherwise a Pixmap (converted to RGB if it has 4 colorspace
        components).
    """
    x = item[0]  # xref of PDF image
    s = item[1]  # xref of its /SMask
    if s == 0:  # no smask: use direct image output
        return doc.extract_image(x)

    def getimage(pix):
        """Return pix itself, or an RGB version if it has 4 components."""
        if pix.colorspace.n != 4:
            return pix
        return pymupdf.Pixmap(pymupdf.csRGB, pix)

    # we need to reconstruct the alpha channel with the smask
    pix1 = pymupdf.Pixmap(doc, x)
    pix2 = pymupdf.Pixmap(doc, s)  # create pixmap of the /SMask entry

    # Sanity check:
    # - both pixmaps must have the same rectangle
    # - both pixmaps must have alpha=0
    # - pix2 must consist of 1 byte per pixel
    if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
        pymupdf.message("Warning: unsupported /SMask %i for %i:" % (s, x))
        pymupdf.message(pix2)
        pix2 = None
        return getimage(pix1)  # return the pixmap as is

    pix = pymupdf.Pixmap(pix1)  # copy of pix1, with an alpha channel added
    pix.set_alpha(pix2.samples)  # treat pix2.samples as the alpha values
    pix1 = pix2 = None  # free temp pixmaps

    # we may need to adjust something for CMYK pixmaps here:
    return getimage(pix)


def open_file(filename, password, show=False, pdf=True):
    """Open and authenticate a document.

    Args:
        filename: path of the document.
        password: password to use if the document needs one (may be None).
        show: if True, print which kind of authentication succeeded.
        pdf: if True, exit when the document is not a PDF.
    Returns:
        The opened document. The process exits (``sys.exit``) if the
        document is of the wrong type, or needs a password and none / a
        wrong one was given.
    """
    doc = pymupdf.open(filename)
    if not doc.is_pdf and pdf is True:
        sys.exit("this command supports PDF files only")
    if not doc.needs_pass:
        return doc
    if not password:
        sys.exit("'%s' requires a password" % doc.name)
    rc = doc.authenticate(password)
    if not rc:
        sys.exit("authentication unsuccessful")
    if show is True:
        # The parentheses matter: '%' binds tighter than the conditional
        # expression, so without them a user login printed just "user".
        pymupdf.message("authenticated as %s" % ("owner" if rc > 2 else "user"))
    return doc


def print_dict(item):
    """Print a Python dictionary, keys right-aligned in one column."""
    # default=0 keeps an empty dictionary from raising ValueError
    width = max((len(k) for k in item), default=0) + 1
    for k, v in item.items():
        pymupdf.message("%s: %s" % (k.rjust(width), v))


def print_xref(doc, xref):
    """Print an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size.
    """
    pymupdf.message("%i 0 obj" % xref)
    xref_str = doc.xref_object(xref)
    pymupdf.message(xref_str)
    if doc.xref_is_stream(xref):
        temp = xref_str.split()
        try:
            idx = temp.index("/Length") + 1
            size = temp[idx]
            # An indirect length ("/Length 12 0 R") arrives here as three
            # separate tokens, so it must be detected via the tokens that
            # follow - the first token alone is just the referenced xref.
            follow = temp[idx + 1 : idx + 3]
            if len(follow) == 2 and follow[0].isdigit() and follow[1] == "R":
                size = "unknown"
        except (ValueError, IndexError):  # no /Length, or nothing after it
            size = "unknown"
        pymupdf.message("stream\n...%s bytes" % size)
        pymupdf.message("endstream")
    pymupdf.message("endobj")


def get_list(rlist, limit, what="page"):
    """Transform a page / xref specification into a list of integers.

    Args
    ----
        rlist: (str) the specification
        limit: maximum number, i.e. number of pages, number of objects
        what: a string to be used in error messages
    Returns
    -------
        A list of integers representing the specification.
    """
    N = str(limit - 1)
    rlist = rlist.replace("N", N).replace(" ", "")
    out_list = []
    for n, item in enumerate(rlist.split(","), start=1):
        if item.isdecimal():  # a single integer
            i = int(item)
            if not 1 <= i < limit:
                sys.exit("bad %s specification at item %i" % (what, n))
            out_list.append(i)
            continue
        try:  # this must be a range now, and all of the following must work:
            i1, i2 = item.split("-")  # will fail if not 2 items produced
            i1 = int(i1)  # will fail on non-integers
            i2 = int(i2)
        except Exception:
            sys.exit("bad %s range specification at item %i" % (what, n))

        if not (1 <= i1 < limit and 1 <= i2 < limit):
            sys.exit("bad %s range specification at item %i" % (what, n))

        if i1 == i2:  # just in case: a range of equal numbers
            out_list.append(i1)
        elif i1 < i2:  # first less than second
            out_list.extend(range(i1, i2 + 1))
        else:  # first larger than second: count downwards
            out_list.extend(range(i1, i2 - 1, -1))

    return out_list


def show(args):
    """Print general information about a PDF and optional detail sections."""
    doc = open_file(args.input, args.password, True)
    size = os.path.getsize(args.input) / 1024
    flag = "KB"
    if size > 1000:
        size /= 1024
        flag = "MB"
    size = round(size, 1)
    meta = doc.metadata  # pylint: disable=no-member
    pymupdf.message(
        "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s"
        % (
            args.input,
            doc.page_count,
            doc.xref_length() - 1,
            size,
            flag,
            meta["format"],
            meta["encryption"],
        )
    )
    n = doc.is_form_pdf
    if n > 0:
        s = doc.get_sigflags()
        pymupdf.message(
            "document contains %i root form fields and is %ssigned"
            % (n, "not " if s != 3 else "")
        )
    n = doc.embfile_count()
    if n > 0:
        pymupdf.message("document contains %i embedded files" % n)
    pymupdf.message()
    if args.catalog:
        pymupdf.message(mycenter("PDF catalog"))
        xref = doc.pdf_catalog()
        print_xref(doc, xref)
        pymupdf.message()
    if args.metadata:
        pymupdf.message(mycenter("PDF metadata"))
        print_dict(doc.metadata)  # pylint: disable=no-member
        pymupdf.message()
    if args.xrefs:
        pymupdf.message(mycenter("object information"))
        xrefl = get_list(args.xrefs, doc.xref_length(), what="xref")
        for xref in xrefl:
            print_xref(doc, xref)
            pymupdf.message()
    if args.pages:
        pymupdf.message(mycenter("page information"))
        pagel = get_list(args.pages, doc.page_count + 1)
        for pno in pagel:
            n = pno - 1
            xref = doc.page_xref(n)
            pymupdf.message("Page %i:" % pno)
            print_xref(doc, xref)
            pymupdf.message()
    if args.trailer:
        pymupdf.message(mycenter("PDF trailer"))
        pymupdf.message(doc.pdf_trailer())
        pymupdf.message()
    doc.close()


def clean(args):
    """Save a PDF with the requested options, optionally only some pages."""
    doc = open_file(args.input, args.password, pdf=True)
    encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
        args.encryption
    )
    # identical save options are used for both output variants
    save_opts = dict(
        garbage=args.garbage,
        deflate=args.compress,
        pretty=args.pretty,
        clean=args.sanitize,
        ascii=args.ascii,
        linear=args.linear,
        encryption=encrypt,
        owner_pw=args.owner,
        user_pw=args.user,
        permissions=args.permission,
    )

    if not args.pages:  # simple cleaning
        doc.save(args.output, **save_opts)
        doc.close()
        return

    # create sub document from page numbers
    pages = get_list(args.pages, doc.page_count + 1)
    outdoc = pymupdf.open()
    for pno in pages:
        n = pno - 1
        outdoc.insert_pdf(doc, from_page=n, to_page=n)
    outdoc.save(args.output, **save_opts)
    doc.close()
    outdoc.close()


def doc_join(args):
    """Join pages from several PDF documents.

    Each input item has the form 'filename[,password[,pages]]'.
    """
    doc_list = args.input  # a list of input PDFs
    doc = pymupdf.open()  # output PDF
    for src_item in doc_list:  # process one input PDF
        src_list = src_item.split(",")
        password = src_list[1] if len(src_list) > 1 else None
        src = open_file(src_list[0], password, pdf=True)
        pages = ",".join(src_list[2:])  # get 'pages' specifications
        if pages:  # if anything there, retrieve a list of desired pages
            page_list = get_list(pages, src.page_count + 1)
        else:  # take all pages
            page_list = range(1, src.page_count + 1)
        for i in page_list:
            doc.insert_pdf(src, from_page=i - 1, to_page=i - 1)  # copy each source page
        src.close()

    doc.save(args.output, garbage=4, deflate=True)
    doc.close()


def embedded_copy(args):
    """Copy embedded files between PDFs."""
    doc = open_file(args.input, args.password, pdf=True)
    if not doc.can_save_incrementally() and (
        not args.output or args.output == args.input
    ):
        sys.exit("cannot save PDF incrementally")
    src = open_file(args.source, args.pwdsource)
    names = set(args.name) if args.name else set()
    src_names = set(src.embfile_names())
    if names:
        if not names <= src_names:
            sys.exit("not all names are contained in source")
    else:
        names = src_names
    if not names:
        sys.exit("nothing to copy")
    intersect = names & set(doc.embfile_names())  # any equal name already in target?
    if intersect:
        sys.exit("following names already exist in receiving PDF: %s" % str(intersect))

    for item in names:
        info = src.embfile_info(item)
        buff = src.embfile_get(item)
        doc.embfile_add(
            item,
            buff,
            filename=info["filename"],
            ufilename=info["ufilename"],
            desc=info["desc"],
        )
        pymupdf.message("copied entry '%s' from '%s'" % (item, src.name))
    src.close()
    if args.output and args.output != args.input:
        doc.save(args.output, garbage=3)
    else:
        doc.saveIncr()
    doc.close()


def embedded_del(args):
    """Delete an embedded file entry."""
    doc = open_file(args.input, args.password, pdf=True)
    if not doc.can_save_incrementally() and (
        not args.output or args.output == args.input
    ):
        sys.exit("cannot save PDF incrementally")

    exception_types = (ValueError, pymupdf.mupdf.FzErrorBase)
    if pymupdf.mupdf_version_tuple < (1, 24):
        exception_types = ValueError
    try:
        doc.embfile_del(args.name)
    except exception_types as e:  # pylint: disable=catching-non-exception
        sys.exit(f'no such embedded file {args.name!r}: {e}')
    if not args.output or args.output == args.input:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=1)
    doc.close()


def embedded_get(args):
    """Retrieve contents of an embedded file."""
    doc = open_file(args.input, args.password, pdf=True)
    exception_types = (ValueError, pymupdf.mupdf.FzErrorBase)
    if pymupdf.mupdf_version_tuple < (1, 24):
        exception_types = ValueError
    try:
        stream = doc.embfile_get(args.name)
        d = doc.embfile_info(args.name)
    except exception_types as e:  # pylint: disable=catching-non-exception
        sys.exit(f'no such embedded file {args.name!r}: {e}')
    filename = args.output if args.output else d["filename"]
    with open(filename, "wb") as output:
        output.write(stream)
    pymupdf.message("saved entry '%s' as '%s'" % (args.name, filename))
    doc.close()


def embedded_add(args):
    """Insert a new embedded file."""
    doc = open_file(args.input, args.password, pdf=True)
    if not doc.can_save_incrementally() and (
        args.output is None or args.output == args.input
    ):
        sys.exit("cannot save PDF incrementally")

    # Check for an existing entry without modifying the document (the name
    # list is the same lookup that embedded_list() uses).
    if args.name in doc.embfile_names():
        sys.exit("entry '%s' already exists" % args.name)

    if not os.path.exists(args.path) or not os.path.isfile(args.path):
        sys.exit("no such file '%s'" % args.path)
    with open(args.path, "rb") as f:
        stream = f.read()
    filename = args.path
    ufilename = filename
    desc = args.desc if args.desc else filename
    doc.embfile_add(
        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
    )
    if not args.output or args.output == args.input:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()


def embedded_upd(args):
    """Update contents or metadata of an embedded file."""
    doc = open_file(args.input, args.password, pdf=True)
    if not doc.can_save_incrementally() and (
        args.output is None or args.output == args.input
    ):
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_info(args.name)
    except Exception:
        sys.exit("no such embedded file '%s'" % args.name)

    # NOTE(review): a '-path' that does not point to an existing file is
    # silently ignored here (content stays unchanged) - confirm this is
    # intended; embedded_add() exits in the same situation.
    if (
        args.path is not None
        and os.path.exists(args.path)
        and os.path.isfile(args.path)
    ):
        with open(args.path, "rb") as f:
            stream = f.read()
    else:
        stream = None

    # empty / missing values are passed on as None
    filename = args.filename if args.filename else None
    if args.ufilename:
        ufilename = args.ufilename
    elif args.filename:
        ufilename = args.filename
    else:
        ufilename = None
    desc = args.desc if args.desc else None

    doc.embfile_upd(
        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
    )
    if args.output is None or args.output == args.input:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()


def embedded_list(args):
    """List embedded files."""
    doc = open_file(args.input, args.password, pdf=True)
    names = doc.embfile_names()
    if args.name is not None:
        if args.name not in names:
            sys.exit("no such embedded file '%s'" % args.name)
        pymupdf.message()
        pymupdf.message(
            "printing 1 of %i embedded file%s:"
            % (len(names), "s" if len(names) > 1 else "")
        )
        pymupdf.message()
        print_dict(doc.embfile_info(args.name))
        pymupdf.message()
        doc.close()
        return
    if not names:
        pymupdf.message("'%s' contains no embedded files" % doc.name)
        doc.close()
        return
    if len(names) > 1:
        msg = "'%s' contains the following %i embedded files" % (doc.name, len(names))
    else:
        msg = "'%s' contains the following embedded file" % doc.name
    pymupdf.message(msg)
    pymupdf.message()
    for name in names:
        if not args.detail:
            pymupdf.message(name)
            continue
        print_dict(doc.embfile_info(name))
        pymupdf.message()
    doc.close()


def extract_objects(args):
    """Extract images and / or fonts from a PDF."""
    if not args.fonts and not args.images:
        sys.exit("neither fonts nor images requested")
    doc = open_file(args.input, args.password, pdf=True)

    if args.pages:
        pages = get_list(args.pages, doc.page_count + 1)
    else:
        pages = range(1, doc.page_count + 1)

    if not args.output:
        out_dir = os.path.abspath(os.curdir)
    else:
        out_dir = args.output
        if not (os.path.exists(out_dir) and os.path.isdir(out_dir)):
            sys.exit("output directory %s does not exist" % out_dir)

    font_xrefs = set()  # already visited fonts
    image_xrefs = set()  # already saved images
    saved_fonts = 0  # fonts actually written (some cannot be extracted)

    for pno in pages:
        if args.fonts:
            for item in doc.get_page_fonts(pno - 1):
                xref = item[0]
                if xref in font_xrefs:
                    continue
                font_xrefs.add(xref)
                fontname, ext, _, buffer = doc.extract_font(xref)
                if ext == "n/a" or not buffer:  # nothing to write for this font
                    continue
                outname = os.path.join(
                    out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
                )
                with open(outname, "wb") as outfile:
                    outfile.write(buffer)
                saved_fonts += 1

        if args.images:
            for item in doc.get_page_images(pno - 1):
                xref = item[0]
                if xref in image_xrefs:
                    continue
                image_xrefs.add(xref)
                pix = recoverpix(doc, item)
                if isinstance(pix, dict):  # raw image data as stored in the PDF
                    ext = pix["ext"]
                    imgdata = pix["image"]
                    outname = os.path.join(out_dir, "img-%i.%s" % (xref, ext))
                    with open(outname, "wb") as outfile:
                        outfile.write(imgdata)
                else:  # a pixmap: store it as PNG
                    outname = os.path.join(out_dir, "img-%i.png" % xref)
                    pix2 = (
                        pix
                        if pix.colorspace.n < 4
                        else pymupdf.Pixmap(pymupdf.csRGB, pix)
                    )
                    pix2.save(outname)

    if args.fonts:
        pymupdf.message("saved %i fonts to '%s'" % (saved_fonts, out_dir))
    if args.images:
        pymupdf.message("saved %i images to '%s'" % (len(image_xrefs), out_dir))
    doc.close()


def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the plain text of a page, followed by an end-of-page marker.

    GRID and fontsize are accepted for a uniform call signature but not used.
    """
    eop = b"\n" if noformfeed else bytes([12])
    text = page.get_text("text", flags=flags)
    if not text:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    textout.write(text.encode("utf8", errors="surrogatepass"))
    textout.write(eop)


def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text blocks of a page, sorted by (block[3], block[0]).

    GRID and fontsize are accepted for a uniform call signature but not used.
    """
    eop = b"\n" if noformfeed else bytes([12])
    blocks = page.get_text("blocks", flags=flags)
    if not blocks:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    blocks.sort(key=lambda b: (b[3], b[0]))
    for b in blocks:
        textout.write(b[4].encode("utf8", errors="surrogatepass"))
    textout.write(eop)


def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the page text, positioning characters to resemble the layout.

    Args:
        page: the page to extract from.
        textout: binary file object receiving the UTF-8 encoded text.
        GRID: rows whose y-coordinates differ by less than this are merged.
        fontsize: only text with a larger font size is included.
        noformfeed: end the page with a line feed instead of a form feed.
        skip_empty: write nothing at all for pages without text.
        flags: text extraction flags passed to ``page.get_text``.
    """
    eop = b"\n" if noformfeed else bytes([12])

    # --------------------------------------------------------------------
    def find_line_index(values: List[int], value: int) -> int:
        """Find the right row coordinate.

        Args:
            values: (list) y-coordinates of rows.
            value: (int) lookup for this value (y-origin of char).
        Returns:
            y-coordinate of appropriate line for value.
        """
        i = bisect.bisect_right(values, value)
        if i:
            return values[i - 1]
        raise RuntimeError("Line for %g not found in %s" % (value, values))

    # --------------------------------------------------------------------
    def curate_rows(rows: Set[int], GRID) -> List:
        """Return sorted rows, dropping those closer than GRID to the previous."""
        rows = sorted(rows)  # sort ascending
        nrows = [rows[0]]
        for h in rows[1:]:
            if h >= nrows[-1] + GRID:  # only keep significant differences
                nrows.append(h)
        return nrows  # curated list of line bottom coordinates

    def process_blocks(blocks: List[Dict], page: pymupdf.Page):
        """Collect the characters of all horizontal lines of the page.

        Returns:
            (chars, rows, left, right, rowheight): list of tuples
            (char, x-origin, y-origin, width), the set of rounded y-origins,
            smallest non-space x-origin, largest x1, and the smallest line
            bbox height encountered.
        """
        rows = set()
        page_width = page.rect.width
        page_height = page.rect.height
        rowheight = page_height
        left = page_width
        right = 0
        chars = []
        for block in blocks:
            # blocks without a "lines" entry contribute no characters
            for line in block.get("lines", ()):
                if line["dir"] != (1, 0):  # ignore non-horizontal text
                    continue
                x0, y0, x1, y1 = line["bbox"]
                if y1 < 0 or y0 > page_height:  # ignore if outside CropBox
                    continue
                # upd row height
                height = y1 - y0
                if rowheight > height:
                    rowheight = height
                for span in line["spans"]:
                    if span["size"] <= fontsize:
                        continue
                    for c in span["chars"]:
                        x0, _, x1, _ = c["bbox"]
                        cwidth = x1 - x0
                        ox, oy = c["origin"]
                        oy = int(round(oy))
                        rows.add(oy)
                        ch = c["c"]
                        if left > ox and ch != " ":
                            left = ox  # update left coordinate
                        if right < x1:
                            right = x1  # update right coordinate
                        # handle ligatures:
                        if cwidth == 0 and chars:  # potential ligature
                            old_ch, old_ox, old_oy, old_cwidth = chars[-1]
                            if old_oy == oy:  # ligature
                                if old_ch != chr(0xFB00):  # previous "ff" char lig?
                                    lig = joinligature(old_ch + ch)  # no
                                # convert to one of the 3-char ligatures:
                                elif ch == "i":
                                    lig = chr(0xFB03)  # "ffi"
                                elif ch == "l":
                                    lig = chr(0xFB04)  # "ffl"
                                else:  # something wrong, leave old char in place
                                    lig = old_ch
                                chars[-1] = (lig, old_ox, old_oy, old_cwidth)
                                continue
                        chars.append((ch, ox, oy, cwidth))  # all chars on page
        return chars, rows, left, right, rowheight

    def joinligature(lig: str) -> str:
        """Return ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00). Unknown input is returned
            unchanged.
        """
        ligatures = {
            "ff": chr(0xFB00),
            "fi": chr(0xFB01),
            "fl": chr(0xFB02),
            "ffi": chr(0xFB03),
            "ffl": chr(0xFB04),
            "ft": chr(0xFB05),
            "st": chr(0xFB06),
        }
        return ligatures.get(lig, lig)

    # --------------------------------------------------------------------
    def make_textline(left, slot, minslot, lchars):
        """Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) avg width of one character in any font in use.
            minslot: (float) min width for the characters in this line.
            lchars: (list[tuple]) characters of this line.
        Returns:
            text: (str) text string for this line
        Raises:
            RuntimeError: if minslot is not larger than pymupdf.EPSILON.
        """
        text = ""  # we output this
        old_char = ""
        old_x1 = 0  # end coordinate of last char
        old_ox = 0  # x-origin of last char
        if minslot <= pymupdf.EPSILON:
            raise RuntimeError("program error: minslot too small = %g" % minslot)

        for c in lchars:  # loop over characters
            char, ox, _, cwidth = c
            ox = ox - left  # its (relative) start coordinate
            x1 = ox + cwidth  # ending coordinate

            # eliminate overprint effect
            if old_char == char and ox - old_ox <= cwidth * 0.2:
                continue

            # omit spaces overlapping previous char
            # (a zero-width space cannot be tested this way: skip the check
            # instead of dividing by zero)
            if char == " " and cwidth != 0 and (old_x1 - ox) / cwidth > 0.8:
                continue

            old_char = char
            # close enough to previous?
            if ox < old_x1 + minslot:  # assume char adjacent to previous
                text += char  # append to output
                old_x1 = x1  # new end coord
                old_ox = ox  # new origin.x
                continue

            # else next char starts after some gap:
            # fill in right number of spaces, so char is positioned
            # in the right slot of the line
            if char == " ":  # rest relevant for non-space only
                continue
            delta = int(ox / slot) - len(text)
            if ox > old_x1 and delta > 1:
                text += " " * delta
            # now append char
            text += char
            old_x1 = x1  # new end coordinate
            old_ox = ox  # new origin
        return text.rstrip()

    # extract page text by single characters ("rawdict")
    blocks = page.get_text("rawdict", flags=flags)["blocks"]
    # The measured minimum line height is not needed below: the original
    # line-advance formula multiplied and divided by it.
    chars, rows, left, right, _min_height = process_blocks(blocks, page)

    if not chars:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    # compute list of line coordinates - ignoring small (GRID) differences
    rows = curate_rows(rows, GRID)

    # sort all chars by x-coordinates, so every line will receive char info,
    # sorted from left to right.
    chars.sort(key=lambda c: c[1])

    # populate the lines with their char info
    lines = {}  # key: y-coordinate of the line, value: char list
    for c in chars:
        oy = c[2]
        y = find_line_index(rows, oy)  # y-coord of the right line
        lines.setdefault(y, []).append(c)

    # ensure line coordinates are ascending
    keys = sorted(lines)

    # -------------------------------------------------------------------------
    # Compute "char resolution" for the page: the char width corresponding to
    # 1 text char position on output - call it 'slot'.
    # For each line, compute median of its char widths. The minimum across all
    # lines is 'slot'.
    # The minimum char width of each line is used to determine if spaces must
    # be inserted in between two characters.
    # -------------------------------------------------------------------------
    slot = right - left
    minslots = {}
    for k in keys:
        lchars = lines[k]
        if len(lchars) < 2:
            minslots[k] = 1
            continue
        widths = sorted(c[3] for c in lchars)
        this_slot = statistics.median(widths)  # take median value
        if this_slot < slot:
            slot = this_slot
        minslots[k] = widths[0]

    # compute line advance in text output: average row distance plus 20%.
    # (Formerly written as rowheight * d / (rowheight * n) * 1.2, which is the
    # same value but failed with ZeroDivisionError for a zero-height line.)
    rowheight = (rows[-1] - rows[0]) / len(rows) * 1.2
    rowpos = rows[0]  # first line positioned here
    textout.write(b"\n")
    for k in keys:  # walk through the lines
        while rowpos < k:  # honor distance between lines
            textout.write(b"\n")
            rowpos += rowheight
        text = make_textline(left, slot, minslots[k], lines[k])
        textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
        rowpos = k + rowheight

    textout.write(eop)  # write formfeed


def gettext(args):
    """Extract the text of the selected pages to a file."""
    doc = open_file(args.input, args.password, pdf=False)
    pagel = get_list(args.pages, doc.page_count + 1)
    output = args.output
    if output is None:
        filename, _ = os.path.splitext(doc.name)
        output = filename + ".txt"

    # Both flags start out set, so clearing them below gives the same
    # value as toggling them would.
    flags = pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_PRESERVE_WHITESPACE
    if args.convert_white:
        flags &= ~pymupdf.TEXT_PRESERVE_WHITESPACE
    if args.noligatures:
        flags &= ~pymupdf.TEXT_PRESERVE_LIGATURES
    if args.extra_spaces:
        # NOTE(review): '-extra-spaces' switches TEXT_INHIBIT_SPACES *on*;
        # confirm this matches the option's help text ("fill gaps with spaces").
        flags |= pymupdf.TEXT_INHIBIT_SPACES

    func = {
        "simple": page_simple,
        "blocks": page_blocksort,
        "layout": page_layout,
    }
    page_writer = func[args.mode]
    with open(output, "wb") as textout:
        for pno in pagel:
            page = doc[pno - 1]
            page_writer(
                page,
                textout,
                args.grid,
                args.fontsize,
                args.noformfeed,
                args.skip_empty,
                flags=flags,
            )
    doc.close()


def _internal(args):
    """Emit one test line each via pymupdf.message() and pymupdf.log()."""
    pymupdf.message('This is from PyMuPDF message().')
    pymupdf.log('This is from PyMuPDF log().')


def main():
    """Define command configurations."""
    parser = argparse.ArgumentParser(
        prog="pymupdf",
        description=mycenter("Basic PyMuPDF Functions"),
    )
    subps = parser.add_subparsers(
        title="Subcommands", help="Enter 'command -h' for subcommand specific help"
    )

    # -------------------------------------------------------------------------
    # 'show' command
    # -------------------------------------------------------------------------
    ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
    ps_show.add_argument("input", type=str, help="PDF filename")
    ps_show.add_argument("-password", help="password")
    ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
    ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
    ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
    ps_show.add_argument(
        "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
    )
    ps_show.add_argument(
        "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
    )
    ps_show.set_defaults(func=show)

    # -------------------------------------------------------------------------
    # 'clean' command
    # -------------------------------------------------------------------------
    ps_clean = subps.add_parser(
        "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
    )
    ps_clean.add_argument("input", type=str, help="PDF filename")
    ps_clean.add_argument("output", type=str, help="output PDF filename")
    ps_clean.add_argument("-password", help="password")

    ps_clean.add_argument(
        "-encryption",
        help="encryption method",
        choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
        default="none",
    )

    ps_clean.add_argument("-owner", type=str, help="owner password")
    ps_clean.add_argument("-user", type=str, help="user password")

    ps_clean.add_argument(
        "-garbage",
        type=int,
        help="garbage collection level",
        choices=range(5),
        default=0,
    )

    ps_clean.add_argument(
        "-compress",
        action="store_true",
        default=False,
        help="compress (deflate) output",
    )

    ps_clean.add_argument(
        "-ascii", action="store_true", default=False, help="ASCII encode binary data"
    )

    ps_clean.add_argument(
        "-linear",
        action="store_true",
        default=False,
        help="format for fast web display",
    )

    ps_clean.add_argument(
        "-permission", type=int, default=-1, help="integer with permission levels"
    )

    ps_clean.add_argument(
        "-sanitize",
        action="store_true",
        default=False,
        help="sanitize / clean contents",
    )
    ps_clean.add_argument(
        "-pretty", action="store_true", default=False, help="prettify PDF structure"
    )
    ps_clean.add_argument(
        "-pages", help="output selected pages, format: 1,5-7,50-N"
    )
    ps_clean.set_defaults(func=clean)

    # -------------------------------------------------------------------------
    # 'join' command
    # -------------------------------------------------------------------------
    ps_join = subps.add_parser(
        "join",
        description=mycenter("join PDF documents"),
        epilog="specify each input as 'filename[,password[,pages]]'",
    )
    ps_join.add_argument("input", nargs="*", help="input filenames")
    ps_join.add_argument("-output", required=True, help="output filename")
    ps_join.set_defaults(func=doc_join)

    # -------------------------------------------------------------------------
    # 'extract' command
    # -------------------------------------------------------------------------
    ps_extract = subps.add_parser(
        "extract", description=mycenter("extract images and fonts to disk")
    )
    ps_extract.add_argument("input", type=str, help="PDF filename")
    ps_extract.add_argument("-images", action="store_true", help="extract images")
    ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
    ps_extract.add_argument(
        "-output", help="folder to receive output, defaults to current"
    )
    ps_extract.add_argument("-password", help="password")
    ps_extract.add_argument(
        "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
    )
    ps_extract.set_defaults(func=extract_objects)

    # -------------------------------------------------------------------------
    # 'embed-info'
    # -------------------------------------------------------------------------
    ps_embed_info = subps.add_parser(
        "embed-info", description=mycenter("list embedded files")
    )
    ps_embed_info.add_argument("input", help="PDF filename")
    ps_embed_info.add_argument("-name", help="if given, report only this one")
    ps_embed_info.add_argument(
        "-detail", action="store_true", help="detail information"
    )
    ps_embed_info.add_argument("-password", help="password")
    ps_embed_info.set_defaults(func=embedded_list)

    # -------------------------------------------------------------------------
    # 'embed-add' command
    # -------------------------------------------------------------------------
    ps_embed_add = subps.add_parser(
        "embed-add", description=mycenter("add embedded file")
    )
    ps_embed_add.add_argument("input", help="PDF filename")
    ps_embed_add.add_argument("-password", help="password")
    ps_embed_add.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_add.add_argument("-name", required=True, help="name of new entry")
    ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
    ps_embed_add.add_argument("-desc", help="description of new entry")
    ps_embed_add.set_defaults(func=embedded_add)

    # -------------------------------------------------------------------------
    # 'embed-del' command
    # -------------------------------------------------------------------------
    ps_embed_del = subps.add_parser(
        "embed-del", description=mycenter("delete embedded file")
    )
    ps_embed_del.add_argument("input", help="PDF filename")
    ps_embed_del.add_argument("-password", help="password")
    ps_embed_del.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
    ps_embed_del.set_defaults(func=embedded_del)

    # -------------------------------------------------------------------------
    # 'embed-upd' command
    # -------------------------------------------------------------------------
    ps_embed_upd = subps.add_parser(
        "embed-upd",
        description=mycenter("update embedded file"),
        epilog="except '-name' all parameters are optional",
    )
    ps_embed_upd.add_argument("input", help="PDF filename")
    ps_embed_upd.add_argument("-name", required=True, help="name of entry")
    ps_embed_upd.add_argument("-password", help="password")
    ps_embed_upd.add_argument(
        "-output", help="Output PDF filename, incremental save if none"
    )
    ps_embed_upd.add_argument("-path", help="path to new data for entry")
    ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
    ps_embed_upd.add_argument(
        "-ufilename", help="new unicode filename to store in entry"
    )
    ps_embed_upd.add_argument("-desc", help="new description to store in entry")
    ps_embed_upd.set_defaults(func=embedded_upd)

    # -------------------------------------------------------------------------
    # 'embed-extract' command
    # -------------------------------------------------------------------------
    ps_embed_extract = subps.add_parser(
        "embed-extract", description=mycenter("extract embedded file to disk")
    )
    ps_embed_extract.add_argument("input", type=str, help="PDF filename")
    ps_embed_extract.add_argument("-name", required=True, help="name of entry")
    ps_embed_extract.add_argument("-password", help="password")
    ps_embed_extract.add_argument(
        "-output", help="output filename, default is stored name"
    )
    ps_embed_extract.set_defaults(func=embedded_get)

    # -------------------------------------------------------------------------
    # 'embed-copy' command
    # -------------------------------------------------------------------------
    ps_embed_copy = subps.add_parser(
        "embed-copy", description=mycenter("copy embedded files between PDFs")
    )
    ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
    ps_embed_copy.add_argument("-password", help="password of input")
    ps_embed_copy.add_argument(
        "-output", help="output PDF, incremental save to 'input' if omitted"
    )
    ps_embed_copy.add_argument(
        "-source", required=True, help="copy embedded files from here"
    )
    ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
    ps_embed_copy.add_argument(
        "-name", nargs="*", help="restrict copy to these entries"
    )
    ps_embed_copy.set_defaults(func=embedded_copy)

    # -------------------------------------------------------------------------
    # 'gettext' command
    # -------------------------------------------------------------------------
    ps_gettext = subps.add_parser(
        "gettext", description=mycenter("extract text in various formatting modes")
    )
    ps_gettext.add_argument("input", type=str, help="input document filename")
    ps_gettext.add_argument("-password", help="password for input document")
    ps_gettext.add_argument(
        "-mode",
        type=str,
        help="mode: simple, block sort, or layout (default)",
        choices=("simple", "blocks", "layout"),
        default="layout",
    )
    ps_gettext.add_argument(
        "-pages",
        type=str,
        help="select pages, format: 1,5-7,50-N",
        default="1-N",
    )
    ps_gettext.add_argument(
        "-noligatures",
        action="store_true",
        help="expand ligature characters (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-convert-white",
        action="store_true",
        help="convert whitespace characters to white (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-extra-spaces",
        action="store_true",
        help="fill gaps with spaces (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-noformfeed",
        action="store_true",
        help="write linefeeds, no formfeeds (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-skip-empty",
        action="store_true",
        help="suppress pages with no text (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-output",
        help="store text in this file (default inputfilename.txt)",
    )
    ps_gettext.add_argument(
        "-grid",
        type=float,
        help="merge lines if closer than this (default 2)",
        default=2,
    )
    ps_gettext.add_argument(
        "-fontsize",
        type=float,
        help="only include text with a larger fontsize (default 3)",
        default=3,
    )
    ps_gettext.set_defaults(func=gettext)

    # -------------------------------------------------------------------------
    # 'internal' command
    # -------------------------------------------------------------------------
    ps_internal = subps.add_parser(
        "internal", description=mycenter("internal testing")
    )
    ps_internal.set_defaults(func=_internal)

    # -------------------------------------------------------------------------
    # start program
    # -------------------------------------------------------------------------
    args = parser.parse_args()  # create parameter arguments class
    if not hasattr(args, "func"):  # no function selected
        parser.print_help()  # so print top level help
    else:
        args.func(args)  # execute requested command


if __name__ == "__main__":
    main()