1147 lines
40 KiB
Python
1147 lines
40 KiB
Python
# -----------------------------------------------------------------------------
|
|
# Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
|
|
# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
|
|
# Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a
|
|
# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
|
|
# maintained and developed by Artifex Software, Inc. https://artifex.com.
|
|
# -----------------------------------------------------------------------------
|
|
import argparse
|
|
import bisect
|
|
import os
|
|
import sys
|
|
import statistics
|
|
from typing import Dict, List, Set
|
|
|
|
from . import pymupdf
|
|
|
|
def mycenter(x):
    """Return *x* as a 75-character banner line.

    The value is %-formatted into a string, wrapped in one blank on each
    side and centered between "-" fill characters.
    """
    label = " %s " % x
    return label.center(75, "-")
|
|
|
|
|
|
def recoverpix(doc, item):
    """Return the image for a given XREF.

    ``item[0]`` is the xref of the PDF image and ``item[1]`` the xref of its
    /SMask (0 meaning "no soft mask").  Without a soft mask the result of
    ``doc.extract_image()`` is returned as is.  Otherwise a pixmap is built
    from the image and the mask samples are installed as its alpha channel.
    """

    def to_output(pixmap):
        # pixmaps whose colorspace has 4 components are converted to RGB
        if pixmap.colorspace.n == 4:
            return pymupdf.Pixmap(pymupdf.csRGB, pixmap)
        return pixmap

    img_xref = item[0]
    smask_xref = item[1]
    if smask_xref == 0:  # no smask: use direct image output
        return doc.extract_image(img_xref)

    # the alpha channel has to be reconstructed from the smask
    base = pymupdf.Pixmap(doc, img_xref)
    mask = pymupdf.Pixmap(doc, smask_xref)

    # Sanity check: both pixmaps must cover the same rectangle, both must
    # have alpha == 0, and the mask must consist of 1 byte per pixel.
    mask_usable = (
        base.irect == mask.irect
        and base.alpha == mask.alpha == 0
        and mask.n == 1
    )
    if not mask_usable:
        pymupdf.message(
            "Warning: unsupported /SMask %i for %i:" % (smask_xref, img_xref)
        )
        pymupdf.message(mask)
        mask = None
        return to_output(base)  # hand back the image without the mask

    combined = pymupdf.Pixmap(base)  # copy of the image, alpha channel added
    combined.set_alpha(mask.samples)  # mask samples become the alpha values
    base = mask = None  # drop the temporary pixmaps

    return to_output(combined)
|
|
|
|
|
|
def open_file(filename, password, show=False, pdf=True):
    """Open and authenticate a document.

    Args:
        filename: name of the file to open.
        password: password to use if the document needs one (may be empty).
        show: if True, print which role the password authenticated as.
        pdf: if True, terminate when the document is not a PDF.
    Returns:
        The opened document.
    Terminates via ``sys.exit`` if a PDF is required but not given, if a
    needed password is missing, or if authentication fails.
    """
    doc = pymupdf.open(filename)
    if pdf is True and not doc.is_pdf:
        sys.exit("this command supports PDF files only")
    if not doc.needs_pass:
        return doc
    if not password:
        sys.exit("'%s' requires a password" % doc.name)

    rc = doc.authenticate(password)
    if not rc:
        sys.exit("authentication unsuccessful")
    if show is True:
        # Choose the role first: "%" binds tighter than "if ... else", so
        # formatting inline would print a bare "user" for non-owner logins.
        role = "owner" if rc > 2 else "user"
        pymupdf.message("authenticated as %s" % role)
    return doc
|
|
|
|
|
|
def print_dict(item):
    """Print a Python dictionary, one "key: value" line per entry.

    Keys are right-justified to a common width (longest key plus one) so
    that the colons line up.  An empty dictionary prints nothing.
    """
    if not item:  # max() of an empty sequence would raise ValueError
        return
    width = max(len(key) for key in item) + 1
    for key, value in item.items():
        pymupdf.message("%s: %s" % (key.rjust(width), value))
|
|
|
|
|
|
def print_xref(doc, xref):
    """Print an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size, taken from the /Length entry of the
    object definition; "unknown" is shown if that entry cannot be located
    or is an indirect reference.
    """
    pymupdf.message("%i 0 obj" % xref)
    xref_str = doc.xref_object(xref)
    pymupdf.message(xref_str)
    if doc.xref_is_stream(xref):
        tokens = xref_str.split()
        size = "unknown"
        try:
            idx = tokens.index("/Length") + 1
            # An indirect length reads "/Length <num> <gen> R".  Splitting on
            # whitespace separates these tokens, so look at the token two
            # positions further on instead of testing the number itself.
            if tokens[idx + 2 : idx + 3] != ["R"]:
                size = tokens[idx]
        except (ValueError, IndexError):  # no /Length, or nothing after it
            pass
        pymupdf.message("stream\n...%s bytes" % size)
        pymupdf.message("endstream")
    pymupdf.message("endobj")
|
|
|
|
|
|
def get_list(rlist, limit, what="page"):
    """Transform a page / xref specification into a list of integers.

    The specification is a comma-separated sequence of single numbers and
    "first-last" ranges.  "N" stands for ``limit - 1`` and blanks are
    ignored.  A range may be descending, which yields numbers in descending
    order.  Every number must satisfy ``1 <= number < limit``.

    Args
    ----
        rlist: (str) the specification
        limit: maximum number, i.e. number of pages, number of objects
        what: a string to be used in error messages
    Returns
    -------
        A list of integers representing the specification.  Any invalid
        item terminates the program via ``sys.exit``.
    """
    spec = rlist.replace("N", str(limit - 1)).replace(" ", "")
    numbers = []
    for pos, token in enumerate(spec.split(","), start=1):
        if token.isdecimal():  # a single integer
            value = int(token)
            if not 1 <= value < limit:
                sys.exit("bad %s specification at item %i" % (what, pos))
            numbers.append(value)
            continue

        # anything else must be a range "first-last"
        range_error = "bad %s range specification at item %i" % (what, pos)
        try:
            first, last = token.split("-")  # fails unless exactly 2 parts
            first = int(first)  # fails on non-integers
            last = int(last)
        except Exception:
            sys.exit(range_error)

        if not (1 <= first < limit and 1 <= last < limit):
            sys.exit(range_error)

        # walk upwards or downwards, both end points included
        step = 1 if first <= last else -1
        numbers.extend(range(first, last + step, step))

    return numbers
|
|
|
|
|
|
def show(args):
    """Print general information about a PDF plus any requested details.

    Always prints a summary line (name, page and object counts, file size,
    format, encryption) and, if present, the number of root form fields and
    of embedded files.  Depending on ``args.catalog``, ``args.metadata``,
    ``args.xrefs``, ``args.pages`` and ``args.trailer`` the corresponding
    sections follow, each introduced by a banner line.
    """
    doc = open_file(args.input, args.password, True)

    # file size in KB, switched to MB once it exceeds 1000 KB
    size, flag = os.path.getsize(args.input) / 1024, "KB"
    if size > 1000:
        size, flag = size / 1024, "MB"
    size = round(size, 1)

    meta = doc.metadata  # pylint: disable=no-member
    summary = (
        args.input,
        doc.page_count,
        doc.xref_length() - 1,
        size,
        flag,
        meta["format"],
        meta["encryption"],
    )
    pymupdf.message(
        "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s" % summary
    )

    field_count = doc.is_form_pdf
    if field_count > 0:
        negation = "" if doc.get_sigflags() == 3 else "not "
        pymupdf.message(
            "document contains %i root form fields and is %ssigned"
            % (field_count, negation)
        )

    embedded_count = doc.embfile_count()
    if embedded_count > 0:
        pymupdf.message("document contains %i embedded files" % embedded_count)
    pymupdf.message()

    if args.catalog:
        pymupdf.message(mycenter("PDF catalog"))
        print_xref(doc, doc.pdf_catalog())
        pymupdf.message()

    if args.metadata:
        pymupdf.message(mycenter("PDF metadata"))
        print_dict(doc.metadata)  # pylint: disable=no-member
        pymupdf.message()

    if args.xrefs:
        pymupdf.message(mycenter("object information"))
        for xref in get_list(args.xrefs, doc.xref_length(), what="xref"):
            print_xref(doc, xref)
            pymupdf.message()

    if args.pages:
        pymupdf.message(mycenter("page information"))
        for pno in get_list(args.pages, doc.page_count + 1):
            page_xref = doc.page_xref(pno - 1)  # page numbers are 1-based here
            pymupdf.message("Page %i:" % pno)
            print_xref(doc, page_xref)
            pymupdf.message()

    if args.trailer:
        pymupdf.message(mycenter("PDF trailer"))
        pymupdf.message(doc.pdf_trailer())
        pymupdf.message()

    doc.close()
|
|
|
|
|
|
def clean(args):
    """Optimize a PDF, or create a sub-PDF if pages are given.

    Without ``args.pages`` the input document is saved to ``args.output``
    with the requested save options.  With ``args.pages`` a new document is
    assembled from the selected pages (in the given order) and saved with
    the same options.  All opened documents are closed before returning.
    """
    doc = open_file(args.input, args.password, pdf=True)

    # the position of the method name in this tuple is the value passed on
    methods = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256")
    save_options = dict(
        garbage=args.garbage,
        deflate=args.compress,
        pretty=args.pretty,
        clean=args.sanitize,
        ascii=args.ascii,
        linear=args.linear,
        encryption=methods.index(args.encryption),
        owner_pw=args.owner,
        user_pw=args.user,
        permissions=args.permission,
    )

    if not args.pages:  # simple cleaning
        doc.save(args.output, **save_options)
        doc.close()
        return

    # create sub document from page numbers
    pages = get_list(args.pages, doc.page_count + 1)
    outdoc = pymupdf.open()
    for pno in pages:
        outdoc.insert_pdf(doc, from_page=pno - 1, to_page=pno - 1)
    outdoc.save(args.output, **save_options)
    doc.close()
    outdoc.close()
    return
|
|
|
|
|
|
def doc_join(args):
    """Join pages from several PDF documents.

    Each entry of ``args.input`` is a comma-separated string: file name,
    optionally followed by a password, optionally followed by page
    specifications.  Without page specifications all pages are taken.  The
    result is saved to ``args.output``.
    """
    sources = args.input  # a list of input PDFs
    target = pymupdf.open()  # output PDF
    for spec in sources:  # process one input PDF
        parts = spec.split(",")
        password = parts[1] if len(parts) > 1 else None
        src = open_file(parts[0], password, pdf=True)

        page_spec = ",".join(parts[2:])  # everything after name and password
        if page_spec:
            wanted = get_list(page_spec, src.page_count + 1)
        else:  # take all pages
            wanted = range(1, src.page_count + 1)

        for pno in wanted:  # copy each source page
            target.insert_pdf(src, from_page=pno - 1, to_page=pno - 1)
        src.close()

    target.save(args.output, garbage=4, deflate=True)
    target.close()
|
|
|
|
|
|
def embedded_copy(args):
    """Copy embedded files between PDFs.

    Entries are copied from ``args.source`` into ``args.input``.  If
    ``args.name`` is given, only those entries are copied and all of them
    must exist in the source; otherwise every source entry is copied.  The
    program terminates if there is nothing to copy or if any name already
    exists in the receiving PDF.  The result is saved to ``args.output``,
    or incrementally when no different output name is given.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    src = open_file(args.source, args.pwdsource)
    requested = set(args.name) if args.name else set()
    available = set(src.embfile_names())
    if not requested:
        requested = available  # no selection: take everything
    elif not requested <= available:
        sys.exit("not all names are contained in source")
    if not requested:
        sys.exit("nothing to copy")

    clashes = requested & set(doc.embfile_names())  # names already in target
    if clashes:
        sys.exit(
            "following names already exist in receiving PDF: %s" % str(clashes)
        )

    for entry in requested:
        info = src.embfile_info(entry)
        content = src.embfile_get(entry)
        doc.embfile_add(
            entry,
            content,
            filename=info["filename"],
            ufilename=info["ufilename"],
            desc=info["desc"],
        )
        pymupdf.message("copied entry '%s' from '%s'" % (entry, src.name))
    src.close()

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
|
|
|
|
|
|
def embedded_del(args):
    """Delete an embedded file entry.

    Removes the entry ``args.name`` from ``args.input`` and saves either
    incrementally (no or identical output name) or to ``args.output``.
    Terminates if the entry cannot be deleted or if an incremental save is
    required but not possible.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    # exception classes treated as "entry not found"; only ValueError is
    # used when the MuPDF version tuple is below (1, 24)
    not_found_errors = (ValueError, pymupdf.mupdf.FzErrorBase)
    if pymupdf.mupdf_version_tuple < (1, 24):
        not_found_errors = ValueError

    try:
        doc.embfile_del(args.name)
    except not_found_errors as e:  # pylint: disable=catching-non-exception
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=1)
    doc.close()
|
|
|
|
|
|
def embedded_get(args):
    """Retrieve contents of an embedded file.

    The content of entry ``args.name`` is written to ``args.output`` or,
    if that is not given, to the file name stored with the entry.
    Terminates if the entry cannot be read.
    """
    doc = open_file(args.input, args.password, pdf=True)

    # exception classes treated as "entry not found"; only ValueError is
    # used when the MuPDF version tuple is below (1, 24)
    not_found_errors = (ValueError, pymupdf.mupdf.FzErrorBase)
    if pymupdf.mupdf_version_tuple < (1, 24):
        not_found_errors = ValueError

    try:
        content = doc.embfile_get(args.name)
        info = doc.embfile_info(args.name)
    except not_found_errors as e:  # pylint: disable=catching-non-exception
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    target = args.output or info["filename"]
    with open(target, "wb") as outfile:
        outfile.write(content)
    pymupdf.message("saved entry '%s' as '%s'" % (args.name, target))
    doc.close()
|
|
|
|
|
|
def embedded_add(args):
    """Insert a new embedded file.

    The content of file ``args.path`` is stored in ``args.input`` under the
    entry name ``args.name``.  The path is used as file name and, unless
    ``args.desc`` is given, also as description.  The result is saved
    incrementally (no or identical output name) or to ``args.output``.
    Terminates if the entry already exists, if the file is missing, or if
    an incremental save is required but not possible.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # one definition of "save in place", used for the check and the save
    in_place = not args.output or args.output == args.input
    if in_place and not doc.can_save_incrementally():
        sys.exit("cannot save PDF incrementally")

    # look the name up instead of probing with a delete: no modification of
    # the document and no blanket exception handling needed
    if args.name in doc.embfile_names():
        sys.exit("entry '%s' already exists" % args.name)

    if not os.path.isfile(args.path):
        sys.exit("no such file '%s'" % args.path)
    with open(args.path, "rb") as f:
        stream = f.read()

    filename = args.path
    desc = args.desc if args.desc else filename
    doc.embfile_add(
        args.name, stream, filename=filename, ufilename=filename, desc=desc
    )
    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
|
|
|
|
|
|
def embedded_upd(args):
    """Update contents or metadata of an embedded file.

    New content is read from ``args.path`` if that names an existing file;
    otherwise no content is passed on.  ``args.filename``,
    ``args.ufilename`` (falling back to ``args.filename``) and ``args.desc``
    are passed on when given, else ``None``.  The result is saved
    incrementally (no or identical output name) or to ``args.output``.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = args.output is None or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    try:  # the entry must exist
        doc.embfile_info(args.name)
    except Exception:
        sys.exit("no such embedded file '%s'" % args.name)

    new_content = None
    if args.path is not None and os.path.isfile(args.path):
        with open(args.path, "rb") as f:
            new_content = f.read()

    new_filename = args.filename or None
    new_ufilename = args.ufilename or args.filename or None
    new_desc = args.desc or None

    doc.embfile_upd(
        args.name,
        new_content,
        filename=new_filename,
        ufilename=new_ufilename,
        desc=new_desc,
    )
    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
|
|
|
|
|
|
def embedded_list(args):
    """List embedded files.

    With ``args.name`` the information of that single entry is printed (the
    program terminates if it does not exist).  Otherwise all entry names
    are listed, each followed by its information if ``args.detail`` is set.
    The document is closed on every exit path.
    """
    doc = open_file(args.input, args.password, pdf=True)
    try:
        names = doc.embfile_names()
        if args.name is not None:
            if args.name not in names:
                sys.exit("no such embedded file '%s'" % args.name)
            pymupdf.message()
            pymupdf.message(
                "printing 1 of %i embedded file%s:"
                % (len(names), "s" if len(names) > 1 else "")
            )
            pymupdf.message()
            print_dict(doc.embfile_info(args.name))
            pymupdf.message()
            return

        if not names:
            pymupdf.message("'%s' contains no embedded files" % doc.name)
            return

        if len(names) > 1:
            msg = "'%s' contains the following %i embedded files" % (
                doc.name,
                len(names),
            )
        else:
            msg = "'%s' contains the following embedded file" % doc.name
        pymupdf.message(msg)
        pymupdf.message()

        for name in names:
            if not args.detail:
                pymupdf.message(name)
                continue
            # a single lookup per entry is sufficient
            print_dict(doc.embfile_info(name))
            pymupdf.message()
    finally:
        doc.close()
|
|
|
|
|
|
def extract_objects(args):
    """Extract images and / or fonts from a PDF.

    Fonts and images referenced by the selected pages (all pages if
    ``args.pages`` is not given) are written to ``args.output`` or, if that
    is not given, to the current directory.  Every xref is handled once,
    even if referenced by several pages.

    Fonts are stored as "<fontname>-<xref>.<ext>", with blanks in the name
    replaced by hyphens.  Fonts reported with extension "n/a" or without
    content are skipped and are not included in the reported number of
    saved fonts.  Images are stored as "img-<xref>.<ext>".
    """
    if not args.fonts and not args.images:
        sys.exit("neither fonts nor images requested")
    doc = open_file(args.input, args.password, pdf=True)

    if args.pages:
        pages = get_list(args.pages, doc.page_count + 1)
    else:
        pages = range(1, doc.page_count + 1)

    if not args.output:
        out_dir = os.path.abspath(os.curdir)
    else:
        out_dir = args.output
        if not os.path.isdir(out_dir):
            sys.exit("output directory %s does not exist" % out_dir)

    font_xrefs = set()  # fonts already looked at (saved or skipped)
    image_xrefs = set()  # already saved images
    fonts_saved = 0  # fonts actually written to disk

    for pno in pages:
        if args.fonts:
            for item in doc.get_page_fonts(pno - 1):
                xref = item[0]
                if xref in font_xrefs:
                    continue
                font_xrefs.add(xref)
                fontname, ext, _, buffer = doc.extract_font(xref)
                if ext == "n/a" or not buffer:  # nothing to write
                    continue
                outname = os.path.join(
                    out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
                )
                with open(outname, "wb") as outfile:
                    outfile.write(buffer)
                fonts_saved += 1

        if args.images:
            for item in doc.get_page_images(pno - 1):
                xref = item[0]
                if xref in image_xrefs:
                    continue
                image_xrefs.add(xref)
                pix = recoverpix(doc, item)
                if isinstance(pix, dict):  # result of doc.extract_image()
                    outname = os.path.join(
                        out_dir, "img-%i.%s" % (xref, pix["ext"])
                    )
                    with open(outname, "wb") as outfile:
                        outfile.write(pix["image"])
                else:  # a pixmap: store as PNG
                    outname = os.path.join(out_dir, "img-%i.png" % xref)
                    pix2 = (
                        pix
                        if pix.colorspace.n < 4
                        else pymupdf.Pixmap(pymupdf.csRGB, pix)
                    )
                    pix2.save(outname)

    if args.fonts:
        pymupdf.message("saved %i fonts to '%s'" % (fonts_saved, out_dir))
    if args.images:
        pymupdf.message("saved %i images to '%s'" % (len(image_xrefs), out_dir))
    doc.close()
|
|
|
|
|
|
def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the plain text of a page to the binary stream *textout*.

    The text is UTF-8 encoded and followed by an end-of-page marker: a line
    break if *noformfeed* is set, otherwise a form feed (byte 12).  For a
    page without text only the marker is written, and nothing at all if
    *skip_empty* is set.  *GRID* and *fontsize* are not used here.
    """
    page_end = b"\n" if noformfeed else b"\x0c"
    text = page.get_text("text", flags=flags)
    if text:
        textout.write(text.encode("utf8", errors="surrogatepass"))
        textout.write(page_end)
    elif not skip_empty:
        textout.write(page_end)  # empty page: marker only
|
|
|
|
|
|
def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text blocks of a page to the binary stream *textout*.

    Blocks are output ordered by their items 3 and 0 (in that priority),
    each block text (item 4) UTF-8 encoded.  An end-of-page marker follows:
    a line break if *noformfeed* is set, otherwise a form feed (byte 12).
    For a page without blocks only the marker is written, and nothing at
    all if *skip_empty* is set.  *GRID* and *fontsize* are not used here.
    """
    page_end = b"\n" if noformfeed else b"\x0c"
    blocks = page.get_text("blocks", flags=flags)
    if not blocks:
        if not skip_empty:
            textout.write(page_end)  # empty page: marker only
        return
    for block in sorted(blocks, key=lambda b: (b[3], b[0])):
        textout.write(block[4].encode("utf8", errors="surrogatepass"))
    textout.write(page_end)
|
|
|
|
|
|
def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text of a page to *textout*, imitating the page layout.

    The page is read character by character ("rawdict").  Characters are
    assigned to output lines by their y-origin and are positioned inside a
    line by inserting blanks according to their x-origin.

    Args:
        page: the page to extract text from.
        textout: binary stream receiving the UTF-8 encoded text.
        GRID: rows whose coordinates differ by less than this value are
            merged into one output line.
        fontsize: spans with a font size not above this value are ignored.
        noformfeed: end the page with a line break instead of a form feed.
        skip_empty: write nothing for a page without characters.
        flags: text extraction flags passed to ``page.get_text()``.
    """
    # end-of-page marker: line break or form feed (byte 12)
    eop = b"\n" if noformfeed else bytes([12])

    # --------------------------------------------------------------------
    def find_line_index(values: List[int], value: int) -> int:
        """Find the right row coordinate.

        Args:
            values: (list) y-coordinates of rows.
            value: (int) lookup for this value (y-origin of char).
        Returns:
            y-coordinate of appropriate line for value, i.e. the largest
            entry of values not exceeding value.
        Raises:
            RuntimeError if value is smaller than all entries of values.
        """
        i = bisect.bisect_right(values, value)
        if i:
            return values[i - 1]
        raise RuntimeError("Line for %g not found in %s" % (value, values))

    # --------------------------------------------------------------------
    def curate_rows(rows: Set[int], GRID) -> List:
        """Return the sorted row coordinates, dropping near-duplicates.

        A coordinate is kept only if it is at least GRID larger than the
        previously kept one.
        """
        rows = list(rows)
        rows.sort()  # sort ascending
        nrows = [rows[0]]
        for h in rows[1:]:
            if h >= nrows[-1] + GRID:  # only keep significant differences
                nrows.append(h)
        return nrows  # curated list of line bottom coordinates

    def process_blocks(blocks: List[Dict], page: pymupdf.Page):
        """Collect the characters of all horizontal text lines on the page.

        Returns:
            chars: list of tuples (character, x-origin, y-origin, width)
            rows: set of the rounded y-origins of the characters
            left: smallest x-origin of a non-space character
            right: largest x1 of a character bbox
            rowheight: smallest bbox height of the accepted lines
        """
        rows = set()
        page_width = page.rect.width
        page_height = page.rect.height
        rowheight = page_height  # start values, reduced / extended below
        left = page_width
        right = 0
        chars = []
        for block in blocks:
            for line in block["lines"]:
                if line["dir"] != (1, 0):  # ignore non-horizontal text
                    continue
                x0, y0, x1, y1 = line["bbox"]
                if y1 < 0 or y0 > page.rect.height:  # ignore if outside CropBox
                    continue
                # upd row height
                height = y1 - y0

                if rowheight > height:
                    rowheight = height
                for span in line["spans"]:
                    if span["size"] <= fontsize:  # ignore too small fonts
                        continue
                    for c in span["chars"]:
                        x0, _, x1, _ = c["bbox"]
                        cwidth = x1 - x0
                        ox, oy = c["origin"]
                        oy = int(round(oy))
                        rows.add(oy)
                        ch = c["c"]
                        if left > ox and ch != " ":
                            left = ox  # update left coordinate
                        if right < x1:
                            right = x1  # update right coordinate
                        # handle ligatures:
                        # a zero-width char on the same row as its
                        # predecessor is merged with that predecessor
                        if cwidth == 0 and chars != []:  # potential ligature
                            old_ch, old_ox, old_oy, old_cwidth = chars[-1]
                            if old_oy == oy:  # ligature
                                if old_ch != chr(0xFB00):  # previous "ff" char lig?
                                    lig = joinligature(old_ch + ch)  # no
                                # convert to one of the 3-char ligatures:
                                elif ch == "i":
                                    lig = chr(0xFB03)  # "ffi"
                                elif ch == "l":
                                    lig = chr(0xFB04)  # "ffl"
                                else:  # something wrong, leave old char in place
                                    lig = old_ch
                                chars[-1] = (lig, old_ox, old_oy, old_cwidth)
                                continue
                        chars.append((ch, ox, oy, cwidth))  # all chars on page
        return chars, rows, left, right, rowheight

    def joinligature(lig: str) -> str:
        """Return ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00).  A string without a known
            ligature is returned unchanged.
        """

        if lig == "ff":
            return chr(0xFB00)
        elif lig == "fi":
            return chr(0xFB01)
        elif lig == "fl":
            return chr(0xFB02)
        elif lig == "ffi":
            return chr(0xFB03)
        elif lig == "ffl":
            return chr(0xFB04)
        elif lig == "ft":
            return chr(0xFB05)
        elif lig == "st":
            return chr(0xFB06)
        return lig

    # --------------------------------------------------------------------
    def make_textline(left, slot, minslot, lchars):
        """Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) avg width of one character in any font in use.
            minslot: (float) min width for the characters in this line.
            lchars: (list[tuple]) characters of this line.
        Returns:
            text: (str) text string for this line
        Raises:
            RuntimeError if minslot does not exceed pymupdf.EPSILON.
        """
        text = ""  # we output this
        old_char = ""
        old_x1 = 0  # end coordinate of last char
        old_ox = 0  # x-origin of last char
        if minslot <= pymupdf.EPSILON:
            raise RuntimeError("program error: minslot too small = %g" % minslot)

        for c in lchars:  # loop over characters
            char, ox, _, cwidth = c
            ox = ox - left  # its (relative) start coordinate
            x1 = ox + cwidth  # ending coordinate

            # eliminate overprint effect
            if old_char == char and ox - old_ox <= cwidth * 0.2:
                continue

            # omit spaces overlapping previous char
            # NOTE(review): divides by cwidth - a zero-width space would
            # raise ZeroDivisionError here; confirm this cannot occur.
            if char == " " and (old_x1 - ox) / cwidth > 0.8:
                continue

            old_char = char
            # close enough to previous?
            if ox < old_x1 + minslot:  # assume char adjacent to previous
                text += char  # append to output
                old_x1 = x1  # new end coord
                old_ox = ox  # new origin.x
                continue

            # else next char starts after some gap:
            # fill in right number of spaces, so char is positioned
            # in the right slot of the line
            if char == " ":  # rest relevant for non-space only
                continue
            delta = int(ox / slot) - len(text)
            if ox > old_x1 and delta > 1:
                text += " " * delta
            # now append char
            text += char
            old_x1 = x1  # new end coordinate
            old_ox = ox  # new origin
        return text.rstrip()

    # extract page text by single characters ("rawdict")
    blocks = page.get_text("rawdict", flags=flags)["blocks"]
    chars, rows, left, right, rowheight = process_blocks(blocks, page)

    if chars == []:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    # compute list of line coordinates - ignoring small (GRID) differences
    rows = curate_rows(rows, GRID)

    # sort all chars by x-coordinates, so every line will receive char info,
    # sorted from left to right.
    chars.sort(key=lambda c: c[1])

    # populate the lines with their char info
    lines = {}  # key: y-coordinate of the line, value: char list
    for c in chars:
        _, _, oy, _ = c
        y = find_line_index(rows, oy)  # y-coord of the right line
        lchars = lines.get(y, [])  # read line chars so far
        lchars.append(c)  # append this char
        lines[y] = lchars  # write back to line

    # ensure line coordinates are ascending
    keys = list(lines.keys())
    keys.sort()

    # -------------------------------------------------------------------------
    # Compute "char resolution" for the page: the char width corresponding to
    # 1 text char position on output - call it 'slot'.
    # For each line, compute median of its char widths. The minimum across all
    # lines is 'slot'.
    # The minimum char width of each line is used to determine if spaces must
    # be inserted in between two characters.
    # -------------------------------------------------------------------------
    slot = right - left  # start value: width of the used page area
    minslots = {}
    for k in keys:
        lchars = lines[k]
        ccount = len(lchars)
        if ccount < 2:  # lines with a single char do not influence 'slot'
            minslots[k] = 1
            continue
        widths = [c[3] for c in lchars]
        widths.sort()
        this_slot = statistics.median(widths)  # take median value
        if this_slot < slot:
            slot = this_slot
        minslots[k] = widths[0]

    # compute line advance in text output
    # NOTE(review): the old rowheight appears in numerator and denominator,
    # so the result is (row span / number of rows) * 1.2; a rowheight of 0
    # would raise ZeroDivisionError here - confirm this cannot occur.
    rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2
    rowpos = rows[0]  # first line positioned here
    textout.write(b"\n")
    for k in keys:  # walk through the lines
        while rowpos < k:  # honor distance between lines
            textout.write(b"\n")
            rowpos += rowheight
        text = make_textline(left, slot, minslots[k], lines[k])
        textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
        rowpos = k + rowheight

    textout.write(eop)  # write formfeed
|
|
|
|
|
|
def gettext(args):
    """Extract the text of selected pages into a file.

    The pages given by ``args.pages`` are processed in the given order by
    the handler selected with ``args.mode`` ("simple", "blocks" or
    "layout").  Output goes to ``args.output`` or, if that is None, to the
    document name with its extension replaced by ".txt".  The document is
    closed when done, also if extraction fails.
    """
    doc = open_file(args.input, args.password, pdf=False)
    try:
        pagel = get_list(args.pages, doc.page_count + 1)
        output = args.output
        if output is None:
            filename, _ = os.path.splitext(doc.name)
            output = filename + ".txt"

        # both flags are on by default, the options switch them off ...
        flags = pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_PRESERVE_WHITESPACE
        if args.convert_white:
            flags &= ~pymupdf.TEXT_PRESERVE_WHITESPACE
        if args.noligatures:
            flags &= ~pymupdf.TEXT_PRESERVE_LIGATURES
        # ... whereas this flag is off by default and switched on
        if args.extra_spaces:
            flags |= pymupdf.TEXT_INHIBIT_SPACES

        handlers = {
            "simple": page_simple,
            "blocks": page_blocksort,
            "layout": page_layout,
        }
        handler = handlers[args.mode]  # the same for every page

        with open(output, "wb") as textout:
            for pno in pagel:
                handler(
                    doc[pno - 1],
                    textout,
                    args.grid,
                    args.fontsize,
                    args.noformfeed,
                    args.skip_empty,
                    flags=flags,
                )
    finally:
        doc.close()
|
|
|
|
|
|
def _internal(args):
    """Call pymupdf.message() and pymupdf.log() once each with a fixed text.

    *args* is not used.
    """
    message_text = "This is from PyMuPDF message()."
    log_text = "This is from PyMuPDF log()."
    pymupdf.message(message_text)
    pymupdf.log(log_text)
|
|
|
|
def _add_show_parser(subps):
    """Register the 'show' subcommand (display PDF information)."""
    ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
    ps_show.add_argument("input", type=str, help="PDF filename")
    ps_show.add_argument("-password", help="password")
    ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
    ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
    ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
    ps_show.add_argument(
        "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
    )
    ps_show.add_argument(
        "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
    )
    ps_show.set_defaults(func=show)


def _add_clean_parser(subps):
    """Register the 'clean' subcommand (optimize PDF or create a sub-PDF)."""
    ps_clean = subps.add_parser(
        "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
    )
    ps_clean.add_argument("input", type=str, help="PDF filename")
    ps_clean.add_argument("output", type=str, help="output PDF filename")
    ps_clean.add_argument("-password", help="password")

    ps_clean.add_argument(
        "-encryption",
        help="encryption method",
        choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
        default="none",
    )

    ps_clean.add_argument("-owner", type=str, help="owner password")
    ps_clean.add_argument("-user", type=str, help="user password")

    ps_clean.add_argument(
        "-garbage",
        type=int,
        help="garbage collection level",
        choices=range(5),
        default=0,
    )

    ps_clean.add_argument(
        "-compress",
        action="store_true",
        default=False,
        help="compress (deflate) output",
    )

    ps_clean.add_argument(
        "-ascii", action="store_true", default=False, help="ASCII encode binary data"
    )

    ps_clean.add_argument(
        "-linear",
        action="store_true",
        default=False,
        help="format for fast web display",
    )

    ps_clean.add_argument(
        "-permission", type=int, default=-1, help="integer with permission levels"
    )

    ps_clean.add_argument(
        "-sanitize",
        action="store_true",
        default=False,
        help="sanitize / clean contents",
    )
    ps_clean.add_argument(
        "-pretty", action="store_true", default=False, help="prettify PDF structure"
    )
    # Help text previously read "output selected pages pages" (duplicated word).
    ps_clean.add_argument(
        "-pages", help="output selected pages, format: 1,5-7,50-N"
    )
    ps_clean.set_defaults(func=clean)


def _add_join_parser(subps):
    """Register the 'join' subcommand (join PDF documents)."""
    ps_join = subps.add_parser(
        "join",
        description=mycenter("join PDF documents"),
        epilog="specify each input as 'filename[,password[,pages]]'",
    )
    ps_join.add_argument("input", nargs="*", help="input filenames")
    ps_join.add_argument("-output", required=True, help="output filename")
    ps_join.set_defaults(func=doc_join)


def _add_extract_parser(subps):
    """Register the 'extract' subcommand (extract images and fonts)."""
    ps_extract = subps.add_parser(
        "extract", description=mycenter("extract images and fonts to disk")
    )
    ps_extract.add_argument("input", type=str, help="PDF filename")
    ps_extract.add_argument("-images", action="store_true", help="extract images")
    ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
    ps_extract.add_argument(
        "-output", help="folder to receive output, defaults to current"
    )
    ps_extract.add_argument("-password", help="password")
    ps_extract.add_argument(
        "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
    )
    ps_extract.set_defaults(func=extract_objects)


def _add_embed_info_parser(subps):
    """Register the 'embed-info' subcommand (list embedded files)."""
    ps_embed_info = subps.add_parser(
        "embed-info", description=mycenter("list embedded files")
    )
    ps_embed_info.add_argument("input", help="PDF filename")
    ps_embed_info.add_argument("-name", help="if given, report only this one")
    ps_embed_info.add_argument(
        "-detail", action="store_true", help="detail information"
    )
    ps_embed_info.add_argument("-password", help="password")
    ps_embed_info.set_defaults(func=embedded_list)


def _add_embed_add_parser(subps):
    """Register the 'embed-add' subcommand (add an embedded file)."""
    ps_embed_add = subps.add_parser(
        "embed-add", description=mycenter("add embedded file")
    )
    ps_embed_add.add_argument("input", help="PDF filename")
    ps_embed_add.add_argument("-password", help="password")
    ps_embed_add.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_add.add_argument("-name", required=True, help="name of new entry")
    ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
    ps_embed_add.add_argument("-desc", help="description of new entry")
    ps_embed_add.set_defaults(func=embedded_add)


def _add_embed_del_parser(subps):
    """Register the 'embed-del' subcommand (delete an embedded file)."""
    ps_embed_del = subps.add_parser(
        "embed-del", description=mycenter("delete embedded file")
    )
    ps_embed_del.add_argument("input", help="PDF filename")
    ps_embed_del.add_argument("-password", help="password")
    ps_embed_del.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
    ps_embed_del.set_defaults(func=embedded_del)


def _add_embed_upd_parser(subps):
    """Register the 'embed-upd' subcommand (update an embedded file)."""
    ps_embed_upd = subps.add_parser(
        "embed-upd",
        description=mycenter("update embedded file"),
        epilog="except '-name' all parameters are optional",
    )
    ps_embed_upd.add_argument("input", help="PDF filename")
    ps_embed_upd.add_argument("-name", required=True, help="name of entry")
    ps_embed_upd.add_argument("-password", help="password")
    ps_embed_upd.add_argument(
        "-output", help="Output PDF filename, incremental save if none"
    )
    ps_embed_upd.add_argument("-path", help="path to new data for entry")
    ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
    ps_embed_upd.add_argument(
        "-ufilename", help="new unicode filename to store in entry"
    )
    ps_embed_upd.add_argument("-desc", help="new description to store in entry")
    ps_embed_upd.set_defaults(func=embedded_upd)


def _add_embed_extract_parser(subps):
    """Register the 'embed-extract' subcommand (extract an embedded file)."""
    ps_embed_extract = subps.add_parser(
        "embed-extract", description=mycenter("extract embedded file to disk")
    )
    ps_embed_extract.add_argument("input", type=str, help="PDF filename")
    ps_embed_extract.add_argument("-name", required=True, help="name of entry")
    ps_embed_extract.add_argument("-password", help="password")
    ps_embed_extract.add_argument(
        "-output", help="output filename, default is stored name"
    )
    ps_embed_extract.set_defaults(func=embedded_get)


def _add_embed_copy_parser(subps):
    """Register the 'embed-copy' subcommand (copy embedded files between PDFs)."""
    ps_embed_copy = subps.add_parser(
        "embed-copy", description=mycenter("copy embedded files between PDFs")
    )
    ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
    ps_embed_copy.add_argument("-password", help="password of input")
    ps_embed_copy.add_argument(
        "-output", help="output PDF, incremental save to 'input' if omitted"
    )
    ps_embed_copy.add_argument(
        "-source", required=True, help="copy embedded files from here"
    )
    ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
    ps_embed_copy.add_argument(
        "-name", nargs="*", help="restrict copy to these entries"
    )
    ps_embed_copy.set_defaults(func=embedded_copy)


def _add_gettext_parser(subps):
    """Register the 'gettext' subcommand (extract text in several modes)."""
    ps_gettext = subps.add_parser(
        "gettext", description=mycenter("extract text in various formatting modes")
    )
    ps_gettext.add_argument("input", type=str, help="input document filename")
    ps_gettext.add_argument("-password", help="password for input document")
    ps_gettext.add_argument(
        "-mode",
        type=str,
        help="mode: simple, block sort, or layout (default)",
        choices=("simple", "blocks", "layout"),
        default="layout",
    )
    ps_gettext.add_argument(
        "-pages",
        type=str,
        help="select pages, format: 1,5-7,50-N",
        default="1-N",
    )
    ps_gettext.add_argument(
        "-noligatures",
        action="store_true",
        help="expand ligature characters (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-convert-white",
        action="store_true",
        help="convert whitespace characters to white (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-extra-spaces",
        action="store_true",
        help="fill gaps with spaces (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-noformfeed",
        action="store_true",
        help="write linefeeds, no formfeeds (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-skip-empty",
        action="store_true",
        help="suppress pages with no text (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-output",
        help="store text in this file (default inputfilename.txt)",
    )
    ps_gettext.add_argument(
        "-grid",
        type=float,
        help="merge lines if closer than this (default 2)",
        default=2,
    )
    ps_gettext.add_argument(
        "-fontsize",
        type=float,
        help="only include text with a larger fontsize (default 3)",
        default=3,
    )
    ps_gettext.set_defaults(func=gettext)


def _add_internal_parser(subps):
    """Register the 'internal' subcommand (internal testing)."""
    ps_internal = subps.add_parser(
        "internal", description=mycenter("internal testing")
    )
    ps_internal.set_defaults(func=_internal)


def main():
    """Build the command line parser and run the selected subcommand.

    Each subcommand stores its handler as the ``func`` default of its
    sub-parser. After parsing, the handler is called with the parsed
    namespace. If no subcommand was given (no ``func`` attribute present),
    the top-level help is printed instead.
    """
    parser = argparse.ArgumentParser(
        prog="pymupdf",
        description=mycenter("Basic PyMuPDF Functions"),
    )
    subps = parser.add_subparsers(
        title="Subcommands", help="Enter 'command -h' for subcommand specific help"
    )

    # Registration order determines the order in the top-level help output.
    for register in (
        _add_show_parser,
        _add_clean_parser,
        _add_join_parser,
        _add_extract_parser,
        _add_embed_info_parser,
        _add_embed_add_parser,
        _add_embed_del_parser,
        _add_embed_upd_parser,
        _add_embed_extract_parser,
        _add_embed_copy_parser,
        _add_gettext_parser,
        _add_internal_parser,
    ):
        register(subps)

    # -------------------------------------------------------------------------
    # start program
    # -------------------------------------------------------------------------
    args = parser.parse_args()  # create parameter arguments class
    if not hasattr(args, "func"):  # no function selected
        parser.print_help()  # so print top level help
    else:
        args.func(args)  # execute requested command
|
|
|
|
|
|
# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()
|