Files
convertions/convertions-env/lib/python3.11/site-packages/pymupdf/utils.py
2024-09-29 01:45:31 -04:00

5716 lines
190 KiB
Python

# ------------------------------------------------------------------------
# Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
#
# Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
# maintained and developed by Artifex Software, Inc. https://artifex.com.
# ------------------------------------------------------------------------
import io
import math
import os
import typing
import weakref
try:
from . import pymupdf
except Exception:
import pymupdf
try:
from . import mupdf
except Exception:
import mupdf
# Alias of pymupdf.format_g; used further down when building PDF destination
# strings (see getDestStr).
_format_g = pymupdf.format_g
# Snapshot of pymupdf.g_exceptions_verbose taken at import time. Exception
# handlers in this module check it before calling pymupdf.exception_info().
g_exceptions_verbose = pymupdf.g_exceptions_verbose
# Value of the TESSDATA_PREFIX environment variable at import time (None if
# it is not set). get_textpage_ocr() refuses to run when neither this nor its
# 'tessdata' argument is given.
TESSDATA_PREFIX = os.environ.get("TESSDATA_PREFIX")
# String placeholders used as annotations below for arguments that accept any
# object convertible to the respective geometry type.
point_like = "point_like"
rect_like = "rect_like"
matrix_like = "matrix_like"
quad_like = "quad_like"

# Shorthand aliases for optional / generic annotations.
AnyType = typing.Any
OptInt = typing.Union[int, None]
OptFloat = typing.Optional[float]
OptStr = typing.Optional[str]
OptDict = typing.Optional[dict]
# typing.ByteString is deprecated since Python 3.9 and scheduled for removal,
# so the bytes-like types are spelled out explicitly.
OptBytes = typing.Optional[typing.Union[bytes, bytearray, memoryview]]
OptSeq = typing.Optional[typing.Sequence]
"""
This is a collection of functions to extend PyMuPDF.
"""
def write_text(
    page: pymupdf.Page,
    rect=None,
    writers=None,
    overlay=True,
    color=None,
    opacity=None,
    keep_proportion=True,
    rotate=0,
    oc=0,
) -> None:
    """Write the text of one or more pymupdf.TextWriter objects onto a page.

    A single TextWriter with no rotation and no target rectangle is written
    directly to the page. In every other case the writers are first rendered
    onto a page of a temporary document, which is then shown on the target
    page via page.show_pdf_page().

    Args:
        rect: target rectangle. If None, the union of the text writers is used.
        writers: one or more pymupdf.TextWriter objects.
        overlay: put in foreground or background.
        color: passed through to TextWriter.write_text.
        opacity: passed through to TextWriter.write_text.
        keep_proportion: maintain aspect ratio of rectangle sides.
        rotate: arbitrary rotation angle.
        oc: the xref of an optional content object

    Raises:
        ValueError: if no writer was given.
    """
    assert isinstance(page, pymupdf.Page)
    if not writers:
        raise ValueError("need at least one pymupdf.TextWriter")

    if type(writers) is pymupdf.TextWriter:
        if rotate == 0 and rect is None:
            # simple case: no intermediate document needed
            writers.write_text(page, opacity=opacity, color=color, overlay=overlay)
            return None
        writers = (writers,)

    # Collect all writers on a scratch page with the target page's dimensions
    # and compute the union of their text rectangles on the way.
    bounds = writers[0].text_rect
    scratch_doc = pymupdf.Document()
    scratch_page = scratch_doc.new_page(
        width=page.rect.width, height=page.rect.height
    )
    for tw in writers:
        bounds |= tw.text_rect
        tw.write_text(scratch_page, opacity=opacity, color=color)

    target = bounds if rect is None else rect
    page.show_pdf_page(
        target,
        scratch_doc,
        0,
        overlay=overlay,
        keep_proportion=keep_proportion,
        rotate=rotate,
        clip=bounds,
        oc=oc,
    )
    # drop the references to the temporary objects
    scratch_doc = None
    scratch_page = None
def show_pdf_page(
    page,
    rect,
    src,
    pno=0,
    keep_proportion=True,
    overlay=True,
    oc=0,
    rotate=0,
    clip=None,
) -> int:
    """Show page number 'pno' of PDF 'src' in rectangle 'rect'.

    Args:
        rect: (rect-like) where to place the source image
        src: (document) source PDF
        pno: (int) source page number; negative values count from the end
        keep_proportion: (bool) do not change width-height-ratio
        overlay: (bool) put in foreground
        oc: (xref) make visibility dependent on this OCG / OCMD (which must be
            defined in the target PDF)
        rotate: (int) degrees (multiple of 90)
        clip: (rect-like) part of source page rectangle

    Returns:
        xref of inserted object (for reuse)

    Raises:
        ValueError: if source or target is no PDF, if 'rect' or the resulting
            source rectangle is empty or infinite, if the source page has no
            contents, or if source and target are the same document.
    """

    def calc_matrix(sr, tr, keep=True, rotate=0):
        """Calculate transformation matrix from source to target rect.

        Notes:
            The product of four matrices in this sequence: (1) translate correct
            source corner to origin, (2) rotate, (3) scale, (4) translate to
            target's top-left corner.
        Args:
            sr: source rect in PDF (!) coordinate system
            tr: target rect in PDF coordinate system
            keep: whether to keep source ratio of width to height
            rotate: rotation angle in degrees
        Returns:
            Transformation matrix.
        """
        # calc center point of source rect
        smp = (sr.tl + sr.br) / 2.0
        # calc center point of target rect
        tmp = (tr.tl + tr.br) / 2.0

        # m moves to (0, 0), then rotates
        m = pymupdf.Matrix(1, 0, 0, 1, -smp.x, -smp.y) * pymupdf.Matrix(rotate)

        sr1 = sr * m  # resulting source rect to calculate scale factors

        fw = tr.width / sr1.width  # scale the width
        fh = tr.height / sr1.height  # scale the height
        if keep:
            fw = fh = min(fw, fh)  # take min if keeping aspect ratio

        m *= pymupdf.Matrix(fw, fh)  # concat scale matrix
        m *= pymupdf.Matrix(1, 0, 0, 1, tmp.x, tmp.y)  # concat move to target center
        return pymupdf.JM_TUPLE(m)

    pymupdf.CheckParent(page)
    doc = page.parent

    if not doc.is_pdf or not src.is_pdf:
        raise ValueError("is no PDF")

    # 'rect' is documented as rect-like: convert sequences to a Rect so the
    # attribute checks below do not fail on tuples / lists.
    rect = pymupdf.Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("rect must be finite and not empty")

    while pno < 0:  # support negative page numbers
        pno += src.page_count
    src_page = src[pno]  # load source page
    if src_page.get_contents() == []:
        raise ValueError("nothing to show - source page empty")

    tar_rect = rect * ~page.transformation_matrix  # target rect in PDF coordinates

    src_rect = src_page.rect if not clip else src_page.rect & clip  # source rect
    if src_rect.is_empty or src_rect.is_infinite:
        raise ValueError("clip must be finite and not empty")
    src_rect = src_rect * ~src_page.transformation_matrix  # ... in PDF coord

    matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate)

    # list of existing /Form /XObjects
    ilst = [i[1] for i in doc.get_page_xobjects(page.number)]
    ilst += [i[7] for i in doc.get_page_images(page.number)]
    ilst += [i[4] for i in doc.get_page_fonts(page.number)]

    # create a name not in that list
    n = "fzFrm"
    i = 0
    _imgname = n + "0"
    while _imgname in ilst:
        i += 1
        _imgname = n + str(i)

    isrc = src._graft_id  # used as key for graftmaps
    if doc._graft_id == isrc:
        raise ValueError("source document must not equal target")

    # retrieve / make pymupdf.Graftmap for source PDF
    gmap = doc.Graftmaps.get(isrc, None)
    if gmap is None:
        gmap = pymupdf.Graftmap(doc)
        doc.Graftmaps[isrc] = gmap

    # take note of generated xref for automatic reuse
    pno_id = (isrc, pno)  # id of src[pno]
    xref = doc.ShownPages.get(pno_id, 0)

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state
    xref = page._show_pdf_page(
        src_page,
        overlay=overlay,
        matrix=matrix,
        xref=xref,
        oc=oc,
        clip=src_rect,
        graftmap=gmap,
        _imgname=_imgname,
    )
    doc.ShownPages[pno_id] = xref

    return xref
def replace_image(page: pymupdf.Page, xref: int, *, filename=None, pixmap=None, stream=None):
    """Replace the image referred to by xref.

    The object definition stored under 'xref' is overwritten with a new image.
    The page's appearance instructions stay untouched, so the new image shows
    up with the same bbox, rotation etc. as the old one.

    Supplying a small fully transparent image gives the effect of deleting the
    image. Another typical use is substituting a large image with a smaller
    version, e.g. lower resolution or graylevel instead of colored.

    Args:
        xref: the xref of the image to replace.
        filename, pixmap, stream: exactly one of these must be provided. The
            meaning being the same as in Page.insert_image.

    Raises:
        ValueError: if xref is no image, or if not exactly one image source
            was given.
    """
    owner = page.parent  # the owning document
    if not owner.xref_is_image(xref):
        raise ValueError("xref not an image")

    sources_given = bool(filename) + bool(stream) + bool(pixmap)
    if sources_given != 1:
        raise ValueError("Exactly one of filename/stream/pixmap must be given")

    # insert new image anywhere in page
    fresh_xref = page.insert_image(
        page.rect,
        filename=filename,
        stream=stream,
        pixmap=pixmap,
    )
    owner.xref_copy(fresh_xref, xref)  # copy over new to old

    # The insertion above appended a new /Contents source to the page;
    # overwrite it with a single space.
    contents = page.get_contents()
    owner.update_stream(contents[-1], b" ")
def delete_image(page: pymupdf.Page, xref: int):
    """Delete the image referred to by xref.

    The image is not physically removed: it is replaced by a small transparent
    Pixmap using method Page.replace_image.

    Args:
        xref: xref of the image to delete.
    """
    # a 1 x 1 gray pixmap with alpha channel (the dimension is arbitrary)
    blank = pymupdf.Pixmap(pymupdf.csGRAY, (0, 0, 1, 1), 1)
    blank.clear_with()  # clear all samples bytes to 0x00
    page.replace_image(xref, pixmap=blank)
def insert_image(
    page,
    rect,
    *,
    alpha=-1,
    filename=None,
    height=0,
    keep_proportion=True,
    mask=None,
    oc=0,
    overlay=True,
    pixmap=None,
    rotate=0,
    stream=None,
    width=0,
    xref=0,
):
    """Insert an image for display in a rectangle.

    'page' and 'rect' are positional, all other parameters are keywords.
    If 'xref' is given, that image is used and the other input options are
    ignored. Otherwise exactly one of pixmap, stream or filename must be given.

    'alpha=0' for non-transparent images improves performance significantly
    (affects stream and filename only). Optimum transparent insertions are
    possible by using filename / stream together with a 'mask' image of alpha
    values.

    Args:
        rect: (rect_like) position of image on the page.
        alpha: (int, optional) set to 0 if image has no transparency.
        filename: (str, Path, file object) image filename.
        height: (int)
        keep_proportion: (bool) keep width / height ratio (default).
        mask: (bytes, optional) image consisting of alpha values to use.
        oc: (int) xref of OCG or OCMD to declare as Optional Content.
        overlay: (bool) put in foreground (default) or background.
        pixmap: (pymupdf.Pixmap) use this as image.
        rotate: (int) rotate by 0, 90, 180 or 270 degrees.
        stream: (bytes) use this as image.
        width: (int)
        xref: (int) use this as image.

    Returns:
        xref (int) of inserted image. Re-use as argument for multiple insertions.

    Raises:
        ValueError: on invalid argument combinations or values.
        FileNotFoundError: if 'filename' does not exist.
    """
    pymupdf.CheckParent(page)
    doc = page.parent
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    if xref == 0:
        sources_given = bool(filename) + bool(stream) + bool(pixmap)
        if sources_given != 1:
            raise ValueError("xref=0 needs exactly one of filename, pixmap, stream")

    # Normalize 'filename' to a string: plain str, path-like or file object.
    if filename:
        if type(filename) is str:
            pass
        elif hasattr(filename, "absolute"):
            filename = str(filename)
        elif hasattr(filename, "name"):
            filename = filename.name
        else:
            raise ValueError("bad filename")

    # Type checks of the image sources (only the first matching one is checked).
    bytes_types = (bytes, bytearray, io.BytesIO)
    if filename and not os.path.exists(filename):
        raise FileNotFoundError("No such file: '%s'" % filename)
    elif stream and type(stream) not in bytes_types:
        raise ValueError("stream must be bytes-like / BytesIO")
    elif pixmap and type(pixmap) is not pymupdf.Pixmap:
        raise ValueError("pixmap must be a pymupdf.Pixmap")

    if mask:
        if not (stream or filename):
            raise ValueError("mask requires stream or filename")
        if type(mask) not in bytes_types:
            raise ValueError("mask must be bytes-like / BytesIO")

    # bring the rotation into the interval [0, 360)
    while rotate < 0:
        rotate += 360
    while rotate >= 360:
        rotate -= 360
    if rotate not in (0, 90, 180, 270):
        raise ValueError("bad rotate value")

    r = pymupdf.Rect(rect)
    if r.is_empty or r.is_infinite:
        raise ValueError("rect must be finite and not empty")
    clip = r * ~page.transformation_matrix

    # Create a unique image reference name: collect the names already used by
    # images, xobjects and fonts of the page, then count up until unused.
    taken = [item[7] for item in doc.get_page_images(page.number)]
    taken += [item[1] for item in doc.get_page_xobjects(page.number)]
    taken += [item[4] for item in doc.get_page_fonts(page.number)]
    counter = 0
    _imgname = "fzImg" + str(counter)  # first name candidate
    while _imgname in taken:
        counter += 1
        _imgname = "fzImg" + str(counter)  # try new name

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state

    xref, digests = page._insert_image(
        filename=filename,
        pixmap=pixmap,
        stream=stream,
        imask=mask,
        clip=clip,
        overlay=overlay,
        oc=oc,
        xref=xref,
        rotate=rotate,
        keep_proportion=keep_proportion,
        width=width,
        height=height,
        alpha=alpha,
        _imgname=_imgname,
        digests=doc.InsertedImages,
    )
    if digests is not None:
        doc.InsertedImages = digests

    return xref
def search_for(
    page,
    text,
    *,
    clip=None,
    quads=False,
    flags=(
        pymupdf.TEXT_DEHYPHENATE
        | pymupdf.TEXT_PRESERVE_WHITESPACE
        | pymupdf.TEXT_PRESERVE_LIGATURES
        | pymupdf.TEXT_MEDIABOX_CLIP
    ),
    textpage=None,
) -> list:
    """Search for a string on a page.

    Args:
        text: string to be searched for
        clip: restrict search to this rectangle
        quads: (bool) return quads instead of rectangles
        flags: bit switches, default: join hyphened words
        textpage: a pre-created pymupdf.TextPage

    Returns:
        a list of rectangles or quads, each containing one occurrence.

    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    if clip is not None:
        clip = pymupdf.Rect(clip)

    pymupdf.CheckParent(page)
    if textpage is None:
        tp = page.get_textpage(clip=clip, flags=flags)  # create pymupdf.TextPage
    elif getattr(textpage, "parent") != page:
        raise ValueError("not a textpage of this page")
    else:
        tp = textpage

    hits = tp.search(text, quads=quads)
    if textpage is None:
        del tp  # temporary textpage no longer needed
    return hits
def search_page_for(
    doc: pymupdf.Document,
    pno: int,
    text: str,
    quads: bool = False,
    clip: rect_like = None,
    flags: int = (
        pymupdf.TEXT_DEHYPHENATE
        | pymupdf.TEXT_PRESERVE_LIGATURES
        | pymupdf.TEXT_PRESERVE_WHITESPACE
        | pymupdf.TEXT_MEDIABOX_CLIP
    ),
    textpage: pymupdf.TextPage = None,
) -> list:
    """Search for a string on a page given by its number.

    Loads page 'pno' and delegates to its search_for method.

    Args:
        pno: page number
        text: string to be searched for
        clip: restrict search to this rectangle
        quads: (bool) return quads instead of rectangles
        flags: bit switches, default: join hyphened words
        textpage: reuse a prepared textpage

    Returns:
        a list of rectangles or quads, each containing an occurrence.
    """
    target_page = doc[pno]
    return target_page.search_for(
        text,
        quads=quads,
        clip=clip,
        flags=flags,
        textpage=textpage,
    )
def get_text_blocks(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> list:
    """Return the text blocks on a page.

    Notes:
        Lines in a block are concatenated with line breaks.

    Args:
        clip: restrict extraction to this rectangle (only used when a new
            textpage is created).
        flags: (int) control the amount of data parsed into the textpage.
            Defaults to pymupdf.TEXTFLAGS_BLOCKS.
        textpage: reuse this pymupdf.TextPage instead of creating one.
        sort: (bool) if True, order the blocks by item 3, then item 0.

    Returns:
        A list of the blocks. Each item contains the containing rectangle
        coordinates, text lines, running block number and block type.

    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_BLOCKS

    if textpage is None:
        tp = page.get_textpage(clip=clip, flags=flags)
    elif getattr(textpage, "parent") != page:
        raise ValueError("not a textpage of this page")
    else:
        tp = textpage

    blocks = tp.extractBLOCKS()
    if textpage is None:
        del tp  # temporary textpage no longer needed

    if sort is True:
        blocks.sort(key=lambda blk: (blk[3], blk[0]))
    return blocks
def get_text_words(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
) -> list:
    """Return the text words as a list with the bbox for each word.

    Args:
        clip: restrict extraction to this rectangle (only used when a new
            textpage is created).
        flags: (int) control the amount of data parsed into the textpage.
        textpage: reuse this pymupdf.TextPage instead of creating one.
        sort: (bool) if True, order the words by item 3, then item 0.
        delimiters: (str,list) characters to use as word delimiters

    Returns:
        Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).

    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    pymupdf.CheckParent(page)
    if flags is None:
        flags = (
            pymupdf.TEXT_PRESERVE_WHITESPACE
            | pymupdf.TEXT_PRESERVE_LIGATURES
            | pymupdf.TEXT_MEDIABOX_CLIP
        )

    if textpage is None:
        tp = page.get_textpage(clip=clip, flags=flags)
    elif getattr(textpage, "parent") != page:
        raise ValueError("not a textpage of this page")
    else:
        tp = textpage

    words = tp.extractWORDS(delimiters)
    if textpage is None:
        del tp  # temporary textpage no longer needed

    if sort is True:
        words.sort(key=lambda wrd: (wrd[3], wrd[0]))
    return words
def get_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    textpage: pymupdf.TextPage = None,
) -> str:
    """Return the result of TextPage.extractTextbox for a rectangle.

    Args:
        rect: rectangle handed to extractTextbox.
        textpage: reuse this pymupdf.TextPage. If None, a textpage is created
            with the page's default settings.

    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    if textpage is None:
        tp = page.get_textpage()
    elif getattr(textpage, "parent") != page:
        raise ValueError("not a textpage of this page")
    else:
        tp = textpage

    boxed_text = tp.extractTextbox(rect)
    if textpage is None:
        del tp  # temporary textpage no longer needed
    return boxed_text
def get_text_selection(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    clip: rect_like = None,
    textpage: pymupdf.TextPage = None,
):
    """Return the result of TextPage.extractSelection for two points.

    Args:
        p1: first point handed to extractSelection.
        p2: second point handed to extractSelection.
        clip: restrict the textpage to this rectangle (only used when a new
            textpage is created).
        textpage: reuse this pymupdf.TextPage. If None, one is created with
            flag pymupdf.TEXT_DEHYPHENATE.

    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    pymupdf.CheckParent(page)
    if textpage is None:
        tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE)
    elif getattr(textpage, "parent") != page:
        raise ValueError("not a textpage of this page")
    else:
        tp = textpage

    selection = tp.extractSelection(p1, p2)
    if textpage is None:
        del tp  # temporary textpage no longer needed
    return selection
def get_textpage_ocr(
    page: pymupdf.Page,
    flags: int = 0,
    language: str = "eng",
    dpi: int = 72,
    full: bool = False,
    tessdata: str = None,
) -> pymupdf.TextPage:
    """Create a Textpage from combined results of normal and OCR text parsing.

    Args:
        flags: (int) control content becoming part of the result.
        language: (str) specify expected language(s). Default is "eng" (English).
        dpi: (int) resolution in dpi, default 72.
        full: (bool) whether to OCR the full page image, or only its images (default)
        tessdata: (str) passed on to Pixmap.pdfocr_tobytes; one of this and the
            TESSDATA_PREFIX environment variable must be set.

    Returns:
        A pymupdf.TextPage.

    Raises:
        RuntimeError: if neither TESSDATA_PREFIX nor 'tessdata' is available.
    """
    pymupdf.CheckParent(page)
    if not (TESSDATA_PREFIX or tessdata):
        raise RuntimeError("No OCR support: TESSDATA_PREFIX not set")

    def full_ocr(page, dpi, language, flags):
        """OCR a rendering of the whole page and return the resulting textpage."""
        zoom = dpi / 72
        pix = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom))
        ocr_pdf = pymupdf.Document(
            "pdf",
            pix.pdfocr_tobytes(
                compress=False,
                language=language,
                tessdata=tessdata,
            ),
        )
        ocr_page = ocr_pdf.load_page(0)
        # scale the OCR page back to the size of the original page
        unzoom = page.rect.width / ocr_page.rect.width
        ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
        tpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
        ocr_pdf.close()
        pix = None
        tpage.parent = weakref.proxy(page)
        return tpage

    # if OCR for the full page, OCR its pixmap @ desired dpi
    if full is True:
        return full_ocr(page, dpi, language, flags)

    # For partial OCR, make a normal textpage, then extend it with text that
    # is OCRed from each image.
    # Because of this, we need the images flag bit set ON.
    tpage = page.get_textpage(flags=flags)
    page_blocks = page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]
    for block in page_blocks:
        if block["type"] != 1:  # only look at images
            continue
        bbox = pymupdf.Rect(block["bbox"])
        if bbox.width <= 3 or bbox.height <= 3:  # ignore tiny stuff
            continue

        exception_types = (RuntimeError, mupdf.FzErrorBase)
        if pymupdf.mupdf_version_tuple < (1, 24):
            exception_types = RuntimeError

        try:
            pix = pymupdf.Pixmap(block["image"])  # get image pixmap
            if pix.n - pix.alpha != 3:  # we need to convert this to RGB!
                pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
            if pix.alpha:  # must remove alpha channel
                pix = pymupdf.Pixmap(pix, 0)
            imgdoc = pymupdf.Document(
                "pdf",
                pix.pdfocr_tobytes(language=language, tessdata=tessdata),
            )  # pdf with OCRed page
            imgpage = imgdoc.load_page(0)  # read image as a page
            pix = None
            # compute matrix to transform coordinates back to that of 'page'
            imgrect = imgpage.rect  # page size of image PDF
            shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
            mat = shrink * block["transform"]
            imgpage.extend_textpage(tpage, flags=0, matrix=mat)
            imgdoc.close()
        except exception_types:
            if g_exceptions_verbose:
                pymupdf.exception_info()
            # give up on per-image OCR and process the whole page instead
            tpage = None
            pymupdf.message("Falling back to full page OCR")
            return full_ocr(page, dpi, language, flags)

    return tpage
def get_image_info(page: pymupdf.Page, hashes: bool = False, xrefs: bool = False) -> list:
    """Extract image information only from a pymupdf.TextPage.

    The result is cached on the page (attribute '_image_info') whenever it was
    computed with hashes, and a cached result is reused on later calls.

    Args:
        hashes: (bool) include MD5 hash for each image.
        xrefs: (bool) try to find the xref for each image. Sets hashes to true.
            Ignored for non-PDF documents.

    Returns:
        The list delivered by TextPage.extractIMGINFO; with 'xrefs', every item
        additionally gets an "xref" key (0 if no matching image was found).
    """
    doc = page.parent
    if xrefs and doc.is_pdf:
        hashes = True
    if not doc.is_pdf:
        xrefs = False

    imginfo = getattr(page, "_image_info", None)
    if imginfo and not xrefs:
        return imginfo  # cached result is sufficient

    if not imginfo:
        tp = page.get_textpage(flags=pymupdf.TEXT_PRESERVE_IMAGES)
        imginfo = tp.extractIMGINFO(hashes=hashes)
        del tp
        if hashes:
            page._image_info = imginfo

    if not xrefs or not doc.is_pdf:
        return imginfo

    # Map the digest of every image in the page's image list to its xref ...
    xref_by_digest = {}
    for entry in page.get_images():
        img_xref = entry[0]
        pix = pymupdf.Pixmap(doc, img_xref)
        xref_by_digest[pix.digest] = img_xref
        del pix

    # ... and use the map to annotate each info item.
    for info in imginfo:
        info["xref"] = xref_by_digest.get(info["digest"], 0)
    return imginfo
def get_image_rects(page: pymupdf.Page, name, transform=False) -> list:
    """Return list of image positions on a page.

    The image is identified via its digest: all entries of the page's image
    info having the same digest as the requested image are reported.

    Args:
        name: (str, list, int) image identification. May be reference name, an
            item of the page's image list or an xref.
        transform: (bool) whether to also return the transformation matrix.

    Returns:
        A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix)
        for all image locations on the page.

    Raises:
        ValueError: if a reference name matches no or more than one image.
    """
    if type(name) in (list, tuple):
        xref = name[0]
    elif type(name) is int:
        xref = name
    else:
        # reference name: must match exactly one item of the image list
        candidates = [img for img in page.get_images() if img[7] == name]
        if candidates == []:
            raise ValueError("bad image name")
        elif len(candidates) != 1:
            raise ValueError("multiple image names found")
        xref = candidates[0][0]

    pix = pymupdf.Pixmap(page.parent, xref)  # make pixmap of the image to compute MD5
    digest = pix.digest
    del pix

    matching = [
        info for info in page.get_image_info(hashes=True) if info["digest"] == digest
    ]
    if not transform:
        return [pymupdf.Rect(info["bbox"]) for info in matching]
    return [
        (pymupdf.Rect(info["bbox"]), pymupdf.Matrix(info["transform"]))
        for info in matching
    ]
def get_text(
    page: pymupdf.Page,
    option: str = "text",
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
):
    """Extract text from a page or an annotation.

    This is a unifying wrapper for various methods of the pymupdf.TextPage class.

    Args:
        option: (str) text, words, blocks, html, dict, json, rawdict, rawjson,
            xhtml or xml. Case is ignored; unknown values are treated as "text".
        clip: (rect-like) restrict output to this area.
        flags: bit switches to e.g. exclude images or decompose ligatures.
            If None, a default depending on 'option' is used.
        textpage: reuse this pymupdf.TextPage and make no new one. If specified,
            'flags' and 'clip' are ignored.
        sort: (bool) passed on to the extraction method where supported.
        delimiters: passed on to get_text_words for option "words".

    Returns:
        the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
        methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
        extractRAWJSON, extractXHTML or extractXML respectively.

    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    formats = {
        "text": pymupdf.TEXTFLAGS_TEXT,
        "html": pymupdf.TEXTFLAGS_HTML,
        "json": pymupdf.TEXTFLAGS_DICT,
        "rawjson": pymupdf.TEXTFLAGS_RAWDICT,
        "xml": pymupdf.TEXTFLAGS_XML,
        "xhtml": pymupdf.TEXTFLAGS_XHTML,
        "dict": pymupdf.TEXTFLAGS_DICT,
        "rawdict": pymupdf.TEXTFLAGS_RAWDICT,
        "words": pymupdf.TEXTFLAGS_WORDS,
        "blocks": pymupdf.TEXTFLAGS_BLOCKS,
    }
    option = option.lower()
    if option not in formats:
        option = "text"  # default and fallback for misspellings
    if flags is None:
        flags = formats[option]

    # "words" and "blocks" have their own functions
    if option == "words":
        return get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=sort,
            delimiters=delimiters,
        )
    if option == "blocks":
        return get_text_blocks(
            page, clip=clip, flags=flags, textpage=textpage, sort=sort
        )

    pymupdf.CheckParent(page)
    if option in ("html", "xml", "xhtml"):  # no clipping for MuPDF functions
        clip = page.cropbox

    # 'cb' is only set when no clip is in effect and we deal with a Page
    cb = None
    if clip is not None:
        clip = pymupdf.Rect(clip)
    elif type(page) is pymupdf.Page:
        cb = page.cropbox

    # pymupdf.TextPage with or without images
    if textpage is None:
        tp = page.get_textpage(clip=clip, flags=flags)
    elif getattr(textpage, "parent") != page:
        raise ValueError("not a textpage of this page")
    else:
        tp = textpage

    # extraction methods by option, grouped by the arguments they take
    structured = {
        "json": "extractJSON",
        "rawjson": "extractRAWJSON",
        "dict": "extractDICT",
        "rawdict": "extractRAWDICT",
    }
    markup = {
        "html": "extractHTML",
        "xml": "extractXML",
        "xhtml": "extractXHTML",
    }
    if option in structured:
        result = getattr(tp, structured[option])(cb=cb, sort=sort)
    elif option in markup:
        result = getattr(tp, markup[option])()
    else:
        result = tp.extractText(sort=sort)

    if textpage is None:
        del tp  # temporary textpage no longer needed
    return result
def get_page_text(
    doc: pymupdf.Document,
    pno: int,
    option: str = "text",
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> typing.Any:
    """Extract a document page's text by page number.

    Notes:
        Convenience function calling page.get_text().

    Args:
        pno: page number
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: (rect-like) restrict output to this area.
        flags: bit switches controlling the extraction.
        textpage: reuse this pymupdf.TextPage. It is handed to page.get_text(),
            which raises ValueError if it does not belong to the loaded page.
        sort: (bool) passed on to page.get_text().

    Returns:
        output of page.get_text().
    """
    # 'textpage' used to be accepted but silently dropped here; forward it
    # like search_page_for() does.
    return doc[pno].get_text(
        option,
        clip=clip,
        flags=flags,
        textpage=textpage,
        sort=sort,
    )
def get_pixmap(
    page: pymupdf.Page,
    *,
    matrix: matrix_like = pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace = pymupdf.csRGB,
    clip: rect_like = None,
    alpha: bool = False,
    annots: bool = True,
) -> pymupdf.Pixmap:
    """Create pixmap of page.

    Keyword args:
        matrix: Matrix for transformation (default: Identity).
        dpi: desired dots per inch. If given, matrix is ignored.
        colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
            Any other string selects csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) whether to include alpha channel
        annots: (bool) whether to also render annotations

    Raises:
        ValueError: if the colorspace does not have 1, 3 or 4 components.
    """
    if dpi:
        # a resolution overrides the matrix: 72 dpi corresponds to zoom 1
        factor = dpi / 72
        matrix = pymupdf.Matrix(factor, factor)

    if type(colorspace) is str:
        named_spaces = {"GRAY": pymupdf.csGRAY, "CMYK": pymupdf.csCMYK}
        colorspace = named_spaces.get(colorspace.upper(), pymupdf.csRGB)
    if colorspace.n not in (1, 3, 4):
        raise ValueError("unsupported colorspace")

    display_list = page.get_displaylist(annots=annots)
    pix = display_list.get_pixmap(
        matrix=matrix, colorspace=colorspace, alpha=alpha, clip=clip
    )
    display_list = None
    if dpi:
        pix.set_dpi(dpi, dpi)
    return pix
def get_page_pixmap(
    doc: pymupdf.Document,
    pno: int,
    *,
    matrix: matrix_like = pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace = pymupdf.csRGB,
    clip: rect_like = None,
    alpha: bool = False,
    annots: bool = True,
) -> pymupdf.Pixmap:
    """Create pixmap of document page by page number.

    Notes:
        Convenience function calling page.get_pixmap.

    Args:
        pno: (int) page number
        matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity).
        dpi: passed on to page.get_pixmap.
        colorspace: (str,pymupdf.Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) include alpha channel
        annots: (bool) also render annotations
    """
    target_page = doc[pno]
    return target_page.get_pixmap(
        matrix=matrix,
        dpi=dpi,
        colorspace=colorspace,
        clip=clip,
        alpha=alpha,
        annots=annots,
    )
def getLinkDict(ln, document=None) -> dict:
    """Convert a link or outline item into a link dictionary.

    Args:
        ln: a pymupdf.Outline or pymupdf.Link object.
        document: handed to Outline.destination() for outline items.

    Returns:
        A dict with keys "kind" and "xref" (always 0 here), "from" if the
        object has a rectangle, plus further keys depending on the link kind.
    """
    if isinstance(ln, pymupdf.Outline):
        dest = ln.destination(document)
    elif isinstance(ln, pymupdf.Link):
        dest = ln.dest
    else:
        assert 0, f'Unexpected {type(ln)=}.'

    kind = dest.kind
    nl = {"kind": kind, "xref": 0}
    try:
        nl["from"] = ln.rect
    except Exception:
        # This seems to happen quite often in PyMuPDF/tests.
        if g_exceptions_verbose:
            pymupdf.exception_info()

    # target point: only coordinates flagged as valid are taken over
    pnt = pymupdf.Point(0, 0)
    if dest.flags & pymupdf.LINK_FLAG_L_VALID:
        pnt.x = dest.lt.x
    if dest.flags & pymupdf.LINK_FLAG_T_VALID:
        pnt.y = dest.lt.y

    def zoom_value():
        """Zoom factor of the destination, 0.0 if none is specified."""
        if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM:
            return dest.rb.x
        return 0.0

    if kind == pymupdf.LINK_URI:
        nl["uri"] = dest.uri
    elif kind == pymupdf.LINK_GOTO:
        nl["page"] = dest.page
        nl["to"] = pnt
        nl["zoom"] = zoom_value()
    elif kind == pymupdf.LINK_GOTOR:
        nl["file"] = dest.file_spec.replace("\\", "/")
        nl["page"] = dest.page
        # a negative page number means the target is given by dest.dest
        nl["to"] = dest.dest if dest.page < 0 else pnt
        nl["zoom"] = zoom_value()
    elif kind == pymupdf.LINK_LAUNCH:
        nl["file"] = dest.file_spec.replace("\\", "/")
    elif kind == pymupdf.LINK_NAMED:
        # The dicts should not have same key(s).
        assert not (dest.named.keys() & nl.keys())
        nl.update(dest.named)
        if 'to' in nl:
            nl['to'] = pymupdf.Point(nl['to'])
    else:
        nl["page"] = dest.page
    return nl
def get_links(page: pymupdf.Page) -> list:
    """Create a list of all links contained in a PDF page.

    Notes:
        see PyMuPDF documentation for details.

    Returns:
        A list of link dictionaries as made by getLinkDict. For PDFs, "xref"
        and "id" are filled in when the number of link annotations equals the
        number of links found.
    """
    pymupdf.CheckParent(page)
    links = []
    ln = page.first_link
    while ln:
        links.append(getLinkDict(ln, page.parent))
        ln = ln.next

    if links != [] and page.parent.is_pdf:
        linkxrefs = [
            entry
            for entry in pymupdf.JM_get_annot_xref_list2(page)
            if entry[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        ]
        # only trust the positional matching if both lists have equal length
        if len(linkxrefs) == len(links):
            for link, entry in zip(links, linkxrefs):
                link["xref"] = entry[0]
                link["id"] = entry[2]
    return links
def get_toc(
    doc: pymupdf.Document,
    simple: bool = True,
) -> list:
    """Create a table of contents.

    Args:
        simple: a bool to control output. Returns a list, where each entry
            consists of outline level, title, page number and link destination
            (if simple = False). For details see PyMuPDF's documentation.

    Raises:
        ValueError: if the document is closed.
    """

    def recurse(node, collected, depth):
        """Recursively follow the outline item chain and record item information in a list."""
        while node and node.this.m_internal:
            title = node.title if node.title else " "

            # 1-based page number; -1 for external items and items without uri
            page = -1
            if not node.is_external and node.uri:
                if node.page == -1:
                    resolve = doc.resolve_link(node.uri)
                    page = resolve[0] + 1
                else:
                    page = node.page + 1

            entry = [depth, title, page]
            if not simple:
                entry.append(getLinkDict(node, doc))
            collected.append(entry)

            if node.down:
                collected = recurse(node.down, collected, depth + 1)
            node = node.next
        return collected

    # ensure document is open
    if doc.is_closed:
        raise ValueError("document closed")
    doc.init_doc()

    root = doc.outline
    if not root:
        return []

    toc = recurse(root, [], 1)
    if doc.is_pdf and simple is False:
        doc._extend_toc_items(toc)
    return toc
def del_toc_item(
    doc: pymupdf.Document,
    idx: int,
) -> None:
    """Delete TOC / bookmark item by index.

    Args:
        idx: index into the list returned by doc.get_outline_xrefs().
    """
    outline_xrefs = doc.get_outline_xrefs()
    doc._remove_toc_item(outline_xrefs[idx])
def set_toc_item(
    doc: pymupdf.Document,
    idx: int,
    dest_dict: OptDict = None,
    kind: OptInt = None,
    pno: OptInt = None,
    uri: OptStr = None,
    title: OptStr = None,
    to: point_like = None,
    filename: OptStr = None,
    zoom: float = 0,
) -> None:
    """Update TOC item by index.

    It allows changing the item's title and link destination.

    Args:
        idx:
            (int) desired index of the TOC list, as created by get_toc.
        dest_dict:
            (dict) destination dictionary as created by get_toc(False).
            Overrules all other parameters except 'title'. If None, the
            remaining parameters are used to make a dest dictionary. The
            passed-in dictionary is not modified.
        kind:
            (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only
            the title will be updated. If pymupdf.LINK_NONE, the TOC item will
            be deleted.
        pno:
            (int) page number (1-based like in get_toc). Required if
            pymupdf.LINK_GOTO.
        uri:
            (str) the URL, required if pymupdf.LINK_URI.
        title:
            (str) the new title. No change if None.
        to:
            (point-like) destination on the target page. If omitted, (72, 36)
            will be used as target coordinates.
        filename:
            (str) destination filename, required for pymupdf.LINK_GOTOR and
            pymupdf.LINK_LAUNCH.
        zoom:
            (float) a zoom factor for the target location (pymupdf.LINK_GOTO).

    Raises:
        ValueError: on a bad page number, color value or bookmark destination.
    """
    xref = doc.get_outline_xrefs()[idx]
    page_xref = 0

    if type(dest_dict) is dict:
        # Work on a shallow copy: the y-coordinate conversion below must not
        # leak into the caller's dictionary or Point object, otherwise reusing
        # the dictionary would flip the coordinate a second time.
        dest = dict(dest_dict)
        if dest["kind"] == pymupdf.LINK_GOTO:
            pno = dest["page"]
            page_xref = doc.page_xref(pno)
            page_height = doc.page_cropbox(pno).height
            to = pymupdf.Point(dest.get("to", pymupdf.Point(72, 36)))
            to.y = page_height - to.y  # measure from the opposite horizontal edge
            dest["to"] = to
        action = getDestStr(page_xref, dest)
        if not action.startswith("/A"):
            raise ValueError("bad bookmark dest")

        color = dest.get("color")
        if color:
            color = list(map(float, color))
            if len(color) != 3 or min(color) < 0 or max(color) > 1:
                raise ValueError("bad color value")
        bold = dest.get("bold", False)
        italic = dest.get("italic", False)
        flags = italic + 2 * bold  # bit 0: italic, bit 1: bold
        collapse = dest.get("collapse")
        return doc._update_toc_item(
            xref,
            action=action[2:],  # strip the leading "/A"
            title=title,
            color=color,
            flags=flags,
            collapse=collapse,
        )

    if kind == pymupdf.LINK_NONE:  # delete bookmark item
        return doc.del_toc_item(idx)
    if kind is None and title is None:  # treat as no-op
        return None
    if kind is None:  # only update title text
        return doc._update_toc_item(xref, action=None, title=title)

    if kind == pymupdf.LINK_GOTO:
        if pno is None or pno not in range(1, doc.page_count + 1):
            raise ValueError("bad page number")
        page_xref = doc.page_xref(pno - 1)
        page_height = doc.page_cropbox(pno - 1).height
        if to is None:
            to = pymupdf.Point(72, page_height - 36)
        else:
            to = pymupdf.Point(to)
            to.y = page_height - to.y

    ddict = {
        "kind": kind,
        "to": to,
        "uri": uri,
        "page": pno,
        "file": filename,
        "zoom": zoom,
    }
    action = getDestStr(page_xref, ddict)
    if action == "" or not action.startswith("/A"):
        raise ValueError("bad bookmark dest")

    return doc._update_toc_item(xref, action=action[2:], title=title)
def get_area(*args) -> float:
    """Calculate area of rectangle.\nparameter is one of 'px' (default), 'in', 'cm', or 'mm'."""
    rect = args[0]
    unit = args[1] if len(args) > 1 else "px"
    # per unit: (numerator, denominator) of the linear conversion factor
    conversions = {
        "px": (1, 1),
        "in": (1.0, 72.0),
        "cm": (2.54, 72.0),
        "mm": (25.4, 72.0),
    }
    numerator, denominator = conversions[unit]
    factor = (numerator / denominator) ** 2  # areas scale with the square
    return factor * rect.width * rect.height
def set_metadata(doc: pymupdf.Document, m: dict) -> None:
    """Write a metadata dictionary to the PDF /Info object.

    Args:
        m: a dictionary like doc.metadata. Keys 'format' and 'encryption'
            are accepted but never written. Values that are falsy or equal
            to 'none' / 'null' are stored as PDF null. An empty dictionary
            sets the trailer's /Info entry to null.
    Raises:
        ValueError: document is no PDF, is closed or encrypted, 'm' is not
            a dict, or 'm' contains keys not listed in the key map.
    """
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if type(m) is not dict:
        raise ValueError("bad metadata")

    # metadata key -> key name in the /Info object (None: not stored there)
    keymap = {
        "author": "Author",
        "producer": "Producer",
        "creator": "Creator",
        "title": "Title",
        "format": None,
        "encryption": None,
        "creationDate": "CreationDate",
        "modDate": "ModDate",
        "subject": "Subject",
        "keywords": "Keywords",
        "trapped": "Trapped",
    }
    unknown_keys = set(m.keys()).difference(set(keymap.keys()))
    if unknown_keys:
        raise ValueError("bad dict key(s): %s" % unknown_keys)

    # look up the /Info entry of the PDF trailer (xref -1)
    entry_type, entry_text = doc.xref_get_key(-1, "Info")
    if entry_type == "xref":
        info_xref = int(entry_text.replace("0 R", ""))
    else:
        info_xref = 0

    if not m:
        # empty dictionary: drop the reference to existing metadata, if any
        if info_xref != 0:
            doc.xref_set_key(-1, "Info", "null")
        return

    if info_xref == 0:  # no previous metadata: create an empty /Info object
        info_xref = doc.get_new_xref()
        doc.update_object(info_xref, "<<>>")
        doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref)

    for key, val in m.items():
        pdf_key = keymap[key]
        if pdf_key is None:  # informational key only
            continue
        if not val or val in ("none", "null"):
            pdf_val = "null"
        else:
            pdf_val = pymupdf.get_pdf_str(val)
        doc.xref_set_key(info_xref, pdf_key, pdf_val)
    doc.init_doc()
    return
def getDestStr(xref: int, ddict: dict) -> str:
    """Build the PDF '/A' action string for a destination.

    Notes:
        Supports Link annotations and outline items (bookmarks).
    Args:
        xref: object number inserted into GoTo destinations.
        ddict: destination dictionary with a 'kind' entry, or a plain
            number which is used as the second /XYZ value of a GoTo action.
    Returns:
        The action string, or "" if 'ddict' is falsy, its kind is missing,
        LINK_NONE or not handled here.
    """
    if not ddict:
        return ""

    # a bare number: GoTo action with only the middle /XYZ value set
    if type(ddict) in (int, float):
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((0, ddict, 0))}]>>"

    kind = ddict.get("kind", pymupdf.LINK_NONE)
    if kind == pymupdf.LINK_NONE:
        return ""

    if kind == pymupdf.LINK_GOTO:
        zoom = ddict.get("zoom", 0)
        left, top = ddict.get("to", pymupdf.Point(0, 0))
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((left, top, zoom))}]>>"

    if kind == pymupdf.LINK_URI:
        uri = pymupdf.get_pdf_str(ddict["uri"])
        return f"/A<</S/URI/URI{uri}>>"

    if kind == pymupdf.LINK_LAUNCH:
        fspec = pymupdf.get_pdf_str(ddict["file"])
        return f"/A<</S/Launch/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    if kind == pymupdf.LINK_GOTOR:
        page = ddict["page"]
        if page < 0:  # destination is passed through get_pdf_str
            fspec = pymupdf.get_pdf_str(ddict["file"])
            target = pymupdf.get_pdf_str(ddict["to"])
            return f"/A<</S/GoToR/D{target}/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"
        if page >= 0:  # destination is a page number plus a point
            fspec = pymupdf.get_pdf_str(ddict["file"])
            point = ddict["to"]
            xyz = (point.x, point.y, ddict["zoom"])
            return f"/A<</S/GoToR/D[{page} /XYZ {_format_g(xyz)}]/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    return ""
def set_toc(
    doc: pymupdf.Document,
    toc: list,
    collapse: int = 1,
) -> int:
    """Create new outline tree (table of contents, TOC).

    Any existing outline is deleted first. Then one PDF outline object per
    'toc' row is written, together with the updated outline root object.

    Args:
        toc: (list, tuple) each entry must contain level, title, page and
            optionally a fourth item: either a number (top margin on the
            page) or a dictionary describing the destination.
            None or '()' remove the TOC.
        collapse: (int) collapses entries beyond this level. Zero or None
            shows all entries unfolded.
    Returns:
        the number of inserted items, or the number of removed items respectively.
    Raises:
        ValueError: document is closed, encrypted or no PDF, or 'toc' does
            not pass the validity checks.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if not toc:  # remove all entries
        return len(doc._delToC())

    # validity checks --------------------------------------------------------
    if type(toc) not in (list, tuple):
        raise ValueError("'toc' must be list or tuple")
    toclen = len(toc)
    page_count = doc.page_count
    t0 = toc[0]
    if type(t0) not in (list, tuple):
        raise ValueError("items must be sequences of 3 or 4 items")
    if t0[0] != 1:
        raise ValueError("hierarchy level of item 0 must be 1")
    # Compare each row (t1) with its successor (t2).
    # NOTE(review): the page number check only looks at t1, i.e. rows
    # 0 .. toclen-2; the page number of the last row and the length of row 0
    # are not validated here. Page numbers are clamped into the valid range
    # when the items are built further down.
    for i in list(range(toclen - 1)):
        t1 = toc[i]
        t2 = toc[i + 1]
        if not -1 <= t1[2] <= page_count:
            raise ValueError("row %i: page number out of range" % i)
        if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4):
            raise ValueError("bad row %i" % (i + 1))
        if (type(t2[0]) is not int) or t2[0] < 1:
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
        if t2[0] > t1[0] + 1:  # a level may increase by at most 1 per row
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
    # no formal errors in toc --------------------------------------------------

    # --------------------------------------------------------------------------
    # make a list of xref numbers, which we can use for our TOC entries
    # --------------------------------------------------------------------------
    old_xrefs = doc._delToC()  # del old outlines, get their xref numbers

    # prepare table of xrefs for new bookmarks
    # NOTE(review): the xrefs returned above are discarded by the next line,
    # so every item receives a newly acquired xref below.
    old_xrefs = []
    xref = [0] + old_xrefs
    xref[0] = doc._getOLRootNumber()  # entry zero is outline root xref number
    if toclen > len(old_xrefs):  # too few old xrefs?
        for i in range((toclen - len(old_xrefs))):
            xref.append(doc.get_new_xref())  # acquire new ones

    lvltab = {0: 0}  # to store last entry per hierarchy level

    # ------------------------------------------------------------------------------
    # contains new outline objects as dictionaries - first one is the outline root
    # ------------------------------------------------------------------------------
    olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}]
    # ------------------------------------------------------------------------------
    # build olitems as a list of PDF-like connected dictionaries
    # ------------------------------------------------------------------------------
    for i in range(toclen):
        o = toc[i]
        lvl = o[0]  # level
        title = pymupdf.get_pdf_str(o[1])  # title
        # 0-based page number, clamped to the existing pages
        pno = min(doc.page_count - 1, max(0, o[2] - 1))  # page number
        page_xref = doc.page_xref(pno)
        page_height = doc.page_cropbox(pno).height
        top = pymupdf.Point(72, page_height - 36)
        dest_dict = {"to": top, "kind": pymupdf.LINK_GOTO}  # fall back target
        if o[2] < 0:  # negative page number: item without a destination
            dest_dict["kind"] = pymupdf.LINK_NONE
        if len(o) > 3:  # some target is specified
            if type(o[3]) in (int, float):  # convert a number to a point
                dest_dict["to"] = pymupdf.Point(72, page_height - o[3])
            else:  # if something else, make sure we have a dict
                # We make a copy of o[3] to avoid modifying our caller's data.
                dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict
                if "to" not in dest_dict:  # target point not in dict?
                    dest_dict["to"] = top  # put default in
                else:  # transform target to PDF coordinates
                    page = doc[pno]
                    point = pymupdf.Point(dest_dict["to"])
                    point.y = page.cropbox.height - point.y
                    point = point * page.rotation_matrix
                    dest_dict["to"] = (point.x, point.y)
        # the new outline item; links are indices into 'olitems', -1 = unset
        d = {}
        d["first"] = -1
        d["count"] = 0
        d["last"] = -1
        d["prev"] = -1
        d["next"] = -1
        d["dest"] = getDestStr(page_xref, dest_dict)
        d["top"] = dest_dict["to"]
        d["title"] = title
        d["parent"] = lvltab[lvl - 1]
        d["xref"] = xref[i + 1]
        d["color"] = dest_dict.get("color")
        d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0)
        lvltab[lvl] = i + 1  # this item is now the last one seen on its level
        parent = olitems[lvltab[lvl - 1]]  # the parent entry

        # evaluates as: item's own "collapse" OR (collapse AND lvl > collapse)
        if (
            dest_dict.get("collapse") or collapse and lvl > collapse
        ):  # suppress expansion
            parent["count"] -= 1  # make /Count negative
        else:
            parent["count"] += 1  # positive /Count

        if parent["first"] == -1:  # first child of this parent
            parent["first"] = i + 1
            parent["last"] = i + 1
        else:  # append behind the parent's current last child
            d["prev"] = parent["last"]
            prev = olitems[parent["last"]]
            prev["next"] = i + 1
            parent["last"] = i + 1
        olitems.append(d)

    # ------------------------------------------------------------------------------
    # now create each outline item as a string and insert it in the PDF
    # ------------------------------------------------------------------------------
    for i, ol in enumerate(olitems):
        txt = "<<"
        if ol["count"] != 0:
            txt += "/Count %i" % ol["count"]
        # The outline root (item 0) has no "dest", "next", "parent", "prev"
        # or "title" key; for it the corresponding lookups below raise and
        # the entry is skipped.
        try:
            txt += ol["dest"]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose: pymupdf.exception_info()
            pass
        try:
            if ol["first"] > -1:
                txt += "/First %i 0 R" % xref[ol["first"]]
        except Exception:
            if g_exceptions_verbose: pymupdf.exception_info()
            pass
        try:
            if ol["last"] > -1:
                txt += "/Last %i 0 R" % xref[ol["last"]]
        except Exception:
            if g_exceptions_verbose: pymupdf.exception_info()
            pass
        try:
            if ol["next"] > -1:
                txt += "/Next %i 0 R" % xref[ol["next"]]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose: pymupdf.exception_info()
            pass
        try:
            if ol["parent"] > -1:
                txt += "/Parent %i 0 R" % xref[ol["parent"]]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose: pymupdf.exception_info()
            pass
        try:
            if ol["prev"] > -1:
                txt += "/Prev %i 0 R" % xref[ol["prev"]]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose: pymupdf.exception_info()
            pass
        try:
            txt += "/Title" + ol["title"]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose: pymupdf.exception_info()
            pass

        # optional color: written only if it has exactly 3 components
        if ol.get("color") and len(ol["color"]) == 3:
            txt += f"/C[ {_format_g(tuple(ol['color']))}]"
        # optional style flags: italic + 2 * bold, as computed above
        if ol.get("flags", 0) > 0:
            txt += "/F %i" % ol["flags"]

        if i == 0:  # special: this is the outline root
            txt += "/Type/Outlines"  # so add the /Type entry
        txt += ">>"
        doc.update_object(xref[i], txt)  # insert the PDF object

    doc.init_doc()
    return toclen
def do_links(
    doc1: pymupdf.Document,
    doc2: pymupdf.Document,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
) -> None:
    """Insert links contained in copied page range into destination PDF.

    Parameter values **must** equal those of method insert_pdf(), which must
    have been previously executed.

    Args:
        doc1: destination PDF, receives the links.
        doc2: source PDF whose pages were copied.
        from_page: first source page. Negative values mean page 0, values
            beyond the last page mean the last page of doc2.
        to_page: last source page. Negative or too large values mean the
            last page of doc2. May be smaller than from_page (reversed range).
        start_at: page number in doc1 of the first copied page.
    Raises:
        ValueError: if 'start_at' is negative.
    """

    # --------------------------------------------------------------------------
    # internal function to create the actual "/Annots" object string
    # --------------------------------------------------------------------------
    def cre_annot(lnk, xref_dst, pno_src, ctm):
        """Create annotation object string for a passed-in link.

        Returns "" for link kinds that are not handled.
        """
        r = lnk["from"] * ctm  # rect in PDF coordinates
        rect = _format_g(tuple(r))
        if lnk["kind"] == pymupdf.LINK_GOTO:
            txt = pymupdf.annot_skel["goto1"]  # annot_goto
            # position of the target page within the copied range
            idx = pno_src.index(lnk["page"])
            p = lnk["to"] * ctm  # target point in PDF coordinates
            annot = txt(xref_dst[idx], p.x, p.y, lnk["zoom"], rect)

        elif lnk["kind"] == pymupdf.LINK_GOTOR:
            if lnk["page"] >= 0:
                txt = pymupdf.annot_skel["gotor1"]  # annot_gotor
                pnt = lnk.get("to", pymupdf.Point(0, 0))  # destination point
                if type(pnt) is not pymupdf.Point:
                    pnt = pymupdf.Point(0, 0)
                # The skeleton entries are callables (every other branch and
                # getLinkText() call them), so call this one too instead of
                # applying the '%' operator, which raises TypeError.
                annot = txt(
                    lnk["page"],
                    pnt.x,
                    pnt.y,
                    lnk["zoom"],
                    lnk["file"],
                    lnk["file"],
                    rect,
                )
            else:
                txt = pymupdf.annot_skel["gotor2"]  # annot_gotor_n
                to = pymupdf.get_pdf_str(lnk["to"])
                to = to[1:-1]  # drop first and last character of the PDF string
                f = lnk["file"]
                annot = txt(to, f, rect)

        elif lnk["kind"] == pymupdf.LINK_LAUNCH:
            txt = pymupdf.annot_skel["launch"]  # annot_launch
            annot = txt(lnk["file"], lnk["file"], rect)

        elif lnk["kind"] == pymupdf.LINK_URI:
            txt = pymupdf.annot_skel["uri"]  # annot_uri
            annot = txt(lnk["uri"], rect)

        else:
            annot = ""

        return annot

    # --------------------------------------------------------------------------

    # validate & normalize parameters
    if from_page < 0:
        fp = 0
    elif from_page >= doc2.page_count:
        fp = doc2.page_count - 1
    else:
        fp = from_page
    if to_page < 0 or to_page >= doc2.page_count:
        tp = doc2.page_count - 1
    else:
        tp = to_page
    if start_at < 0:
        raise ValueError("'start_at' must be >= 0")
    sa = start_at

    incr = 1 if fp <= tp else -1  # page range could be reversed

    # lists of source / destination page numbers
    pno_src = list(range(fp, tp + incr, incr))
    pno_dst = [sa + i for i in range(len(pno_src))]

    # xrefs of the destination pages, needed for GOTO link targets
    xref_dst = [doc1.page_xref(p_dst) for p_dst in pno_dst]

    # create the links for each copied page in destination PDF
    for p_src, p_dst in zip(pno_src, pno_dst):
        page_src = doc2[p_src]  # load source page
        links = page_src.get_links()  # get all its links
        if not links:  # no links there
            continue
        # inverse of the source page's transformation matrix
        ctm = ~page_src.transformation_matrix
        page_dst = doc1[p_dst]  # load destination page
        link_tab = []  # store all link definitions here
        for lnk in links:
            if lnk["kind"] == pymupdf.LINK_GOTO and lnk["page"] not in pno_src:
                continue  # GOTO link target not in copied pages
            annot_text = cre_annot(lnk, xref_dst, pno_src, ctm)
            if annot_text:
                link_tab.append(annot_text)
        if link_tab:
            page_dst._addAnnot_FromString(tuple(link_tab))
def getLinkText(page: pymupdf.Page, lnk: dict) -> str:
    """Return the PDF object text of a link annotation for a link dictionary.

    The text is produced by the entry of pymupdf.annot_skel that matches
    lnk["kind"]; afterwards a /NM (name) key is inserted behind "/Link".

    Args:
        page: the page owning the link.
        lnk: link dictionary. "kind" and "from" are always read, the other
            keys depend on the kind.
    Returns:
        The annotation text, or "" if the link kind is not handled here.
    """
    ictm = ~page.transformation_matrix
    rect = _format_g(tuple(lnk["from"] * ictm))
    skeletons = pymupdf.annot_skel
    kind = lnk["kind"]
    annot = ""

    if kind == pymupdf.LINK_GOTO:
        if lnk["page"] >= 0:  # target given as page number and point
            make = skeletons["goto1"]
            target_xref = page.parent.page_xref(lnk["page"])
            target = lnk.get("to", pymupdf.Point(0, 0)) * ictm
            annot = make(target_xref, target.x, target.y, lnk.get("zoom", 0), rect)
        else:  # target given by lnk["to"], passed through get_pdf_str
            make = skeletons["goto2"]
            annot = make(pymupdf.get_pdf_str(lnk["to"]), rect)

    elif kind == pymupdf.LINK_GOTOR:
        if lnk["page"] >= 0:
            make = skeletons["gotor1"]
            target = lnk.get("to", pymupdf.Point(0, 0))
            if type(target) is not pymupdf.Point:  # anything else: use origin
                target = pymupdf.Point(0, 0)
            annot = make(
                lnk["page"],
                target.x,
                target.y,
                lnk.get("zoom", 0),
                lnk["file"],
                lnk["file"],
                rect,
            )
        else:
            make = skeletons["gotor2"]
            annot = make(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect)

    elif kind == pymupdf.LINK_LAUNCH:
        make = skeletons["launch"]
        annot = make(lnk["file"], lnk["file"], rect)

    elif kind == pymupdf.LINK_URI:
        make = skeletons["uri"]
        annot = make(lnk["uri"], rect)

    elif kind == pymupdf.LINK_NAMED:
        make = skeletons["named"]
        lname = lnk.get("name")
        if lname is None:  # key missing: fall back to the alternative key
            lname = lnk["nameddest"]
        annot = make(lname, rect)

    if not annot:
        return annot

    # xref -> id of the link annotations already present on the page
    link_names = {
        x[0]: x[2] for x in page.annot_xrefs() if x[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
    }

    old_name = lnk.get("id", "")  # id value in the argument
    if old_name and (lnk["xref"], old_name) in link_names.items():
        name = old_name  # pure update of an existing link: keep its name
    else:
        # take the first "<stem>-L<i>" that is not used on this page yet
        stem = pymupdf.TOOLS.set_annot_stem() + "-L%i"
        i = 0
        name = stem % i
        while name in link_names.values():
            i += 1
            name = stem % i

    # add /NM key to object definition
    # NOTE(review): str.replace rewrites every "/Link" occurrence in the text.
    annot = annot.replace("/Link", "/Link/NM(%s)" % name)
    return annot
def delete_widget(page: pymupdf.Page, widget: pymupdf.Widget) -> pymupdf.Widget:
    """Remove a widget from the page and hand back its successor.

    After deletion, all instance attributes of 'widget' are removed.

    Raises:
        ValueError: if 'widget' has no '_annot' attribute (or it is None).
    """
    pymupdf.CheckParent(page)
    annot = getattr(widget, "_annot", None)
    if annot is None:
        raise ValueError("bad type: widget")
    successor = widget.next  # read before the widget's attributes are wiped
    page.delete_annot(annot)
    widget._annot.parent = None
    # wipe the instance dictionary of the deleted widget
    for attr_name in list(widget.__dict__):
        del widget.__dict__[attr_name]
    return successor
def update_link(page: pymupdf.Page, lnk: dict) -> None:
    """Replace the definition of an existing link on the page.

    The object with xref lnk["xref"] is overwritten by the text that
    getLinkText() generates for 'lnk'.

    Raises:
        ValueError: if getLinkText() returns an empty string.
    """
    pymupdf.CheckParent(page)
    link_text = getLinkText(page, lnk)
    if not link_text:
        raise ValueError("link kind not supported")
    page.parent.update_object(lnk["xref"], link_text, page=page)
def insert_link(page: pymupdf.Page, lnk: dict, mark: bool = True) -> None:
    """Add a new link to the page.

    The annotation text is generated by getLinkText(). Parameter 'mark'
    is not used by this function.

    Raises:
        ValueError: if getLinkText() returns an empty string.
    """
    pymupdf.CheckParent(page)
    link_text = getLinkText(page, lnk)
    if not link_text:
        raise ValueError("link kind not supported")
    page._addAnnot_FromString((link_text,))
def insert_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    buffer: typing.Union[str, list],
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    align: int = 0,
    rotate: int = 0,
    render_mode: int = 0,
    border_width: float = 0.05,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Put text into a rectangle of the page.

    Notes:
        Delegates to the same-named method of a new Shape object and
        commits the shape only if that method returns a value >= 0.
    Parameters:
        rect: (rect-like) area to use for text.
        buffer: text to be inserted
        fontname: a Base-14 font, font name or '/name'
        fontfile: name of a font file
        fontsize: font size
        lineheight: overwrite the font property
        color: RGB color triple
        expandtabs: handles tabulators with string function
        align: left, center, right, justified
        rotate: 0, 90, 180, or 270 degrees
        morph: morph box with a matrix and a fixpoint
        overlay: put text in foreground or background
    Returns:
        the value returned by Shape.insert_textbox. If it is negative,
        nothing has been committed to the page.
    """
    shape = page.new_shape()
    text_options = dict(
        fontsize=fontsize,
        lineheight=lineheight,
        fontname=fontname,
        fontfile=fontfile,
        set_simple=set_simple,
        encoding=encoding,
        color=color,
        fill=fill,
        expandtabs=expandtabs,
        render_mode=render_mode,
        border_width=border_width,
        align=align,
        rotate=rotate,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    result = shape.insert_textbox(rect, buffer, **text_options)
    if result >= 0:  # write to the page only on success
        shape.commit(overlay)
    return result
def insert_text(
    page: pymupdf.Page,
    point: point_like,
    text: typing.Union[str, list],
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    border_width: float = 0.05,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
):
    """Put text on the page, starting at a point.

    Notes:
        Delegates to the same-named method of a new Shape object and
        commits the shape only if that method returns a value >= 0.
    Returns:
        the value returned by Shape.insert_text.
    """
    shape = page.new_shape()
    text_options = dict(
        fontsize=fontsize,
        lineheight=lineheight,
        fontname=fontname,
        fontfile=fontfile,
        set_simple=set_simple,
        encoding=encoding,
        color=color,
        fill=fill,
        border_width=border_width,
        render_mode=render_mode,
        rotate=rotate,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    result = shape.insert_text(point, text, **text_options)
    if result >= 0:  # write to the page only on success
        shape.commit(overlay)
    return result
def insert_htmlbox(
    page,
    rect,
    text,
    *,
    css=None,
    scale_low=0,
    archive=None,
    rotate=0,
    oc=0,
    opacity=1,
    overlay=True,
) -> tuple:
    """Insert text with optional HTML tags and stylings into a rectangle.

    Args:
        rect: (rect-like) rectangle into which the text should be placed.
        text: (str or Story) text with optional HTML tags and stylings, or
            an existing pymupdf.Story. For a Story, 'css' and 'archive'
            are not used.
        css: (str) CSS styling commands.
        scale_low: (float) force-fit content by scaling it down. Must be in
            range [0, 1]. If 1, no scaling will take place. If 0, arbitrary
            down-scaling is acceptable. A value of 0.1 would mean that content
            may be scaled down by at most 90%.
        archive: Archive object pointing to locations of used fonts or images
        rotate: (int) rotate the text in the box by a multiple of 90 degrees.
        oc: (int) the xref of an OCG / OCMD (Optional Content).
        opacity: (float) set opacity of inserted content.
        overlay: (bool) put text on top of page content.

    Returns:
        A tuple (spare_height, scale).
        spare_height: -1 if content did not fit, else >= 0. It is the height of the
               unused (still available) rectangle stripe. Positive only if
               no down scaling occurred (scale = 1).
        scale: downscaling factor, 0 < scale <= 1. If the content did not fit
               (spare_height = -1), the value of 'scale_low' is returned here.
    Raises:
        ValueError: 'rotate' is no multiple of 90, 'scale_low' is outside
            [0, 1], or 'text' is neither a string nor a Story.
    """
    # normalize rotation angle to one of 0, 90, 180, 270
    if rotate % 90 != 0:
        raise ValueError("bad rotation angle")
    rotate %= 360

    if not 0 <= scale_low <= 1:
        raise ValueError("'scale_low' must be in [0, 1]")

    if css is None:
        css = ""

    rect = pymupdf.Rect(rect)
    # the story is laid out unrotated: swap width / height for 90 and 270
    if rotate in (90, 270):
        temp_rect = pymupdf.Rect(0, 0, rect.height, rect.width)
    else:
        temp_rect = pymupdf.Rect(0, 0, rect.width, rect.height)

    # use a small border by default
    mycss = "body {margin:1px;}" + css  # append user CSS

    # either make a story, or accept a given one
    if isinstance(text, str):  # if a string, convert to a Story
        story = pymupdf.Story(html=text, user_css=mycss, archive=archive)
    elif isinstance(text, pymupdf.Story):
        story = text
    else:
        raise ValueError("'text' must be a string or a Story")
    # ----------------------------------------------------------------
    # Find a scaling factor that lets our story fit in
    # ----------------------------------------------------------------
    # The rectangle is enlarged instead of shrinking the content, so the
    # largest allowed enlargement is the reciprocal of 'scale_low'.
    scale_max = None if scale_low == 0 else 1 / scale_low

    fit = story.fit_scale(temp_rect, scale_min=1, scale_max=scale_max)
    if fit.big_enough is False:  # there was no fit
        return (-1, scale_low)

    filled = fit.filled
    scale = 1 / fit.parameter  # shrink factor

    spare_height = fit.rect.y1 - filled[3]  # unused room at rectangle bottom
    # Note: due to MuPDF's logic this may be negative even for successful fits.
    if scale != 1 or spare_height < 0:  # if scaling occurred, set spare_height to 0
        spare_height = 0

    def rect_function(*args):
        """Deliver the fitted rectangle for every request of the story writer."""
        return fit.rect, fit.rect, pymupdf.Identity

    # draw story on temp PDF page
    doc = story.write_with_links(rect_function)

    # Insert opacity if requested.
    # For this, we prepend a command to the /Contents.
    if 0 <= opacity < 1:
        tpage = doc[0]  # load page
        # generate /ExtGstate for the page
        alp0 = tpage._set_opacity(CA=opacity, ca=opacity)
        s = f"/{alp0} gs\n"  # generate graphic state command
        pymupdf.TOOLS._insert_contents(tpage, s.encode(), 0)

    # put result in target page
    page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay)

    # -------------------------------------------------------------------------
    # re-insert links in target rect (show_pdf_page cannot copy annotations)
    # -------------------------------------------------------------------------
    # scaled center point of fit.rect
    mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale

    # center point of target rect
    mp2 = (rect.tl + rect.br) / 2

    # compute link positioning matrix:
    # - move center of scaled-down fit.rect to (0,0)
    # - rotate
    # - move (0,0) to center of target rect
    mat = (
        pymupdf.Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y)
        * pymupdf.Matrix(-rotate)
        * pymupdf.Matrix(1, 0, 0, 1, mp2.x, mp2.y)
    )

    # copy over links
    for link in doc[0].get_links():
        link["from"] *= mat
        page.insert_link(link)

    return spare_height, scale
def new_page(
    doc: pymupdf.Document,
    pno: int = -1,
    width: float = 595,
    height: float = 842,
) -> pymupdf.Page:
    """Add a new page to the document and return it.

    Args:
        pno: (int) insert before this page. Default: after last page.
        width: (float) page width in points. Default: 595 (ISO A4 width).
        height: (float) page height in points. Default 842 (ISO A4 height).
    Returns:
        The page object found at index 'pno' after the insertion.
    """
    dimensions = {"width": width, "height": height}
    doc._newPage(pno, **dimensions)
    created = doc[pno]
    return created
def insert_page(
    doc: pymupdf.Document,
    pno: int,
    text: typing.Union[str, list, None] = None,
    fontsize: float = 11,
    width: float = 595,
    height: float = 842,
    fontname: str = "helv",
    fontfile: OptStr = None,
    color: OptSeq = (0,),
) -> int:
    """Create a new PDF page and optionally put some text on it.

    Notes:
        Combines pymupdf.Document.new_page() and pymupdf.Page.insert_text();
        the text is inserted at point (50, 72).
        For parameter details see these methods.
    Returns:
        0 if 'text' is empty or None, else the result of insert_text().
    """
    page = doc.new_page(pno=pno, width=width, height=height)
    if not text:  # nothing to write: the empty page is all we need
        return 0
    start_point = (50, 72)
    return page.insert_text(
        start_point,
        text,
        fontsize=fontsize,
        fontname=fontname,
        fontfile=fontfile,
        color=color,
    )
def draw_line(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc=0,
) -> pymupdf.Point:
    """Draw a straight line from point p1 to point p2.

    Uses a new Shape of the page: draws, finishes with the given
    properties (path not closed) and commits it.

    Returns:
        the value returned by Shape.draw_line.
    """
    shape = page.new_shape()
    result = shape.draw_line(pymupdf.Point(p1), pymupdf.Point(p2))
    properties = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
def draw_squiggle(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a squiggly (wavy) line from point p1 to point p2.

    Uses a new Shape of the page: draws, finishes with the given
    properties (path not closed) and commits it.

    Returns:
        the value returned by Shape.draw_squiggle.
    """
    shape = page.new_shape()
    result = shape.draw_squiggle(
        pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth
    )
    properties = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
def draw_zigzag(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a zigzag line from point p1 to point p2.

    Uses a new Shape of the page: draws, finishes with the given
    properties (path not closed) and commits it.

    Returns:
        the value returned by Shape.draw_zigzag.
    """
    shape = page.new_shape()
    result = shape.draw_zigzag(
        pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth
    )
    properties = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
def draw_rect(
    page: pymupdf.Page,
    rect: rect_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
    radius=None,
) -> pymupdf.Point:
    """Draw a rectangle. See Shape class method for details.

    Uses a new Shape of the page: draws (passing 'radius' on), finishes
    with the given properties and commits it.

    Returns:
        the value returned by Shape.draw_rect.
    """
    shape = page.new_shape()
    result = shape.draw_rect(pymupdf.Rect(rect), radius=radius)
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
def draw_quad(
    page: pymupdf.Page,
    quad: quad_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a quadrilateral.

    Uses a new Shape of the page: draws, finishes with the given
    properties and commits it.

    Returns:
        the value returned by Shape.draw_quad.
    """
    shape = page.new_shape()
    result = shape.draw_quad(pymupdf.Quad(quad))
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
def draw_polyline(
    page: pymupdf.Page,
    points: list,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    closePath: bool = False,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw several connected line segments through the given points.

    Uses a new Shape of the page: draws, finishes with the given
    properties and commits it. 'points' is handed to the Shape unchanged.

    Returns:
        the value returned by Shape.draw_polyline.
    """
    shape = page.new_shape()
    result = shape.draw_polyline(points)
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
def draw_circle(
    page: pymupdf.Page,
    center: point_like,
    radius: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    morph: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle defined by its center point and radius.

    Uses a new Shape of the page: draws, finishes with the given
    properties and commits it.

    Returns:
        the value returned by Shape.draw_circle.
    """
    shape = page.new_shape()
    result = shape.draw_circle(pymupdf.Point(center), radius)
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
def draw_oval(
    page: pymupdf.Page,
    rect: typing.Union[rect_like, quad_like],
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    morph: OptSeq = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw an oval inside its containing rectangle or quad.

    Uses a new Shape of the page: draws, finishes with the given
    properties and commits it. 'rect' is handed to the Shape unchanged.

    Returns:
        the value returned by Shape.draw_oval.
    """
    shape = page.new_shape()
    result = shape.draw_oval(rect)
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
def draw_curve(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a special Bezier curve from p1 to p3, generating control points on lines p1 to p2 and p2 to p3.

    Uses a new Shape of the page: draws, finishes with the given
    properties and commits it.

    Returns:
        the value returned by Shape.draw_curve.
    """
    shape = page.new_shape()
    start, helper, end = pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3)
    result = shape.draw_curve(start, helper, end)
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
def draw_bezier(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a general cubic Bezier curve from p1 to p4 using control points p2 and p3.

    Uses a new Shape of the page: draws, finishes with the given
    properties and commits it.

    Notes:
        'morph' is annotated as an optional sequence, like in all other
        draw functions of this module which forward it to Shape.finish().
    Returns:
        the value returned by Shape.draw_bezier.
    """
    img = page.new_shape()
    Q = img.draw_bezier(
        pymupdf.Point(p1),
        pymupdf.Point(p2),
        pymupdf.Point(p3),
        pymupdf.Point(p4),
    )
    img.finish(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    img.commit(overlay)
    return Q
def draw_sector(
    page: pymupdf.Page,
    center: point_like,
    point: point_like,
    beta: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    fullSector: bool = True,
    morph: OptSeq = None,
    width: float = 1,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle sector from the circle center, one arc end point and the arc angle.

    Uses a new Shape of the page: draws, finishes with the given
    properties and commits it.

    Parameters:
        center -- center of circle
        point -- arc end point
        beta -- angle of arc (degrees)
        fullSector -- connect arc ends with center
    Returns:
        the value returned by Shape.draw_sector.
    """
    shape = page.new_shape()
    result = shape.draw_sector(
        pymupdf.Point(center),
        pymupdf.Point(point),
        beta,
        fullSector=fullSector,
    )
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return result
# ----------------------------------------------------------------------
# Name: wx.lib.colourdb.py
# Purpose: Adds a bunch of colour names and RGB values to the
# colour database so they can be found by name
#
# Author: Robin Dunn
#
# Created: 13-March-2001
# Copyright: (c) 2001-2017 by Total Control Software
# Licence: wxWindows license
# Tags: phoenix-port, unittest, documented
# ----------------------------------------------------------------------
def getColorList() -> list:
    """
    Returns only the colour names known to this module, i.e. the first
    item of every tuple delivered by getColorInfoList().

    :rtype: list of strings
    """
    return [name for name, *_ in getColorInfoList()]
def getColorInfoList() -> list:
    """
    Returns the list of colour name/value tuples used by this module.

    Each entry is a tuple ``(name, red, green, blue)``: ``name`` is an
    upper-case string and the three colour components are integers in
    the range 0 to 255. A new list object is built on every call.

    :rtype: list of tuples
    """
    return [
        ("ALICEBLUE", 240, 248, 255),
        ("ANTIQUEWHITE", 250, 235, 215),
        ("ANTIQUEWHITE1", 255, 239, 219),
        ("ANTIQUEWHITE2", 238, 223, 204),
        ("ANTIQUEWHITE3", 205, 192, 176),
        ("ANTIQUEWHITE4", 139, 131, 120),
        ("AQUAMARINE", 127, 255, 212),
        ("AQUAMARINE1", 127, 255, 212),
        ("AQUAMARINE2", 118, 238, 198),
        ("AQUAMARINE3", 102, 205, 170),
        ("AQUAMARINE4", 69, 139, 116),
        ("AZURE", 240, 255, 255),
        ("AZURE1", 240, 255, 255),
        ("AZURE2", 224, 238, 238),
        ("AZURE3", 193, 205, 205),
        ("AZURE4", 131, 139, 139),
        ("BEIGE", 245, 245, 220),
        ("BISQUE", 255, 228, 196),
        ("BISQUE1", 255, 228, 196),
        ("BISQUE2", 238, 213, 183),
        ("BISQUE3", 205, 183, 158),
        ("BISQUE4", 139, 125, 107),
        ("BLACK", 0, 0, 0),
        ("BLANCHEDALMOND", 255, 235, 205),
        ("BLUE", 0, 0, 255),
        ("BLUE1", 0, 0, 255),
        ("BLUE2", 0, 0, 238),
        ("BLUE3", 0, 0, 205),
        ("BLUE4", 0, 0, 139),
        ("BLUEVIOLET", 138, 43, 226),
        ("BROWN", 165, 42, 42),
        ("BROWN1", 255, 64, 64),
        ("BROWN2", 238, 59, 59),
        ("BROWN3", 205, 51, 51),
        ("BROWN4", 139, 35, 35),
        ("BURLYWOOD", 222, 184, 135),
        ("BURLYWOOD1", 255, 211, 155),
        ("BURLYWOOD2", 238, 197, 145),
        ("BURLYWOOD3", 205, 170, 125),
        ("BURLYWOOD4", 139, 115, 85),
        ("CADETBLUE", 95, 158, 160),
        ("CADETBLUE1", 152, 245, 255),
        ("CADETBLUE2", 142, 229, 238),
        ("CADETBLUE3", 122, 197, 205),
        ("CADETBLUE4", 83, 134, 139),
        ("CHARTREUSE", 127, 255, 0),
        ("CHARTREUSE1", 127, 255, 0),
        ("CHARTREUSE2", 118, 238, 0),
        ("CHARTREUSE3", 102, 205, 0),
        ("CHARTREUSE4", 69, 139, 0),
        ("CHOCOLATE", 210, 105, 30),
        ("CHOCOLATE1", 255, 127, 36),
        ("CHOCOLATE2", 238, 118, 33),
        ("CHOCOLATE3", 205, 102, 29),
        ("CHOCOLATE4", 139, 69, 19),
        ("COFFEE", 156, 79, 0),
        ("CORAL", 255, 127, 80),
        ("CORAL1", 255, 114, 86),
        ("CORAL2", 238, 106, 80),
        ("CORAL3", 205, 91, 69),
        ("CORAL4", 139, 62, 47),
        ("CORNFLOWERBLUE", 100, 149, 237),
        ("CORNSILK", 255, 248, 220),
        ("CORNSILK1", 255, 248, 220),
        ("CORNSILK2", 238, 232, 205),
        ("CORNSILK3", 205, 200, 177),
        ("CORNSILK4", 139, 136, 120),
        ("CYAN", 0, 255, 255),
        ("CYAN1", 0, 255, 255),
        ("CYAN2", 0, 238, 238),
        ("CYAN3", 0, 205, 205),
        ("CYAN4", 0, 139, 139),
        ("DARKBLUE", 0, 0, 139),
        ("DARKCYAN", 0, 139, 139),
        ("DARKGOLDENROD", 184, 134, 11),
        ("DARKGOLDENROD1", 255, 185, 15),
        ("DARKGOLDENROD2", 238, 173, 14),
        ("DARKGOLDENROD3", 205, 149, 12),
        ("DARKGOLDENROD4", 139, 101, 8),
        ("DARKGREEN", 0, 100, 0),
        ("DARKGRAY", 169, 169, 169),
        ("DARKKHAKI", 189, 183, 107),
        ("DARKMAGENTA", 139, 0, 139),
        ("DARKOLIVEGREEN", 85, 107, 47),
        ("DARKOLIVEGREEN1", 202, 255, 112),
        ("DARKOLIVEGREEN2", 188, 238, 104),
        ("DARKOLIVEGREEN3", 162, 205, 90),
        ("DARKOLIVEGREEN4", 110, 139, 61),
        ("DARKORANGE", 255, 140, 0),
        ("DARKORANGE1", 255, 127, 0),
        ("DARKORANGE2", 238, 118, 0),
        ("DARKORANGE3", 205, 102, 0),
        ("DARKORANGE4", 139, 69, 0),
        ("DARKORCHID", 153, 50, 204),
        ("DARKORCHID1", 191, 62, 255),
        ("DARKORCHID2", 178, 58, 238),
        ("DARKORCHID3", 154, 50, 205),
        ("DARKORCHID4", 104, 34, 139),
        ("DARKRED", 139, 0, 0),
        ("DARKSALMON", 233, 150, 122),
        ("DARKSEAGREEN", 143, 188, 143),
        ("DARKSEAGREEN1", 193, 255, 193),
        ("DARKSEAGREEN2", 180, 238, 180),
        ("DARKSEAGREEN3", 155, 205, 155),
        ("DARKSEAGREEN4", 105, 139, 105),
        ("DARKSLATEBLUE", 72, 61, 139),
        ("DARKSLATEGRAY", 47, 79, 79),
        ("DARKTURQUOISE", 0, 206, 209),
        ("DARKVIOLET", 148, 0, 211),
        ("DEEPPINK", 255, 20, 147),
        ("DEEPPINK1", 255, 20, 147),
        ("DEEPPINK2", 238, 18, 137),
        ("DEEPPINK3", 205, 16, 118),
        ("DEEPPINK4", 139, 10, 80),
        ("DEEPSKYBLUE", 0, 191, 255),
        ("DEEPSKYBLUE1", 0, 191, 255),
        ("DEEPSKYBLUE2", 0, 178, 238),
        ("DEEPSKYBLUE3", 0, 154, 205),
        ("DEEPSKYBLUE4", 0, 104, 139),
        ("DIMGRAY", 105, 105, 105),
        ("DODGERBLUE", 30, 144, 255),
        ("DODGERBLUE1", 30, 144, 255),
        ("DODGERBLUE2", 28, 134, 238),
        ("DODGERBLUE3", 24, 116, 205),
        ("DODGERBLUE4", 16, 78, 139),
        ("FIREBRICK", 178, 34, 34),
        ("FIREBRICK1", 255, 48, 48),
        ("FIREBRICK2", 238, 44, 44),
        ("FIREBRICK3", 205, 38, 38),
        ("FIREBRICK4", 139, 26, 26),
        ("FLORALWHITE", 255, 250, 240),
        ("FORESTGREEN", 34, 139, 34),
        ("GAINSBORO", 220, 220, 220),
        ("GHOSTWHITE", 248, 248, 255),
        ("GOLD", 255, 215, 0),
        ("GOLD1", 255, 215, 0),
        ("GOLD2", 238, 201, 0),
        ("GOLD3", 205, 173, 0),
        ("GOLD4", 139, 117, 0),
        ("GOLDENROD", 218, 165, 32),
        ("GOLDENROD1", 255, 193, 37),
        ("GOLDENROD2", 238, 180, 34),
        ("GOLDENROD3", 205, 155, 29),
        ("GOLDENROD4", 139, 105, 20),
        ("GREEN YELLOW", 173, 255, 47),
        ("GREEN", 0, 255, 0),
        ("GREEN1", 0, 255, 0),
        ("GREEN2", 0, 238, 0),
        ("GREEN3", 0, 205, 0),
        ("GREEN4", 0, 139, 0),
        ("GREENYELLOW", 173, 255, 47),
        ("GRAY", 190, 190, 190),
        ("GRAY0", 0, 0, 0),
        ("GRAY1", 3, 3, 3),
        ("GRAY10", 26, 26, 26),
        ("GRAY100", 255, 255, 255),
        ("GRAY11", 28, 28, 28),
        ("GRAY12", 31, 31, 31),
        ("GRAY13", 33, 33, 33),
        ("GRAY14", 36, 36, 36),
        ("GRAY15", 38, 38, 38),
        ("GRAY16", 41, 41, 41),
        ("GRAY17", 43, 43, 43),
        ("GRAY18", 46, 46, 46),
        ("GRAY19", 48, 48, 48),
        ("GRAY2", 5, 5, 5),
        ("GRAY20", 51, 51, 51),
        ("GRAY21", 54, 54, 54),
        ("GRAY22", 56, 56, 56),
        ("GRAY23", 59, 59, 59),
        ("GRAY24", 61, 61, 61),
        ("GRAY25", 64, 64, 64),
        ("GRAY26", 66, 66, 66),
        ("GRAY27", 69, 69, 69),
        ("GRAY28", 71, 71, 71),
        ("GRAY29", 74, 74, 74),
        ("GRAY3", 8, 8, 8),
        ("GRAY30", 77, 77, 77),
        ("GRAY31", 79, 79, 79),
        ("GRAY32", 82, 82, 82),
        ("GRAY33", 84, 84, 84),
        ("GRAY34", 87, 87, 87),
        ("GRAY35", 89, 89, 89),
        ("GRAY36", 92, 92, 92),
        ("GRAY37", 94, 94, 94),
        ("GRAY38", 97, 97, 97),
        ("GRAY39", 99, 99, 99),
        ("GRAY4", 10, 10, 10),
        ("GRAY40", 102, 102, 102),
        ("GRAY41", 105, 105, 105),
        ("GRAY42", 107, 107, 107),
        ("GRAY43", 110, 110, 110),
        ("GRAY44", 112, 112, 112),
        ("GRAY45", 115, 115, 115),
        ("GRAY46", 117, 117, 117),
        ("GRAY47", 120, 120, 120),
        ("GRAY48", 122, 122, 122),
        ("GRAY49", 125, 125, 125),
        ("GRAY5", 13, 13, 13),
        ("GRAY50", 127, 127, 127),
        ("GRAY51", 130, 130, 130),
        ("GRAY52", 133, 133, 133),
        ("GRAY53", 135, 135, 135),
        ("GRAY54", 138, 138, 138),
        ("GRAY55", 140, 140, 140),
        ("GRAY56", 143, 143, 143),
        ("GRAY57", 145, 145, 145),
        ("GRAY58", 148, 148, 148),
        ("GRAY59", 150, 150, 150),
        ("GRAY6", 15, 15, 15),
        ("GRAY60", 153, 153, 153),
        ("GRAY61", 156, 156, 156),
        ("GRAY62", 158, 158, 158),
        ("GRAY63", 161, 161, 161),
        ("GRAY64", 163, 163, 163),
        ("GRAY65", 166, 166, 166),
        ("GRAY66", 168, 168, 168),
        ("GRAY67", 171, 171, 171),
        ("GRAY68", 173, 173, 173),
        ("GRAY69", 176, 176, 176),
        ("GRAY7", 18, 18, 18),
        ("GRAY70", 179, 179, 179),
        ("GRAY71", 181, 181, 181),
        ("GRAY72", 184, 184, 184),
        ("GRAY73", 186, 186, 186),
        ("GRAY74", 189, 189, 189),
        ("GRAY75", 191, 191, 191),
        ("GRAY76", 194, 194, 194),
        ("GRAY77", 196, 196, 196),
        ("GRAY78", 199, 199, 199),
        ("GRAY79", 201, 201, 201),
        ("GRAY8", 20, 20, 20),
        ("GRAY80", 204, 204, 204),
        ("GRAY81", 207, 207, 207),
        ("GRAY82", 209, 209, 209),
        ("GRAY83", 212, 212, 212),
        ("GRAY84", 214, 214, 214),
        ("GRAY85", 217, 217, 217),
        ("GRAY86", 219, 219, 219),
        ("GRAY87", 222, 222, 222),
        ("GRAY88", 224, 224, 224),
        ("GRAY89", 227, 227, 227),
        ("GRAY9", 23, 23, 23),
        ("GRAY90", 229, 229, 229),
        ("GRAY91", 232, 232, 232),
        ("GRAY92", 235, 235, 235),
        ("GRAY93", 237, 237, 237),
        ("GRAY94", 240, 240, 240),
        ("GRAY95", 242, 242, 242),
        ("GRAY96", 245, 245, 245),
        ("GRAY97", 247, 247, 247),
        ("GRAY98", 250, 250, 250),
        ("GRAY99", 252, 252, 252),
        ("HONEYDEW", 240, 255, 240),
        ("HONEYDEW1", 240, 255, 240),
        ("HONEYDEW2", 224, 238, 224),
        ("HONEYDEW3", 193, 205, 193),
        ("HONEYDEW4", 131, 139, 131),
        ("HOTPINK", 255, 105, 180),
        ("HOTPINK1", 255, 110, 180),
        ("HOTPINK2", 238, 106, 167),
        ("HOTPINK3", 205, 96, 144),
        ("HOTPINK4", 139, 58, 98),
        ("INDIANRED", 205, 92, 92),
        ("INDIANRED1", 255, 106, 106),
        ("INDIANRED2", 238, 99, 99),
        ("INDIANRED3", 205, 85, 85),
        ("INDIANRED4", 139, 58, 58),
        ("IVORY", 255, 255, 240),
        ("IVORY1", 255, 255, 240),
        ("IVORY2", 238, 238, 224),
        ("IVORY3", 205, 205, 193),
        ("IVORY4", 139, 139, 131),
        ("KHAKI", 240, 230, 140),
        ("KHAKI1", 255, 246, 143),
        ("KHAKI2", 238, 230, 133),
        ("KHAKI3", 205, 198, 115),
        ("KHAKI4", 139, 134, 78),
        ("LAVENDER", 230, 230, 250),
        ("LAVENDERBLUSH", 255, 240, 245),
        ("LAVENDERBLUSH1", 255, 240, 245),
        ("LAVENDERBLUSH2", 238, 224, 229),
        ("LAVENDERBLUSH3", 205, 193, 197),
        ("LAVENDERBLUSH4", 139, 131, 134),
        ("LAWNGREEN", 124, 252, 0),
        ("LEMONCHIFFON", 255, 250, 205),
        ("LEMONCHIFFON1", 255, 250, 205),
        ("LEMONCHIFFON2", 238, 233, 191),
        ("LEMONCHIFFON3", 205, 201, 165),
        ("LEMONCHIFFON4", 139, 137, 112),
        ("LIGHTBLUE", 173, 216, 230),
        ("LIGHTBLUE1", 191, 239, 255),
        ("LIGHTBLUE2", 178, 223, 238),
        ("LIGHTBLUE3", 154, 192, 205),
        ("LIGHTBLUE4", 104, 131, 139),
        ("LIGHTCORAL", 240, 128, 128),
        ("LIGHTCYAN", 224, 255, 255),
        ("LIGHTCYAN1", 224, 255, 255),
        ("LIGHTCYAN2", 209, 238, 238),
        ("LIGHTCYAN3", 180, 205, 205),
        ("LIGHTCYAN4", 122, 139, 139),
        ("LIGHTGOLDENROD", 238, 221, 130),
        ("LIGHTGOLDENROD1",  255, 236, 139),
        ("LIGHTGOLDENROD2", 238, 220, 130),
        ("LIGHTGOLDENROD3", 205, 190, 112),
        ("LIGHTGOLDENROD4", 139, 129, 76),
        ("LIGHTGOLDENRODYELLOW", 250, 250, 210),
        ("LIGHTGREEN", 144, 238, 144),
        ("LIGHTGRAY", 211, 211, 211),
        ("LIGHTPINK", 255, 182, 193),
        ("LIGHTPINK1", 255, 174, 185),
        ("LIGHTPINK2", 238, 162, 173),
        ("LIGHTPINK3", 205, 140, 149),
        ("LIGHTPINK4", 139, 95, 101),
        ("LIGHTSALMON", 255, 160, 122),
        ("LIGHTSALMON1", 255, 160, 122),
        ("LIGHTSALMON2", 238, 149, 114),
        ("LIGHTSALMON3", 205, 129, 98),
        ("LIGHTSALMON4", 139, 87, 66),
        ("LIGHTSEAGREEN", 32, 178, 170),
        ("LIGHTSKYBLUE", 135, 206, 250),
        ("LIGHTSKYBLUE1", 176, 226, 255),
        ("LIGHTSKYBLUE2", 164, 211, 238),
        ("LIGHTSKYBLUE3", 141, 182, 205),
        ("LIGHTSKYBLUE4", 96, 123, 139),
        ("LIGHTSLATEBLUE", 132, 112, 255),
        ("LIGHTSLATEGRAY", 119, 136, 153),
        ("LIGHTSTEELBLUE", 176, 196, 222),
        ("LIGHTSTEELBLUE1", 202, 225, 255),
        ("LIGHTSTEELBLUE2", 188, 210, 238),
        ("LIGHTSTEELBLUE3", 162, 181, 205),
        ("LIGHTSTEELBLUE4", 110, 123, 139),
        ("LIGHTYELLOW", 255, 255, 224),
        ("LIGHTYELLOW1", 255, 255, 224),
        ("LIGHTYELLOW2", 238, 238, 209),
        ("LIGHTYELLOW3", 205, 205, 180),
        ("LIGHTYELLOW4", 139, 139, 122),
        ("LIMEGREEN", 50, 205, 50),
        ("LINEN", 250, 240, 230),
        ("MAGENTA", 255, 0, 255),
        ("MAGENTA1", 255, 0, 255),
        ("MAGENTA2", 238, 0, 238),
        ("MAGENTA3", 205, 0, 205),
        ("MAGENTA4", 139, 0, 139),
        ("MAROON", 176, 48, 96),
        ("MAROON1", 255, 52, 179),
        ("MAROON2", 238, 48, 167),
        ("MAROON3", 205, 41, 144),
        ("MAROON4", 139, 28, 98),
        ("MEDIUMAQUAMARINE", 102, 205, 170),
        ("MEDIUMBLUE", 0, 0, 205),
        ("MEDIUMORCHID", 186, 85, 211),
        ("MEDIUMORCHID1", 224, 102, 255),
        ("MEDIUMORCHID2", 209, 95, 238),
        ("MEDIUMORCHID3", 180, 82, 205),
        ("MEDIUMORCHID4", 122, 55, 139),
        ("MEDIUMPURPLE", 147, 112, 219),
        ("MEDIUMPURPLE1", 171, 130, 255),
        ("MEDIUMPURPLE2", 159, 121, 238),
        ("MEDIUMPURPLE3", 137, 104, 205),
        ("MEDIUMPURPLE4", 93, 71, 139),
        ("MEDIUMSEAGREEN", 60, 179, 113),
        ("MEDIUMSLATEBLUE", 123, 104, 238),
        ("MEDIUMSPRINGGREEN", 0, 250, 154),
        ("MEDIUMTURQUOISE", 72, 209, 204),
        ("MEDIUMVIOLETRED", 199, 21, 133),
        ("MIDNIGHTBLUE", 25, 25, 112),
        ("MINTCREAM", 245, 255, 250),
        ("MISTYROSE", 255, 228, 225),
        ("MISTYROSE1", 255, 228, 225),
        ("MISTYROSE2", 238, 213, 210),
        ("MISTYROSE3", 205, 183, 181),
        ("MISTYROSE4", 139, 125, 123),
        ("MOCCASIN", 255, 228, 181),
        ("MUPDFBLUE", 37, 114, 172),
        ("NAVAJOWHITE", 255, 222, 173),
        ("NAVAJOWHITE1", 255, 222, 173),
        ("NAVAJOWHITE2", 238, 207, 161),
        ("NAVAJOWHITE3", 205, 179, 139),
        ("NAVAJOWHITE4", 139, 121, 94),
        ("NAVY", 0, 0, 128),
        ("NAVYBLUE", 0, 0, 128),
        ("OLDLACE", 253, 245, 230),
        ("OLIVEDRAB", 107, 142, 35),
        ("OLIVEDRAB1", 192, 255, 62),
        ("OLIVEDRAB2", 179, 238, 58),
        ("OLIVEDRAB3", 154, 205, 50),
        ("OLIVEDRAB4", 105, 139, 34),
        ("ORANGE", 255, 165, 0),
        ("ORANGE1", 255, 165, 0),
        ("ORANGE2", 238, 154, 0),
        ("ORANGE3", 205, 133, 0),
        ("ORANGE4", 139, 90, 0),
        ("ORANGERED", 255, 69, 0),
        ("ORANGERED1", 255, 69, 0),
        ("ORANGERED2", 238, 64, 0),
        ("ORANGERED3", 205, 55, 0),
        ("ORANGERED4", 139, 37, 0),
        ("ORCHID", 218, 112, 214),
        ("ORCHID1", 255, 131, 250),
        ("ORCHID2", 238, 122, 233),
        ("ORCHID3", 205, 105, 201),
        ("ORCHID4", 139, 71, 137),
        ("PALEGOLDENROD", 238, 232, 170),
        ("PALEGREEN", 152, 251, 152),
        ("PALEGREEN1", 154, 255, 154),
        ("PALEGREEN2", 144, 238, 144),
        ("PALEGREEN3", 124, 205, 124),
        ("PALEGREEN4", 84, 139, 84),
        ("PALETURQUOISE", 175, 238, 238),
        ("PALETURQUOISE1", 187, 255, 255),
        ("PALETURQUOISE2", 174, 238, 238),
        ("PALETURQUOISE3", 150, 205, 205),
        ("PALETURQUOISE4", 102, 139, 139),
        ("PALEVIOLETRED", 219, 112, 147),
        ("PALEVIOLETRED1", 255, 130, 171),
        ("PALEVIOLETRED2", 238, 121, 159),
        ("PALEVIOLETRED3", 205, 104, 137),
        ("PALEVIOLETRED4", 139, 71, 93),
        ("PAPAYAWHIP", 255, 239, 213),
        ("PEACHPUFF", 255, 218, 185),
        ("PEACHPUFF1", 255, 218, 185),
        ("PEACHPUFF2", 238, 203, 173),
        ("PEACHPUFF3", 205, 175, 149),
        ("PEACHPUFF4", 139, 119, 101),
        ("PERU", 205, 133, 63),
        ("PINK", 255, 192, 203),
        ("PINK1", 255, 181, 197),
        ("PINK2", 238, 169, 184),
        ("PINK3", 205, 145, 158),
        ("PINK4", 139, 99, 108),
        ("PLUM", 221, 160, 221),
        ("PLUM1", 255, 187, 255),
        ("PLUM2", 238, 174, 238),
        ("PLUM3", 205, 150, 205),
        ("PLUM4", 139, 102, 139),
        ("POWDERBLUE", 176, 224, 230),
        ("PURPLE", 160, 32, 240),
        ("PURPLE1", 155, 48, 255),
        ("PURPLE2", 145, 44, 238),
        ("PURPLE3", 125, 38, 205),
        ("PURPLE4", 85, 26, 139),
        ("PY_COLOR", 240, 255, 210),
        ("RED", 255, 0, 0),
        ("RED1", 255, 0, 0),
        ("RED2", 238, 0, 0),
        ("RED3", 205, 0, 0),
        ("RED4", 139, 0, 0),
        ("ROSYBROWN", 188, 143, 143),
        ("ROSYBROWN1", 255, 193, 193),
        ("ROSYBROWN2", 238, 180, 180),
        ("ROSYBROWN3", 205, 155, 155),
        ("ROSYBROWN4", 139, 105, 105),
        ("ROYALBLUE", 65, 105, 225),
        ("ROYALBLUE1", 72, 118, 255),
        ("ROYALBLUE2", 67, 110, 238),
        ("ROYALBLUE3", 58, 95, 205),
        ("ROYALBLUE4", 39, 64, 139),
        ("SADDLEBROWN", 139, 69, 19),
        ("SALMON", 250, 128, 114),
        ("SALMON1", 255, 140, 105),
        ("SALMON2", 238, 130, 98),
        ("SALMON3", 205, 112, 84),
        ("SALMON4", 139, 76, 57),
        ("SANDYBROWN", 244, 164, 96),
        ("SEAGREEN", 46, 139, 87),
        ("SEAGREEN1", 84, 255, 159),
        ("SEAGREEN2", 78, 238, 148),
        ("SEAGREEN3", 67, 205, 128),
        ("SEAGREEN4", 46, 139, 87),
        ("SEASHELL", 255, 245, 238),
        ("SEASHELL1", 255, 245, 238),
        ("SEASHELL2", 238, 229, 222),
        ("SEASHELL3", 205, 197, 191),
        ("SEASHELL4", 139, 134, 130),
        ("SIENNA", 160, 82, 45),
        ("SIENNA1", 255, 130, 71),
        ("SIENNA2", 238, 121, 66),
        ("SIENNA3", 205, 104, 57),
        ("SIENNA4", 139, 71, 38),
        ("SKYBLUE", 135, 206, 235),
        ("SKYBLUE1", 135, 206, 255),
        ("SKYBLUE2", 126, 192, 238),
        ("SKYBLUE3", 108, 166, 205),
        ("SKYBLUE4", 74, 112, 139),
        ("SLATEBLUE", 106, 90, 205),
        ("SLATEBLUE1", 131, 111, 255),
        ("SLATEBLUE2", 122, 103, 238),
        ("SLATEBLUE3", 105, 89, 205),
        ("SLATEBLUE4", 71, 60, 139),
        ("SLATEGRAY", 112, 128, 144),
        ("SNOW", 255, 250, 250),
        ("SNOW1", 255, 250, 250),
        ("SNOW2", 238, 233, 233),
        ("SNOW3", 205, 201, 201),
        ("SNOW4", 139, 137, 137),
        ("SPRINGGREEN", 0, 255, 127),
        ("SPRINGGREEN1", 0, 255, 127),
        ("SPRINGGREEN2", 0, 238, 118),
        ("SPRINGGREEN3", 0, 205, 102),
        ("SPRINGGREEN4", 0, 139, 69),
        ("STEELBLUE", 70, 130, 180),
        ("STEELBLUE1", 99, 184, 255),
        ("STEELBLUE2", 92, 172, 238),
        ("STEELBLUE3", 79, 148, 205),
        ("STEELBLUE4", 54, 100, 139),
        ("TAN", 210, 180, 140),
        ("TAN1", 255, 165, 79),
        ("TAN2", 238, 154, 73),
        ("TAN3", 205, 133, 63),
        ("TAN4", 139, 90, 43),
        ("THISTLE", 216, 191, 216),
        ("THISTLE1", 255, 225, 255),
        ("THISTLE2", 238, 210, 238),
        ("THISTLE3", 205, 181, 205),
        ("THISTLE4", 139, 123, 139),
        ("TOMATO", 255, 99, 71),
        ("TOMATO1", 255, 99, 71),
        ("TOMATO2", 238, 92, 66),
        ("TOMATO3", 205, 79, 57),
        ("TOMATO4", 139, 54, 38),
        ("TURQUOISE", 64, 224, 208),
        ("TURQUOISE1", 0, 245, 255),
        ("TURQUOISE2", 0, 229, 238),
        ("TURQUOISE3", 0, 197, 205),
        ("TURQUOISE4", 0, 134, 139),
        ("VIOLET", 238, 130, 238),
        ("VIOLETRED", 208, 32, 144),
        ("VIOLETRED1", 255, 62, 150),
        ("VIOLETRED2", 238, 58, 140),
        ("VIOLETRED3", 205, 50, 120),
        ("VIOLETRED4", 139, 34, 82),
        ("WHEAT", 245, 222, 179),
        ("WHEAT1", 255, 231, 186),
        ("WHEAT2", 238, 216, 174),
        ("WHEAT3", 205, 186, 150),
        ("WHEAT4", 139, 126, 102),
        ("WHITE", 255, 255, 255),
        ("WHITESMOKE", 245, 245, 245),
        ("YELLOW", 255, 255, 0),
        ("YELLOW1", 255, 255, 0),
        ("YELLOW2", 238, 238, 0),
        ("YELLOW3", 205, 205, 0),
        ("YELLOW4", 139, 139, 0),
        ("YELLOWGREEN", 154, 205, 50),
    ]
def getColorInfoDict() -> dict:
    """Return a dict mapping each lower-cased colour name to its (red, green, blue) tuple.

    The colour components are the integer values (0 to 255) taken from
    getColorInfoList().
    """
    return {entry[0].lower(): entry[1:] for entry in getColorInfoList()}
def getColor(name: str) -> tuple:
    """Look up a colour by name and return it as an RGB triple in PDF format.

    The lookup is case-insensitive (the name is upper-cased first).

    Returns:
        a triple of floats in range 0 to 1. If the name cannot be found,
        the exception is reported via pymupdf.exception_info() and
        "white", i.e. (1, 1, 1), is returned.
    """
    try:
        table = getColorInfoList()
        position = getColorList().index(name.upper())
        entry = table[position]
        # entry is (name, r, g, b) with integer components 0..255
        return (entry[1] / 255.0, entry[2] / 255.0, entry[3] / 255.0)
    except Exception:
        pymupdf.exception_info()
        return (1, 1, 1)
def getColorHSV(name: str) -> tuple:
    """Look up a colour by name and return its hue, saturation, value triple.

    Returns:
        a triple (degree, percent, percent): hue and saturation are
        integers, value is rounded to one decimal. If the name is not
        found, (-1, -1, -1) is returned.
    """
    try:
        entry = getColorInfoList()[getColorList().index(name.upper())]
    except Exception:
        if g_exceptions_verbose:
            pymupdf.exception_info()
        return (-1, -1, -1)

    # scale the 0..255 integer components to 0..1
    red = entry[1] / 255.0
    green = entry[2] / 255.0
    blue = entry[3] / 255.0

    strongest = max(red, green, blue)
    weakest = min(red, green, blue)
    spread = strongest - weakest

    # the hue depends on which component dominates
    if spread == 0:
        hue = 0
    elif strongest == red:
        hue = 60.0 * (((green - blue) / spread) % 6)
    elif strongest == green:
        hue = 60.0 * (((blue - red) / spread) + 2)
    else:
        hue = 60.0 * (((red - green) / spread) + 4)

    saturation = 0 if strongest == 0 else spread / strongest

    return (
        int(round(hue)),
        int(round(saturation * 100)),
        round(strongest * 100, 1),
    )
def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple:
    """Return (fontname, ext, stype, ascender, descender) for the font at xref.

    Name, extension, type and font buffer come from doc.extract_font().
    Ascender / descender start as 0.8 / -0.2 and are replaced by the values
    of a pymupdf.Font built from the buffer or, without a buffer, from the
    font name. Whenever no font object can be used, the values present at
    that point are enlarged by the factor 1.2.
    """
    fontname, ext, stype, buffer = doc.extract_font(xref)
    asc, dsc = 0.8, -0.2

    if ext == "":  # nothing more can be determined
        return fontname, ext, stype, asc, dsc

    if buffer:
        try:
            font = pymupdf.Font(fontbuffer=buffer)
            asc = font.ascender
            dsc = font.descender
            bbox = font.bbox
            if asc - dsc < 1:
                # too small a total height: let the descender reach the
                # bbox bottom if that is lower, then derive the ascender
                # so that both span a height of 1
                dsc = min(dsc, bbox.y0)
                asc = 1 - dsc
        except Exception:
            pymupdf.exception_info()
            asc *= 1.2
            dsc *= 1.2
        return fontname, ext, stype, asc, dsc

    if ext == "n/a":
        asc *= 1.2
        dsc *= 1.2
    else:
        try:
            font = pymupdf.Font(fontname)
            asc = font.ascender
            dsc = font.descender
        except Exception:
            pymupdf.exception_info()
            asc *= 1.2
            dsc *= 1.2
    return fontname, ext, stype, asc, dsc
def get_char_widths(
    doc: pymupdf.Document, xref: int, limit: int = 256, idx: int = 0, fontdict: OptDict = None
) -> list:
    """Return the glyph information list of a font.

    Notes:
        The font is identified by its XREF number. A font seen before has
        an entry in doc.FontInfos; otherwise an entry is created and
        appended there. A caller-supplied 'fontdict' is extended in place
        with the keys "simple", "glyphs" and "ordering".
        The result is the list of glyphs for the font: items
        (glyph, width), where glyph is an integer controlling the char
        appearance, and width is a float controlling the char's spacing:
        width * fontsize is the actual space.
        For 'simple' fonts, glyph == ord(char) will usually be true.
        Exceptions are 'Symbol' and 'ZapfDingbats', for which data are
        provided directly here.
        For CJK fonts None is returned, and at least 256 glyphs are always
        requested regardless of 'limit'.
    """
    fontinfo = pymupdf.CheckFontInfo(doc, xref)
    if fontinfo is not None:  # font is already recorded
        fontdict = fontinfo[1]
        glyphs = fontdict["glyphs"]
        simple = fontdict["simple"]
        ordering = fontdict["ordering"]
    else:  # not recorded yet: create the entry
        if fontdict is None:
            name, ext, stype, asc, dsc = _get_font_properties(doc, xref)
            fontdict = {
                "name": name,
                "type": stype,
                "ext": ext,
                "ascender": asc,
                "descender": dsc,
            }
        else:
            name = fontdict["name"]
            ext = fontdict["ext"]
            stype = fontdict["type"]
            ordering = fontdict["ordering"]
            simple = fontdict["simple"]

        if ext == "":
            raise ValueError("xref is not a font")

        # 'simple' fonts are recognized by their type
        simple = stype in ("Type1", "MMType1", "TrueType")

        # CJK fonts are recognized by name; the tuple position is the ordering
        ordering = -1
        cjk_families = (
            ("Fangti", "Ming"),
            ("Heiti", "Song"),
            ("Gothic", "Mincho"),
            ("Dotum", "Batang"),
        )
        for code, family in enumerate(cjk_families):
            if name in family:
                ordering = code
                break

        fontdict["simple"] = simple

        # predefined glyph tables exist for these two fonts only
        if name == "ZapfDingbats":
            glyphs = pymupdf.zapf_glyphs
        elif name == "Symbol":
            glyphs = pymupdf.symbol_glyphs
        else:
            glyphs = None

        fontdict["glyphs"] = glyphs
        fontdict["ordering"] = ordering
        fontinfo = [xref, fontdict]
        doc.FontInfos.append(fontinfo)

    oldlimit = 0 if glyphs is None else len(glyphs)
    mylimit = max(256, limit)
    if mylimit <= oldlimit:  # enough glyphs are known already
        return glyphs

    if ordering < 0:  # not a CJK font
        glyphs = doc._get_char_widths(
            xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx
        )
    else:  # CJK fonts use char codes and width = 1
        glyphs = None

    # store the updated glyph list in the document's font info
    fontdict["glyphs"] = glyphs
    fontinfo[1] = fontdict
    pymupdf.UpdateFontInfo(doc, fontinfo)
    return glyphs
class Shape:
    """Create a new shape.

    A shape collects drawing commands for one page of a PDF document.
    The draw_* methods append path operators to the string ``draw_cont``,
    remember the last point reached in ``last_point`` and extend ``rect``
    to the rectangle enclosing everything drawn so far.
    """
@staticmethod
def horizontal_angle(C, P):
    """Return the angle (radians) between the horizontal and the line from C to P.

    The arcus sine of the unit vector's y component only yields an angle
    between 0 and pi/2; the quadrant in which vector P - C lies decides
    how that value is mirrored and signed.
    """
    direction = pymupdf.Point(P - C).unit  # unit vector 'C' -> 'P'
    angle = math.asin(abs(direction.y))  # absolute angle from horizontal

    if not direction.x < 0:  # right half plane
        # top-right keeps the angle, bottom-right negates it
        return angle if direction.y >= 0 else -angle

    # left half plane: mirror the angle at the vertical axis
    mirrored = math.pi - angle
    # bottom-left is negative, top-left positive
    return -mirrored if direction.y <= 0 else mirrored
def __init__(self, page: pymupdf.Page):
    """Set up an empty shape for a page of a PDF document.

    Raises:
        ValueError: if the page's parent document is not a PDF.
    """
    pymupdf.CheckParent(page)
    self.page = page
    self.doc = page.parent
    if not self.doc.is_pdf:
        raise ValueError("is no PDF")

    # page dimensions and position
    media_size = page.mediabox_size
    self.height = media_size.y
    self.width = media_size.x
    crop_origin = page.cropbox_position
    self.x = crop_origin.x
    self.y = crop_origin.y

    # page transformation matrix and its inverse
    self.pctm = page.transformation_matrix
    self.ipctm = ~self.pctm

    # nothing has been drawn or written yet
    self.draw_cont = self.text_cont = self.totalcont = ""
    self.last_point = None
    self.rect = None
def updateRect(self, x):
    """Extend self.rect so that it also contains x.

    x is treated as a point if it has length 2, otherwise as a rectangle.
    The first call initializes self.rect from x.
    """
    is_point = len(x) == 2

    if self.rect is None:  # first item: it defines the rectangle
        self.rect = pymupdf.Rect(x, x) if is_point else pymupdf.Rect(x)
        return

    # determine the extent of the new item ...
    if is_point:
        x = pymupdf.Point(x)
        left = right = x.x
        top = bottom = x.y
    else:
        x = pymupdf.Rect(x)
        left, top, right, bottom = x.x0, x.y0, x.x1, x.y1

    # ... and grow the current rectangle where necessary
    bounds = self.rect
    bounds.x0 = min(bounds.x0, left)
    bounds.y0 = min(bounds.y0, top)
    bounds.x1 = max(bounds.x1, right)
    bounds.y1 = max(bounds.y1, bottom)
def draw_line(self, p1: point_like, p2: point_like) -> pymupdf.Point:
    """Draw a straight line from p1 to p2 and return p2 as a Point."""
    start, end = pymupdf.Point(p1), pymupdf.Point(p2)

    if not (self.last_point == start):
        # the path does not continue at the previous end point: move there first
        move_args = pymupdf.JM_TUPLE(start * self.ipctm)
        self.draw_cont += _format_g(move_args) + " m\n"
        self.last_point = start
        self.updateRect(start)

    line_args = pymupdf.JM_TUPLE(end * self.ipctm)
    self.draw_cont += _format_g(line_args) + " l\n"
    self.updateRect(end)
    self.last_point = end
    return end
def draw_polyline(self, points: list) -> pymupdf.Point:
    """Draw connected line segments through all points and return the last point."""
    for index, raw_point in enumerate(points):
        vertex = pymupdf.Point(raw_point)
        if index > 0:
            # every further point is connected by a line
            self.draw_cont += _format_g(pymupdf.JM_TUPLE(vertex * self.ipctm)) + " l\n"
        elif not (self.last_point == vertex):
            # first point: move there unless the path already ends at it
            self.draw_cont += _format_g(pymupdf.JM_TUPLE(vertex * self.ipctm)) + " m\n"
            self.last_point = vertex
        self.updateRect(raw_point)

    self.last_point = pymupdf.Point(points[-1])
    return self.last_point
def draw_bezier(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
) -> pymupdf.Point:
    """Draw a standard cubic Bezier curve from p1 to p4 with control points p2, p3.

    Returns p4 as a Point, which also becomes the shape's last point.
    """
    start = pymupdf.Point(p1)
    control1 = pymupdf.Point(p2)
    control2 = pymupdf.Point(p3)
    end = pymupdf.Point(p4)

    if not (self.last_point == start):
        # the path does not continue at the previous end point: move to the start
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(start * self.ipctm)) + " m\n"

    # the curve operator takes the coordinates of both control points and the end
    coordinates = []
    for pt in (control1, control2, end):
        coordinates.extend(pt * self.ipctm)
    self.draw_cont += _format_g(pymupdf.JM_TUPLE(coordinates)) + " c\n"

    for pt in (start, control1, control2, end):
        self.updateRect(pt)

    self.last_point = end
    return end
def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> pymupdf.Point:
    """Draw an ellipse inside a tetrapod (a rectangle or a quad).

    The ellipse consists of four curves, each connecting the middle points
    of two adjacent sides. Returns the middle point of the left side.

    Raises:
        ValueError: if 'tetra' does not have length 4.
    """
    if len(tetra) != 4:
        raise ValueError("invalid arg length")

    # four numbers mean a rectangle, otherwise four points are expected
    if hasattr(tetra[0], "__float__"):
        q = pymupdf.Rect(tetra).quad
    else:
        q = pymupdf.Quad(tetra)

    # middle points of the top, right, bottom and left sides
    mid_top = q.ul + (q.ur - q.ul) * 0.5
    mid_right = q.ur + (q.lr - q.ur) * 0.5
    mid_bottom = q.ll + (q.lr - q.ll) * 0.5
    mid_left = q.ul + (q.ll - q.ul) * 0.5

    if not (self.last_point == mid_left):
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(mid_left * self.ipctm)) + " m\n"
        self.last_point = mid_left

    # each quarter runs between two side middles, steered by the corner between them
    quarters = (
        (mid_left, q.ll, mid_bottom),
        (mid_bottom, q.lr, mid_right),
        (mid_right, q.ur, mid_top),
        (mid_top, q.ul, mid_left),
    )
    for begin, corner, finish in quarters:
        self.draw_curve(begin, corner, finish)

    self.updateRect(q.rect)
    self.last_point = mid_left
    return mid_left
def draw_circle(self, center: point_like, radius: float) -> pymupdf.Point:
    """Draw a circle given its center and radius.

    The circle is drawn as a 360 degree sector without center connection,
    starting at the point left of the center.

    Raises:
        ValueError: if the radius is not greater than pymupdf.EPSILON.
    """
    if not radius > pymupdf.EPSILON:
        raise ValueError("radius must be positive")
    middle = pymupdf.Point(center)
    leftmost = middle - (radius, 0)
    return self.draw_sector(middle, leftmost, 360, fullSector=False)
def draw_curve(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
) -> pymupdf.Point:
    """Draw a curve from p1 to p3 using p2 as the single control point.

    The curve is drawn as a cubic Bezier whose two control points lie on
    the lines p1 -> p2 and p3 -> p2, at the fraction 'kappa' of their length.
    """
    kappa = 0.55228474983
    start = pymupdf.Point(p1)
    helper = pymupdf.Point(p2)
    end = pymupdf.Point(p3)
    control1 = start + (helper - start) * kappa
    control2 = end + (helper - end) * kappa
    return self.draw_bezier(start, control1, control2, end)
def draw_sector(
    self,
    center: point_like,
    point: point_like,
    beta: float,
    fullSector: bool = True,
) -> pymupdf.Point:
    """Draw a circle sector (or only its arc).

    Args:
        center: center of the circle.
        point: point on the circle at which the arc starts.
        beta: angle of the arc in degrees. Its sign selects the turning
            direction; angles beyond 360 degrees are reduced by full turns.
        fullSector: if True, also connect both arc ends with the center.

    Returns:
        The end point of the arc, which also becomes the shape's last
        point. If the angle is too small to draw an arc, this is a copy
        of 'point'.

    Raises:
        ValueError: if 'center' and 'point' are not farther apart than
            pymupdf.EPSILON. The shape is left unchanged in this case.
    """
    center = pymupdf.Point(center)
    point = pymupdf.Point(point)

    def moveto(a, b):
        return _format_g((a, b)) + " m\n"

    def curveto(a, b, c, d, e, f):
        return _format_g((a, b, c, d, e, f)) + " c\n"

    def lineto(a, b):
        return _format_g((a, b)) + " l\n"

    betar = math.radians(-beta)
    w360 = math.radians(math.copysign(360, betar)) * (-1)
    w90 = math.radians(math.copysign(90, betar))
    w45 = w90 / 2
    while abs(betar) > 2 * math.pi:
        betar += w360  # bring angle below 360 degrees

    C = center
    P = point
    S = P - C  # vector 'center' -> 'point'
    rad = abs(S)  # circle radius
    # Validate before anything is written, so that a rejected call does not
    # leave a stray move command in draw_cont or a changed last_point.
    if not rad > pymupdf.EPSILON:
        raise ValueError("radius must be positive")

    if not (self.last_point == point):
        self.draw_cont += moveto(*pymupdf.JM_TUPLE(point * self.ipctm))
        self.last_point = point

    # Arc end point. It defaults to the start point, so that an angle too
    # small to be drawn does not make the sector end at the origin.
    Q = pymupdf.Point(point)
    alfa = self.horizontal_angle(center, point)

    while abs(betar) > abs(w90):  # draw 90 degree arcs
        q1 = C.x + math.cos(alfa + w90) * rad
        q2 = C.y + math.sin(alfa + w90) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45)
        r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q)
        kappa = kappah * abs(P - Q)
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += curveto(*pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))
        betar -= w90  # reduce parm angle by 90 deg
        alfa += w90  # advance start angle by 90 deg
        P = Q  # advance to arc end point

    # draw (remaining) arc
    if abs(betar) > 1e-3:  # significant degrees left?
        beta2 = betar / 2
        q1 = C.x + math.cos(alfa + betar) * rad
        q2 = C.y + math.sin(alfa + betar) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2)
        r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        # kappa height is 4/3 of segment height
        kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q)  # kappa height
        kappa = kappah * abs(P - Q) / (1 - math.cos(betar))
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += curveto(*pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))

    if fullSector:
        # go back to the arc start, then connect it via the center with the arc end
        self.draw_cont += moveto(*pymupdf.JM_TUPLE(point * self.ipctm))
        self.draw_cont += lineto(*pymupdf.JM_TUPLE(center * self.ipctm))
        self.draw_cont += lineto(*pymupdf.JM_TUPLE(Q * self.ipctm))

    self.last_point = Q
    return self.last_point
def draw_rect(self, rect: rect_like, *, radius=None) -> pymupdf.Point:
    """Draw a rectangle, optionally with rounded corners.

    Args:
        rect: the rectangle to draw.
        radius: if not None, the rectangle will have rounded corners.
            This is the radius of the curvature, given as a fraction of
            the rectangle width or height. Valid are values 0 < v <= 0.5.
            A sequence of two values gives the fractions of the width and
            of the height separately. For a single value, the radius is
            computed from the shorter side. A value of (0.5, 0.5) will
            draw an ellipse.

    Returns:
        The last point of the drawing.

    Raises:
        ValueError: for a radius that is neither a valid number nor a
            pair of valid numbers.
    """
    r = pymupdf.Rect(rect)

    if radius is None:  # standard rectangle
        corner_and_size = list(r.bl * self.ipctm) + [r.width, r.height]
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(corner_and_size)) + " re\n"
        self.updateRect(r)
        self.last_point = r.tl
        return self.last_point

    # rounded corners requested. This requires 1 or 2 values, each
    # with 0 < value <= 0.5
    if hasattr(radius, "__float__"):
        if radius <= 0 or radius > 0.5:
            raise ValueError(f"bad radius value {radius}.")
        d = min(r.width, r.height) * radius
        px = (d, 0)
        py = (0, d)
    elif hasattr(radius, "__len__") and len(radius) == 2:
        rx, ry = radius
        px = (rx * r.width, 0)
        py = (0, ry * r.height)
        if min(rx, ry) <= 0 or max(rx, ry) > 0.5:
            raise ValueError(f"bad radius value {radius}.")
    else:
        raise ValueError(f"bad radius value {radius}.")

    # left side first, then alternately one rounded corner and the side following it
    lp = self.draw_line(r.tl + py, r.bl - py)
    corner_steps = (
        (r.bl, r.bl + px, r.br - px),
        (r.br, r.br - py, r.tr + py),
        (r.tr, r.tr - px, r.tl + px),
    )
    for corner, curve_end, side_end in corner_steps:
        lp = self.draw_curve(lp, corner, curve_end)
        lp = self.draw_line(lp, side_end)

    # the top-left corner closes the outline
    self.last_point = self.draw_curve(lp, r.tl, r.tl + py)
    self.updateRect(r)
    return self.last_point
def draw_quad(self, quad: quad_like) -> pymupdf.Point:
    """Draw the outline of a Quad as a closed polyline starting at its upper left corner."""
    q = pymupdf.Quad(quad)
    outline = [q.ul, q.ll, q.lr, q.ur, q.ul]
    return self.draw_polyline(outline)
def draw_zigzag(
    self,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
) -> pymupdf.Point:
    """Draw a zig-zagged line from p1 to p2 and return p2 as a Point.

    Raises:
        ValueError: if the points are too close for one full zig-zag phase.
    """
    start = pymupdf.Point(p1)
    end = pymupdf.Point(p2)
    distance = abs(end - start)  # length of the connection
    cnt = 4 * int(round(distance / (4 * breadth), 0))  # always take full phases
    if cnt < 4:
        raise ValueError("points too close")
    step = distance / cnt  # revised breadth

    # this matrix normalizes the line to the x-axis, its inverse leads back
    to_x_axis = pymupdf.Matrix(pymupdf.util_hor_matrix(start, end))
    to_original = ~to_x_axis

    # only the odd positions carry edges: alternately "above" (i % 4 == 1)
    # and "below" (i % 4 == 3) the connection
    edges = [
        pymupdf.Point(i, -1 if i % 4 == 1 else 1) * step * to_original
        for i in range(1, cnt, 2)
    ]
    self.draw_polyline([start] + edges + [end])  # add start and end points
    return end
def draw_squiggle(
    self,
    p1: point_like,
    p2: point_like,
    breadth=2,
) -> pymupdf.Point:
    """Draw a squiggly line from p1 to p2 and return p2 as a Point.

    Raises:
        ValueError: if the points are too close for one full wave.
    """
    start = pymupdf.Point(p1)
    end = pymupdf.Point(p2)
    distance = abs(end - start)  # length of the connection
    cnt = 4 * int(round(distance / (4 * breadth), 0))  # always take full phases
    if cnt < 4:
        raise ValueError("points too close")
    step = distance / cnt  # revised breadth

    # this matrix normalizes the line to the x-axis, its inverse leads back
    to_x_axis = pymupdf.Matrix(pymupdf.util_hor_matrix(start, end))
    to_original = ~to_x_axis

    k = 2.4142135623765633  # y of draw_curve helper point
    # helper points lie "above" (i % 4 == 1) or "below" (i % 4 == 3) the
    # connection, all other points on the connection line itself
    vertical = {1: -k, 3: k}
    inner = [
        pymupdf.Point(i, vertical.get(i % 4, 0)) * step * to_original
        for i in range(1, cnt)
    ]

    path = [start] + inner + [end]
    # every curve runs from one on-line point to the next via the helper in between
    for i in range(0, len(path) - 2, 2):
        self.draw_curve(path[i], path[i + 1], path[i + 2])
    return end
# ==============================================================================
# Shape.insert_text
# ==============================================================================
def insert_text(
    self,
    point: point_like,
    buffer: typing.Union[str, list],
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    render_mode: int = 0,
    border_width: float = 0.05,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> int:
    """Insert text lines starting at a point.

    Notes:
        The generated PDF operators are appended to 'self.text_cont'.
        Lines for which no space is left (see 'space' below) are dropped.
    Args:
        point: point_like, start position of the first character.
        buffer: a string (split into lines via 'splitlines') or a
            list / tuple of strings, one item per line.
        fontsize: the font size.
        lineheight: factor multiplied with fontsize to give the line
            height. If omitted, (ascender - descender) of the font is
            used, or 1.2 if that value is not larger than 1.
        fontname: font reference name, a leading '/' is removed.
        fontfile, set_simple, encoding: passed on to page.insert_font.
        color: stroke color, converted by pymupdf.ColorCode.
        fill: fill color. With render_mode 0 and no fill, 'color' is used.
        render_mode: emitted as 'Tr' operand if larger than 0.
        border_width: times fontsize emitted as line width ('w') if
            render_mode is larger than 0.
        rotate: must be a multiple of 90.
        morph: checked by pymupdf.CheckMorph; item 0 is used as a point,
            item 1 as a matrix.
        stroke_opacity, fill_opacity: passed on to page._set_opacity.
        oc: passed on to page._get_optional_content.
    Returns:
        Number of lines put into the output, or 0 if there was nothing
        to insert.
    Raises:
        ValueError: if 'rotate' is not a multiple of 90.
    """
    # ensure 'text' is a list of strings, worth dealing with
    if not bool(buffer):
        return 0

    if type(buffer) not in (list, tuple):
        text = buffer.splitlines()
    else:
        text = buffer

    if not len(text) > 0:
        return 0

    point = pymupdf.Point(point)
    try:
        # largest character code decides which glyph table is needed
        maxcode = max([ord(c) for c in " ".join(text)])
    except Exception:
        # e.g. no characters at all: report and treat as "nothing to insert"
        pymupdf.exception_info()
        return 0

    # ensure valid 'fontname'
    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = pymupdf.CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]
    # line height: explicit factor, or font metrics, or 1.2 as fallback
    if lineheight:
        lheight = fontsize * lineheight
    elif ascender - descender <= 1:
        lheight = fontsize * 1.2
    else:
        lheight = fontsize * (ascender - descender)

    if maxcode > 255:
        glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    else:
        glyphs = fontdict["glyphs"]

    # convert every line to its TJ operand string
    tab = []
    for t in text:
        if simple and bfname not in ("Symbol", "ZapfDingbats"):
            g = None
        else:
            g = glyphs
        tab.append(pymupdf.getTJstr(t, g, simple, ordering))
    text = tab

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if not fill and render_mode == 0:  # ensure fill color when 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    morphing = pymupdf.CheckMorph(morph)
    rot = rotate
    if rot % 90 != 0:
        raise ValueError("bad rotate value")

    while rot < 0:
        rot += 360
    rot = rot % 360  # text rotate = 0, 90, 270, 180

    # templ1: opening sequence up to and including the font selection
    templ1 = lambda a, b, c, d, e, f, g: f"\nq\n{a}{b}BT\n{c}1 0 0 1 {_format_g((d, e))} Tm\n/{f} {_format_g(g)} Tf "
    # templ2: closes line 1 and sets the distance to the next line
    templ2 = lambda a: f"TJ\n0 -{_format_g(a)} TD\n"
    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates 90 deg counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates 90 deg clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height
    width = self.width

    # setting up for standard rotation directions
    # case rotate = 0
    if morphing:
        # move to the fixpoint, apply the morph matrix, move back
        m1 = pymupdf.Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y)
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""
    top = height - point.y - self.y  # start of 1st char
    left = point.x + self.x  # start of 1. char
    space = top  # space available
    #headroom = point.y + self.y  # distance to page border
    if rot == 90:
        left = height - point.y - self.y
        top = -point.x - self.x
        cm += cmp90
        space = width - abs(top)
        #headroom = point.x + self.x

    elif rot == 270:
        left = -height + point.y + self.y
        top = point.x + self.x
        cm += cmm90
        space = abs(top)
        #headroom = width - point.x - self.x

    elif rot == 180:
        left = -point.x - self.x
        top = -height + point.y + self.y
        cm += cm180
        space = abs(point.y + self.y)
        #headroom = height - point.y - self.y

    # optional content: wrap the text in a marked content sequence
    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # opacity: reference the graphics state returned by the page, if any
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha
    nres = templ1(bdc, alpha, cm, left, top, fname, fontsize)

    if render_mode > 0:
        nres += "%i Tr " % render_mode
        nres += _format_g(border_width * fontsize) + " w "
    if color is not None:
        nres += color_str
    if fill is not None:
        nres += fill_str

    # =========================================================================
    #   start text insertion
    # =========================================================================
    nres += text[0]
    nlines = 1  # set output line counter
    if len(text) > 1:
        nres += templ2(lheight)  # line 1
    else:
        nres += 'TJ'
    for i in range(1, len(text)):
        if space < lheight:
            break  # no space left on page
        if i > 1:
            nres += "\nT* "
        nres += text[i] + 'TJ'
        space -= lheight
        nlines += 1

    nres += "\nET\n%sQ\n" % emc

    # =========================================================================
    #   end of text insertion
    # =========================================================================
    # update the /Contents object
    self.text_cont += nres
    return nlines
# ==============================================================================
# Shape.insert_textbox
# ==============================================================================
def insert_textbox(
    self,
    rect: rect_like,
    buffer: typing.Union[str, list],
    fontname: OptStr = "helv",
    fontfile: OptStr = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    border_width: float = 0.05,
    align: int = 0,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Insert text into a given rectangle.

    Notes:
        The text is broken into lines fitting the rectangle width. Words
        longer than one line are split character by character. If the
        resulting lines do not fit the rectangle height, nothing is
        written. Output is appended to 'self.text_cont'.
    Args:
        rect -- the textbox to fill
        buffer -- text to be inserted (string, or list / tuple of strings)
        fontname -- a Base-14 font, font name or '/name'
        fontfile -- name of a font file
        fontsize -- font size
        lineheight -- overwrite the font property
        set_simple, encoding -- passed on to page.insert_font
        color -- RGB stroke color triple
        fill -- RGB fill color triple
        render_mode -- text rendering control
        border_width -- thickness of glyph borders as percentage of fontsize
        expandtabs -- handles tabulators with string function
        align -- left, center, right, justified
        rotate -- 0, 90, 180, or 270 degrees
        morph -- morph box with a matrix and a fixpoint
        stroke_opacity, fill_opacity -- passed on to page._set_opacity
        oc -- passed on to page._get_optional_content
    Returns:
        The unused text height (>= 0) if the text was written. A negative
        value (the missing height) if the text did not fit - nothing is
        written in this case. For empty text the full available height.
    Raises:
        ValueError: for an empty or infinite rectangle, or if 'rotate' is
            not a multiple of 90.
    """
    rect = pymupdf.Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("text box must be finite and not empty")

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if fill is None and render_mode == 0:  # ensure fill color for 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # determine opacity / transparency
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha

    if rotate % 90 != 0:
        raise ValueError("rotate must be multiple of 90")

    rot = rotate
    while rot < 0:
        rot += 360
    rot = rot % 360

    # is buffer worth of dealing with?
    if not bool(buffer):
        return rect.height if rot in (0, 180) else rect.width

    # make one string from buffer; it is split into lines further down
    if type(buffer) in (list, tuple):
        t0 = "\n".join(buffer)
    else:
        t0 = buffer

    # A non-empty list may still contain no text at all (e.g. [""]).
    # Treat this like an empty buffer instead of failing in max() below.
    if not t0:
        return rect.height if rot in (0, 180) else rect.width

    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height

    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = pymupdf.CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    glyphs = fontdict["glyphs"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    # line height factor: explicit, or font metrics, or 1.2 as fallback
    if lineheight:
        lheight_factor = lineheight
    elif ascender - descender <= 1:
        lheight_factor = 1.2
    else:
        lheight_factor = ascender - descender
    lheight = fontsize * lheight_factor

    maxcode = max([ord(c) for c in t0])
    # replace invalid char codes for simple fonts
    if simple and maxcode > 255:
        t0 = "".join([c if ord(c) < 256 else "?" for c in t0])

    t0 = t0.splitlines()

    glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    if simple and bfname not in ("Symbol", "ZapfDingbats"):
        tj_glyphs = None
    else:
        tj_glyphs = glyphs

    # ----------------------------------------------------------------------
    # calculate pixel length of a string
    # ----------------------------------------------------------------------
    def pixlen(x):
        """Calculate pixel length of x."""
        if ordering < 0:
            return sum([glyphs[ord(c)][1] for c in x]) * fontsize
        else:
            return len(x) * fontsize

    # ---------------------------------------------------------------------

    if ordering < 0:
        blen = glyphs[32][1] * fontsize  # pixel size of space character
    else:
        blen = fontsize

    text = ""  # output buffer

    if pymupdf.CheckMorph(morph):
        # move to the fixpoint, apply the morph matrix, move back
        m1 = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""

    # ---------------------------------------------------------------------
    # adjust for text orientation / rotation
    # ---------------------------------------------------------------------
    progr = 1  # direction of line progress
    c_pnt = pymupdf.Point(0, fontsize * ascender)  # used for line progress
    if rot == 0:  # normal orientation
        point = rect.tl + c_pnt  # line 1 is 'lheight' below top
        maxwidth = rect.width  # pixels available in one line
        maxheight = rect.height  # available text height

    elif rot == 90:  # rotate counter clockwise
        c_pnt = pymupdf.Point(fontsize * ascender, 0)  # progress in x-direction
        point = rect.bl + c_pnt  # line 1 'lheight' away from left
        maxwidth = rect.height  # pixels available in one line
        maxheight = rect.width  # available text height
        cm += cmp90

    elif rot == 180:  # text upside down
        # progress upwards in y direction
        c_pnt = -pymupdf.Point(0, fontsize * ascender)
        point = rect.br + c_pnt  # line 1 'lheight' above bottom
        maxwidth = rect.width  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.height  # available text height
        cm += cm180

    else:  # rotate clockwise (270 or -90)
        # progress from right to left
        c_pnt = -pymupdf.Point(fontsize * ascender, 0)
        point = rect.tr + c_pnt  # line 1 'lheight' left of right
        maxwidth = rect.height  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.width  # available text height
        cm += cmm90

    # =====================================================================
    # line loop
    # =====================================================================
    just_tab = []  # 'justify' indicators per line

    for i, line in enumerate(t0):
        line_t = line.expandtabs(expandtabs).split(" ")  # split into words
        num_words = len(line_t)
        lbuff = ""  # init line buffer
        rest = maxwidth  # available line pixels
        # =================================================================
        # word loop
        # =================================================================
        for j in range(num_words):
            word = line_t[j]
            pl_w = pixlen(word)  # pixel len of word
            if rest >= pl_w:  # does it fit on the line?
                lbuff += word + " "  # yes, append word
                rest -= pl_w + blen  # update available line space
                continue  # next word

            # word doesn't fit - output line (if not empty)
            if lbuff:
                lbuff = lbuff.rstrip() + "\n"  # line full, append line break
                text += lbuff  # append to total text
                just_tab.append(True)  # can align-justify

            lbuff = ""  # re-init line buffer
            rest = maxwidth  # re-init avail. space

            if pl_w <= maxwidth:  # word shorter than 1 line?
                lbuff = word + " "  # start the line with it
                rest = maxwidth - pl_w - blen  # update free space
                continue

            # long word: split across multiple lines - char by char ...
            if len(just_tab) > 0:
                just_tab[-1] = False  # cannot align-justify

            for c in word:
                if pixlen(lbuff) <= maxwidth - pixlen(c):
                    lbuff += c
                else:  # line full
                    lbuff += "\n"  # close line
                    text += lbuff  # append to text
                    just_tab.append(False)  # cannot align-justify
                    lbuff = c  # start new line with this char

            lbuff += " "  # finish long word
            rest = maxwidth - pixlen(lbuff)  # long word stored

        if lbuff:  # unprocessed line content?
            text += lbuff.rstrip()  # append to text
            just_tab.append(False)  # cannot align-justify

        if i < len(t0) - 1:  # not the last line?
            text += "\n"  # insert line break

    # compute used part of the textbox
    if text.endswith("\n"):
        text = text[:-1]
    lb_count = text.count("\n") + 1  # number of lines written

    # text height = line count * line height plus one descender value
    text_height = lheight * lb_count - descender * fontsize

    more = text_height - maxheight  # difference to height limit
    if more > pymupdf.EPSILON:  # landed too much outside rect
        return (-1) * more  # return deficit, don't output

    more = abs(more)
    if more < pymupdf.EPSILON:
        more = 0  # don't bother with epsilons
    nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm  # initialize output buffer
    templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf "
    # center, right, justify: output each line with its own specifics
    text_t = text.splitlines()  # split text in lines again
    just_tab[-1] = False  # never justify last line
    for i, t in enumerate(text_t):
        spacing = 0
        pl = maxwidth - pixlen(t)  # length of empty line part
        pnt = point + c_pnt * (i * lheight_factor)  # text start of line
        if align == 1:  # center: right shift by half width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl / 2, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl / 2) * progr
        elif align == 2:  # right: right shift by full width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl) * progr
        elif align == 3:  # justify
            spaces = t.count(" ")  # number of spaces in line
            if spaces > 0 and just_tab[i]:  # if any, and we may justify
                spacing = pl / spaces  # make every space this much larger
            else:
                spacing = 0  # keep normal space length
        top = height - pnt.y - self.y
        left = pnt.x + self.x
        if rot == 90:
            left = height - pnt.y - self.y
            top = -pnt.x - self.x
        elif rot == 270:
            left = -height + pnt.y + self.y
            top = pnt.x + self.x
        elif rot == 180:
            left = -pnt.x - self.x
            top = -height + pnt.y + self.y

        nres += templ(left, top, fname, fontsize)

        if render_mode > 0:
            nres += "%i Tr " % render_mode
            nres += _format_g(border_width * fontsize) + " w "
        if align == 3:
            nres += _format_g(spacing) + " Tw "

        if color is not None:
            nres += color_str
        if fill is not None:
            nres += fill_str
        nres += "%sTJ\n" % pymupdf.getTJstr(t, tj_glyphs, simple, ordering)

    nres += "ET\n%sQ\n" % emc

    self.text_cont += nres
    self.updateRect(rect)
    return more
def finish(
    self,
    width: float = 1,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    dashes: OptStr = None,
    even_odd: bool = False,
    morph: OptSeq = None,
    closePath: bool = True,
    fill_opacity: float = 1,
    stroke_opacity: float = 1,
    oc: int = 0,
) -> None:
    """Finish the current drawing segment.

    Notes:
        Wraps the path operators collected in 'self.draw_cont' with
        colors, opacity, dashes, line cap / join style, line width and an
        optional morphing matrix, selects the painting operator and moves
        the result to 'self.totalcont'. An empty 'self.draw_cont' makes
        this a no-op. 'closePath' controls whether the path is closed by
        connecting last to first point.
    """
    if self.draw_cont == "":  # nothing was drawn: nothing to finish
        return

    # no width means no border color, and vice versa
    if width == 0:
        color = None
    elif color is None:
        width = 0

    stroke_cmd = pymupdf.ColorCode(color, "c")  # stroke color operator
    fill_cmd = pymupdf.ColorCode(fill, "f")  # fill color operator

    # optional content: open marked content here, close it after painting
    oc_name = self.page._get_optional_content(oc)
    if oc_name is None:
        end_marker = ""
    else:
        self.draw_cont = "/OC /%s BDC\n" % oc_name + self.draw_cont
        end_marker = "EMC\n"

    gstate = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if gstate is not None:
        self.draw_cont = "/%s gs\n" % gstate + self.draw_cont

    if not (width == 1 or width == 0):  # only emit non-default widths
        self.draw_cont += _format_g(width) + " w\n"

    # line style operators go in front of the path
    if lineCap != 0:
        self.draw_cont = "%i J\n" % lineCap + self.draw_cont
    if lineJoin != 0:
        self.draw_cont = "%i j\n" % lineJoin + self.draw_cont
    if dashes not in (None, "", "[] 0"):
        self.draw_cont = "%s d\n" % dashes + self.draw_cont

    if closePath:
        self.draw_cont += "h\n"
        self.last_point = None

    if color is not None:
        self.draw_cont += stroke_cmd

    # select the painting operator
    if fill is None:
        self.draw_cont += "S\n"  # stroke only
    else:
        self.draw_cont += fill_cmd
        if color is not None:  # fill and stroke
            self.draw_cont += "B*\n" if even_odd else "B\n"
        else:  # fill only
            self.draw_cont += "f*\n" if even_odd else "f\n"

    self.draw_cont += end_marker

    if pymupdf.CheckMorph(morph):
        # move to the fixpoint, apply the morph matrix, move back
        shift = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~shift * morph[1] * shift
        self.draw_cont = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" + self.draw_cont

    # store the segment, enclosed in its own graphics state
    self.totalcont += "\nq\n" + self.draw_cont + "Q\n"
    self.draw_cont = ""
    self.last_point = None
    return
def commit(self, overlay: bool = True) -> None:
    """Write the accumulated Shape data to the page's /Contents.

    Drawings come first, followed by the text output. 'overlay' decides
    whether the data appear in foreground (default) or background.
    Afterwards all buffers are reset, so the Shape can be used again.
    """
    pymupdf.CheckParent(self.page)  # doc may have died meanwhile
    self.totalcont += self.text_cont
    stream = self.totalcont.encode()
    self.totalcont = stream
    if stream:
        if overlay:
            # ensure a balanced graphics state
            self.page.wrap_contents()
        # make /Contents object with dummy stream ...
        new_xref = pymupdf.TOOLS._insert_contents(self.page, b" ", overlay)
        # ... and update it with potential compression
        self.doc.update_stream(new_xref, stream)

    # clean up for potential re-use
    self.last_point = self.rect = None
    self.draw_cont = self.text_cont = self.totalcont = ""
def apply_redactions(
    page: pymupdf.Page, images: int = 2, graphics: int = 1, text: int = 0
) -> bool:
    """Apply the redaction annotations of the page.

    Args:
        page: the PDF page.
        images:
              0 - ignore images
              1 - remove all overlapping images
              2 - blank out overlapping image parts
              3 - remove image unless invisible
        graphics:
              0 - ignore graphics
              1 - remove graphics if contained in rectangle
              2 - remove all overlapping graphics
        text:
              0 - remove text
              1 - ignore text
    Returns:
        False if the page has no redaction annotations, else True.
    Raises:
        ValueError: if the document is closed, encrypted or no PDF, or
            if applying the redactions reports a failure.
    """

    def center_rect(annot_rect, new_text, font, fsize):
        """Calculate minimal sub-rectangle for the overlay text.

        Notes:
            Because 'insert_textbox' supports no vertical text centering,
            we calculate an approximate number of lines here and return a
            sub-rect with smaller height, which should still be sufficient.
        Args:
            annot_rect: the annotation rectangle (not modified)
            new_text: the text to insert.
            font: the fontname. Must be one of the CJK or Base-14 set, else
                the rectangle is returned unchanged.
            fsize: the fontsize
        Returns:
            A rectangle to use instead of the annot rectangle.
        """
        if not new_text:
            return annot_rect

        # Only touch 'mupdf.FzErrorBase' where the version check allows it,
        # so the attribute lookup cannot fail for older MuPDF versions.
        if pymupdf.mupdf_version_tuple < (1, 24):
            exception_types = ValueError
        else:
            exception_types = (ValueError, mupdf.FzErrorBase)

        try:
            text_width = pymupdf.get_text_length(new_text, font, fsize)
        except exception_types:  # unsupported font
            if g_exceptions_verbose:
                pymupdf.exception_info()
            return annot_rect

        line_height = fsize * 1.2
        limit = annot_rect.width
        h = math.ceil(text_width / limit) * line_height  # estimate rect height
        if h >= annot_rect.height:
            return annot_rect

        # work on a copy: the caller's rectangle must stay as it is
        r = pymupdf.Rect(annot_rect)
        r.y0 = (annot_rect.tl.y + annot_rect.bl.y - h) * 0.5
        return r

    pymupdf.CheckParent(page)
    doc = page.parent
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    # save the values of all redactions before MuPDF removes the annotations
    redact_annots = [
        annot._get_redact_values()
        for annot in page.annots(
            types=(pymupdf.PDF_ANNOT_REDACT,)  # pylint: disable=no-member
        )
    ]

    if not redact_annots:  # any redactions on this page?
        return False  # no redactions

    rc = page._apply_redactions(text, images, graphics)  # call MuPDF
    if not rc:  # should not happen really
        raise ValueError("Error applying redactions.")

    # now write replacement text in old redact rectangles
    shape = page.new_shape()
    for redact in redact_annots:
        annot_rect = redact["rect"]
        fill = redact["fill"]
        if fill:
            shape.draw_rect(annot_rect)  # colorize the rect background
            shape.finish(fill=fill, color=fill)
        if "text" in redact:  # if we also have text
            new_text = redact["text"]
            align = redact.get("align", 0)
            fname = redact["fontname"]
            fsize = redact["fontsize"]
            color = redact["text_color"]
            # try finding vertical centered sub-rect
            trect = center_rect(annot_rect, new_text, fname, fsize)

            rc = -1
            while rc < 0 and fsize >= 4:  # while not enough room
                # (re-) try insertion
                rc = shape.insert_textbox(
                    trect,
                    new_text,
                    fontname=fname,
                    fontsize=fsize,
                    color=color,
                    align=align,
                )
                fsize -= 0.5  # reduce font if unsuccessful
    shape.commit()  # append new contents object
    return True
# ------------------------------------------------------------------------------
# Remove potentially sensitive data from a PDF. Similar to the Adobe
# Acrobat 'sanitize' function
# ------------------------------------------------------------------------------
def scrub(
    doc: pymupdf.Document,
    attached_files: bool = True,
    clean_pages: bool = True,
    embedded_files: bool = True,
    hidden_text: bool = True,
    javascript: bool = True,
    metadata: bool = True,
    redactions: bool = True,
    redact_images: int = 0,
    remove_links: bool = True,
    reset_fields: bool = True,
    reset_responses: bool = True,
    thumbnails: bool = True,
    xml_metadata: bool = True,
) -> None:
    """Remove potentially sensitive data from a PDF.

    Args:
        doc: the PDF document.
        attached_files: replace the content of file attachment annotations
            by a single space.
        clean_pages: clean the page /Contents. If False, 'hidden_text' and
            'redactions' are switched off, too.
        embedded_files: delete all embedded files.
        hidden_text: remove text written with render mode 3 ("3 Tr").
        javascript: replace JavaScript action objects by empty ones.
        metadata: reset the standard metadata.
        redactions: apply redaction annotations of the pages.
        redact_images: passed as 'images' to page.apply_redactions.
        remove_links: delete all links of the pages.
        reset_fields: reset all form fields (widgets).
        reset_responses: delete the responses of annotations.
        thumbnails: remove page thumbnails.
        xml_metadata: delete XML metadata and references to it.
    Raises:
        ValueError: if the document is no PDF, closed or encrypted, or if
            an xref without object definition is encountered.
    """

    def remove_hidden(cont_lines):
        """Remove hidden text from a PDF page.

        Args:
            cont_lines: list of lines with /Contents content. Should have status
                from after page.clean_contents().

        Returns:
            List of /Contents lines from which hidden text has been removed,
            or None if no "3 Tr" operator was encountered.

        Notes:
            The input must have been created after the page's /Contents object(s)
            have been cleaned with page.clean_contents(). This ensures a standard
            formatting: one command per line, single spaces between operators.
            This allows for drastic simplification of this code.
        """
        out_lines = []  # will return this
        in_text = False  # indicate if within BT/ET object
        suppress = False  # indicate text suppression active
        make_return = False
        for line in cont_lines:
            if line == b"BT":  # start of text object
                in_text = True  # switch on
                out_lines.append(line)  # output it
                continue
            if line == b"ET":  # end of text object
                in_text = False  # switch off
                out_lines.append(line)  # output it
                continue
            if line == b"3 Tr":  # text suppression operator
                suppress = True  # switch on
                make_return = True
                continue
            # Compare a bytes slice: indexing bytes yields an int, which
            # never equals b"3".
            if line[-2:] == b"Tr" and line[:1] != b"3":
                suppress = False  # text rendering changed
                out_lines.append(line)
                continue
            if line == b"Q":  # unstack command also switches off
                suppress = False
                out_lines.append(line)
                continue
            if suppress and in_text:  # suppress hidden lines
                continue
            out_lines.append(line)
        if make_return:
            return out_lines
        else:
            return None

    if not doc.is_pdf:  # only works for PDF
        raise ValueError("is no PDF")
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("closed or encrypted doc")

    if clean_pages is False:
        hidden_text = False
        redactions = False

    if metadata:
        doc.set_metadata({})  # remove standard metadata

    for page in doc:
        if reset_fields:
            # reset form fields (widgets)
            for widget in page.widgets():
                widget.reset()

        if remove_links:
            links = page.get_links()  # list of all links on page
            for link in links:  # remove all links
                page.delete_link(link)

        found_redacts = False
        for annot in page.annots():
            if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
                annot.update_file(buffer=b" ")  # set file content to empty
            if reset_responses:
                annot.delete_responses()
            if annot.type[0] == pymupdf.PDF_ANNOT_REDACT:  # pylint: disable=no-member
                found_redacts = True

        if redactions and found_redacts:
            page.apply_redactions(images=redact_images)

        # Thumbnails do not depend on the page contents: remove them before
        # any 'continue' below can skip the rest of the page handling.
        if thumbnails:  # remove page thumbnails?
            if doc.xref_get_key(page.xref, "Thumb")[0] != "null":
                doc.xref_set_key(page.xref, "Thumb", "null")

        if not (clean_pages or hidden_text):
            continue  # done with the page

        page.clean_contents()
        if not page.get_contents():
            continue
        if hidden_text:
            xref = page.get_contents()[0]  # only one b/o cleaning!
            cont = doc.xref_stream(xref)
            cont_lines = remove_hidden(cont.splitlines())  # remove hidden text
            if cont_lines:  # something was actually removed
                cont = b"\n".join(cont_lines)
                doc.update_stream(xref, cont)  # rewrite the page /Contents

    # pages are scrubbed, now perform document-wide scrubbing
    # remove embedded files
    if embedded_files:
        for name in doc.embfile_names():
            doc.embfile_del(name)

    if xml_metadata:
        doc.del_xml_metadata()

    # the xref table is only walked if one of these options needs it
    if not (xml_metadata or javascript):
        xref_limit = 0
    else:
        xref_limit = doc.xref_length()
    for xref in range(1, xref_limit):
        if not doc.xref_object(xref):
            msg = "bad xref %i - clean PDF before scrubbing" % xref
            raise ValueError(msg)
        if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript":
            # a /JavaScript action object
            obj = "<</S/JavaScript/JS()>>"  # replace with a null JavaScript
            doc.update_object(xref, obj)  # update this object
            continue  # no further handling

        if not xml_metadata:
            continue

        if doc.xref_get_key(xref, "Type")[1] == "/Metadata":
            # delete any metadata object directly
            doc.update_object(xref, "<<>>")
            doc.update_stream(xref, b"deleted", new=True)
            continue

        if doc.xref_get_key(xref, "Metadata")[0] != "null":
            doc.xref_set_key(xref, "Metadata", "null")
def _show_fz_text( text):
#if mupdf_cppyy:
# assert isinstance( text, cppyy.gbl.mupdf.Text)
#else:
# assert isinstance( text, mupdf.Text)
num_spans = 0
num_chars = 0
span = text.m_internal.head
while 1:
if not span:
break
num_spans += 1
num_chars += span.len
span = span.next
return f'num_spans={num_spans} num_chars={num_chars}'
def fill_textbox(
    writer: pymupdf.TextWriter,
    rect: rect_like,
    text: typing.Union[str, list],
    pos: point_like = None,
    font: typing.Optional[pymupdf.Font] = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    align: int = 0,
    warn: bool = None,
    right_to_left: bool = False,
    small_caps: bool = False,
) -> tuple:
    """Fill a rectangle with text.

    Args:
        writer: pymupdf.TextWriter object (= "self")
        rect: rect-like to receive the text.
        text: string or list/tuple of strings.
        pos: point-like start position of first word.
        font: pymupdf.Font object (default pymupdf.Font('helv')).
        fontsize: the fontsize.
        lineheight: overwrite the font property
        align: (int) 0 = left, 1 = center, 2 = right, 3 = justify
        warn: (bool) text overflow action: None = do nothing, True = issue
            a message, False = raise ValueError.
        right_to_left: (bool) indicate right-to-left language.
        small_caps: (bool) passed on to the font / writer functions.
    Returns:
        The list of lines that were not written. Each item is a pair
        (text, length).
    Raises:
        ValueError: if the rectangle is empty, the start position is
            outside the rectangle, or text does not fit and 'warn' is False.
    """
    rect = pymupdf.Rect(rect)
    if rect.is_empty:
        raise ValueError("fill rect must not empty.")
    if type(font) is not pymupdf.Font:
        font = pymupdf.Font("helv")

    def textlen(x):
        """Return length of a string."""
        return font.text_length(
            x, fontsize=fontsize, small_caps=small_caps
        )  # abbreviation

    def char_lengths(x):
        """Return list of single character lengths for a string."""
        return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps)

    def append_this(pos, text):
        """Append text at pos and return the result of writer.append."""
        ret = writer.append(
            pos, text, font=font, fontsize=fontsize, small_caps=small_caps
        )
        return ret

    tolerance = fontsize * 0.2  # extra distance to left border
    space_len = textlen(" ")
    std_width = rect.width - tolerance
    std_start = rect.x0 + tolerance

    def norm_words(width, words):
        """Cut any word in pieces no longer than 'width'."""
        nwords = []
        word_lengths = []
        for w in words:
            wl_lst = char_lengths(w)
            wl = sum(wl_lst)
            if wl <= width:  # nothing to do - copy over
                nwords.append(w)
                word_lengths.append(wl)
                continue

            # word longer than rect width - split it in parts
            n = len(wl_lst)
            while n > 0:
                wl = sum(wl_lst[:n])
                if wl <= width:
                    nwords.append(w[:n])
                    word_lengths.append(wl)
                    w = w[n:]
                    wl_lst = wl_lst[n:]
                    n = len(wl_lst)
                else:
                    n -= 1
        return nwords, word_lengths

    def output_justify(start, line):
        """Justified output of a line."""
        # ignore leading / trailing / multiple spaces
        words = [w for w in line.split(" ") if w != ""]
        nwords = len(words)
        if nwords == 0:
            return
        if nwords == 1:  # single word cannot be justified
            append_this(start, words[0])
            return
        tl = sum([textlen(w) for w in words])  # total word lengths
        gaps = nwords - 1  # number of word gaps
        gapl = (std_width - tl) / gaps  # width of each gap
        for w in words:
            _, lp = append_this(start, w)  # output one word
            start.x = lp.x + gapl  # next start at word end plus gap
        return

    asc = font.ascender
    dsc = font.descender
    if not lineheight:
        if asc - dsc <= 1:
            lheight = 1.2
        else:
            lheight = asc - dsc
    else:
        lheight = lineheight

    LINEHEIGHT = fontsize * lheight  # effective line height
    width = std_width  # available horizontal space

    # starting point of text
    if pos is not None:
        pos = pymupdf.Point(pos)
    else:  # default is just below rect top-left
        pos = rect.tl + (tolerance, fontsize * asc)
    if pos not in rect:
        raise ValueError("Text must start in rectangle.")

    # calculate displacement factor for alignment
    if align == pymupdf.TEXT_ALIGN_CENTER:
        factor = 0.5
    elif align == pymupdf.TEXT_ALIGN_RIGHT:
        factor = 1.0
    else:
        factor = 0

    # split in lines if just a string was given
    if type(text) is str:
        textlines = text.splitlines()
    else:
        textlines = []
        for line in text:
            textlines.extend(line.splitlines())

    max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1

    new_lines = []  # the final list of textbox lines
    no_justify = []  # no justify for these line numbers
    for i, line in enumerate(textlines):
        if line in ("", " "):
            new_lines.append((line, space_len))
            width = rect.width - tolerance
            no_justify.append((len(new_lines) - 1))
            continue
        if i == 0:
            width = rect.x1 - pos.x
        else:
            width = rect.width - tolerance

        if right_to_left:  # reverses Arabic / Hebrew text front to back
            line = writer.clean_rtl(line)
        tl = textlen(line)
        if tl <= width:  # line short enough
            new_lines.append((line, tl))
            no_justify.append((len(new_lines) - 1))
            continue

        # we need to split the line in fitting parts
        words = line.split(" ")  # the words in the line

        # Cut in parts any words that are longer than the width used for
        # fitting below. Using a larger limit here (e.g. the full rect
        # width while the first line is narrower) could leave a word that
        # never fits, so the loop below would not terminate.
        words, word_lengths = norm_words(width, words)

        n = len(words)
        while True:
            line0 = " ".join(words[:n])
            wl = sum(word_lengths[:n]) + space_len * (len(word_lengths[:n]) - 1)
            # n == 1: a single word is always accepted to guarantee progress
            if wl <= width or n == 1:
                new_lines.append((line0, wl))
                words = words[n:]
                word_lengths = word_lengths[n:]
                n = len(words)
                line0 = None
            else:
                n -= 1

            if len(words) == 0:
                break

    # -------------------------------------------------------------------------
    # List of lines created. Each item is (text, tl), where 'tl' is the PDF
    # output length (float) and 'text' is the text. Except for justified text,
    # this is output-ready.
    # -------------------------------------------------------------------------
    nlines = len(new_lines)
    if nlines > max_lines:
        msg = "Only fitting %i of %i lines." % (max_lines, nlines)
        if warn is True:
            pymupdf.message("Warning: " + msg)
        elif warn is False:
            raise ValueError(msg)

    start = pymupdf.Point()
    no_justify += [len(new_lines) - 1]  # no justifying of last line
    for i in range(max_lines):
        try:
            line, tl = new_lines.pop(0)
        except IndexError:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose: pymupdf.exception_info()
            break

        if right_to_left:  # Arabic, Hebrew
            line = "".join(reversed(line))

        if i == 0:  # may have different start for first line
            start = pos

        if align == pymupdf.TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width:
            output_justify(start, line)
            start.x = std_start
            start.y += LINEHEIGHT
            continue

        if i > 0 or pos.x == std_start:  # left, center, right alignments
            start.x += (width - tl) * factor

        append_this(start, line)
        start.x = std_start
        start.y += LINEHEIGHT

    return new_lines  # return non-written lines
# ------------------------------------------------------------------------
# Optional Content functions
# ------------------------------------------------------------------------
def get_oc(doc: pymupdf.Document, xref: int) -> int:
    """Return optional content object xref for an image or form xobject.

    Args:
        xref: (int) xref number of an image or form xobject.
    Returns:
        The xref number stored under key 'OC', or 0 if that key holds no
        object reference.
    Raises:
        ValueError: if the document is closed or encrypted, or the object
            is neither an image nor a form xobject.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")

    kind, subtype = doc.xref_get_key(xref, "Subtype")
    is_xobject = kind == "name" and subtype in ("/Image", "/Form")
    if not is_xobject:
        raise ValueError("bad object type at xref %i" % xref)

    kind, reference = doc.xref_get_key(xref, "OC")
    if kind == "xref":
        # the reference looks like "<xref> 0 R": keep the number only
        return int(reference.replace("0 R", ""))
    return 0
def set_oc(doc: pymupdf.Document, xref: int, oc: int) -> None:
    """Attach optional content object to image or form xobject.

    Args:
        xref: (int) xref number of an image or form xobject
        oc: (int) xref number of an OCG or OCMD. Use 0 to remove an
            existing optional content entry.
    Raises:
        ValueError: if the document is closed or encrypted, if 'xref' is
            neither an image nor a form xobject, or if 'oc' is negative or
            does not point to an OCG / OCMD.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")
    t, name = doc.xref_get_key(xref, "Subtype")
    if t != "name" or name not in ("/Image", "/Form"):
        raise ValueError("bad object type at xref %i" % xref)
    if oc < 0:  # would otherwise be written as an invalid reference
        raise ValueError("bad object type at xref %i" % oc)
    if oc == 0:
        # Nothing to attach: remove an existing entry, but never write a
        # reference to object 0 if there is none.
        if "OC" in doc.xref_get_keys(xref):
            doc.xref_set_key(xref, "OC", "null")
        return None
    t, name = doc.xref_get_key(oc, "Type")
    if t != "name" or name not in ("/OCG", "/OCMD"):
        raise ValueError("bad object type at xref %i" % oc)
    doc.xref_set_key(xref, "OC", "%i 0 R" % oc)
    return None
def set_ocmd(
    doc: pymupdf.Document,
    xref: int = 0,
    ocgs: typing.Union[list, None] = None,
    policy: OptStr = None,
    ve: typing.Union[list, None] = None,
) -> int:
    """Create a new OCMD object in a PDF or overwrite an existing one.

    Args:
        doc: the PDF document.
        xref: (int) 0 requests a new object, any other value must be the
            xref of an existing OCMD which is then replaced.
        ocgs: (list) xref numbers of OCGs to which 'policy' applies.
        policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing).
        ve: (list) visibility expression, an alternative to 'ocgs' plus
            'policy'. Format: [operator, operand, ...] where the operator is
            "and", "or" or "not" (any casing) and each operand is either an
            OCG xref or a nested expression.

    Returns:
        Xref of the created or updated OCMD.

    Raises:
        ValueError: for unknown OCG xrefs, an unknown policy, a malformed
            visibility expression, or if 'xref' is not an OCMD.
    """
    known_ocgs = set(doc.get_ocgs().keys())

    def ve_maker(ve):
        """Recursively convert a visibility expression to PDF array syntax."""
        if type(ve) not in (list, tuple) or len(ve) < 2:
            raise ValueError("bad 've' format: %s" % ve)
        operator = ve[0].lower()
        if operator not in ("and", "or", "not"):
            raise ValueError("bad operand: %s" % ve[0])
        if operator == "not" and len(ve) != 2:  # "not" takes one operand
            raise ValueError("bad 've' format: %s" % ve)
        parts = ["[/%s" % ve[0].title()]
        for operand in ve[1:]:
            if type(operand) is int:  # must be the xref of a known OCG
                if operand not in known_ocgs:
                    raise ValueError("bad OCG %i" % operand)
                parts.append(" %i 0 R" % operand)
            else:  # anything else is treated as a nested expression
                parts.append(" %s" % ve_maker(operand))
        parts.append("]")
        return "".join(parts)

    pieces = ["<</Type/OCMD"]

    if ocgs and type(ocgs) in (list, tuple):  # some OCGs are provided
        unknown = set(ocgs).difference(known_ocgs)  # illegal xrefs, if any
        if unknown:
            raise ValueError("bad OCGs: %s" % unknown)
        references = " ".join(["%i 0 R" % x for x in ocgs])
        pieces.append("/OCGs[" + references + "]")

    if policy:
        policy = str(policy).lower()
        pols = {
            "anyon": "AnyOn",
            "allon": "AllOn",
            "anyoff": "AnyOff",
            "alloff": "AllOff",
        }
        if policy not in pols:
            raise ValueError("bad policy: %s" % policy)
        pieces.append("/P/%s" % pols[policy])

    if ve:
        pieces.append("/VE%s" % ve_maker(ve))

    pieces.append(">>")
    text = "".join(pieces)

    # make new object or replace old OCMD (check type first)
    if xref == 0:
        xref = doc.get_new_xref()
    elif "/Type/OCMD" not in doc.xref_object(xref, compressed=True):
        raise ValueError("bad xref or not an OCMD")
    doc.update_object(xref, text)
    return xref
def get_ocmd(doc: pymupdf.Document, xref: int) -> dict:
    """Return the definition of an OCMD (optional content membership dictionary).

    Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy name) and
    /VE (visibility expression, PDF array). Via string manipulation of the
    compressed object source, this info is converted to a Python dictionary
    with keys "xref", "ocgs", "policy" and "ve" - ready to recycle as input
    for 'set_ocmd()'.

    Args:
        doc: the PDF document.
        xref: (int) xref number of the OCMD.

    Returns:
        A dict {"xref": int, "ocgs": list of int or None,
        "policy": str or None, "ve": list or None}.

    Raises:
        ValueError: if 'xref' is out of range, the object is no OCMD, or
            the /P or /VE entries have an unexpected syntax.
    """
    import json

    if xref not in range(doc.xref_length()):
        raise ValueError("bad xref")
    text = doc.xref_object(xref, compressed=True)
    if "/Type/OCMD" not in text:
        raise ValueError("bad object type")
    textlen = len(text)

    p0 = text.find("/OCGs[")  # look for /OCGs key
    p1 = text.find("]", p0)
    if p0 < 0 or p1 < 0:  # no OCGs found
        ocgs = None
    else:
        ocgs = text[p0 + 6 : p1].replace("0 R", " ").split()
        ocgs = list(map(int, ocgs))

    p0 = text.find("/P/")  # look for /P policy key
    if p0 < 0:
        policy = None
    else:
        # The policy is a PDF name: take all letters following "/P/".
        # Searching for the substrings "ff" / "on" is not reliable: the
        # names "AnyOn" and "AllOn" contain "On" with a capital letter.
        p1 = p0 + 3
        while p1 < textlen and text[p1].isalpha():
            p1 += 1
        policy = text[p0 + 3 : p1]
        if policy not in ("AllOn", "AllOff", "AnyOn", "AnyOff"):
            raise ValueError("bad object at xref")  # some irregular syntax

    p0 = text.find("/VE[")  # look for /VE visibility expression key
    if p0 < 0:  # no visibility expression found
        ve = None
    else:
        lp = rp = 0  # find end of /VE by matching the brackets
        p1 = p0
        while lp < 1 or lp != rp:
            p1 += 1
            if not p1 < textlen:  # some irregular syntax
                raise ValueError("bad object at xref")
            if text[p1] == "[":
                lp += 1
            if text[p1] == "]":
                rp += 1
        # p1 now positioned at the last "]"
        ve = text[p0 + 3 : p1 + 1]  # the PDF /VE array
        # rewrite the PDF array as JSON: operators become strings,
        # indirect references become plain integers
        ve = (
            ve.replace("/And", '"and",')
            .replace("/Not", '"not",')
            .replace("/Or", '"or",')
        )
        ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[")
        try:
            ve = json.loads(ve)
        except Exception:
            pymupdf.exception_info()
            pymupdf.message(f"bad /VE key: {ve!r}")
            raise
    return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve}
"""
Handle page labels for PDF documents.
Reading
-------
* compute the label of a page
* find page number(s) having the given label.
Writing
-------
Supports setting (defining) page labels for PDF documents.
A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and
significant parts of the following code during late December 2020
through early January 2021.
"""
def rule_dict(item):
    """Make a Python dict from a PDF page label rule.

    Args:
        item -- a tuple (pno, rule) with the start page number and the rule
                string like <</S/D...>>.
    Returns:
        A dict like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
        Key 'style' is only present if the rule contains an /S entry.
    """
    # Jorj McKie, 2021-01-06

    pno, rule = item
    parts = rule[2:-2].split("/")[1:]  # strip "<<" and ">>"
    d = {"startpage": pno, "prefix": "", "firstpagenum": 1}
    skip = False
    for i, part in enumerate(parts):
        if skip:  # this part has already been processed
            skip = False  # deactivate skipping again
            continue
        if part == "S":  # style specification
            d["style"] = parts[i + 1]  # next part has the style
            skip = True  # do not process next part again
            continue
        if part == "Type":
            # "/Type/PageLabel": skip the value, which would otherwise be
            # mistaken for a "/P" prefix specification
            skip = True
            continue
        if part.startswith("P"):  # prefix specification: extract the string
            d["prefix"] = part[1:].replace("(", "").replace(")", "")
            continue
        if part.startswith("St"):  # start page number specification
            d["firstpagenum"] = int(part[2:])
    return d
def get_label_pno(pgNo, labels):
    """Return the label for this page number.

    Args:
        pgNo: page number, 0-based.
        labels: result of doc._get_page_labels(), a list of items
            (start page number, rule string). The list need not be sorted.
    Returns:
        The label (str) of the page number. An empty string is returned if
        no rule starts at or before 'pgNo'.
    """
    # Jorj McKie, 2021-01-06

    # Select the rule with the largest start page not exceeding pgNo.
    # For equal start pages the later list entry wins.
    item = None
    for candidate in labels:
        if candidate[0] > pgNo:
            continue
        if item is None or candidate[0] >= item[0]:
            item = candidate
    if item is None:  # no rule covers this page
        return ""
    rule = rule_dict(item)
    prefix = rule.get("prefix", "")
    style = rule.get("style", "")
    # number to show = offset within the rule's range + its first number
    pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"]
    return construct_label(style, prefix, pagenumber)
def get_label(page):
    """Determine the label of a PDF page.

    Args:
        page: page object. Its attributes 'parent' (the owning document)
            and 'number' are used.
    Returns:
        The label (str) of the page, or an empty string if the document
        delivers no page label rules.
    """
    # Jorj McKie, 2021-01-06

    rules = page.parent._get_page_labels()
    if rules:
        rules.sort()  # bring the rules in ascending order
        return get_label_pno(page.number, rules)
    return ""
def get_page_numbers(doc, label, only_one=False):
    """Find the pages that carry a given label.

    Args:
        doc: PDF document object (resp. 'self').
        label: (str) the label to look for.
        only_one: (bool) stop searching after the first hit.
    Returns:
        List of 0-based page numbers having this label. The list is empty
        if 'label' is empty or the document has no page label rules.
    """
    # Jorj McKie, 2021-01-06

    hits = []
    if not label:
        return hits
    rules = doc._get_page_labels()
    if rules == []:
        return hits
    for pno in range(doc.page_count):
        if get_label_pno(pno, rules) != label:
            continue
        hits.append(pno)
        if only_one:
            break
    return hits
def construct_label(style, prefix, pno) -> str:
    """Construct a label based on style, prefix and page number.

    Args:
        style: numbering style, one of "D" (decimal), "r" / "R" (lower /
            upper case roman numerals), "a" / "A" (lower / upper case
            letters). Any other value produces no numeric portion.
        prefix: (str) text put in front of the numeric portion.
        pno: (int) the number to show, 1 for the first page of a range.
    Returns:
        The label string: prefix followed by the formatted number.
    """
    # William Chapman, 2021-01-06

    n_str = ""
    if style == "D":
        n_str = str(pno)
    elif style == "r":
        n_str = integerToRoman(pno).lower()
    elif style == "R":
        n_str = integerToRoman(pno).upper()
    elif style in ("a", "A"):
        # PDF letter numbering: A to Z for the numbers 1 to 26, then
        # AA to ZZ for 27 to 52, and so on - the same letter is repeated.
        if pno >= 1:
            repeats, offset = divmod(pno - 1, 26)
            n_str = chr(ord(style) + offset) * (repeats + 1)
    return prefix + n_str
def integerToLetter(i) -> str:
    """Return the upper case letter sequence for integer i.

    The sequence starts at zero: 0 -> "A", 25 -> "Z", 26 -> "AA",
    27 -> "AB", and so on.
    """
    # William Chapman, Jorj McKie, 2021-01-06
    import string

    alphabet = string.ascii_uppercase
    # determine the number of letters needed and the offset of i within
    # all sequences of that length
    width = 1
    remainder = i
    while pow(26, width) <= remainder:
        remainder -= int(math.pow(26, width))
        width += 1

    # write the offset as a base-26 number with exactly 'width' digits
    letters = []
    for exponent in range(width - 1, -1, -1):
        digit, remainder = divmod(remainder, int(math.pow(26, exponent)))
        letters.append(alphabet[digit])
    return "".join(letters)
def integerToRoman(num: int) -> str:
    """Convert an integer to its (upper case) roman numeral.

    Zero yields an empty string.
    """
    # William Chapman, Jorj McKie, 2021-01-06

    # values in descending order, including the subtractive forms
    numerals = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )

    pieces = []
    remaining = num
    for value, symbol in numerals:
        count = remaining // value  # how often this value fits
        pieces.append(symbol * count)
        remaining -= value * count
        if remaining <= 0:  # nothing left to convert
            break
    return "".join(pieces)
def get_page_labels(doc):
    """Deliver the page label definitions of a PDF document.

    Args:
        doc: PDF document (resp. 'self').
    Returns:
        A list with one dictionary per label rule, each made by
        'rule_dict' and looking like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
    """
    # Jorj McKie, 2021-01-10
    rules = doc._get_page_labels()
    return list(map(rule_dict, rules))
def set_page_labels(doc, labels):
    """Add / replace page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
        labels: list of label dictionaries like:
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int},
        as returned by get_page_labels(). The list is not modified.
    """
    # William Chapman, 2021-01-06

    def create_label_str(label):
        """Convert Python label dict to corresponding PDF rule string.

        Args:
            label: (dict) build rule for the label.
        Returns:
            The start page number followed by the PDF label rule wrapped
            in "<<", ">>".
        """
        s = "%i<<" % label["startpage"]
        if label.get("prefix", "") != "":
            s += "/P(%s)" % label["prefix"]
        if label.get("style", "") != "":
            s += "/S/%s" % label["style"]
        if label.get("firstpagenum", 1) > 1:  # 1 is the default: omit it
            s += "/St %i" % label["firstpagenum"]
        s += ">>"
        return s

    def create_nums(labels):
        """Return concatenated string of all labels rules.

        Args:
            labels: (list) dictionaries as created by function 'rule_dict'.
        Returns:
            PDF compatible string for page label definitions, ready to be
            enclosed in PDF array 'Nums[...]'.
        """
        # sort a copy: the caller's list must not be reordered
        ordered = sorted(labels, key=lambda x: x["startpage"])
        return "".join(create_label_str(label) for label in ordered)

    doc._set_page_labels(create_nums(labels))
# End of Page Label Code -------------------------------------------------
def has_links(doc: pymupdf.Document) -> bool:
    """Tell whether at least one page of the PDF contains a link.

    Raises:
        ValueError: if the document is closed or is not a PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # item[1] of each entry delivered by page_annot_xrefs() is compared
    # with the link annotation type; stop at the first match
    return any(
        item[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        for pno in range(doc.page_count)
        for item in doc.page_annot_xrefs(pno)
    )
def has_annots(doc: pymupdf.Document) -> bool:
    """Tell whether at least one page of the PDF contains an annotation.

    Links and widgets do not count as annotations here.

    Raises:
        ValueError: if the document is closed or is not a PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # pylint: disable=no-member
    return any(
        item[1] != pymupdf.PDF_ANNOT_LINK and item[1] != pymupdf.PDF_ANNOT_WIDGET
        for pno in range(doc.page_count)
        for item in doc.page_annot_xrefs(pno)
    )
# -------------------------------------------------------------------
# Functions to recover the quad contained in a text extraction bbox
# -------------------------------------------------------------------
def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad:
    """Compute the quad located inside the bbox.

    The bbox may be any of the resp. tuples occurring inside the given span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line or None. If None,
            'span["dir"]' is used instead.
        span: (dict) the span. May be from get_texttrace() method.
        bbox: (tuple) the bbox of the span or any of its characters.
    Returns:
        The quad which is wrapped by the bbox.
    """
    direction = span["dir"] if line_dir is None else line_dir
    cos, sin = direction
    rect = pymupdf.Rect(bbox)  # make it a rect

    # With small glyph heights the fontsize itself is the height, else it
    # is scaled by the span's ascender / descender difference.
    small = pymupdf.TOOLS.set_small_glyph_heights()
    scale = 1 if small else span["ascender"] - span["descender"]
    height = scale * span["size"]  # the quad's rectangle height

    # The following are distances from the bbox corners, at which we find the
    # respective quad points. The computation depends on in which quadrant
    # the text writing angle is located.
    x_off = height * sin
    y_off = height * cos
    if y_off >= 0 and x_off <= 0:  # quadrant 1
        corners = (
            rect.bl - (0, y_off),
            rect.tr + (x_off, 0),
            rect.bl - (x_off, 0),
            rect.tr + (0, y_off),
        )
    elif y_off <= 0 and x_off <= 0:  # quadrant 2
        corners = (
            rect.br + (x_off, 0),
            rect.tl - (0, y_off),
            rect.br + (0, y_off),
            rect.tl - (x_off, 0),
        )
    elif y_off <= 0 and x_off >= 0:  # quadrant 3
        corners = (
            rect.tr - (0, y_off),
            rect.bl + (x_off, 0),
            rect.tr - (x_off, 0),
            rect.bl + (0, y_off),
        )
    else:  # quadrant 4
        corners = (
            rect.tl + (x_off, 0),
            rect.br - (0, y_off),
            rect.tl + (0, y_off),
            rect.br - (x_off, 0),
        )
    # corner order: upper-left, upper-right, lower-left, lower-right
    return pymupdf.Quad(*corners)
def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad:
    """Compute the quadrilateral of a text span from its bbox.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line. Must be a tuple
            with two items.
        span: (dict) the span.
    Returns:
        The quadrilateral enveloping the span's text.
    Raises:
        ValueError: if 'line_dir' is no 2-tuple or 'span' is no dict.
    """
    if not (type(line_dir) is tuple and len(line_dir) == 2):
        raise ValueError("bad line dir argument")
    if not type(span) is dict:
        raise ValueError("bad span argument")
    span_bbox = span["bbox"]
    return recover_bbox_quad(line_dir, span, span_bbox)
def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad:
    """Calculate the line quad for 'dict' / 'rawdict' text extractions.

    The lower quad points are those of the first, resp. last span quad.
    The upper points are determined by the maximum span quad height.
    From this, compute a rect with bottom-left in (0, 0), convert this to a
    quad and rotate and shift back to cover the text of the spans.

    Args:
        line: (dict) the line. Its keys "dir" and - if 'spans' is omitted -
            "spans" are used.
        spans: (list, optional) sub-list of spans to consider.
    Returns:
        pymupdf.Quad covering selected spans.
    Raises:
        ValueError: if the list of spans is empty.
    """
    selected = line["spans"] if spans is None else spans  # default: all spans
    if len(selected) == 0:
        raise ValueError("bad span list")
    line_dir = line["dir"]  # text direction
    # the two components are not needed here, but unpacking fails early
    # if 'line_dir' does not consist of exactly two items
    _cos, _sin = line_dir

    first_quad = recover_quad(line_dir, selected[0])
    if len(selected) > 1:
        last_quad = recover_quad(line_dir, selected[-1])
    else:  # only one span: last = first
        last_quad = first_quad

    base_start = first_quad.ll  # lower-left of line quad
    base_end = last_quad.lr  # lower-right of line quad

    # map base line to x-axis such that base_start goes to (0, 0)
    mat0 = pymupdf.planish_line(base_start, base_end)
    x_lr = base_end * mat0

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?
    heights = []
    for s in selected:
        size = s["size"]
        factor = 1 if small else (s["ascender"] - s["descender"])
        heights.append(size * factor)
    h = max(heights)  # the tallest span defines the line height

    rect = pymupdf.Rect(0, -h, x_lr.x, 0)  # line rectangle
    quad = rect.quad  # make it a quad and:
    quad *= ~mat0  # rotate back and shift back
    return quad
def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad:
    """Calculate the span quad for 'dict' / 'rawdict' text extractions.

    Notes:
        There are two execution paths:
        1. For the full span quad, the result of 'recover_quad' is returned.
        2. For the quad of a sub-list of characters, the quads of the first
           and the last character are computed and joined. This is only
           supported for the "rawdict" extraction option.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line. If None,
            'span["dir"]' is used.
        span: (dict) the span.
        chars: (list, optional) sub-list of characters to consider.
    Returns:
        pymupdf.Quad covering selected characters.
    Raises:
        ValueError: if 'chars' is given but the span has no "chars" key.
    """
    direction = line_dir
    if direction is None:  # must be a span from get_texttrace()
        direction = span["dir"]
    if chars is None:  # no sub-selection
        return recover_quad(direction, span)
    if "chars" not in span.keys():
        raise ValueError("need 'rawdict' option to sub-select chars")

    first_quad = recover_char_quad(direction, span, chars[0])
    if len(chars) > 1:
        last_quad = recover_char_quad(direction, span, chars[-1])
    else:  # only one character: last = first
        last_quad = first_quad

    base_start = first_quad.ll  # lower-left of span quad
    base_end = last_quad.lr  # lower-right of span quad

    # map base line to x-axis such that base_start goes to (0, 0)
    mat0 = pymupdf.planish_line(base_start, base_end)
    x_lr = base_end * mat0

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?
    size = span["size"]
    factor = 1 if small else (span["ascender"] - span["descender"])
    h = size * factor

    rect = pymupdf.Rect(0, -h, x_lr.x, 0)  # rectangle on the x-axis
    quad = rect.quad  # make it a quad and:
    quad *= ~mat0  # rotate back and shift back
    return quad
def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
    """Recover the quadrilateral of a text character.

    This requires the "rawdict" option of text extraction.

    Args:
        line_dir: (tuple) 'line["dir"]' of the span's line. If None,
            'span["dir"]' is used.
        span: (dict) the span dict.
        char: the character: either a dict with key "bbox", or a tuple
            whose item 3 is the bbox.
    Returns:
        The quadrilateral enveloping the character.
    Raises:
        ValueError: if 'line_dir' is no 2-tuple, 'span' is no dict, or
            'char' is neither a dict nor a tuple.
    """
    if line_dir is None:
        line_dir = span["dir"]
    if type(line_dir) is not tuple or len(line_dir) != 2:
        raise ValueError("bad line dir argument")
    if type(span) is not dict:
        raise ValueError("bad span argument")
    if type(char) is dict:
        bbox = pymupdf.Rect(char["bbox"])
    elif type(char) is tuple:
        bbox = pymupdf.Rect(char[3])
    else:
        # name the offending argument correctly
        raise ValueError("bad char argument")
    return recover_bbox_quad(line_dir, span, bbox)
# -------------------------------------------------------------------
# Building font subsets using fontTools
# -------------------------------------------------------------------
def subset_fonts(
    doc: pymupdf.Document, verbose: bool = False, fallback: bool = False
) -> typing.Optional[int]:
    """Build font subsets of a PDF. Requires package 'fontTools'.

    Eligible fonts are potentially replaced by smaller versions. Page text is
    NOT rewritten and thus should retain properties like being hidden or
    controlled by optional content.

    This method by default uses MuPDF's own internal feature to create subset
    fonts. As this is a new function, errors may still occur. In this case,
    please fall back to using the previous version by using "fallback=True".

    Args:
        doc: the PDF document.
        verbose: (bool) write progress messages (fallback mode only).
        fallback: (bool) use the fontTools based implementation.

    Returns:
        None if the MuPDF function was used (default). In fallback mode the
        number of bytes by which the font binaries were reduced (int),
        which is 0 if no eligible font was found.
    """
    # Font binaries: -  "buffer" -> (names, xrefs, (unicodes, glyphs))
    # An embedded font is uniquely defined by its fontbuffer only. It may have
    # multiple names and xrefs.
    # Once the sets of used unicodes and glyphs are known, we compute a
    # smaller version of the buffer using package fontTools.

    if fallback is False:  # by default use MuPDF function
        pdf = mupdf.pdf_document_from_fz_document(doc)
        mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count)))
        return None

    font_buffers = {}

    def get_old_widths(xref):
        """Retrieve old font '/W' and '/DW' values."""
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specifications
            return None, None
        df_xref = int(df[1][1:-1].replace("0 R", ""))
        widths = doc.xref_get_key(df_xref, "W")
        if widths[0] != "array":  # no widths key found
            widths = None
        else:
            widths = widths[1]
        dwidths = doc.xref_get_key(df_xref, "DW")
        if dwidths[0] != "int":
            dwidths = None
        else:
            dwidths = dwidths[1]
        return widths, dwidths

    def set_old_widths(xref, widths, dwidths):
        """Restore the old '/W' and '/DW' in subsetted font.

        If either parameter is None or evaluates to False, the corresponding
        dictionary key will be set to null.
        """
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specs
            return None
        df_xref = int(df[1][1:-1].replace("0 R", ""))

        def restore(key, value):
            if type(value) is str and value:  # a usable old value exists
                doc.xref_set_key(df_xref, key, value)
            elif doc.xref_get_key(df_xref, key)[0] != "null":
                # no old value: remove the key if it is present. Never
                # pass the missing value itself to xref_set_key().
                doc.xref_set_key(df_xref, key, "null")

        restore("W", widths)
        restore("DW", dwidths)
        return None

    def set_subset_fontname(new_xref):
        """Generate a name prefix to tag a font as subset.

        We use a random generator to select 6 upper case ASCII characters.
        The prefixed name must be put in the font xref as the "/BaseFont" value
        and in the FontDescriptor object as the '/FontName' value.
        """
        # The following generates a prefix like 'ABCDEF+'
        import random
        import string

        prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+"
        font_str = doc.xref_object(new_xref, compressed=True)
        font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix)
        df = doc.xref_get_key(new_xref, "DescendantFonts")
        if df[0] == "array":
            df_xref = int(df[1][1:-1].replace("0 R", ""))
            fd = doc.xref_get_key(df_xref, "FontDescriptor")
            if fd[0] == "xref":
                fd_xref = int(fd[1].replace("0 R", ""))
                fd_str = doc.xref_object(fd_xref, compressed=True)
                fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix)
                doc.update_object(fd_xref, fd_str)
        doc.update_object(new_xref, font_str)

    def build_subset(buffer, unc_set, gid_set):
        """Build font subset using fontTools.

        Args:
            buffer: (bytes) the font given as a binary buffer.
            unc_set: (set) required unicodes.
            gid_set: (set) required glyph ids. Used instead of 'unc_set' if
                that contains the error unicode 0xFFFD.
        Returns:
            Either None if subsetting is unsuccessful or the subset font buffer.
        """
        try:
            import fontTools.subset as fts
        except ImportError:
            if g_exceptions_verbose:
                pymupdf.exception_info()
            pymupdf.message("This method requires fontTools to be installed.")
            raise
        import shutil
        import tempfile

        # Work in a private, uniquely named directory: fixed file names in
        # the shared temp directory would collide between concurrent runs.
        tmp_dir = tempfile.mkdtemp(prefix="pymupdf-subset-")
        try:
            oldfont_path = os.path.join(tmp_dir, "oldfont.ttf")
            newfont_path = os.path.join(tmp_dir, "newfont.ttf")
            uncfile_path = os.path.join(tmp_dir, "uncfile.txt")
            args = [
                oldfont_path,
                "--retain-gids",
                f"--output-file={newfont_path}",
                "--layout-features='*'",
                "--passthrough-tables",
                "--ignore-missing-glyphs",
                "--ignore-missing-unicodes",
                "--symbol-cmap",
            ]

            # store glyph ids or unicodes as file
            with open(uncfile_path, "w", encoding="utf8") as unc_file:
                if 0xFFFD in unc_set:  # error unicode exists -> use glyphs
                    args.append(f"--gids-file={uncfile_path}")
                    # NOTE(review): why glyph 189 is always included is
                    # not visible here - confirm before changing
                    gid_set.add(189)
                    for gid in gid_set:
                        unc_file.write("%i\n" % gid)
                else:
                    args.append(f"--unicodes-file={uncfile_path}")
                    # NOTE(review): why unicode 255 is always included is
                    # not visible here - confirm before changing
                    unc_set.add(255)
                    for unc in unc_set:
                        unc_file.write("%04x\n" % unc)

            # store fontbuffer as a file
            with open(oldfont_path, "wb") as fontfile:
                fontfile.write(buffer)

            try:  # invoke fontTools subsetter
                fts.main(args)
                font = pymupdf.Font(fontfile=newfont_path)
                new_buffer = font.buffer  # subset font binary
                if font.glyph_count == 0:  # intercept empty font
                    new_buffer = None
            except Exception:
                pymupdf.exception_info()
                new_buffer = None
        finally:
            # best-effort removal of all work files
            shutil.rmtree(tmp_dir, ignore_errors=True)
        return new_buffer

    def repl_fontnames(doc):
        """Populate 'font_buffers'.

        For each font candidate, store its xref and the list of names
        by which PDF text may refer to it (there may be multiple).
        """

        def norm_name(name):
            """Recreate font name that contains PDF hex codes.

            E.g. #20 -> space, chr(32)
            """
            while "#" in name:
                p = name.find("#")
                c = int(name[p + 1 : p + 3], 16)
                name = name.replace(name[p : p + 3], chr(c))
            return name

        def get_fontnames(doc, item):
            """Return a list of fontnames for an item of page.get_fonts().

            There may be multiple names e.g. for Type0 fonts.
            """
            fontname = item[3]
            names = [fontname]
            fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:]
            fontname = norm_name(fontname)
            if fontname not in names:
                names.append(fontname)
            descendents = doc.xref_get_key(item[0], "DescendantFonts")
            if descendents[0] != "array":
                return names
            descendents = descendents[1][1:-1]
            if descendents.endswith(" 0 R"):
                xref = int(descendents[:-4])
                descendents = doc.xref_object(xref, compressed=True)
            p1 = descendents.find("/BaseFont")
            if p1 >= 0:
                p2 = descendents.find("/", p1 + 1)  # start of the name
                if p2 >= 0:
                    # The name ends at the next key or at the end of the
                    # dictionary, whichever comes first. Ignore "not
                    # found" (-1) results when taking the minimum.
                    ends = [
                        p
                        for p in (
                            descendents.find("/", p2 + 1),
                            descendents.find(">>", p2 + 1),
                        )
                        if p >= 0
                    ]
                    p1 = min(ends) if ends else len(descendents)
                    fontname = descendents[p2 + 1 : p1]
                    fontname = norm_name(fontname)
                    if fontname not in names:
                        names.append(fontname)
            return names

        for i in range(doc.page_count):
            for f in doc.get_page_fonts(i, full=True):
                font_xref = f[0]  # font xref
                font_ext = f[1]  # font file extension
                basename = f[3]  # font basename

                if font_ext not in (  # skip if not supported by fontTools
                    "otf",
                    "ttf",
                    "woff",
                    "woff2",
                ):
                    continue
                # skip fonts which already are subsets
                if len(basename) > 6 and basename[6] == "+":
                    continue

                extr = doc.extract_font(font_xref)
                fontbuffer = extr[-1]
                names = get_fontnames(doc, f)
                name_set, xref_set, subsets = font_buffers.get(
                    fontbuffer, (set(), set(), (set(), set()))
                )
                xref_set.add(font_xref)
                for name in names:
                    name_set.add(name)
                font = pymupdf.Font(fontbuffer=fontbuffer)
                name_set.add(font.name)
                del font
                font_buffers[fontbuffer] = (name_set, xref_set, subsets)

    def find_buffer_by_name(name):
        """Return the font buffer known under this name, or None."""
        for buffer, (name_set, _, _) in font_buffers.items():
            if name in name_set:
                return buffer
        return None

    # -----------------
    # main function
    # -----------------
    repl_fontnames(doc)  # populate font information
    if not font_buffers:  # nothing found to do
        if verbose:
            pymupdf.message("No fonts to subset.")
        return 0

    old_fontsize = sum(len(fontbuffer) for fontbuffer in font_buffers)
    new_fontsize = 0

    # Scan page text for usage of subsettable fonts
    for page in doc:
        # go through the text and extend set of used glyphs by font
        # we use a modified MuPDF trace device, which delivers us glyph ids.
        for span in page.get_texttrace():
            if type(span) is not dict:  # skip useless information
                continue
            fontname = span["font"][:33]  # fontname for the span
            buffer = find_buffer_by_name(fontname)
            if buffer is None:
                continue
            name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer]
            for c in span["chars"]:
                set_ucs.add(c[0])  # unicode
                set_gid.add(c[1])  # glyph id
            font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid))

    # build the font subsets
    for old_buffer, (name_set, xref_set, subsets) in font_buffers.items():
        new_buffer = build_subset(old_buffer, subsets[0], subsets[1])
        fontname = list(name_set)[0]
        if new_buffer is None or len(new_buffer) >= len(old_buffer):
            # subset was not created or did not get smaller
            if verbose:
                pymupdf.message(f'Cannot subset {fontname!r}.')
            # the font stays as it is: its size must not count as saved
            new_fontsize += len(old_buffer)
            continue
        if verbose:
            pymupdf.message(f"Built subset of font {fontname!r}.")
        val = doc._insert_font(fontbuffer=new_buffer)  # store subset font in PDF
        new_xref = val[0]  # get its xref
        set_subset_fontname(new_xref)  # tag fontname as subset font
        font_str = doc.xref_object(  # get its object definition
            new_xref,
            compressed=True,
        )
        # walk through the original font xrefs and replace each by the subset def
        for font_xref in xref_set:
            # we need the original '/W' and '/DW' width values
            width_table, def_width = get_old_widths(font_xref)
            # ... and replace original font definition at xref with it
            doc.update_object(font_xref, font_str)
            # now copy over old '/W' and '/DW' values
            if width_table or def_width:
                set_old_widths(font_xref, width_table, def_width)
        # 'new_xref' remains unused in the PDF and must be removed
        # by garbage collection.
        new_fontsize += len(new_buffer)

    return old_fontsize - new_fontsize
# -------------------------------------------------------------------
# Copy XREF object to another XREF
# -------------------------------------------------------------------
def xref_copy(doc: pymupdf.Document, source: int, target: int, *, keep: list = None) -> None:
    """Copy a PDF dictionary object to another one given their xref numbers.

    Args:
        doc: PDF document object
        source: source xref number
        target: target xref number, the xref must already exist
        keep: an optional list of 1st level keys in target that should not be
              removed before copying.
    Notes:
        This works similar to the copy() method of dictionaries in Python. The
        source may be a stream object.
    """
    if doc.xref_is_stream(source):
        # transfer the raw (undecoded) stream of the source
        raw_stream = doc.xref_stream_raw(source)
        doc.update_stream(
            target,
            raw_stream,
            compress=False,  # keeps source compression
            new=True,  # in case target is no stream
        )

    # empty the target completely, except for the keys to keep
    protected = [] if keep is None else keep
    for key in doc.xref_get_keys(target):
        if key not in protected:
            doc.xref_set_key(target, key, "null")

    # copy over all source dict items
    for key in doc.xref_get_keys(source):
        _, value = doc.xref_get_key(source, key)  # (type, value) pair
        doc.xref_set_key(target, key, value)