# ------------------------------------------------------------------------ # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html # # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is # maintained and developed by Artifex Software, Inc. https://artifex.com. # ------------------------------------------------------------------------ import io import math import os import typing import weakref try: from . import pymupdf except Exception: import pymupdf try: from . import mupdf except Exception: import mupdf _format_g = pymupdf.format_g g_exceptions_verbose = pymupdf.g_exceptions_verbose TESSDATA_PREFIX = os.environ.get("TESSDATA_PREFIX") point_like = "point_like" rect_like = "rect_like" matrix_like = "matrix_like" quad_like = "quad_like" AnyType = typing.Any OptInt = typing.Union[int, None] OptFloat = typing.Optional[float] OptStr = typing.Optional[str] OptDict = typing.Optional[dict] OptBytes = typing.Optional[typing.ByteString] OptSeq = typing.Optional[typing.Sequence] """ This is a collection of functions to extend PyMupdf. """ def write_text( page: pymupdf.Page, rect=None, writers=None, overlay=True, color=None, opacity=None, keep_proportion=True, rotate=0, oc=0, ) -> None: """Write the text of one or more pymupdf.TextWriter objects. Args: rect: target rectangle. If None, the union of the text writers is used. writers: one or more pymupdf.TextWriter objects. overlay: put in foreground or background. keep_proportion: maintain aspect ratio of rectangle sides. rotate: arbitrary rotation angle. oc: the xref of an optional content object """ assert isinstance(page, pymupdf.Page) if not writers: raise ValueError("need at least one pymupdf.TextWriter") if type(writers) is pymupdf.TextWriter: if rotate == 0 and rect is None: writers.write_text(page, opacity=opacity, color=color, overlay=overlay) return None else: writers = (writers,) clip = writers[0].text_rect textdoc = pymupdf.Document() tpage = textdoc.new_page(width=page.rect.width, height=page.rect.height) for writer in writers: clip |= writer.text_rect writer.write_text(tpage, opacity=opacity, color=color) if rect is None: rect = clip page.show_pdf_page( rect, textdoc, 0, overlay=overlay, keep_proportion=keep_proportion, rotate=rotate, clip=clip, oc=oc, ) textdoc = None tpage = None def show_pdf_page( page, rect, src, pno=0, keep_proportion=True, overlay=True, oc=0, rotate=0, clip=None, ) -> int: """Show page number 'pno' of PDF 'src' in rectangle 'rect'. Args: rect: (rect-like) where to place the source image src: (document) source PDF pno: (int) source page number keep_proportion: (bool) do not change width-height-ratio overlay: (bool) put in foreground oc: (xref) make visibility dependent on this OCG / OCMD (which must be defined in the target PDF) rotate: (int) degrees (multiple of 90) clip: (rect-like) part of source page rectangle Returns: xref of inserted object (for reuse) """ def calc_matrix(sr, tr, keep=True, rotate=0): """Calculate transformation matrix from source to target rect. Notes: The product of four matrices in this sequence: (1) translate correct source corner to origin, (2) rotate, (3) scale, (4) translate to target's top-left corner. Args: sr: source rect in PDF (!) coordinate system tr: target rect in PDF coordinate system keep: whether to keep source ratio of width to height rotate: rotation angle in degrees Returns: Transformation matrix. """ # calc center point of source rect smp = (sr.tl + sr.br) / 2.0 # calc center point of target rect tmp = (tr.tl + tr.br) / 2.0 # m moves to (0, 0), then rotates m = pymupdf.Matrix(1, 0, 0, 1, -smp.x, -smp.y) * pymupdf.Matrix(rotate) sr1 = sr * m # resulting source rect to calculate scale factors fw = tr.width / sr1.width # scale the width fh = tr.height / sr1.height # scale the height if keep: fw = fh = min(fw, fh) # take min if keeping aspect ratio m *= pymupdf.Matrix(fw, fh) # concat scale matrix m *= pymupdf.Matrix(1, 0, 0, 1, tmp.x, tmp.y) # concat move to target center return pymupdf.JM_TUPLE(m) pymupdf.CheckParent(page) doc = page.parent if not doc.is_pdf or not src.is_pdf: raise ValueError("is no PDF") if rect.is_empty or rect.is_infinite: raise ValueError("rect must be finite and not empty") while pno < 0: # support negative page numbers pno += src.page_count src_page = src[pno] # load source page if src_page.get_contents() == []: raise ValueError("nothing to show - source page empty") tar_rect = rect * ~page.transformation_matrix # target rect in PDF coordinates src_rect = src_page.rect if not clip else src_page.rect & clip # source rect if src_rect.is_empty or src_rect.is_infinite: raise ValueError("clip must be finite and not empty") src_rect = src_rect * ~src_page.transformation_matrix # ... in PDF coord matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate) # list of existing /Form /XObjects ilst = [i[1] for i in doc.get_page_xobjects(page.number)] ilst += [i[7] for i in doc.get_page_images(page.number)] ilst += [i[4] for i in doc.get_page_fonts(page.number)] # create a name not in that list n = "fzFrm" i = 0 _imgname = n + "0" while _imgname in ilst: i += 1 _imgname = n + str(i) isrc = src._graft_id # used as key for graftmaps if doc._graft_id == isrc: raise ValueError("source document must not equal target") # retrieve / make pymupdf.Graftmap for source PDF gmap = doc.Graftmaps.get(isrc, None) if gmap is None: gmap = pymupdf.Graftmap(doc) doc.Graftmaps[isrc] = gmap # take note of generated xref for automatic reuse pno_id = (isrc, pno) # id of src[pno] xref = doc.ShownPages.get(pno_id, 0) if overlay: page.wrap_contents() # ensure a balanced graphics state xref = page._show_pdf_page( src_page, overlay=overlay, matrix=matrix, xref=xref, oc=oc, clip=src_rect, graftmap=gmap, _imgname=_imgname, ) doc.ShownPages[pno_id] = xref return xref def replace_image(page: pymupdf.Page, xref: int, *, filename=None, pixmap=None, stream=None): """Replace the image referred to by xref. Replace the image by changing the object definition stored under xref. This will leave the pages appearance instructions intact, so the new image is being displayed with the same bbox, rotation etc. By providing a small fully transparent image, an effect as if the image had been deleted can be achieved. A typical use may include replacing large images by a smaller version, e.g. with a lower resolution or graylevel instead of colored. Args: xref: the xref of the image to replace. filename, pixmap, stream: exactly one of these must be provided. The meaning being the same as in Page.insert_image. """ doc = page.parent # the owning document if not doc.xref_is_image(xref): raise ValueError("xref not an image") # insert new image anywhere in page if bool(filename) + bool(stream) + bool(pixmap) != 1: raise ValueError("Exactly one of filename/stream/pixmap must be given") new_xref = page.insert_image( page.rect, filename=filename, stream=stream, pixmap=pixmap ) doc.xref_copy(new_xref, xref) # copy over new to old last_contents_xref = page.get_contents()[-1] # new image insertion has created a new /Contents source, # which we will set to spaces now doc.update_stream(last_contents_xref, b" ") def delete_image(page: pymupdf.Page, xref: int): """Delete the image referred to by xef. Actually replaces by a small transparent Pixmap using method Page.replace_image. Args: xref: xref of the image to delete. """ # make a small 100% transparent pixmap (of just any dimension) pix = pymupdf.Pixmap(pymupdf.csGRAY, (0, 0, 1, 1), 1) pix.clear_with() # clear all samples bytes to 0x00 page.replace_image(xref, pixmap=pix) def insert_image( page, rect, *, alpha=-1, filename=None, height=0, keep_proportion=True, mask=None, oc=0, overlay=True, pixmap=None, rotate=0, stream=None, width=0, xref=0, ): """Insert an image for display in a rectangle. Args: rect: (rect_like) position of image on the page. alpha: (int, optional) set to 0 if image has no transparency. filename: (str, Path, file object) image filename. height: (int) keep_proportion: (bool) keep width / height ratio (default). mask: (bytes, optional) image consisting of alpha values to use. oc: (int) xref of OCG or OCMD to declare as Optional Content. overlay: (bool) put in foreground (default) or background. pixmap: (pymupdf.Pixmap) use this as image. rotate: (int) rotate by 0, 90, 180 or 270 degrees. stream: (bytes) use this as image. width: (int) xref: (int) use this as image. 'page' and 'rect' are positional, all other parameters are keywords. If 'xref' is given, that image is used. Other input options are ignored. Else, exactly one of pixmap, stream or filename must be given. 'alpha=0' for non-transparent images improves performance significantly. Affects stream and filename only. Optimum transparent insertions are possible by using filename / stream in conjunction with a 'mask' image of alpha values. Returns: xref (int) of inserted image. Re-use as argument for multiple insertions. """ pymupdf.CheckParent(page) doc = page.parent if not doc.is_pdf: raise ValueError("is no PDF") if xref == 0 and (bool(filename) + bool(stream) + bool(pixmap) != 1): raise ValueError("xref=0 needs exactly one of filename, pixmap, stream") if filename: if type(filename) is str: pass elif hasattr(filename, "absolute"): filename = str(filename) elif hasattr(filename, "name"): filename = filename.name else: raise ValueError("bad filename") if filename and not os.path.exists(filename): raise FileNotFoundError("No such file: '%s'" % filename) elif stream and type(stream) not in (bytes, bytearray, io.BytesIO): raise ValueError("stream must be bytes-like / BytesIO") elif pixmap and type(pixmap) is not pymupdf.Pixmap: raise ValueError("pixmap must be a pymupdf.Pixmap") if mask and not (stream or filename): raise ValueError("mask requires stream or filename") if mask and type(mask) not in (bytes, bytearray, io.BytesIO): raise ValueError("mask must be bytes-like / BytesIO") while rotate < 0: rotate += 360 while rotate >= 360: rotate -= 360 if rotate not in (0, 90, 180, 270): raise ValueError("bad rotate value") r = pymupdf.Rect(rect) if r.is_empty or r.is_infinite: raise ValueError("rect must be finite and not empty") clip = r * ~page.transformation_matrix # Create a unique image reference name. ilst = [i[7] for i in doc.get_page_images(page.number)] ilst += [i[1] for i in doc.get_page_xobjects(page.number)] ilst += [i[4] for i in doc.get_page_fonts(page.number)] n = "fzImg" # 'pymupdf image' i = 0 _imgname = n + "0" # first name candidate while _imgname in ilst: i += 1 _imgname = n + str(i) # try new name if overlay: page.wrap_contents() # ensure a balanced graphics state digests = doc.InsertedImages xref, digests = page._insert_image( filename=filename, pixmap=pixmap, stream=stream, imask=mask, clip=clip, overlay=overlay, oc=oc, xref=xref, rotate=rotate, keep_proportion=keep_proportion, width=width, height=height, alpha=alpha, _imgname=_imgname, digests=digests, ) if digests is not None: doc.InsertedImages = digests return xref def search_for( page, text, *, clip=None, quads=False, flags=pymupdf.TEXT_DEHYPHENATE | pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_MEDIABOX_CLIP , textpage=None, ) -> list: """Search for a string on a page. Args: text: string to be searched for clip: restrict search to this rectangle quads: (bool) return quads instead of rectangles flags: bit switches, default: join hyphened words textpage: a pre-created pymupdf.TextPage Returns: a list of rectangles or quads, each containing one occurrence. """ if clip is not None: clip = pymupdf.Rect(clip) pymupdf.CheckParent(page) tp = textpage if tp is None: tp = page.get_textpage(clip=clip, flags=flags) # create pymupdf.TextPage elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") rlist = tp.search(text, quads=quads) if textpage is None: del tp return rlist def search_page_for( doc: pymupdf.Document, pno: int, text: str, quads: bool = False, clip: rect_like = None, flags: int = pymupdf.TEXT_DEHYPHENATE | pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_MEDIABOX_CLIP , textpage: pymupdf.TextPage = None, ) -> list: """Search for a string on a page. Args: pno: page number text: string to be searched for clip: restrict search to this rectangle quads: (bool) return quads instead of rectangles flags: bit switches, default: join hyphened words textpage: reuse a prepared textpage Returns: a list of rectangles or quads, each containing an occurrence. """ return doc[pno].search_for( text, quads=quads, clip=clip, flags=flags, textpage=textpage, ) def get_text_blocks( page: pymupdf.Page, clip: rect_like = None, flags: OptInt = None, textpage: pymupdf.TextPage = None, sort: bool = False, ) -> list: """Return the text blocks on a page. Notes: Lines in a block are concatenated with line breaks. Args: flags: (int) control the amount of data parsed into the textpage. Returns: A list of the blocks. Each item contains the containing rectangle coordinates, text lines, running block number and block type. """ pymupdf.CheckParent(page) if flags is None: flags = pymupdf.TEXTFLAGS_BLOCKS tp = textpage if tp is None: tp = page.get_textpage(clip=clip, flags=flags) elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") blocks = tp.extractBLOCKS() if textpage is None: del tp if sort is True: blocks.sort(key=lambda b: (b[3], b[0])) return blocks def get_text_words( page: pymupdf.Page, clip: rect_like = None, flags: OptInt = None, textpage: pymupdf.TextPage = None, sort: bool = False, delimiters=None, ) -> list: """Return the text words as a list with the bbox for each word. Args: flags: (int) control the amount of data parsed into the textpage. delimiters: (str,list) characters to use as word delimiters Returns: Word tuples (x0, y0, x1, y1, "word", bno, lno, wno). """ pymupdf.CheckParent(page) if flags is None: flags = pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_MEDIABOX_CLIP tp = textpage if tp is None: tp = page.get_textpage(clip=clip, flags=flags) elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") words = tp.extractWORDS(delimiters) if textpage is None: del tp if sort is True: words.sort(key=lambda w: (w[3], w[0])) return words def get_textbox( page: pymupdf.Page, rect: rect_like, textpage: pymupdf.TextPage = None, ) -> str: tp = textpage if tp is None: tp = page.get_textpage() elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") rc = tp.extractTextbox(rect) if textpage is None: del tp return rc def get_text_selection( page: pymupdf.Page, p1: point_like, p2: point_like, clip: rect_like = None, textpage: pymupdf.TextPage = None, ): pymupdf.CheckParent(page) tp = textpage if tp is None: tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE) elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") rc = tp.extractSelection(p1, p2) if textpage is None: del tp return rc def get_textpage_ocr( page: pymupdf.Page, flags: int = 0, language: str = "eng", dpi: int = 72, full: bool = False, tessdata: str = None, ) -> pymupdf.TextPage: """Create a Textpage from combined results of normal and OCR text parsing. Args: flags: (int) control content becoming part of the result. language: (str) specify expected language(s). Deafault is "eng" (English). dpi: (int) resolution in dpi, default 72. full: (bool) whether to OCR the full page image, or only its images (default) """ pymupdf.CheckParent(page) if not TESSDATA_PREFIX and not tessdata: raise RuntimeError("No OCR support: TESSDATA_PREFIX not set") def full_ocr(page, dpi, language, flags): zoom = dpi / 72 mat = pymupdf.Matrix(zoom, zoom) pix = page.get_pixmap(matrix=mat) ocr_pdf = pymupdf.Document( "pdf", pix.pdfocr_tobytes( compress=False, language=language, tessdata=tessdata, ), ) ocr_page = ocr_pdf.load_page(0) unzoom = page.rect.width / ocr_page.rect.width ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix tpage = ocr_page.get_textpage(flags=flags, matrix=ctm) ocr_pdf.close() pix = None tpage.parent = weakref.proxy(page) return tpage # if OCR for the full page, OCR its pixmap @ desired dpi if full is True: return full_ocr(page, dpi, language, flags) # For partial OCR, make a normal textpage, then extend it with text that # is OCRed from each image. # Because of this, we need the images flag bit set ON. tpage = page.get_textpage(flags=flags) for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]: if block["type"] != 1: # only look at images continue bbox = pymupdf.Rect(block["bbox"]) if bbox.width <= 3 or bbox.height <= 3: # ignore tiny stuff continue exception_types = (RuntimeError, mupdf.FzErrorBase) if pymupdf.mupdf_version_tuple < (1, 24): exception_types = RuntimeError try: pix = pymupdf.Pixmap(block["image"]) # get image pixmap if pix.n - pix.alpha != 3: # we need to convert this to RGB! pix = pymupdf.Pixmap(pymupdf.csRGB, pix) if pix.alpha: # must remove alpha channel pix = pymupdf.Pixmap(pix, 0) imgdoc = pymupdf.Document( "pdf", pix.pdfocr_tobytes(language=language, tessdata=tessdata), ) # pdf with OCRed page imgpage = imgdoc.load_page(0) # read image as a page pix = None # compute matrix to transform coordinates back to that of 'page' imgrect = imgpage.rect # page size of image PDF shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height) mat = shrink * block["transform"] imgpage.extend_textpage(tpage, flags=0, matrix=mat) imgdoc.close() except exception_types: if g_exceptions_verbose: pymupdf.exception_info() tpage = None pymupdf.message("Falling back to full page OCR") return full_ocr(page, dpi, language, flags) return tpage def get_image_info(page: pymupdf.Page, hashes: bool = False, xrefs: bool = False) -> list: """Extract image information only from a pymupdf.TextPage. Args: hashes: (bool) include MD5 hash for each image. xrefs: (bool) try to find the xref for each image. Sets hashes to true. """ doc = page.parent if xrefs and doc.is_pdf: hashes = True if not doc.is_pdf: xrefs = False imginfo = getattr(page, "_image_info", None) if imginfo and not xrefs: return imginfo if not imginfo: tp = page.get_textpage(flags=pymupdf.TEXT_PRESERVE_IMAGES) imginfo = tp.extractIMGINFO(hashes=hashes) del tp if hashes: page._image_info = imginfo if not xrefs or not doc.is_pdf: return imginfo imglist = page.get_images() digests = {} for item in imglist: xref = item[0] pix = pymupdf.Pixmap(doc, xref) digests[pix.digest] = xref del pix for i in range(len(imginfo)): item = imginfo[i] xref = digests.get(item["digest"], 0) item["xref"] = xref imginfo[i] = item return imginfo def get_image_rects(page: pymupdf.Page, name, transform=False) -> list: """Return list of image positions on a page. Args: name: (str, list, int) image identification. May be reference name, an item of the page's image list or an xref. transform: (bool) whether to also return the transformation matrix. Returns: A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix) for all image locations on the page. """ if type(name) in (list, tuple): xref = name[0] elif type(name) is int: xref = name else: imglist = [i for i in page.get_images() if i[7] == name] if imglist == []: raise ValueError("bad image name") elif len(imglist) != 1: raise ValueError("multiple image names found") xref = imglist[0][0] pix = pymupdf.Pixmap(page.parent, xref) # make pixmap of the image to compute MD5 digest = pix.digest del pix infos = page.get_image_info(hashes=True) if not transform: bboxes = [pymupdf.Rect(im["bbox"]) for im in infos if im["digest"] == digest] else: bboxes = [ (pymupdf.Rect(im["bbox"]), pymupdf.Matrix(im["transform"])) for im in infos if im["digest"] == digest ] return bboxes def get_text( page: pymupdf.Page, option: str = "text", clip: rect_like = None, flags: OptInt = None, textpage: pymupdf.TextPage = None, sort: bool = False, delimiters=None, ): """Extract text from a page or an annotation. This is a unifying wrapper for various methods of the pymupdf.TextPage class. Args: option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml. clip: (rect-like) restrict output to this area. flags: bit switches to e.g. exclude images or decompose ligatures. textpage: reuse this pymupdf.TextPage and make no new one. If specified, 'flags' and 'clip' are ignored. Returns: the output of methods get_text_words / get_text_blocks or pymupdf.TextPage methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT, extractXHTML or etractXML respectively. Default and misspelling choice is "text". """ formats = { "text": pymupdf.TEXTFLAGS_TEXT, "html": pymupdf.TEXTFLAGS_HTML, "json": pymupdf.TEXTFLAGS_DICT, "rawjson": pymupdf.TEXTFLAGS_RAWDICT, "xml": pymupdf.TEXTFLAGS_XML, "xhtml": pymupdf.TEXTFLAGS_XHTML, "dict": pymupdf.TEXTFLAGS_DICT, "rawdict": pymupdf.TEXTFLAGS_RAWDICT, "words": pymupdf.TEXTFLAGS_WORDS, "blocks": pymupdf.TEXTFLAGS_BLOCKS, } option = option.lower() if option not in formats: option = "text" if flags is None: flags = formats[option] if option == "words": return get_text_words( page, clip=clip, flags=flags, textpage=textpage, sort=sort, delimiters=delimiters, ) if option == "blocks": return get_text_blocks( page, clip=clip, flags=flags, textpage=textpage, sort=sort ) pymupdf.CheckParent(page) cb = None if option in ("html", "xml", "xhtml"): # no clipping for MuPDF functions clip = page.cropbox if clip is not None: clip = pymupdf.Rect(clip) cb = None elif type(page) is pymupdf.Page: cb = page.cropbox # pymupdf.TextPage with or without images tp = textpage #pymupdf.exception_info() if tp is None: tp = page.get_textpage(clip=clip, flags=flags) elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") #pymupdf.log( '{option=}') if option == "json": t = tp.extractJSON(cb=cb, sort=sort) elif option == "rawjson": t = tp.extractRAWJSON(cb=cb, sort=sort) elif option == "dict": t = tp.extractDICT(cb=cb, sort=sort) elif option == "rawdict": t = tp.extractRAWDICT(cb=cb, sort=sort) elif option == "html": t = tp.extractHTML() elif option == "xml": t = tp.extractXML() elif option == "xhtml": t = tp.extractXHTML() else: t = tp.extractText(sort=sort) if textpage is None: del tp return t def get_page_text( doc: pymupdf.Document, pno: int, option: str = "text", clip: rect_like = None, flags: OptInt = None, textpage: pymupdf.TextPage = None, sort: bool = False, ) -> typing.Any: """Extract a document page's text by page number. Notes: Convenience function calling page.get_text(). Args: pno: page number option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml. Returns: output from page.TextPage(). """ return doc[pno].get_text(option, clip=clip, flags=flags, sort=sort) def get_pixmap( page: pymupdf.Page, *, matrix: matrix_like=pymupdf.Identity, dpi=None, colorspace: pymupdf.Colorspace=pymupdf.csRGB, clip: rect_like=None, alpha: bool=False, annots: bool=True, ) -> pymupdf.Pixmap: """Create pixmap of page. Keyword args: matrix: Matrix for transformation (default: Identity). dpi: desired dots per inch. If given, matrix is ignored. colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default csRGB. clip: (irect-like) restrict rendering to this area. alpha: (bool) whether to include alpha channel annots: (bool) whether to also render annotations """ if dpi: zoom = dpi / 72 matrix = pymupdf.Matrix(zoom, zoom) if type(colorspace) is str: if colorspace.upper() == "GRAY": colorspace = pymupdf.csGRAY elif colorspace.upper() == "CMYK": colorspace = pymupdf.csCMYK else: colorspace = pymupdf.csRGB if colorspace.n not in (1, 3, 4): raise ValueError("unsupported colorspace") dl = page.get_displaylist(annots=annots) pix = dl.get_pixmap(matrix=matrix, colorspace=colorspace, alpha=alpha, clip=clip) dl = None if dpi: pix.set_dpi(dpi, dpi) return pix def get_page_pixmap( doc: pymupdf.Document, pno: int, *, matrix: matrix_like = pymupdf.Identity, dpi=None, colorspace: pymupdf.Colorspace = pymupdf.csRGB, clip: rect_like = None, alpha: bool = False, annots: bool = True, ) -> pymupdf.Pixmap: """Create pixmap of document page by page number. Notes: Convenience function calling page.get_pixmap. Args: pno: (int) page number matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity). colorspace: (str,pymupdf.Colorspace) rgb, rgb, gray - case ignored, default csRGB. clip: (irect-like) restrict rendering to this area. alpha: (bool) include alpha channel annots: (bool) also render annotations """ return doc[pno].get_pixmap( matrix=matrix, dpi=dpi, colorspace=colorspace, clip=clip, alpha=alpha, annots=annots ) def getLinkDict(ln, document=None) -> dict: if isinstance(ln, pymupdf.Outline): dest = ln.destination(document) elif isinstance(ln, pymupdf.Link): dest = ln.dest else: assert 0, f'Unexpected {type(ln)=}.' nl = {"kind": dest.kind, "xref": 0} try: nl["from"] = ln.rect except Exception: # This seems to happen quite often in PyMuPDF/tests. if g_exceptions_verbose: pymupdf.exception_info() pass pnt = pymupdf.Point(0, 0) if dest.flags & pymupdf.LINK_FLAG_L_VALID: pnt.x = dest.lt.x if dest.flags & pymupdf.LINK_FLAG_T_VALID: pnt.y = dest.lt.y if dest.kind == pymupdf.LINK_URI: nl["uri"] = dest.uri elif dest.kind == pymupdf.LINK_GOTO: nl["page"] = dest.page nl["to"] = pnt if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM: nl["zoom"] = dest.rb.x else: nl["zoom"] = 0.0 elif dest.kind == pymupdf.LINK_GOTOR: nl["file"] = dest.file_spec.replace("\\", "/") nl["page"] = dest.page if dest.page < 0: nl["to"] = dest.dest else: nl["to"] = pnt if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM: nl["zoom"] = dest.rb.x else: nl["zoom"] = 0.0 elif dest.kind == pymupdf.LINK_LAUNCH: nl["file"] = dest.file_spec.replace("\\", "/") elif dest.kind == pymupdf.LINK_NAMED: # The dicts should not have same key(s). assert not (dest.named.keys() & nl.keys()) nl.update(dest.named) if 'to' in nl: nl['to'] = pymupdf.Point(nl['to']) else: nl["page"] = dest.page return nl def get_links(page: pymupdf.Page) -> list: """Create a list of all links contained in a PDF page. Notes: see PyMuPDF ducmentation for details. """ pymupdf.CheckParent(page) ln = page.first_link links = [] while ln: nl = getLinkDict(ln, page.parent) links.append(nl) ln = ln.next if links != [] and page.parent.is_pdf: linkxrefs = [x for x in #page.annot_xrefs() pymupdf.JM_get_annot_xref_list2(page) if x[1] == pymupdf.PDF_ANNOT_LINK # pylint: disable=no-member ] if len(linkxrefs) == len(links): for i in range(len(linkxrefs)): links[i]["xref"] = linkxrefs[i][0] links[i]["id"] = linkxrefs[i][2] return links def get_toc( doc: pymupdf.Document, simple: bool = True, ) -> list: """Create a table of contents. Args: simple: a bool to control output. Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see PyMuPDF's documentation. """ def recurse(olItem, liste, lvl): """Recursively follow the outline item chain and record item information in a list.""" while olItem and olItem.this.m_internal: if olItem.title: title = olItem.title else: title = " " if not olItem.is_external: if olItem.uri: if olItem.page == -1: resolve = doc.resolve_link(olItem.uri) page = resolve[0] + 1 else: page = olItem.page + 1 else: page = -1 else: page = -1 if not simple: link = getLinkDict(olItem, doc) liste.append([lvl, title, page, link]) else: liste.append([lvl, title, page]) if olItem.down: liste = recurse(olItem.down, liste, lvl + 1) olItem = olItem.next return liste # ensure document is open if doc.is_closed: raise ValueError("document closed") doc.init_doc() olItem = doc.outline if not olItem: return [] lvl = 1 liste = [] toc = recurse(olItem, liste, lvl) if doc.is_pdf and simple is False: doc._extend_toc_items(toc) return toc def del_toc_item( doc: pymupdf.Document, idx: int, ) -> None: """Delete TOC / bookmark item by index.""" xref = doc.get_outline_xrefs()[idx] doc._remove_toc_item(xref) def set_toc_item( doc: pymupdf.Document, idx: int, dest_dict: OptDict = None, kind: OptInt = None, pno: OptInt = None, uri: OptStr = None, title: OptStr = None, to: point_like = None, filename: OptStr = None, zoom: float = 0, ) -> None: """Update TOC item by index. It allows changing the item's title and link destination. Args: idx: (int) desired index of the TOC list, as created by get_toc. dest_dict: (dict) destination dictionary as created by get_toc(False). Outrules all other parameters. If None, the remaining parameters are used to make a dest dictionary. kind: (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only the title will be updated. If pymupdf.LINK_NONE, the TOC item will be deleted. pno: (int) page number (1-based like in get_toc). Required if pymupdf.LINK_GOTO. uri: (str) the URL, required if pymupdf.LINK_URI. title: (str) the new title. No change if None. to: (point-like) destination on the target page. If omitted, (72, 36) will be used as taget coordinates. filename: (str) destination filename, required for pymupdf.LINK_GOTOR and pymupdf.LINK_LAUNCH. name: (str) a destination name for pymupdf.LINK_NAMED. zoom: (float) a zoom factor for the target location (pymupdf.LINK_GOTO). """ xref = doc.get_outline_xrefs()[idx] page_xref = 0 if type(dest_dict) is dict: if dest_dict["kind"] == pymupdf.LINK_GOTO: pno = dest_dict["page"] page_xref = doc.page_xref(pno) page_height = doc.page_cropbox(pno).height to = dest_dict.get('to', pymupdf.Point(72, 36)) to.y = page_height - to.y dest_dict["to"] = to action = getDestStr(page_xref, dest_dict) if not action.startswith("/A"): raise ValueError("bad bookmark dest") color = dest_dict.get("color") if color: color = list(map(float, color)) if len(color) != 3 or min(color) < 0 or max(color) > 1: raise ValueError("bad color value") bold = dest_dict.get("bold", False) italic = dest_dict.get("italic", False) flags = italic + 2 * bold collapse = dest_dict.get("collapse") return doc._update_toc_item( xref, action=action[2:], title=title, color=color, flags=flags, collapse=collapse, ) if kind == pymupdf.LINK_NONE: # delete bookmark item return doc.del_toc_item(idx) if kind is None and title is None: # treat as no-op return None if kind is None: # only update title text return doc._update_toc_item(xref, action=None, title=title) if kind == pymupdf.LINK_GOTO: if pno is None or pno not in range(1, doc.page_count + 1): raise ValueError("bad page number") page_xref = doc.page_xref(pno - 1) page_height = doc.page_cropbox(pno - 1).height if to is None: to = pymupdf.Point(72, page_height - 36) else: to = pymupdf.Point(to) to.y = page_height - to.y ddict = { "kind": kind, "to": to, "uri": uri, "page": pno, "file": filename, "zoom": zoom, } action = getDestStr(page_xref, ddict) if action == "" or not action.startswith("/A"): raise ValueError("bad bookmark dest") return doc._update_toc_item(xref, action=action[2:], title=title) def get_area(*args) -> float: """Calculate area of rectangle.\nparameter is one of 'px' (default), 'in', 'cm', or 'mm'.""" rect = args[0] if len(args) > 1: unit = args[1] else: unit = "px" u = {"px": (1, 1), "in": (1.0, 72.0), "cm": (2.54, 72.0), "mm": (25.4, 72.0)} f = (u[unit][0] / u[unit][1]) ** 2 return f * rect.width * rect.height def set_metadata(doc: pymupdf.Document, m: dict) -> None: """Update the PDF /Info object. Args: m: a dictionary like doc.metadata. """ if not doc.is_pdf: raise ValueError("is no PDF") if doc.is_closed or doc.is_encrypted: raise ValueError("document closed or encrypted") if type(m) is not dict: raise ValueError("bad metadata") keymap = { "author": "Author", "producer": "Producer", "creator": "Creator", "title": "Title", "format": None, "encryption": None, "creationDate": "CreationDate", "modDate": "ModDate", "subject": "Subject", "keywords": "Keywords", "trapped": "Trapped", } valid_keys = set(keymap.keys()) diff_set = set(m.keys()).difference(valid_keys) if diff_set != set(): msg = "bad dict key(s): %s" % diff_set raise ValueError(msg) t, temp = doc.xref_get_key(-1, "Info") if t != "xref": info_xref = 0 else: info_xref = int(temp.replace("0 R", "")) if m == {} and info_xref == 0: # nothing to do return if info_xref == 0: # no prev metadata: get new xref info_xref = doc.get_new_xref() doc.update_object(info_xref, "<<>>") # fill it with empty object doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref) elif m == {}: # remove existing metadata doc.xref_set_key(-1, "Info", "null") return for key, val in [(k, v) for k, v in m.items() if keymap[k] is not None]: pdf_key = keymap[key] if not bool(val) or val in ("none", "null"): val = "null" else: val = pymupdf.get_pdf_str(val) doc.xref_set_key(info_xref, pdf_key, val) doc.init_doc() return def getDestStr(xref: int, ddict: dict) -> str: """Calculate the PDF action string. Notes: Supports Link annotations and outline items (bookmarks). """ if not ddict: return "" str_goto = lambda a, b, c, d: f"/A<>" str_gotor1 = lambda a, b, c, d, e, f: f"/A<>>>" str_gotor2 = lambda a, b, c: f"/A<>>>" str_launch = lambda a, b: f"/A<>>>" str_uri = lambda a: f"/A<>" if type(ddict) in (int, float): dest = str_goto(xref, 0, ddict, 0) return dest d_kind = ddict.get("kind", pymupdf.LINK_NONE) if d_kind == pymupdf.LINK_NONE: return "" if ddict["kind"] == pymupdf.LINK_GOTO: d_zoom = ddict.get("zoom", 0) to = ddict.get("to", pymupdf.Point(0, 0)) d_left, d_top = to dest = str_goto(xref, d_left, d_top, d_zoom) return dest if ddict["kind"] == pymupdf.LINK_URI: dest = str_uri(pymupdf.get_pdf_str(ddict["uri"]),) return dest if ddict["kind"] == pymupdf.LINK_LAUNCH: fspec = pymupdf.get_pdf_str(ddict["file"]) dest = str_launch(fspec, fspec) return dest if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] < 0: fspec = pymupdf.get_pdf_str(ddict["file"]) dest = str_gotor2(pymupdf.get_pdf_str(ddict["to"]), fspec, fspec) return dest if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] >= 0: fspec = pymupdf.get_pdf_str(ddict["file"]) dest = str_gotor1( ddict["page"], ddict["to"].x, ddict["to"].y, ddict["zoom"], fspec, fspec, ) return dest return "" def set_toc( doc: pymupdf.Document, toc: list, collapse: int = 1, ) -> int: """Create new outline tree (table of contents, TOC). Args: toc: (list, tuple) each entry must contain level, title, page and optionally top margin on the page. None or '()' remove the TOC. collapse: (int) collapses entries beyond this level. Zero or None shows all entries unfolded. Returns: the number of inserted items, or the number of removed items respectively. """ if doc.is_closed or doc.is_encrypted: raise ValueError("document closed or encrypted") if not doc.is_pdf: raise ValueError("is no PDF") if not toc: # remove all entries return len(doc._delToC()) # validity checks -------------------------------------------------------- if type(toc) not in (list, tuple): raise ValueError("'toc' must be list or tuple") toclen = len(toc) page_count = doc.page_count t0 = toc[0] if type(t0) not in (list, tuple): raise ValueError("items must be sequences of 3 or 4 items") if t0[0] != 1: raise ValueError("hierarchy level of item 0 must be 1") for i in list(range(toclen - 1)): t1 = toc[i] t2 = toc[i + 1] if not -1 <= t1[2] <= page_count: raise ValueError("row %i: page number out of range" % i) if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4): raise ValueError("bad row %i" % (i + 1)) if (type(t2[0]) is not int) or t2[0] < 1: raise ValueError("bad hierarchy level in row %i" % (i + 1)) if t2[0] > t1[0] + 1: raise ValueError("bad hierarchy level in row %i" % (i + 1)) # no formal errors in toc -------------------------------------------------- # -------------------------------------------------------------------------- # make a list of xref numbers, which we can use for our TOC entries # -------------------------------------------------------------------------- old_xrefs = doc._delToC() # del old outlines, get their xref numbers # prepare table of xrefs for new bookmarks old_xrefs = [] xref = [0] + old_xrefs xref[0] = doc._getOLRootNumber() # entry zero is outline root xref number if toclen > len(old_xrefs): # too few old xrefs? for i in range((toclen - len(old_xrefs))): xref.append(doc.get_new_xref()) # acquire new ones lvltab = {0: 0} # to store last entry per hierarchy level # ------------------------------------------------------------------------------ # contains new outline objects as strings - first one is the outline root # ------------------------------------------------------------------------------ olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}] # ------------------------------------------------------------------------------ # build olitems as a list of PDF-like connnected dictionaries # ------------------------------------------------------------------------------ for i in range(toclen): o = toc[i] lvl = o[0] # level title = pymupdf.get_pdf_str(o[1]) # title pno = min(doc.page_count - 1, max(0, o[2] - 1)) # page number page_xref = doc.page_xref(pno) page_height = doc.page_cropbox(pno).height top = pymupdf.Point(72, page_height - 36) dest_dict = {"to": top, "kind": pymupdf.LINK_GOTO} # fall back target if o[2] < 0: dest_dict["kind"] = pymupdf.LINK_NONE if len(o) > 3: # some target is specified if type(o[3]) in (int, float): # convert a number to a point dest_dict["to"] = pymupdf.Point(72, page_height - o[3]) else: # if something else, make sure we have a dict # We make a copy of o[3] to avoid modifying our caller's data. dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict if "to" not in dest_dict: # target point not in dict? dest_dict["to"] = top # put default in else: # transform target to PDF coordinates page = doc[pno] point = pymupdf.Point(dest_dict["to"]) point.y = page.cropbox.height - point.y point = point * page.rotation_matrix dest_dict["to"] = (point.x, point.y) d = {} d["first"] = -1 d["count"] = 0 d["last"] = -1 d["prev"] = -1 d["next"] = -1 d["dest"] = getDestStr(page_xref, dest_dict) d["top"] = dest_dict["to"] d["title"] = title d["parent"] = lvltab[lvl - 1] d["xref"] = xref[i + 1] d["color"] = dest_dict.get("color") d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0) lvltab[lvl] = i + 1 parent = olitems[lvltab[lvl - 1]] # the parent entry if ( dest_dict.get("collapse") or collapse and lvl > collapse ): # suppress expansion parent["count"] -= 1 # make /Count negative else: parent["count"] += 1 # positive /Count if parent["first"] == -1: parent["first"] = i + 1 parent["last"] = i + 1 else: d["prev"] = parent["last"] prev = olitems[parent["last"]] prev["next"] = i + 1 parent["last"] = i + 1 olitems.append(d) # ------------------------------------------------------------------------------ # now create each outline item as a string and insert it in the PDF # ------------------------------------------------------------------------------ for i, ol in enumerate(olitems): txt = "<<" if ol["count"] != 0: txt += "/Count %i" % ol["count"] try: txt += ol["dest"] except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose: pymupdf.exception_info() pass try: if ol["first"] > -1: txt += "/First %i 0 R" % xref[ol["first"]] except Exception: if g_exceptions_verbose: pymupdf.exception_info() pass try: if ol["last"] > -1: txt += "/Last %i 0 R" % xref[ol["last"]] except Exception: if g_exceptions_verbose: pymupdf.exception_info() pass try: if ol["next"] > -1: txt += "/Next %i 0 R" % xref[ol["next"]] except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose: pymupdf.exception_info() pass try: if ol["parent"] > -1: txt += "/Parent %i 0 R" % xref[ol["parent"]] except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose: pymupdf.exception_info() pass try: if ol["prev"] > -1: txt += "/Prev %i 0 R" % xref[ol["prev"]] except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose: pymupdf.exception_info() pass try: txt += "/Title" + ol["title"] except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose: pymupdf.exception_info() pass if ol.get("color") and len(ol["color"]) == 3: txt += f"/C[ {_format_g(tuple(ol['color']))}]" if ol.get("flags", 0) > 0: txt += "/F %i" % ol["flags"] if i == 0: # special: this is the outline root txt += "/Type/Outlines" # so add the /Type entry txt += ">>" doc.update_object(xref[i], txt) # insert the PDF object doc.init_doc() return toclen def do_links( doc1: pymupdf.Document, doc2: pymupdf.Document, from_page: int = -1, to_page: int = -1, start_at: int = -1, ) -> None: """Insert links contained in copied page range into destination PDF. Parameter values **must** equal those of method insert_pdf(), which must have been previously executed. """ #pymupdf.log( 'utils.do_links()') # -------------------------------------------------------------------------- # internal function to create the actual "/Annots" object string # -------------------------------------------------------------------------- def cre_annot(lnk, xref_dst, pno_src, ctm): """Create annotation object string for a passed-in link.""" r = lnk["from"] * ctm # rect in PDF coordinates rect = _format_g(tuple(r)) if lnk["kind"] == pymupdf.LINK_GOTO: txt = pymupdf.annot_skel["goto1"] # annot_goto idx = pno_src.index(lnk["page"]) p = lnk["to"] * ctm # target point in PDF coordinates annot = txt(xref_dst[idx], p.x, p.y, lnk["zoom"], rect) elif lnk["kind"] == pymupdf.LINK_GOTOR: if lnk["page"] >= 0: txt = pymupdf.annot_skel["gotor1"] # annot_gotor pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point if type(pnt) is not pymupdf.Point: pnt = pymupdf.Point(0, 0) annot = txt % ( lnk["page"], pnt.x, pnt.y, lnk["zoom"], lnk["file"], lnk["file"], rect, ) else: txt = pymupdf.annot_skel["gotor2"] # annot_gotor_n to = pymupdf.get_pdf_str(lnk["to"]) to = to[1:-1] f = lnk["file"] annot = txt(to, f, rect) elif lnk["kind"] == pymupdf.LINK_LAUNCH: txt = pymupdf.annot_skel["launch"] # annot_launch annot = txt(lnk["file"], lnk["file"], rect) elif lnk["kind"] == pymupdf.LINK_URI: txt = pymupdf.annot_skel["uri"] # annot_uri annot = txt(lnk["uri"], rect) else: annot = "" return annot # -------------------------------------------------------------------------- # validate & normalize parameters if from_page < 0: fp = 0 elif from_page >= doc2.page_count: fp = doc2.page_count - 1 else: fp = from_page if to_page < 0 or to_page >= doc2.page_count: tp = doc2.page_count - 1 else: tp = to_page if start_at < 0: raise ValueError("'start_at' must be >= 0") sa = start_at incr = 1 if fp <= tp else -1 # page range could be reversed # lists of source / destination page numbers pno_src = list(range(fp, tp + incr, incr)) pno_dst = [sa + i for i in range(len(pno_src))] # lists of source / destination page xrefs xref_src = [] xref_dst = [] for i in range(len(pno_src)): p_src = pno_src[i] p_dst = pno_dst[i] old_xref = doc2.page_xref(p_src) new_xref = doc1.page_xref(p_dst) xref_src.append(old_xref) xref_dst.append(new_xref) # create the links for each copied page in destination PDF for i in range(len(xref_src)): page_src = doc2[pno_src[i]] # load source page links = page_src.get_links() # get all its links #pymupdf.log( '{pno_src=}') #pymupdf.log( '{type(page_src)=}') #pymupdf.log( '{page_src=}') #pymupdf.log( '{=i len(links)}') if len(links) == 0: # no links there page_src = None continue ctm = ~page_src.transformation_matrix # calc page transformation matrix page_dst = doc1[pno_dst[i]] # load destination page link_tab = [] # store all link definitions here for l in links: if l["kind"] == pymupdf.LINK_GOTO and (l["page"] not in pno_src): continue # GOTO link target not in copied pages annot_text = cre_annot(l, xref_dst, pno_src, ctm) if annot_text: link_tab.append(annot_text) if link_tab != []: page_dst._addAnnot_FromString( tuple(link_tab)) #pymupdf.log( 'utils.do_links() returning.') def getLinkText(page: pymupdf.Page, lnk: dict) -> str: # -------------------------------------------------------------------------- # define skeletons for /Annots object texts # -------------------------------------------------------------------------- ctm = page.transformation_matrix ictm = ~ctm r = lnk["from"] rect = _format_g(tuple(r * ictm)) annot = "" if lnk["kind"] == pymupdf.LINK_GOTO: if lnk["page"] >= 0: txt = pymupdf.annot_skel["goto1"] # annot_goto pno = lnk["page"] xref = page.parent.page_xref(pno) pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point ipnt = pnt * ictm annot = txt(xref, ipnt.x, ipnt.y, lnk.get("zoom", 0), rect) else: txt = pymupdf.annot_skel["goto2"] # annot_goto_n annot = txt(pymupdf.get_pdf_str(lnk["to"]), rect) elif lnk["kind"] == pymupdf.LINK_GOTOR: if lnk["page"] >= 0: txt = pymupdf.annot_skel["gotor1"] # annot_gotor pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point if type(pnt) is not pymupdf.Point: pnt = pymupdf.Point(0, 0) annot = txt( lnk["page"], pnt.x, pnt.y, lnk.get("zoom", 0), lnk["file"], lnk["file"], rect, ) else: txt = pymupdf.annot_skel["gotor2"] # annot_gotor_n annot = txt(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect) elif lnk["kind"] == pymupdf.LINK_LAUNCH: txt = pymupdf.annot_skel["launch"] # annot_launch annot = txt(lnk["file"], lnk["file"], rect) elif lnk["kind"] == pymupdf.LINK_URI: txt = pymupdf.annot_skel["uri"] # txt = annot_uri annot = txt(lnk["uri"], rect) elif lnk["kind"] == pymupdf.LINK_NAMED: txt = pymupdf.annot_skel["named"] # annot_named lname = lnk.get("name") # check presence of key if lname is None: # if missing, fall back to alternative lname = lnk["nameddest"] annot = txt(lname, rect) if not annot: return annot # add a /NM PDF key to the object definition link_names = dict( # existing ids and their xref [(x[0], x[2]) for x in page.annot_xrefs() if x[1] == pymupdf.PDF_ANNOT_LINK] # pylint: disable=no-member ) old_name = lnk.get("id", "") # id value in the argument if old_name and (lnk["xref"], old_name) in link_names.items(): name = old_name # no new name if this is an update only else: i = 0 stem = pymupdf.TOOLS.set_annot_stem() + "-L%i" while True: name = stem % i if name not in link_names.values(): break i += 1 # add /NM key to object definition annot = annot.replace("/Link", "/Link/NM(%s)" % name) return annot def delete_widget(page: pymupdf.Page, widget: pymupdf.Widget) -> pymupdf.Widget: """Delete widget from page and return the next one.""" pymupdf.CheckParent(page) annot = getattr(widget, "_annot", None) if annot is None: raise ValueError("bad type: widget") nextwidget = widget.next page.delete_annot(annot) widget._annot.parent = None keylist = list(widget.__dict__.keys()) for key in keylist: del widget.__dict__[key] return nextwidget def update_link(page: pymupdf.Page, lnk: dict) -> None: """Update a link on the current page.""" pymupdf.CheckParent(page) annot = getLinkText(page, lnk) if annot == "": raise ValueError("link kind not supported") page.parent.update_object(lnk["xref"], annot, page=page) def insert_link(page: pymupdf.Page, lnk: dict, mark: bool = True) -> None: """Insert a new link for the current page.""" pymupdf.CheckParent(page) annot = getLinkText(page, lnk) if annot == "": raise ValueError("link kind not supported") page._addAnnot_FromString((annot,)) def insert_textbox( page: pymupdf.Page, rect: rect_like, buffer: typing.Union[str, list], fontname: str = "helv", fontfile: OptStr = None, set_simple: int = 0, encoding: int = 0, fontsize: float = 11, lineheight: OptFloat = None, color: OptSeq = None, fill: OptSeq = None, expandtabs: int = 1, align: int = 0, rotate: int = 0, render_mode: int = 0, border_width: float = 0.05, morph: OptSeq = None, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> float: """Insert text into a given rectangle. Notes: Creates a Shape object, uses its same-named method and commits it. Parameters: rect: (rect-like) area to use for text. buffer: text to be inserted fontname: a Base-14 font, font name or '/name' fontfile: name of a font file fontsize: font size lineheight: overwrite the font property color: RGB color triple expandtabs: handles tabulators with string function align: left, center, right, justified rotate: 0, 90, 180, or 270 degrees morph: morph box with a matrix and a fixpoint overlay: put text in foreground or background Returns: unused or deficit rectangle area (float) """ img = page.new_shape() rc = img.insert_textbox( rect, buffer, fontsize=fontsize, lineheight=lineheight, fontname=fontname, fontfile=fontfile, set_simple=set_simple, encoding=encoding, color=color, fill=fill, expandtabs=expandtabs, render_mode=render_mode, border_width=border_width, align=align, rotate=rotate, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) if rc >= 0: img.commit(overlay) return rc def insert_text( page: pymupdf.Page, point: point_like, text: typing.Union[str, list], fontsize: float = 11, lineheight: OptFloat = None, fontname: str = "helv", fontfile: OptStr = None, set_simple: int = 0, encoding: int = 0, color: OptSeq = None, fill: OptSeq = None, border_width: float = 0.05, render_mode: int = 0, rotate: int = 0, morph: OptSeq = None, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ): img = page.new_shape() rc = img.insert_text( point, text, fontsize=fontsize, lineheight=lineheight, fontname=fontname, fontfile=fontfile, set_simple=set_simple, encoding=encoding, color=color, fill=fill, border_width=border_width, render_mode=render_mode, rotate=rotate, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) if rc >= 0: img.commit(overlay) return rc def insert_htmlbox( page, rect, text, *, css=None, scale_low=0, archive=None, rotate=0, oc=0, opacity=1, overlay=True, ) -> float: """Insert text with optional HTML tags and stylings into a rectangle. Args: rect: (rect-like) rectangle into which the text should be placed. text: (str) text with optional HTML tags and stylings. css: (str) CSS styling commands. scale_low: (float) force-fit content by scaling it down. Must be in range [0, 1]. If 1, no scaling will take place. If 0, arbitrary down-scaling is acceptable. A value of 0.1 would mean that content may be scaled down by at most 90%. archive: Archive object pointing to locations of used fonts or images rotate: (int) rotate the text in the box by a multiple of 90 degrees. oc: (int) the xref of an OCG / OCMD (Optional Content). opacity: (float) set opacity of inserted content. overlay: (bool) put text on top of page content. Returns: A tuple of floats (spare_height, scale). spare_height: -1 if content did not fit, else >= 0. It is the height of the unused (still available) rectangle stripe. Positive only if scale_min = 1 (no down scaling). scale: downscaling factor, 0 < scale <= 1. Set to 0 if spare_height = -1 (no fit). """ # normalize rotation angle if not rotate % 90 == 0: raise ValueError("bad rotation angle") while rotate < 0: rotate += 360 while rotate >= 360: rotate -= 360 if not 0 <= scale_low <= 1: raise ValueError("'scale_low' must be in [0, 1]") if css is None: css = "" rect = pymupdf.Rect(rect) if rotate in (90, 270): temp_rect = pymupdf.Rect(0, 0, rect.height, rect.width) else: temp_rect = pymupdf.Rect(0, 0, rect.width, rect.height) # use a small border by default mycss = "body {margin:1px;}" + css # append user CSS # either make a story, or accept a given one if isinstance(text, str): # if a string, convert to a Story story = pymupdf.Story(html=text, user_css=mycss, archive=archive) elif isinstance(text, pymupdf.Story): story = text else: raise ValueError("'text' must be a string or a Story") # ---------------------------------------------------------------- # Find a scaling factor that lets our story fit in # ---------------------------------------------------------------- scale_max = None if scale_low == 0 else 1 / scale_low fit = story.fit_scale(temp_rect, scale_min=1, scale_max=scale_max) if fit.big_enough is False: # there was no fit return (-1, scale_low) filled = fit.filled scale = 1 / fit.parameter # shrink factor spare_height = fit.rect.y1 - filled[3] # unused room at rectangle bottom # Note: due to MuPDF's logic this may be negative even for successful fits. if scale != 1 or spare_height < 0: # if scaling occurred, set spare_height to 0 spare_height = 0 def rect_function(*args): return fit.rect, fit.rect, pymupdf.Identity # draw story on temp PDF page doc = story.write_with_links(rect_function) # Insert opacity if requested. # For this, we prepend a command to the /Contents. if 0 <= opacity < 1: tpage = doc[0] # load page # generate /ExtGstate for the page alp0 = tpage._set_opacity(CA=opacity, ca=opacity) s = f"/{alp0} gs\n" # generate graphic state command pymupdf.TOOLS._insert_contents(tpage, s.encode(), 0) # put result in target page page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay) # ------------------------------------------------------------------------- # re-insert links in target rect (show_pdf_page cannot copy annotations) # ------------------------------------------------------------------------- # scaled center point of fit.rect mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale # center point of target rect mp2 = (rect.tl + rect.br) / 2 # compute link positioning matrix: # - move center of scaled-down fit.rect to (0,0) # - rotate # - move (0,0) to center of target rect mat = ( pymupdf.Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y) * pymupdf.Matrix(-rotate) * pymupdf.Matrix(1, 0, 0, 1, mp2.x, mp2.y) ) # copy over links for link in doc[0].get_links(): link["from"] *= mat page.insert_link(link) return spare_height, scale def new_page( doc: pymupdf.Document, pno: int = -1, width: float = 595, height: float = 842, ) -> pymupdf.Page: """Create and return a new page object. Args: pno: (int) insert before this page. Default: after last page. width: (float) page width in points. Default: 595 (ISO A4 width). height: (float) page height in points. Default 842 (ISO A4 height). Returns: A pymupdf.Page object. """ doc._newPage(pno, width=width, height=height) return doc[pno] def insert_page( doc: pymupdf.Document, pno: int, text: typing.Union[str, list, None] = None, fontsize: float = 11, width: float = 595, height: float = 842, fontname: str = "helv", fontfile: OptStr = None, color: OptSeq = (0,), ) -> int: """Create a new PDF page and insert some text. Notes: Function combining pymupdf.Document.new_page() and pymupdf.Page.insert_text(). For parameter details see these methods. """ page = doc.new_page(pno=pno, width=width, height=height) if not bool(text): return 0 rc = page.insert_text( (50, 72), text, fontsize=fontsize, fontname=fontname, fontfile=fontfile, color=color, ) return rc def draw_line( page: pymupdf.Page, p1: point_like, p2: point_like, color: OptSeq = (0,), dashes: OptStr = None, width: float = 1, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, morph: OptSeq = None, stroke_opacity: float = 1, fill_opacity: float = 1, oc=0, ) -> pymupdf.Point: """Draw a line from point p1 to point p2.""" img = page.new_shape() p = img.draw_line(pymupdf.Point(p1), pymupdf.Point(p2)) img.finish( color=color, dashes=dashes, width=width, closePath=False, lineCap=lineCap, lineJoin=lineJoin, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return p def draw_squiggle( page: pymupdf.Page, p1: point_like, p2: point_like, breadth: float = 2, color: OptSeq = (0,), dashes: OptStr = None, width: float = 1, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, morph: OptSeq = None, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> pymupdf.Point: """Draw a squiggly line from point p1 to point p2.""" img = page.new_shape() p = img.draw_squiggle(pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth) img.finish( color=color, dashes=dashes, width=width, closePath=False, lineCap=lineCap, lineJoin=lineJoin, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return p def draw_zigzag( page: pymupdf.Page, p1: point_like, p2: point_like, breadth: float = 2, color: OptSeq = (0,), dashes: OptStr = None, width: float = 1, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, morph: OptSeq = None, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> pymupdf.Point: """Draw a zigzag line from point p1 to point p2.""" img = page.new_shape() p = img.draw_zigzag(pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth) img.finish( color=color, dashes=dashes, width=width, closePath=False, lineCap=lineCap, lineJoin=lineJoin, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return p def draw_rect( page: pymupdf.Page, rect: rect_like, color: OptSeq = (0,), fill: OptSeq = None, dashes: OptStr = None, width: float = 1, lineCap: int = 0, lineJoin: int = 0, morph: OptSeq = None, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, radius=None, ) -> pymupdf.Point: ''' Draw a rectangle. See Shape class method for details. ''' img = page.new_shape() Q = img.draw_rect(pymupdf.Rect(rect), radius=radius) img.finish( color=color, fill=fill, dashes=dashes, width=width, lineCap=lineCap, lineJoin=lineJoin, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return Q def draw_quad( page: pymupdf.Page, quad: quad_like, color: OptSeq = (0,), fill: OptSeq = None, dashes: OptStr = None, width: float = 1, lineCap: int = 0, lineJoin: int = 0, morph: OptSeq = None, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> pymupdf.Point: """Draw a quadrilateral.""" img = page.new_shape() Q = img.draw_quad(pymupdf.Quad(quad)) img.finish( color=color, fill=fill, dashes=dashes, width=width, lineCap=lineCap, lineJoin=lineJoin, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return Q def draw_polyline( page: pymupdf.Page, points: list, color: OptSeq = (0,), fill: OptSeq = None, dashes: OptStr = None, width: float = 1, morph: OptSeq = None, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, closePath: bool = False, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> pymupdf.Point: """Draw multiple connected line segments.""" img = page.new_shape() Q = img.draw_polyline(points) img.finish( color=color, fill=fill, dashes=dashes, width=width, lineCap=lineCap, lineJoin=lineJoin, morph=morph, closePath=closePath, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return Q def draw_circle( page: pymupdf.Page, center: point_like, radius: float, color: OptSeq = (0,), fill: OptSeq = None, morph: OptSeq = None, dashes: OptStr = None, width: float = 1, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> pymupdf.Point: """Draw a circle given its center and radius.""" img = page.new_shape() Q = img.draw_circle(pymupdf.Point(center), radius) img.finish( color=color, fill=fill, dashes=dashes, width=width, lineCap=lineCap, lineJoin=lineJoin, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return Q def draw_oval( page: pymupdf.Page, rect: typing.Union[rect_like, quad_like], color: OptSeq = (0,), fill: OptSeq = None, dashes: OptStr = None, morph: OptSeq = None, width: float = 1, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> pymupdf.Point: """Draw an oval given its containing rectangle or quad.""" img = page.new_shape() Q = img.draw_oval(rect) img.finish( color=color, fill=fill, dashes=dashes, width=width, lineCap=lineCap, lineJoin=lineJoin, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return Q def draw_curve( page: pymupdf.Page, p1: point_like, p2: point_like, p3: point_like, color: OptSeq = (0,), fill: OptSeq = None, dashes: OptStr = None, width: float = 1, morph: OptSeq = None, closePath: bool = False, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> pymupdf.Point: """Draw a special Bezier curve from p1 to p3, generating control points on lines p1 to p2 and p2 to p3.""" img = page.new_shape() Q = img.draw_curve(pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3)) img.finish( color=color, fill=fill, dashes=dashes, width=width, lineCap=lineCap, lineJoin=lineJoin, morph=morph, closePath=closePath, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return Q def draw_bezier( page: pymupdf.Page, p1: point_like, p2: point_like, p3: point_like, p4: point_like, color: OptSeq = (0,), fill: OptSeq = None, dashes: OptStr = None, width: float = 1, morph: OptStr = None, closePath: bool = False, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> pymupdf.Point: """Draw a general cubic Bezier curve from p1 to p4 using control points p2 and p3.""" img = page.new_shape() Q = img.draw_bezier(pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3), pymupdf.Point(p4)) img.finish( color=color, fill=fill, dashes=dashes, width=width, lineCap=lineCap, lineJoin=lineJoin, morph=morph, closePath=closePath, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return Q def draw_sector( page: pymupdf.Page, center: point_like, point: point_like, beta: float, color: OptSeq = (0,), fill: OptSeq = None, dashes: OptStr = None, fullSector: bool = True, morph: OptSeq = None, width: float = 1, closePath: bool = False, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> pymupdf.Point: """Draw a circle sector given circle center, one arc end point and the angle of the arc. Parameters: center -- center of circle point -- arc end point beta -- angle of arc (degrees) fullSector -- connect arc ends with center """ img = page.new_shape() Q = img.draw_sector(pymupdf.Point(center), pymupdf.Point(point), beta, fullSector=fullSector) img.finish( color=color, fill=fill, dashes=dashes, width=width, lineCap=lineCap, lineJoin=lineJoin, morph=morph, closePath=closePath, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return Q # ---------------------------------------------------------------------- # Name: wx.lib.colourdb.py # Purpose: Adds a bunch of colour names and RGB values to the # colour database so they can be found by name # # Author: Robin Dunn # # Created: 13-March-2001 # Copyright: (c) 2001-2017 by Total Control Software # Licence: wxWindows license # Tags: phoenix-port, unittest, documented # ---------------------------------------------------------------------- def getColorList() -> list: """ Returns a list of just the colour names used by this module. :rtype: list of strings """ return [x[0] for x in getColorInfoList()] def getColorInfoList() -> list: """ Returns the list of colour name/value tuples used by this module. :rtype: list of tuples """ return [ ("ALICEBLUE", 240, 248, 255), ("ANTIQUEWHITE", 250, 235, 215), ("ANTIQUEWHITE1", 255, 239, 219), ("ANTIQUEWHITE2", 238, 223, 204), ("ANTIQUEWHITE3", 205, 192, 176), ("ANTIQUEWHITE4", 139, 131, 120), ("AQUAMARINE", 127, 255, 212), ("AQUAMARINE1", 127, 255, 212), ("AQUAMARINE2", 118, 238, 198), ("AQUAMARINE3", 102, 205, 170), ("AQUAMARINE4", 69, 139, 116), ("AZURE", 240, 255, 255), ("AZURE1", 240, 255, 255), ("AZURE2", 224, 238, 238), ("AZURE3", 193, 205, 205), ("AZURE4", 131, 139, 139), ("BEIGE", 245, 245, 220), ("BISQUE", 255, 228, 196), ("BISQUE1", 255, 228, 196), ("BISQUE2", 238, 213, 183), ("BISQUE3", 205, 183, 158), ("BISQUE4", 139, 125, 107), ("BLACK", 0, 0, 0), ("BLANCHEDALMOND", 255, 235, 205), ("BLUE", 0, 0, 255), ("BLUE1", 0, 0, 255), ("BLUE2", 0, 0, 238), ("BLUE3", 0, 0, 205), ("BLUE4", 0, 0, 139), ("BLUEVIOLET", 138, 43, 226), ("BROWN", 165, 42, 42), ("BROWN1", 255, 64, 64), ("BROWN2", 238, 59, 59), ("BROWN3", 205, 51, 51), ("BROWN4", 139, 35, 35), ("BURLYWOOD", 222, 184, 135), ("BURLYWOOD1", 255, 211, 155), ("BURLYWOOD2", 238, 197, 145), ("BURLYWOOD3", 205, 170, 125), ("BURLYWOOD4", 139, 115, 85), ("CADETBLUE", 95, 158, 160), ("CADETBLUE1", 152, 245, 255), ("CADETBLUE2", 142, 229, 238), ("CADETBLUE3", 122, 197, 205), ("CADETBLUE4", 83, 134, 139), ("CHARTREUSE", 127, 255, 0), ("CHARTREUSE1", 127, 255, 0), ("CHARTREUSE2", 118, 238, 0), ("CHARTREUSE3", 102, 205, 0), ("CHARTREUSE4", 69, 139, 0), ("CHOCOLATE", 210, 105, 30), ("CHOCOLATE1", 255, 127, 36), ("CHOCOLATE2", 238, 118, 33), ("CHOCOLATE3", 205, 102, 29), ("CHOCOLATE4", 139, 69, 19), ("COFFEE", 156, 79, 0), ("CORAL", 255, 127, 80), ("CORAL1", 255, 114, 86), ("CORAL2", 238, 106, 80), ("CORAL3", 205, 91, 69), ("CORAL4", 139, 62, 47), ("CORNFLOWERBLUE", 100, 149, 237), ("CORNSILK", 255, 248, 220), ("CORNSILK1", 255, 248, 220), ("CORNSILK2", 238, 232, 205), ("CORNSILK3", 205, 200, 177), ("CORNSILK4", 139, 136, 120), ("CYAN", 0, 255, 255), ("CYAN1", 0, 255, 255), ("CYAN2", 0, 238, 238), ("CYAN3", 0, 205, 205), ("CYAN4", 0, 139, 139), ("DARKBLUE", 0, 0, 139), ("DARKCYAN", 0, 139, 139), ("DARKGOLDENROD", 184, 134, 11), ("DARKGOLDENROD1", 255, 185, 15), ("DARKGOLDENROD2", 238, 173, 14), ("DARKGOLDENROD3", 205, 149, 12), ("DARKGOLDENROD4", 139, 101, 8), ("DARKGREEN", 0, 100, 0), ("DARKGRAY", 169, 169, 169), ("DARKKHAKI", 189, 183, 107), ("DARKMAGENTA", 139, 0, 139), ("DARKOLIVEGREEN", 85, 107, 47), ("DARKOLIVEGREEN1", 202, 255, 112), ("DARKOLIVEGREEN2", 188, 238, 104), ("DARKOLIVEGREEN3", 162, 205, 90), ("DARKOLIVEGREEN4", 110, 139, 61), ("DARKORANGE", 255, 140, 0), ("DARKORANGE1", 255, 127, 0), ("DARKORANGE2", 238, 118, 0), ("DARKORANGE3", 205, 102, 0), ("DARKORANGE4", 139, 69, 0), ("DARKORCHID", 153, 50, 204), ("DARKORCHID1", 191, 62, 255), ("DARKORCHID2", 178, 58, 238), ("DARKORCHID3", 154, 50, 205), ("DARKORCHID4", 104, 34, 139), ("DARKRED", 139, 0, 0), ("DARKSALMON", 233, 150, 122), ("DARKSEAGREEN", 143, 188, 143), ("DARKSEAGREEN1", 193, 255, 193), ("DARKSEAGREEN2", 180, 238, 180), ("DARKSEAGREEN3", 155, 205, 155), ("DARKSEAGREEN4", 105, 139, 105), ("DARKSLATEBLUE", 72, 61, 139), ("DARKSLATEGRAY", 47, 79, 79), ("DARKTURQUOISE", 0, 206, 209), ("DARKVIOLET", 148, 0, 211), ("DEEPPINK", 255, 20, 147), ("DEEPPINK1", 255, 20, 147), ("DEEPPINK2", 238, 18, 137), ("DEEPPINK3", 205, 16, 118), ("DEEPPINK4", 139, 10, 80), ("DEEPSKYBLUE", 0, 191, 255), ("DEEPSKYBLUE1", 0, 191, 255), ("DEEPSKYBLUE2", 0, 178, 238), ("DEEPSKYBLUE3", 0, 154, 205), ("DEEPSKYBLUE4", 0, 104, 139), ("DIMGRAY", 105, 105, 105), ("DODGERBLUE", 30, 144, 255), ("DODGERBLUE1", 30, 144, 255), ("DODGERBLUE2", 28, 134, 238), ("DODGERBLUE3", 24, 116, 205), ("DODGERBLUE4", 16, 78, 139), ("FIREBRICK", 178, 34, 34), ("FIREBRICK1", 255, 48, 48), ("FIREBRICK2", 238, 44, 44), ("FIREBRICK3", 205, 38, 38), ("FIREBRICK4", 139, 26, 26), ("FLORALWHITE", 255, 250, 240), ("FORESTGREEN", 34, 139, 34), ("GAINSBORO", 220, 220, 220), ("GHOSTWHITE", 248, 248, 255), ("GOLD", 255, 215, 0), ("GOLD1", 255, 215, 0), ("GOLD2", 238, 201, 0), ("GOLD3", 205, 173, 0), ("GOLD4", 139, 117, 0), ("GOLDENROD", 218, 165, 32), ("GOLDENROD1", 255, 193, 37), ("GOLDENROD2", 238, 180, 34), ("GOLDENROD3", 205, 155, 29), ("GOLDENROD4", 139, 105, 20), ("GREEN YELLOW", 173, 255, 47), ("GREEN", 0, 255, 0), ("GREEN1", 0, 255, 0), ("GREEN2", 0, 238, 0), ("GREEN3", 0, 205, 0), ("GREEN4", 0, 139, 0), ("GREENYELLOW", 173, 255, 47), ("GRAY", 190, 190, 190), ("GRAY0", 0, 0, 0), ("GRAY1", 3, 3, 3), ("GRAY10", 26, 26, 26), ("GRAY100", 255, 255, 255), ("GRAY11", 28, 28, 28), ("GRAY12", 31, 31, 31), ("GRAY13", 33, 33, 33), ("GRAY14", 36, 36, 36), ("GRAY15", 38, 38, 38), ("GRAY16", 41, 41, 41), ("GRAY17", 43, 43, 43), ("GRAY18", 46, 46, 46), ("GRAY19", 48, 48, 48), ("GRAY2", 5, 5, 5), ("GRAY20", 51, 51, 51), ("GRAY21", 54, 54, 54), ("GRAY22", 56, 56, 56), ("GRAY23", 59, 59, 59), ("GRAY24", 61, 61, 61), ("GRAY25", 64, 64, 64), ("GRAY26", 66, 66, 66), ("GRAY27", 69, 69, 69), ("GRAY28", 71, 71, 71), ("GRAY29", 74, 74, 74), ("GRAY3", 8, 8, 8), ("GRAY30", 77, 77, 77), ("GRAY31", 79, 79, 79), ("GRAY32", 82, 82, 82), ("GRAY33", 84, 84, 84), ("GRAY34", 87, 87, 87), ("GRAY35", 89, 89, 89), ("GRAY36", 92, 92, 92), ("GRAY37", 94, 94, 94), ("GRAY38", 97, 97, 97), ("GRAY39", 99, 99, 99), ("GRAY4", 10, 10, 10), ("GRAY40", 102, 102, 102), ("GRAY41", 105, 105, 105), ("GRAY42", 107, 107, 107), ("GRAY43", 110, 110, 110), ("GRAY44", 112, 112, 112), ("GRAY45", 115, 115, 115), ("GRAY46", 117, 117, 117), ("GRAY47", 120, 120, 120), ("GRAY48", 122, 122, 122), ("GRAY49", 125, 125, 125), ("GRAY5", 13, 13, 13), ("GRAY50", 127, 127, 127), ("GRAY51", 130, 130, 130), ("GRAY52", 133, 133, 133), ("GRAY53", 135, 135, 135), ("GRAY54", 138, 138, 138), ("GRAY55", 140, 140, 140), ("GRAY56", 143, 143, 143), ("GRAY57", 145, 145, 145), ("GRAY58", 148, 148, 148), ("GRAY59", 150, 150, 150), ("GRAY6", 15, 15, 15), ("GRAY60", 153, 153, 153), ("GRAY61", 156, 156, 156), ("GRAY62", 158, 158, 158), ("GRAY63", 161, 161, 161), ("GRAY64", 163, 163, 163), ("GRAY65", 166, 166, 166), ("GRAY66", 168, 168, 168), ("GRAY67", 171, 171, 171), ("GRAY68", 173, 173, 173), ("GRAY69", 176, 176, 176), ("GRAY7", 18, 18, 18), ("GRAY70", 179, 179, 179), ("GRAY71", 181, 181, 181), ("GRAY72", 184, 184, 184), ("GRAY73", 186, 186, 186), ("GRAY74", 189, 189, 189), ("GRAY75", 191, 191, 191), ("GRAY76", 194, 194, 194), ("GRAY77", 196, 196, 196), ("GRAY78", 199, 199, 199), ("GRAY79", 201, 201, 201), ("GRAY8", 20, 20, 20), ("GRAY80", 204, 204, 204), ("GRAY81", 207, 207, 207), ("GRAY82", 209, 209, 209), ("GRAY83", 212, 212, 212), ("GRAY84", 214, 214, 214), ("GRAY85", 217, 217, 217), ("GRAY86", 219, 219, 219), ("GRAY87", 222, 222, 222), ("GRAY88", 224, 224, 224), ("GRAY89", 227, 227, 227), ("GRAY9", 23, 23, 23), ("GRAY90", 229, 229, 229), ("GRAY91", 232, 232, 232), ("GRAY92", 235, 235, 235), ("GRAY93", 237, 237, 237), ("GRAY94", 240, 240, 240), ("GRAY95", 242, 242, 242), ("GRAY96", 245, 245, 245), ("GRAY97", 247, 247, 247), ("GRAY98", 250, 250, 250), ("GRAY99", 252, 252, 252), ("HONEYDEW", 240, 255, 240), ("HONEYDEW1", 240, 255, 240), ("HONEYDEW2", 224, 238, 224), ("HONEYDEW3", 193, 205, 193), ("HONEYDEW4", 131, 139, 131), ("HOTPINK", 255, 105, 180), ("HOTPINK1", 255, 110, 180), ("HOTPINK2", 238, 106, 167), ("HOTPINK3", 205, 96, 144), ("HOTPINK4", 139, 58, 98), ("INDIANRED", 205, 92, 92), ("INDIANRED1", 255, 106, 106), ("INDIANRED2", 238, 99, 99), ("INDIANRED3", 205, 85, 85), ("INDIANRED4", 139, 58, 58), ("IVORY", 255, 255, 240), ("IVORY1", 255, 255, 240), ("IVORY2", 238, 238, 224), ("IVORY3", 205, 205, 193), ("IVORY4", 139, 139, 131), ("KHAKI", 240, 230, 140), ("KHAKI1", 255, 246, 143), ("KHAKI2", 238, 230, 133), ("KHAKI3", 205, 198, 115), ("KHAKI4", 139, 134, 78), ("LAVENDER", 230, 230, 250), ("LAVENDERBLUSH", 255, 240, 245), ("LAVENDERBLUSH1", 255, 240, 245), ("LAVENDERBLUSH2", 238, 224, 229), ("LAVENDERBLUSH3", 205, 193, 197), ("LAVENDERBLUSH4", 139, 131, 134), ("LAWNGREEN", 124, 252, 0), ("LEMONCHIFFON", 255, 250, 205), ("LEMONCHIFFON1", 255, 250, 205), ("LEMONCHIFFON2", 238, 233, 191), ("LEMONCHIFFON3", 205, 201, 165), ("LEMONCHIFFON4", 139, 137, 112), ("LIGHTBLUE", 173, 216, 230), ("LIGHTBLUE1", 191, 239, 255), ("LIGHTBLUE2", 178, 223, 238), ("LIGHTBLUE3", 154, 192, 205), ("LIGHTBLUE4", 104, 131, 139), ("LIGHTCORAL", 240, 128, 128), ("LIGHTCYAN", 224, 255, 255), ("LIGHTCYAN1", 224, 255, 255), ("LIGHTCYAN2", 209, 238, 238), ("LIGHTCYAN3", 180, 205, 205), ("LIGHTCYAN4", 122, 139, 139), ("LIGHTGOLDENROD", 238, 221, 130), ("LIGHTGOLDENROD1", 255, 236, 139), ("LIGHTGOLDENROD2", 238, 220, 130), ("LIGHTGOLDENROD3", 205, 190, 112), ("LIGHTGOLDENROD4", 139, 129, 76), ("LIGHTGOLDENRODYELLOW", 250, 250, 210), ("LIGHTGREEN", 144, 238, 144), ("LIGHTGRAY", 211, 211, 211), ("LIGHTPINK", 255, 182, 193), ("LIGHTPINK1", 255, 174, 185), ("LIGHTPINK2", 238, 162, 173), ("LIGHTPINK3", 205, 140, 149), ("LIGHTPINK4", 139, 95, 101), ("LIGHTSALMON", 255, 160, 122), ("LIGHTSALMON1", 255, 160, 122), ("LIGHTSALMON2", 238, 149, 114), ("LIGHTSALMON3", 205, 129, 98), ("LIGHTSALMON4", 139, 87, 66), ("LIGHTSEAGREEN", 32, 178, 170), ("LIGHTSKYBLUE", 135, 206, 250), ("LIGHTSKYBLUE1", 176, 226, 255), ("LIGHTSKYBLUE2", 164, 211, 238), ("LIGHTSKYBLUE3", 141, 182, 205), ("LIGHTSKYBLUE4", 96, 123, 139), ("LIGHTSLATEBLUE", 132, 112, 255), ("LIGHTSLATEGRAY", 119, 136, 153), ("LIGHTSTEELBLUE", 176, 196, 222), ("LIGHTSTEELBLUE1", 202, 225, 255), ("LIGHTSTEELBLUE2", 188, 210, 238), ("LIGHTSTEELBLUE3", 162, 181, 205), ("LIGHTSTEELBLUE4", 110, 123, 139), ("LIGHTYELLOW", 255, 255, 224), ("LIGHTYELLOW1", 255, 255, 224), ("LIGHTYELLOW2", 238, 238, 209), ("LIGHTYELLOW3", 205, 205, 180), ("LIGHTYELLOW4", 139, 139, 122), ("LIMEGREEN", 50, 205, 50), ("LINEN", 250, 240, 230), ("MAGENTA", 255, 0, 255), ("MAGENTA1", 255, 0, 255), ("MAGENTA2", 238, 0, 238), ("MAGENTA3", 205, 0, 205), ("MAGENTA4", 139, 0, 139), ("MAROON", 176, 48, 96), ("MAROON1", 255, 52, 179), ("MAROON2", 238, 48, 167), ("MAROON3", 205, 41, 144), ("MAROON4", 139, 28, 98), ("MEDIUMAQUAMARINE", 102, 205, 170), ("MEDIUMBLUE", 0, 0, 205), ("MEDIUMORCHID", 186, 85, 211), ("MEDIUMORCHID1", 224, 102, 255), ("MEDIUMORCHID2", 209, 95, 238), ("MEDIUMORCHID3", 180, 82, 205), ("MEDIUMORCHID4", 122, 55, 139), ("MEDIUMPURPLE", 147, 112, 219), ("MEDIUMPURPLE1", 171, 130, 255), ("MEDIUMPURPLE2", 159, 121, 238), ("MEDIUMPURPLE3", 137, 104, 205), ("MEDIUMPURPLE4", 93, 71, 139), ("MEDIUMSEAGREEN", 60, 179, 113), ("MEDIUMSLATEBLUE", 123, 104, 238), ("MEDIUMSPRINGGREEN", 0, 250, 154), ("MEDIUMTURQUOISE", 72, 209, 204), ("MEDIUMVIOLETRED", 199, 21, 133), ("MIDNIGHTBLUE", 25, 25, 112), ("MINTCREAM", 245, 255, 250), ("MISTYROSE", 255, 228, 225), ("MISTYROSE1", 255, 228, 225), ("MISTYROSE2", 238, 213, 210), ("MISTYROSE3", 205, 183, 181), ("MISTYROSE4", 139, 125, 123), ("MOCCASIN", 255, 228, 181), ("MUPDFBLUE", 37, 114, 172), ("NAVAJOWHITE", 255, 222, 173), ("NAVAJOWHITE1", 255, 222, 173), ("NAVAJOWHITE2", 238, 207, 161), ("NAVAJOWHITE3", 205, 179, 139), ("NAVAJOWHITE4", 139, 121, 94), ("NAVY", 0, 0, 128), ("NAVYBLUE", 0, 0, 128), ("OLDLACE", 253, 245, 230), ("OLIVEDRAB", 107, 142, 35), ("OLIVEDRAB1", 192, 255, 62), ("OLIVEDRAB2", 179, 238, 58), ("OLIVEDRAB3", 154, 205, 50), ("OLIVEDRAB4", 105, 139, 34), ("ORANGE", 255, 165, 0), ("ORANGE1", 255, 165, 0), ("ORANGE2", 238, 154, 0), ("ORANGE3", 205, 133, 0), ("ORANGE4", 139, 90, 0), ("ORANGERED", 255, 69, 0), ("ORANGERED1", 255, 69, 0), ("ORANGERED2", 238, 64, 0), ("ORANGERED3", 205, 55, 0), ("ORANGERED4", 139, 37, 0), ("ORCHID", 218, 112, 214), ("ORCHID1", 255, 131, 250), ("ORCHID2", 238, 122, 233), ("ORCHID3", 205, 105, 201), ("ORCHID4", 139, 71, 137), ("PALEGOLDENROD", 238, 232, 170), ("PALEGREEN", 152, 251, 152), ("PALEGREEN1", 154, 255, 154), ("PALEGREEN2", 144, 238, 144), ("PALEGREEN3", 124, 205, 124), ("PALEGREEN4", 84, 139, 84), ("PALETURQUOISE", 175, 238, 238), ("PALETURQUOISE1", 187, 255, 255), ("PALETURQUOISE2", 174, 238, 238), ("PALETURQUOISE3", 150, 205, 205), ("PALETURQUOISE4", 102, 139, 139), ("PALEVIOLETRED", 219, 112, 147), ("PALEVIOLETRED1", 255, 130, 171), ("PALEVIOLETRED2", 238, 121, 159), ("PALEVIOLETRED3", 205, 104, 137), ("PALEVIOLETRED4", 139, 71, 93), ("PAPAYAWHIP", 255, 239, 213), ("PEACHPUFF", 255, 218, 185), ("PEACHPUFF1", 255, 218, 185), ("PEACHPUFF2", 238, 203, 173), ("PEACHPUFF3", 205, 175, 149), ("PEACHPUFF4", 139, 119, 101), ("PERU", 205, 133, 63), ("PINK", 255, 192, 203), ("PINK1", 255, 181, 197), ("PINK2", 238, 169, 184), ("PINK3", 205, 145, 158), ("PINK4", 139, 99, 108), ("PLUM", 221, 160, 221), ("PLUM1", 255, 187, 255), ("PLUM2", 238, 174, 238), ("PLUM3", 205, 150, 205), ("PLUM4", 139, 102, 139), ("POWDERBLUE", 176, 224, 230), ("PURPLE", 160, 32, 240), ("PURPLE1", 155, 48, 255), ("PURPLE2", 145, 44, 238), ("PURPLE3", 125, 38, 205), ("PURPLE4", 85, 26, 139), ("PY_COLOR", 240, 255, 210), ("RED", 255, 0, 0), ("RED1", 255, 0, 0), ("RED2", 238, 0, 0), ("RED3", 205, 0, 0), ("RED4", 139, 0, 0), ("ROSYBROWN", 188, 143, 143), ("ROSYBROWN1", 255, 193, 193), ("ROSYBROWN2", 238, 180, 180), ("ROSYBROWN3", 205, 155, 155), ("ROSYBROWN4", 139, 105, 105), ("ROYALBLUE", 65, 105, 225), ("ROYALBLUE1", 72, 118, 255), ("ROYALBLUE2", 67, 110, 238), ("ROYALBLUE3", 58, 95, 205), ("ROYALBLUE4", 39, 64, 139), ("SADDLEBROWN", 139, 69, 19), ("SALMON", 250, 128, 114), ("SALMON1", 255, 140, 105), ("SALMON2", 238, 130, 98), ("SALMON3", 205, 112, 84), ("SALMON4", 139, 76, 57), ("SANDYBROWN", 244, 164, 96), ("SEAGREEN", 46, 139, 87), ("SEAGREEN1", 84, 255, 159), ("SEAGREEN2", 78, 238, 148), ("SEAGREEN3", 67, 205, 128), ("SEAGREEN4", 46, 139, 87), ("SEASHELL", 255, 245, 238), ("SEASHELL1", 255, 245, 238), ("SEASHELL2", 238, 229, 222), ("SEASHELL3", 205, 197, 191), ("SEASHELL4", 139, 134, 130), ("SIENNA", 160, 82, 45), ("SIENNA1", 255, 130, 71), ("SIENNA2", 238, 121, 66), ("SIENNA3", 205, 104, 57), ("SIENNA4", 139, 71, 38), ("SKYBLUE", 135, 206, 235), ("SKYBLUE1", 135, 206, 255), ("SKYBLUE2", 126, 192, 238), ("SKYBLUE3", 108, 166, 205), ("SKYBLUE4", 74, 112, 139), ("SLATEBLUE", 106, 90, 205), ("SLATEBLUE1", 131, 111, 255), ("SLATEBLUE2", 122, 103, 238), ("SLATEBLUE3", 105, 89, 205), ("SLATEBLUE4", 71, 60, 139), ("SLATEGRAY", 112, 128, 144), ("SNOW", 255, 250, 250), ("SNOW1", 255, 250, 250), ("SNOW2", 238, 233, 233), ("SNOW3", 205, 201, 201), ("SNOW4", 139, 137, 137), ("SPRINGGREEN", 0, 255, 127), ("SPRINGGREEN1", 0, 255, 127), ("SPRINGGREEN2", 0, 238, 118), ("SPRINGGREEN3", 0, 205, 102), ("SPRINGGREEN4", 0, 139, 69), ("STEELBLUE", 70, 130, 180), ("STEELBLUE1", 99, 184, 255), ("STEELBLUE2", 92, 172, 238), ("STEELBLUE3", 79, 148, 205), ("STEELBLUE4", 54, 100, 139), ("TAN", 210, 180, 140), ("TAN1", 255, 165, 79), ("TAN2", 238, 154, 73), ("TAN3", 205, 133, 63), ("TAN4", 139, 90, 43), ("THISTLE", 216, 191, 216), ("THISTLE1", 255, 225, 255), ("THISTLE2", 238, 210, 238), ("THISTLE3", 205, 181, 205), ("THISTLE4", 139, 123, 139), ("TOMATO", 255, 99, 71), ("TOMATO1", 255, 99, 71), ("TOMATO2", 238, 92, 66), ("TOMATO3", 205, 79, 57), ("TOMATO4", 139, 54, 38), ("TURQUOISE", 64, 224, 208), ("TURQUOISE1", 0, 245, 255), ("TURQUOISE2", 0, 229, 238), ("TURQUOISE3", 0, 197, 205), ("TURQUOISE4", 0, 134, 139), ("VIOLET", 238, 130, 238), ("VIOLETRED", 208, 32, 144), ("VIOLETRED1", 255, 62, 150), ("VIOLETRED2", 238, 58, 140), ("VIOLETRED3", 205, 50, 120), ("VIOLETRED4", 139, 34, 82), ("WHEAT", 245, 222, 179), ("WHEAT1", 255, 231, 186), ("WHEAT2", 238, 216, 174), ("WHEAT3", 205, 186, 150), ("WHEAT4", 139, 126, 102), ("WHITE", 255, 255, 255), ("WHITESMOKE", 245, 245, 245), ("YELLOW", 255, 255, 0), ("YELLOW1", 255, 255, 0), ("YELLOW2", 238, 238, 0), ("YELLOW3", 205, 205, 0), ("YELLOW4", 139, 139, 0), ("YELLOWGREEN", 154, 205, 50), ] def getColorInfoDict() -> dict: d = {} for item in getColorInfoList(): d[item[0].lower()] = item[1:] return d def getColor(name: str) -> tuple: """Retrieve RGB color in PDF format by name. Returns: a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned. """ try: c = getColorInfoList()[getColorList().index(name.upper())] return (c[1] / 255.0, c[2] / 255.0, c[3] / 255.0) except Exception: pymupdf.exception_info() return (1, 1, 1) def getColorHSV(name: str) -> tuple: """Retrieve the hue, saturation, value triple of a color name. Returns: a triple (degree, percent, percent). If not found (-1, -1, -1) is returned. """ try: x = getColorInfoList()[getColorList().index(name.upper())] except Exception: if g_exceptions_verbose: pymupdf.exception_info() return (-1, -1, -1) r = x[1] / 255.0 g = x[2] / 255.0 b = x[3] / 255.0 cmax = max(r, g, b) V = round(cmax * 100, 1) cmin = min(r, g, b) delta = cmax - cmin if delta == 0: hue = 0 elif cmax == r: hue = 60.0 * (((g - b) / delta) % 6) elif cmax == g: hue = 60.0 * (((b - r) / delta) + 2) else: hue = 60.0 * (((r - g) / delta) + 4) H = int(round(hue)) if cmax == 0: sat = 0 else: sat = delta / cmax S = int(round(sat * 100)) return (H, S, V) def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple: fontname, ext, stype, buffer = doc.extract_font(xref) asc = 0.8 dsc = -0.2 if ext == "": return fontname, ext, stype, asc, dsc if buffer: try: font = pymupdf.Font(fontbuffer=buffer) asc = font.ascender dsc = font.descender bbox = font.bbox if asc - dsc < 1: if bbox.y0 < dsc: dsc = bbox.y0 asc = 1 - dsc except Exception: pymupdf.exception_info() asc *= 1.2 dsc *= 1.2 return fontname, ext, stype, asc, dsc if ext != "n/a": try: font = pymupdf.Font(fontname) asc = font.ascender dsc = font.descender except Exception: pymupdf.exception_info() asc *= 1.2 dsc *= 1.2 else: asc *= 1.2 dsc *= 1.2 return fontname, ext, stype, asc, dsc def get_char_widths( doc: pymupdf.Document, xref: int, limit: int = 256, idx: int = 0, fontdict: OptDict = None ) -> list: """Get list of glyph information of a font. Notes: Must be provided by its XREF number. If we already dealt with the font, it will be recorded in doc.FontInfos. Otherwise we insert an entry there. Finally we return the glyphs for the font. This is a list of (glyph, width) where glyph is an integer controlling the char appearance, and width is a float controlling the char's spacing: width * fontsize is the actual space. For 'simple' fonts, glyph == ord(char) will usually be true. Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data for these directly here. """ fontinfo = pymupdf.CheckFontInfo(doc, xref) if fontinfo is None: # not recorded yet: create it if fontdict is None: name, ext, stype, asc, dsc = _get_font_properties(doc, xref) fontdict = { "name": name, "type": stype, "ext": ext, "ascender": asc, "descender": dsc, } else: name = fontdict["name"] ext = fontdict["ext"] stype = fontdict["type"] ordering = fontdict["ordering"] simple = fontdict["simple"] if ext == "": raise ValueError("xref is not a font") # check for 'simple' fonts if stype in ("Type1", "MMType1", "TrueType"): simple = True else: simple = False # check for CJK fonts if name in ("Fangti", "Ming"): ordering = 0 elif name in ("Heiti", "Song"): ordering = 1 elif name in ("Gothic", "Mincho"): ordering = 2 elif name in ("Dotum", "Batang"): ordering = 3 else: ordering = -1 fontdict["simple"] = simple if name == "ZapfDingbats": glyphs = pymupdf.zapf_glyphs elif name == "Symbol": glyphs = pymupdf.symbol_glyphs else: glyphs = None fontdict["glyphs"] = glyphs fontdict["ordering"] = ordering fontinfo = [xref, fontdict] doc.FontInfos.append(fontinfo) else: fontdict = fontinfo[1] glyphs = fontdict["glyphs"] simple = fontdict["simple"] ordering = fontdict["ordering"] if glyphs is None: oldlimit = 0 else: oldlimit = len(glyphs) mylimit = max(256, limit) if mylimit <= oldlimit: return glyphs if ordering < 0: # not a CJK font glyphs = doc._get_char_widths( xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx ) else: # CJK fonts use char codes and width = 1 glyphs = None fontdict["glyphs"] = glyphs fontinfo[1] = fontdict pymupdf.UpdateFontInfo(doc, fontinfo) return glyphs class Shape: """Create a new shape.""" @staticmethod def horizontal_angle(C, P): """Return the angle to the horizontal for the connection from C to P. This uses the arcus sine function and resolves its inherent ambiguity by looking up in which quadrant vector S = P - C is located. """ S = pymupdf.Point(P - C).unit # unit vector 'C' -> 'P' alfa = math.asin(abs(S.y)) # absolute angle from horizontal if S.x < 0: # make arcsin result unique if S.y <= 0: # bottom-left alfa = -(math.pi - alfa) else: # top-left alfa = math.pi - alfa else: if S.y >= 0: # top-right pass else: # bottom-right alfa = -alfa return alfa def __init__(self, page: pymupdf.Page): pymupdf.CheckParent(page) self.page = page self.doc = page.parent if not self.doc.is_pdf: raise ValueError("is no PDF") self.height = page.mediabox_size.y self.width = page.mediabox_size.x self.x = page.cropbox_position.x self.y = page.cropbox_position.y self.pctm = page.transformation_matrix # page transf. matrix self.ipctm = ~self.pctm # inverted transf. matrix self.draw_cont = "" self.text_cont = "" self.totalcont = "" self.last_point = None self.rect = None def updateRect(self, x): if self.rect is None: if len(x) == 2: self.rect = pymupdf.Rect(x, x) else: self.rect = pymupdf.Rect(x) else: if len(x) == 2: x = pymupdf.Point(x) self.rect.x0 = min(self.rect.x0, x.x) self.rect.y0 = min(self.rect.y0, x.y) self.rect.x1 = max(self.rect.x1, x.x) self.rect.y1 = max(self.rect.y1, x.y) else: x = pymupdf.Rect(x) self.rect.x0 = min(self.rect.x0, x.x0) self.rect.y0 = min(self.rect.y0, x.y0) self.rect.x1 = max(self.rect.x1, x.x1) self.rect.y1 = max(self.rect.y1, x.y1) def draw_line(self, p1: point_like, p2: point_like) -> pymupdf.Point: """Draw a line between two points.""" p1 = pymupdf.Point(p1) p2 = pymupdf.Point(p2) if not (self.last_point == p1): self.draw_cont += _format_g(pymupdf.JM_TUPLE(p1 * self.ipctm)) + " m\n" self.last_point = p1 self.updateRect(p1) self.draw_cont += _format_g(pymupdf.JM_TUPLE(p2 * self.ipctm)) + " l\n" self.updateRect(p2) self.last_point = p2 return self.last_point def draw_polyline(self, points: list) -> pymupdf.Point: """Draw several connected line segments.""" for i, p in enumerate(points): if i == 0: if not (self.last_point == pymupdf.Point(p)): self.draw_cont += _format_g(pymupdf.JM_TUPLE(pymupdf.Point(p) * self.ipctm)) + " m\n" self.last_point = pymupdf.Point(p) else: self.draw_cont += _format_g(pymupdf.JM_TUPLE(pymupdf.Point(p) * self.ipctm)) + " l\n" self.updateRect(p) self.last_point = pymupdf.Point(points[-1]) return self.last_point def draw_bezier( self, p1: point_like, p2: point_like, p3: point_like, p4: point_like, ) -> pymupdf.Point: """Draw a standard cubic Bezier curve.""" p1 = pymupdf.Point(p1) p2 = pymupdf.Point(p2) p3 = pymupdf.Point(p3) p4 = pymupdf.Point(p4) if not (self.last_point == p1): self.draw_cont += _format_g(pymupdf.JM_TUPLE(p1 * self.ipctm)) + " m\n" args = pymupdf.JM_TUPLE(list(p2 * self.ipctm) + list(p3 * self.ipctm) + list(p4 * self.ipctm)) self.draw_cont += _format_g(args) + " c\n" self.updateRect(p1) self.updateRect(p2) self.updateRect(p3) self.updateRect(p4) self.last_point = p4 return self.last_point def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> pymupdf.Point: """Draw an ellipse inside a tetrapod.""" if len(tetra) != 4: raise ValueError("invalid arg length") if hasattr(tetra[0], "__float__"): q = pymupdf.Rect(tetra).quad else: q = pymupdf.Quad(tetra) mt = q.ul + (q.ur - q.ul) * 0.5 mr = q.ur + (q.lr - q.ur) * 0.5 mb = q.ll + (q.lr - q.ll) * 0.5 ml = q.ul + (q.ll - q.ul) * 0.5 if not (self.last_point == ml): self.draw_cont += _format_g(pymupdf.JM_TUPLE(ml * self.ipctm)) + " m\n" self.last_point = ml self.draw_curve(ml, q.ll, mb) self.draw_curve(mb, q.lr, mr) self.draw_curve(mr, q.ur, mt) self.draw_curve(mt, q.ul, ml) self.updateRect(q.rect) self.last_point = ml return self.last_point def draw_circle(self, center: point_like, radius: float) -> pymupdf.Point: """Draw a circle given its center and radius.""" if not radius > pymupdf.EPSILON: raise ValueError("radius must be positive") center = pymupdf.Point(center) p1 = center - (radius, 0) return self.draw_sector(center, p1, 360, fullSector=False) def draw_curve( self, p1: point_like, p2: point_like, p3: point_like, ) -> pymupdf.Point: """Draw a curve between points using one control point.""" kappa = 0.55228474983 p1 = pymupdf.Point(p1) p2 = pymupdf.Point(p2) p3 = pymupdf.Point(p3) k1 = p1 + (p2 - p1) * kappa k2 = p3 + (p2 - p3) * kappa return self.draw_bezier(p1, k1, k2, p3) def draw_sector( self, center: point_like, point: point_like, beta: float, fullSector: bool = True, ) -> pymupdf.Point: """Draw a circle sector.""" center = pymupdf.Point(center) point = pymupdf.Point(point) l3 = lambda a, b: _format_g((a, b)) + " m\n" l4 = lambda a, b, c, d, e, f: _format_g((a, b, c, d, e, f)) + " c\n" l5 = lambda a, b: _format_g((a, b)) + " l\n" betar = math.radians(-beta) w360 = math.radians(math.copysign(360, betar)) * (-1) w90 = math.radians(math.copysign(90, betar)) w45 = w90 / 2 while abs(betar) > 2 * math.pi: betar += w360 # bring angle below 360 degrees if not (self.last_point == point): self.draw_cont += l3(*pymupdf.JM_TUPLE(point * self.ipctm)) self.last_point = point Q = pymupdf.Point(0, 0) # just make sure it exists C = center P = point S = P - C # vector 'center' -> 'point' rad = abs(S) # circle radius if not rad > pymupdf.EPSILON: raise ValueError("radius must be positive") alfa = self.horizontal_angle(center, point) while abs(betar) > abs(w90): # draw 90 degree arcs q1 = C.x + math.cos(alfa + w90) * rad q2 = C.y + math.sin(alfa + w90) * rad Q = pymupdf.Point(q1, q2) # the arc's end point r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45) r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45) R = pymupdf.Point(r1, r2) # crossing point of tangents kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q) kappa = kappah * abs(P - Q) cp1 = P + (R - P) * kappa # control point 1 cp2 = Q + (R - Q) * kappa # control point 2 self.draw_cont += l4(*pymupdf.JM_TUPLE( list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm) )) betar -= w90 # reduce parm angle by 90 deg alfa += w90 # advance start angle by 90 deg P = Q # advance to arc end point # draw (remaining) arc if abs(betar) > 1e-3: # significant degrees left? beta2 = betar / 2 q1 = C.x + math.cos(alfa + betar) * rad q2 = C.y + math.sin(alfa + betar) * rad Q = pymupdf.Point(q1, q2) # the arc's end point r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2) r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2) R = pymupdf.Point(r1, r2) # crossing point of tangents # kappa height is 4/3 of segment height kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q) # kappa height kappa = kappah * abs(P - Q) / (1 - math.cos(betar)) cp1 = P + (R - P) * kappa # control point 1 cp2 = Q + (R - Q) * kappa # control point 2 self.draw_cont += l4(*pymupdf.JM_TUPLE( list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm) )) if fullSector: self.draw_cont += l3(*pymupdf.JM_TUPLE(point * self.ipctm)) self.draw_cont += l5(*pymupdf.JM_TUPLE(center * self.ipctm)) self.draw_cont += l5(*pymupdf.JM_TUPLE(Q * self.ipctm)) self.last_point = Q return self.last_point def draw_rect(self, rect: rect_like, *, radius=None) -> pymupdf.Point: """Draw a rectangle. Args: radius: if not None, the rectangle will have rounded corners. This is the radius of the curvature, given as percentage of the rectangle width or height. Valid are values 0 < v <= 0.5. For a sequence of two values, the corners will have different radii. Otherwise, the percentage will be computed from the shorter side. A value of (0.5, 0.5) will draw an ellipse. """ r = pymupdf.Rect(rect) if radius is None: # standard rectangle self.draw_cont += _format_g(pymupdf.JM_TUPLE( list(r.bl * self.ipctm) + [r.width, r.height] )) + " re\n" self.updateRect(r) self.last_point = r.tl return self.last_point # rounded corners requested. This requires 1 or 2 values, each # with 0 < value <= 0.5 if hasattr(radius, "__float__"): if radius <= 0 or radius > 0.5: raise ValueError(f"bad radius value {radius}.") d = min(r.width, r.height) * radius px = (d, 0) py = (0, d) elif hasattr(radius, "__len__") and len(radius) == 2: rx, ry = radius px = (rx * r.width, 0) py = (0, ry * r.height) if min(rx, ry) <= 0 or max(rx, ry) > 0.5: raise ValueError(f"bad radius value {radius}.") else: raise ValueError(f"bad radius value {radius}.") lp = self.draw_line(r.tl + py, r.bl - py) lp = self.draw_curve(lp, r.bl, r.bl + px) lp = self.draw_line(lp, r.br - px) lp = self.draw_curve(lp, r.br, r.br - py) lp = self.draw_line(lp, r.tr + py) lp = self.draw_curve(lp, r.tr, r.tr - px) lp = self.draw_line(lp, r.tl + px) self.last_point = self.draw_curve(lp, r.tl, r.tl + py) self.updateRect(r) return self.last_point def draw_quad(self, quad: quad_like) -> pymupdf.Point: """Draw a Quad.""" q = pymupdf.Quad(quad) return self.draw_polyline([q.ul, q.ll, q.lr, q.ur, q.ul]) def draw_zigzag( self, p1: point_like, p2: point_like, breadth: float = 2, ) -> pymupdf.Point: """Draw a zig-zagged line from p1 to p2.""" p1 = pymupdf.Point(p1) p2 = pymupdf.Point(p2) S = p2 - p1 # vector start - end rad = abs(S) # distance of points cnt = 4 * int(round(rad / (4 * breadth), 0)) # always take full phases if cnt < 4: raise ValueError("points too close") mb = rad / cnt # revised breadth matrix = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2)) # normalize line to x-axis i_mat = ~matrix # get original position points = [] # stores edges for i in range(1, cnt): if i % 4 == 1: # point "above" connection p = pymupdf.Point(i, -1) * mb elif i % 4 == 3: # point "below" connection p = pymupdf.Point(i, 1) * mb else: # ignore others continue points.append(p * i_mat) self.draw_polyline([p1] + points + [p2]) # add start and end points return p2 def draw_squiggle( self, p1: point_like, p2: point_like, breadth=2, ) -> pymupdf.Point: """Draw a squiggly line from p1 to p2.""" p1 = pymupdf.Point(p1) p2 = pymupdf.Point(p2) S = p2 - p1 # vector start - end rad = abs(S) # distance of points cnt = 4 * int(round(rad / (4 * breadth), 0)) # always take full phases if cnt < 4: raise ValueError("points too close") mb = rad / cnt # revised breadth matrix = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2)) # normalize line to x-axis i_mat = ~matrix # get original position k = 2.4142135623765633 # y of draw_curve helper point points = [] # stores edges for i in range(1, cnt): if i % 4 == 1: # point "above" connection p = pymupdf.Point(i, -k) * mb elif i % 4 == 3: # point "below" connection p = pymupdf.Point(i, k) * mb else: # else on connection line p = pymupdf.Point(i, 0) * mb points.append(p * i_mat) points = [p1] + points + [p2] cnt = len(points) i = 0 while i + 2 < cnt: self.draw_curve(points[i], points[i + 1], points[i + 2]) i += 2 return p2 # ============================================================================== # Shape.insert_text # ============================================================================== def insert_text( self, point: point_like, buffer: typing.Union[str, list], fontsize: float = 11, lineheight: OptFloat = None, fontname: str = "helv", fontfile: OptStr = None, set_simple: bool = 0, encoding: int = 0, color: OptSeq = None, fill: OptSeq = None, render_mode: int = 0, border_width: float = 0.05, rotate: int = 0, morph: OptSeq = None, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> int: # ensure 'text' is a list of strings, worth dealing with if not bool(buffer): return 0 if type(buffer) not in (list, tuple): text = buffer.splitlines() else: text = buffer if not len(text) > 0: return 0 point = pymupdf.Point(point) try: maxcode = max([ord(c) for c in " ".join(text)]) except Exception: pymupdf.exception_info() return 0 # ensure valid 'fontname' fname = fontname if fname.startswith("/"): fname = fname[1:] xref = self.page.insert_font( fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple ) fontinfo = pymupdf.CheckFontInfo(self.doc, xref) fontdict = fontinfo[1] ordering = fontdict["ordering"] simple = fontdict["simple"] bfname = fontdict["name"] ascender = fontdict["ascender"] descender = fontdict["descender"] if lineheight: lheight = fontsize * lineheight elif ascender - descender <= 1: lheight = fontsize * 1.2 else: lheight = fontsize * (ascender - descender) if maxcode > 255: glyphs = self.doc.get_char_widths(xref, maxcode + 1) else: glyphs = fontdict["glyphs"] tab = [] for t in text: if simple and bfname not in ("Symbol", "ZapfDingbats"): g = None else: g = glyphs tab.append(pymupdf.getTJstr(t, g, simple, ordering)) text = tab color_str = pymupdf.ColorCode(color, "c") fill_str = pymupdf.ColorCode(fill, "f") if not fill and render_mode == 0: # ensure fill color when 0 Tr fill = color fill_str = pymupdf.ColorCode(color, "f") morphing = pymupdf.CheckMorph(morph) rot = rotate if rot % 90 != 0: raise ValueError("bad rotate value") while rot < 0: rot += 360 rot = rot % 360 # text rotate = 0, 90, 270, 180 templ1 = lambda a, b, c, d, e, f, g: f"\nq\n{a}{b}BT\n{c}1 0 0 1 {_format_g((d, e))} Tm\n/{f} {_format_g(g)} Tf " templ2 = lambda a: f"TJ\n0 -{_format_g(a)} TD\n" cmp90 = "0 1 -1 0 0 0 cm\n" # rotates 90 deg counter-clockwise cmm90 = "0 -1 1 0 0 0 cm\n" # rotates 90 deg clockwise cm180 = "-1 0 0 -1 0 0 cm\n" # rotates by 180 deg. height = self.height width = self.width # setting up for standard rotation directions # case rotate = 0 if morphing: m1 = pymupdf.Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y) mat = ~m1 * morph[1] * m1 cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" else: cm = "" top = height - point.y - self.y # start of 1st char left = point.x + self.x # start of 1. char space = top # space available #headroom = point.y + self.y # distance to page border if rot == 90: left = height - point.y - self.y top = -point.x - self.x cm += cmp90 space = width - abs(top) #headroom = point.x + self.x elif rot == 270: left = -height + point.y + self.y top = point.x + self.x cm += cmm90 space = abs(top) #headroom = width - point.x - self.x elif rot == 180: left = -point.x - self.x top = -height + point.y + self.y cm += cm180 space = abs(point.y + self.y) #headroom = height - point.y - self.y optcont = self.page._get_optional_content(oc) if optcont is not None: bdc = "/OC /%s BDC\n" % optcont emc = "EMC\n" else: bdc = emc = "" alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity) if alpha is None: alpha = "" else: alpha = "/%s gs\n" % alpha nres = templ1(bdc, alpha, cm, left, top, fname, fontsize) if render_mode > 0: nres += "%i Tr " % render_mode nres += _format_g(border_width * fontsize) + " w " if color is not None: nres += color_str if fill is not None: nres += fill_str # ========================================================================= # start text insertion # ========================================================================= nres += text[0] nlines = 1 # set output line counter if len(text) > 1: nres += templ2(lheight) # line 1 else: nres += 'TJ' for i in range(1, len(text)): if space < lheight: break # no space left on page if i > 1: nres += "\nT* " nres += text[i] + 'TJ' space -= lheight nlines += 1 nres += "\nET\n%sQ\n" % emc # ========================================================================= # end of text insertion # ========================================================================= # update the /Contents object self.text_cont += nres return nlines # ============================================================================== # Shape.insert_textbox # ============================================================================== def insert_textbox( self, rect: rect_like, buffer: typing.Union[str, list], fontname: OptStr = "helv", fontfile: OptStr = None, fontsize: float = 11, lineheight: OptFloat = None, set_simple: bool = 0, encoding: int = 0, color: OptSeq = None, fill: OptSeq = None, expandtabs: int = 1, border_width: float = 0.05, align: int = 0, render_mode: int = 0, rotate: int = 0, morph: OptSeq = None, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> float: """Insert text into a given rectangle. Args: rect -- the textbox to fill buffer -- text to be inserted fontname -- a Base-14 font, font name or '/name' fontfile -- name of a font file fontsize -- font size lineheight -- overwrite the font property color -- RGB stroke color triple fill -- RGB fill color triple render_mode -- text rendering control border_width -- thickness of glyph borders as percentage of fontsize expandtabs -- handles tabulators with string function align -- left, center, right, justified rotate -- 0, 90, 180, or 270 degrees morph -- morph box with a matrix and a fixpoint Returns: unused or deficit rectangle area (float) """ rect = pymupdf.Rect(rect) if rect.is_empty or rect.is_infinite: raise ValueError("text box must be finite and not empty") color_str = pymupdf.ColorCode(color, "c") fill_str = pymupdf.ColorCode(fill, "f") if fill is None and render_mode == 0: # ensure fill color for 0 Tr fill = color fill_str = pymupdf.ColorCode(color, "f") optcont = self.page._get_optional_content(oc) if optcont is not None: bdc = "/OC /%s BDC\n" % optcont emc = "EMC\n" else: bdc = emc = "" # determine opacity / transparency alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity) if alpha is None: alpha = "" else: alpha = "/%s gs\n" % alpha if rotate % 90 != 0: raise ValueError("rotate must be multiple of 90") rot = rotate while rot < 0: rot += 360 rot = rot % 360 # is buffer worth of dealing with? if not bool(buffer): return rect.height if rot in (0, 180) else rect.width cmp90 = "0 1 -1 0 0 0 cm\n" # rotates counter-clockwise cmm90 = "0 -1 1 0 0 0 cm\n" # rotates clockwise cm180 = "-1 0 0 -1 0 0 cm\n" # rotates by 180 deg. height = self.height fname = fontname if fname.startswith("/"): fname = fname[1:] xref = self.page.insert_font( fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple ) fontinfo = pymupdf.CheckFontInfo(self.doc, xref) fontdict = fontinfo[1] ordering = fontdict["ordering"] simple = fontdict["simple"] glyphs = fontdict["glyphs"] bfname = fontdict["name"] ascender = fontdict["ascender"] descender = fontdict["descender"] if lineheight: lheight_factor = lineheight elif ascender - descender <= 1: lheight_factor = 1.2 else: lheight_factor = ascender - descender lheight = fontsize * lheight_factor # create a list from buffer, split into its lines if type(buffer) in (list, tuple): t0 = "\n".join(buffer) else: t0 = buffer maxcode = max([ord(c) for c in t0]) # replace invalid char codes for simple fonts if simple and maxcode > 255: t0 = "".join([c if ord(c) < 256 else "?" for c in t0]) t0 = t0.splitlines() glyphs = self.doc.get_char_widths(xref, maxcode + 1) if simple and bfname not in ("Symbol", "ZapfDingbats"): tj_glyphs = None else: tj_glyphs = glyphs # ---------------------------------------------------------------------- # calculate pixel length of a string # ---------------------------------------------------------------------- def pixlen(x): """Calculate pixel length of x.""" if ordering < 0: return sum([glyphs[ord(c)][1] for c in x]) * fontsize else: return len(x) * fontsize # --------------------------------------------------------------------- if ordering < 0: blen = glyphs[32][1] * fontsize # pixel size of space character else: blen = fontsize text = "" # output buffer if pymupdf.CheckMorph(morph): m1 = pymupdf.Matrix( 1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y ) mat = ~m1 * morph[1] * m1 cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" else: cm = "" # --------------------------------------------------------------------- # adjust for text orientation / rotation # --------------------------------------------------------------------- progr = 1 # direction of line progress c_pnt = pymupdf.Point(0, fontsize * ascender) # used for line progress if rot == 0: # normal orientation point = rect.tl + c_pnt # line 1 is 'lheight' below top maxwidth = rect.width # pixels available in one line maxheight = rect.height # available text height elif rot == 90: # rotate counter clockwise c_pnt = pymupdf.Point(fontsize * ascender, 0) # progress in x-direction point = rect.bl + c_pnt # line 1 'lheight' away from left maxwidth = rect.height # pixels available in one line maxheight = rect.width # available text height cm += cmp90 elif rot == 180: # text upside down # progress upwards in y direction c_pnt = -pymupdf.Point(0, fontsize * ascender) point = rect.br + c_pnt # line 1 'lheight' above bottom maxwidth = rect.width # pixels available in one line progr = -1 # subtract lheight for next line maxheight =rect.height # available text height cm += cm180 else: # rotate clockwise (270 or -90) # progress from right to left c_pnt = -pymupdf.Point(fontsize * ascender, 0) point = rect.tr + c_pnt # line 1 'lheight' left of right maxwidth = rect.height # pixels available in one line progr = -1 # subtract lheight for next line maxheight = rect.width # available text height cm += cmm90 # ===================================================================== # line loop # ===================================================================== just_tab = [] # 'justify' indicators per line for i, line in enumerate(t0): line_t = line.expandtabs(expandtabs).split(" ") # split into words num_words = len(line_t) lbuff = "" # init line buffer rest = maxwidth # available line pixels # ================================================================= # word loop # ================================================================= for j in range(num_words): word = line_t[j] pl_w = pixlen(word) # pixel len of word if rest >= pl_w: # does it fit on the line? lbuff += word + " " # yes, append word rest -= pl_w + blen # update available line space continue # next word # word doesn't fit - output line (if not empty) if lbuff: lbuff = lbuff.rstrip() + "\n" # line full, append line break text += lbuff # append to total text just_tab.append(True) # can align-justify lbuff = "" # re-init line buffer rest = maxwidth # re-init avail. space if pl_w <= maxwidth: # word shorter than 1 line? lbuff = word + " " # start the line with it rest = maxwidth - pl_w - blen # update free space continue # long word: split across multiple lines - char by char ... if len(just_tab) > 0: just_tab[-1] = False # cannot align-justify for c in word: if pixlen(lbuff) <= maxwidth - pixlen(c): lbuff += c else: # line full lbuff += "\n" # close line text += lbuff # append to text just_tab.append(False) # cannot align-justify lbuff = c # start new line with this char lbuff += " " # finish long word rest = maxwidth - pixlen(lbuff) # long word stored if lbuff: # unprocessed line content? text += lbuff.rstrip() # append to text just_tab.append(False) # cannot align-justify if i < len(t0) - 1: # not the last line? text += "\n" # insert line break # compute used part of the textbox if text.endswith("\n"): text = text[:-1] lb_count = text.count("\n") + 1 # number of lines written # text height = line count * line height plus one descender value text_height = lheight * lb_count - descender * fontsize more = text_height - maxheight # difference to height limit if more > pymupdf.EPSILON: # landed too much outside rect return (-1) * more # return deficit, don't output more = abs(more) if more < pymupdf.EPSILON: more = 0 # don't bother with epsilons nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm # initialize output buffer templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf " # center, right, justify: output each line with its own specifics text_t = text.splitlines() # split text in lines again just_tab[-1] = False # never justify last line for i, t in enumerate(text_t): spacing = 0 pl = maxwidth - pixlen(t) # length of empty line part pnt = point + c_pnt * (i * lheight_factor) # text start of line if align == 1: # center: right shift by half width if rot in (0, 180): pnt = pnt + pymupdf.Point(pl / 2, 0) * progr else: pnt = pnt - pymupdf.Point(0, pl / 2) * progr elif align == 2: # right: right shift by full width if rot in (0, 180): pnt = pnt + pymupdf.Point(pl, 0) * progr else: pnt = pnt - pymupdf.Point(0, pl) * progr elif align == 3: # justify spaces = t.count(" ") # number of spaces in line if spaces > 0 and just_tab[i]: # if any, and we may justify spacing = pl / spaces # make every space this much larger else: spacing = 0 # keep normal space length top = height - pnt.y - self.y left = pnt.x + self.x if rot == 90: left = height - pnt.y - self.y top = -pnt.x - self.x elif rot == 270: left = -height + pnt.y + self.y top = pnt.x + self.x elif rot == 180: left = -pnt.x - self.x top = -height + pnt.y + self.y nres += templ(left, top, fname, fontsize) if render_mode > 0: nres += "%i Tr " % render_mode nres += _format_g(border_width * fontsize) + " w " if align == 3: nres += _format_g(spacing) + " Tw " if color is not None: nres += color_str if fill is not None: nres += fill_str nres += "%sTJ\n" % pymupdf.getTJstr(t, tj_glyphs, simple, ordering) nres += "ET\n%sQ\n" % emc self.text_cont += nres self.updateRect(rect) return more def finish( self, width: float = 1, color: OptSeq = (0,), fill: OptSeq = None, lineCap: int = 0, lineJoin: int = 0, dashes: OptStr = None, even_odd: bool = False, morph: OptSeq = None, closePath: bool = True, fill_opacity: float = 1, stroke_opacity: float = 1, oc: int = 0, ) -> None: """Finish the current drawing segment. Notes: Apply colors, opacity, dashes, line style and width, or morphing. Also whether to close the path by connecting last to first point. """ if self.draw_cont == "": # treat empty contents as no-op return if width == 0: # border color makes no sense then color = None elif color is None: # vice versa width = 0 # if color == None and fill == None: # raise ValueError("at least one of 'color' or 'fill' must be given") color_str = pymupdf.ColorCode(color, "c") # ensure proper color string fill_str = pymupdf.ColorCode(fill, "f") # ensure proper fill string optcont = self.page._get_optional_content(oc) if optcont is not None: self.draw_cont = "/OC /%s BDC\n" % optcont + self.draw_cont emc = "EMC\n" else: emc = "" alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity) if alpha is not None: self.draw_cont = "/%s gs\n" % alpha + self.draw_cont if width != 1 and width != 0: self.draw_cont += _format_g(width) + " w\n" if lineCap != 0: self.draw_cont = "%i J\n" % lineCap + self.draw_cont if lineJoin != 0: self.draw_cont = "%i j\n" % lineJoin + self.draw_cont if dashes not in (None, "", "[] 0"): self.draw_cont = "%s d\n" % dashes + self.draw_cont if closePath: self.draw_cont += "h\n" self.last_point = None if color is not None: self.draw_cont += color_str if fill is not None: self.draw_cont += fill_str if color is not None: if not even_odd: self.draw_cont += "B\n" else: self.draw_cont += "B*\n" else: if not even_odd: self.draw_cont += "f\n" else: self.draw_cont += "f*\n" else: self.draw_cont += "S\n" self.draw_cont += emc if pymupdf.CheckMorph(morph): m1 = pymupdf.Matrix( 1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y ) mat = ~m1 * morph[1] * m1 self.draw_cont = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" + self.draw_cont self.totalcont += "\nq\n" + self.draw_cont + "Q\n" self.draw_cont = "" self.last_point = None return def commit(self, overlay: bool = True) -> None: """Update the page's /Contents object with Shape data. The argument controls whether data appear in foreground (default) or background. """ pymupdf.CheckParent(self.page) # doc may have died meanwhile self.totalcont += self.text_cont self.totalcont = self.totalcont.encode() if self.totalcont: if overlay: self.page.wrap_contents() # ensure a balanced graphics state # make /Contents object with dummy stream xref = pymupdf.TOOLS._insert_contents(self.page, b" ", overlay) # update it with potential compression self.doc.update_stream(xref, self.totalcont) self.last_point = None # clean up ... self.rect = None # self.draw_cont = "" # for potential ... self.text_cont = "" # ... self.totalcont = "" # re-use def apply_redactions( page: pymupdf.Page, images: int = 2, graphics: int = 1, text: int = 0 ) -> bool: """Apply the redaction annotations of the page. Args: page: the PDF page. images: 0 - ignore images 1 - remove all overlapping images 2 - blank out overlapping image parts 3 - remove image unless invisible graphics: 0 - ignore graphics 1 - remove graphics if contained in rectangle 2 - remove all overlapping graphics text: 0 - remove text 1 - ignore text """ def center_rect(annot_rect, new_text, font, fsize): """Calculate minimal sub-rectangle for the overlay text. Notes: Because 'insert_textbox' supports no vertical text centering, we calculate an approximate number of lines here and return a sub-rect with smaller height, which should still be sufficient. Args: annot_rect: the annotation rectangle new_text: the text to insert. font: the fontname. Must be one of the CJK or Base-14 set, else the rectangle is returned unchanged. fsize: the fontsize Returns: A rectangle to use instead of the annot rectangle. """ exception_types = (ValueError, mupdf.FzErrorBase) if pymupdf.mupdf_version_tuple < (1, 24): exception_types = ValueError if not new_text: return annot_rect try: text_width = pymupdf.get_text_length(new_text, font, fsize) except exception_types: # unsupported font if g_exceptions_verbose: pymupdf.exception_info() return annot_rect line_height = fsize * 1.2 limit = annot_rect.width h = math.ceil(text_width / limit) * line_height # estimate rect height if h >= annot_rect.height: return annot_rect r = annot_rect y = (annot_rect.tl.y + annot_rect.bl.y - h) * 0.5 r.y0 = y return r pymupdf.CheckParent(page) doc = page.parent if doc.is_encrypted or doc.is_closed: raise ValueError("document closed or encrypted") if not doc.is_pdf: raise ValueError("is no PDF") redact_annots = [] # storage of annot values for annot in page.annots( types=(pymupdf.PDF_ANNOT_REDACT,) # pylint: disable=no-member ): # loop redactions redact_annots.append(annot._get_redact_values()) # save annot values if redact_annots == []: # any redactions on this page? return False # no redactions rc = page._apply_redactions(text, images, graphics) # call MuPDF if not rc: # should not happen really raise ValueError("Error applying redactions.") # now write replacement text in old redact rectangles shape = page.new_shape() for redact in redact_annots: annot_rect = redact["rect"] fill = redact["fill"] if fill: shape.draw_rect(annot_rect) # colorize the rect background shape.finish(fill=fill, color=fill) if "text" in redact.keys(): # if we also have text new_text = redact["text"] align = redact.get("align", 0) fname = redact["fontname"] fsize = redact["fontsize"] color = redact["text_color"] # try finding vertical centered sub-rect trect = center_rect(annot_rect, new_text, fname, fsize) rc = -1 while rc < 0 and fsize >= 4: # while not enough room # (re-) try insertion rc = shape.insert_textbox( trect, new_text, fontname=fname, fontsize=fsize, color=color, align=align, ) fsize -= 0.5 # reduce font if unsuccessful shape.commit() # append new contents object return True # ------------------------------------------------------------------------------ # Remove potentially sensitive data from a PDF. Similar to the Adobe # Acrobat 'sanitize' function # ------------------------------------------------------------------------------ def scrub( doc: pymupdf.Document, attached_files: bool = True, clean_pages: bool = True, embedded_files: bool = True, hidden_text: bool = True, javascript: bool = True, metadata: bool = True, redactions: bool = True, redact_images: int = 0, remove_links: bool = True, reset_fields: bool = True, reset_responses: bool = True, thumbnails: bool = True, xml_metadata: bool = True, ) -> None: def remove_hidden(cont_lines): """Remove hidden text from a PDF page. Args: cont_lines: list of lines with /Contents content. Should have status from after page.cleanContents(). Returns: List of /Contents lines from which hidden text has been removed. Notes: The input must have been created after the page's /Contents object(s) have been cleaned with page.cleanContents(). This ensures a standard formatting: one command per line, single spaces between operators. This allows for drastic simplification of this code. """ out_lines = [] # will return this in_text = False # indicate if within BT/ET object suppress = False # indicate text suppression active make_return = False for line in cont_lines: if line == b"BT": # start of text object in_text = True # switch on out_lines.append(line) # output it continue if line == b"ET": # end of text object in_text = False # switch off out_lines.append(line) # output it continue if line == b"3 Tr": # text suppression operator suppress = True # switch on make_return = True continue if line[-2:] == b"Tr" and line[0] != b"3": suppress = False # text rendering changed out_lines.append(line) continue if line == b"Q": # unstack command also switches off suppress = False out_lines.append(line) continue if suppress and in_text: # suppress hidden lines continue out_lines.append(line) if make_return: return out_lines else: return None if not doc.is_pdf: # only works for PDF raise ValueError("is no PDF") if doc.is_encrypted or doc.is_closed: raise ValueError("closed or encrypted doc") if clean_pages is False: hidden_text = False redactions = False if metadata: doc.set_metadata({}) # remove standard metadata for page in doc: if reset_fields: # reset form fields (widgets) for widget in page.widgets(): widget.reset() if remove_links: links = page.get_links() # list of all links on page for link in links: # remove all links page.delete_link(link) found_redacts = False for annot in page.annots(): if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files: annot.update_file(buffer=b" ") # set file content to empty if reset_responses: annot.delete_responses() if annot.type[0] == pymupdf.PDF_ANNOT_REDACT: # pylint: disable=no-member found_redacts = True if redactions and found_redacts: page.apply_redactions(images=redact_images) if not (clean_pages or hidden_text): continue # done with the page page.clean_contents() if not page.get_contents(): continue if hidden_text: xref = page.get_contents()[0] # only one b/o cleaning! cont = doc.xref_stream(xref) cont_lines = remove_hidden(cont.splitlines()) # remove hidden text if cont_lines: # something was actually removed cont = b"\n".join(cont_lines) doc.update_stream(xref, cont) # rewrite the page /Contents if thumbnails: # remove page thumbnails? if doc.xref_get_key(page.xref, "Thumb")[0] != "null": doc.xref_set_key(page.xref, "Thumb", "null") # pages are scrubbed, now perform document-wide scrubbing # remove embedded files if embedded_files: for name in doc.embfile_names(): doc.embfile_del(name) if xml_metadata: doc.del_xml_metadata() if not (xml_metadata or javascript): xref_limit = 0 else: xref_limit = doc.xref_length() for xref in range(1, xref_limit): if not doc.xref_object(xref): msg = "bad xref %i - clean PDF before scrubbing" % xref raise ValueError(msg) if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript": # a /JavaScript action object obj = "<>" # replace with a null JavaScript doc.update_object(xref, obj) # update this object continue # no further handling if not xml_metadata: continue if doc.xref_get_key(xref, "Type")[1] == "/Metadata": # delete any metadata object directly doc.update_object(xref, "<<>>") doc.update_stream(xref, b"deleted", new=True) continue if doc.xref_get_key(xref, "Metadata")[0] != "null": doc.xref_set_key(xref, "Metadata", "null") def _show_fz_text( text): #if mupdf_cppyy: # assert isinstance( text, cppyy.gbl.mupdf.Text) #else: # assert isinstance( text, mupdf.Text) num_spans = 0 num_chars = 0 span = text.m_internal.head while 1: if not span: break num_spans += 1 num_chars += span.len span = span.next return f'num_spans={num_spans} num_chars={num_chars}' def fill_textbox( writer: pymupdf.TextWriter, rect: rect_like, text: typing.Union[str, list], pos: point_like = None, font: typing.Optional[pymupdf.Font] = None, fontsize: float = 11, lineheight: OptFloat = None, align: int = 0, warn: bool = None, right_to_left: bool = False, small_caps: bool = False, ) -> tuple: """Fill a rectangle with text. Args: writer: pymupdf.TextWriter object (= "self") rect: rect-like to receive the text. text: string or list/tuple of strings. pos: point-like start position of first word. font: pymupdf.Font object (default pymupdf.Font('helv')). fontsize: the fontsize. lineheight: overwrite the font property align: (int) 0 = left, 1 = center, 2 = right, 3 = justify warn: (bool) text overflow action: none, warn, or exception right_to_left: (bool) indicate right-to-left language. """ rect = pymupdf.Rect(rect) if rect.is_empty: raise ValueError("fill rect must not empty.") if type(font) is not pymupdf.Font: font = pymupdf.Font("helv") def textlen(x): """Return length of a string.""" return font.text_length( x, fontsize=fontsize, small_caps=small_caps ) # abbreviation def char_lengths(x): """Return list of single character lengths for a string.""" return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps) def append_this(pos, text): ret = writer.append( pos, text, font=font, fontsize=fontsize, small_caps=small_caps ) return ret tolerance = fontsize * 0.2 # extra distance to left border space_len = textlen(" ") std_width = rect.width - tolerance std_start = rect.x0 + tolerance def norm_words(width, words): """Cut any word in pieces no longer than 'width'.""" nwords = [] word_lengths = [] for w in words: wl_lst = char_lengths(w) wl = sum(wl_lst) if wl <= width: # nothing to do - copy over nwords.append(w) word_lengths.append(wl) continue # word longer than rect width - split it in parts n = len(wl_lst) while n > 0: wl = sum(wl_lst[:n]) if wl <= width: nwords.append(w[:n]) word_lengths.append(wl) w = w[n:] wl_lst = wl_lst[n:] n = len(wl_lst) else: n -= 1 return nwords, word_lengths def output_justify(start, line): """Justified output of a line.""" # ignore leading / trailing / multiple spaces words = [w for w in line.split(" ") if w != ""] nwords = len(words) if nwords == 0: return if nwords == 1: # single word cannot be justified append_this(start, words[0]) return tl = sum([textlen(w) for w in words]) # total word lengths gaps = nwords - 1 # number of word gaps gapl = (std_width - tl) / gaps # width of each gap for w in words: _, lp = append_this(start, w) # output one word start.x = lp.x + gapl # next start at word end plus gap return asc = font.ascender dsc = font.descender if not lineheight: if asc - dsc <= 1: lheight = 1.2 else: lheight = asc - dsc else: lheight = lineheight LINEHEIGHT = fontsize * lheight # effective line height width = std_width # available horizontal space # starting point of text if pos is not None: pos = pymupdf.Point(pos) else: # default is just below rect top-left pos = rect.tl + (tolerance, fontsize * asc) if pos not in rect: raise ValueError("Text must start in rectangle.") # calculate displacement factor for alignment if align == pymupdf.TEXT_ALIGN_CENTER: factor = 0.5 elif align == pymupdf.TEXT_ALIGN_RIGHT: factor = 1.0 else: factor = 0 # split in lines if just a string was given if type(text) is str: textlines = text.splitlines() else: textlines = [] for line in text: textlines.extend(line.splitlines()) max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1 new_lines = [] # the final list of textbox lines no_justify = [] # no justify for these line numbers for i, line in enumerate(textlines): if line in ("", " "): new_lines.append((line, space_len)) width = rect.width - tolerance no_justify.append((len(new_lines) - 1)) continue if i == 0: width = rect.x1 - pos.x else: width = rect.width - tolerance if right_to_left: # reverses Arabic / Hebrew text front to back line = writer.clean_rtl(line) tl = textlen(line) if tl <= width: # line short enough new_lines.append((line, tl)) no_justify.append((len(new_lines) - 1)) continue # we need to split the line in fitting parts words = line.split(" ") # the words in the line # cut in parts any words that are longer than rect width words, word_lengths = norm_words(std_width, words) n = len(words) while True: line0 = " ".join(words[:n]) wl = sum(word_lengths[:n]) + space_len * (len(word_lengths[:n]) - 1) if wl <= width: new_lines.append((line0, wl)) words = words[n:] word_lengths = word_lengths[n:] n = len(words) line0 = None else: n -= 1 if len(words) == 0: break # ------------------------------------------------------------------------- # List of lines created. Each item is (text, tl), where 'tl' is the PDF # output length (float) and 'text' is the text. Except for justified text, # this is output-ready. # ------------------------------------------------------------------------- nlines = len(new_lines) if nlines > max_lines: msg = "Only fitting %i of %i lines." % (max_lines, nlines) if warn is True: pymupdf.message("Warning: " + msg) elif warn is False: raise ValueError(msg) start = pymupdf.Point() no_justify += [len(new_lines) - 1] # no justifying of last line for i in range(max_lines): try: line, tl = new_lines.pop(0) except IndexError: # Verbose in PyMuPDF/tests. if g_exceptions_verbose: pymupdf.exception_info() break if right_to_left: # Arabic, Hebrew line = "".join(reversed(line)) if i == 0: # may have different start for first line start = pos if align == pymupdf.TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width: output_justify(start, line) start.x = std_start start.y += LINEHEIGHT continue if i > 0 or pos.x == std_start: # left, center, right alignments start.x += (width - tl) * factor append_this(start, line) start.x = std_start start.y += LINEHEIGHT return new_lines # return non-written lines # ------------------------------------------------------------------------ # Optional Content functions # ------------------------------------------------------------------------ def get_oc(doc: pymupdf.Document, xref: int) -> int: """Return optional content object xref for an image or form xobject. Args: xref: (int) xref number of an image or form xobject. """ if doc.is_closed or doc.is_encrypted: raise ValueError("document close or encrypted") t, name = doc.xref_get_key(xref, "Subtype") if t != "name" or name not in ("/Image", "/Form"): raise ValueError("bad object type at xref %i" % xref) t, oc = doc.xref_get_key(xref, "OC") if t != "xref": return 0 rc = int(oc.replace("0 R", "")) return rc def set_oc(doc: pymupdf.Document, xref: int, oc: int) -> None: """Attach optional content object to image or form xobject. Args: xref: (int) xref number of an image or form xobject oc: (int) xref number of an OCG or OCMD """ if doc.is_closed or doc.is_encrypted: raise ValueError("document close or encrypted") t, name = doc.xref_get_key(xref, "Subtype") if t != "name" or name not in ("/Image", "/Form"): raise ValueError("bad object type at xref %i" % xref) if oc > 0: t, name = doc.xref_get_key(oc, "Type") if t != "name" or name not in ("/OCG", "/OCMD"): raise ValueError("bad object type at xref %i" % oc) if oc == 0 and "OC" in doc.xref_get_keys(xref): doc.xref_set_key(xref, "OC", "null") return None doc.xref_set_key(xref, "OC", "%i 0 R" % oc) return None def set_ocmd( doc: pymupdf.Document, xref: int = 0, ocgs: typing.Union[list, None] = None, policy: OptStr = None, ve: typing.Union[list, None] = None, ) -> int: """Create or update an OCMD object in a PDF document. Args: xref: (int) 0 for creating a new object, otherwise update existing one. ocgs: (list) OCG xref numbers, which shall be subject to 'policy'. policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing). ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'. Returns: Xref of the created or updated OCMD. """ all_ocgs = set(doc.get_ocgs().keys()) def ve_maker(ve): if type(ve) not in (list, tuple) or len(ve) < 2: raise ValueError("bad 've' format: %s" % ve) if ve[0].lower() not in ("and", "or", "not"): raise ValueError("bad operand: %s" % ve[0]) if ve[0].lower() == "not" and len(ve) != 2: raise ValueError("bad 've' format: %s" % ve) item = "[/%s" % ve[0].title() for x in ve[1:]: if type(x) is int: if x not in all_ocgs: raise ValueError("bad OCG %i" % x) item += " %i 0 R" % x else: item += " %s" % ve_maker(x) item += "]" return item text = "< dict: """Return the definition of an OCMD (optional content membership dictionary). Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and /VE (visibility expression, PDF array). Via string manipulation, this info is converted to a Python dictionary with keys "xref", "ocgs", "policy" and "ve" - ready to recycle as input for 'set_ocmd()'. """ if xref not in range(doc.xref_length()): raise ValueError("bad xref") text = doc.xref_object(xref, compressed=True) if "/Type/OCMD" not in text: raise ValueError("bad object type") textlen = len(text) p0 = text.find("/OCGs[") # look for /OCGs key p1 = text.find("]", p0) if p0 < 0 or p1 < 0: # no OCGs found ocgs = None else: ocgs = text[p0 + 6 : p1].replace("0 R", " ").split() ocgs = list(map(int, ocgs)) p0 = text.find("/P/") # look for /P policy key if p0 < 0: policy = None else: p1 = text.find("ff", p0) if p1 < 0: p1 = text.find("on", p0) if p1 < 0: # some irregular syntax raise ValueError("bad object at xref") else: policy = text[p0 + 3 : p1 + 2] p0 = text.find("/VE[") # look for /VE visibility expression key if p0 < 0: # no visibility expression found ve = None else: lp = rp = 0 # find end of /VE by finding last ']'. p1 = p0 while lp < 1 or lp != rp: p1 += 1 if not p1 < textlen: # some irregular syntax raise ValueError("bad object at xref") if text[p1] == "[": lp += 1 if text[p1] == "]": rp += 1 # p1 now positioned at the last "]" ve = text[p0 + 3 : p1 + 1] # the PDF /VE array ve = ( ve.replace("/And", '"and",') .replace("/Not", '"not",') .replace("/Or", '"or",') ) ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[") import json try: ve = json.loads(ve) except Exception: pymupdf.exception_info() pymupdf.message(f"bad /VE key: {ve!r}") raise return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve} """ Handle page labels for PDF documents. Reading ------- * compute the label of a page * find page number(s) having the given label. Writing ------- Supports setting (defining) page labels for PDF documents. A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and significant parts of the following code during late December 2020 through early January 2021. """ def rule_dict(item): """Make a Python dict from a PDF page label rule. Args: item -- a tuple (pno, rule) with the start page number and the rule string like <>. Returns: A dict like {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}. """ # Jorj McKie, 2021-01-06 pno, rule = item rule = rule[2:-2].split("/")[1:] # strip "<<" and ">>" d = {"startpage": pno, "prefix": "", "firstpagenum": 1} skip = False for i, item in enumerate(rule): # pylint: disable=redefined-argument-from-local if skip: # this item has already been processed skip = False # deactivate skipping again continue if item == "S": # style specification d["style"] = rule[i + 1] # next item has the style skip = True # do not process next item again continue if item.startswith("P"): # prefix specification: extract the string x = item[1:].replace("(", "").replace(")", "") d["prefix"] = x continue if item.startswith("St"): # start page number specification x = int(item[2:]) d["firstpagenum"] = x return d def get_label_pno(pgNo, labels): """Return the label for this page number. Args: pgNo: page number, 0-based. labels: result of doc._get_page_labels(). Returns: The label (str) of the page number. Errors return an empty string. """ # Jorj McKie, 2021-01-06 item = [x for x in labels if x[0] <= pgNo][-1] rule = rule_dict(item) prefix = rule.get("prefix", "") style = rule.get("style", "") pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"] return construct_label(style, prefix, pagenumber) def get_label(page): """Return the label for this PDF page. Args: page: page object. Returns: The label (str) of the page. Errors return an empty string. """ # Jorj McKie, 2021-01-06 labels = page.parent._get_page_labels() if not labels: return "" labels.sort() return get_label_pno(page.number, labels) def get_page_numbers(doc, label, only_one=False): """Return a list of page numbers with the given label. Args: doc: PDF document object (resp. 'self'). label: (str) label. only_one: (bool) stop searching after first hit. Returns: List of page numbers having this label. """ # Jorj McKie, 2021-01-06 numbers = [] if not label: return numbers labels = doc._get_page_labels() if labels == []: return numbers for i in range(doc.page_count): plabel = get_label_pno(i, labels) if plabel == label: numbers.append(i) if only_one: break return numbers def construct_label(style, prefix, pno) -> str: """Construct a label based on style, prefix and page number.""" # William Chapman, 2021-01-06 n_str = "" if style == "D": n_str = str(pno) elif style == "r": n_str = integerToRoman(pno).lower() elif style == "R": n_str = integerToRoman(pno).upper() elif style == "a": n_str = integerToLetter(pno).lower() elif style == "A": n_str = integerToLetter(pno).upper() result = prefix + n_str return result def integerToLetter(i) -> str: """Returns letter sequence string for integer i.""" # William Chapman, Jorj McKie, 2021-01-06 import string ls = string.ascii_uppercase n, a = 1, i while pow(26, n) <= a: a -= int(math.pow(26, n)) n += 1 str_t = "" for j in reversed(range(n)): f, g = divmod(a, int(math.pow(26, j))) str_t += ls[f] a = g return str_t def integerToRoman(num: int) -> str: """Return roman numeral for an integer.""" # William Chapman, Jorj McKie, 2021-01-06 roman = ( (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"), (100, "C"), (90, "XC"), (50, "L"), (40, "XL"), (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"), ) def roman_num(num): for r, ltr in roman: x, _ = divmod(num, r) yield ltr * x num -= r * x if num <= 0: break return "".join([a for a in roman_num(num)]) def get_page_labels(doc): """Return page label definitions in PDF document. Args: doc: PDF document (resp. 'self'). Returns: A list of dictionaries with the following format: {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}. """ # Jorj McKie, 2021-01-10 return [rule_dict(item) for item in doc._get_page_labels()] def set_page_labels(doc, labels): """Add / replace page label definitions in PDF document. Args: doc: PDF document (resp. 'self'). labels: list of label dictionaries like: {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}, as returned by get_page_labels(). """ # William Chapman, 2021-01-06 def create_label_str(label): """Convert Python label dict to correspnding PDF rule string. Args: label: (dict) build rule for the label. Returns: PDF label rule string wrapped in "<<", ">>". """ s = "%i<<" % label["startpage"] if label.get("prefix", "") != "": s += "/P(%s)" % label["prefix"] if label.get("style", "") != "": s += "/S/%s" % label["style"] if label.get("firstpagenum", 1) > 1: s += "/St %i" % label["firstpagenum"] s += ">>" return s def create_nums(labels): """Return concatenated string of all labels rules. Args: labels: (list) dictionaries as created by function 'rule_dict'. Returns: PDF compatible string for page label definitions, ready to be enclosed in PDF array 'Nums[...]'. """ labels.sort(key=lambda x: x["startpage"]) s = "".join([create_label_str(label) for label in labels]) return s doc._set_page_labels(create_nums(labels)) # End of Page Label Code ------------------------------------------------- def has_links(doc: pymupdf.Document) -> bool: """Check whether there are links on any page.""" if doc.is_closed: raise ValueError("document closed") if not doc.is_pdf: raise ValueError("is no PDF") for i in range(doc.page_count): for item in doc.page_annot_xrefs(i): if item[1] == pymupdf.PDF_ANNOT_LINK: # pylint: disable=no-member return True return False def has_annots(doc: pymupdf.Document) -> bool: """Check whether there are annotations on any page.""" if doc.is_closed: raise ValueError("document closed") if not doc.is_pdf: raise ValueError("is no PDF") for i in range(doc.page_count): for item in doc.page_annot_xrefs(i): # pylint: disable=no-member if not (item[1] == pymupdf.PDF_ANNOT_LINK or item[1] == pymupdf.PDF_ANNOT_WIDGET): return True return False # ------------------------------------------------------------------- # Functions to recover the quad contained in a text extraction bbox # ------------------------------------------------------------------- def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad: """Compute the quad located inside the bbox. The bbox may be any of the resp. tuples occurring inside the given span. Args: line_dir: (tuple) 'line["dir"]' of the owning line or None. span: (dict) the span. May be from get_texttrace() method. bbox: (tuple) the bbox of the span or any of its characters. Returns: The quad which is wrapped by the bbox. """ if line_dir is None: line_dir = span["dir"] cos, sin = line_dir bbox = pymupdf.Rect(bbox) # make it a rect if pymupdf.TOOLS.set_small_glyph_heights(): # ==> just fontsize as height d = 1 else: d = span["ascender"] - span["descender"] height = d * span["size"] # the quad's rectangle height # The following are distances from the bbox corners, at wich we find the # respective quad points. The computation depends on in which quadrant # the text writing angle is located. hs = height * sin hc = height * cos if hc >= 0 and hs <= 0: # quadrant 1 ul = bbox.bl - (0, hc) ur = bbox.tr + (hs, 0) ll = bbox.bl - (hs, 0) lr = bbox.tr + (0, hc) elif hc <= 0 and hs <= 0: # quadrant 2 ul = bbox.br + (hs, 0) ur = bbox.tl - (0, hc) ll = bbox.br + (0, hc) lr = bbox.tl - (hs, 0) elif hc <= 0 and hs >= 0: # quadrant 3 ul = bbox.tr - (0, hc) ur = bbox.bl + (hs, 0) ll = bbox.tr - (hs, 0) lr = bbox.bl + (0, hc) else: # quadrant 4 ul = bbox.tl + (hs, 0) ur = bbox.br - (0, hc) ll = bbox.tl + (0, hc) lr = bbox.br - (hs, 0) return pymupdf.Quad(ul, ur, ll, lr) def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad: """Recover the quadrilateral of a text span. Args: line_dir: (tuple) 'line["dir"]' of the owning line. span: the span. Returns: The quadrilateral enveloping the span's text. """ if type(line_dir) is not tuple or len(line_dir) != 2: raise ValueError("bad line dir argument") if type(span) is not dict: raise ValueError("bad span argument") return recover_bbox_quad(line_dir, span, span["bbox"]) def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad: """Calculate the line quad for 'dict' / 'rawdict' text extractions. The lower quad points are those of the first, resp. last span quad. The upper points are determined by the maximum span quad height. From this, compute a rect with bottom-left in (0, 0), convert this to a quad and rotate and shift back to cover the text of the spans. Args: spans: (list, optional) sub-list of spans to consider. Returns: pymupdf.Quad covering selected spans. """ if spans is None: # no sub-selection spans = line["spans"] # all spans if len(spans) == 0: raise ValueError("bad span list") line_dir = line["dir"] # text direction cos, sin = line_dir q0 = recover_quad(line_dir, spans[0]) # quad of first span if len(spans) > 1: # get quad of last span q1 = recover_quad(line_dir, spans[-1]) else: q1 = q0 # last = first line_ll = q0.ll # lower-left of line quad line_lr = q1.lr # lower-right of line quad mat0 = pymupdf.planish_line(line_ll, line_lr) # map base line to x-axis such that line_ll goes to (0, 0) x_lr = line_lr * mat0 small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights? h = max( [s["size"] * (1 if small else (s["ascender"] - s["descender"])) for s in spans] ) line_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle line_quad = line_rect.quad # make it a quad and: line_quad *= ~mat0 return line_quad def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad: """Calculate the span quad for 'dict' / 'rawdict' text extractions. Notes: There are two execution paths: 1. For the full span quad, the result of 'recover_quad' is returned. 2. For the quad of a sub-list of characters, the char quads are computed and joined. This is only supported for the "rawdict" extraction option. Args: line_dir: (tuple) 'line["dir"]' of the owning line. span: (dict) the span. chars: (list, optional) sub-list of characters to consider. Returns: pymupdf.Quad covering selected characters. """ if line_dir is None: # must be a span from get_texttrace() line_dir = span["dir"] if chars is None: # no sub-selection return recover_quad(line_dir, span) if "chars" not in span.keys(): raise ValueError("need 'rawdict' option to sub-select chars") q0 = recover_char_quad(line_dir, span, chars[0]) # quad of first char if len(chars) > 1: # get quad of last char q1 = recover_char_quad(line_dir, span, chars[-1]) else: q1 = q0 # last = first span_ll = q0.ll # lower-left of span quad span_lr = q1.lr # lower-right of span quad mat0 = pymupdf.planish_line(span_ll, span_lr) # map base line to x-axis such that span_ll goes to (0, 0) x_lr = span_lr * mat0 small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights? h = span["size"] * (1 if small else (span["ascender"] - span["descender"])) span_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle span_quad = span_rect.quad # make it a quad and: span_quad *= ~mat0 # rotate back and shift back return span_quad def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad: """Recover the quadrilateral of a text character. This requires the "rawdict" option of text extraction. Args: line_dir: (tuple) 'line["dir"]' of the span's line. span: (dict) the span dict. char: (dict) the character dict. Returns: The quadrilateral enveloping the character. """ if line_dir is None: line_dir = span["dir"] if type(line_dir) is not tuple or len(line_dir) != 2: raise ValueError("bad line dir argument") if type(span) is not dict: raise ValueError("bad span argument") if type(char) is dict: bbox = pymupdf.Rect(char["bbox"]) elif type(char) is tuple: bbox = pymupdf.Rect(char[3]) else: raise ValueError("bad span argument") return recover_bbox_quad(line_dir, span, bbox) # ------------------------------------------------------------------- # Building font subsets using fontTools # ------------------------------------------------------------------- def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool = False) -> None: """Build font subsets of a PDF. Requires package 'fontTools'. Eligible fonts are potentially replaced by smaller versions. Page text is NOT rewritten and thus should retain properties like being hidden or controlled by optional content. This method by default uses MuPDF's own internal feature to create subset fonts. As this is a new function, errors may still occur. In this case, please fall back to using the previous version by using "fallback=True". """ # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs)) # An embedded font is uniquely defined by its fontbuffer only. It may have # multiple names and xrefs. # Once the sets of used unicodes and glyphs are known, we compute a # smaller version of the buffer user package fontTools. if fallback is False: # by default use MuPDF function pdf = mupdf.pdf_document_from_fz_document(doc) mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count))) return font_buffers = {} def get_old_widths(xref): """Retrieve old font '/W' and '/DW' values.""" df = doc.xref_get_key(xref, "DescendantFonts") if df[0] != "array": # only handle xref specifications return None, None df_xref = int(df[1][1:-1].replace("0 R", "")) widths = doc.xref_get_key(df_xref, "W") if widths[0] != "array": # no widths key found widths = None else: widths = widths[1] dwidths = doc.xref_get_key(df_xref, "DW") if dwidths[0] != "int": dwidths = None else: dwidths = dwidths[1] return widths, dwidths def set_old_widths(xref, widths, dwidths): """Restore the old '/W' and '/DW' in subsetted font. If either parameter is None or evaluates to False, the corresponding dictionary key will be set to null. """ df = doc.xref_get_key(xref, "DescendantFonts") if df[0] != "array": # only handle xref specs return None df_xref = int(df[1][1:-1].replace("0 R", "")) if (type(widths) is not str or not widths) and doc.xref_get_key(df_xref, "W")[ 0 ] != "null": doc.xref_set_key(df_xref, "W", "null") else: doc.xref_set_key(df_xref, "W", widths) if (type(dwidths) is not str or not dwidths) and doc.xref_get_key( df_xref, "DW" )[0] != "null": doc.xref_set_key(df_xref, "DW", "null") else: doc.xref_set_key(df_xref, "DW", dwidths) return None def set_subset_fontname(new_xref): """Generate a name prefix to tag a font as subset. We use a random generator to select 6 upper case ASCII characters. The prefixed name must be put in the font xref as the "/BaseFont" value and in the FontDescriptor object as the '/FontName' value. """ # The following generates a prefix like 'ABCDEF+' import random import string prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+" font_str = doc.xref_object(new_xref, compressed=True) font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix) df = doc.xref_get_key(new_xref, "DescendantFonts") if df[0] == "array": df_xref = int(df[1][1:-1].replace("0 R", "")) fd = doc.xref_get_key(df_xref, "FontDescriptor") if fd[0] == "xref": fd_xref = int(fd[1].replace("0 R", "")) fd_str = doc.xref_object(fd_xref, compressed=True) fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix) doc.update_object(fd_xref, fd_str) doc.update_object(new_xref, font_str) def build_subset(buffer, unc_set, gid_set): """Build font subset using fontTools. Args: buffer: (bytes) the font given as a binary buffer. unc_set: (set) required glyph ids. Returns: Either None if subsetting is unsuccessful or the subset font buffer. """ try: import fontTools.subset as fts except ImportError: if g_exceptions_verbose: pymupdf.exception_info() pymupdf.message("This method requires fontTools to be installed.") raise import tempfile tmp_dir = tempfile.gettempdir() oldfont_path = f"{tmp_dir}/oldfont.ttf" newfont_path = f"{tmp_dir}/newfont.ttf" uncfile_path = f"{tmp_dir}/uncfile.txt" args = [ oldfont_path, "--retain-gids", f"--output-file={newfont_path}", "--layout-features='*'", "--passthrough-tables", "--ignore-missing-glyphs", "--ignore-missing-unicodes", "--symbol-cmap", ] # store glyph ids or unicodes as file with open(f"{tmp_dir}/uncfile.txt", "w", encoding='utf8') as unc_file: if 0xFFFD in unc_set: # error unicode exists -> use glyphs args.append(f"--gids-file={uncfile_path}") gid_set.add(189) unc_list = list(gid_set) for unc in unc_list: unc_file.write("%i\n" % unc) else: args.append(f"--unicodes-file={uncfile_path}") unc_set.add(255) unc_list = list(unc_set) for unc in unc_list: unc_file.write("%04x\n" % unc) # store fontbuffer as a file with open(oldfont_path, "wb") as fontfile: fontfile.write(buffer) try: os.remove(newfont_path) # remove old file except Exception: pass try: # invoke fontTools subsetter fts.main(args) font = pymupdf.Font(fontfile=newfont_path) new_buffer = font.buffer # subset font binary if font.glyph_count == 0: # intercept empty font new_buffer = None except Exception: pymupdf.exception_info() new_buffer = None try: os.remove(uncfile_path) except Exception: pymupdf.exception_info() pass try: os.remove(oldfont_path) except Exception: pymupdf.exception_info() pass try: os.remove(newfont_path) except Exception: pymupdf.exception_info() pass return new_buffer def repl_fontnames(doc): """Populate 'font_buffers'. For each font candidate, store its xref and the list of names by which PDF text may refer to it (there may be multiple). """ def norm_name(name): """Recreate font name that contains PDF hex codes. E.g. #20 -> space, chr(32) """ while "#" in name: p = name.find("#") c = int(name[p + 1 : p + 3], 16) name = name.replace(name[p : p + 3], chr(c)) return name def get_fontnames(doc, item): """Return a list of fontnames for an item of page.get_fonts(). There may be multiple names e.g. for Type0 fonts. """ fontname = item[3] names = [fontname] fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:] fontname = norm_name(fontname) if fontname not in names: names.append(fontname) descendents = doc.xref_get_key(item[0], "DescendantFonts") if descendents[0] != "array": return names descendents = descendents[1][1:-1] if descendents.endswith(" 0 R"): xref = int(descendents[:-4]) descendents = doc.xref_object(xref, compressed=True) p1 = descendents.find("/BaseFont") if p1 >= 0: p2 = descendents.find("/", p1 + 1) p1 = min(descendents.find("/", p2 + 1), descendents.find(">>", p2 + 1)) fontname = descendents[p2 + 1 : p1] fontname = norm_name(fontname) if fontname not in names: names.append(fontname) return names for i in range(doc.page_count): for f in doc.get_page_fonts(i, full=True): font_xref = f[0] # font xref font_ext = f[1] # font file extension basename = f[3] # font basename if font_ext not in ( # skip if not supported by fontTools "otf", "ttf", "woff", "woff2", ): continue # skip fonts which already are subsets if len(basename) > 6 and basename[6] == "+": continue extr = doc.extract_font(font_xref) fontbuffer = extr[-1] names = get_fontnames(doc, f) name_set, xref_set, subsets = font_buffers.get( fontbuffer, (set(), set(), (set(), set())) ) xref_set.add(font_xref) for name in names: name_set.add(name) font = pymupdf.Font(fontbuffer=fontbuffer) name_set.add(font.name) del font font_buffers[fontbuffer] = (name_set, xref_set, subsets) def find_buffer_by_name(name): for buffer, (name_set, _, _) in font_buffers.items(): if name in name_set: return buffer return None # ----------------- # main function # ----------------- repl_fontnames(doc) # populate font information if not font_buffers: # nothing found to do if verbose: pymupdf.message(f'No fonts to subset.') return 0 old_fontsize = 0 new_fontsize = 0 for fontbuffer in font_buffers.keys(): old_fontsize += len(fontbuffer) # Scan page text for usage of subsettable fonts for page in doc: # go through the text and extend set of used glyphs by font # we use a modified MuPDF trace device, which delivers us glyph ids. for span in page.get_texttrace(): if type(span) is not dict: # skip useless information continue fontname = span["font"][:33] # fontname for the span buffer = find_buffer_by_name(fontname) if buffer is None: continue name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer] for c in span["chars"]: set_ucs.add(c[0]) # unicode set_gid.add(c[1]) # glyph id font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid)) # build the font subsets for old_buffer, (name_set, xref_set, subsets) in font_buffers.items(): new_buffer = build_subset(old_buffer, subsets[0], subsets[1]) fontname = list(name_set)[0] if new_buffer is None or len(new_buffer) >= len(old_buffer): # subset was not created or did not get smaller if verbose: pymupdf.message(f'Cannot subset {fontname!r}.') continue if verbose: pymupdf.message(f"Built subset of font {fontname!r}.") val = doc._insert_font(fontbuffer=new_buffer) # store subset font in PDF new_xref = val[0] # get its xref set_subset_fontname(new_xref) # tag fontname as subset font font_str = doc.xref_object( # get its object definition new_xref, compressed=True, ) # walk through the original font xrefs and replace each by the subset def for font_xref in xref_set: # we need the original '/W' and '/DW' width values width_table, def_width = get_old_widths(font_xref) # ... and replace original font definition at xref with it doc.update_object(font_xref, font_str) # now copy over old '/W' and '/DW' values if width_table or def_width: set_old_widths(font_xref, width_table, def_width) # 'new_xref' remains unused in the PDF and must be removed # by garbage collection. new_fontsize += len(new_buffer) return old_fontsize - new_fontsize # ------------------------------------------------------------------- # Copy XREF object to another XREF # ------------------------------------------------------------------- def xref_copy(doc: pymupdf.Document, source: int, target: int, *, keep: list = None) -> None: """Copy a PDF dictionary object to another one given their xref numbers. Args: doc: PDF document object source: source xref number target: target xref number, the xref must already exist keep: an optional list of 1st level keys in target that should not be removed before copying. Notes: This works similar to the copy() method of dictionaries in Python. The source may be a stream object. """ if doc.xref_is_stream(source): # read new xref stream, maintaining compression stream = doc.xref_stream_raw(source) doc.update_stream( target, stream, compress=False, # keeps source compression new=True, # in case target is no stream ) # empty the target completely, observe exceptions if keep is None: keep = [] for key in doc.xref_get_keys(target): if key in keep: continue doc.xref_set_key(target, key, "null") # copy over all source dict items for key in doc.xref_get_keys(source): item = doc.xref_get_key(source, key) doc.xref_set_key(target, key, item[1])