""" Copyright (C) 2023 Artifex Software, Inc. This file is part of PyMuPDF. PyMuPDF is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. PyMuPDF is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with MuPDF. If not, see Alternative licensing terms are available from the licensor. For commercial licensing, see or contact Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, CA 94129, USA, for further information. --------------------------------------------------------------------- Portions of this code have been ported from pdfplumber, see https://pypi.org/project/pdfplumber/. The ported code is under the following MIT license: --------------------------------------------------------------------- The MIT License (MIT) Copyright (c) 2015, Jeremy Singer-Vine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --------------------------------------------------------------------- Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt --------------------------------------------------------------------- The porting mainly pertains to files "table.py" and relevant parts of "utils/text.py" within pdfplumber's repository on Github. With respect to "text.py", we have removed functions or features that are not used by table processing. Examples are: * the text search function * simple text extraction * text extraction by lines Original pdfplumber code does neither detect, nor identify table headers. This PyMuPDF port adds respective code to the 'Table' class as method '_get_header'. This is implemented as new class TableHeader with the properties: * bbox: A tuple for the header's bbox * cells: A tuple for each bbox of a column header * names: A list of strings with column header text * external: A bool indicating whether the header is outside the table cells. """ import inspect import itertools import string import html from collections.abc import Sequence from dataclasses import dataclass from operator import itemgetter # ------------------------------------------------------------------- # Start of PyMuPDF interface code # ------------------------------------------------------------------- from . 
import ( Rect, Matrix, TEXTFLAGS_TEXT, TOOLS, EMPTY_RECT, sRGB_to_pdf, Point, message, ) EDGES = [] # vector graphics from PyMuPDF CHARS = [] # text characters from PyMuPDF TEXTPAGE = None white_spaces = set(string.whitespace) # for checking white space only cells # ------------------------------------------------------------------- # End of PyMuPDF interface code # ------------------------------------------------------------------- class UnsetFloat(float): pass NON_NEGATIVE_SETTINGS = [ "snap_tolerance", "snap_x_tolerance", "snap_y_tolerance", "join_tolerance", "join_x_tolerance", "join_y_tolerance", "edge_min_length", "min_words_vertical", "min_words_horizontal", "intersection_tolerance", "intersection_x_tolerance", "intersection_y_tolerance", ] TABLE_STRATEGIES = ["lines", "lines_strict", "text", "explicit"] UNSET = UnsetFloat(0) DEFAULT_SNAP_TOLERANCE = 3 DEFAULT_JOIN_TOLERANCE = 3 DEFAULT_MIN_WORDS_VERTICAL = 3 DEFAULT_MIN_WORDS_HORIZONTAL = 1 DEFAULT_X_TOLERANCE = 3 DEFAULT_Y_TOLERANCE = 3 DEFAULT_X_DENSITY = 7.25 DEFAULT_Y_DENSITY = 13 bbox_getter = itemgetter("x0", "top", "x1", "bottom") LIGATURES = { "ff": "ff", "ffi": "ffi", "ffl": "ffl", "fi": "fi", "fl": "fl", "st": "st", "ſt": "st", } def to_list(collection) -> list: if isinstance(collection, list): return collection elif isinstance(collection, Sequence): return list(collection) elif hasattr(collection, "to_dict"): res = collection.to_dict("records") # pragma: nocover return res else: return list(collection) class TextMap: """ A TextMap maps each unicode character in the text to an individual `char` object (or, in the case of layout-implied whitespace, `None`). 
""" def __init__(self, tuples=None) -> None: self.tuples = tuples self.as_string = "".join(map(itemgetter(0), tuples)) def match_to_dict( self, m, main_group: int = 0, return_groups: bool = True, return_chars: bool = True, ) -> dict: subset = self.tuples[m.start(main_group) : m.end(main_group)] chars = [c for (text, c) in subset if c is not None] x0, top, x1, bottom = objects_to_bbox(chars) result = { "text": m.group(main_group), "x0": x0, "top": top, "x1": x1, "bottom": bottom, } if return_groups: result["groups"] = m.groups() if return_chars: result["chars"] = chars return result class WordMap: """ A WordMap maps words->chars. """ def __init__(self, tuples) -> None: self.tuples = tuples def to_textmap( self, layout: bool = False, layout_width=0, layout_height=0, layout_width_chars: int = 0, layout_height_chars: int = 0, x_density=DEFAULT_X_DENSITY, y_density=DEFAULT_Y_DENSITY, x_shift=0, y_shift=0, y_tolerance=DEFAULT_Y_TOLERANCE, use_text_flow: bool = False, presorted: bool = False, expand_ligatures: bool = True, ) -> TextMap: """ Given a list of (word, chars) tuples (i.e., a WordMap), return a list of (char-text, char) tuples (i.e., a TextMap) that can be used to mimic the structural layout of the text on the page(s), using the following approach: - Sort the words by (doctop, x0) if not already sorted. - Calculate the initial doctop for the starting page. - Cluster the words by doctop (taking `y_tolerance` into account), and iterate through them. - For each cluster, calculate the distance between that doctop and the initial doctop, in points, minus `y_shift`. Divide that distance by `y_density` to calculate the minimum number of newlines that should come before this cluster. Append that number of newlines *minus* the number of newlines already appended, with a minimum of one. - Then for each cluster, iterate through each word in it. 
Divide each word's x0, minus `x_shift`, by `x_density` to calculate the minimum number of characters that should come before this cluster. Append that number of spaces *minus* the number of characters and spaces already appended, with a minimum of one. Then append the word's text. - At the termination of each line, add more spaces if necessary to mimic `layout_width`. - Finally, add newlines to the end if necessary to mimic to `layout_height`. Note: This approach currently works best for horizontal, left-to-right text, but will display all words regardless of orientation. There is room for improvement in better supporting right-to-left text, as well as vertical text. """ _textmap = [] if not len(self.tuples): return TextMap(_textmap) expansions = LIGATURES if expand_ligatures else {} if layout: if layout_width_chars: if layout_width: raise ValueError( "`layout_width` and `layout_width_chars` cannot both be set." ) else: layout_width_chars = int(round(layout_width / x_density)) if layout_height_chars: if layout_height: raise ValueError( "`layout_height` and `layout_height_chars` cannot both be set." ) else: layout_height_chars = int(round(layout_height / y_density)) blank_line = [(" ", None)] * layout_width_chars else: blank_line = [] num_newlines = 0 words_sorted_doctop = ( self.tuples if presorted or use_text_flow else sorted(self.tuples, key=lambda x: float(x[0]["doctop"])) ) first_word = words_sorted_doctop[0][0] doctop_start = first_word["doctop"] - first_word["top"] for i, ws in enumerate( cluster_objects( words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance ) ): y_dist = ( (ws[0][0]["doctop"] - (doctop_start + y_shift)) / y_density if layout else 0 ) num_newlines_prepend = max( # At least one newline, unless this iis the first line int(i > 0), # ... 
class WordExtractor:
    """Group character objects into words, honoring text orientation,
    reading direction, and the configured distance tolerances."""

    def __init__(
        self,
        x_tolerance=DEFAULT_X_TOLERANCE,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        keep_blank_chars: bool = False,
        use_text_flow=False,
        horizontal_ltr=True,  # Should words be read left-to-right?
        vertical_ttb=False,  # Should vertical words be read top-to-bottom?
        extra_attrs=None,
        split_at_punctuation=False,
        expand_ligatures=True,
    ):
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
        self.keep_blank_chars = keep_blank_chars
        self.use_text_flow = use_text_flow
        self.horizontal_ltr = horizontal_ltr
        self.vertical_ttb = vertical_ttb
        self.extra_attrs = [] if extra_attrs is None else extra_attrs
        # True selects the full ASCII punctuation set
        # ('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'); a string selects a
        # custom set; False/None/"" disables punctuation splitting.
        if split_at_punctuation is True:
            self.split_at_punctuation = string.punctuation
        else:
            self.split_at_punctuation = split_at_punctuation or ""
        self.expansions = LIGATURES if expand_ligatures else {}

    def merge_chars(self, ordered_chars: list):
        """Merge one word's chars into a single word dict (text, bbox,
        direction and rotation)."""
        x0, top, x1, bottom = objects_to_bbox(ordered_chars)
        doctop_adj = ordered_chars[0]["doctop"] - ordered_chars[0]["top"]
        upright = ordered_chars[0]["upright"]
        if upright:
            direction = 1 if self.horizontal_ltr else -1
        else:
            direction = 1 if self.vertical_ttb else -1
        matrix = ordered_chars[0]["matrix"]

        # Derive the word's rotation from the first char's matrix.
        rotation = 0
        if not upright and matrix[1] < 0:
            # NOTE(review): reversed() returns an iterator; if
            # extra_attrs is non-empty, the indexing below would fail on
            # it — behavior preserved from the original.
            ordered_chars = reversed(ordered_chars)
            rotation = 270
        if matrix[0] < 0 and matrix[3] < 0:
            rotation = 180
        elif matrix[1] > 0:
            rotation = 90

        word = {
            "text": "".join(
                self.expansions.get(c["text"], c["text"]) for c in ordered_chars
            ),
            "x0": x0,
            "x1": x1,
            "top": top,
            "doctop": top + doctop_adj,
            "bottom": bottom,
            "upright": upright,
            "direction": direction,
            "rotation": rotation,
        }
        for key in self.extra_attrs:
            word[key] = ordered_chars[0][key]
        return word

    def char_begins_new_word(
        self,
        prev_char,
        curr_char,
    ) -> bool:
        """Decide whether `curr_char` starts a new word.

        The test works in a "reading frame" so one comparison covers all
        orientations: the intra-line coordinate runs along the reading
        direction (mirrored for RTL / bottom-to-top so "later" is always
        larger), the inter-line coordinate along line advance. The
        intra-line gap is measured from the *end* of the previous char
        to the *start* of the current one; the inter-line gap compares
        the two chars' line positions (their "tops"), because successive
        lines' bboxes often overlap slightly.

        Note: due to the grouping step earlier in the process,
        curr_char["upright"] always equals prev_char["upright"].
        """
        if curr_char["upright"]:
            intra_tol = self.x_tolerance
            inter_tol = self.y_tolerance
            prev_line = prev_char["top"]
            curr_line = curr_char["top"]
            if self.horizontal_ltr:
                prev_start = prev_char["x0"]
                prev_end = prev_char["x1"]
                curr_start = curr_char["x0"]
            else:
                # Mirror so that "later in reading order" means larger.
                prev_start = -prev_char["x1"]
                prev_end = -prev_char["x0"]
                curr_start = -curr_char["x1"]
        else:
            intra_tol = self.y_tolerance
            inter_tol = self.x_tolerance
            prev_line = prev_char["x0"]
            curr_line = curr_char["x0"]
            if self.vertical_ttb:
                prev_start = prev_char["top"]
                prev_end = prev_char["bottom"]
                curr_start = curr_char["top"]
            else:
                prev_start = -prev_char["bottom"]
                prev_end = -prev_char["top"]
                curr_start = -curr_char["bottom"]

        return bool(
            (curr_start < prev_start)  # moved backwards
            or (curr_start > prev_end + intra_tol)  # intra-line gap too wide
            or (curr_line > prev_line + inter_tol)  # next line
        )

    def iter_chars_to_words(self, ordered_chars):
        """Yield lists of chars, one list per word."""
        current: list = []

        def flush(new_char=None):
            # Emit the running word (if any) and restart, optionally
            # seeding the next word with `new_char`.
            nonlocal current
            if current:
                yield current
            current = [] if new_char is None else [new_char]

        for char in ordered_chars:
            text = char["text"]
            if not self.keep_blank_chars and text.isspace():
                yield from flush(None)
            elif text in self.split_at_punctuation:
                # Punctuation ends the running word and is emitted as a
                # one-char word of its own.
                yield from flush(char)
                yield from flush(None)
            elif current and self.char_begins_new_word(current[-1], char):
                yield from flush(char)
            else:
                current.append(char)

        # Finally, after all chars processed
        if current:
            yield current

    def iter_sort_chars(self, chars):
        """Yield chars rearranged into reading order: upright chars
        first, clustered into lines, each line sorted internally."""

        def upright_key(x) -> int:
            return -int(x["upright"])

        for upright_cluster in cluster_objects(list(chars), upright_key, 0):
            upright = upright_cluster[0]["upright"]
            cluster_key = "doctop" if upright else "x0"
            # Cluster by line
            lines = cluster_objects(
                upright_cluster, itemgetter(cluster_key), self.y_tolerance
            )
            sort_key = "x0" if upright else "doctop"
            for line in lines:
                # Sort within line, reversing when reading order demands.
                ordered = sorted(line, key=itemgetter(sort_key))
                if self.horizontal_ltr if upright else self.vertical_ttb:
                    yield from ordered
                else:
                    yield from reversed(ordered)

    def iter_extract_tuples(self, chars):
        """Yield (word, word_chars) tuples for every word in `chars`."""
        if self.use_text_flow:
            ordered_chars = chars
        else:
            ordered_chars = self.iter_sort_chars(chars)
        # Words never mix chars with different upright/extra attributes.
        grouping_key = itemgetter("upright", *self.extra_attrs)
        for _, char_group in itertools.groupby(ordered_chars, grouping_key):
            for word_chars in self.iter_chars_to_words(char_group):
                yield (self.merge_chars(word_chars), word_chars)

    def extract_wordmap(self, chars) -> WordMap:
        """Return a WordMap over all words found in `chars`."""
        return WordMap(list(self.iter_extract_tuples(chars)))

    def extract_words(self, chars: list) -> list:
        """Return the merged word dicts found in `chars`."""
        return [word for word, _ in self.iter_extract_tuples(chars)]
def extract_words(chars: list, **kwargs) -> list:
    """Module-level convenience wrapper around WordExtractor."""
    return WordExtractor(**kwargs).extract_words(chars)


# Keyword names accepted by the two stages of text extraction.
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()


def chars_to_textmap(chars: list, **kwargs) -> TextMap:
    """Build a TextMap from chars, routing kwargs to the proper stage."""
    kwargs.update({"presorted": True})
    word_kwargs = {k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
    map_kwargs = {k: kwargs[k] for k in TEXTMAP_KWARGS if k in kwargs}
    wordmap = WordExtractor(**word_kwargs).extract_wordmap(chars)
    return wordmap.to_textmap(**map_kwargs)


def extract_text(chars: list, **kwargs) -> str:
    """Extract the text of `chars`, joining words within a line by a
    single space and lines by a newline (or via the layout engine when
    kwargs["layout"] is truthy).  Rotated cells (90/180/270 degrees) get
    special word ordering."""
    chars = to_list(chars)
    if not chars:
        return ""

    if kwargs.get("layout"):
        return chars_to_textmap(chars, **kwargs).as_string

    y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
    extractor = WordExtractor(
        **{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
    )
    words = extractor.extract_words(chars)
    # Rotation cannot change within a cell; take it from the first word.
    rotation = words[0]["rotation"] if words else 0
    if rotation == 90:
        words.sort(key=lambda w: (w["x1"], -w["top"]))
        lines = " ".join(w["text"] for w in words)
    elif rotation == 270:
        words.sort(key=lambda w: (-w["x1"], w["top"]))
        lines = " ".join(w["text"] for w in words)
    else:
        rows = cluster_objects(words, itemgetter("doctop"), y_tolerance)
        lines = "\n".join(" ".join(w["text"] for w in row) for row in rows)
        if rotation == 180:
            # needs extra treatment: reverse the full string and turn
            # line breaks into spaces.
            lines = "".join(c if c != "\n" else " " for c in reversed(lines))
    return lines
def collate_line(
    line_chars: list,
    tolerance=DEFAULT_X_TOLERANCE,
) -> str:
    """Concatenate a line's chars left-to-right, inserting one space
    wherever the horizontal gap exceeds `tolerance`."""
    pieces = []
    last_x1 = None
    for char in sorted(line_chars, key=itemgetter("x0")):
        if last_x1 is not None and char["x0"] > last_x1 + tolerance:
            pieces.append(" ")
        last_x1 = char["x1"]
        pieces.append(char["text"])
    return "".join(pieces)


def dedupe_chars(chars: list, tolerance=1) -> list:
    """
    Removes duplicate chars — those sharing the same text, fontname, size,
    and positioning (within `tolerance`) as other characters in the set.
    """
    key = itemgetter("fontname", "size", "upright", "text")
    pos_key = itemgetter("doctop", "x0")

    def iter_unique(chars: list):
        # Group identical-looking chars, cluster each group by position,
        # and keep only the top-left-most char of every cluster.
        for _, grp_chars in itertools.groupby(sorted(chars, key=key), key=key):
            for y_cluster in cluster_objects(
                list(grp_chars), itemgetter("doctop"), tolerance
            ):
                for x_cluster in cluster_objects(
                    y_cluster, itemgetter("x0"), tolerance
                ):
                    yield min(x_cluster, key=pos_key)

    # Restore the original document order of the survivors.
    return sorted(iter_unique(chars), key=chars.index)


def line_to_edge(line):
    """Return a copy of `line` tagged with its orientation."""
    edge = dict(line)
    edge["orientation"] = "h" if line["top"] == line["bottom"] else "v"
    return edge


def rect_to_edges(rect) -> list:
    """Decompose a rect object into its four boundary edges."""
    top, bottom, left, right = (dict(rect) for _ in range(4))
    top.update(
        {
            "object_type": "rect_edge",
            "height": 0,
            "y0": rect["y1"],
            "bottom": rect["top"],
            "orientation": "h",
        }
    )
    bottom.update(
        {
            "object_type": "rect_edge",
            "height": 0,
            "y1": rect["y0"],
            "top": rect["top"] + rect["height"],
            "doctop": rect["doctop"] + rect["height"],
            "orientation": "h",
        }
    )
    left.update(
        {
            "object_type": "rect_edge",
            "width": 0,
            "x1": rect["x0"],
            "orientation": "v",
        }
    )
    right.update(
        {
            "object_type": "rect_edge",
            "width": 0,
            "x0": rect["x1"],
            "orientation": "v",
        }
    )
    return [top, bottom, left, right]


def curve_to_edges(curve) -> list:
    """Convert every segment of a curve's point list into an edge dict."""
    doctop_offset = curve["doctop"] - curve["top"]
    edges = []
    for p0, p1 in zip(curve["pts"], curve["pts"][1:]):
        if p0[0] == p1[0]:
            orientation = "v"
        elif p0[1] == p1[1]:
            orientation = "h"
        else:
            orientation = None  # slanted segment
        edges.append(
            {
                "object_type": "curve_edge",
                "x0": min(p0[0], p1[0]),
                "x1": max(p0[0], p1[0]),
                "top": min(p0[1], p1[1]),
                "doctop": min(p0[1], p1[1]) + doctop_offset,
                "bottom": max(p0[1], p1[1]),
                "width": abs(p0[0] - p1[0]),
                "height": abs(p0[1] - p1[1]),
                "orientation": orientation,
            }
        )
    return edges
"x1": max(p0[0], p1[0]), "top": min(p0[1], p1[1]), "doctop": min(p0[1], p1[1]) + (curve["doctop"] - curve["top"]), "bottom": max(p0[1], p1[1]), "width": abs(p0[0] - p1[0]), "height": abs(p0[1] - p1[1]), "orientation": "v" if p0[0] == p1[0] else ("h" if p0[1] == p1[1] else None), } for p0, p1 in point_pairs ] def obj_to_edges(obj) -> list: t = obj["object_type"] if "_edge" in t: return [obj] elif t == "line": return [line_to_edge(obj)] else: return {"rect": rect_to_edges, "curve": curve_to_edges}[t](obj) def filter_edges( edges, orientation=None, edge_type=None, min_length=1, ) -> list: if orientation not in ("v", "h", None): raise ValueError("Orientation must be 'v' or 'h'") def test(e) -> bool: dim = "height" if e["orientation"] == "v" else "width" et_correct = e["object_type"] == edge_type if edge_type is not None else True orient_correct = orientation is None or e["orientation"] == orientation return bool(et_correct and orient_correct and (e[dim] >= min_length)) return list(filter(test, edges)) def cluster_list(xs, tolerance=0) -> list: if tolerance == 0: return [[x] for x in sorted(xs)] if len(xs) < 2: return [[x] for x in sorted(xs)] groups = [] xs = list(sorted(xs)) current_group = [xs[0]] last = xs[0] for x in xs[1:]: if x <= (last + tolerance): current_group.append(x) else: groups.append(current_group) current_group = [x] last = x groups.append(current_group) return groups def make_cluster_dict(values, tolerance) -> dict: clusters = cluster_list(list(set(values)), tolerance) nested_tuples = [ [(val, i) for val in value_cluster] for i, value_cluster in enumerate(clusters) ] return dict(itertools.chain(*nested_tuples)) def cluster_objects(xs, key_fn, tolerance) -> list: if not callable(key_fn): key_fn = itemgetter(key_fn) values = map(key_fn, xs) cluster_dict = make_cluster_dict(values, tolerance) get_0, get_1 = itemgetter(0), itemgetter(1) cluster_tuples = sorted(((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1) grouped = 
def move_object(obj, axis: str, value):
    """Return a copy of `obj` shifted by `value` along `axis` ("h"/"v")."""
    assert axis in ("h", "v")
    if axis == "h":
        new_items = [
            ("x0", obj["x0"] + value),
            ("x1", obj["x1"] + value),
        ]
    else:
        new_items = [
            ("top", obj["top"] + value),
            ("bottom", obj["bottom"] + value),
        ]
    if "doctop" in obj:
        new_items.append(("doctop", obj["doctop"] + value))
    if "y0" in obj:
        # y0/y1 run in the opposite direction, hence the sign flip.
        new_items.append(("y0", obj["y0"] - value))
        new_items.append(("y1", obj["y1"] - value))
    # Later pairs override earlier ones when the mapping is rebuilt.
    return obj.__class__(tuple(obj.items()) + tuple(new_items))


def snap_objects(objs, attr: str, tolerance) -> list:
    """Align each cluster of objects (clustered on `attr` within
    `tolerance`) to the cluster's average `attr` value."""
    axis = {"x0": "h", "x1": "h", "top": "v", "bottom": "v"}[attr]
    clusters = cluster_objects(list(objs), itemgetter(attr), tolerance)
    snapped = []
    for cluster in clusters:
        avg = sum(obj[attr] for obj in cluster) / len(cluster)
        snapped.extend(move_object(obj, axis, avg - obj[attr]) for obj in cluster)
    return snapped


def snap_edges(
    edges,
    x_tolerance=DEFAULT_SNAP_TOLERANCE,
    y_tolerance=DEFAULT_SNAP_TOLERANCE,
):
    """
    Given a list of edges, snap any within the given tolerances of one
    another to their positional average (vertical edges on x0,
    horizontal edges on top).
    """
    by_orientation = {"v": [], "h": []}
    for edge in edges:
        by_orientation[edge["orientation"]].append(edge)

    snapped_v = snap_objects(by_orientation["v"], "x0", x_tolerance)
    snapped_h = snap_objects(by_orientation["h"], "top", y_tolerance)
    return snapped_v + snapped_h
""" by_orientation = {"v": [], "h": []} for e in edges: by_orientation[e["orientation"]].append(e) snapped_v = snap_objects(by_orientation["v"], "x0", x_tolerance) snapped_h = snap_objects(by_orientation["h"], "top", y_tolerance) return snapped_v + snapped_h def resize_object(obj, key: str, value): assert key in ("x0", "x1", "top", "bottom") old_value = obj[key] diff = value - old_value new_items = [ (key, value), ] if key == "x0": assert value <= obj["x1"] new_items.append(("width", obj["x1"] - value)) elif key == "x1": assert value >= obj["x0"] new_items.append(("width", value - obj["x0"])) elif key == "top": assert value <= obj["bottom"] new_items.append(("doctop", obj["doctop"] + diff)) new_items.append(("height", obj["height"] - diff)) if "y1" in obj: new_items.append(("y1", obj["y1"] - diff)) elif key == "bottom": assert value >= obj["top"] new_items.append(("height", obj["height"] + diff)) if "y0" in obj: new_items.append(("y0", obj["y0"] - diff)) return obj.__class__(tuple(obj.items()) + tuple(new_items)) def join_edge_group(edges, orientation: str, tolerance=DEFAULT_JOIN_TOLERANCE): """ Given a list of edges along the same infinite line, join those that are within `tolerance` pixels of one another. 
""" if orientation == "h": min_prop, max_prop = "x0", "x1" elif orientation == "v": min_prop, max_prop = "top", "bottom" else: raise ValueError("Orientation must be 'v' or 'h'") sorted_edges = list(sorted(edges, key=itemgetter(min_prop))) joined = [sorted_edges[0]] for e in sorted_edges[1:]: last = joined[-1] if e[min_prop] <= (last[max_prop] + tolerance): if e[max_prop] > last[max_prop]: # Extend current edge to new extremity joined[-1] = resize_object(last, max_prop, e[max_prop]) else: # Edge is separate from previous edges joined.append(e) return joined def merge_edges( edges, snap_x_tolerance, snap_y_tolerance, join_x_tolerance, join_y_tolerance, ): """ Using the `snap_edges` and `join_edge_group` methods above, merge a list of edges into a more "seamless" list. """ def get_group(edge): if edge["orientation"] == "h": return ("h", edge["top"]) else: return ("v", edge["x0"]) if snap_x_tolerance > 0 or snap_y_tolerance > 0: edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance) _sorted = sorted(edges, key=get_group) edge_groups = itertools.groupby(_sorted, key=get_group) edge_gen = ( join_edge_group( items, k[0], (join_x_tolerance if k[0] == "h" else join_y_tolerance) ) for k, items in edge_groups ) edges = list(itertools.chain(*edge_gen)) return edges def bbox_to_rect(bbox) -> dict: """ Return the rectangle (i.e a dict with keys "x0", "top", "x1", "bottom") for an object. """ return {"x0": bbox[0], "top": bbox[1], "x1": bbox[2], "bottom": bbox[3]} def objects_to_rect(objects) -> dict: """ Given an iterable of objects, return the smallest rectangle (i.e. a dict with "x0", "top", "x1", and "bottom" keys) that contains them all. """ return bbox_to_rect(objects_to_bbox(objects)) def merge_bboxes(bboxes): """ Given an iterable of bounding boxes, return the smallest bounding box that contains them all. 
""" x0, top, x1, bottom = zip(*bboxes) return (min(x0), min(top), max(x1), max(bottom)) def objects_to_bbox(objects): """ Given an iterable of objects, return the smallest bounding box that contains them all. """ return merge_bboxes(map(bbox_getter, objects)) def words_to_edges_h(words, word_threshold: int = DEFAULT_MIN_WORDS_HORIZONTAL): """ Find (imaginary) horizontal lines that connect the tops of at least `word_threshold` words. """ by_top = cluster_objects(words, itemgetter("top"), 1) large_clusters = filter(lambda x: len(x) >= word_threshold, by_top) rects = list(map(objects_to_rect, large_clusters)) if len(rects) == 0: return [] min_x0 = min(map(itemgetter("x0"), rects)) max_x1 = max(map(itemgetter("x1"), rects)) edges = [] for r in rects: edges += [ # Top of text { "x0": min_x0, "x1": max_x1, "top": r["top"], "bottom": r["top"], "width": max_x1 - min_x0, "orientation": "h", }, # For each detected row, we also add the 'bottom' line. This will # generate extra edges, (some will be redundant with the next row # 'top' line), but this catches the last row of every table. { "x0": min_x0, "x1": max_x1, "top": r["bottom"], "bottom": r["bottom"], "width": max_x1 - min_x0, "orientation": "h", }, ] return edges def get_bbox_overlap(a, b): a_left, a_top, a_right, a_bottom = a b_left, b_top, b_right, b_bottom = b o_left = max(a_left, b_left) o_right = min(a_right, b_right) o_bottom = min(a_bottom, b_bottom) o_top = max(a_top, b_top) o_width = o_right - o_left o_height = o_bottom - o_top if o_height >= 0 and o_width >= 0 and o_height + o_width > 0: return (o_left, o_top, o_right, o_bottom) else: return None def words_to_edges_v(words, word_threshold: int = DEFAULT_MIN_WORDS_VERTICAL): """ Find (imaginary) vertical lines that connect the left, right, or center of at least `word_threshold` words. 
""" # Find words that share the same left, right, or centerpoints by_x0 = cluster_objects(words, itemgetter("x0"), 1) by_x1 = cluster_objects(words, itemgetter("x1"), 1) def get_center(word): return float(word["x0"] + word["x1"]) / 2 by_center = cluster_objects(words, get_center, 1) clusters = by_x0 + by_x1 + by_center # Find the points that align with the most words sorted_clusters = sorted(clusters, key=lambda x: -len(x)) large_clusters = filter(lambda x: len(x) >= word_threshold, sorted_clusters) # For each of those points, find the bboxes fitting all matching words bboxes = list(map(objects_to_bbox, large_clusters)) # Iterate through those bboxes, condensing overlapping bboxes condensed_bboxes = [] for bbox in bboxes: overlap = any(get_bbox_overlap(bbox, c) for c in condensed_bboxes) if not overlap: condensed_bboxes.append(bbox) if len(condensed_bboxes) == 0: return [] condensed_rects = map(bbox_to_rect, condensed_bboxes) sorted_rects = list(sorted(condensed_rects, key=itemgetter("x0"))) max_x1 = max(map(itemgetter("x1"), sorted_rects)) min_top = min(map(itemgetter("top"), sorted_rects)) max_bottom = max(map(itemgetter("bottom"), sorted_rects)) return [ { "x0": b["x0"], "x1": b["x0"], "top": min_top, "bottom": max_bottom, "height": max_bottom - min_top, "orientation": "v", } for b in sorted_rects ] + [ { "x0": max_x1, "x1": max_x1, "top": min_top, "bottom": max_bottom, "height": max_bottom - min_top, "orientation": "v", } ] def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1) -> dict: """ Given a list of edges, return the points at which they intersect within `tolerance` pixels. 
""" intersections = {} v_edges, h_edges = [ list(filter(lambda x: x["orientation"] == o, edges)) for o in ("v", "h") ] for v in sorted(v_edges, key=itemgetter("x0", "top")): for h in sorted(h_edges, key=itemgetter("top", "x0")): if ( (v["top"] <= (h["top"] + y_tolerance)) and (v["bottom"] >= (h["top"] - y_tolerance)) and (v["x0"] >= (h["x0"] - x_tolerance)) and (v["x0"] <= (h["x1"] + x_tolerance)) ): vertex = (v["x0"], h["top"]) if vertex not in intersections: intersections[vertex] = {"v": [], "h": []} intersections[vertex]["v"].append(v) intersections[vertex]["h"].append(h) return intersections def obj_to_bbox(obj): """ Return the bounding box for an object. """ return bbox_getter(obj) def intersections_to_cells(intersections): """ Given a list of points (`intersections`), return all rectangular "cells" that those points describe. `intersections` should be a dictionary with (x0, top) tuples as keys, and a list of edge objects as values. The edge objects should correspond to the edges that touch the intersection. 
""" def edge_connects(p1, p2) -> bool: def edges_to_set(edges): return set(map(obj_to_bbox, edges)) if p1[0] == p2[0]: common = edges_to_set(intersections[p1]["v"]).intersection( edges_to_set(intersections[p2]["v"]) ) if len(common): return True if p1[1] == p2[1]: common = edges_to_set(intersections[p1]["h"]).intersection( edges_to_set(intersections[p2]["h"]) ) if len(common): return True return False points = list(sorted(intersections.keys())) n_points = len(points) def find_smallest_cell(points, i: int): if i == n_points - 1: return None pt = points[i] rest = points[i + 1 :] # Get all the points directly below and directly right below = [x for x in rest if x[0] == pt[0]] right = [x for x in rest if x[1] == pt[1]] for below_pt in below: if not edge_connects(pt, below_pt): continue for right_pt in right: if not edge_connects(pt, right_pt): continue bottom_right = (right_pt[0], below_pt[1]) if ( (bottom_right in intersections) and edge_connects(bottom_right, right_pt) and edge_connects(bottom_right, below_pt) ): return (pt[0], pt[1], bottom_right[0], bottom_right[1]) return None cell_gen = (find_smallest_cell(points, i) for i in range(len(points))) return list(filter(None, cell_gen)) def cells_to_tables(page, cells) -> list: """ Given a list of bounding boxes (`cells`), return a list of tables that hold those cells most simply (and contiguously). """ def bbox_to_corners(bbox) -> tuple: x0, top, x1, bottom = bbox return ((x0, top), (x0, bottom), (x1, top), (x1, bottom)) remaining_cells = list(cells) # Iterate through the cells found above, and assign them # to contiguous tables current_corners = set() current_cells = [] tables = [] while len(remaining_cells): initial_cell_count = len(current_cells) for cell in list(remaining_cells): cell_corners = bbox_to_corners(cell) # If we're just starting a table ... if len(current_cells) == 0: # ... 
immediately assign it to the empty group current_corners |= set(cell_corners) current_cells.append(cell) remaining_cells.remove(cell) else: # How many corners does this table share with the current group? corner_count = sum(c in current_corners for c in cell_corners) # If touching on at least one corner... if corner_count > 0: # ... assign it to the current group current_corners |= set(cell_corners) current_cells.append(cell) remaining_cells.remove(cell) # If this iteration did not find any more cells to append... if len(current_cells) == initial_cell_count: # ... start a new cell group tables.append(list(current_cells)) current_corners.clear() current_cells.clear() # Once we have exhausting the list of cells ... # ... and we have a cell group that has not been stored if len(current_cells): # ... store it. tables.append(list(current_cells)) # PyMuPDF modification: # Remove tables without text or having only 1 column for i in range(len(tables) - 1, -1, -1): r = EMPTY_RECT() x1_vals = set() x0_vals = set() for c in tables[i]: r |= c x1_vals.add(c[2]) x0_vals.add(c[0]) if ( len(x1_vals) < 2 or len(x0_vals) < 2 or white_spaces.issuperset( page.get_textbox( r, textpage=TEXTPAGE, ) ) ): del tables[i] # Sort the tables top-to-bottom-left-to-right based on the value of the # topmost-and-then-leftmost coordinate of a table. 
_sorted = sorted(tables, key=lambda t: min((c[1], c[0]) for c in t)) return _sorted class CellGroup: def __init__(self, cells): self.cells = cells self.bbox = ( min(map(itemgetter(0), filter(None, cells))), min(map(itemgetter(1), filter(None, cells))), max(map(itemgetter(2), filter(None, cells))), max(map(itemgetter(3), filter(None, cells))), ) class TableRow(CellGroup): pass class TableHeader: """PyMuPDF extension containing the identified table header.""" def __init__(self, bbox, cells, names, above): self.bbox = bbox self.cells = cells self.names = names self.external = above class Table: def __init__(self, page, cells): self.page = page self.cells = cells self.header = self._get_header() # PyMuPDF extension @property def bbox(self): c = self.cells return ( min(map(itemgetter(0), c)), min(map(itemgetter(1), c)), max(map(itemgetter(2), c)), max(map(itemgetter(3), c)), ) @property def rows(self) -> list: _sorted = sorted(self.cells, key=itemgetter(1, 0)) xs = list(sorted(set(map(itemgetter(0), self.cells)))) rows = [] for y, row_cells in itertools.groupby(_sorted, itemgetter(1)): xdict = {cell[0]: cell for cell in row_cells} row = TableRow([xdict.get(x) for x in xs]) rows.append(row) return rows @property def row_count(self) -> int: # PyMuPDF extension return len(self.rows) @property def col_count(self) -> int: # PyMuPDF extension return max([len(r.cells) for r in self.rows]) def extract(self, **kwargs) -> list: chars = CHARS table_arr = [] def char_in_bbox(char, bbox) -> bool: v_mid = (char["top"] + char["bottom"]) / 2 h_mid = (char["x0"] + char["x1"]) / 2 x0, top, x1, bottom = bbox return bool( (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom) ) for row in self.rows: arr = [] row_chars = [char for char in chars if char_in_bbox(char, row.bbox)] for cell in row.cells: if cell is None: cell_text = None else: cell_chars = [ char for char in row_chars if char_in_bbox(char, cell) ] if len(cell_chars): kwargs["x_shift"] = cell[0] kwargs["y_shift"] 
= cell[1] if "layout" in kwargs: kwargs["layout_width"] = cell[2] - cell[0] kwargs["layout_height"] = cell[3] - cell[1] cell_text = extract_text(cell_chars, **kwargs) else: cell_text = "" arr.append(cell_text) table_arr.append(arr) return table_arr def to_markdown(self, clean=True): """Output table content as a string in Github-markdown format. If clean is true, markdown syntax is removed from cell content.""" output = "|" # generate header string and MD underline for i, name in enumerate(self.header.names): if name is None or name == "": # generate a name if empty name = f"Col{i+1}" name = name.replace("\n", " ") # remove any line breaks if clean: # remove sensitive syntax name = html.escape(name.replace("-", "-")) output += name + "|" output += "\n" output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n" # skip first row in details if header is part of the table j = 0 if self.header.external else 1 # iterate over detail rows for row in self.extract()[j:]: line = "|" for i, cell in enumerate(row): # output None cells with empty string cell = "" if cell is None else cell.replace("\n", " ") if clean: # remove sensitive syntax cell = html.escape(cell.replace("-", "-")) line += cell + "|" line += "\n" output += line return output + "\n" def to_pandas(self, **kwargs): """Return a pandas DataFrame version of the table.""" try: import pandas as pd except ModuleNotFoundError: message("Package 'pandas' is not installed") raise pd_dict = {} extract = self.extract() hdr = self.header names = self.header.names hdr_len = len(names) # ensure uniqueness of column names for i in range(hdr_len): name = names[i] if not name: names[i] = f"Col{i}" if hdr_len != len(set(names)): for i in range(hdr_len): name = names[i] if name != f"Col{i}": names[i] = f"{i}-{name}" if not hdr.external: # header is part of 'extract' extract = extract[1:] for i in range(hdr_len): key = names[i] value = [] for j in range(len(extract)): value.append(extract[j][i]) pd_dict[key] = value 
return pd.DataFrame(pd_dict) def _get_header(self, y_tolerance=3): """Identify the table header. *** PyMuPDF extension. *** Starting from the first line above the table upwards, check if it qualifies to be part of the table header. Criteria include: * A one-line table never has an extra header. * Column borders must not intersect any word. If this happens, all text of this line and above of it is ignored. * No excess inter-line distance: If a line further up has a distance of more than 1.5 times of its font size, it will be ignored and all lines above of it. * Must have same text properties. * Starting with the top table line, a bold text property cannot change back to non-bold. If not all criteria are met (or there is no text above the table), the first table row is assumed to be the header. """ page = self.page y_delta = y_tolerance def top_row_is_bold(bbox): """Check if row 0 has bold text anywhere. If this is true, then any non-bold text in lines above disqualify these lines as header. bbox is the (potentially repaired) row 0 bbox. Returns True or False """ for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]: for l in b["lines"]: for s in l["spans"]: if s["flags"] & 16: return True return False try: row = self.rows[0] cells = row.cells bbox = Rect(row.bbox) except IndexError: # this table has no rows return None # return this if we determine that the top row is the header header_top_row = TableHeader(bbox, cells, self.extract()[0], False) # one-line tables have no extra header if len(self.rows) < 2: return header_top_row # x-ccordinates of columns between x0 and x1 of the table if len(cells) < 2: return header_top_row col_x = [ c[2] if c is not None else None for c in cells[:-1] ] # column (x) coordinates # Special check: is top row bold? 
# If first line above table is not bold, but top-left table cell is bold, # we take first table row as header top_row_bold = top_row_is_bold(bbox) # clip = area above table # We will inspect this area for text qualifying as column header. clip = +bbox # take row 0 bbox clip.y0 = 0 # start at top of page clip.y1 = bbox.y0 # end at top of table spans = [] # the text spans inside clip for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]: for l in b["lines"]: for s in l["spans"]: if ( not s["flags"] & 1 and s["text"].strip() ): # ignore superscripts and empty text spans.append(s) select = [] # y1 coordinates above, sorted descending line_heights = [] # line heights above, sorted descending line_bolds = [] # bold indicator per line above, same sorting # spans sorted descending spans.sort(key=lambda s: s["bbox"][3], reverse=True) # walk through the spans and fill above 3 lists for i in range(len(spans)): s = spans[i] y1 = s["bbox"][3] # span bottom h = y1 - s["bbox"][1] # span bbox height bold = s["flags"] & 16 # use first item to start the lists if i == 0: select.append(y1) line_heights.append(h) line_bolds.append(bold) continue # get last items from the 3 lists y0 = select[-1] h0 = line_heights[-1] bold0 = line_bolds[-1] if bold0 and not bold: break # stop if switching from bold to non-bold # if fitting in height of previous span, modify bbox if y0 - y1 <= y_delta or abs((y0 - h0) - s["bbox"][1]) <= y_delta: s["bbox"] = (s["bbox"][0], y0 - h0, s["bbox"][2], y0) spans[i] = s if bold: line_bolds[-1] = bold continue elif y0 - y1 > 1.5 * h0: break # stop if distance to previous line too large select.append(y1) line_heights.append(h) line_bolds.append(bold) if select == []: # nothing above the table? 
return header_top_row select = select[:5] # only accept up to 5 lines in any header # take top row as header if text above table is too far apart if bbox.y0 - select[0] >= line_heights[0]: return header_top_row # if top table row is bold, but line above is not: if top_row_bold and not line_bolds[0]: return header_top_row if spans == []: # nothing left above the table, return top row return header_top_row # re-compute clip above table nclip = EMPTY_RECT() for s in [s for s in spans if s["bbox"][3] >= select[-1]]: nclip |= s["bbox"] if not nclip.is_empty: clip = nclip clip.y1 = bbox.y0 # make sure we still include every word above # Confirm that no word in clip is intersecting a column separator word_rects = [Rect(w[:4]) for w in page.get_text("words", clip=clip)] word_tops = sorted(list(set([r[1] for r in word_rects])), reverse=True) select = [] # exclude lines with words that intersect a column border for top in word_tops: intersecting = [ (x, r) for x in col_x if x is not None for r in word_rects if r[1] == top and r[0] < x and r[2] > x ] if intersecting == []: select.append(top) else: # detected a word crossing a column border break if select == []: # nothing left over: return first row return header_top_row hdr_bbox = +clip # compute the header cells hdr_bbox.y0 = select[-1] # hdr_bbox top is smallest top coord of words hdr_cells = [ (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None for c in cells ] # adjust left/right of header bbox hdr_bbox.x0 = self.bbox[0] hdr_bbox.x1 = self.bbox[2] # column names: no line breaks, no excess spaces hdr_names = [ ( page.get_textbox(c).replace("\n", " ").replace(" ", " ").strip() if c is not None else "" ) for c in hdr_cells ] return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True) @dataclass class TableSettings: vertical_strategy: str = "lines" horizontal_strategy: str = "lines" explicit_vertical_lines: list = None explicit_horizontal_lines: list = None snap_tolerance: float = DEFAULT_SNAP_TOLERANCE 
snap_x_tolerance: float = UNSET snap_y_tolerance: float = UNSET join_tolerance: float = DEFAULT_JOIN_TOLERANCE join_x_tolerance: float = UNSET join_y_tolerance: float = UNSET edge_min_length: float = 3 min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL intersection_tolerance: float = 3 intersection_x_tolerance: float = UNSET intersection_y_tolerance: float = UNSET text_settings: dict = None def __post_init__(self) -> "TableSettings": """Clean up user-provided table settings. Validates that the table settings provided consists of acceptable values and returns a cleaned up version. The cleaned up version fills out the missing values with the default values in the provided settings. TODO: Can be further used to validate that the values are of the correct type. For example, raising a value error when a non-boolean input is provided for the key ``keep_blank_chars``. :param table_settings: User-provided table settings. :returns: A cleaned up version of the user-provided table settings. :raises ValueError: When an unrecognised key is provided. 
""" for setting in NON_NEGATIVE_SETTINGS: if (getattr(self, setting) or 0) < 0: raise ValueError(f"Table setting '{setting}' cannot be negative") for orientation in ["horizontal", "vertical"]: strategy = getattr(self, orientation + "_strategy") if strategy not in TABLE_STRATEGIES: raise ValueError( f"{orientation}_strategy must be one of" f'{{{",".join(TABLE_STRATEGIES)}}}' ) if self.text_settings is None: self.text_settings = {} # This next section is for backwards compatibility for attr in ["x_tolerance", "y_tolerance"]: if attr not in self.text_settings: self.text_settings[attr] = self.text_settings.get("tolerance", 3) if "tolerance" in self.text_settings: del self.text_settings["tolerance"] # End of that section for attr, fallback in [ ("snap_x_tolerance", "snap_tolerance"), ("snap_y_tolerance", "snap_tolerance"), ("join_x_tolerance", "join_tolerance"), ("join_y_tolerance", "join_tolerance"), ("intersection_x_tolerance", "intersection_tolerance"), ("intersection_y_tolerance", "intersection_tolerance"), ]: if getattr(self, attr) is UNSET: setattr(self, attr, getattr(self, fallback)) return self @classmethod def resolve(cls, settings=None): if settings is None: return cls() elif isinstance(settings, cls): return settings elif isinstance(settings, dict): core_settings = {} text_settings = {} for k, v in settings.items(): if k[:5] == "text_": text_settings[k[5:]] = v else: core_settings[k] = v core_settings["text_settings"] = text_settings return cls(**core_settings) else: raise ValueError(f"Cannot resolve settings: {settings}") class TableFinder: """ Given a PDF page, find plausible table structures. Largely borrowed from Anssi Nurminen's master's thesis: http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 ... 
and inspired by Tabula: https://github.com/tabulapdf/tabula-extractor/issues/16 """ def __init__(self, page, settings=None): self.page = page self.settings = TableSettings.resolve(settings) self.edges = self.get_edges() self.intersections = edges_to_intersections( self.edges, self.settings.intersection_x_tolerance, self.settings.intersection_y_tolerance, ) self.cells = intersections_to_cells(self.intersections) self.tables = [ Table(self.page, cell_group) for cell_group in cells_to_tables(self.page, self.cells) ] def get_edges(self) -> list: settings = self.settings for orientation in ["vertical", "horizontal"]: strategy = getattr(settings, orientation + "_strategy") if strategy == "explicit": lines = getattr(settings, "explicit_" + orientation + "_lines") if len(lines) < 2: raise ValueError( f"If {orientation}_strategy == 'explicit', " f"explicit_{orientation}_lines " f"must be specified as a list/tuple of two or more " f"floats/ints." ) v_strat = settings.vertical_strategy h_strat = settings.horizontal_strategy if v_strat == "text" or h_strat == "text": words = extract_words(CHARS, **(settings.text_settings or {})) else: words = [] v_explicit = [] for desc in settings.explicit_vertical_lines or []: if isinstance(desc, dict): for e in obj_to_edges(desc): if e["orientation"] == "v": v_explicit.append(e) else: v_explicit.append( { "x0": desc, "x1": desc, "top": self.page.rect[1], "bottom": self.page.rect[3], "height": self.page.rect[3] - self.page.rect[1], "orientation": "v", } ) if v_strat == "lines": v_base = filter_edges(EDGES, "v") elif v_strat == "lines_strict": v_base = filter_edges(EDGES, "v", edge_type="line") elif v_strat == "text": v_base = words_to_edges_v(words, word_threshold=settings.min_words_vertical) elif v_strat == "explicit": v_base = [] else: v_base = [] v = v_base + v_explicit h_explicit = [] for desc in settings.explicit_horizontal_lines or []: if isinstance(desc, dict): for e in obj_to_edges(desc): if e["orientation"] == "h": 
h_explicit.append(e) else: h_explicit.append( { "x0": self.page.rect[0], "x1": self.page.rect[2], "width": self.page.rect[2] - self.page.rect[0], "top": desc, "bottom": desc, "orientation": "h", } ) if h_strat == "lines": h_base = filter_edges(EDGES, "h") elif h_strat == "lines_strict": h_base = filter_edges(EDGES, "h", edge_type="line") elif h_strat == "text": h_base = words_to_edges_h( words, word_threshold=settings.min_words_horizontal ) elif h_strat == "explicit": h_base = [] else: h_base = [] h = h_base + h_explicit edges = list(v) + list(h) edges = merge_edges( edges, snap_x_tolerance=settings.snap_x_tolerance, snap_y_tolerance=settings.snap_y_tolerance, join_x_tolerance=settings.join_x_tolerance, join_y_tolerance=settings.join_y_tolerance, ) return filter_edges(edges, min_length=settings.edge_min_length) def __getitem__(self, i): tcount = len(self.tables) if i >= tcount: raise IndexError("table not on page") while i < 0: i += tcount return self.tables[i] """ Start of PyMuPDF interface code. The following functions are executed when "page.find_tables()" is called. * make_chars: Fills the CHARS list with text character information extracted via "rawdict" text extraction. Items in CHARS are formatted as expected by the table code. * make_edges: Fills the EDGES list with vector graphic information extracted via "get_drawings". Items in EDGES are formatted as expected by the table code. The lists CHARS and EDGES are used to replace respective document access of pdfplumber or, respectively pdfminer. The table code has been modified to use these lists instead of accessing page information themselves. 
""" # ----------------------------------------------------------------------------- # Extract all page characters to fill the CHARS list # ----------------------------------------------------------------------------- def make_chars(page, clip=None): """Extract text as "rawdict" to fill CHARS.""" global CHARS, TEXTPAGE page_number = page.number + 1 page_height = page.rect.height ctm = page.transformation_matrix TEXTPAGE = page.get_textpage(clip=clip, flags=TEXTFLAGS_TEXT) blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"] doctop_base = page_height * page.number for block in blocks: for line in block["lines"]: ldir = line["dir"] # = (cosine, sine) of angle ldir = (round(ldir[0], 4), round(ldir[1], 4)) matrix = Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0) if ldir[1] == 0: upright = True else: upright = False for span in sorted(line["spans"], key=lambda s: s["bbox"][0]): fontname = span["font"] fontsize = span["size"] color = sRGB_to_pdf(span["color"]) for char in sorted(span["chars"], key=lambda c: c["bbox"][0]): bbox = Rect(char["bbox"]) bbox_ctm = bbox * ctm origin = Point(char["origin"]) * ctm matrix.e = origin.x matrix.f = origin.y text = char["c"] char_dict = { "adv": bbox.x1 - bbox.x0 if upright else bbox.y1 - bbox.y0, "bottom": bbox.y1, "doctop": bbox.y0 + doctop_base, "fontname": fontname, "height": bbox.y1 - bbox.y0, "matrix": tuple(matrix), "ncs": "DeviceRGB", "non_stroking_color": color, "non_stroking_pattern": None, "object_type": "char", "page_number": page_number, "size": fontsize if upright else bbox.y1 - bbox.y0, "stroking_color": color, "stroking_pattern": None, "text": text, "top": bbox.y0, "upright": upright, "width": bbox.x1 - bbox.x0, "x0": bbox.x0, "x1": bbox.x1, "y0": bbox_ctm.y0, "y1": bbox_ctm.y1, } CHARS.append(char_dict) # ------------------------------------------------------------------------ # Extract all page vector graphics to fill the EDGES list. 
# We are ignoring Bézier curves completely and are converting everything
# else to lines.
# ------------------------------------------------------------------------
def make_edges(page, clip=None, tset=None, add_lines=None):
    """Fill the global EDGES list with axis-parallel line segments derived
    from the page's vector graphics (lines, rectangles, quads), optionally
    restricted to 'clip' and extended by user-supplied 'add_lines'.

    'tset' is the resolved TableSettings object providing the tolerances.
    """
    global EDGES
    snap_x = tset.snap_x_tolerance
    snap_y = tset.snap_y_tolerance
    # "lines_strict" in either direction suppresses fill-only graphics
    lines_strict = (
        tset.vertical_strategy == "lines_strict"
        or tset.horizontal_strategy == "lines_strict"
    )
    page_height = page.rect.height
    doctop_basis = page.number * page_height
    page_number = page.number + 1
    prect = page.rect
    if page.rotation in (90, 270):
        # swap width and height for these rotations
        w, h = prect.br
        prect = Rect(0, 0, h, w)
    if clip is not None:
        clip = Rect(clip)
    else:
        clip = prect

    def are_neighbors(r1, r2):
        """Detect whether r1, r2 are neighbors.

        Defined as: The minimum distance between points of r1 and points of
        r2 is not larger than some delta.

        This check supports empty rect-likes and thus also lines.

        Note: This type of check is MUCH faster than native Rect containment
        checks.
        """
        if (
            # check if x-coordinates of r1 are within those of r2
            r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
            or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
        ) and (
            # ... same for y-coordinates
            r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
            or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
        ):
            return True

        # same check with r1 / r2 exchanging their roles (this is necessary!)
        if (
            r1.x0 - snap_x <= r2.x0 <= r1.x1 + snap_x
            or r1.x0 - snap_x <= r2.x1 <= r1.x1 + snap_x
        ) and (
            r1.y0 - snap_y <= r2.y0 <= r1.y1 + snap_y
            or r1.y0 - snap_y <= r2.y1 <= r1.y1 + snap_y
        ):
            return True
        return False

    def clean_graphics():
        """Detect and join rectangles of "connected" vector graphics."""
        paths = []  # paths relevant for table detection
        for p in page.get_drawings():
            # ignore fill-only graphics if they do not simulate lines,
            # which means one of width or height are small.
            if (
                p["type"] == "f"
                and lines_strict
                and p["rect"].width > snap_x
                and p["rect"].height > snap_y
            ):
                continue
            paths.append(p)

        # start with all vector graphics rectangles
        prects = sorted(set([p["rect"] for p in paths]), key=lambda r: (r.y1, r.x0))
        new_rects = []  # the final list of joined rectangles

        # ----------------------------------------------------------------
        # Strategy: Join rectangles that "almost touch" each other.
        # Extend first rectangle with any other that is a "neighbor".
        # Then move it to the final list and continue with the rest.
        # ----------------------------------------------------------------
        while prects:  # the algorithm will empty this list
            prect0 = prects[0]  # copy of first rectangle (performance reasons!)
            repeat = True
            while repeat:  # this loop extends first rect in list
                repeat = False  # set to true again if some other rect touches
                for i in range(len(prects) - 1, 0, -1):  # run backwards
                    if are_neighbors(prect0, prects[i]):  # close enough to rect 0?
                        prect0 |= prects[i].tl  # extend rect 0
                        prect0 |= prects[i].br  # extend rect 0
                        del prects[i]  # delete this rect
                        repeat = True  # keep checking the rest

            # move rect 0 over to result list if there is some text in it
            if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
                # contains text, so accept it as a table bbox candidate
                new_rects.append(prect0)
            del prects[0]  # remove from rect list

        return new_rects, paths

    bboxes, paths = clean_graphics()

    def is_parallel(p1, p2):
        """Check if line is roughly axis-parallel."""
        if abs(p1.x - p2.x) <= snap_x or abs(p1.y - p2.y) <= snap_y:
            return True
        return False

    def make_line(p, p1, p2, clip):
        """Given 2 points, make a line dictionary for table detection.

        Returns an empty dict if the line is not axis-parallel, lies
        entirely outside 'clip', or degenerates to a point after clipping.
        """
        if not is_parallel(p1, p2):  # only accepting axis-parallel lines
            return {}
        # compute the extremal values
        x0 = min(p1.x, p2.x)
        x1 = max(p1.x, p2.x)
        y0 = min(p1.y, p2.y)
        y1 = max(p1.y, p2.y)

        # check for outside clip
        if x0 > clip.x1 or x1 < clip.x0 or y0 > clip.y1 or y1 < clip.y0:
            return {}

        if x0 < clip.x0:
            x0 = clip.x0  # adjust to clip boundary
        if x1 > clip.x1:
            x1 = clip.x1  # adjust to clip boundary
        if y0 < clip.y0:
            y0 = clip.y0  # adjust to clip boundary
        if y1 > clip.y1:
            y1 = clip.y1  # adjust to clip boundary

        width = x1 - x0  # from adjusted values
        height = y1 - y0  # from adjusted values
        if width == height == 0:
            return {}  # nothing left to deal with
        # pdfplumber-style line dictionary; note that "y0"/"y1" are measured
        # from the page bottom (page_height - y), while "top"/"bottom" keep
        # the top-based coordinates.
        line_dict = {
            "x0": x0,
            "y0": page_height - y0,
            "x1": x1,
            "y1": page_height - y1,
            "width": width,
            "height": height,
            "pts": [(x0, y0), (x1, y1)],
            "linewidth": p["width"],
            "stroke": True,
            "fill": False,
            "evenodd": False,
            "stroking_color": p["color"] if p["color"] else p["fill"],
            "non_stroking_color": None,
            "object_type": "line",
            "page_number": page_number,
            "stroking_pattern": None,
            "non_stroking_pattern": None,
            "top": y0,
            "bottom": y1,
            "doctop": y0 + doctop_basis,
        }
        return line_dict

    for p in paths:
        items = p["items"]  # items in this path

        # if 'closePath', add a line from last to first point
        if p["closePath"] and items[0][0] == "l" and items[-1][0] == "l":
            items.append(("l", items[-1][2], items[0][1]))

        for i in items:
            if i[0] not in ("l", "re", "qu"):
                continue  # ignore anything else

            if i[0] == "l":  # a line
                p1, p2 = i[1:]
                line_dict = make_line(p, p1, p2, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

            elif i[0] == "re":  # a rectangle: decompose into 4 lines
                rect = i[1].normalize()  # rectangle itself

                # ignore minute rectangles
                if rect.height <= snap_y and rect.width <= snap_x:
                    continue

                if rect.width <= snap_x:  # simulates a vertical line
                    x = abs(rect.x1 + rect.x0) / 2  # take middle value for x
                    p1 = Point(x, rect.y0)
                    p2 = Point(x, rect.y1)
                    line_dict = make_line(p, p1, p2, clip)
                    if line_dict:
                        EDGES.append(line_to_edge(line_dict))
                    continue

                if rect.height <= snap_y:  # simulates a horizontal line
                    y = abs(rect.y1 + rect.y0) / 2  # take middle value for y
                    p1 = Point(rect.x0, y)
                    p2 = Point(rect.x1, y)
                    line_dict = make_line(p, p1, p2, clip)
                    if line_dict:
                        EDGES.append(line_to_edge(line_dict))
                    continue

                line_dict = make_line(p, rect.tl, rect.bl, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, rect.bl, rect.br, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, rect.br, rect.tr, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, rect.tr, rect.tl, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

            else:  # must be a quad
                # we convert it into (up to) 4 lines
                ul, ur, ll, lr = i[1]

                line_dict = make_line(p, ul, ll, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, ll, lr, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, lr, ur, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, ur, ul, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

    # synthetic path properties for the border lines added below
    path = {"color": (0, 0, 0), "fill": None, "width": 1}
    for bbox in bboxes:  # add the border lines for all enveloping bboxes
        line_dict = make_line(path, bbox.tl, bbox.tr, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

        line_dict = make_line(path, bbox.bl, bbox.br, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

        line_dict = make_line(path, bbox.tl, bbox.bl, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

        line_dict = make_line(path, bbox.tr, bbox.br, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

    if add_lines is not None:  # add user-specified lines
        assert isinstance(add_lines, (tuple, list))
    else:
        add_lines = []
    for p1, p2 in add_lines:
        p1 = Point(p1)
        p2 = Point(p2)
        line_dict = make_line(path, p1, p2, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))


def page_rotation_set0(page):
    """Nullify page rotation.

    To correctly detect tables, page rotation must be zero. This function
    performs the necessary adjustments and returns information for
    reverting these changes.

    Returns (page, xref, rot, mediabox): the refreshed page object, the
    xref of the inserted content stream, and the original rotation and
    mediabox (needed by page_rotation_reset).
    """
    mediabox = page.mediabox
    rot = page.rotation  # contains normalized rotation value

    # need to derotate the page's content
    mb = page.mediabox  # current mediabox

    if rot == 90:
        # before derotation, shift content horizontally
        mat0 = Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0)
    elif rot == 270:
        # before derotation, shift content vertically
        mat0 = Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0)
    else:
        mat0 = Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0)

    # prefix with derotation matrix
    mat = mat0 * page.derotation_matrix
    cmd = b"%g %g %g %g %g %g cm " % tuple(mat)
    xref = TOOLS._insert_contents(page, cmd, 0)

    # swap x- and y-coordinates
    if rot in (90, 270):
        x0, y0, x1, y1 = mb
        mb.x0 = y0
        mb.y0 = x0
        mb.x1 = y1
        mb.y1 = x1
        page.set_mediabox(mb)

    page.set_rotation(0)

    # refresh the page to apply these changes
    doc = page.parent
    pno = page.number
    page = doc[pno]
    return page, xref, rot, mediabox


def page_rotation_reset(page, xref, rot, mediabox):
    """Reset page rotation to original values.

    To be used before we return tables.
    """
    doc = page.parent  # document of the page
    doc.update_stream(xref, b" ")  # remove de-rotation matrix
    page.set_mediabox(mediabox)  # set mediabox to old value
    page.set_rotation(rot)  # set rotation to old value
    pno = page.number
    page = doc[pno]  # update page info
    return page


def find_tables(
    page,
    clip=None,
    vertical_strategy: str = "lines",
    horizontal_strategy: str = "lines",
    vertical_lines: list = None,
    horizontal_lines: list = None,
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE,
    snap_x_tolerance: float = None,
    snap_y_tolerance: float = None,
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE,
    join_x_tolerance: float = None,
    join_y_tolerance: float = None,
    edge_min_length: float = 3,
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL,
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL,
    intersection_tolerance: float = 3,
    intersection_x_tolerance: float = None,
    intersection_y_tolerance: float = None,
    text_tolerance=3,
    text_x_tolerance=3,
    text_y_tolerance=3,
    strategy=None,  # offer abbreviation
    add_lines=None,  # optional user-specified lines
):
    """Find tables on a page and return a TableFinder object.

    Rebuilds the global CHARS and EDGES lists for this page, temporarily
    nullifies page rotation if needed, and restores all global/page state
    before returning.
    """
    global CHARS, EDGES
    CHARS = []
    EDGES = []
    old_small = bool(TOOLS.set_small_glyph_heights())  # save old value
    TOOLS.set_small_glyph_heights(True)  # we need minimum bboxes
    if page.rotation != 0:
        page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)
    else:
        old_xref, old_rot, old_mediabox = None, None, None

    # map the None defaults to the UNSET sentinel expected by TableSettings
    if snap_x_tolerance is None:
        snap_x_tolerance = UNSET
    if snap_y_tolerance is None:
        snap_y_tolerance = UNSET
    if join_x_tolerance is None:
        join_x_tolerance = UNSET
    if join_y_tolerance is None:
        join_y_tolerance = UNSET
    if intersection_x_tolerance is None:
        intersection_x_tolerance = UNSET
    if intersection_y_tolerance is None:
        intersection_y_tolerance = UNSET
    if strategy is not None:  # shorthand for setting both strategies
        vertical_strategy = strategy
        horizontal_strategy = strategy

    settings = {
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "explicit_vertical_lines": vertical_lines,
        "explicit_horizontal_lines": horizontal_lines,
        "snap_tolerance": snap_tolerance,
        "snap_x_tolerance": snap_x_tolerance,
        "snap_y_tolerance": snap_y_tolerance,
        "join_tolerance": join_tolerance,
        "join_x_tolerance": join_x_tolerance,
        "join_y_tolerance": join_y_tolerance,
        "edge_min_length": edge_min_length,
        "min_words_vertical": min_words_vertical,
        "min_words_horizontal": min_words_horizontal,
        "intersection_tolerance": intersection_tolerance,
        "intersection_x_tolerance": intersection_x_tolerance,
        "intersection_y_tolerance": intersection_y_tolerance,
        "text_tolerance": text_tolerance,
        "text_x_tolerance": text_x_tolerance,
        "text_y_tolerance": text_y_tolerance,
    }
    tset = TableSettings.resolve(settings=settings)
    page.table_settings = tset

    make_chars(page, clip=clip)  # create character list of page
    make_edges(page, clip=clip, tset=tset, add_lines=add_lines)  # create lines and curves

    tables = TableFinder(page, settings=tset)
    TOOLS.set_small_glyph_heights(old_small)
    if old_xref is not None:
        page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
    return tables