initial commit

This commit is contained in:
klein panic
2024-09-29 01:46:07 -04:00
commit ba6e6914d1
7576 changed files with 1356825 additions and 0 deletions

View File

@@ -0,0 +1,28 @@
Copyright 2010 Pallets
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,93 @@
Metadata-Version: 2.1
Name: MarkupSafe
Version: 2.1.5
Summary: Safely add untrusted strings to HTML/XML markup.
Home-page: https://palletsprojects.com/p/markupsafe/
Maintainer: Pallets
Maintainer-email: contact@palletsprojects.com
License: BSD-3-Clause
Project-URL: Donate, https://palletsprojects.com/donate
Project-URL: Documentation, https://markupsafe.palletsprojects.com/
Project-URL: Changes, https://markupsafe.palletsprojects.com/changes/
Project-URL: Source Code, https://github.com/pallets/markupsafe/
Project-URL: Issue Tracker, https://github.com/pallets/markupsafe/issues/
Project-URL: Chat, https://discord.gg/pallets
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Web Environment
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
Classifier: Topic :: Text Processing :: Markup :: HTML
Requires-Python: >=3.7
Description-Content-Type: text/x-rst
License-File: LICENSE.rst
MarkupSafe
==========
MarkupSafe implements a text object that escapes characters so it is
safe to use in HTML and XML. Characters that have special meanings are
replaced so that they display as the actual characters. This mitigates
injection attacks, meaning untrusted user input can safely be displayed
on a page.
Installing
----------
Install and update using `pip`_:
.. code-block:: text
pip install -U MarkupSafe
.. _pip: https://pip.pypa.io/en/stable/getting-started/
Examples
--------
.. code-block:: pycon
>>> from markupsafe import Markup, escape
>>> # escape replaces special characters and wraps in Markup
>>> escape("<script>alert(document.cookie);</script>")
Markup('&lt;script&gt;alert(document.cookie);&lt;/script&gt;')
>>> # wrap in Markup to mark text "safe" and prevent escaping
>>> Markup("<strong>Hello</strong>")
Markup('<strong>hello</strong>')
>>> escape(Markup("<strong>Hello</strong>"))
Markup('<strong>hello</strong>')
>>> # Markup is a str subclass
>>> # methods and operators escape their arguments
>>> template = Markup("Hello <em>{name}</em>")
>>> template.format(name='"World"')
Markup('Hello <em>&#34;World&#34;</em>')
Donate
------
The Pallets organization develops and supports MarkupSafe and other
popular packages. In order to grow the community of contributors and
users, and allow the maintainers to devote more time to the projects,
`please donate today`_.
.. _please donate today: https://palletsprojects.com/donate
Links
-----
- Documentation: https://markupsafe.palletsprojects.com/
- Changes: https://markupsafe.palletsprojects.com/changes/
- PyPI Releases: https://pypi.org/project/MarkupSafe/
- Source Code: https://github.com/pallets/markupsafe/
- Issue Tracker: https://github.com/pallets/markupsafe/issues/
- Chat: https://discord.gg/pallets

View File

@@ -0,0 +1,14 @@
MarkupSafe-2.1.5.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
MarkupSafe-2.1.5.dist-info/LICENSE.rst,sha256=SJqOEQhQntmKN7uYPhHg9-HTHwvY-Zp5yESOf_N9B-o,1475
MarkupSafe-2.1.5.dist-info/METADATA,sha256=2dRDPam6OZLfpX0wg1JN5P3u9arqACxVSfdGmsJU7o8,3003
MarkupSafe-2.1.5.dist-info/RECORD,,
MarkupSafe-2.1.5.dist-info/WHEEL,sha256=AI1yqBLEPcVKWn5Ls2uPawjbqPXPFTYdQLSdN8WFCJw,152
MarkupSafe-2.1.5.dist-info/top_level.txt,sha256=qy0Plje5IJuvsCBjejJyhDCjEAdcDLK_2agVcex8Z6U,11
markupsafe/__init__.py,sha256=r7VOTjUq7EMQ4v3p4R1LoVOGJg6ysfYRncLr34laRBs,10958
markupsafe/__pycache__/__init__.cpython-311.pyc,,
markupsafe/__pycache__/_native.cpython-311.pyc,,
markupsafe/_native.py,sha256=GR86Qvo_GcgKmKreA1WmYN9ud17OFwkww8E-fiW-57s,1713
markupsafe/_speedups.c,sha256=X2XvQVtIdcK4Usz70BvkzoOfjTCmQlDkkjYSn-swE0g,7083
markupsafe/_speedups.cpython-311-x86_64-linux-gnu.so,sha256=9PMBIm-zJzHm91NC-mblTC119_dIAldSQ4xFsE1_NPc,53656
markupsafe/_speedups.pyi,sha256=vfMCsOgbAXRNLUXkyuyonG8uEWKYU4PDqNuMaDELAYw,229
markupsafe/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

View File

@@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.42.0)
Root-Is-Purelib: false
Tag: cp311-cp311-manylinux_2_17_x86_64
Tag: cp311-cp311-manylinux2014_x86_64

View File

@@ -0,0 +1,222 @@
# don't import any costly modules
import sys
import os
is_pypy = '__pypy__' in sys.builtin_module_names
def warn_distutils_present():
if 'distutils' not in sys.modules:
return
if is_pypy and sys.version_info < (3, 7):
# PyPy for 3.6 unconditionally imports distutils, so bypass the warning
# https://foss.heptapod.net/pypy/pypy/-/blob/be829135bc0d758997b3566062999ee8b23872b4/lib-python/3/site.py#L250
return
import warnings
warnings.warn(
"Distutils was imported before Setuptools, but importing Setuptools "
"also replaces the `distutils` module in `sys.modules`. This may lead "
"to undesirable behaviors or errors. To avoid these issues, avoid "
"using distutils directly, ensure that setuptools is installed in the "
"traditional way (e.g. not an editable install), and/or make sure "
"that setuptools is always imported before distutils."
)
def clear_distutils():
if 'distutils' not in sys.modules:
return
import warnings
warnings.warn("Setuptools is replacing distutils.")
mods = [
name
for name in sys.modules
if name == "distutils" or name.startswith("distutils.")
]
for name in mods:
del sys.modules[name]
def enabled():
"""
Allow selection of distutils by environment variable.
"""
which = os.environ.get('SETUPTOOLS_USE_DISTUTILS', 'local')
return which == 'local'
def ensure_local_distutils():
import importlib
clear_distutils()
# With the DistutilsMetaFinder in place,
# perform an import to cause distutils to be
# loaded from setuptools._distutils. Ref #2906.
with shim():
importlib.import_module('distutils')
# check that submodules load as expected
core = importlib.import_module('distutils.core')
assert '_distutils' in core.__file__, core.__file__
assert 'setuptools._distutils.log' not in sys.modules
def do_override():
"""
Ensure that the local copy of distutils is preferred over stdlib.
See https://github.com/pypa/setuptools/issues/417#issuecomment-392298401
for more motivation.
"""
if enabled():
warn_distutils_present()
ensure_local_distutils()
class _TrivialRe:
def __init__(self, *patterns):
self._patterns = patterns
def match(self, string):
return all(pat in string for pat in self._patterns)
class DistutilsMetaFinder:
def find_spec(self, fullname, path, target=None):
# optimization: only consider top level modules and those
# found in the CPython test suite.
if path is not None and not fullname.startswith('test.'):
return
method_name = 'spec_for_{fullname}'.format(**locals())
method = getattr(self, method_name, lambda: None)
return method()
def spec_for_distutils(self):
if self.is_cpython():
return
import importlib
import importlib.abc
import importlib.util
try:
mod = importlib.import_module('setuptools._distutils')
except Exception:
# There are a couple of cases where setuptools._distutils
# may not be present:
# - An older Setuptools without a local distutils is
# taking precedence. Ref #2957.
# - Path manipulation during sitecustomize removes
# setuptools from the path but only after the hook
# has been loaded. Ref #2980.
# In either case, fall back to stdlib behavior.
return
class DistutilsLoader(importlib.abc.Loader):
def create_module(self, spec):
mod.__name__ = 'distutils'
return mod
def exec_module(self, module):
pass
return importlib.util.spec_from_loader(
'distutils', DistutilsLoader(), origin=mod.__file__
)
@staticmethod
def is_cpython():
"""
Suppress supplying distutils for CPython (build and tests).
Ref #2965 and #3007.
"""
return os.path.isfile('pybuilddir.txt')
def spec_for_pip(self):
"""
Ensure stdlib distutils when running under pip.
See pypa/pip#8761 for rationale.
"""
if self.pip_imported_during_build():
return
clear_distutils()
self.spec_for_distutils = lambda: None
@classmethod
def pip_imported_during_build(cls):
"""
Detect if pip is being imported in a build script. Ref #2355.
"""
import traceback
return any(
cls.frame_file_is_setup(frame) for frame, line in traceback.walk_stack(None)
)
@staticmethod
def frame_file_is_setup(frame):
"""
Return True if the indicated frame suggests a setup.py file.
"""
# some frames may not have __file__ (#2940)
return frame.f_globals.get('__file__', '').endswith('setup.py')
def spec_for_sensitive_tests(self):
"""
Ensure stdlib distutils when running select tests under CPython.
python/cpython#91169
"""
clear_distutils()
self.spec_for_distutils = lambda: None
sensitive_tests = (
[
'test.test_distutils',
'test.test_peg_generator',
'test.test_importlib',
]
if sys.version_info < (3, 10)
else [
'test.test_distutils',
]
)
for name in DistutilsMetaFinder.sensitive_tests:
setattr(
DistutilsMetaFinder,
f'spec_for_{name}',
DistutilsMetaFinder.spec_for_sensitive_tests,
)
DISTUTILS_FINDER = DistutilsMetaFinder()
def add_shim():
DISTUTILS_FINDER in sys.meta_path or insert_shim()
class shim:
def __enter__(self):
insert_shim()
def __exit__(self, exc, value, tb):
remove_shim()
def insert_shim():
sys.meta_path.insert(0, DISTUTILS_FINDER)
def remove_shim():
try:
sys.meta_path.remove(DISTUTILS_FINDER)
except ValueError:
pass

View File

@@ -0,0 +1 @@
__import__('_distutils_hack').do_override()

View File

@@ -0,0 +1,19 @@
Copyright (C) 2013, Martin Journois
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,31 @@
Metadata-Version: 2.1
Name: branca
Version: 0.7.2
Summary: Generate complex HTML+JS pages with Python
Home-page: https://github.com/python-visualization/branca
Author: Martin Journois
License: MIT
Keywords: data visualization
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: License :: OSI Approved :: MIT License
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
License-File: LICENSE.txt
Requires-Dist: jinja2 >=3
[![PyPI Package](https://img.shields.io/pypi/v/branca.svg)](https://pypi.python.org/pypi/branca)
[![Build Status](https://github.com/python-visualization/branca/actions/workflows/test_code.yml/badge.svg?branch=main)](https://github.com/python-visualization/branca/actions/workflows/test_code.yml)
[![Gitter](https://badges.gitter.im/python-visualization/folium.svg)](https://gitter.im/python-visualization/folium)
# Branca
This library is a spinoff from [folium](https://github.com/python-visualization/folium). It can be used to generate HTML + JS. It is based on Jinja2.
- Documentation: https://python-visualization.github.io/branca/
- Examples: https://nbviewer.org/github/python-visualization/branca/tree/main/examples/

View File

@@ -0,0 +1,21 @@
branca-0.7.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
branca-0.7.2.dist-info/LICENSE.txt,sha256=Fv3MFqr_xBAT3mB571tsJl4kAQibA8UHzCanIBFVi6s,1079
branca-0.7.2.dist-info/METADATA,sha256=0XqbN6maxHRRh6YEZKjLTwO-0FxW4ru4lLLf2EkYUMY,1478
branca-0.7.2.dist-info/RECORD,,
branca-0.7.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
branca-0.7.2.dist-info/top_level.txt,sha256=ypVQTwONob-Ro9YaFnI5JSp3Fie7goG1kHO-OOOvgFE,7
branca/__init__.py,sha256=96-JnqmVvjl6A13m-ngTS54ieHuvW5LsJMR4OEQZ5mo,207
branca/__pycache__/__init__.cpython-311.pyc,,
branca/__pycache__/_version.cpython-311.pyc,,
branca/__pycache__/colormap.cpython-311.pyc,,
branca/__pycache__/element.cpython-311.pyc,,
branca/__pycache__/utilities.cpython-311.pyc,,
branca/_cnames.json,sha256=gczkHOtCOMkbXf3oncGhfdrwrdr850-KBlwdlzuZaB0,3540
branca/_schemes.json,sha256=d19tuEsvPhxMeNf-E8vvQ-1S72JGG5yZvU_nOPZpPsM,21192
branca/_version.py,sha256=ISWV9_SJtaCeILz1FAMYHSjVkHbV08KnnT23Ww0_w-o,21
branca/colormap.py,sha256=1-5ONAFdLcyKsHqw-YA40yT6iXqbOS4XS62bNu8k6oQ,20378
branca/element.py,sha256=WO_nTAs1iEvPkgqwdOG0pDlWYlkGP0xdv6FwjPeGDCA,22061
branca/scheme_base_codes.json,sha256=yMAJXX7PnL3KFVF3inGtHnuXe3mfegRZ2kkEcQXR3YY,370
branca/scheme_info.json,sha256=4eCiIiCV90kLXtyJbuTCxpDVLPDgEBr2TcHZyKUIe8M,904
branca/templates/color_scale.js,sha256=Qc4zOyKDBkSvXduzoOiXep6XoQkN6QO1jv97OvVc3xs,2300
branca/utilities.py,sha256=N8lovy-szMpIATuicgLjMv5B0xtjvKcC_VDnJRc4e6A,14549

View File

@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.43.0)
Root-Is-Purelib: true
Tag: py3-none-any

View File

@@ -0,0 +1,13 @@
import branca.colormap as colormap
import branca.element as element
try:
from ._version import __version__
except ImportError:
__version__ = "unknown"
__all__ = [
"colormap",
"element",
]

View File

@@ -0,0 +1 @@
{"indigo": "#4B0082", "gold": "#FFD700", "hotpink": "#FF69B4", "firebrick": "#B22222", "indianred": "#CD5C5C", "sage": "#87AE73", "yellow": "#FFFF00", "mistyrose": "#FFE4E1", "darkolivegreen": "#556B2F", "olive": "#808000", "darkseagreen": "#8FBC8F", "pink": "#FFC0CB", "tomato": "#FF6347", "lightcoral": "#F08080", "orangered": "#FF4500", "navajowhite": "#FFDEAD", "lime": "#00FF00", "palegreen": "#98FB98", "greenyellow": "#ADFF2F", "burlywood": "#DEB887", "seashell": "#FFF5EE", "mediumspringgreen": "#00FA9A", "fuchsia": "#FF00FF", "papayawhip": "#FFEFD5", "blanchedalmond": "#FFEBCD", "chartreuse": "#7FFF00", "dimgray": "#696969", "black": "#000000", "peachpuff": "#FFDAB9", "springgreen": "#00FF7F", "aquamarine": "#7FFFD4", "white": "#FFFFFF", "b": "#0000FF", "orange": "#FFA500", "lightsalmon": "#FFA07A", "darkslategray": "#2F4F4F", "brown": "#A52A2A", "ivory": "#FFFFF0", "dodgerblue": "#1E90FF", "peru": "#CD853F", "lawngreen": "#7CFC00", "chocolate": "#D2691E", "crimson": "#DC143C", "forestgreen": "#228B22", "slateblue": "#6A5ACD", "lightseagreen": "#20B2AA", "cyan": "#00FFFF", "mintcream": "#F5FFFA", "silver": "#C0C0C0", "antiquewhite": "#FAEBD7", "mediumorchid": "#BA55D3", "skyblue": "#87CEEB", "gray": "#808080", "darkturquoise": "#00CED1", "goldenrod": "#DAA520", "darkgreen": "#006400", "floralwhite": "#FFFAF0", "darkviolet": "#9400D3", "darkgray": "#A9A9A9", "moccasin": "#FFE4B5", "saddlebrown": "#8B4513", "darkslateblue": "#483D8B", "lightskyblue": "#87CEFA", "lightpink": "#FFB6C1", "mediumvioletred": "#C71585", "r": "#FF0000", "red": "#FF0000", "deeppink": "#FF1493", "limegreen": "#32CD32", "k": "#000000", "darkmagenta": "#8B008B", "palegoldenrod": "#EEE8AA", "plum": "#DDA0DD", "turquoise": "#40E0D0", "m": "#FF00FF", "lightgoldenrodyellow": "#FAFAD2", "darkgoldenrod": "#B8860B", "lavender": "#E6E6FA", "maroon": "#800000", "yellowgreen": "#9ACD32", "sandybrown": "#FAA460", "thistle": "#D8BFD8", "violet": "#EE82EE", "navy": "#000080", "magenta": "#FF00FF", "tan": "#D2B48C", "rosybrown": "#BC8F8F", "olivedrab": "#6B8E23", "blue": "#0000FF", "lightblue": "#ADD8E6", "ghostwhite": "#F8F8FF", "honeydew": "#F0FFF0", "cornflowerblue": "#6495ED", "linen": "#FAF0E6", "darkblue": "#00008B", "powderblue": "#B0E0E6", "seagreen": "#2E8B57", "darkkhaki": "#BDB76B", "snow": "#FFFAFA", "sienna": "#A0522D", "mediumblue": "#0000CD", "royalblue": "#4169E1", "lightcyan": "#E0FFFF", "green": "#008000", "mediumpurple": "#9370DB", "midnightblue": "#191970", "cornsilk": "#FFF8DC", "paleturquoise": "#AFEEEE", "bisque": "#FFE4C4", "slategray": "#708090", "darkcyan": "#008B8B", "khaki": "#F0E68C", "wheat": "#F5DEB3", "teal": "#008080", "darkorchid": "#9932CC", "deepskyblue": "#00BFFF", "salmon": "#FA8072", "y": "#FFFF00", "darkred": "#8B0000", "steelblue": "#4682B4", "g": "#008000", "palevioletred": "#DB7093", "lightslategray": "#778899", "aliceblue": "#F0F8FF", "lightgreen": "#90EE90", "orchid": "#DA70D6", "gainsboro": "#DCDCDC", "mediumseagreen": "#3CB371", "lightgray": "#D3D3D3", "c": "#00FFFF", "mediumturquoise": "#48D1CC", "darksage": "#598556", "lemonchiffon": "#FFFACD", "cadetblue": "#5F9EA0", "lightyellow": "#FFFFE0", "lavenderblush": "#FFF0F5", "coral": "#FF7F50", "purple": "#800080", "aqua": "#00FFFF", "lightsage": "#BCECAC", "whitesmoke": "#F5F5F5", "mediumslateblue": "#7B68EE", "darkorange": "#FF8C00", "mediumaquamarine": "#66CDAA", "darksalmon": "#E9967A", "beige": "#F5F5DC", "w": "#FFFFFF", "blueviolet": "#8A2BE2", "azure": "#F0FFFF", "lightsteelblue": "#B0C4DE", "oldlace": "#FDF5E6"}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1 @@
__version__ = "0.7.2"

View File

@@ -0,0 +1,610 @@
"""
Colormap
--------
Utility module for dealing with colormaps.
"""
import json
import math
import os
from jinja2 import Template
from branca.element import ENV, Figure, JavascriptLink, MacroElement
from branca.utilities import legend_scaler
rootpath = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(rootpath, "_cnames.json")) as f:
_cnames = json.loads(f.read())
with open(os.path.join(rootpath, "_schemes.json")) as f:
_schemes = json.loads(f.read())
def _is_hex(x):
return x.startswith("#") and len(x) == 7
def _parse_hex(color_code):
return (
int(color_code[1:3], 16),
int(color_code[3:5], 16),
int(color_code[5:7], 16),
)
def _parse_color(x):
if isinstance(x, (tuple, list)):
color_tuple = tuple(x)[:4]
elif isinstance(x, (str, bytes)) and _is_hex(x):
color_tuple = _parse_hex(x)
elif isinstance(x, (str, bytes)):
cname = _cnames.get(x.lower(), None)
if cname is None:
raise ValueError(f"Unknown color {cname!r}.")
color_tuple = _parse_hex(cname)
else:
raise ValueError(f"Unrecognized color code {x!r}")
if max(color_tuple) > 1.0:
color_tuple = tuple(u / 255.0 for u in color_tuple)
return tuple(map(float, (color_tuple + (1.0,))[:4]))
def _base(x):
if x > 0:
base = pow(10, math.floor(math.log10(x)))
return round(x / base) * base
else:
return 0
class ColorMap(MacroElement):
"""A generic class for creating colormaps.
Parameters
----------
vmin: float
The left bound of the color scale.
vmax: float
The right bound of the color scale.
caption: str
A caption to draw with the colormap.
max_labels : int, default 10
Maximum number of legend tick labels
"""
_template = ENV.get_template("color_scale.js")
def __init__(self, vmin=0.0, vmax=1.0, caption="", max_labels=10):
super().__init__()
self._name = "ColorMap"
self.vmin = vmin
self.vmax = vmax
self.caption = caption
self.index = [vmin, vmax]
self.max_labels = max_labels
self.tick_labels = None
self.width = 450
self.height = 40
def render(self, **kwargs):
"""Renders the HTML representation of the element."""
self.color_domain = [
float(self.vmin + (self.vmax - self.vmin) * k / 499.0) for k in range(500)
]
self.color_range = [self.__call__(x) for x in self.color_domain]
# sanitize possible numpy floats to native python floats
self.index = [float(i) for i in self.index]
if self.tick_labels is None:
self.tick_labels = legend_scaler(self.index, self.max_labels)
super().render(**kwargs)
figure = self.get_root()
assert isinstance(figure, Figure), (
"You cannot render this Element " "if it is not in a Figure."
)
figure.header.add_child(
JavascriptLink("https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"),
name="d3",
) # noqa
def rgba_floats_tuple(self, x):
"""
This class has to be implemented for each class inheriting from
Colormap. This has to be a function of the form float ->
(float, float, float, float) describing for each input float x,
the output color in RGBA format;
Each output value being between 0 and 1.
"""
raise NotImplementedError
def rgba_bytes_tuple(self, x):
"""Provides the color corresponding to value `x` in the
form of a tuple (R,G,B,A) with int values between 0 and 255.
"""
return tuple(int(u * 255.9999) for u in self.rgba_floats_tuple(x))
def rgb_bytes_tuple(self, x):
"""Provides the color corresponding to value `x` in the
form of a tuple (R,G,B) with int values between 0 and 255.
"""
return self.rgba_bytes_tuple(x)[:3]
def rgb_hex_str(self, x):
"""Provides the color corresponding to value `x` in the
form of a string of hexadecimal values "#RRGGBB".
"""
return "#%02x%02x%02x" % self.rgb_bytes_tuple(x)
def rgba_hex_str(self, x):
"""Provides the color corresponding to value `x` in the
form of a string of hexadecimal values "#RRGGBBAA".
"""
return "#%02x%02x%02x%02x" % self.rgba_bytes_tuple(x)
def __call__(self, x):
"""Provides the color corresponding to value `x` in the
form of a string of hexadecimal values "#RRGGBBAA".
"""
return self.rgba_hex_str(x)
def _repr_html_(self):
"""Display the colormap in a Jupyter Notebook.
Does not support all the class arguments.
"""
nb_ticks = 7
delta_x = math.floor(self.width / (nb_ticks - 1))
x_ticks = [(i) * delta_x for i in range(0, nb_ticks)]
delta_val = delta_x * (self.vmax - self.vmin) / self.width
val_ticks = [round(self.vmin + (i) * delta_val, 1) for i in range(0, nb_ticks)]
return (
f'<svg height="40" width="{self.width}">'
+ "".join(
[
(
'<line x1="{i}" y1="15" x2="{i}" '
'y2="27" style="stroke:{color};stroke-width:2;" />'
).format(
i=i * 1,
color=self.rgba_hex_str(
self.vmin + (self.vmax - self.vmin) * i / (self.width - 1),
),
)
for i in range(self.width)
],
)
+ '<text x="0" y="38" style="text-anchor:start; font-size:11px; font:Arial">{}</text>'.format( # noqa
self.vmin,
)
+ "".join(
[
(
'<text x="{}" y="38"; style="text-anchor:middle; font-size:11px; font:Arial">{}</text>' # noqa
).format(x_ticks[i], val_ticks[i])
for i in range(1, nb_ticks - 1)
],
)
+ '<text x="{}" y="38" style="text-anchor:end; font-size:11px; font:Arial">{}</text>'.format(
self.width,
self.vmax,
)
+ '<text x="0" y="12" style="font-size:11px; font:Arial">{}</text>'.format(
self.caption,
)
+ "</svg>"
)
class LinearColormap(ColorMap):
"""Creates a ColorMap based on linear interpolation of a set of colors
over a given index.
Parameters
----------
colors : list-like object with at least two colors.
The set of colors to be used for interpolation.
Colors can be provided in the form:
* tuples of RGBA ints between 0 and 255 (e.g: `(255, 255, 0)` or
`(255, 255, 0, 255)`)
* tuples of RGBA floats between 0. and 1. (e.g: `(1.,1.,0.)` or
`(1., 1., 0., 1.)`)
* HTML-like string (e.g: `"#ffff00`)
* a color name or shortcut (e.g: `"y"` or `"yellow"`)
index : list of floats, default None
The values corresponding to each color.
It has to be sorted, and have the same length as `colors`.
If None, a regular grid between `vmin` and `vmax` is created.
vmin : float, default 0.
The minimal value for the colormap.
Values lower than `vmin` will be bound directly to `colors[0]`.
vmax : float, default 1.
The maximal value for the colormap.
Values higher than `vmax` will be bound directly to `colors[-1]`.
max_labels : int, default 10
Maximum number of legend tick labels
tick_labels: list of floats, default None
If given, used as the positions of ticks."""
def __init__(
self,
colors,
index=None,
vmin=0.0,
vmax=1.0,
caption="",
max_labels=10,
tick_labels=None,
):
super().__init__(
vmin=vmin,
vmax=vmax,
caption=caption,
max_labels=max_labels,
)
self.tick_labels = tick_labels
n = len(colors)
if n < 2:
raise ValueError("You must provide at least 2 colors.")
if index is None:
self.index = [vmin + (vmax - vmin) * i * 1.0 / (n - 1) for i in range(n)]
else:
self.index = list(index)
self.colors = [_parse_color(x) for x in colors]
def rgba_floats_tuple(self, x):
"""Provides the color corresponding to value `x` in the
form of a tuple (R,G,B,A) with float values between 0. and 1.
"""
if x <= self.index[0]:
return self.colors[0]
if x >= self.index[-1]:
return self.colors[-1]
i = len([u for u in self.index if u < x]) # 0 < i < n.
if self.index[i - 1] < self.index[i]:
p = (x - self.index[i - 1]) * 1.0 / (self.index[i] - self.index[i - 1])
elif self.index[i - 1] == self.index[i]:
p = 1.0
else:
raise ValueError("Thresholds are not sorted.")
return tuple(
(1.0 - p) * self.colors[i - 1][j] + p * self.colors[i][j] for j in range(4)
)
def to_step(
self,
n=None,
index=None,
data=None,
method=None,
quantiles=None,
round_method=None,
max_labels=10,
):
"""Splits the LinearColormap into a StepColormap.
Parameters
----------
n : int, default None
The number of expected colors in the output StepColormap.
This will be ignored if `index` is provided.
index : list of floats, default None
The values corresponding to each color bounds.
It has to be sorted.
If None, a regular grid between `vmin` and `vmax` is created.
data : list of floats, default None
A sample of data to adapt the color map to.
method : str, default 'linear'
The method used to create data-based colormap.
It can be 'linear' for linear scale, 'log' for logarithmic,
or 'quant' for data's quantile-based scale.
quantiles : list of floats, default None
Alternatively, you can provide explicitly the quantiles you
want to use in the scale.
round_method : str, default None
The method used to round thresholds.
* If 'int', all values will be rounded to the nearest integer.
* If 'log10', all values will be rounded to the nearest
order-of-magnitude integer. For example, 2100 is rounded to
2000, 2790 to 3000.
max_labels : int, default 10
Maximum number of legend tick labels
Returns
-------
A StepColormap with `n=len(index)-1` colors.
Examples:
>> lc.to_step(n=12)
>> lc.to_step(index=[0, 2, 4, 6, 8, 10])
>> lc.to_step(data=some_list, n=12)
>> lc.to_step(data=some_list, n=12, method='linear')
>> lc.to_step(data=some_list, n=12, method='log')
>> lc.to_step(data=some_list, n=12, method='quantiles')
>> lc.to_step(data=some_list, quantiles=[0, 0.3, 0.7, 1])
>> lc.to_step(data=some_list, quantiles=[0, 0.3, 0.7, 1],
... round_method='log10')
"""
msg = "You must specify either `index` or `n`"
if index is None:
if data is None:
if n is None:
raise ValueError(msg)
else:
index = [
self.vmin + (self.vmax - self.vmin) * i * 1.0 / n
for i in range(1 + n)
]
scaled_cm = self
else:
max_ = max(data)
min_ = min(data)
scaled_cm = self.scale(vmin=min_, vmax=max_)
method = (
"quantiles"
if quantiles is not None
else method if method is not None else "linear"
)
if method.lower().startswith("lin"):
if n is None:
raise ValueError(msg)
index = [min_ + i * (max_ - min_) * 1.0 / n for i in range(1 + n)]
elif method.lower().startswith("log"):
if n is None:
raise ValueError(msg)
if min_ <= 0:
msg = "Log-scale works only with strictly " "positive values."
raise ValueError(msg)
index = [
math.exp(
math.log(min_)
+ i * (math.log(max_) - math.log(min_)) * 1.0 / n,
)
for i in range(1 + n)
]
elif method.lower().startswith("quant"):
if quantiles is None:
if n is None:
msg = (
"You must specify either `index`, `n` or" "`quantiles`."
)
raise ValueError(msg)
else:
quantiles = [i * 1.0 / n for i in range(1 + n)]
p = len(data) - 1
s = sorted(data)
index = [
s[int(q * p)] * (1.0 - (q * p) % 1)
+ s[min(int(q * p) + 1, p)] * ((q * p) % 1)
for q in quantiles
]
else:
raise ValueError(f"Unknown method {method}")
else:
scaled_cm = self.scale(vmin=min(index), vmax=max(index))
n = len(index) - 1
if round_method == "int":
index = [round(x) for x in index]
if round_method == "log10":
index = [_base(x) for x in index]
colors = [
scaled_cm.rgba_floats_tuple(
index[i] * (1.0 - i / (n - 1.0)) + index[i + 1] * i / (n - 1.0),
)
for i in range(n)
]
caption = self.caption
return StepColormap(
colors,
index=index,
vmin=index[0],
vmax=index[-1],
caption=caption,
max_labels=max_labels,
tick_labels=self.tick_labels,
)
def scale(self, vmin=0.0, vmax=1.0, max_labels=10):
"""Transforms the colorscale so that the minimal and maximal values
fit the given parameters.
"""
return LinearColormap(
self.colors,
index=[
vmin + (vmax - vmin) * (x - self.vmin) * 1.0 / (self.vmax - self.vmin)
for x in self.index
], # noqa
vmin=vmin,
vmax=vmax,
caption=self.caption,
max_labels=max_labels,
)
class StepColormap(ColorMap):
"""Creates a ColorMap based on linear interpolation of a set of colors
over a given index.
Parameters
----------
colors : list-like object
The set of colors to be used for interpolation.
Colors can be provided in the form:
* tuples of int between 0 and 255 (e.g: `(255,255,0)` or
`(255, 255, 0, 255)`)
* tuples of floats between 0. and 1. (e.g: `(1.,1.,0.)` or
`(1., 1., 0., 1.)`)
* HTML-like string (e.g: `"#ffff00`)
* a color name or shortcut (e.g: `"y"` or `"yellow"`)
index : list of floats, default None
The bounds of the colors. The lower value is inclusive,
the upper value is exclusive.
It has to be sorted, and have the same length as `colors`.
If None, a regular grid between `vmin` and `vmax` is created.
vmin : float, default 0.
The minimal value for the colormap.
Values lower than `vmin` will be bound directly to `colors[0]`.
vmax : float, default 1.
The maximal value for the colormap.
Values higher than `vmax` will be bound directly to `colors[-1]`.
max_labels : int, default 10
Maximum number of legend tick labels
tick_labels: list of floats, default None
If given, used as the positions of ticks.
"""
def __init__(
self,
colors,
index=None,
vmin=0.0,
vmax=1.0,
caption="",
max_labels=10,
tick_labels=None,
):
super().__init__(
vmin=vmin,
vmax=vmax,
caption=caption,
max_labels=max_labels,
)
self.tick_labels = tick_labels
n = len(colors)
if n < 1:
raise ValueError("You must provide at least 1 colors.")
if index is None:
self.index = [vmin + (vmax - vmin) * i * 1.0 / n for i in range(n + 1)]
else:
self.index = list(index)
self.colors = [_parse_color(x) for x in colors]
def rgba_floats_tuple(self, x):
"""
Provides the color corresponding to value `x` in the
form of a tuple (R,G,B,A) with float values between 0. and 1.
"""
if x <= self.index[0]:
return self.colors[0]
if x >= self.index[-1]:
return self.colors[-1]
i = len([u for u in self.index if u <= x]) # 0 < i < n.
return tuple(self.colors[i - 1])
def to_linear(self, index=None, max_labels=10):
"""
Transforms the StepColormap into a LinearColormap.
Parameters
----------
index : list of floats, default None
The values corresponding to each color in the output colormap.
It has to be sorted.
If None, a regular grid between `vmin` and `vmax` is created.
max_labels : int, default 10
Maximum number of legend tick labels
"""
if index is None:
n = len(self.index) - 1
index = [
self.index[i] * (1.0 - i / (n - 1.0))
+ self.index[i + 1] * i / (n - 1.0)
for i in range(n)
]
colors = [self.rgba_floats_tuple(x) for x in index]
return LinearColormap(
colors,
index=index,
vmin=self.vmin,
vmax=self.vmax,
max_labels=max_labels,
)
def scale(self, vmin=0.0, vmax=1.0, max_labels=10):
"""Transforms the colorscale so that the minimal and maximal values
fit the given parameters.
"""
return StepColormap(
self.colors,
index=[
vmin + (vmax - vmin) * (x - self.vmin) * 1.0 / (self.vmax - self.vmin)
for x in self.index
], # noqa
vmin=vmin,
vmax=vmax,
caption=self.caption,
max_labels=max_labels,
)
class _LinearColormaps:
"""A class for hosting the list of built-in linear colormaps."""
def __init__(self):
self._schemes = _schemes.copy()
self._colormaps = {key: LinearColormap(val) for key, val in _schemes.items()}
for key, val in _schemes.items():
setattr(self, key, LinearColormap(val))
def _repr_html_(self):
return Template(
"""
<table>
{% for key,val in this._colormaps.items() %}
<tr><td>{{key}}</td><td>{{val._repr_html_()}}</td></tr>
{% endfor %}</table>
""",
).render(this=self)
linear = _LinearColormaps()
class _StepColormaps:
"""A class for hosting the list of built-in step colormaps."""
def __init__(self):
self._schemes = _schemes.copy()
self._colormaps = {key: StepColormap(val) for key, val in _schemes.items()}
for key, val in _schemes.items():
setattr(self, key, StepColormap(val))
def _repr_html_(self):
return Template(
"""
<table>
{% for key,val in this._colormaps.items() %}
<tr><td>{{key}}</td><td>{{val._repr_html_()}}</td></tr>
{% endfor %}</table>
""",
).render(this=self)
step = _StepColormaps()

View File

@@ -0,0 +1,681 @@
"""
Element
-------
A generic class for creating Elements.
"""
import base64
import json
import warnings
from binascii import hexlify
from collections import OrderedDict
from html import escape
from os import urandom
from pathlib import Path
from urllib.request import urlopen
from jinja2 import Environment, PackageLoader, Template
from .utilities import _camelify, _parse_size, none_max, none_min
ENV = Environment(loader=PackageLoader("branca", "templates"))
class Element:
"""Basic Element object that does nothing.
Other Elements may inherit from this one.
Parameters
----------
template : str, default None
A jinaj2-compatible template string for rendering the element.
If None, template will be:
.. code-block:: jinja
{% for name, element in this._children.items() %}
{{element.render(**kwargs)}}
{% endfor %}
so that all the element's children are rendered.
template_name : str, default None
If no template is provided, you can also provide a filename.
"""
_template = Template(
"{% for name, element in this._children.items() %}\n"
" {{element.render(**kwargs)}}"
"{% endfor %}",
)
def __init__(self, template=None, template_name=None):
self._name = "Element"
self._id = hexlify(urandom(16)).decode()
self._children = OrderedDict()
self._parent = None
self._template_str = template
self._template_name = template_name
if template is not None:
self._template = Template(template)
elif template_name is not None:
self._template = ENV.get_template(template_name)
def __getstate__(self):
"""Modify object state when pickling the object.
jinja2 Templates cannot be pickled, so remove the instance attribute
if it exists. It will be added back when unpickling (see __setstate__).
"""
state: dict = self.__dict__.copy()
state.pop("_template", None)
return state
def __setstate__(self, state: dict):
"""Re-add _template instance attribute when unpickling"""
if state["_template_str"] is not None:
state["_template"] = Template(state["_template_str"])
elif state["_template_name"] is not None:
state["_template"] = ENV.get_template(state["_template_name"])
self.__dict__.update(state)
def get_name(self):
"""Returns a string representation of the object.
This string has to be unique and to be a python and
javascript-compatible
variable name.
"""
return _camelify(self._name) + "_" + self._id
def _get_self_bounds(self):
"""Computes the bounds of the object itself (not including it's children)
in the form [[lat_min, lon_min], [lat_max, lon_max]]
"""
return [[None, None], [None, None]]
def get_bounds(self):
"""Computes the bounds of the object and all it's children
in the form [[lat_min, lon_min], [lat_max, lon_max]].
"""
bounds = self._get_self_bounds()
for child in self._children.values():
child_bounds = child.get_bounds()
bounds = [
[
none_min(bounds[0][0], child_bounds[0][0]),
none_min(bounds[0][1], child_bounds[0][1]),
],
[
none_max(bounds[1][0], child_bounds[1][0]),
none_max(bounds[1][1], child_bounds[1][1]),
],
]
return bounds
def add_children(self, child, name=None, index=None):
"""Add a child."""
warnings.warn(
"Method `add_children` is deprecated. Please use `add_child` instead.",
FutureWarning,
stacklevel=2,
)
return self.add_child(child, name=name, index=index)
def add_child(self, child, name=None, index=None):
"""Add a child."""
if name is None:
name = child.get_name()
if index is None:
self._children[name] = child
else:
items = [item for item in self._children.items() if item[0] != name]
items.insert(int(index), (name, child))
self._children = OrderedDict(items)
child._parent = self
return self
def add_to(self, parent, name=None, index=None):
"""Add element to a parent."""
parent.add_child(self, name=name, index=index)
return self
def to_dict(self, depth=-1, ordered=True, **kwargs):
"""Returns a dict representation of the object."""
if ordered:
dict_fun = OrderedDict
else:
dict_fun = dict
out = dict_fun()
out["name"] = self._name
out["id"] = self._id
if depth != 0:
out["children"] = dict_fun(
[
(name, child.to_dict(depth=depth - 1))
for name, child in self._children.items()
],
) # noqa
return out
def to_json(self, depth=-1, **kwargs):
"""Returns a JSON representation of the object."""
return json.dumps(self.to_dict(depth=depth, ordered=True), **kwargs)
def get_root(self):
"""Returns the root of the elements tree."""
if self._parent is None:
return self
else:
return self._parent.get_root()
def render(self, **kwargs):
"""Renders the HTML representation of the element."""
return self._template.render(this=self, kwargs=kwargs)
def save(self, outfile, close_file=True, **kwargs):
"""Saves an Element into a file.
Parameters
----------
outfile : str or file object
The file (or filename) where you want to output the html.
close_file : bool, default True
Whether the file has to be closed after write.
"""
if isinstance(outfile, (str, bytes, Path)):
fid = open(outfile, "wb")
else:
fid = outfile
root = self.get_root()
html = root.render(**kwargs)
fid.write(html.encode("utf8"))
if close_file:
fid.close()
class Link(Element):
"""An abstract class for embedding a link in the HTML."""
def get_code(self):
"""Opens the link and returns the response's content."""
if self.code is None:
self.code = urlopen(self.url).read()
return self.code
def to_dict(self, depth=-1, **kwargs):
"""Returns a dict representation of the object."""
out = super().to_dict(depth=-1, **kwargs)
out["url"] = self.url
return out
class JavascriptLink(Link):
"""Create a JavascriptLink object based on a url.
Parameters
----------
url : str
The url to be linked
download : bool, default False
Whether the target document shall be loaded right now.
"""
_template = Template(
'{% if kwargs.get("embedded",False) %}'
"<script>{{this.get_code()}}</script>"
"{% else %}"
'<script src="{{this.url}}"></script>'
"{% endif %}",
)
def __init__(self, url, download=False):
super().__init__()
self._name = "JavascriptLink"
self.url = url
self.code = None
if download:
self.get_code()
class CssLink(Link):
"""Create a CssLink object based on a url.
Parameters
----------
url : str
The url to be linked
download : bool, default False
Whether the target document shall be loaded right now.
"""
_template = Template(
'{% if kwargs.get("embedded",False) %}'
"<style>{{this.get_code()}}</style>"
"{% else %}"
'<link rel="stylesheet" href="{{this.url}}"/>'
"{% endif %}",
)
def __init__(self, url, download=False):
super().__init__()
self._name = "CssLink"
self.url = url
self.code = None
if download:
self.get_code()
class Figure(Element):
"""Create a Figure object, to plot things into it.
Parameters
----------
width : str, default "100%"
The width of the Figure.
It may be a percentage or pixel value (like "300px").
height : str, default None
The height of the Figure.
It may be a percentage or a pixel value (like "300px").
ratio : str, default "60%"
A percentage defining the aspect ratio of the Figure.
It will be ignored if height is not None.
title : str, default None
Figure title.
figsize : tuple of two int, default None
If you're a matplotlib addict, you can overwrite width and
height. Values will be converted into pixels in using 60 dpi.
For example figsize=(10, 5) will result in
width="600px", height="300px".
"""
_template = Template(
"<!DOCTYPE html>\n"
"<html>\n"
"<head>\n"
"{% if this.title %}<title>{{this.title}}</title>{% endif %}"
" {{this.header.render(**kwargs)}}\n"
"</head>\n"
"<body>\n"
" {{this.html.render(**kwargs)}}\n"
"</body>\n"
"<script>\n"
" {{this.script.render(**kwargs)}}\n"
"</script>\n"
"</html>\n",
)
def __init__(
self,
width="100%",
height=None,
ratio="60%",
title=None,
figsize=None,
):
super().__init__()
self._name = "Figure"
self.header = Element()
self.html = Element()
self.script = Element()
self.header._parent = self
self.html._parent = self
self.script._parent = self
self.width = width
self.height = height
self.ratio = ratio
self.title = title
if figsize is not None:
self.width = str(60 * figsize[0]) + "px"
self.height = str(60 * figsize[1]) + "px"
# Create the meta tag.
self.header.add_child(
Element(
'<meta http-equiv="content-type" content="text/html; charset=UTF-8" />',
), # noqa
name="meta_http",
)
def to_dict(self, depth=-1, **kwargs):
"""Returns a dict representation of the object."""
out = super().to_dict(depth=depth, **kwargs)
out["header"] = self.header.to_dict(depth=depth - 1, **kwargs)
out["html"] = self.html.to_dict(depth=depth - 1, **kwargs)
out["script"] = self.script.to_dict(depth=depth - 1, **kwargs)
return out
def get_root(self):
"""Returns the root of the elements tree."""
return self
def render(self, **kwargs):
"""Renders the HTML representation of the element."""
for name, child in self._children.items():
child.render(**kwargs)
return self._template.render(this=self, kwargs=kwargs)
def _repr_html_(self, **kwargs):
"""Displays the Figure in a Jupyter notebook."""
html = escape(self.render(**kwargs))
if self.height is None:
iframe = (
'<div style="width:{width};">'
'<div style="position:relative;width:100%;height:0;padding-bottom:{ratio};">' # noqa
'<span style="color:#565656">Make this Notebook Trusted to load map: File -> Trust Notebook</span>' # noqa
'<iframe srcdoc="{html}" style="position:absolute;width:100%;height:100%;left:0;top:0;' # noqa
'border:none !important;" '
"allowfullscreen webkitallowfullscreen mozallowfullscreen>"
"</iframe>"
"</div></div>"
).format(html=html, width=self.width, ratio=self.ratio)
else:
iframe = (
'<iframe srcdoc="{html}" width="{width}" height="{height}"'
'style="border:none !important;" '
'"allowfullscreen" "webkitallowfullscreen" "mozallowfullscreen">'
"</iframe>"
).format(html=html, width=self.width, height=self.height)
return iframe
def add_subplot(self, x, y, n, margin=0.05):
"""Creates a div child subplot in a matplotlib.figure.add_subplot style.
Parameters
----------
x : int
The number of rows in the grid.
y : int
The number of columns in the grid.
n : int
The cell number in the grid, counted from 1 to x*y.
Example:
>>> fig.add_subplot(3, 2, 5)
# Create a div in the 5th cell of a 3rows x 2columns
grid(bottom-left corner).
"""
width = 1.0 / y
height = 1.0 / x
left = ((n - 1) % y) * width
top = ((n - 1) // y) * height
left = left + width * margin
top = top + height * margin
width = width * (1 - 2.0 * margin)
height = height * (1 - 2.0 * margin)
div = Div(
position="absolute",
width=f"{100.0 * width}%",
height=f"{100.0 * height}%",
left=f"{100.0 * left}%",
top=f"{100.0 * top}%",
)
self.add_child(div)
return div
class Html(Element):
"""Create an HTML div object for embedding data.
Parameters
----------
data : str
The HTML data to be embedded.
script : bool
If True, data will be embedded without escaping
(suitable for embedding html-ready code)
width : int or str, default '100%'
The width of the output div element.
Ex: 120 , '80%'
height : int or str, default '100%'
The height of the output div element.
Ex: 120 , '80%'
"""
_template = Template(
'<div id="{{this.get_name()}}" '
'style="width: {{this.width[0]}}{{this.width[1]}}; height: {{this.height[0]}}{{this.height[1]}};">' # noqa
"{% if this.script %}{{this.data}}{% else %}{{this.data|e}}{% endif %}</div>",
) # noqa
def __init__(self, data, script=False, width="100%", height="100%"):
super().__init__()
self._name = "Html"
self.script = script
self.data = data
self.width = _parse_size(width)
self.height = _parse_size(height)
class Div(Figure):
"""Create a Div to be embedded in a Figure.
Parameters
----------
width: int or str, default '100%'
The width of the div in pixels (int) or percentage (str).
height: int or str, default '100%'
The height of the div in pixels (int) or percentage (str).
left: int or str, default '0%'
The left-position of the div in pixels (int) or percentage (str).
top: int or str, default '0%'
The top-position of the div in pixels (int) or percentage (str).
position: str, default 'relative'
The position policy of the div.
Usual values are 'relative', 'absolute', 'fixed', 'static'.
"""
_template = Template(
"{% macro header(this, kwargs) %}"
"<style> #{{this.get_name()}} {\n"
" position : {{this.position}};\n"
" width : {{this.width[0]}}{{this.width[1]}};\n"
" height: {{this.height[0]}}{{this.height[1]}};\n"
" left: {{this.left[0]}}{{this.left[1]}};\n"
" top: {{this.top[0]}}{{this.top[1]}};\n"
" </style>"
"{% endmacro %}"
"{% macro html(this, kwargs) %}"
'<div id="{{this.get_name()}}">{{this.html.render(**kwargs)}}</div>'
"{% endmacro %}",
)
def __init__(
self,
width="100%",
height="100%",
left="0%",
top="0%",
position="relative",
):
super(Figure, self).__init__()
self._name = "Div"
# Size Parameters.
self.width = _parse_size(width)
self.height = _parse_size(height)
self.left = _parse_size(left)
self.top = _parse_size(top)
self.position = position
self.header = Element()
self.html = Element(
"{% for name, element in this._children.items() %}"
"{{element.render(**kwargs)}}"
"{% endfor %}",
)
self.script = Element()
self.header._parent = self
self.html._parent = self
self.script._parent = self
def get_root(self):
"""Returns the root of the elements tree."""
return self
def render(self, **kwargs):
"""Renders the HTML representation of the element."""
figure = self._parent
assert isinstance(figure, Figure), (
"You cannot render this Element " "if it is not in a Figure."
)
for name, element in self._children.items():
element.render(**kwargs)
for name, element in self.header._children.items():
figure.header.add_child(element, name=name)
for name, element in self.script._children.items():
figure.script.add_child(element, name=name)
header = self._template.module.__dict__.get("header", None)
if header is not None:
figure.header.add_child(Element(header(self, kwargs)), name=self.get_name())
html = self._template.module.__dict__.get("html", None)
if html is not None:
figure.html.add_child(Element(html(self, kwargs)), name=self.get_name())
script = self._template.module.__dict__.get("script", None)
if script is not None:
figure.script.add_child(Element(script(self, kwargs)), name=self.get_name())
def _repr_html_(self, **kwargs):
"""Displays the Div in a Jupyter notebook."""
if self._parent is None:
self.add_to(Figure())
out = self._parent._repr_html_(**kwargs)
self._parent = None
else:
out = self._parent._repr_html_(**kwargs)
return out
class IFrame(Element):
"""Create a Figure object, to plot things into it.
Parameters
----------
html : str, default None
Eventual HTML code that you want to put in the frame.
width : str, default "100%"
The width of the Figure.
It may be a percentage or pixel value (like "300px").
height : str, default None
The height of the Figure.
It may be a percentage or a pixel value (like "300px").
ratio : str, default "60%"
A percentage defining the aspect ratio of the Figure.
It will be ignored if height is not None.
figsize : tuple of two int, default None
If you're a matplotlib addict, you can overwrite width and
height. Values will be converted into pixels in using 60 dpi.
For example figsize=(10, 5) will result in
width="600px", height="300px".
"""
def __init__(self, html=None, width="100%", height=None, ratio="60%", figsize=None):
super().__init__()
self._name = "IFrame"
self.width = width
self.height = height
self.ratio = ratio
if figsize is not None:
self.width = str(60 * figsize[0]) + "px"
self.height = str(60 * figsize[1]) + "px"
if isinstance(html, str) or isinstance(html, bytes):
self.add_child(Element(html))
elif html is not None:
self.add_child(html)
def render(self, **kwargs):
"""Renders the HTML representation of the element."""
html = super().render(**kwargs)
html = "data:text/html;charset=utf-8;base64," + base64.b64encode(
html.encode("utf8"),
).decode(
"utf8",
) # noqa
if self.height is None:
iframe = (
'<div style="width:{width};">'
'<div style="position:relative;width:100%;height:0;padding-bottom:{ratio};">' # noqa
'<iframe src="{html}" style="position:absolute;width:100%;height:100%;left:0;top:0;' # noqa
'border:none !important;">'
"</iframe>"
"</div></div>"
).format(html=html, width=self.width, ratio=self.ratio)
else:
iframe = (
'<iframe src="{html}" width="{width}" style="border:none !important;" '
'height="{height}"></iframe>'
).format(html=html, width=self.width, height=self.height)
return iframe
class MacroElement(Element):
"""This is a parent class for Elements defined by a macro template.
To compute your own element, all you have to do is:
* To inherit from this class
* Overwrite the '_name' attribute
* Overwrite the '_template' attribute with something of the form::
{% macro header(this, kwargs) %}
...
{% endmacro %}
{% macro html(this, kwargs) %}
...
{% endmacro %}
{% macro script(this, kwargs) %}
...
{% endmacro %}
"""
_template = Template("")
def __init__(self):
super().__init__()
self._name = "MacroElement"
def render(self, **kwargs):
"""Renders the HTML representation of the element."""
figure = self.get_root()
assert isinstance(figure, Figure), (
"You cannot render this Element " "if it is not in a Figure."
)
header = self._template.module.__dict__.get("header", None)
if header is not None:
figure.header.add_child(Element(header(self, kwargs)), name=self.get_name())
html = self._template.module.__dict__.get("html", None)
if html is not None:
figure.html.add_child(Element(html(self, kwargs)), name=self.get_name())
script = self._template.module.__dict__.get("script", None)
if script is not None:
figure.script.add_child(Element(script(self, kwargs)), name=self.get_name())
for name, element in self._children.items():
element.render(**kwargs)

View File

@@ -0,0 +1 @@
{"codes": ["viridis", "plasma", "inferno", "magma", "Spectral", "RdYlGn", "PuBu", "Accent", "OrRd", "Set1", "Set2", "Set3", "BuPu", "Dark2", "RdBu", "Oranges", "BuGn", "PiYG", "YlOrBr", "YlGn", "Pastel2", "RdPu", "Greens", "PRGn", "YlGnBu", "RdYlBu", "Paired", "BrBG", "Purples", "Reds", "Pastel1", "GnBu", "Greys", "RdGy", "YlOrRd", "PuOr", "PuRd", "Blues", "PuBuGn"]}

View File

@@ -0,0 +1 @@
{"Spectral": "Diverging", "RdYlGn": "Diverging", "Set2": "Qualitative", "Accent": "Qualitative", "OrRd": "Sequential", "Set1": "Qualitative", "PuBu": "Sequential", "Set3": "Qualitative", "BuPu": "Sequential", "Dark2": "Qualitative", "RdBu": "Diverging", "BuGn": "Sequential", "PiYG": "Diverging", "YlOrBr": "Sequential", "YlGn": "Sequential", "RdPu": "Sequential", "PRGn": "Diverging", "YlGnBu": "Sequential", "RdYlBu": "Diverging", "Paired": "Qualitative", "Pastel2": "Qualitative", "Pastel1": "Qualitative", "GnBu": "Sequential", "RdGy": "Diverging", "YlOrRd": "Sequential", "PuOr": "Diverging", "PuRd": "Sequential", "BrBG": "Diverging", "PuBuGn": "Sequential", "Greens": "Sequential", "viridis": "Sequential", "plasma": "Sequential", "inferno": "Sequential", "magma": "Sequential", "Oranges": "Sequential", "Blues": "Sequential", "Greys": "Sequential", "Reds": "Sequential", "Purples": "Sequential"}

View File

@@ -0,0 +1,55 @@
{% macro script(this, kwargs) %}
var {{this.get_name()}} = {};
{%if this.color_range %}
{{this.get_name()}}.color = d3.scale.threshold()
.domain({{this.color_domain}})
.range({{this.color_range}});
{%else%}
{{this.get_name()}}.color = d3.scale.threshold()
.domain([{{ this.color_domain[0] }}, {{ this.color_domain[-1] }}])
.range(['{{ this.fill_color }}', '{{ this.fill_color }}']);
{%endif%}
{{this.get_name()}}.x = d3.scale.linear()
.domain([{{ this.color_domain[0] }}, {{ this.color_domain[-1] }}])
.range([0, {{ this.width }} - 50]);
{{this.get_name()}}.legend = L.control({position: 'topright'});
{{this.get_name()}}.legend.onAdd = function (map) {var div = L.DomUtil.create('div', 'legend'); return div};
{{this.get_name()}}.legend.addTo({{this._parent.get_name()}});
{{this.get_name()}}.xAxis = d3.svg.axis()
.scale({{this.get_name()}}.x)
.orient("top")
.tickSize(1)
.tickValues({{ this.tick_labels }});
{{this.get_name()}}.svg = d3.select(".legend.leaflet-control").append("svg")
.attr("id", 'legend')
.attr("width", {{ this.width }})
.attr("height", {{ this.height }});
{{this.get_name()}}.g = {{this.get_name()}}.svg.append("g")
.attr("class", "key")
.attr("transform", "translate(25,16)");
{{this.get_name()}}.g.selectAll("rect")
.data({{this.get_name()}}.color.range().map(function(d, i) {
return {
x0: i ? {{this.get_name()}}.x({{this.get_name()}}.color.domain()[i - 1]) : {{this.get_name()}}.x.range()[0],
x1: i < {{this.get_name()}}.color.domain().length ? {{this.get_name()}}.x({{this.get_name()}}.color.domain()[i]) : {{this.get_name()}}.x.range()[1],
z: d
};
}))
.enter().append("rect")
.attr("height", {{ this.height }} - 30)
.attr("x", function(d) { return d.x0; })
.attr("width", function(d) { return d.x1 - d.x0; })
.style("fill", function(d) { return d.z; });
{{this.get_name()}}.g.call({{this.get_name()}}.xAxis).append("text")
.attr("class", "caption")
.attr("y", 21)
.text({{ this.caption|tojson }});
{% endmacro %}

View File

@@ -0,0 +1,458 @@
"""
Utilities
-------
Utility module for Folium helper functions.
"""
import base64
import json
import math
import os
import re
import struct
import typing
import zlib
from typing import Any, Callable, Union
from jinja2 import Environment, PackageLoader
try:
import numpy as np
except ImportError:
np = None
if typing.TYPE_CHECKING:
from branca.colormap import ColorMap
rootpath = os.path.abspath(os.path.dirname(__file__))
def get_templates():
"""Get Jinja templates."""
return Environment(loader=PackageLoader("branca", "templates"))
def legend_scaler(legend_values, max_labels=10.0):
"""
Downsamples the number of legend values so that there isn't a collision
of text on the legend colorbar (within reason). The colorbar seems to
support ~10 entries as a maximum.
"""
if len(legend_values) < max_labels:
legend_ticks = legend_values
else:
spacer = int(math.ceil(len(legend_values) / max_labels))
legend_ticks = []
for i in legend_values[::spacer]:
legend_ticks += [i]
legend_ticks += [""] * (spacer - 1)
return legend_ticks
def linear_gradient(hexList, nColors):
"""
Given a list of hexcode values, will return a list of length
nColors where the colors are linearly interpolated between the
(r, g, b) tuples that are given.
Examples
--------
>>> linear_gradient([(0, 0, 0), (255, 0, 0), (255, 255, 0)], 100)
"""
def _scale(start, finish, length, i):
"""
Return the value correct value of a number that is in between start
and finish, for use in a loop of length *length*.
"""
base = 16
fraction = float(i) / (length - 1)
raynge = int(finish, base) - int(start, base)
thex = hex(int(int(start, base) + fraction * raynge)).split("x")[-1]
if len(thex) != 2:
thex = "0" + thex
return thex
allColors = []
# Separate (R, G, B) pairs.
for start, end in zip(hexList[:-1], hexList[1:]):
# Linearly interpolate between pair of hex ###### values and
# add to list.
nInterpolate = 765
for index in range(nInterpolate):
r = _scale(start[1:3], end[1:3], nInterpolate, index)
g = _scale(start[3:5], end[3:5], nInterpolate, index)
b = _scale(start[5:7], end[5:7], nInterpolate, index)
allColors.append("".join(["#", r, g, b]))
# Pick only nColors colors from the total list.
result = []
for counter in range(nColors):
fraction = float(counter) / (nColors - 1)
index = int(fraction * (len(allColors) - 1))
result.append(allColors[index])
return result
def color_brewer(color_code, n=6):
"""
Generate a colorbrewer color scheme of length 'len', type 'scheme.
Live examples can be seen at http://colorbrewer2.org/
"""
maximum_n = 253
minimum_n = 3
if not isinstance(n, int):
raise TypeError("n has to be an int, not a %s" % type(n))
# Raise an error if the n requested is greater than the maximum.
if n > maximum_n:
raise ValueError(
"The maximum number of colors in a"
" ColorBrewer sequential color series is 253",
)
if n < minimum_n:
raise ValueError(
"The minimum number of colors in a"
" ColorBrewer sequential color series is 3",
)
if not isinstance(color_code, str):
raise ValueError(f"color should be a string, not a {type(color_code)}.")
if color_code[-2:] == "_r":
base_code = color_code[:-2]
core_color_code = base_code + "_" + str(n).zfill(2)
color_reverse = True
else:
base_code = color_code
core_color_code = base_code + "_" + str(n).zfill(2)
color_reverse = False
with open(os.path.join(rootpath, "_schemes.json")) as f:
schemes = json.loads(f.read())
with open(os.path.join(rootpath, "scheme_info.json")) as f:
scheme_info = json.loads(f.read())
with open(os.path.join(rootpath, "scheme_base_codes.json")) as f:
core_schemes = json.loads(f.read())["codes"]
if base_code not in core_schemes:
raise ValueError(base_code + " is not a valid ColorBrewer code")
explicit_scheme = True
if schemes.get(core_color_code) is None:
explicit_scheme = False
# Only if n is greater than the scheme length do we interpolate values.
if not explicit_scheme:
# Check to make sure that it is not a qualitative scheme.
if scheme_info[base_code] == "Qualitative":
matching_quals = []
for key in schemes:
if base_code + "_" in key:
matching_quals.append(int(key.split("_")[1]))
raise ValueError(
"Expanded color support is not available"
" for Qualitative schemes; restrict the"
" number of colors for the "
+ base_code
+ " code to between "
+ str(min(matching_quals))
+ " and "
+ str(max(matching_quals)),
)
else:
longest_scheme_name = base_code
longest_scheme_n = 0
for sn_name in schemes.keys():
if "_" not in sn_name:
continue
if sn_name.split("_")[0] != base_code:
continue
if int(sn_name.split("_")[1]) > longest_scheme_n:
longest_scheme_name = sn_name
longest_scheme_n = int(sn_name.split("_")[1])
if not color_reverse:
color_scheme = linear_gradient(schemes.get(longest_scheme_name), n)
else:
color_scheme = linear_gradient(
schemes.get(longest_scheme_name)[::-1],
n,
)
else:
if not color_reverse:
color_scheme = schemes.get(core_color_code, None)
else:
color_scheme = schemes.get(core_color_code, None)[::-1]
return color_scheme
def image_to_url(image, colormap=None, origin="upper"):
"""Infers the type of an image argument and transforms it into a URL.
Parameters
----------
image: string, file or array-like object
* If string, it will be written directly in the output file.
* If file, it's content will be converted as embedded in the
output file.
* If array-like, it will be converted to PNG base64 string and
embedded in the output.
origin : ['upper' | 'lower'], optional, default 'upper'
Place the [0, 0] index of the array in the upper left or
lower left corner of the axes.
colormap : callable, used only for `mono` image.
Function of the form [x -> (r,g,b)] or [x -> (r,g,b,a)]
for transforming a mono image into RGB.
It must output iterables of length 3 or 4, with values between
0. and 1. Hint : you can use colormaps from `matplotlib.cm`.
"""
if hasattr(image, "read"):
# We got an image file.
if hasattr(image, "name"):
# We try to get the image format from the file name.
fileformat = image.name.lower().split(".")[-1]
else:
fileformat = "png"
url = "data:image/{};base64,{}".format(
fileformat,
base64.b64encode(image.read()).decode("utf-8"),
)
elif (not (isinstance(image, str) or isinstance(image, bytes))) and hasattr(
image,
"__iter__",
):
# We got an array-like object.
png = write_png(image, origin=origin, colormap=colormap)
url = "data:image/png;base64," + base64.b64encode(png).decode("utf-8")
else:
# We got an URL.
url = json.loads(json.dumps(image))
return url.replace("\n", " ")
def write_png(
data: Any,
origin: str = "upper",
colormap: Union["ColorMap", Callable, None] = None,
) -> bytes:
"""
Transform an array of data into a PNG string.
This can be written to disk using binary I/O, or encoded using base64
for an inline PNG like this:
>>> png_str = write_png(array)
>>> "data:image/png;base64," + png_str.encode("base64")
Inspired from
http://stackoverflow.com/questions/902761/saving-a-numpy-array-as-an-image
Parameters
----------
data: numpy array or equivalent list-like object.
Must be NxM (mono), NxMx3 (RGB) or NxMx4 (RGBA)
origin : ['upper' | 'lower'], optional, default 'upper'
Place the [0,0] index of the array in the upper left or lower left
corner of the axes.
colormap : ColorMap subclass or callable, optional
Only needed to transform mono images into RGB. You have three options:
- use a subclass of `ColorMap` like `LinearColorMap`
- use a colormap from `matplotlib.cm`
- use a custom function of the form [x -> (r,g,b)] or [x -> (r,g,b,a)].
It must output iterables of length 3 or 4 with values between 0 and 1.
Returns
-------
PNG formatted byte string
"""
from branca.colormap import ColorMap
if np is None:
raise ImportError("The NumPy package is required" " for this functionality")
if isinstance(colormap, ColorMap):
colormap_callable = colormap.rgba_floats_tuple
elif callable(colormap):
colormap_callable = colormap
else:
colormap_callable = lambda x: (x, x, x, 1) # noqa E731
array = np.atleast_3d(data)
height, width, nblayers = array.shape
if nblayers not in [1, 3, 4]:
raise ValueError("Data must be NxM (mono), " "NxMx3 (RGB), or NxMx4 (RGBA)")
assert array.shape == (height, width, nblayers)
if nblayers == 1:
array = np.array(list(map(colormap_callable, array.ravel())))
nblayers = array.shape[1]
if nblayers not in [3, 4]:
raise ValueError(
"colormap must provide colors of" "length 3 (RGB) or 4 (RGBA)",
)
array = array.reshape((height, width, nblayers))
assert array.shape == (height, width, nblayers)
if nblayers == 3:
array = np.concatenate((array, np.ones((height, width, 1))), axis=2)
nblayers = 4
assert array.shape == (height, width, nblayers)
assert nblayers == 4
# Normalize to uint8 if it isn't already.
if array.dtype != "uint8":
with np.errstate(divide="ignore", invalid="ignore"):
array = array * 255.0 / array.max(axis=(0, 1)).reshape((1, 1, 4))
array[~np.isfinite(array)] = 0
array = array.astype("uint8")
# Eventually flip the image.
if origin == "lower":
array = array[::-1, :, :]
# Transform the array to bytes.
raw_data = b"".join([b"\x00" + array[i, :, :].tobytes() for i in range(height)])
def png_pack(png_tag, data):
chunk_head = png_tag + data
return (
struct.pack("!I", len(data))
+ chunk_head
+ struct.pack("!I", 0xFFFFFFFF & zlib.crc32(chunk_head))
)
return b"".join(
[
b"\x89PNG\r\n\x1a\n",
png_pack(b"IHDR", struct.pack("!2I5B", width, height, 8, 6, 0, 0, 0)),
png_pack(b"IDAT", zlib.compress(raw_data, 9)),
png_pack(b"IEND", b""),
],
)
def _camelify(out):
return (
(
"".join(
[
(
"_" + x.lower()
if i < len(out) - 1
and x.isupper()
and out[i + 1].islower() # noqa
else (
x.lower() + "_"
if i < len(out) - 1
and x.islower()
and out[i + 1].isupper() # noqa
else x.lower()
)
)
for i, x in enumerate(list(out))
],
)
)
.lstrip("_")
.replace("__", "_")
) # noqa
def _parse_size(value):
if isinstance(value, (int, float)):
return float(value), "px"
elif isinstance(value, str):
# match digits or a point, possibly followed by a space,
# followed by a unit: either 1 to 5 letters or a percent sign
match = re.fullmatch(r"([\d.]+)\s?(\w{1,5}|%)", value.strip())
if match:
return float(match.group(1)), match.group(2)
else:
raise ValueError(
f"Cannot parse {value!r}, it should be a number followed by a unit.",
)
elif (
isinstance(value, tuple)
and isinstance(value[0], (int, float))
and isinstance(value[1], str)
):
# value had been already parsed
return (float(value[0]), value[1])
else:
raise TypeError(
f"Cannot parse {value!r}, it should be a number or a string containing a number and a unit.",
)
def _locations_mirror(x):
"""Mirrors the points in a list-of-list-of-...-of-list-of-points.
For example:
>>> _locations_mirror([[[1, 2], [3, 4]], [5, 6], [7, 8]])
[[[2, 1], [4, 3]], [6, 5], [8, 7]]
"""
if hasattr(x, "__iter__"):
if hasattr(x[0], "__iter__"):
return list(map(_locations_mirror, x))
else:
return list(x[::-1])
else:
return x
def _locations_tolist(x):
"""Transforms recursively a list of iterables into a list of list."""
if hasattr(x, "__iter__"):
return list(map(_locations_tolist, x))
else:
return x
def none_min(x, y):
if x is None:
return y
elif y is None:
return x
else:
return min(x, y)
def none_max(x, y):
if x is None:
return y
elif y is None:
return x
else:
return max(x, y)
def iter_points(x):
"""Iterates over a list representing a feature, and returns a list of points,
whatever the shape of the array (Point, MultiPolyline, etc).
"""
if isinstance(x, (list, tuple)):
if len(x):
if isinstance(x[0], (list, tuple)):
out = []
for y in x:
out += iter_points(y)
return out
else:
return [x]
else:
return []
else:
raise ValueError(f"List/tuple type expected. Got {x!r}.")

View File

@@ -0,0 +1,20 @@
This package contains a modified version of ca-bundle.crt:
ca-bundle.crt -- Bundle of CA Root Certificates
This is a bundle of X.509 certificates of public Certificate Authorities
(CA). These were automatically extracted from Mozilla's root certificates
file (certdata.txt). This file can be found in the mozilla source tree:
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
It contains the certificates in PEM format and therefore
can be directly used with curl / libcurl / php_curl, or with
an Apache+mod_ssl webserver for SSL client authentication.
Just configure this file as the SSLCACertificateFile.#
***** BEGIN LICENSE BLOCK *****
This Source Code Form is subject to the terms of the Mozilla Public License,
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
one at http://mozilla.org/MPL/2.0/.
***** END LICENSE BLOCK *****
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $

View File

@@ -0,0 +1,67 @@
Metadata-Version: 2.1
Name: certifi
Version: 2024.7.4
Summary: Python package for providing Mozilla's CA Bundle.
Home-page: https://github.com/certifi/python-certifi
Author: Kenneth Reitz
Author-email: me@kennethreitz.com
License: MPL-2.0
Project-URL: Source, https://github.com/certifi/python-certifi
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
Classifier: Natural Language :: English
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.6
License-File: LICENSE
Certifi: Python SSL Certificates
================================
Certifi provides Mozilla's carefully curated collection of Root Certificates for
validating the trustworthiness of SSL certificates while verifying the identity
of TLS hosts. It has been extracted from the `Requests`_ project.
Installation
------------
``certifi`` is available on PyPI. Simply install it with ``pip``::
$ pip install certifi
Usage
-----
To reference the installed certificate authority (CA) bundle, you can use the
built-in function::
>>> import certifi
>>> certifi.where()
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
Or from the command line::
$ python -m certifi
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
Enjoy!
.. _`Requests`: https://requests.readthedocs.io/en/master/
Addition/Removal of Certificates
--------------------------------
Certifi does not support any addition/removal or other modification of the
CA trust store content. This project is intended to provide a reliable and
highly portable root of trust to python deployments. Look to upstream projects
for methods to use alternate trust.

View File

@@ -0,0 +1,14 @@
certifi-2024.7.4.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
certifi-2024.7.4.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
certifi-2024.7.4.dist-info/METADATA,sha256=L9_EuPoQQvHFzxu03_ctaEZxhEty7inz569jGWjlLGo,2221
certifi-2024.7.4.dist-info/RECORD,,
certifi-2024.7.4.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
certifi-2024.7.4.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
certifi/__init__.py,sha256=LHXz7E80YJYBzCBv6ZyidQ5-ciYSkSebpY2E5OM0l7o,94
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
certifi/__pycache__/__init__.cpython-311.pyc,,
certifi/__pycache__/__main__.cpython-311.pyc,,
certifi/__pycache__/core.cpython-311.pyc,,
certifi/cacert.pem,sha256=SIupYGAr8HzGP073rsEIaS_sQYIPwzKKjj894DgUmu4,291528
certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

View File

@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (70.2.0)
Root-Is-Purelib: true
Tag: py3-none-any

View File

@@ -0,0 +1,4 @@
from .core import contents, where
__all__ = ["contents", "where"]
__version__ = "2024.07.04"

View File

@@ -0,0 +1,12 @@
import argparse
from certifi import contents, where
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()
if args.contents:
print(contents())
else:
print(where())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,114 @@
"""
certifi.py
~~~~~~~~~~
This module returns the installation location of cacert.pem or its contents.
"""
import sys
import atexit
def exit_cacert_ctx() -> None:
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
if sys.version_info >= (3, 11):
from importlib.resources import as_file, files
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the file
# in cases where we're inside of a zipimport situation until someone
# actually calls where(), but we don't want to re-extract the file
# on every call of where(), so we'll do it once then store it in a
# global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you to
# manage the cleanup of this file, so it doesn't actually return a
# path, it returns a context manager that will give you the path
# when you enter it and will do any cleanup when you leave it. In
# the common case of not needing a temporary file, it will just
# return the file system location and the __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
elif sys.version_info >= (3, 7):
from importlib.resources import path as get_path, read_text
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the
# file in cases where we're inside of a zipimport situation until
# someone actually calls where(), but we don't want to re-extract
# the file on every call of where(), so we'll do it once then store
# it in a global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you
# to manage the cleanup of this file, so it doesn't actually
# return a path, it returns a context manager that will give
# you the path when you enter it and will do any cleanup when
# you leave it. In the common case of not needing a temporary
# file, it will just return the file system location and the
# __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = get_path("certifi", "cacert.pem")
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return read_text("certifi", "cacert.pem", encoding="ascii")
else:
import os
import types
from typing import Union
Package = Union[types.ModuleType, str]
Resource = Union[str, "os.PathLike"]
# This fallback will work for Python versions prior to 3.7 that lack the
# importlib.resources module but relies on the existing `where` function
# so won't address issues with environments like PyOxidizer that don't set
# __file__ on modules.
def read_text(
package: Package,
resource: Resource,
encoding: str = 'utf-8',
errors: str = 'strict'
) -> str:
with open(where(), encoding=encoding) as data:
return data.read()
# If we don't have importlib.resources, then we will just do the old logic
# of assuming we're on the filesystem and munge the path directly.
def where() -> str:
f = os.path.dirname(__file__)
return os.path.join(f, "cacert.pem")
def contents() -> str:
return read_text("certifi", "cacert.pem", encoding="ascii")

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,683 @@
Metadata-Version: 2.1
Name: charset-normalizer
Version: 3.3.2
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
Home-page: https://github.com/Ousret/charset_normalizer
Author: Ahmed TAHRI
Author-email: ahmed.tahri@cloudnursery.dev
License: MIT
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Text Processing :: Linguistic
Classifier: Topic :: Utilities
Classifier: Typing :: Typed
Requires-Python: >=3.7.0
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: unicode_backport
<h1 align="center">Charset Detection, for Everyone 👋</h1>
<p align="center">
<sup>The Real First Universal Charset Detector</sup><br>
<a href="https://pypi.org/project/charset-normalizer">
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
</a>
<a href="https://pepy.tech/project/charset-normalizer/">
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
</a>
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
</a>
</p>
<p align="center">
<sup><i>Featured Packages</i></sup><br>
<a href="https://github.com/jawah/niquests">
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-HTTP_1.1%2C%202%2C_and_3_Client-cyan">
</a>
<a href="https://github.com/jawah/wassima">
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
</a>
</p>
<p align="center">
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
<a href="https://github.com/nickspring/charset-normalizer-rs">
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
</a>
</p>
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
> I'm trying to resolve the issue by taking a new approach.
> All IANA character set names for which the Python core library provides codecs are supported.
<p align="center">
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
</p>
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
| `Fast` | ❌ | ✅ | ✅ |
| `Universal**` | ❌ | ✅ | ❌ |
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
| `Native Python` | ✅ | ✅ | ❌ |
| `Detect spoken language` | ❌ | ✅ | N/A |
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
<p align="center">
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
</p>
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
## ⚡ Performance
This package offer better performance than its counterpart Chardet. Here are some numbers.
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
| charset-normalizer | 100 ms | 50 ms | 5 ms |
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
> (eg. Supported Encoding) Challenge-them if you want.
## ✨ Installation
Using pip:
```sh
pip install charset-normalizer -U
```
## 🚀 Basic Usage
### CLI
This package comes with a CLI.
```
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
file [file ...]
The Real First Universal Charset Detector. Discover originating encoding used
on text file. Normalize text to unicode.
positional arguments:
files File(s) to be analysed
optional arguments:
-h, --help show this help message and exit
-v, --verbose Display complementary information about file if any.
Stdout will contain logs about the detection process.
-a, --with-alternative
Output complementary possibilities if any. Top-level
JSON WILL be a list.
-n, --normalize Permit to normalize input file. If not set, program
does not write anything.
-m, --minimal Only output the charset detected to STDOUT. Disabling
JSON output.
-r, --replace Replace file when trying to normalize it instead of
creating a new one.
-f, --force Replace file without asking if you are sure, use this
flag with caution.
-t THRESHOLD, --threshold THRESHOLD
Define a custom maximum amount of chaos allowed in
decoded content. 0. <= chaos <= 1.
--version Show version information and exit.
```
```bash
normalizer ./data/sample.1.fr.srt
```
or
```bash
python -m charset_normalizer ./data/sample.1.fr.srt
```
🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
```json
{
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
"encoding": "cp1252",
"encoding_aliases": [
"1252",
"windows_1252"
],
"alternative_encodings": [
"cp1254",
"cp1256",
"cp1258",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
"mbcs"
],
"language": "French",
"alphabets": [
"Basic Latin",
"Latin-1 Supplement"
],
"has_sig_or_bom": false,
"chaos": 0.149,
"coherence": 97.152,
"unicode_path": null,
"is_preferred": true
}
```
### Python
*Just print out normalized text*
```python
from charset_normalizer import from_path
results = from_path('./my_subtitle.srt')
print(str(results.best()))
```
*Upgrade your code without effort*
```python
from charset_normalizer import detect
```
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
## 😇 Why
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
reliable alternative using a completely different method. Also! I never back down on a good challenge!
I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical rendered string.**
What I want is to get readable text, the best I can.
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
## 🍰 How
- Discard all charset encoding table that could not fit the binary content.
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
- Extract matches with the lowest mess detected.
- Additionally, we measure coherence / probe for a language.
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
*Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
improve or rewrite it.
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
## ⚡ Known limitations
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
- Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
## ⚠️ About Python EOLs
**If you are running:**
- Python >=2.7,<3.5: Unsupported
- Python 3.5: charset-normalizer < 2.1
- Python 3.6: charset-normalizer < 3.1
- Python 3.7: charset-normalizer < 4.0
Upgrade your Python interpreter as soon as possible.
## 👤 Contributing
Contributions, issues and feature requests are very much welcome.<br />
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
## 📝 License
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
## 💼 For Enterprise
Professional support for charset-normalizer is available as part of the [Tidelift
Subscription][1]. Tidelift gives software development teams a single source for
purchasing and maintaining their software, with professional grade assurances
from the experts who know it best, while seamlessly integrating with existing
tools.
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
# Changelog
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
### Fixed
- Unintentional memory usage regression when using large payload that match several encoding (#376)
- Regression on some detection case showcased in the documentation (#371)
### Added
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
### Changed
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
- Improved the general detection reliability based on reports from the community
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
### Added
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
### Removed
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
### Changed
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
### Fixed
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
### Changed
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
- Minor improvement over the global detection reliability
### Added
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
- Explicit support for Python 3.12
### Fixed
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
### Added
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
### Removed
- Support for Python 3.6 (PR #260)
### Changed
- Optional speedup provided by mypy/c 1.0.1
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
### Fixed
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
### Changed
- Speedup provided by mypy/c 0.990 on Python >= 3.7
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Fixed
- CLI with opt --normalize fail when using full path for files
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
- Sphinx warnings when generating the documentation
### Removed
- Coherence detector no longer return 'Simple English' instead return 'English'
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
- Breaking: Method `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
### Fixed
- CLI with opt --normalize fail when using full path for files
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
### Removed
- Coherence detector no longer return 'Simple English' instead return 'English'
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
### Added
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
### Removed
- Breaking: Method `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
### Fixed
- Sphinx warnings when generating the documentation
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
### Changed
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Removed
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
### Deprecated
- Function `normalize` scheduled for removal in 3.0
### Changed
- Removed useless call to decode in fn is_unprintable (#206)
### Fixed
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
### Added
- Output the Unicode table version when running the CLI with `--version` (PR #194)
### Changed
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
### Fixed
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
### Removed
- Support for Python 3.5 (PR #192)
### Deprecated
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
### Fixed
- ASCII miss-detection on rare cases (PR #170)
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
### Added
- Explicit support for Python 3.11 (PR #164)
### Changed
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
### Fixed
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
### Changed
- Skipping the language-detection (CD) on ASCII (PR #155)
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
### Changed
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
### Fixed
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
### Changed
- Improvement over Vietnamese detection (PR #126)
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
### Fixed
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
- Avoid using too insignificant chunk (PR #137)
### Added
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
### Added
- Add support for Kazakh (Cyrillic) language detection (PR #109)
### Changed
- Further, improve inferring the language from a given single-byte code page (PR #112)
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
- Various detection improvement (MD+CD) (PR #117)
### Removed
- Remove redundant logging entry about detected language(s) (PR #115)
### Fixed
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
### Fixed
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
- Fix CLI crash when using --minimal output in certain cases (PR #103)
### Changed
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
### Changed
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
- The Unicode detection is slightly improved (PR #93)
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
### Removed
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
### Fixed
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
- The MANIFEST.in was not exhaustive (PR #78)
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
### Fixed
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
- Submatch factoring could be wrong in rare edge cases (PR #72)
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
- Fix line endings from CRLF to LF for certain project files (PR #67)
### Changed
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
- Allow fallback on specified encoding if any (PR #71)
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
### Changed
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
### Fixed
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
### Changed
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
### Fixed
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
### Changed
- Public function normalize default args values were not aligned with from_bytes (PR #53)
### Added
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
### Changed
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
- Accent has been made on UTF-8 detection, should perform rather instantaneous.
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
- utf_7 detection has been reinstated.
### Removed
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
- Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
- The exception hook on UnicodeDecodeError has been removed.
### Deprecated
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
### Fixed
- The CLI output used the relative path of the file(s). Should be absolute.
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
### Fixed
- Logger configuration/usage no longer conflict with others (PR #44)
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
### Removed
- Using standard logging instead of using the package loguru.
- Dropping nose test framework in favor of the maintained pytest.
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
- Stop support for UTF-7 that does not contain a SIG.
- Dropping PrettyTable, replaced with pure JSON output in CLI.
### Fixed
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
- Not searching properly for the BOM when trying utf32/16 parent codec.
### Changed
- Improving the package final size by compressing frequencies.json.
- Huge improvement over the larges payload.
### Added
- CLI now produces JSON consumable output.
- Return ASCII if given sequences fit. Given reasonable confidence.
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
### Fixed
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
### Fixed
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
### Fixed
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
### Changed
- Amend the previous release to allow prettytable 2.0 (PR #35)
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
### Fixed
- Fix error while using the package with a python pre-release interpreter (PR #33)
### Changed
- Dependencies refactoring, constraints revised.
### Added
- Add python 3.9 and 3.10 to the supported interpreters
MIT License
Copyright (c) 2019 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,35 @@
../../../bin/normalizer,sha256=R5O6Ltc3o7b7rmWTAR5oe51zIvqulOK7eLpFO0Tnqgg,289
charset_normalizer-3.3.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
charset_normalizer-3.3.2.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
charset_normalizer-3.3.2.dist-info/METADATA,sha256=cfLhl5A6SI-F0oclm8w8ux9wshL1nipdeCdVnYb4AaA,33550
charset_normalizer-3.3.2.dist-info/RECORD,,
charset_normalizer-3.3.2.dist-info/WHEEL,sha256=48wUIcZcdQ2pWN7qt0HP02Cvv6HIQZGsSgx3PsepNj8,152
charset_normalizer-3.3.2.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
charset_normalizer-3.3.2.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
charset_normalizer/__init__.py,sha256=UzI3xC8PhmcLRMzSgPb6minTmRq0kWznnCBJ8ZCc2XI,1577
charset_normalizer/__main__.py,sha256=JxY8bleaENOFlLRb9HfoeZCzAMnn2A1oGR5Xm2eyqg0,73
charset_normalizer/__pycache__/__init__.cpython-311.pyc,,
charset_normalizer/__pycache__/__main__.cpython-311.pyc,,
charset_normalizer/__pycache__/api.cpython-311.pyc,,
charset_normalizer/__pycache__/cd.cpython-311.pyc,,
charset_normalizer/__pycache__/constant.cpython-311.pyc,,
charset_normalizer/__pycache__/legacy.cpython-311.pyc,,
charset_normalizer/__pycache__/md.cpython-311.pyc,,
charset_normalizer/__pycache__/models.cpython-311.pyc,,
charset_normalizer/__pycache__/utils.cpython-311.pyc,,
charset_normalizer/__pycache__/version.cpython-311.pyc,,
charset_normalizer/api.py,sha256=WOlWjy6wT8SeMYFpaGbXZFN1TMXa-s8vZYfkL4G29iQ,21097
charset_normalizer/cd.py,sha256=xwZliZcTQFA3jU0c00PRiu9MNxXTFxQkFLWmMW24ZzI,12560
charset_normalizer/cli/__init__.py,sha256=D5ERp8P62llm2FuoMzydZ7d9rs8cvvLXqE-1_6oViPc,100
charset_normalizer/cli/__main__.py,sha256=2F-xURZJzo063Ye-2RLJ2wcmURpbKeAzKwpiws65dAs,9744
charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc,,
charset_normalizer/cli/__pycache__/__main__.cpython-311.pyc,,
charset_normalizer/constant.py,sha256=p0IsOVcEbPWYPOdWhnhRbjK1YVBy6fs05C5vKC-zoxU,40481
charset_normalizer/legacy.py,sha256=T-QuVMsMeDiQEk8WSszMrzVJg_14AMeSkmHdRYhdl1k,2071
charset_normalizer/md.cpython-311-x86_64-linux-gnu.so,sha256=Y7QSLD5QLoSFAWys0-tL7R6QB7oi5864zM6zr7RWek4,16064
charset_normalizer/md.py,sha256=NkSuVLK13_a8c7BxZ4cGIQ5vOtGIWOdh22WZEvjp-7U,19624
charset_normalizer/md__mypyc.cpython-311-x86_64-linux-gnu.so,sha256=93T0C_hoJxReTevc7NpjM7P7fae_U-scv5B-AhkKKtY,264392
charset_normalizer/models.py,sha256=I5i0s4aKCCgLPY2tUY3pwkgFA-BUbbNxQ7hVkVTt62s,11624
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
charset_normalizer/utils.py,sha256=teiosMqzKjXyAHXnGdjSBOgnBZwx-SkBbCLrx0UXy8M,11894
charset_normalizer/version.py,sha256=iHKUfHD3kDRSyrh_BN2ojh43TA5-UZQjvbVIEFfpHDs,79

View File

@@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.41.2)
Root-Is-Purelib: false
Tag: cp311-cp311-manylinux_2_17_x86_64
Tag: cp311-cp311-manylinux2014_x86_64

View File

@@ -0,0 +1,2 @@
[console_scripts]
normalizer = charset_normalizer.cli:cli_detect

View File

@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
The Real First Universal Charset Detector.
A library that helps you read text from an unknown charset encoding.
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
All IANA character set names for which the Python core library provides codecs are supported.
Basic usage:
>>> from charset_normalizer import from_bytes
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
>>> best_guess = results.best()
>>> str(best_guess)
'Bсеки човек има право на образование. Oбразованието!'
Others methods and usages are available - see the full documentation
at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging
from .api import from_bytes, from_fp, from_path, is_binary
from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__
__all__ = (
"from_fp",
"from_path",
"from_bytes",
"is_binary",
"detect",
"CharsetMatch",
"CharsetMatches",
"__version__",
"VERSION",
"set_logging_handler",
)
# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())

View File

@@ -0,0 +1,4 @@
from .cli import cli_detect
if __name__ == "__main__":
cli_detect()

View File

@@ -0,0 +1,626 @@
import logging
from os import PathLike
from typing import BinaryIO, List, Optional, Set, Union
from .cd import (
coherence_ratio,
encoding_languages,
mb_encoding_languages,
merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
is_multi_byte_encoding,
should_strip_sig_or_bom,
)
# Will most likely be controversial
# logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
def from_bytes(
sequences: Union[bytes, bytearray],
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
If there is no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
but never take it for granted. Can improve the performance.
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
purpose.
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
Custom logging format and handler can be set manually.
"""
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
"Expected object of type bytes or bytearray, got: {0}".format(
type(sequences)
)
)
if explain:
previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)
length: int = len(sequences)
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
if cp_isolation is not None:
logger.log(
TRACE,
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
)
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
else:
cp_isolation = []
if cp_exclusion is not None:
logger.log(
TRACE,
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
)
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
else:
cp_exclusion = []
if length <= (chunk_size * steps):
logger.log(
TRACE,
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
length,
)
steps = 1
chunk_size = length
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
if is_too_small_sequence:
logger.log(
TRACE,
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
),
)
elif is_too_large_sequence:
logger.log(
TRACE,
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
),
)
prioritized_encodings: List[str] = []
specified_encoding: Optional[str] = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
logger.log(
TRACE,
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
tested: Set[str] = set()
tested_but_hard_failure: List[str] = []
tested_but_soft_failure: List[str] = []
fallback_ascii: Optional[CharsetMatch] = None
fallback_u8: Optional[CharsetMatch] = None
fallback_specified: Optional[CharsetMatch] = None
results: CharsetMatches = CharsetMatches()
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
logger.log(
TRACE,
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
)
prioritized_encodings.append("ascii")
if "utf_8" not in prioritized_encodings:
prioritized_encodings.append("utf_8")
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
if cp_isolation and encoding_iana not in cp_isolation:
continue
if cp_exclusion and encoding_iana in cp_exclusion:
continue
if encoding_iana in tested:
continue
tested.add(encoding_iana)
decoded_payload: Optional[str] = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
)
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
continue
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
encoding_iana,
)
continue
try:
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
"Encoding %s does not provide an IncrementalDecoder",
encoding_iana,
)
continue
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)],
encoding=encoding_iana,
)
else:
decoded_payload = str(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :],
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
logger.log(
TRACE,
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
similar_soft_failure_test: bool = False
for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
similar_soft_failure_test = True
break
if similar_soft_failure_test:
logger.log(
TRACE,
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
)
continue
r_ = range(
0 if not bom_or_sig_available else len(sig_payload),
length,
int(length / steps),
)
multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
)
if multi_byte_bonus:
logger.log(
TRACE,
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
)
max_chunk_gave_up: int = int(len(r_) / 4)
max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count: int = 0
lazy_str_hard_failure = False
md_chunks: List[str] = []
md_ratios = []
try:
for chunk in cut_sequence_chunks(
sequences,
encoding_iana,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
decoded_payload,
):
md_chunks.append(chunk)
md_ratios.append(
mess_ratio(
chunk,
threshold,
explain is True and 1 <= len(cp_isolation) <= 2,
)
)
if md_ratios[-1] >= threshold:
early_stop_count += 1
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
except (
UnicodeDecodeError
) as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
if (
not lazy_str_hard_failure
and is_too_large_sequence
and not is_multi_byte_decoder
):
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.log(
TRACE,
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
TRACE,
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3),
)
# Preparing those fallbacks in case we got nothing.
if (
enable_fallback
and encoding_iana in ["ascii", "utf_8", specified_encoding]
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences, encoding_iana, threshold, False, [], decoded_payload
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
elif encoding_iana == "ascii":
fallback_ascii = fallback_entry
else:
fallback_u8 = fallback_entry
continue
logger.log(
TRACE,
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3),
)
if not is_multi_byte_decoder:
target_languages: List[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
if target_languages:
logger.log(
TRACE,
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
),
)
cd_ratios = []
# We shall skip the CD when its about ASCII
# Most of the time its not relevant to run "language-detection" on it.
if encoding_iana != "ascii":
for chunk in md_chunks:
chunk_languages = coherence_ratio(
chunk,
language_threshold,
",".join(target_languages) if target_languages else None,
)
cd_ratios.append(chunk_languages)
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
if cd_ratios_merged:
logger.log(
TRACE,
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
),
)
results.append(
CharsetMatch(
sequences,
encoding_iana,
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
decoded_payload,
)
)
if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
logger.debug(
"Encoding detection: %s is most likely the one.", encoding_iana
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if encoding_iana == sig_encoding:
logger.debug(
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
"the beginning of the sequence.",
encoding_iana,
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if len(results) == 0:
if fallback_u8 or fallback_ascii or fallback_specified:
logger.log(
TRACE,
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
)
if fallback_specified:
logger.debug(
"Encoding detection: %s will be used as a fallback match",
fallback_specified.encoding,
)
results.append(fallback_specified)
elif (
(fallback_u8 and fallback_ascii is None)
or (
fallback_u8
and fallback_ascii
and fallback_u8.fingerprint != fallback_ascii.fingerprint
)
or (fallback_u8 is not None)
):
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
logger.debug("Encoding detection: ascii will be used as a fallback match")
results.append(fallback_ascii)
if results:
logger.debug(
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
results.best().encoding, # type: ignore
len(results) - 1,
)
else:
logger.debug("Encoding detection: Unable to determine any suitable charset.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return results
def from_fp(
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but using a file pointer that is already ready.
Will not close the file pointer.
"""
return from_bytes(
fp.read(),
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def from_path(
path: Union[str, bytes, PathLike], # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
Can raise IOError.
"""
with open(path, "rb") as fp:
return from_fp(
fp,
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def is_binary(
fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = False,
) -> bool:
"""
Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
are disabled to be stricter around ASCII-compatible but unlikely to be a string.
"""
if isinstance(fp_or_path_or_payload, (str, PathLike)):
guesses = from_path(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
elif isinstance(
fp_or_path_or_payload,
(
bytes,
bytearray,
),
):
guesses = from_bytes(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
else:
guesses = from_fp(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
return not guesses

View File

@@ -0,0 +1,395 @@
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
from .constant import (
FREQUENCIES,
KO_NAMES,
LANGUAGE_SUPPORTED_COUNT,
TOO_SMALL_SEQUENCE,
ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
is_accentuated,
is_latin,
is_multi_byte_encoding,
is_unicode_range_secondary,
unicode_range,
)
def encoding_unicode_range(iana_name: str) -> List[str]:
"""
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise IOError("Function not supported on multi-byte code page")
decoder = importlib.import_module(
"encodings.{}".format(iana_name)
).IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: Dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
character_range: Optional[str] = unicode_range(chunk)
if character_range is None:
continue
if is_unicode_range_secondary(character_range) is False:
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
]
)
def unicode_range_languages(primary_range: str) -> List[str]:
"""
Return inferred languages used with a unicode range.
"""
languages: List[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
if unicode_range(character) == primary_range:
languages.append(language)
break
return languages
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
"""
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
primary_range: Optional[str] = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
primary_range = specified_range
break
if primary_range is None:
return ["Latin Based"]
return unicode_range_languages(primary_range)
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
if (
iana_name.startswith("shift_")
or iana_name.startswith("iso2022_jp")
or iana_name.startswith("euc_j")
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]
return []
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
target_have_accents: bool = False
target_pure_latin: bool = True
for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
target_have_accents = True
if target_pure_latin and is_latin(character) is False:
target_pure_latin = False
return target_have_accents, target_pure_latin
def alphabet_languages(
characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
"""
Return associated languages associated to given characters.
"""
languages: List[Tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
for language, language_characters in FREQUENCIES.items():
target_have_accents, target_pure_latin = get_target_features(language)
if ignore_non_latin and target_pure_latin is False:
continue
if target_have_accents is False and source_have_accents:
continue
character_count: int = len(language_characters)
character_match_count: int = len(
[c for c in language_characters if c in characters]
)
ratio: float = character_match_count / character_count
if ratio >= 0.2:
languages.append((language, ratio))
languages = sorted(languages, key=lambda x: x[1], reverse=True)
return [compatible_language[0] for compatible_language in languages]
def characters_popularity_compare(
language: str, ordered_characters: List[str]
) -> float:
"""
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
"""
if language not in FREQUENCIES:
raise ValueError("{} not available".format(language))
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
ordered_characters_count: int = len(ordered_characters)
target_language_characters_count: int = len(FREQUENCIES[language])
large_alphabet: bool = target_language_characters_count > 26
for character, character_rank in zip(
ordered_characters, range(0, ordered_characters_count)
):
if character not in FREQUENCIES_language_set:
continue
character_rank_in_language: int = FREQUENCIES[language].index(character)
expected_projection_ratio: float = (
target_language_characters_count / ordered_characters_count
)
character_rank_projection: int = int(character_rank * expected_projection_ratio)
if (
large_alphabet is False
and abs(character_rank_projection - character_rank_in_language) > 4
):
continue
if (
large_alphabet is True
and abs(character_rank_projection - character_rank_in_language)
< target_language_characters_count / 3
):
character_approved_count += 1
continue
characters_before_source: List[str] = FREQUENCIES[language][
0:character_rank_in_language
]
characters_after_source: List[str] = FREQUENCIES[language][
character_rank_in_language:
]
characters_before: List[str] = ordered_characters[0:character_rank]
characters_after: List[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
)
after_match_count: int = len(
set(characters_after) & set(characters_after_source)
)
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
continue
if len(characters_after_source) == 0 and after_match_count <= 4:
character_approved_count += 1
continue
if (
before_match_count / len(characters_before_source) >= 0.4
or after_match_count / len(characters_after_source) >= 0.4
):
character_approved_count += 1
continue
return character_approved_count / len(ordered_characters)
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
"""
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
One containing the latin letters and the other hebrew.
"""
layers: Dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
layer_target_range: Optional[str] = None
for discovered_range in layers:
if (
is_suspiciously_successive_range(discovered_range, character_range)
is False
):
layer_target_range = discovered_range
break
if layer_target_range is None:
layer_target_range = character_range
if layer_target_range not in layers:
layers[layer_target_range] = character.lower()
continue
layers[layer_target_range] += character.lower()
return list(layers.values())
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
"""
This function merge results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
per_language_ratios: Dict[str, List[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
if language not in per_language_ratios:
per_language_ratios[language] = [ratio]
continue
per_language_ratios[language].append(ratio)
merge = [
(
language,
round(
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
4,
),
)
for language in per_language_ratios
]
return sorted(merge, key=lambda x: x[1], reverse=True)
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
"""
We shall NOT return "English—" in CoherenceMatches because it is an alternative
of "English". This function only keeps the best match and remove the em-dash in it.
"""
index_results: Dict[str, List[float]] = dict()
for result in results:
language, ratio = result
no_em_name: str = language.replace("", "")
if no_em_name not in index_results:
index_results[no_em_name] = []
index_results[no_em_name].append(ratio)
if any(len(index_results[e]) > 1 for e in index_results):
filtered_results: CoherenceMatches = []
for language in index_results:
filtered_results.append((language, max(index_results[language])))
return filtered_results
return results
@lru_cache(maxsize=2048)
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
results: List[Tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies: TypeCounter[str] = Counter(layer)
most_common = sequence_frequencies.most_common()
character_count: int = sum(o for c, o in most_common)
if character_count <= TOO_SMALL_SEQUENCE:
continue
popular_character_ordered: List[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio: float = characters_popularity_compare(
language, popular_character_ordered
)
if ratio < threshold:
continue
elif ratio >= 0.8:
sufficient_match_count += 1
results.append((language, round(ratio, 4)))
if sufficient_match_count >= 3:
break
return sorted(
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
)

View File

@@ -0,0 +1,6 @@
from .__main__ import cli_detect, query_yes_no
__all__ = (
"cli_detect",
"query_yes_no",
)

View File

@@ -0,0 +1,296 @@
import argparse
import sys
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from typing import List, Optional
from unicodedata import unidata_version
import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
def query_yes_no(question: str, default: str = "yes") -> bool:
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
"default" is the presumed answer if the user just hits <Enter>.
It must be "yes" (the default), "no" or None (meaning
an answer is required of the user).
The "answer" return value is True for "yes" or False for "no".
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
"""
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
prompt = " [Y/n] "
elif default == "no":
prompt = " [y/N] "
else:
raise ValueError("invalid default answer: '%s'" % default)
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
def cli_detect(argv: Optional[List[str]] = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
:return: 0 if everything is fine, anything else equal trouble
"""
parser = argparse.ArgumentParser(
description="The Real First Universal Charset Detector. "
"Discover originating encoding used on text file. "
"Normalize text to unicode."
)
parser.add_argument(
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Display complementary information about file if any. "
"Stdout will contain logs about the detection process.",
)
parser.add_argument(
"-a",
"--with-alternative",
action="store_true",
default=False,
dest="alternatives",
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
)
parser.add_argument(
"-n",
"--normalize",
action="store_true",
default=False,
dest="normalize",
help="Permit to normalize input file. If not set, program does not write anything.",
)
parser.add_argument(
"-m",
"--minimal",
action="store_true",
default=False,
dest="minimal",
help="Only output the charset detected to STDOUT. Disabling JSON output.",
)
parser.add_argument(
"-r",
"--replace",
action="store_true",
default=False,
dest="replace",
help="Replace file when trying to normalize it instead of creating a new one.",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
default=False,
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-t",
"--threshold",
action="store",
default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
)
parser.add_argument(
"--version",
action="version",
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
__version__,
python_version(),
unidata_version,
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
),
help="Show version information and exit.",
)
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1
if args.threshold < 0.0 or args.threshold > 1.0:
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
best_guess = matches.best()
if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else "",
),
file=sys.stderr,
)
x_.append(
CliDetectionResult(
abspath(my_file.name),
None,
[],
[],
"Unknown",
[],
False,
1.0,
0.0,
None,
True,
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
)
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != best_guess:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[
cp
for cp in el.could_be_from_charset
if cp != el.encoding
],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False,
)
)
if args.normalize is True:
if best_guess.encoding.startswith("utf") is True:
print(
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
my_file.name
),
file=sys.stderr,
)
if my_file.closed is False:
my_file.close()
continue
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
o_: List[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
if my_file.closed is False:
my_file.close()
elif (
args.force is False
and query_yes_no(
'Are you sure to normalize "{}" by replacing it ?'.format(
my_file.name
),
"no",
)
is False
):
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if my_file.closed is False:
my_file.close()
if args.minimal is False:
print(
dumps(
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4,
)
)
else:
for my_file in args.files:
print(
", ".join(
[
el.encoding or "undefined"
for el in x_
if el.path == abspath(my_file.name)
]
)
)
return 0
if __name__ == "__main__":
cli_detect()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,54 @@
from typing import Any, Dict, Optional, Union
from warnings import warn
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE
def detect(
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
"""
chardet legacy method
Detect the encoding of the given byte string. It should be mostly backward-compatible.
Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
This function is deprecated and should be used to migrate your project easily, consult the documentation for
further information. Not planned for removal.
:param byte_str: The byte sequence to examine.
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
"""
if len(kwargs):
warn(
f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
)
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
"Expected object of type bytes or bytearray, got: "
"{0}".format(type(byte_str))
)
if isinstance(byte_str, bytearray):
byte_str = bytes(byte_str)
r = from_bytes(byte_str).best()
encoding = r.encoding if r is not None else None
language = r.language if r is not None and r.language != "Unknown" else ""
confidence = 1.0 - r.chaos if r is not None else None
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
# but chardet does return 'utf-8-sig' and it is a valid codec name.
if r is not None and encoding == "utf_8" and r.bom:
encoding += "_sig"
if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
encoding = CHARDET_CORRESPONDENCE[encoding]
return {
"encoding": encoding,
"language": language,
"confidence": confidence,
}

View File

@@ -0,0 +1,615 @@
from functools import lru_cache
from logging import getLogger
from typing import List, Optional
from .constant import (
COMMON_SAFE_ASCII_CHARACTERS,
TRACE,
UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
is_accentuated,
is_arabic,
is_arabic_isolated_form,
is_case_variable,
is_cjk,
is_emoticon,
is_hangul,
is_hiragana,
is_katakana,
is_latin,
is_punctuation,
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
)
class MessDetectorPlugin:
"""
Base abstract class used for mess detection plugins.
All detectors MUST extend and implement given methods.
"""
def eligible(self, character: str) -> bool:
"""
Determine if given character should be fed in.
"""
raise NotImplementedError # pragma: nocover
def feed(self, character: str) -> None:
"""
The main routine to be executed upon character.
Insert the logic in witch the text would be considered chaotic.
"""
raise NotImplementedError # pragma: nocover
def reset(self) -> None: # pragma: no cover
"""
Permit to reset the plugin to the initial state.
"""
raise NotImplementedError
@property
def ratio(self) -> float:
"""
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.; No restriction gt 0.
"""
raise NotImplementedError # pragma: nocover
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._punctuation_count: int = 0
self._symbol_count: int = 0
self._character_count: int = 0
self._last_printable_char: Optional[str] = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif (
character.isdigit() is False
and is_symbol(character)
and is_emoticon(character) is False
):
self._symbol_count += 2
self._last_printable_char = character
def reset(self) -> None: # pragma: no cover
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
ratio_of_punctuation: float = (
self._punctuation_count + self._symbol_count
) / self._character_count
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._accentuated_count: int = 0
def eligible(self, character: str) -> bool:
return character.isalpha()
def feed(self, character: str) -> None:
self._character_count += 1
if is_accentuated(character):
self._accentuated_count += 1
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._accentuated_count = 0
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._unprintable_count: int = 0
self._character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
def reset(self) -> None: # pragma: no cover
self._unprintable_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._successive_count: int = 0
self._character_count: int = 0
self._last_latin_character: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
def feed(self, character: str) -> None:
self._character_count += 1
if (
self._last_latin_character is not None
and is_accentuated(character)
and is_accentuated(self._last_latin_character)
):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if its the same char duplicated with different accent.
if remove_accent(character) == remove_accent(self._last_latin_character):
self._successive_count += 1
self._last_latin_character = character
def reset(self) -> None: # pragma: no cover
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._successive_count * 2) / self._character_count
class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
self._last_printable_seen: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character.isspace()
or is_punctuation(character)
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
if self._last_printable_seen is None:
self._last_printable_seen = character
return
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
unicode_range_b: Optional[str] = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@property
def ratio(self) -> float:
if self._character_count <= 24:
return 0.0
ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count
return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._word_count: int = 0
self._bad_word_count: int = 0
self._foreign_long_count: int = 0
self._is_current_word_bad: bool = False
self._foreign_long_watch: bool = False
self._character_count: int = 0
self._bad_character_count: int = 0
self._buffer: str = ""
self._buffer_accent_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character.isalpha():
self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and (is_latin(character) is False or is_accentuated(character))
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
and is_hiragana(character) is False
and is_thai(character) is False
):
self._foreign_long_watch = True
return
if not self._buffer:
return
if (
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
buffer_length: int = len(self._buffer)
self._character_count += buffer_length
if buffer_length >= 4:
if self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
# Word/Buffer ending with an upper case accentuated letter are so rare,
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
if (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
for c, i in zip(self._buffer, range(0, buffer_length))
if c.isupper()
]
probable_camel_cased: bool = False
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
probable_camel_cased = True
if not probable_camel_cased:
self._foreign_long_count += 1
self._is_current_word_bad = True
if self._is_current_word_bad:
self._bad_word_count += 1
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
and is_symbol(character)
):
self._is_current_word_bad = True
self._buffer += character
def reset(self) -> None: # pragma: no cover
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0
self._foreign_long_count = 0
@property
def ratio(self) -> float:
if self._word_count <= 10 and self._foreign_long_count == 0:
return 0.0
return self._bad_character_count / self._character_count
class CjkInvalidStopPlugin(MessDetectorPlugin):
"""
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
can be easily detected. Searching for the overuse of '' and ''.
"""
def __init__(self) -> None:
self._wrong_stop_count: int = 0
self._cjk_character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character in {"", ""}:
self._wrong_stop_count += 1
return
if is_cjk(character):
self._cjk_character_count += 1
def reset(self) -> None: # pragma: no cover
self._wrong_stop_count = 0
self._cjk_character_count = 0
@property
def ratio(self) -> float:
if self._cjk_character_count < 16:
return 0.0
return self._wrong_stop_count / self._cjk_character_count
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._buf: bool = False
self._character_count_since_last_sep: int = 0
self._successive_upper_lower_count: int = 0
self._successive_upper_lower_count_final: int = 0
self._character_count: int = 0
self._last_alpha_seen: Optional[str] = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False
if chunk_sep and self._character_count_since_last_sep > 0:
if (
self._character_count_since_last_sep <= 64
and character.isdigit() is False
and self._current_ascii_only is False
):
self._successive_upper_lower_count_final += (
self._successive_upper_lower_count
)
self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True
return
if self._current_ascii_only is True and character.isascii() is False:
self._current_ascii_only = False
if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (
character.islower() and self._last_alpha_seen.isupper()
):
if self._buf is True:
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False
self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return self._successive_upper_lower_count_final / self._character_count
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._isolated_form_count = 0
def eligible(self, character: str) -> bool:
return is_arabic(character)
def feed(self, character: str) -> None:
self._character_count += 1
if is_arabic_isolated_form(character):
self._isolated_form_count += 1
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
isolated_form_usage: float = self._isolated_form_count / self._character_count
return isolated_form_usage
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
"""
Determine if two Unicode range seen next to each other can be considered as suspicious.
"""
if unicode_range_a is None or unicode_range_b is None:
return True
if unicode_range_a == unicode_range_b:
return False
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
return False
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
return False
# Latin characters can be accompanied with a combining diacritical mark
# eg. Vietnamese.
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
"Combining" in unicode_range_a or "Combining" in unicode_range_b
):
return False
keywords_range_a, keywords_range_b = unicode_range_a.split(
" "
), unicode_range_b.split(" ")
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
continue
if el in keywords_range_b:
return False
# Japanese Exception
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if (range_a_jp_chars or range_b_jp_chars) and (
"CJK" in unicode_range_a or "CJK" in unicode_range_b
):
return False
if range_a_jp_chars and range_b_jp_chars:
return False
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
# Chinese/Japanese use dedicated range for punctuation and/or separators.
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
unicode_range_a in ["Katakana", "Hiragana"]
and unicode_range_b in ["Katakana", "Hiragana"]
):
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
return True
@lru_cache(maxsize=2048)
def mess_ratio(
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
detectors: List[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
length: int = len(decoded_sequence) + 1
mean_mess_ratio: float = 0.0
if length < 512:
intermediary_mean_mess_ratio_calc: int = 32
elif length <= 1024:
intermediary_mean_mess_ratio_calc = 64
else:
intermediary_mean_mess_ratio_calc = 128
for character, index in zip(decoded_sequence + "\n", range(length)):
for detector in detectors:
if detector.eligible(character):
detector.feed(character)
if (
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
) or index == length - 1:
mean_mess_ratio = sum(dt.ratio for dt in detectors)
if mean_mess_ratio >= maximum_threshold:
break
if debug:
logger = getLogger("charset_normalizer")
logger.log(
TRACE,
"Mess-detector extended-analysis start. "
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
f"maximum_threshold={maximum_threshold}",
)
if len(decoded_sequence) > 16:
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
for dt in detectors: # pragma: nocover
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)

View File

@@ -0,0 +1,340 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from .constant import TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
def __init__(
self,
payload: bytes,
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
):
self._payload: bytes = payload
self._encoding: str = guessed_encoding
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
self._unicode_ranges: Optional[List[str]] = None
self._leaves: List[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
self._output_payload: Optional[bytes] = None
self._output_encoding: Optional[str] = None
self._string: Optional[str] = decoded_payload
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
raise TypeError(
"__eq__ cannot be invoked on {} and {}.".format(
str(other.__class__), str(self.__class__)
)
)
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
def __lt__(self, other: object) -> bool:
"""
Implemented to make sorted available upon CharsetMatches items.
"""
if not isinstance(other, CharsetMatch):
raise ValueError
chaos_difference: float = abs(self.chaos - other.chaos)
coherence_difference: float = abs(self.coherence - other.coherence)
# Below 1% difference --> Use Coherence
if chaos_difference < 0.01 and coherence_difference > 0.02:
return self.coherence > other.coherence
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
# When having a difficult decision, use the result that decoded as many multi-byte as possible.
# preserve RAM usage!
if len(self._payload) >= TOO_BIG_SEQUENCE:
return self.chaos < other.chaos
return self.multi_byte_usage > other.multi_byte_usage
return self.chaos < other.chaos
@property
def multi_byte_usage(self) -> float:
return 1.0 - (len(str(self)) / len(self.raw))
def __str__(self) -> str:
# Lazy Str Loading
if self._string is None:
self._string = str(self._payload, self._encoding, "strict")
return self._string
def __repr__(self) -> str:
return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
def add_submatch(self, other: "CharsetMatch") -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
other.__class__
)
)
other._string = None # Unload RAM usage; dirty trick.
self._leaves.append(other)
@property
def encoding(self) -> str:
return self._encoding
@property
def encoding_aliases(self) -> List[str]:
"""
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
"""
also_known_as: List[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
elif self.encoding == p:
also_known_as.append(u)
return also_known_as
@property
def bom(self) -> bool:
return self._has_sig_or_bom
@property
def byte_order_mark(self) -> bool:
return self._has_sig_or_bom
@property
def languages(self) -> List[str]:
"""
Return the complete list of possible languages found in decoded sequence.
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
"""
return [e[0] for e in self._languages]
@property
def language(self) -> str:
"""
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
"Unknown".
"""
if not self._languages:
# Trying to infer the language based on the given encoding
# Its either English or we should not pronounce ourselves in certain cases.
if "ascii" in self.could_be_from_charset:
return "English"
# doing it there to avoid circular import
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
languages = (
mb_encoding_languages(self.encoding)
if is_multi_byte_encoding(self.encoding)
else encoding_languages(self.encoding)
)
if len(languages) == 0 or "Latin Based" in languages:
return "Unknown"
return languages[0]
return self._languages[0][0]
@property
def chaos(self) -> float:
return self._mean_mess_ratio
@property
def coherence(self) -> float:
if not self._languages:
return 0.0
return self._languages[0][1]
@property
def percent_chaos(self) -> float:
return round(self.chaos * 100, ndigits=3)
@property
def percent_coherence(self) -> float:
return round(self.coherence * 100, ndigits=3)
@property
def raw(self) -> bytes:
"""
Original untouched bytes.
"""
return self._payload
@property
def submatch(self) -> List["CharsetMatch"]:
return self._leaves
@property
def has_submatch(self) -> bool:
return len(self._leaves) > 0
@property
def alphabets(self) -> List[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
detected_ranges: List[Optional[str]] = [
unicode_range(char) for char in str(self)
]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
def could_be_from_charset(self) -> List[str]:
"""
The complete list of encoding that output the exact SAME str result and therefore could be the originating
encoding.
This list does include the encoding available in property 'encoding'.
"""
return [self._encoding] + [m.encoding for m in self._leaves]
def output(self, encoding: str = "utf_8") -> bytes:
"""
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
Any errors will be simply ignored by the encoder NOT replaced.
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
self._output_payload = str(self).encode(encoding, "replace")
return self._output_payload # type: ignore
@property
def fingerprint(self) -> str:
"""
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
"""
return sha256(self.output()).hexdigest()
class CharsetMatches:
"""
Container with every CharsetMatch items ordered by default from most probable to the less one.
Act like a list(iterable) but does not implements all related methods.
"""
def __init__(self, results: Optional[List[CharsetMatch]] = None):
self._results: List[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
"""
if isinstance(item, int):
return self._results[item]
if isinstance(item, str):
item = iana_name(item, False)
for result in self._results:
if item in result.could_be_from_charset:
return result
raise KeyError
def __len__(self) -> int:
return len(self._results)
def __bool__(self) -> bool:
return len(self._results) > 0
def append(self, item: CharsetMatch) -> None:
"""
Insert a single match. Will be inserted accordingly to preserve sort.
Can be inserted as a submatch.
"""
if not isinstance(item, CharsetMatch):
raise ValueError(
"Cannot append instance '{}' to CharsetMatches".format(
str(item.__class__)
)
)
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) <= TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
self._results = sorted(self._results)
def best(self) -> Optional["CharsetMatch"]:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
if not self._results:
return None
return self._results[0]
def first(self) -> Optional["CharsetMatch"]:
"""
Redundant method, call the method best(). Kept for BC reasons.
"""
return self.best()
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
def __init__(
self,
path: str,
encoding: Optional[str],
encoding_aliases: List[str],
alternative_encodings: List[str],
language: str,
alphabets: List[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
unicode_path: Optional[str],
is_preferred: bool,
):
self.path: str = path
self.unicode_path: Optional[str] = unicode_path
self.encoding: Optional[str] = encoding
self.encoding_aliases: List[str] = encoding_aliases
self.alternative_encodings: List[str] = alternative_encodings
self.language: str = language
self.alphabets: List[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> Dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,
"encoding_aliases": self.encoding_aliases,
"alternative_encodings": self.alternative_encodings,
"language": self.language,
"alphabets": self.alphabets,
"has_sig_or_bom": self.has_sig_or_bom,
"chaos": self.chaos,
"coherence": self.coherence,
"unicode_path": self.unicode_path,
"is_preferred": self.is_preferred,
}
def to_json(self) -> str:
return dumps(self.__dict__, ensure_ascii=True, indent=4)

View File

@@ -0,0 +1,421 @@
import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union
from _multibytecodec import MultibyteIncrementalDecoder
from .constant import (
ENCODING_MARKS,
IANA_SUPPORTED_SIMILAR,
RE_POSSIBLE_ENCODING_INDICATION,
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
return False
return (
"WITH GRAVE" in description
or "WITH ACUTE" in description
or "WITH CEDILLA" in description
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
or "WITH MACRON" in description
or "WITH RING ABOVE" in description
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
decomposed: str = unicodedata.decomposition(character)
if not decomposed:
return character
codes: List[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
"""
Retrieve the Unicode range official name from a single character.
"""
character_ord: int = ord(character)
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
if character_ord in ord_range:
return range_name
return None
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
return False
return "LATIN" in description
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "P" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Punctuation" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "S" in character_category or "N" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Forms" in character_range and character_category != "Lo"
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Emoticons" in character_range or "Pictographs" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in {"", "+", "<", ">"}:
return True
character_category: str = unicodedata.category(character)
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
return character.islower() != character.isupper()
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "CJK" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "HIRAGANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "KATAKANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "HANGUL" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "THAI" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "ARABIC" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? Its the ASCII substitute character.
and character != "\ufeff" # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
"""
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
"""
if not isinstance(sequence, bytes):
raise TypeError
seq_len: int = len(sequence)
results: List[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
if len(results) == 0:
return None
for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if encoding_alias == specified_encoding:
return encoding_iana
if encoding_iana == specified_encoding:
return encoding_iana
return None
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
"""
Verify is a specific encoding is a multi byte one based on it IANA name
"""
return name in {
"utf_8",
"utf_8_sig",
"utf_16",
"utf_16_be",
"utf_16_le",
"utf_32",
"utf_32_le",
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
MultibyteIncrementalDecoder,
)
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
for mark in marks:
if sequence.startswith(mark):
return iana_encoding, mark
return None, b""
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
return iana_encoding not in {"utf_16", "utf_32"}
def iana_name(cp_name: str, strict: bool = True) -> str:
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if cp_name in [encoding_alias, encoding_iana]:
return encoding_iana
if strict:
raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
return cp_name
def range_scan(decoded_sequence: str) -> List[str]:
ranges: Set[str] = set()
for character in decoded_sequence:
character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
ranges.add(character_range)
return list(ranges)
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
decoder_a = importlib.import_module(
"encodings.{}".format(iana_name_a)
).IncrementalDecoder
decoder_b = importlib.import_module(
"encodings.{}".format(iana_name_b)
).IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
character_match_count: int = 0
for i in range(255):
to_be_decoded: bytes = bytes([i])
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
character_match_count += 1
return character_match_count / 254
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
"""
Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
the function cp_similarity.
"""
return (
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)
def set_logging_handler(
name: str = "charset_normalizer",
level: int = logging.INFO,
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)
def cut_sequence_chunks(
sequences: bytes,
encoding_iana: str,
offsets: range,
chunk_size: int,
bom_or_sig_available: bool,
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
chunk = decoded_payload[i : i + chunk_size]
if not chunk:
break
yield chunk
else:
for i in offsets:
chunk_end = i + chunk_size
if chunk_end > len(sequences) + 8:
continue
cut_sequence = sequences[i : i + chunk_size]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
)
# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0:
chunk_partial_size_chk: int = min(chunk_size, 16)
if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j:chunk_end]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
if chunk[:chunk_partial_size_chk] in decoded_payload:
break
yield chunk

View File

@@ -0,0 +1,6 @@
"""
Expose version
"""
__version__ = "3.3.2"
VERSION = __version__.split(".")

View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
import sys
try:
from ._version import version as __version__
except ImportError:
__version__ = 'unknown'
__all__ = ['easter', 'parser', 'relativedelta', 'rrule', 'tz',
'utils', 'zoneinfo']
def __getattr__(name):
import importlib
if name in __all__:
return importlib.import_module("." + name, __name__)
raise AttributeError(
"module {!r} has not attribute {!r}".format(__name__, name)
)
def __dir__():
# __dir__ should include all the lazy-importable modules as well.
return [x for x in globals() if x not in sys.modules] + __all__

View File

@@ -0,0 +1,43 @@
"""
Common code used in multiple modules.
"""
class weekday(object):
__slots__ = ["weekday", "n"]
def __init__(self, weekday, n=None):
self.weekday = weekday
self.n = n
def __call__(self, n):
if n == self.n:
return self
else:
return self.__class__(self.weekday, n)
def __eq__(self, other):
try:
if self.weekday != other.weekday or self.n != other.n:
return False
except AttributeError:
return False
return True
def __hash__(self):
return hash((
self.weekday,
self.n,
))
def __ne__(self, other):
return not (self == other)
def __repr__(self):
s = ("MO", "TU", "WE", "TH", "FR", "SA", "SU")[self.weekday]
if not self.n:
return s
else:
return "%s(%+d)" % (s, self.n)
# vim:ts=4:sw=4:et

View File

@@ -0,0 +1,4 @@
# file generated by setuptools_scm
# don't change, don't track in version control
__version__ = version = '2.9.0.post0'
__version_tuple__ = version_tuple = (2, 9, 0)

View File

@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
"""
This module offers a generic Easter computing method for any given year, using
Western, Orthodox or Julian algorithms.
"""
import datetime
__all__ = ["easter", "EASTER_JULIAN", "EASTER_ORTHODOX", "EASTER_WESTERN"]
EASTER_JULIAN = 1
EASTER_ORTHODOX = 2
EASTER_WESTERN = 3
def easter(year, method=EASTER_WESTERN):
"""
This method was ported from the work done by GM Arts,
on top of the algorithm by Claus Tondering, which was
based in part on the algorithm of Ouding (1940), as
quoted in "Explanatory Supplement to the Astronomical
Almanac", P. Kenneth Seidelmann, editor.
This algorithm implements three different Easter
calculation methods:
1. Original calculation in Julian calendar, valid in
dates after 326 AD
2. Original method, with date converted to Gregorian
calendar, valid in years 1583 to 4099
3. Revised method, in Gregorian calendar, valid in
years 1583 to 4099 as well
These methods are represented by the constants:
* ``EASTER_JULIAN = 1``
* ``EASTER_ORTHODOX = 2``
* ``EASTER_WESTERN = 3``
The default method is method 3.
More about the algorithm may be found at:
`GM Arts: Easter Algorithms <http://www.gmarts.org/index.php?go=415>`_
and
`The Calendar FAQ: Easter <https://www.tondering.dk/claus/cal/easter.php>`_
"""
if not (1 <= method <= 3):
raise ValueError("invalid method")
# g - Golden year - 1
# c - Century
# h - (23 - Epact) mod 30
# i - Number of days from March 21 to Paschal Full Moon
# j - Weekday for PFM (0=Sunday, etc)
# p - Number of days from March 21 to Sunday on or before PFM
# (-6 to 28 methods 1 & 3, to 56 for method 2)
# e - Extra days to add for method 2 (converting Julian
# date to Gregorian date)
y = year
g = y % 19
e = 0
if method < 3:
# Old method
i = (19*g + 15) % 30
j = (y + y//4 + i) % 7
if method == 2:
# Extra dates to convert Julian to Gregorian date
e = 10
if y > 1600:
e = e + y//100 - 16 - (y//100 - 16)//4
else:
# New method
c = y//100
h = (c - c//4 - (8*c + 13)//25 + 19*g + 15) % 30
i = h - (h//28)*(1 - (h//28)*(29//(h + 1))*((21 - g)//11))
j = (y + y//4 + i + 2 - c + c//4) % 7
# p can be from -6 to 56 corresponding to dates 22 March to 23 May
# (later dates apply to method 2, although 23 May never actually occurs)
p = i - j + e
d = 1 + (p + 27 + (p + 6)//40) % 31
m = 3 + (p + 26)//30
return datetime.date(int(y), int(m), int(d))

View File

@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
from ._parser import parse, parser, parserinfo, ParserError
from ._parser import DEFAULTPARSER, DEFAULTTZPARSER
from ._parser import UnknownTimezoneWarning
from ._parser import __doc__
from .isoparser import isoparser, isoparse
__all__ = ['parse', 'parser', 'parserinfo',
'isoparse', 'isoparser',
'ParserError',
'UnknownTimezoneWarning']
###
# Deprecate portions of the private interface so that downstream code that
# is improperly relying on it is given *some* notice.
def __deprecated_private_func(f):
from functools import wraps
import warnings
msg = ('{name} is a private function and may break without warning, '
'it will be moved and or renamed in future versions.')
msg = msg.format(name=f.__name__)
@wraps(f)
def deprecated_func(*args, **kwargs):
warnings.warn(msg, DeprecationWarning)
return f(*args, **kwargs)
return deprecated_func
def __deprecate_private_class(c):
import warnings
msg = ('{name} is a private class and may break without warning, '
'it will be moved and or renamed in future versions.')
msg = msg.format(name=c.__name__)
class private_class(c):
__doc__ = c.__doc__
def __init__(self, *args, **kwargs):
warnings.warn(msg, DeprecationWarning)
super(private_class, self).__init__(*args, **kwargs)
private_class.__name__ = c.__name__
return private_class
from ._parser import _timelex, _resultbase
from ._parser import _tzparser, _parsetz
_timelex = __deprecate_private_class(_timelex)
_tzparser = __deprecate_private_class(_tzparser)
_resultbase = __deprecate_private_class(_resultbase)
_parsetz = __deprecated_private_func(_parsetz)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,416 @@
# -*- coding: utf-8 -*-
"""
This module offers a parser for ISO-8601 strings
It is intended to support all valid date, time and datetime formats per the
ISO-8601 specification.
..versionadded:: 2.7.0
"""
from datetime import datetime, timedelta, time, date
import calendar
from dateutil import tz
from functools import wraps
import re
import six
__all__ = ["isoparse", "isoparser"]
def _takes_ascii(f):
@wraps(f)
def func(self, str_in, *args, **kwargs):
# If it's a stream, read the whole thing
str_in = getattr(str_in, 'read', lambda: str_in)()
# If it's unicode, turn it into bytes, since ISO-8601 only covers ASCII
if isinstance(str_in, six.text_type):
# ASCII is the same in UTF-8
try:
str_in = str_in.encode('ascii')
except UnicodeEncodeError as e:
msg = 'ISO-8601 strings should contain only ASCII characters'
six.raise_from(ValueError(msg), e)
return f(self, str_in, *args, **kwargs)
return func
class isoparser(object):
def __init__(self, sep=None):
"""
:param sep:
A single character that separates date and time portions. If
``None``, the parser will accept any single character.
For strict ISO-8601 adherence, pass ``'T'``.
"""
if sep is not None:
if (len(sep) != 1 or ord(sep) >= 128 or sep in '0123456789'):
raise ValueError('Separator must be a single, non-numeric ' +
'ASCII character')
sep = sep.encode('ascii')
self._sep = sep
@_takes_ascii
def isoparse(self, dt_str):
"""
Parse an ISO-8601 datetime string into a :class:`datetime.datetime`.
An ISO-8601 datetime string consists of a date portion, followed
optionally by a time portion - the date and time portions are separated
by a single character separator, which is ``T`` in the official
standard. Incomplete date formats (such as ``YYYY-MM``) may *not* be
combined with a time portion.
Supported date formats are:
Common:
- ``YYYY``
- ``YYYY-MM``
- ``YYYY-MM-DD`` or ``YYYYMMDD``
Uncommon:
- ``YYYY-Www`` or ``YYYYWww`` - ISO week (day defaults to 0)
- ``YYYY-Www-D`` or ``YYYYWwwD`` - ISO week and day
The ISO week and day numbering follows the same logic as
:func:`datetime.date.isocalendar`.
Supported time formats are:
- ``hh``
- ``hh:mm`` or ``hhmm``
- ``hh:mm:ss`` or ``hhmmss``
- ``hh:mm:ss.ssssss`` (Up to 6 sub-second digits)
Midnight is a special case for `hh`, as the standard supports both
00:00 and 24:00 as a representation. The decimal separator can be
either a dot or a comma.
.. caution::
Support for fractional components other than seconds is part of the
ISO-8601 standard, but is not currently implemented in this parser.
Supported time zone offset formats are:
- `Z` (UTC)
- `±HH:MM`
- `±HHMM`
- `±HH`
Offsets will be represented as :class:`dateutil.tz.tzoffset` objects,
with the exception of UTC, which will be represented as
:class:`dateutil.tz.tzutc`. Time zone offsets equivalent to UTC (such
as `+00:00`) will also be represented as :class:`dateutil.tz.tzutc`.
:param dt_str:
A string or stream containing only an ISO-8601 datetime string
:return:
Returns a :class:`datetime.datetime` representing the string.
Unspecified components default to their lowest value.
.. warning::
As of version 2.7.0, the strictness of the parser should not be
considered a stable part of the contract. Any valid ISO-8601 string
that parses correctly with the default settings will continue to
parse correctly in future versions, but invalid strings that
currently fail (e.g. ``2017-01-01T00:00+00:00:00``) are not
guaranteed to continue failing in future versions if they encode
a valid date.
.. versionadded:: 2.7.0
"""
components, pos = self._parse_isodate(dt_str)
if len(dt_str) > pos:
if self._sep is None or dt_str[pos:pos + 1] == self._sep:
components += self._parse_isotime(dt_str[pos + 1:])
else:
raise ValueError('String contains unknown ISO components')
if len(components) > 3 and components[3] == 24:
components[3] = 0
return datetime(*components) + timedelta(days=1)
return datetime(*components)
@_takes_ascii
def parse_isodate(self, datestr):
"""
Parse the date portion of an ISO string.
:param datestr:
The string portion of an ISO string, without a separator
:return:
Returns a :class:`datetime.date` object
"""
components, pos = self._parse_isodate(datestr)
if pos < len(datestr):
raise ValueError('String contains unknown ISO ' +
'components: {!r}'.format(datestr.decode('ascii')))
return date(*components)
@_takes_ascii
def parse_isotime(self, timestr):
"""
Parse the time portion of an ISO string.
:param timestr:
The time portion of an ISO string, without a separator
:return:
Returns a :class:`datetime.time` object
"""
components = self._parse_isotime(timestr)
if components[0] == 24:
components[0] = 0
return time(*components)
@_takes_ascii
def parse_tzstr(self, tzstr, zero_as_utc=True):
"""
Parse a valid ISO time zone string.
See :func:`isoparser.isoparse` for details on supported formats.
:param tzstr:
A string representing an ISO time zone offset
:param zero_as_utc:
Whether to return :class:`dateutil.tz.tzutc` for zero-offset zones
:return:
Returns :class:`dateutil.tz.tzoffset` for offsets and
:class:`dateutil.tz.tzutc` for ``Z`` and (if ``zero_as_utc`` is
specified) offsets equivalent to UTC.
"""
return self._parse_tzstr(tzstr, zero_as_utc=zero_as_utc)
# Constants
_DATE_SEP = b'-'
_TIME_SEP = b':'
_FRACTION_REGEX = re.compile(b'[\\.,]([0-9]+)')
def _parse_isodate(self, dt_str):
try:
return self._parse_isodate_common(dt_str)
except ValueError:
return self._parse_isodate_uncommon(dt_str)
def _parse_isodate_common(self, dt_str):
len_str = len(dt_str)
components = [1, 1, 1]
if len_str < 4:
raise ValueError('ISO string too short')
# Year
components[0] = int(dt_str[0:4])
pos = 4
if pos >= len_str:
return components, pos
has_sep = dt_str[pos:pos + 1] == self._DATE_SEP
if has_sep:
pos += 1
# Month
if len_str - pos < 2:
raise ValueError('Invalid common month')
components[1] = int(dt_str[pos:pos + 2])
pos += 2
if pos >= len_str:
if has_sep:
return components, pos
else:
raise ValueError('Invalid ISO format')
if has_sep:
if dt_str[pos:pos + 1] != self._DATE_SEP:
raise ValueError('Invalid separator in ISO string')
pos += 1
# Day
if len_str - pos < 2:
raise ValueError('Invalid common day')
components[2] = int(dt_str[pos:pos + 2])
return components, pos + 2
def _parse_isodate_uncommon(self, dt_str):
if len(dt_str) < 4:
raise ValueError('ISO string too short')
# All ISO formats start with the year
year = int(dt_str[0:4])
has_sep = dt_str[4:5] == self._DATE_SEP
pos = 4 + has_sep # Skip '-' if it's there
if dt_str[pos:pos + 1] == b'W':
# YYYY-?Www-?D?
pos += 1
weekno = int(dt_str[pos:pos + 2])
pos += 2
dayno = 1
if len(dt_str) > pos:
if (dt_str[pos:pos + 1] == self._DATE_SEP) != has_sep:
raise ValueError('Inconsistent use of dash separator')
pos += has_sep
dayno = int(dt_str[pos:pos + 1])
pos += 1
base_date = self._calculate_weekdate(year, weekno, dayno)
else:
# YYYYDDD or YYYY-DDD
if len(dt_str) - pos < 3:
raise ValueError('Invalid ordinal day')
ordinal_day = int(dt_str[pos:pos + 3])
pos += 3
if ordinal_day < 1 or ordinal_day > (365 + calendar.isleap(year)):
raise ValueError('Invalid ordinal day' +
' {} for year {}'.format(ordinal_day, year))
base_date = date(year, 1, 1) + timedelta(days=ordinal_day - 1)
components = [base_date.year, base_date.month, base_date.day]
return components, pos
def _calculate_weekdate(self, year, week, day):
"""
Calculate the day of corresponding to the ISO year-week-day calendar.
This function is effectively the inverse of
:func:`datetime.date.isocalendar`.
:param year:
The year in the ISO calendar
:param week:
The week in the ISO calendar - range is [1, 53]
:param day:
The day in the ISO calendar - range is [1 (MON), 7 (SUN)]
:return:
Returns a :class:`datetime.date`
"""
if not 0 < week < 54:
raise ValueError('Invalid week: {}'.format(week))
if not 0 < day < 8: # Range is 1-7
raise ValueError('Invalid weekday: {}'.format(day))
# Get week 1 for the specific year:
jan_4 = date(year, 1, 4) # Week 1 always has January 4th in it
week_1 = jan_4 - timedelta(days=jan_4.isocalendar()[2] - 1)
# Now add the specific number of weeks and days to get what we want
week_offset = (week - 1) * 7 + (day - 1)
return week_1 + timedelta(days=week_offset)
def _parse_isotime(self, timestr):
len_str = len(timestr)
components = [0, 0, 0, 0, None]
pos = 0
comp = -1
if len_str < 2:
raise ValueError('ISO time too short')
has_sep = False
while pos < len_str and comp < 5:
comp += 1
if timestr[pos:pos + 1] in b'-+Zz':
# Detect time zone boundary
components[-1] = self._parse_tzstr(timestr[pos:])
pos = len_str
break
if comp == 1 and timestr[pos:pos+1] == self._TIME_SEP:
has_sep = True
pos += 1
elif comp == 2 and has_sep:
if timestr[pos:pos+1] != self._TIME_SEP:
raise ValueError('Inconsistent use of colon separator')
pos += 1
if comp < 3:
# Hour, minute, second
components[comp] = int(timestr[pos:pos + 2])
pos += 2
if comp == 3:
# Fraction of a second
frac = self._FRACTION_REGEX.match(timestr[pos:])
if not frac:
continue
us_str = frac.group(1)[:6] # Truncate to microseconds
components[comp] = int(us_str) * 10**(6 - len(us_str))
pos += len(frac.group())
if pos < len_str:
raise ValueError('Unused components in ISO string')
if components[0] == 24:
# Standard supports 00:00 and 24:00 as representations of midnight
if any(component != 0 for component in components[1:4]):
raise ValueError('Hour may only be 24 at 24:00:00.000')
return components
def _parse_tzstr(self, tzstr, zero_as_utc=True):
if tzstr == b'Z' or tzstr == b'z':
return tz.UTC
if len(tzstr) not in {3, 5, 6}:
raise ValueError('Time zone offset must be 1, 3, 5 or 6 characters')
if tzstr[0:1] == b'-':
mult = -1
elif tzstr[0:1] == b'+':
mult = 1
else:
raise ValueError('Time zone offset requires sign')
hours = int(tzstr[1:3])
if len(tzstr) == 3:
minutes = 0
else:
minutes = int(tzstr[(4 if tzstr[3:4] == self._TIME_SEP else 3):])
if zero_as_utc and hours == 0 and minutes == 0:
return tz.UTC
else:
if minutes > 59:
raise ValueError('Invalid minutes in time zone offset')
if hours > 23:
raise ValueError('Invalid hours in time zone offset')
return tz.tzoffset(None, mult * (hours * 60 + minutes) * 60)
DEFAULT_ISOPARSER = isoparser()
isoparse = DEFAULT_ISOPARSER.isoparse

View File

@@ -0,0 +1,599 @@
# -*- coding: utf-8 -*-
import datetime
import calendar
import operator
from math import copysign
from six import integer_types
from warnings import warn
from ._common import weekday
MO, TU, WE, TH, FR, SA, SU = weekdays = tuple(weekday(x) for x in range(7))
__all__ = ["relativedelta", "MO", "TU", "WE", "TH", "FR", "SA", "SU"]
class relativedelta(object):
"""
The relativedelta type is designed to be applied to an existing datetime and
can replace specific components of that datetime, or represents an interval
of time.
It is based on the specification of the excellent work done by M.-A. Lemburg
in his
`mx.DateTime <https://www.egenix.com/products/python/mxBase/mxDateTime/>`_ extension.
However, notice that this type does *NOT* implement the same algorithm as
his work. Do *NOT* expect it to behave like mx.DateTime's counterpart.
There are two different ways to build a relativedelta instance. The
first one is passing it two date/datetime classes::
relativedelta(datetime1, datetime2)
The second one is passing it any number of the following keyword arguments::
relativedelta(arg1=x,arg2=y,arg3=z...)
year, month, day, hour, minute, second, microsecond:
Absolute information (argument is singular); adding or subtracting a
relativedelta with absolute information does not perform an arithmetic
operation, but rather REPLACES the corresponding value in the
original datetime with the value(s) in relativedelta.
years, months, weeks, days, hours, minutes, seconds, microseconds:
Relative information, may be negative (argument is plural); adding
or subtracting a relativedelta with relative information performs
the corresponding arithmetic operation on the original datetime value
with the information in the relativedelta.
weekday:
One of the weekday instances (MO, TU, etc) available in the
relativedelta module. These instances may receive a parameter N,
specifying the Nth weekday, which could be positive or negative
(like MO(+1) or MO(-2)). Not specifying it is the same as specifying
+1. You can also use an integer, where 0=MO. This argument is always
relative e.g. if the calculated date is already Monday, using MO(1)
or MO(-1) won't change the day. To effectively make it absolute, use
it in combination with the day argument (e.g. day=1, MO(1) for first
Monday of the month).
leapdays:
Will add given days to the date found, if year is a leap
year, and the date found is post 28 of february.
yearday, nlyearday:
Set the yearday or the non-leap year day (jump leap days).
These are converted to day/month/leapdays information.
There are relative and absolute forms of the keyword
arguments. The plural is relative, and the singular is
absolute. For each argument in the order below, the absolute form
is applied first (by setting each attribute to that value) and
then the relative form (by adding the value to the attribute).
The order of attributes considered when this relativedelta is
added to a datetime is:
1. Year
2. Month
3. Day
4. Hours
5. Minutes
6. Seconds
7. Microseconds
Finally, weekday is applied, using the rule described above.
For example
>>> from datetime import datetime
>>> from dateutil.relativedelta import relativedelta, MO
>>> dt = datetime(2018, 4, 9, 13, 37, 0)
>>> delta = relativedelta(hours=25, day=1, weekday=MO(1))
>>> dt + delta
datetime.datetime(2018, 4, 2, 14, 37)
First, the day is set to 1 (the first of the month), then 25 hours
are added, to get to the 2nd day and 14th hour, finally the
weekday is applied, but since the 2nd is already a Monday there is
no effect.
"""
def __init__(self, dt1=None, dt2=None,
years=0, months=0, days=0, leapdays=0, weeks=0,
hours=0, minutes=0, seconds=0, microseconds=0,
year=None, month=None, day=None, weekday=None,
yearday=None, nlyearday=None,
hour=None, minute=None, second=None, microsecond=None):
if dt1 and dt2:
# datetime is a subclass of date. So both must be date
if not (isinstance(dt1, datetime.date) and
isinstance(dt2, datetime.date)):
raise TypeError("relativedelta only diffs datetime/date")
# We allow two dates, or two datetimes, so we coerce them to be
# of the same type
if (isinstance(dt1, datetime.datetime) !=
isinstance(dt2, datetime.datetime)):
if not isinstance(dt1, datetime.datetime):
dt1 = datetime.datetime.fromordinal(dt1.toordinal())
elif not isinstance(dt2, datetime.datetime):
dt2 = datetime.datetime.fromordinal(dt2.toordinal())
self.years = 0
self.months = 0
self.days = 0
self.leapdays = 0
self.hours = 0
self.minutes = 0
self.seconds = 0
self.microseconds = 0
self.year = None
self.month = None
self.day = None
self.weekday = None
self.hour = None
self.minute = None
self.second = None
self.microsecond = None
self._has_time = 0
# Get year / month delta between the two
months = (dt1.year - dt2.year) * 12 + (dt1.month - dt2.month)
self._set_months(months)
# Remove the year/month delta so the timedelta is just well-defined
# time units (seconds, days and microseconds)
dtm = self.__radd__(dt2)
# If we've overshot our target, make an adjustment
if dt1 < dt2:
compare = operator.gt
increment = 1
else:
compare = operator.lt
increment = -1
while compare(dt1, dtm):
months += increment
self._set_months(months)
dtm = self.__radd__(dt2)
# Get the timedelta between the "months-adjusted" date and dt1
delta = dt1 - dtm
self.seconds = delta.seconds + delta.days * 86400
self.microseconds = delta.microseconds
else:
# Check for non-integer values in integer-only quantities
if any(x is not None and x != int(x) for x in (years, months)):
raise ValueError("Non-integer years and months are "
"ambiguous and not currently supported.")
# Relative information
self.years = int(years)
self.months = int(months)
self.days = days + weeks * 7
self.leapdays = leapdays
self.hours = hours
self.minutes = minutes
self.seconds = seconds
self.microseconds = microseconds
# Absolute information
self.year = year
self.month = month
self.day = day
self.hour = hour
self.minute = minute
self.second = second
self.microsecond = microsecond
if any(x is not None and int(x) != x
for x in (year, month, day, hour,
minute, second, microsecond)):
# For now we'll deprecate floats - later it'll be an error.
warn("Non-integer value passed as absolute information. " +
"This is not a well-defined condition and will raise " +
"errors in future versions.", DeprecationWarning)
if isinstance(weekday, integer_types):
self.weekday = weekdays[weekday]
else:
self.weekday = weekday
yday = 0
if nlyearday:
yday = nlyearday
elif yearday:
yday = yearday
if yearday > 59:
self.leapdays = -1
if yday:
ydayidx = [31, 59, 90, 120, 151, 181, 212,
243, 273, 304, 334, 366]
for idx, ydays in enumerate(ydayidx):
if yday <= ydays:
self.month = idx+1
if idx == 0:
self.day = yday
else:
self.day = yday-ydayidx[idx-1]
break
else:
raise ValueError("invalid year day (%d)" % yday)
self._fix()
def _fix(self):
if abs(self.microseconds) > 999999:
s = _sign(self.microseconds)
div, mod = divmod(self.microseconds * s, 1000000)
self.microseconds = mod * s
self.seconds += div * s
if abs(self.seconds) > 59:
s = _sign(self.seconds)
div, mod = divmod(self.seconds * s, 60)
self.seconds = mod * s
self.minutes += div * s
if abs(self.minutes) > 59:
s = _sign(self.minutes)
div, mod = divmod(self.minutes * s, 60)
self.minutes = mod * s
self.hours += div * s
if abs(self.hours) > 23:
s = _sign(self.hours)
div, mod = divmod(self.hours * s, 24)
self.hours = mod * s
self.days += div * s
if abs(self.months) > 11:
s = _sign(self.months)
div, mod = divmod(self.months * s, 12)
self.months = mod * s
self.years += div * s
if (self.hours or self.minutes or self.seconds or self.microseconds
or self.hour is not None or self.minute is not None or
self.second is not None or self.microsecond is not None):
self._has_time = 1
else:
self._has_time = 0
@property
def weeks(self):
return int(self.days / 7.0)
@weeks.setter
def weeks(self, value):
self.days = self.days - (self.weeks * 7) + value * 7
def _set_months(self, months):
self.months = months
if abs(self.months) > 11:
s = _sign(self.months)
div, mod = divmod(self.months * s, 12)
self.months = mod * s
self.years = div * s
else:
self.years = 0
def normalized(self):
"""
Return a version of this object represented entirely using integer
values for the relative attributes.
>>> relativedelta(days=1.5, hours=2).normalized()
relativedelta(days=+1, hours=+14)
:return:
Returns a :class:`dateutil.relativedelta.relativedelta` object.
"""
# Cascade remainders down (rounding each to roughly nearest microsecond)
days = int(self.days)
hours_f = round(self.hours + 24 * (self.days - days), 11)
hours = int(hours_f)
minutes_f = round(self.minutes + 60 * (hours_f - hours), 10)
minutes = int(minutes_f)
seconds_f = round(self.seconds + 60 * (minutes_f - minutes), 8)
seconds = int(seconds_f)
microseconds = round(self.microseconds + 1e6 * (seconds_f - seconds))
# Constructor carries overflow back up with call to _fix()
return self.__class__(years=self.years, months=self.months,
days=days, hours=hours, minutes=minutes,
seconds=seconds, microseconds=microseconds,
leapdays=self.leapdays, year=self.year,
month=self.month, day=self.day,
weekday=self.weekday, hour=self.hour,
minute=self.minute, second=self.second,
microsecond=self.microsecond)
def __add__(self, other):
if isinstance(other, relativedelta):
return self.__class__(years=other.years + self.years,
months=other.months + self.months,
days=other.days + self.days,
hours=other.hours + self.hours,
minutes=other.minutes + self.minutes,
seconds=other.seconds + self.seconds,
microseconds=(other.microseconds +
self.microseconds),
leapdays=other.leapdays or self.leapdays,
year=(other.year if other.year is not None
else self.year),
month=(other.month if other.month is not None
else self.month),
day=(other.day if other.day is not None
else self.day),
weekday=(other.weekday if other.weekday is not None
else self.weekday),
hour=(other.hour if other.hour is not None
else self.hour),
minute=(other.minute if other.minute is not None
else self.minute),
second=(other.second if other.second is not None
else self.second),
microsecond=(other.microsecond if other.microsecond
is not None else
self.microsecond))
if isinstance(other, datetime.timedelta):
return self.__class__(years=self.years,
months=self.months,
days=self.days + other.days,
hours=self.hours,
minutes=self.minutes,
seconds=self.seconds + other.seconds,
microseconds=self.microseconds + other.microseconds,
leapdays=self.leapdays,
year=self.year,
month=self.month,
day=self.day,
weekday=self.weekday,
hour=self.hour,
minute=self.minute,
second=self.second,
microsecond=self.microsecond)
if not isinstance(other, datetime.date):
return NotImplemented
elif self._has_time and not isinstance(other, datetime.datetime):
other = datetime.datetime.fromordinal(other.toordinal())
year = (self.year or other.year)+self.years
month = self.month or other.month
if self.months:
assert 1 <= abs(self.months) <= 12
month += self.months
if month > 12:
year += 1
month -= 12
elif month < 1:
year -= 1
month += 12
day = min(calendar.monthrange(year, month)[1],
self.day or other.day)
repl = {"year": year, "month": month, "day": day}
for attr in ["hour", "minute", "second", "microsecond"]:
value = getattr(self, attr)
if value is not None:
repl[attr] = value
days = self.days
if self.leapdays and month > 2 and calendar.isleap(year):
days += self.leapdays
ret = (other.replace(**repl)
+ datetime.timedelta(days=days,
hours=self.hours,
minutes=self.minutes,
seconds=self.seconds,
microseconds=self.microseconds))
if self.weekday:
weekday, nth = self.weekday.weekday, self.weekday.n or 1
jumpdays = (abs(nth) - 1) * 7
if nth > 0:
jumpdays += (7 - ret.weekday() + weekday) % 7
else:
jumpdays += (ret.weekday() - weekday) % 7
jumpdays *= -1
ret += datetime.timedelta(days=jumpdays)
return ret
def __radd__(self, other):
return self.__add__(other)
def __rsub__(self, other):
return self.__neg__().__radd__(other)
def __sub__(self, other):
if not isinstance(other, relativedelta):
return NotImplemented # In case the other object defines __rsub__
return self.__class__(years=self.years - other.years,
months=self.months - other.months,
days=self.days - other.days,
hours=self.hours - other.hours,
minutes=self.minutes - other.minutes,
seconds=self.seconds - other.seconds,
microseconds=self.microseconds - other.microseconds,
leapdays=self.leapdays or other.leapdays,
year=(self.year if self.year is not None
else other.year),
month=(self.month if self.month is not None else
other.month),
day=(self.day if self.day is not None else
other.day),
weekday=(self.weekday if self.weekday is not None else
other.weekday),
hour=(self.hour if self.hour is not None else
other.hour),
minute=(self.minute if self.minute is not None else
other.minute),
second=(self.second if self.second is not None else
other.second),
microsecond=(self.microsecond if self.microsecond
is not None else
other.microsecond))
def __abs__(self):
return self.__class__(years=abs(self.years),
months=abs(self.months),
days=abs(self.days),
hours=abs(self.hours),
minutes=abs(self.minutes),
seconds=abs(self.seconds),
microseconds=abs(self.microseconds),
leapdays=self.leapdays,
year=self.year,
month=self.month,
day=self.day,
weekday=self.weekday,
hour=self.hour,
minute=self.minute,
second=self.second,
microsecond=self.microsecond)
def __neg__(self):
return self.__class__(years=-self.years,
months=-self.months,
days=-self.days,
hours=-self.hours,
minutes=-self.minutes,
seconds=-self.seconds,
microseconds=-self.microseconds,
leapdays=self.leapdays,
year=self.year,
month=self.month,
day=self.day,
weekday=self.weekday,
hour=self.hour,
minute=self.minute,
second=self.second,
microsecond=self.microsecond)
def __bool__(self):
return not (not self.years and
not self.months and
not self.days and
not self.hours and
not self.minutes and
not self.seconds and
not self.microseconds and
not self.leapdays and
self.year is None and
self.month is None and
self.day is None and
self.weekday is None and
self.hour is None and
self.minute is None and
self.second is None and
self.microsecond is None)
# Compatibility with Python 2.x
__nonzero__ = __bool__
def __mul__(self, other):
try:
f = float(other)
except TypeError:
return NotImplemented
return self.__class__(years=int(self.years * f),
months=int(self.months * f),
days=int(self.days * f),
hours=int(self.hours * f),
minutes=int(self.minutes * f),
seconds=int(self.seconds * f),
microseconds=int(self.microseconds * f),
leapdays=self.leapdays,
year=self.year,
month=self.month,
day=self.day,
weekday=self.weekday,
hour=self.hour,
minute=self.minute,
second=self.second,
microsecond=self.microsecond)
__rmul__ = __mul__
def __eq__(self, other):
if not isinstance(other, relativedelta):
return NotImplemented
if self.weekday or other.weekday:
if not self.weekday or not other.weekday:
return False
if self.weekday.weekday != other.weekday.weekday:
return False
n1, n2 = self.weekday.n, other.weekday.n
if n1 != n2 and not ((not n1 or n1 == 1) and (not n2 or n2 == 1)):
return False
return (self.years == other.years and
self.months == other.months and
self.days == other.days and
self.hours == other.hours and
self.minutes == other.minutes and
self.seconds == other.seconds and
self.microseconds == other.microseconds and
self.leapdays == other.leapdays and
self.year == other.year and
self.month == other.month and
self.day == other.day and
self.hour == other.hour and
self.minute == other.minute and
self.second == other.second and
self.microsecond == other.microsecond)
def __hash__(self):
return hash((
self.weekday,
self.years,
self.months,
self.days,
self.hours,
self.minutes,
self.seconds,
self.microseconds,
self.leapdays,
self.year,
self.month,
self.day,
self.hour,
self.minute,
self.second,
self.microsecond,
))
def __ne__(self, other):
return not self.__eq__(other)
def __div__(self, other):
try:
reciprocal = 1 / float(other)
except TypeError:
return NotImplemented
return self.__mul__(reciprocal)
__truediv__ = __div__
def __repr__(self):
l = []
for attr in ["years", "months", "days", "leapdays",
"hours", "minutes", "seconds", "microseconds"]:
value = getattr(self, attr)
if value:
l.append("{attr}={value:+g}".format(attr=attr, value=value))
for attr in ["year", "month", "day", "weekday",
"hour", "minute", "second", "microsecond"]:
value = getattr(self, attr)
if value is not None:
l.append("{attr}={value}".format(attr=attr, value=repr(value)))
return "{classname}({attrs})".format(classname=self.__class__.__name__,
attrs=", ".join(l))
def _sign(x):
return int(copysign(1, x))
# vim:ts=4:sw=4:et

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More