initial commit

klein panic
2024-09-29 01:45:31 -04:00
commit 242841c44b
8018 changed files with 1426958 additions and 0 deletions


@@ -0,0 +1,239 @@
from __future__ import annotations
import functools
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
if TYPE_CHECKING:
from pandas._typing import Scalar
import numpy as np
from pandas.compat._optional import import_optional_dependency
@functools.cache
def generate_apply_looper(func, nopython=True, nogil=True, parallel=False):
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
nb_compat_func = numba.extending.register_jitable(func)
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def nb_looper(values, axis):
# Operate on the first row/col in order to get
# the output shape
if axis == 0:
first_elem = values[:, 0]
dim0 = values.shape[1]
else:
first_elem = values[0]
dim0 = values.shape[0]
res0 = nb_compat_func(first_elem)
# Use np.asarray to get shape for
# https://github.com/numba/numba/issues/4202#issuecomment-1185981507
buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape
if axis == 0:
buf_shape = buf_shape[::-1]
buff = np.empty(buf_shape)
if axis == 1:
buff[0] = res0
for i in numba.prange(1, values.shape[0]):
buff[i] = nb_compat_func(values[i])
else:
buff[:, 0] = res0
for j in numba.prange(1, values.shape[1]):
buff[:, j] = nb_compat_func(values[:, j])
return buff
return nb_looper
@functools.cache
def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel):
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
if is_grouped_kernel:
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def column_looper(
values: np.ndarray,
labels: np.ndarray,
ngroups: int,
min_periods: int,
*args,
):
result = np.empty((values.shape[0], ngroups), dtype=result_dtype)
na_positions = {}
for i in numba.prange(values.shape[0]):
output, na_pos = func(
values[i], result_dtype, labels, ngroups, min_periods, *args
)
result[i] = output
if len(na_pos) > 0:
na_positions[i] = np.array(na_pos)
return result, na_positions
else:
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def column_looper(
values: np.ndarray,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
*args,
):
result = np.empty((values.shape[0], len(start)), dtype=result_dtype)
na_positions = {}
for i in numba.prange(values.shape[0]):
output, na_pos = func(
values[i], result_dtype, start, end, min_periods, *args
)
result[i] = output
if len(na_pos) > 0:
na_positions[i] = np.array(na_pos)
return result, na_positions
return column_looper
default_dtype_mapping: dict[np.dtype, Any] = {
np.dtype("int8"): np.int64,
np.dtype("int16"): np.int64,
np.dtype("int32"): np.int64,
np.dtype("int64"): np.int64,
np.dtype("uint8"): np.uint64,
np.dtype("uint16"): np.uint64,
np.dtype("uint32"): np.uint64,
np.dtype("uint64"): np.uint64,
np.dtype("float32"): np.float64,
np.dtype("float64"): np.float64,
np.dtype("complex64"): np.complex128,
np.dtype("complex128"): np.complex128,
}
# TODO: Preserve complex dtypes
float_dtype_mapping: dict[np.dtype, Any] = {
np.dtype("int8"): np.float64,
np.dtype("int16"): np.float64,
np.dtype("int32"): np.float64,
np.dtype("int64"): np.float64,
np.dtype("uint8"): np.float64,
np.dtype("uint16"): np.float64,
np.dtype("uint32"): np.float64,
np.dtype("uint64"): np.float64,
np.dtype("float32"): np.float64,
np.dtype("float64"): np.float64,
np.dtype("complex64"): np.float64,
np.dtype("complex128"): np.float64,
}
identity_dtype_mapping: dict[np.dtype, Any] = {
np.dtype("int8"): np.int8,
np.dtype("int16"): np.int16,
np.dtype("int32"): np.int32,
np.dtype("int64"): np.int64,
np.dtype("uint8"): np.uint8,
np.dtype("uint16"): np.uint16,
np.dtype("uint32"): np.uint32,
np.dtype("uint64"): np.uint64,
np.dtype("float32"): np.float32,
np.dtype("float64"): np.float64,
np.dtype("complex64"): np.complex64,
np.dtype("complex128"): np.complex128,
}
def generate_shared_aggregator(
func: Callable[..., Scalar],
dtype_mapping: dict[np.dtype, np.dtype],
is_grouped_kernel: bool,
nopython: bool,
nogil: bool,
parallel: bool,
):
"""
Generate a Numba function that loops over the columns of a 2D object and
applies a 1D numba kernel to each column.
Parameters
----------
func : function
aggregation function to be applied to each column
dtype_mapping: dict or None
If not None, maps a dtype to a result dtype.
Otherwise, will fall back to default mapping.
is_grouped_kernel: bool, default False
Whether func operates using the group labels (True)
or using starts/ends arrays
If true, you also need to pass the number of groups to this function
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
Returns
-------
Numba function
"""
# A wrapper around the looper function,
# to dispatch based on dtype since numba is unable to do that in nopython mode
# It also post-processes the values by inserting nans where number of observations
# is less than min_periods
# Cannot do this in numba nopython mode
# (you'll run into type-unification error when you cast int -> float)
def looper_wrapper(
values,
start=None,
end=None,
labels=None,
ngroups=None,
min_periods: int = 0,
**kwargs,
):
result_dtype = dtype_mapping[values.dtype]
column_looper = make_looper(
func, result_dtype, is_grouped_kernel, nopython, nogil, parallel
)
# Need to unpack kwargs since numba only supports *args
if is_grouped_kernel:
result, na_positions = column_looper(
values, labels, ngroups, min_periods, *kwargs.values()
)
else:
result, na_positions = column_looper(
values, start, end, min_periods, *kwargs.values()
)
if result.dtype.kind == "i":
# Check whether na_positions is non-empty; if so, convert the whole block
# to float64.
# This is OK since an int dtype cannot hold NaN and, for int data, nobs
# depends only on the window, so if min_periods is not satisfied for one
# column at a given index it is not satisfied for any column at that index
for na_pos in na_positions.values():
if len(na_pos) > 0:
result = result.astype("float64")
break
# TODO: Optimize this
for i, na_pos in na_positions.items():
if len(na_pos) > 0:
result[i, na_pos] = np.nan
return result
return looper_wrapper
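
A minimal usage sketch (illustrative only, assuming numba is installed): drive the generated looper with the sliding_mean kernel defined later in this commit, roughly the way the rolling/groupby code paths do.

import numpy as np

from pandas.core._numba.executor import (
    float_dtype_mapping,
    generate_shared_aggregator,
)
from pandas.core._numba.kernels import sliding_mean

looper = generate_shared_aggregator(
    sliding_mean,
    dtype_mapping=float_dtype_mapping,
    is_grouped_kernel=False,
    nopython=True,
    nogil=True,
    parallel=False,
)

values = np.arange(12.0).reshape(2, 6)  # two "columns", laid out as rows
start = np.array([0, 0, 0, 1, 2, 3], dtype=np.int64)
end = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)  # windows of size <= 3

# One rolling mean per window, per row of ``values``.
# result[0] -> [0.0, 0.5, 1.0, 2.0, 3.0, 4.0]
result = looper(values, start=start, end=end, min_periods=1)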


@@ -0,0 +1,584 @@
# Disable type checking for this module since numba's internals
# are not typed, and we use numba's internals via its extension API
# mypy: ignore-errors
"""
Utility classes/functions to let numba recognize
pandas Index/Series/DataFrame
Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py
"""
from __future__ import annotations
from contextlib import contextmanager
import operator
import numba
from numba import types
from numba.core import cgutils
from numba.core.datamodel import models
from numba.core.extending import (
NativeValue,
box,
lower_builtin,
make_attribute_wrapper,
overload,
overload_attribute,
overload_method,
register_model,
type_callable,
typeof_impl,
unbox,
)
from numba.core.imputils import impl_ret_borrowed
import numpy as np
from pandas._libs import lib
from pandas.core.indexes.base import Index
from pandas.core.indexing import _iLocIndexer
from pandas.core.internals import SingleBlockManager
from pandas.core.series import Series
# Helper to work around the fact that Index casts numpy string dtype to object.
#
# The idea is to set an attribute on the Index called _numba_data that holds
# the original data (or the object data cast to numpy string dtype), using a
# context manager that removes the attribute afterwards.
@contextmanager
def set_numba_data(index: Index):
numba_data = index._data
if numba_data.dtype == object:
if not lib.is_string_array(numba_data):
raise ValueError(
"The numba engine only supports using string or numeric column names"
)
numba_data = numba_data.astype("U")
try:
index._numba_data = numba_data
yield index
finally:
del index._numba_data
# TODO: Range index support
# (this currently lowers OK, but does not round-trip)
class IndexType(types.Type):
"""
The type class for Index objects.
"""
def __init__(self, dtype, layout, pyclass: any) -> None:
self.pyclass = pyclass
name = f"index({dtype}, {layout})"
self.dtype = dtype
self.layout = layout
super().__init__(name)
@property
def key(self):
return self.pyclass, self.dtype, self.layout
@property
def as_array(self):
return types.Array(self.dtype, 1, self.layout)
def copy(self, dtype=None, ndim: int = 1, layout=None):
assert ndim == 1
if dtype is None:
dtype = self.dtype
layout = layout or self.layout
return type(self)(dtype, layout, self.pyclass)
class SeriesType(types.Type):
"""
The type class for Series objects.
"""
def __init__(self, dtype, index, namety) -> None:
assert isinstance(index, IndexType)
self.dtype = dtype
self.index = index
self.values = types.Array(self.dtype, 1, "C")
self.namety = namety
name = f"series({dtype}, {index}, {namety})"
super().__init__(name)
@property
def key(self):
return self.dtype, self.index, self.namety
@property
def as_array(self):
return self.values
def copy(self, dtype=None, ndim: int = 1, layout: str = "C"):
assert ndim == 1
assert layout == "C"
if dtype is None:
dtype = self.dtype
return type(self)(dtype, self.index, self.namety)
@typeof_impl.register(Index)
def typeof_index(val, c):
"""
This will assume that only strings are in object dtype
index.
(you should check this before this gets lowered down to numba)
"""
# arrty = typeof_impl(val._data, c)
arrty = typeof_impl(val._numba_data, c)
assert arrty.ndim == 1
return IndexType(arrty.dtype, arrty.layout, type(val))
@typeof_impl.register(Series)
def typeof_series(val, c):
index = typeof_impl(val.index, c)
arrty = typeof_impl(val.values, c)
namety = typeof_impl(val.name, c)
assert arrty.ndim == 1
assert arrty.layout == "C"
return SeriesType(arrty.dtype, index, namety)
@type_callable(Series)
def type_series_constructor(context):
def typer(data, index, name=None):
if isinstance(index, IndexType) and isinstance(data, types.Array):
assert data.ndim == 1
if name is None:
name = types.intp
return SeriesType(data.dtype, index, name)
return typer
@type_callable(Index)
def type_index_constructor(context):
def typer(data, hashmap=None):
if isinstance(data, types.Array):
assert data.layout == "C"
assert data.ndim == 1
assert hashmap is None or isinstance(hashmap, types.DictType)
return IndexType(data.dtype, layout=data.layout, pyclass=Index)
return typer
# Backend extensions for Index and Series and Frame
@register_model(IndexType)
class IndexModel(models.StructModel):
def __init__(self, dmm, fe_type) -> None:
# We don't want the numpy string scalar type in our hashmap
members = [
("data", fe_type.as_array),
# This is an attempt to emulate our hashtable code with a numba
# typed dict
# It maps from values in the index to their integer positions in the array
("hashmap", types.DictType(fe_type.dtype, types.intp)),
# Pointer to the Index object this was created from, or that it
# boxes to
# https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1
("parent", types.pyobject),
]
models.StructModel.__init__(self, dmm, fe_type, members)
@register_model(SeriesType)
class SeriesModel(models.StructModel):
def __init__(self, dmm, fe_type) -> None:
members = [
("index", fe_type.index),
("values", fe_type.as_array),
("name", fe_type.namety),
]
models.StructModel.__init__(self, dmm, fe_type, members)
make_attribute_wrapper(IndexType, "data", "_data")
make_attribute_wrapper(IndexType, "hashmap", "hashmap")
make_attribute_wrapper(SeriesType, "index", "index")
make_attribute_wrapper(SeriesType, "values", "values")
make_attribute_wrapper(SeriesType, "name", "name")
@lower_builtin(Series, types.Array, IndexType)
def pdseries_constructor(context, builder, sig, args):
data, index = args
series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
series.index = index
series.values = data
series.name = context.get_constant(types.intp, 0)
return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())
@lower_builtin(Series, types.Array, IndexType, types.intp)
@lower_builtin(Series, types.Array, IndexType, types.float64)
@lower_builtin(Series, types.Array, IndexType, types.unicode_type)
def pdseries_constructor_with_name(context, builder, sig, args):
data, index, name = args
series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
series.index = index
series.values = data
series.name = name
return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())
@lower_builtin(Index, types.Array, types.DictType, types.pyobject)
def index_constructor_2arg(context, builder, sig, args):
(data, hashmap, parent) = args
index = cgutils.create_struct_proxy(sig.return_type)(context, builder)
index.data = data
index.hashmap = hashmap
index.parent = parent
return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())
@lower_builtin(Index, types.Array, types.DictType)
def index_constructor_2arg_parent(context, builder, sig, args):
# Basically the same as index_constructor_2arg, but without the parent object
(data, hashmap) = args
index = cgutils.create_struct_proxy(sig.return_type)(context, builder)
index.data = data
index.hashmap = hashmap
return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())
@lower_builtin(Index, types.Array)
def index_constructor_1arg(context, builder, sig, args):
from numba.typed import Dict
key_type = sig.return_type.dtype
value_type = types.intp
def index_impl(data):
return Index(data, Dict.empty(key_type, value_type))
return context.compile_internal(builder, index_impl, sig, args)
# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type
# (regular string)
def maybe_cast_str(x):
# Dummy function that numba can overload
pass
@overload(maybe_cast_str)
def maybe_cast_str_impl(x):
"""Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string).
Is a no-op for other types."""
if isinstance(x, types.UnicodeCharSeq):
return lambda x: str(x)
else:
return lambda x: x
@unbox(IndexType)
def unbox_index(typ, obj, c):
"""
Convert a Index object to a native structure.
Note: Object dtype is not allowed here
"""
data_obj = c.pyapi.object_getattr_string(obj, "_numba_data")
index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
# If we see an object array, assume it has been validated as only containing strings
# We still need to do the conversion though
index.data = c.unbox(typ.as_array, data_obj).value
typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict))
# Create an empty typed dict in numba for the hashmap for indexing
# equiv of numba.typed.Dict.empty(typ.dtype, types.intp)
arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype))
intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp))
hashmap_obj = c.pyapi.call_method(
typed_dict_obj, "empty", (arr_type_obj, intp_type_obj)
)
index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value
# Set the parent for speedy boxing.
index.parent = obj
# Decrefs
c.pyapi.decref(data_obj)
c.pyapi.decref(arr_type_obj)
c.pyapi.decref(intp_type_obj)
c.pyapi.decref(typed_dict_obj)
return NativeValue(index._getvalue())
@unbox(SeriesType)
def unbox_series(typ, obj, c):
"""
Convert a Series object to a native structure.
"""
index_obj = c.pyapi.object_getattr_string(obj, "index")
values_obj = c.pyapi.object_getattr_string(obj, "values")
name_obj = c.pyapi.object_getattr_string(obj, "name")
series = cgutils.create_struct_proxy(typ)(c.context, c.builder)
series.index = c.unbox(typ.index, index_obj).value
series.values = c.unbox(typ.values, values_obj).value
series.name = c.unbox(typ.namety, name_obj).value
# Decrefs
c.pyapi.decref(index_obj)
c.pyapi.decref(values_obj)
c.pyapi.decref(name_obj)
return NativeValue(series._getvalue())
@box(IndexType)
def box_index(typ, val, c):
"""
Convert a native index structure to a Index object.
If our native index is of a numpy string dtype, we'll cast it to
object.
"""
# First build a Numpy array object, then wrap it in a Index
index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
res = cgutils.alloca_once_value(c.builder, index.parent)
# Does parent exist?
# (it means already boxed once, or Index same as original df.index or df.columns)
# xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17
with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (
has_parent,
otherwise,
):
with has_parent:
c.pyapi.incref(index.parent)
with otherwise:
# TODO: preserve the original class for the index
# Also need preserve the name of the Index
# class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass))
class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index))
array_obj = c.box(typ.as_array, index.data)
if isinstance(typ.dtype, types.UnicodeCharSeq):
# We converted to numpy string dtype, convert back
# to object since _simple_new won't do that for us
object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object"))
array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,))
c.pyapi.decref(object_str_obj)
# this is basically Index._simple_new(array_obj, name_obj) in python
index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,))
index.parent = index_obj
c.builder.store(index_obj, res)
# Decrefs
c.pyapi.decref(class_obj)
c.pyapi.decref(array_obj)
return c.builder.load(res)
@box(SeriesType)
def box_series(typ, val, c):
"""
Convert a native series structure to a Series object.
"""
series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
series_const_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series._from_mgr))
mgr_const_obj = c.pyapi.unserialize(
c.pyapi.serialize_object(SingleBlockManager.from_array)
)
index_obj = c.box(typ.index, series.index)
array_obj = c.box(typ.as_array, series.values)
name_obj = c.box(typ.namety, series.name)
# This is basically equivalent of
# pd.Series(data=array_obj, index=index_obj)
# To improve perf, we will construct the Series from a manager
# object to avoid checks.
# We'll also set the name attribute manually to avoid validation
mgr_obj = c.pyapi.call_function_objargs(
mgr_const_obj,
(
array_obj,
index_obj,
),
)
mgr_axes_obj = c.pyapi.object_getattr_string(mgr_obj, "axes")
# Series._constructor_from_mgr(mgr, axes)
series_obj = c.pyapi.call_function_objargs(
series_const_obj, (mgr_obj, mgr_axes_obj)
)
c.pyapi.object_setattr_string(series_obj, "_name", name_obj)
# Decrefs
c.pyapi.decref(series_const_obj)
c.pyapi.decref(mgr_axes_obj)
c.pyapi.decref(mgr_obj)
c.pyapi.decref(mgr_const_obj)
c.pyapi.decref(index_obj)
c.pyapi.decref(array_obj)
c.pyapi.decref(name_obj)
return series_obj
# Add common series reductions (e.g. mean, sum),
# and also add common binops (e.g. add, sub, mul, div)
def generate_series_reduction(ser_reduction, ser_method):
@overload_method(SeriesType, ser_reduction)
def series_reduction(series):
def series_reduction_impl(series):
return ser_method(series.values)
return series_reduction_impl
return series_reduction
def generate_series_binop(binop):
@overload(binop)
def series_binop(series1, value):
if isinstance(series1, SeriesType):
if isinstance(value, SeriesType):
def series_binop_impl(series1, series2):
# TODO: Check index matching?
return Series(
binop(series1.values, series2.values),
series1.index,
series1.name,
)
return series_binop_impl
else:
def series_binop_impl(series1, value):
return Series(
binop(series1.values, value), series1.index, series1.name
)
return series_binop_impl
return series_binop
series_reductions = [
("sum", np.sum),
("mean", np.mean),
# Disabled due to discrepancies between numba std. dev
# and pandas std. dev (no way to specify dof)
# ("std", np.std),
# ("var", np.var),
("min", np.min),
("max", np.max),
]
for reduction, reduction_method in series_reductions:
generate_series_reduction(reduction, reduction_method)
series_binops = [operator.add, operator.sub, operator.mul, operator.truediv]
for ser_binop in series_binops:
generate_series_binop(ser_binop)
# get_loc on Index
@overload_method(IndexType, "get_loc")
def index_get_loc(index, item):
def index_get_loc_impl(index, item):
# Initialize the hash table if not initialized
if len(index.hashmap) == 0:
for i, val in enumerate(index._data):
index.hashmap[val] = i
return index.hashmap[item]
return index_get_loc_impl
# Indexing for Series/Index
@overload(operator.getitem)
def series_indexing(series, item):
if isinstance(series, SeriesType):
def series_getitem(series, item):
loc = series.index.get_loc(item)
return series.iloc[loc]
return series_getitem
@overload(operator.getitem)
def index_indexing(index, idx):
if isinstance(index, IndexType):
def index_getitem(index, idx):
return index._data[idx]
return index_getitem
class IlocType(types.Type):
def __init__(self, obj_type) -> None:
self.obj_type = obj_type
name = f"iLocIndexer({obj_type})"
super().__init__(name=name)
@property
def key(self):
return self.obj_type
@typeof_impl.register(_iLocIndexer)
def typeof_iloc(val, c):
objtype = typeof_impl(val.obj, c)
return IlocType(objtype)
@type_callable(_iLocIndexer)
def type_iloc_constructor(context):
def typer(obj):
if isinstance(obj, SeriesType):
return IlocType(obj)
return typer
@lower_builtin(_iLocIndexer, SeriesType)
def iloc_constructor(context, builder, sig, args):
(obj,) = args
iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder)
iloc_indexer.obj = obj
return impl_ret_borrowed(
context, builder, sig.return_type, iloc_indexer._getvalue()
)
@register_model(IlocType)
class ILocModel(models.StructModel):
def __init__(self, dmm, fe_type) -> None:
members = [("obj", fe_type.obj_type)]
models.StructModel.__init__(self, dmm, fe_type, members)
make_attribute_wrapper(IlocType, "obj", "obj")
@overload_attribute(SeriesType, "iloc")
def series_iloc(series):
def get(series):
return _iLocIndexer(series)
return get
@overload(operator.getitem)
def iloc_getitem(iloc_indexer, i):
if isinstance(iloc_indexer, IlocType):
def getitem_impl(iloc_indexer, i):
return iloc_indexer.obj.values[i]
return getitem_impl
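
A rough sketch of what these hooks enable. In pandas they are exercised internally by DataFrame.apply(..., engine="numba") rather than called directly, so treat this as illustrative and numba-version dependent: a jitted user function can receive a Series and use .values, .index and .iloc in nopython mode.

import numba
import numpy as np

from pandas import Index, Series
from pandas.core._numba.extensions import set_numba_data


@numba.jit(nopython=True)
def head_plus_total(ser):
    # ``ser`` is unboxed into the native SeriesType registered above.
    return ser.iloc[0] + ser.values.sum()


ser = Series(np.array([1.0, 2.0, 3.0]), index=Index([10, 20, 30]), name="x")
with set_numba_data(ser.index):
    result = head_plus_total(ser)  # 1.0 + 6.0 == 7.0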


@@ -0,0 +1,27 @@
from pandas.core._numba.kernels.mean_ import (
grouped_mean,
sliding_mean,
)
from pandas.core._numba.kernels.min_max_ import (
grouped_min_max,
sliding_min_max,
)
from pandas.core._numba.kernels.sum_ import (
grouped_sum,
sliding_sum,
)
from pandas.core._numba.kernels.var_ import (
grouped_var,
sliding_var,
)
__all__ = [
"sliding_mean",
"grouped_mean",
"sliding_sum",
"grouped_sum",
"sliding_var",
"grouped_var",
"sliding_min_max",
"grouped_min_max",
]


@@ -0,0 +1,196 @@
"""
Numba 1D mean kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
import numpy as np
from pandas.core._numba.kernels.shared import is_monotonic_increasing
from pandas.core._numba.kernels.sum_ import grouped_kahan_sum
if TYPE_CHECKING:
from pandas._typing import npt
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_mean(
val: float,
nobs: int,
sum_x: float,
neg_ct: int,
compensation: float,
num_consecutive_same_value: int,
prev_value: float,
) -> tuple[int, float, int, float, int, float]:
if not np.isnan(val):
nobs += 1
y = val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
if val < 0:
neg_ct += 1
if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val
return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_mean(
val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
) -> tuple[int, float, int, float]:
if not np.isnan(val):
nobs -= 1
y = -val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
if val < 0:
neg_ct -= 1
return nobs, sum_x, neg_ct, compensation
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_mean(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
N = len(start)
nobs = 0
sum_x = 0.0
neg_ct = 0
compensation_add = 0.0
compensation_remove = 0.0
is_monotonic_increasing_bounds = is_monotonic_increasing(
start
) and is_monotonic_increasing(end)
output = np.empty(N, dtype=result_dtype)
for i in range(N):
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
(
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_mean(
val,
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
else:
for j in range(start[i - 1], s):
val = values[j]
nobs, sum_x, neg_ct, compensation_remove = remove_mean(
val, nobs, sum_x, neg_ct, compensation_remove
)
for j in range(end[i - 1], e):
val = values[j]
(
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_mean(
val,
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
if nobs >= min_periods and nobs > 0:
result = sum_x / nobs
if num_consecutive_same_value >= nobs:
result = prev_value
elif neg_ct == 0 and result < 0:
result = 0
elif neg_ct == nobs and result > 0:
result = 0
else:
result = np.nan
output[i] = result
if not is_monotonic_increasing_bounds:
nobs = 0
sum_x = 0.0
neg_ct = 0
compensation_remove = 0.0
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_mean(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
)
# Post-processing, replace sums that don't satisfy min_periods
for lab in range(ngroups):
nobs = nobs_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
sum_x = output[lab]
if nobs >= min_periods:
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = np.nan
result /= nobs
output[lab] = result
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
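
A small sketch of calling the kernel directly (illustrative; pandas normally routes through the executor shown earlier, and numba must be installed). ``result_dtype`` is passed as a numpy scalar type, matching what the dtype mappings produce, and NaNs are simply skipped:

import numpy as np

from pandas.core._numba.kernels import sliding_mean

values = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
start = np.zeros(5, dtype=np.int64)    # expanding windows:
end = np.arange(1, 6, dtype=np.int64)  # window i is values[start[i]:end[i]]

out, na_pos = sliding_mean(values, np.float64, start, end, 1)
# out -> [1.0, 1.5, 1.5, 2.333..., 3.0]; na_pos stays empty for float input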


@@ -0,0 +1,125 @@
"""
Numba 1D min/max kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_min_max(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
is_max: bool,
) -> tuple[np.ndarray, list[int]]:
N = len(start)
nobs = 0
output = np.empty(N, dtype=result_dtype)
na_pos = []
# Use deque once numba supports it
# https://github.com/numba/numba/issues/7417
Q: list = []
W: list = []
for i in range(N):
curr_win_size = end[i] - start[i]
if i == 0:
st = start[i]
else:
st = end[i - 1]
for k in range(st, end[i]):
ai = values[k]
if not np.isnan(ai):
nobs += 1
elif is_max:
ai = -np.inf
else:
ai = np.inf
# Discard previous entries if we find new min or max
if is_max:
while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
Q.pop()
else:
while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
Q.pop()
Q.append(k)
W.append(k)
# Discard entries outside and left of current window
while Q and Q[0] <= start[i] - 1:
Q.pop(0)
while W and W[0] <= start[i] - 1:
if not np.isnan(values[W[0]]):
nobs -= 1
W.pop(0)
# Save output based on index in input value array
if Q and curr_win_size > 0 and nobs >= min_periods:
output[i] = values[Q[0]]
else:
if values.dtype.kind != "i":
output[i] = np.nan
else:
na_pos.append(i)
return output, na_pos
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_min_max(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
is_max: bool,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs = np.zeros(ngroups, dtype=np.int64)
na_pos = []
output = np.empty(ngroups, dtype=result_dtype)
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
continue
if values.dtype.kind == "i" or not np.isnan(val):
nobs[lab] += 1
else:
# NaN value cannot be a min/max value
continue
if nobs[lab] == 1:
# First element in group, set output equal to this
output[lab] = val
continue
if is_max:
if val > output[lab]:
output[lab] = val
else:
if val < output[lab]:
output[lab] = val
# Set labels that don't satisfy min_periods as np.nan
for lab, count in enumerate(nobs):
if count < min_periods:
na_pos.append(lab)
return output, na_pos
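
For the grouped variant, labels assign each value to a group, NaNs are skipped, and groups with fewer than min_periods observations are reported via na_pos. An illustrative direct call (not the pandas-internal entry point):

import numpy as np

from pandas.core._numba.kernels import grouped_min_max

values = np.array([3.0, np.nan, 1.0, 7.0, 5.0])
labels = np.array([0, 0, 0, 1, 1], dtype=np.intp)

maxes, na_pos = grouped_min_max(values, np.float64, labels, 2, 1, True)
# maxes -> [3.0, 7.0]; na_pos would list groups that miss min_periods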


@@ -0,0 +1,29 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
if TYPE_CHECKING:
import numpy as np
@numba.jit(
# error: Any? not callable
numba.boolean(numba.int64[:]), # type: ignore[misc]
nopython=True,
nogil=True,
parallel=False,
)
def is_monotonic_increasing(bounds: np.ndarray) -> bool:
"""Check if int64 values are monotonically increasing."""
n = len(bounds)
if n < 2:
return True
prev = bounds[0]
for i in range(1, n):
cur = bounds[i]
if cur < prev:
return False
prev = cur
return True


@@ -0,0 +1,244 @@
"""
Numba 1D sum kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numba
from numba.extending import register_jitable
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
from pandas.core._numba.kernels.shared import is_monotonic_increasing
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_sum(
val: Any,
nobs: int,
sum_x: Any,
compensation: Any,
num_consecutive_same_value: int,
prev_value: Any,
) -> tuple[int, Any, Any, int, Any]:
if not np.isnan(val):
nobs += 1
y = val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val
return nobs, sum_x, compensation, num_consecutive_same_value, prev_value
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_sum(
val: Any, nobs: int, sum_x: Any, compensation: Any
) -> tuple[int, Any, Any]:
if not np.isnan(val):
nobs -= 1
y = -val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
return nobs, sum_x, compensation
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_sum(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
dtype = values.dtype
na_val: object = np.nan
if dtype.kind == "i":
na_val = 0
N = len(start)
nobs = 0
sum_x = 0
compensation_add = 0
compensation_remove = 0
na_pos = []
is_monotonic_increasing_bounds = is_monotonic_increasing(
start
) and is_monotonic_increasing(end)
output = np.empty(N, dtype=result_dtype)
for i in range(N):
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
else:
for j in range(start[i - 1], s):
val = values[j]
nobs, sum_x, compensation_remove = remove_sum(
val, nobs, sum_x, compensation_remove
)
for j in range(end[i - 1], e):
val = values[j]
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
if nobs == 0 == min_periods:
result: object = 0
elif nobs >= min_periods:
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = na_val
if dtype.kind == "i":
na_pos.append(i)
output[i] = result
if not is_monotonic_increasing_bounds:
nobs = 0
sum_x = 0
compensation_remove = 0
return output, na_pos
# Mypy/pyright don't like the fact that the decorator is untyped
@register_jitable # type: ignore[misc]
def grouped_kahan_sum(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
) -> tuple[
np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray
]:
N = len(labels)
nobs_arr = np.zeros(ngroups, dtype=np.int64)
comp_arr = np.zeros(ngroups, dtype=values.dtype)
consecutive_counts = np.zeros(ngroups, dtype=np.int64)
prev_vals = np.zeros(ngroups, dtype=values.dtype)
output = np.zeros(ngroups, dtype=result_dtype)
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
continue
sum_x = output[lab]
nobs = nobs_arr[lab]
compensation_add = comp_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
output[lab] = sum_x
consecutive_counts[lab] = num_consecutive_same_value
prev_vals[lab] = prev_value
comp_arr[lab] = compensation_add
nobs_arr[lab] = nobs
return output, nobs_arr, comp_arr, consecutive_counts, prev_vals
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_sum(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
na_pos = []
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
)
# Post-processing, replace sums that don't satisfy min_periods
for lab in range(ngroups):
nobs = nobs_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
sum_x = output[lab]
if nobs >= min_periods:
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = sum_x # Don't change val, will be replaced by nan later
na_pos.append(lab)
output[lab] = result
return output, na_pos
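
add_sum/remove_sum (and add_mean earlier) use Kahan compensated summation: the ``compensation`` term captures the low-order bits that a plain ``sum_x + val`` would drop. A plain-Python sketch of the same update shows the effect:

def kahan_add(val, sum_x, compensation):
    # Same update as ``add_sum``: re-inject the error lost on the previous
    # addition, then record what was lost on this one.
    y = val - compensation
    t = sum_x + y
    compensation = t - sum_x - y
    return t, compensation


naive = kahan = 1e16
comp = 0.0
for _ in range(1000):
    naive += 1.0                        # each add rounds straight back to 1e16
    kahan, comp = kahan_add(1.0, kahan, comp)

assert naive - 1e16 == 0.0              # the 1000 additions vanished
assert kahan - 1e16 == 1000.0           # the compensated sum kept them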


@@ -0,0 +1,245 @@
"""
Numba 1D var kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
from pandas.core._numba.kernels.shared import is_monotonic_increasing
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_var(
val: float,
nobs: int,
mean_x: float,
ssqdm_x: float,
compensation: float,
num_consecutive_same_value: int,
prev_value: float,
) -> tuple[int, float, float, float, int, float]:
if not np.isnan(val):
if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val
nobs += 1
prev_mean = mean_x - compensation
y = val - compensation
t = y - mean_x
compensation = t + mean_x - y
delta = t
if nobs:
mean_x += delta / nobs
else:
mean_x = 0
ssqdm_x += (val - prev_mean) * (val - mean_x)
return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_var(
val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
) -> tuple[int, float, float, float]:
if not np.isnan(val):
nobs -= 1
if nobs:
prev_mean = mean_x - compensation
y = val - compensation
t = y - mean_x
compensation = t + mean_x - y
delta = t
mean_x -= delta / nobs
ssqdm_x -= (val - prev_mean) * (val - mean_x)
else:
mean_x = 0
ssqdm_x = 0
return nobs, mean_x, ssqdm_x, compensation
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_var(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
N = len(start)
nobs = 0
mean_x = 0.0
ssqdm_x = 0.0
compensation_add = 0.0
compensation_remove = 0.0
min_periods = max(min_periods, 1)
is_monotonic_increasing_bounds = is_monotonic_increasing(
start
) and is_monotonic_increasing(end)
output = np.empty(N, dtype=result_dtype)
for i in range(N):
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
else:
for j in range(start[i - 1], s):
val = values[j]
nobs, mean_x, ssqdm_x, compensation_remove = remove_var(
val, nobs, mean_x, ssqdm_x, compensation_remove
)
for j in range(end[i - 1], e):
val = values[j]
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
if nobs >= min_periods and nobs > ddof:
if nobs == 1 or num_consecutive_same_value >= nobs:
result = 0.0
else:
result = ssqdm_x / (nobs - ddof)
else:
result = np.nan
output[i] = result
if not is_monotonic_increasing_bounds:
nobs = 0
mean_x = 0.0
ssqdm_x = 0.0
compensation_remove = 0.0
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_var(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs_arr = np.zeros(ngroups, dtype=np.int64)
comp_arr = np.zeros(ngroups, dtype=values.dtype)
consecutive_counts = np.zeros(ngroups, dtype=np.int64)
prev_vals = np.zeros(ngroups, dtype=values.dtype)
output = np.zeros(ngroups, dtype=result_dtype)
means = np.zeros(ngroups, dtype=result_dtype)
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
continue
mean_x = means[lab]
ssqdm_x = output[lab]
nobs = nobs_arr[lab]
compensation_add = comp_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
output[lab] = ssqdm_x
means[lab] = mean_x
consecutive_counts[lab] = num_consecutive_same_value
prev_vals[lab] = prev_value
comp_arr[lab] = compensation_add
nobs_arr[lab] = nobs
# Post-processing, replace vars that don't satisfy min_periods
for lab in range(ngroups):
nobs = nobs_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
ssqdm_x = output[lab]
if nobs >= min_periods and nobs > ddof:
if nobs == 1 or num_consecutive_same_value >= nobs:
result = 0.0
else:
result = ssqdm_x / (nobs - ddof)
else:
result = np.nan
output[lab] = result
# Second pass to get the std.dev
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
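
add_var/remove_var are a compensated (Kahan-corrected) form of Welford's online variance update. Stripped of the compensation term, the recurrence looks like this (plain-Python sketch, checked against numpy):

import numpy as np


def welford_add(val, nobs, mean_x, ssqdm_x):
    # delta uses the old mean; ssqdm_x accumulates (x - old_mean) * (x - new_mean)
    nobs += 1
    delta = val - mean_x
    mean_x += delta / nobs
    ssqdm_x += delta * (val - mean_x)
    return nobs, mean_x, ssqdm_x


data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
nobs, mean_x, ssqdm_x = 0, 0.0, 0.0
for v in data:
    nobs, mean_x, ssqdm_x = welford_add(v, nobs, mean_x, ssqdm_x)

assert np.isclose(ssqdm_x / (nobs - 1), np.var(data, ddof=1))  # ddof=1 as in the kernel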


@@ -0,0 +1,340 @@
"""
accessor.py contains base classes for implementing accessor properties
that can be mixed into or pinned onto other pandas classes.
"""
from __future__ import annotations
from typing import (
Callable,
final,
)
import warnings
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
class DirNamesMixin:
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset()
@final
def _dir_deletions(self) -> set[str]:
"""
Delete unwanted __dir__ for this object.
"""
return self._accessors | self._hidden_attrs
def _dir_additions(self) -> set[str]:
"""
Add additional __dir__ for this object.
"""
return {accessor for accessor in self._accessors if hasattr(self, accessor)}
def __dir__(self) -> list[str]:
"""
Provide method name lookup and completion.
Notes
-----
Only provide 'public' methods.
"""
rv = set(super().__dir__())
rv = (rv - self._dir_deletions()) | self._dir_additions()
return sorted(rv)
class PandasDelegate:
"""
Abstract base class for delegating methods/properties.
"""
def _delegate_property_get(self, name: str, *args, **kwargs):
raise TypeError(f"You cannot access the property {name}")
def _delegate_property_set(self, name: str, value, *args, **kwargs):
raise TypeError(f"The property {name} cannot be set")
def _delegate_method(self, name: str, *args, **kwargs):
raise TypeError(f"You cannot call method {name}")
@classmethod
def _add_delegate_accessors(
cls,
delegate,
accessors: list[str],
typ: str,
overwrite: bool = False,
accessor_mapping: Callable[[str], str] = lambda x: x,
raise_on_missing: bool = True,
) -> None:
"""
Add accessors to cls from the delegate class.
Parameters
----------
cls
Class to add the methods/properties to.
delegate
Class to get methods/properties and doc-strings.
accessors : list of str
List of accessors to add.
typ : {'property', 'method'}
overwrite : bool, default False
Overwrite the method/property in the target class if it exists.
accessor_mapping: Callable, default lambda x: x
Callable to map the delegate's function to the cls' function.
raise_on_missing: bool, default True
Raise if an accessor does not exist on delegate.
False skips the missing accessor.
"""
def _create_delegator_property(name: str):
def _getter(self):
return self._delegate_property_get(name)
def _setter(self, new_values):
return self._delegate_property_set(name, new_values)
_getter.__name__ = name
_setter.__name__ = name
return property(
fget=_getter,
fset=_setter,
doc=getattr(delegate, accessor_mapping(name)).__doc__,
)
def _create_delegator_method(name: str):
def f(self, *args, **kwargs):
return self._delegate_method(name, *args, **kwargs)
f.__name__ = name
f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__
return f
for name in accessors:
if (
not raise_on_missing
and getattr(delegate, accessor_mapping(name), None) is None
):
continue
if typ == "property":
f = _create_delegator_property(name)
else:
f = _create_delegator_method(name)
# don't overwrite existing methods/properties
if overwrite or not hasattr(cls, name):
setattr(cls, name, f)
def delegate_names(
delegate,
accessors: list[str],
typ: str,
overwrite: bool = False,
accessor_mapping: Callable[[str], str] = lambda x: x,
raise_on_missing: bool = True,
):
"""
Add delegated names to a class using a class decorator. This provides
an alternative to calling `_add_delegate_accessors` directly
below a class definition.
Parameters
----------
delegate : object
The class to get methods/properties & doc-strings.
accessors : Sequence[str]
List of accessors to add.
typ : {'property', 'method'}
overwrite : bool, default False
Overwrite the method/property in the target class if it exists.
accessor_mapping: Callable, default lambda x: x
Callable to map the delegate's function to the cls' function.
raise_on_missing: bool, default True
Raise if an accessor does not exist on delegate.
False skips the missing accessor.
Returns
-------
callable
A class decorator.
Examples
--------
@delegate_names(Categorical, ["categories", "ordered"], "property")
class CategoricalAccessor(PandasDelegate):
[...]
"""
def add_delegate_accessors(cls):
cls._add_delegate_accessors(
delegate,
accessors,
typ,
overwrite=overwrite,
accessor_mapping=accessor_mapping,
raise_on_missing=raise_on_missing,
)
return cls
return add_delegate_accessors
# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE
# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py
# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors
# 2. We use a UserWarning instead of a custom Warning
class CachedAccessor:
"""
Custom property-like object.
A descriptor for caching accessors.
Parameters
----------
name : str
Namespace that will be accessed under, e.g. ``df.foo``.
accessor : cls
Class with the extension methods.
Notes
-----
The accessor class's __init__ method assumes that one of
``Series``, ``DataFrame`` or ``Index`` is passed as the
single argument ``data``.
"""
def __init__(self, name: str, accessor) -> None:
self._name = name
self._accessor = accessor
def __get__(self, obj, cls):
if obj is None:
# we're accessing the attribute of the class, i.e., Dataset.geo
return self._accessor
accessor_obj = self._accessor(obj)
# Replace the property with the accessor object. Inspired by:
# https://www.pydanny.com/cached-property.html
# We need to use object.__setattr__ because we overwrite __setattr__ on
# NDFrame
object.__setattr__(obj, self._name, accessor_obj)
return accessor_obj
@doc(klass="", others="")
def _register_accessor(name: str, cls):
"""
Register a custom accessor on {klass} objects.
Parameters
----------
name : str
Name under which the accessor should be registered. A warning is issued
if this name conflicts with a preexisting attribute.
Returns
-------
callable
A class decorator.
See Also
--------
register_dataframe_accessor : Register a custom accessor on DataFrame objects.
register_series_accessor : Register a custom accessor on Series objects.
register_index_accessor : Register a custom accessor on Index objects.
Notes
-----
When accessed, your accessor will be initialized with the pandas object
the user is interacting with. So the signature must be
.. code-block:: python
def __init__(self, pandas_object): # noqa: E999
...
For consistency with pandas methods, you should raise an ``AttributeError``
if the data passed to your accessor has an incorrect dtype.
>>> pd.Series(['a', 'b']).dt
Traceback (most recent call last):
...
AttributeError: Can only use .dt accessor with datetimelike values
Examples
--------
In your library code::
import pandas as pd
@pd.api.extensions.register_dataframe_accessor("geo")
class GeoAccessor:
def __init__(self, pandas_obj):
self._obj = pandas_obj
@property
def center(self):
# return the geographic center point of this DataFrame
lat = self._obj.latitude
lon = self._obj.longitude
return (float(lon.mean()), float(lat.mean()))
def plot(self):
# plot this array's data on a map, e.g., using Cartopy
pass
Back in an interactive IPython session:
.. code-block:: ipython
In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
...: "latitude": np.linspace(0, 20)}})
In [2]: ds.geo.center
Out[2]: (5.0, 10.0)
In [3]: ds.geo.plot() # plots data on a map
"""
def decorator(accessor):
if hasattr(cls, name):
warnings.warn(
f"registration of accessor {repr(accessor)} under name "
f"{repr(name)} for type {repr(cls)} is overriding a preexisting "
f"attribute with the same name.",
UserWarning,
stacklevel=find_stack_level(),
)
setattr(cls, name, CachedAccessor(name, accessor))
cls._accessors.add(name)
return accessor
return decorator
@doc(_register_accessor, klass="DataFrame")
def register_dataframe_accessor(name: str):
from pandas import DataFrame
return _register_accessor(name, DataFrame)
@doc(_register_accessor, klass="Series")
def register_series_accessor(name: str):
from pandas import Series
return _register_accessor(name, Series)
@doc(_register_accessor, klass="Index")
def register_index_accessor(name: str):
from pandas import Index
return _register_accessor(name, Index)
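
A condensed sketch of the delegation pattern (hypothetical class name; real accessors also validate dtypes and handle setters): delegate_names wires properties on the wrapper through to _delegate_property_get.

from pandas import Categorical
from pandas.core.accessor import PandasDelegate, delegate_names


@delegate_names(Categorical, ["categories", "ordered"], typ="property")
class SimpleCategoricalAccessor(PandasDelegate):
    def __init__(self, data: Categorical) -> None:
        self._data = data

    def _delegate_property_get(self, name: str, *args, **kwargs):
        # Every delegated property funnels through here.
        return getattr(self._data, name)


cat = Categorical(["a", "b", "a"], ordered=True)
acc = SimpleCategoricalAccessor(cat)
acc.ordered      # True
acc.categories   # Index(['a', 'b'], dtype='object')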

File diff suppressed because it is too large


@@ -0,0 +1,140 @@
from pandas._libs import (
NaT,
Period,
Timedelta,
Timestamp,
)
from pandas._libs.missing import NA
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
DatetimeTZDtype,
IntervalDtype,
PeriodDtype,
)
from pandas.core.dtypes.missing import (
isna,
isnull,
notna,
notnull,
)
from pandas.core.algorithms import (
factorize,
unique,
value_counts,
)
from pandas.core.arrays import Categorical
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
from pandas.core.arrays.string_ import StringDtype
from pandas.core.construction import array
from pandas.core.flags import Flags
from pandas.core.groupby import (
Grouper,
NamedAgg,
)
from pandas.core.indexes.api import (
CategoricalIndex,
DatetimeIndex,
Index,
IntervalIndex,
MultiIndex,
PeriodIndex,
RangeIndex,
TimedeltaIndex,
)
from pandas.core.indexes.datetimes import (
bdate_range,
date_range,
)
from pandas.core.indexes.interval import (
Interval,
interval_range,
)
from pandas.core.indexes.period import period_range
from pandas.core.indexes.timedeltas import timedelta_range
from pandas.core.indexing import IndexSlice
from pandas.core.series import Series
from pandas.core.tools.datetimes import to_datetime
from pandas.core.tools.numeric import to_numeric
from pandas.core.tools.timedeltas import to_timedelta
from pandas.io.formats.format import set_eng_float_format
from pandas.tseries.offsets import DateOffset
# DataFrame needs to be imported after NamedAgg to avoid a circular import
from pandas.core.frame import DataFrame # isort:skip
__all__ = [
"array",
"ArrowDtype",
"bdate_range",
"BooleanDtype",
"Categorical",
"CategoricalDtype",
"CategoricalIndex",
"DataFrame",
"DateOffset",
"date_range",
"DatetimeIndex",
"DatetimeTZDtype",
"factorize",
"Flags",
"Float32Dtype",
"Float64Dtype",
"Grouper",
"Index",
"IndexSlice",
"Int16Dtype",
"Int32Dtype",
"Int64Dtype",
"Int8Dtype",
"Interval",
"IntervalDtype",
"IntervalIndex",
"interval_range",
"isna",
"isnull",
"MultiIndex",
"NA",
"NamedAgg",
"NaT",
"notna",
"notnull",
"Period",
"PeriodDtype",
"PeriodIndex",
"period_range",
"RangeIndex",
"Series",
"set_eng_float_format",
"StringDtype",
"Timedelta",
"TimedeltaIndex",
"timedelta_range",
"Timestamp",
"to_datetime",
"to_numeric",
"to_timedelta",
"UInt16Dtype",
"UInt32Dtype",
"UInt64Dtype",
"UInt8Dtype",
"unique",
"value_counts",
]

File diff suppressed because it is too large


@@ -0,0 +1,9 @@
"""
core.array_algos is for algorithms that operate on ndarray and ExtensionArray.
These should:
- Assume that any Index, Series, or DataFrame objects have already been unwrapped.
- Assume that any list arguments have already been cast to ndarray/EA.
- Not depend on Index, Series, or DataFrame, nor import any of these.
- May dispatch to ExtensionArray methods, but should not import from core.arrays.
"""


@@ -0,0 +1,67 @@
"""
datetimelike_accumulations.py is for accumulations of datetimelike extension arrays
"""
from __future__ import annotations
from typing import Callable
import numpy as np
from pandas._libs import iNaT
from pandas.core.dtypes.missing import isna
def _cum_func(
func: Callable,
values: np.ndarray,
*,
skipna: bool = True,
):
"""
Accumulations for 1D datetimelike arrays.
Parameters
----------
func : np.cumsum, np.maximum.accumulate, np.minimum.accumulate
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation). ``values`` is modified in place.
skipna : bool, default True
Whether to skip NA.
"""
try:
fill_value = {
np.maximum.accumulate: np.iinfo(np.int64).min,
np.cumsum: 0,
np.minimum.accumulate: np.iinfo(np.int64).max,
}[func]
except KeyError:
raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray")
mask = isna(values)
y = values.view("i8")
y[mask] = fill_value
if not skipna:
mask = np.maximum.accumulate(mask)
result = func(y)
result[mask] = iNaT
if values.dtype.kind in "mM":
return result.view(values.dtype.base)
return result
def cumsum(values: np.ndarray, *, skipna: bool = True) -> np.ndarray:
return _cum_func(np.cumsum, values, skipna=skipna)
def cummin(values: np.ndarray, *, skipna: bool = True):
return _cum_func(np.minimum.accumulate, values, skipna=skipna)
def cummax(values: np.ndarray, *, skipna: bool = True):
return _cum_func(np.maximum.accumulate, values, skipna=skipna)
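
A short illustration of the NaT handling above: the array is viewed as int64, masked slots get a sentinel fill value for the accumulation, and NaT is written back afterwards (note the helpers mutate their input, hence the copy):

import numpy as np

from pandas.core.array_algos.datetimelike_accumulations import cummax

stamps = np.array(["2024-01-02", "NaT", "2024-01-01"], dtype="M8[ns]")
result = cummax(stamps.copy(), skipna=True)
# -> ['2024-01-02', 'NaT', '2024-01-02']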


@@ -0,0 +1,90 @@
"""
masked_accumulations.py is for accumulation algorithms using a mask-based approach
for missing values.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Callable,
)
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
def _cum_func(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
):
"""
Accumulations for 1D masked array.
We will modify values in place to replace NAs with the appropriate fill value.
Parameters
----------
func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
mask : np.ndarray
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
"""
dtype_info: np.iinfo | np.finfo
if values.dtype.kind == "f":
dtype_info = np.finfo(values.dtype.type)
elif values.dtype.kind in "iu":
dtype_info = np.iinfo(values.dtype.type)
elif values.dtype.kind == "b":
# Max value of bool is 1, but since we are setting into a boolean
# array, 255 is fine as well. Min value has to be 0 when setting
# into the boolean array.
dtype_info = np.iinfo(np.uint8)
else:
raise NotImplementedError(
f"No masked accumulation defined for dtype {values.dtype.type}"
)
try:
fill_value = {
np.cumprod: 1,
np.maximum.accumulate: dtype_info.min,
np.cumsum: 0,
np.minimum.accumulate: dtype_info.max,
}[func]
except KeyError:
raise NotImplementedError(
f"No accumulation for {func} implemented on BaseMaskedArray"
)
values[mask] = fill_value
if not skipna:
mask = np.maximum.accumulate(mask)
values = func(values)
return values, mask
def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.cumsum, values, mask, skipna=skipna)
def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.cumprod, values, mask, skipna=skipna)
def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna)
def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna)


@@ -0,0 +1,197 @@
"""
masked_reductions.py is for reduction algorithms using a mask-based approach
for missing values.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Callable,
)
import warnings
import numpy as np
from pandas._libs import missing as libmissing
from pandas.core.nanops import check_below_min_count
if TYPE_CHECKING:
from pandas._typing import (
AxisInt,
npt,
)
def _reductions(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
min_count: int = 0,
axis: AxisInt | None = None,
**kwargs,
):
"""
Sum, mean or product for 1D masked array.
Parameters
----------
func : np.sum or np.prod
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
mask : np.ndarray[bool]
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
min_count : int, default 0
The required number of valid values to perform the operation. If fewer than
``min_count`` non-NA values are present the result will be NA.
axis : int, optional, default None
"""
if not skipna:
if mask.any() or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
return func(values, axis=axis, **kwargs)
else:
if check_below_min_count(values.shape, mask, min_count) and (
axis is None or values.ndim == 1
):
return libmissing.NA
return func(values, where=~mask, axis=axis, **kwargs)
def sum(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
min_count: int = 0,
axis: AxisInt | None = None,
):
return _reductions(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)
def prod(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
min_count: int = 0,
axis: AxisInt | None = None,
):
return _reductions(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)
def _minmax(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
"""
Reduction for 1D masked array.
Parameters
----------
func : np.min or np.max
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
mask : np.ndarray[bool]
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
axis : int, optional, default None
"""
if not skipna:
if mask.any() or not values.size:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
else:
return func(values, axis=axis)
else:
subset = values[~mask]
if subset.size:
return func(subset, axis=axis)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
def min(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)
def max(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)
def mean(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
if not values.size or mask.all():
return libmissing.NA
return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis)
def var(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
ddof: int = 1,
):
if not values.size or mask.all():
return libmissing.NA
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
return _reductions(
np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
)
def std(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
ddof: int = 1,
):
if not values.size or mask.all():
return libmissing.NA
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
return _reductions(
np.std, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
)

View File

@@ -0,0 +1,149 @@
"""
EA-compatible analogue to np.putmask
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._libs import lib
from pandas.core.dtypes.cast import infer_dtype_from
from pandas.core.dtypes.common import is_list_like
from pandas.core.arrays import ExtensionArray
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
npt,
)
from pandas import MultiIndex
def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None:
"""
ExtensionArray-compatible implementation of np.putmask. The main
difference is we do not handle repeating or truncating like numpy.
Parameters
----------
values: np.ndarray or ExtensionArray
mask : np.ndarray[bool]
We assume extract_bool_array has already been called.
value : Any
"""
if (
not isinstance(values, np.ndarray)
or (values.dtype == object and not lib.is_scalar(value))
# GH#43424: np.putmask raises TypeError if we cannot cast between types with
# rule = "safe", a stricter guarantee we may not have here
or (
isinstance(value, np.ndarray) and not np.can_cast(value.dtype, values.dtype)
)
):
# GH#19266 using np.putmask gives unexpected results with listlike value
# along with object dtype
if is_list_like(value) and len(value) == len(values):
values[mask] = value[mask]
else:
values[mask] = value
else:
# GH#37833 np.putmask is more performant than __setitem__
np.putmask(values, mask, value)
def putmask_without_repeat(
values: np.ndarray, mask: npt.NDArray[np.bool_], new: Any
) -> None:
"""
np.putmask will truncate or repeat if `new` is a listlike with
len(new) != len(values). We require an exact match.
Parameters
----------
values : np.ndarray
mask : np.ndarray[bool]
new : Any
"""
if getattr(new, "ndim", 0) >= 1:
new = new.astype(values.dtype, copy=False)
# TODO: this prob needs some better checking for 2D cases
nlocs = mask.sum()
if nlocs > 0 and is_list_like(new) and getattr(new, "ndim", 1) == 1:
shape = np.shape(new)
        # np.shape compat in case setitem_datetimelike_compat
        # changed arraylike to list, e.g. test_where_dt64_2d
if nlocs == shape[-1]:
# GH#30567
# If length of ``new`` is less than the length of ``values``,
# `np.putmask` would first repeat the ``new`` array and then
# assign the masked values hence produces incorrect result.
            # `np.place` on the other hand uses the ``new`` values as they are
            # to place in the masked locations of ``values``
np.place(values, mask, new)
# i.e. values[mask] = new
elif mask.shape[-1] == shape[-1] or shape[-1] == 1:
np.putmask(values, mask, new)
else:
raise ValueError("cannot assign mismatch length to masked array")
else:
np.putmask(values, mask, new)
def validate_putmask(
values: ArrayLike | MultiIndex, mask: np.ndarray
) -> tuple[npt.NDArray[np.bool_], bool]:
"""
Validate mask and check if this putmask operation is a no-op.
"""
mask = extract_bool_array(mask)
if mask.shape != values.shape:
raise ValueError("putmask: mask and data must be the same size")
noop = not mask.any()
return mask, noop
def extract_bool_array(mask: ArrayLike) -> npt.NDArray[np.bool_]:
"""
If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
"""
if isinstance(mask, ExtensionArray):
# We could have BooleanArray, Sparse[bool], ...
# Except for BooleanArray, this is equivalent to just
# np.asarray(mask, dtype=bool)
mask = mask.to_numpy(dtype=bool, na_value=False)
mask = np.asarray(mask, dtype=bool)
return mask
def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other):
"""
Parameters
----------
values : np.ndarray
num_set : int
For putmask, this is mask.sum()
other : Any
"""
if values.dtype == object:
dtype, _ = infer_dtype_from(other)
if lib.is_np_dtype(dtype, "mM"):
# https://github.com/numpy/numpy/issues/12550
# timedelta64 will incorrectly cast to int
if not is_list_like(other):
other = [other] * num_set
else:
other = list(other)
return other
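# A minimal usage sketch (hypothetical example) contrasting np.putmask, which
# silently recycles a too-short replacement, with putmask_without_repeat,
# which requires the replacement length to line up with the masked positions.
if __name__ == "__main__":
    values = np.array([10, 20, 30, 40])
    mask = np.array([True, False, True, True])

    ok = values.copy()
    putmask_without_repeat(ok, mask, [1, 2, 3])  # exactly mask.sum() values
    # ok -> [1, 20, 2, 3]

    try:
        putmask_without_repeat(values.copy(), mask, [1, 2])
    except ValueError as err:
        print(err)  # the length mismatch is rejected instead of recycled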

View File

@@ -0,0 +1,226 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas.core.dtypes.missing import (
isna,
na_value_for_dtype,
)
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
Scalar,
npt,
)
def quantile_compat(
values: ArrayLike, qs: npt.NDArray[np.float64], interpolation: str
) -> ArrayLike:
"""
Compute the quantiles of the given values for each quantile in `qs`.
Parameters
----------
values : np.ndarray or ExtensionArray
qs : np.ndarray[float64]
interpolation : str
Returns
-------
np.ndarray or ExtensionArray
"""
if isinstance(values, np.ndarray):
fill_value = na_value_for_dtype(values.dtype, compat=False)
mask = isna(values)
return quantile_with_mask(values, mask, fill_value, qs, interpolation)
else:
return values._quantile(qs, interpolation)
def quantile_with_mask(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
fill_value,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> np.ndarray:
"""
Compute the quantiles of the given values for each quantile in `qs`.
Parameters
----------
values : np.ndarray
For ExtensionArray, this is _values_for_factorize()[0]
mask : np.ndarray[bool]
mask = isna(values)
        For ExtensionArray, this is computed before calling _values_for_factorize
fill_value : Scalar
        The value used to fill NA entries.
For ExtensionArray, this is _values_for_factorize()[1]
qs : np.ndarray[float64]
interpolation : str
Type of interpolation
Returns
-------
np.ndarray
Notes
-----
Assumes values is already 2D. For ExtensionArray this means np.atleast_2d
has been called on _values_for_factorize()[0]
Quantile is computed along axis=1.
"""
assert values.shape == mask.shape
if values.ndim == 1:
# unsqueeze, operate, re-squeeze
values = np.atleast_2d(values)
mask = np.atleast_2d(mask)
res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
return res_values[0]
assert values.ndim == 2
is_empty = values.shape[1] == 0
if is_empty:
# create the array of na_values
# 2d len(values) * len(qs)
flat = np.array([fill_value] * len(qs))
result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
else:
result = _nanpercentile(
values,
qs * 100.0,
na_value=fill_value,
mask=mask,
interpolation=interpolation,
)
result = np.asarray(result)
result = result.T
return result
def _nanpercentile_1d(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
qs: npt.NDArray[np.float64],
na_value: Scalar,
interpolation: str,
) -> Scalar | np.ndarray:
"""
Wrapper for np.percentile that skips missing values, specialized to
1-dimensional case.
Parameters
----------
values : array over which to find quantiles
mask : ndarray[bool]
locations in values that should be considered missing
qs : np.ndarray[float64] of quantile indices to find
na_value : scalar
value to return for empty or all-null values
interpolation : str
Returns
-------
quantiles : scalar or array
"""
# mask is Union[ExtensionArray, ndarray]
values = values[~mask]
if len(values) == 0:
# Can't pass dtype=values.dtype here bc we might have na_value=np.nan
# with values.dtype=int64 see test_quantile_empty
# equiv: 'np.array([na_value] * len(qs))' but much faster
return np.full(len(qs), na_value)
return np.percentile(
values,
qs,
# error: No overload variant of "percentile" matches argument
# types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]"
# , "Dict[str, str]" [call-overload]
method=interpolation, # type: ignore[call-overload]
)
def _nanpercentile(
values: np.ndarray,
qs: npt.NDArray[np.float64],
*,
na_value,
mask: npt.NDArray[np.bool_],
interpolation: str,
):
"""
Wrapper for np.percentile that skips missing values.
Parameters
----------
values : np.ndarray[ndim=2] over which to find quantiles
qs : np.ndarray[float64] of quantile indices to find
na_value : scalar
value to return for empty or all-null values
mask : np.ndarray[bool]
locations in values that should be considered missing
interpolation : str
Returns
-------
quantiles : scalar or array
"""
if values.dtype.kind in "mM":
# need to cast to integer to avoid rounding errors in numpy
result = _nanpercentile(
values.view("i8"),
qs=qs,
na_value=na_value.view("i8"),
mask=mask,
interpolation=interpolation,
)
# Note: we have to do `astype` and not view because in general we
# have float result at this point, not i8
return result.astype(values.dtype)
if mask.any():
        # Caller is responsible for ensuring mask and values shapes match
assert mask.shape == values.shape
result = [
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
for (val, m) in zip(list(values), list(mask))
]
if values.dtype.kind == "f":
# preserve itemsize
result = np.asarray(result, dtype=values.dtype).T
else:
result = np.asarray(result).T
if (
result.dtype != values.dtype
and not mask.all()
and (result == result.astype(values.dtype, copy=False)).all()
):
            # not mask.all(): an all-NA result is never cast back to int;
            # e.g. when values is integer dtype and result is floating dtype,
            # only cast back to integer dtype if the result values are all-integer.
result = result.astype(values.dtype, copy=False)
return result
else:
return np.percentile(
values,
qs,
axis=1,
# error: No overload variant of "percentile" matches argument types
# "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]",
# "int", "Dict[str, str]" [call-overload]
method=interpolation, # type: ignore[call-overload]
)

View File

@@ -0,0 +1,152 @@
"""
Methods used by Block.replace and related methods.
"""
from __future__ import annotations
import operator
import re
from re import Pattern
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas.core.dtypes.common import (
is_bool,
is_re,
is_re_compilable,
)
from pandas.core.dtypes.missing import isna
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
Scalar,
npt,
)
def should_use_regex(regex: bool, to_replace: Any) -> bool:
"""
Decide whether to treat `to_replace` as a regular expression.
"""
if is_re(to_replace):
regex = True
regex = regex and is_re_compilable(to_replace)
# Don't use regex if the pattern is empty.
regex = regex and re.compile(to_replace).pattern != ""
return regex
def compare_or_regex_search(
a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_]
) -> ArrayLike:
"""
Compare two array-like inputs of the same shape or two scalar values
Calls operator.eq or re.search, depending on regex argument. If regex is
True, perform an element-wise regex matching.
Parameters
----------
a : array-like
b : scalar or regex pattern
regex : bool
mask : np.ndarray[bool]
Returns
-------
mask : array-like of bool
"""
if isna(b):
return ~mask
def _check_comparison_types(
result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern
):
"""
Raises an error if the two arrays (a,b) cannot be compared.
Otherwise, returns the comparison result as expected.
"""
if is_bool(result) and isinstance(a, np.ndarray):
type_names = [type(a).__name__, type(b).__name__]
type_names[0] = f"ndarray(dtype={a.dtype})"
raise TypeError(
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
)
if not regex or not should_use_regex(regex, b):
# TODO: should use missing.mask_missing?
op = lambda x: operator.eq(x, b)
else:
op = np.vectorize(
lambda x: bool(re.search(b, x))
if isinstance(x, str) and isinstance(b, (str, Pattern))
else False
)
# GH#32621 use mask to avoid comparing to NAs
if isinstance(a, np.ndarray):
a = a[mask]
result = op(a)
if isinstance(result, np.ndarray) and mask is not None:
        # The shape of the mask can differ from that of the result
        # since we may compare only a subset of a's or b's elements
tmp = np.zeros(mask.shape, dtype=np.bool_)
np.place(tmp, mask, result)
result = tmp
_check_comparison_types(result, a, b)
return result
def replace_regex(
values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None
) -> None:
"""
Parameters
----------
values : ArrayLike
Object dtype.
rx : re.Pattern
value : Any
mask : np.ndarray[bool], optional
Notes
-----
Alters values in-place.
"""
# deal with replacing values with objects (strings) that match but
# whose replacement is not a string (numeric, nan, object)
if isna(value) or not isinstance(value, str):
def re_replacer(s):
if is_re(rx) and isinstance(s, str):
return value if rx.search(s) is not None else s
else:
return s
else:
        # value is guaranteed to be a string here; s can be either a string
        # or null, and if it's null it gets returned unchanged
def re_replacer(s):
if is_re(rx) and isinstance(s, str):
return rx.sub(value, s)
else:
return s
f = np.vectorize(re_replacer, otypes=[np.object_])
if mask is None:
values[:] = f(values)
else:
values[mask] = f(values[mask])
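# A minimal usage sketch (hypothetical example) for replace_regex: it mutates
# an object-dtype array in place, leaving non-strings and non-matches alone.
if __name__ == "__main__":
    arr = np.array(["foo-1", "bar-2", 3, None], dtype=object)
    rx = re.compile(r"-\d+$")

    replace_regex(arr, rx, "", mask=None)
    # arr -> ["foo", "bar", 3, None]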

View File

@@ -0,0 +1,594 @@
from __future__ import annotations
import functools
from typing import (
TYPE_CHECKING,
cast,
overload,
)
import numpy as np
from pandas._libs import (
algos as libalgos,
lib,
)
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
)
from pandas.core.dtypes.missing import na_value_for_dtype
from pandas.core.construction import ensure_wrapped_if_datetimelike
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
AxisInt,
npt,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.base import ExtensionArray
@overload
def take_nd(
arr: np.ndarray,
indexer,
axis: AxisInt = ...,
fill_value=...,
allow_fill: bool = ...,
) -> np.ndarray:
...
@overload
def take_nd(
arr: ExtensionArray,
indexer,
axis: AxisInt = ...,
fill_value=...,
allow_fill: bool = ...,
) -> ArrayLike:
...
def take_nd(
arr: ArrayLike,
indexer,
axis: AxisInt = 0,
fill_value=lib.no_default,
allow_fill: bool = True,
) -> ArrayLike:
"""
Specialized Cython take which sets NaN values in one pass
This dispatches to ``take`` defined on ExtensionArrays.
Note: this function assumes that the indexer is a valid(ated) indexer with
no out of bound indices.
Parameters
----------
arr : np.ndarray or ExtensionArray
Input array.
indexer : ndarray
        1-D array of indices to take; subarrays corresponding to -1 value
        indices are filled with fill_value
axis : int, default 0
Axis to take from
fill_value : any, default np.nan
Fill value to replace -1 values with
allow_fill : bool, default True
If False, indexer is assumed to contain no -1 values so no filling
will be done. This short-circuits computation of a mask. Result is
undefined if allow_fill == False and -1 is present in indexer.
Returns
-------
subarray : np.ndarray or ExtensionArray
May be the same type as the input, or cast to an ndarray.
"""
if fill_value is lib.no_default:
fill_value = na_value_for_dtype(arr.dtype, compat=False)
elif lib.is_np_dtype(arr.dtype, "mM"):
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
if arr.dtype != dtype:
# EA.take is strict about returning a new object of the same type
# so for that case cast upfront
arr = arr.astype(dtype)
if not isinstance(arr, np.ndarray):
        # i.e. ExtensionArray;
        # this branch also catches DatetimeArray and TimedeltaArray
if not is_1d_only_ea_dtype(arr.dtype):
# i.e. DatetimeArray, TimedeltaArray
arr = cast("NDArrayBackedExtensionArray", arr)
return arr.take(
indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
)
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
arr = np.asarray(arr)
return _take_nd_ndarray(arr, indexer, axis, fill_value, allow_fill)
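# A minimal usage sketch (hypothetical example) for take_nd: with the default
# allow_fill=True, -1 entries in the indexer become fill_value, and an integer
# input with a NaN fill is promoted to float64 via maybe_promote.
if __name__ == "__main__":
    _arr = np.array([10, 20, 30], dtype="int64")
    _out = take_nd(_arr, np.array([0, 2, -1], dtype=np.intp))
    # _out -> array([10., 30., nan]), dtype float64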
def _take_nd_ndarray(
arr: np.ndarray,
indexer: npt.NDArray[np.intp] | None,
axis: AxisInt,
fill_value,
allow_fill: bool,
) -> np.ndarray:
if indexer is None:
indexer = np.arange(arr.shape[axis], dtype=np.intp)
dtype, fill_value = arr.dtype, arr.dtype.type()
else:
indexer = ensure_platform_int(indexer)
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
arr, indexer, fill_value, allow_fill
)
flip_order = False
if arr.ndim == 2 and arr.flags.f_contiguous:
flip_order = True
if flip_order:
arr = arr.T
axis = arr.ndim - axis - 1
# at this point, it's guaranteed that dtype can hold both the arr values
# and the fill_value
out_shape_ = list(arr.shape)
out_shape_[axis] = len(indexer)
out_shape = tuple(out_shape_)
if arr.flags.f_contiguous and axis == arr.ndim - 1:
# minor tweak that can make an order-of-magnitude difference
# for dataframes initialized directly from 2-d ndarrays
# (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its
# f-contiguous transpose)
out = np.empty(out_shape, dtype=dtype, order="F")
else:
out = np.empty(out_shape, dtype=dtype)
func = _get_take_nd_function(
arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
)
func(arr, indexer, out, fill_value)
if flip_order:
out = out.T
return out
def take_1d(
arr: ArrayLike,
indexer: npt.NDArray[np.intp],
fill_value=None,
allow_fill: bool = True,
mask: npt.NDArray[np.bool_] | None = None,
) -> ArrayLike:
"""
Specialized version for 1D arrays. Differences compared to `take_nd`:
- Assumes input array has already been converted to numpy array / EA
- Assumes indexer is already guaranteed to be intp dtype ndarray
- Only works for 1D arrays
To ensure the lowest possible overhead.
Note: similarly to `take_nd`, this function assumes that the indexer is
a valid(ated) indexer with no out of bound indices.
Parameters
----------
arr : np.ndarray or ExtensionArray
Input array.
indexer : ndarray
1-D array of indices to take (validated indices, intp dtype).
fill_value : any, default np.nan
Fill value to replace -1 values with
allow_fill : bool, default True
If False, indexer is assumed to contain no -1 values so no filling
will be done. This short-circuits computation of a mask. Result is
undefined if allow_fill == False and -1 is present in indexer.
mask : np.ndarray, optional, default None
If `allow_fill` is True, and the mask (where indexer == -1) is already
known, it can be passed to avoid recomputation.
"""
if not isinstance(arr, np.ndarray):
# ExtensionArray -> dispatch to their method
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
if not allow_fill:
return arr.take(indexer)
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
arr, indexer, fill_value, True, mask
)
# at this point, it's guaranteed that dtype can hold both the arr values
# and the fill_value
out = np.empty(indexer.shape, dtype=dtype)
func = _get_take_nd_function(
arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info
)
func(arr, indexer, out, fill_value)
return out
def take_2d_multi(
arr: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
fill_value=np.nan,
) -> np.ndarray:
"""
Specialized Cython take which sets NaN values in one pass.
"""
# This is only called from one place in DataFrame._reindex_multi,
# so we know indexer is well-behaved.
assert indexer is not None
assert indexer[0] is not None
assert indexer[1] is not None
row_idx, col_idx = indexer
row_idx = ensure_platform_int(row_idx)
col_idx = ensure_platform_int(col_idx)
indexer = row_idx, col_idx
mask_info = None
# check for promotion based on types only (do this first because
# it's faster than computing a mask)
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
if dtype != arr.dtype:
# check if promotion is actually required based on indexer
row_mask = row_idx == -1
col_mask = col_idx == -1
row_needs = row_mask.any()
col_needs = col_mask.any()
mask_info = (row_mask, col_mask), (row_needs, col_needs)
if not (row_needs or col_needs):
# if not, then depromote, set fill_value to dummy
# (it won't be used but we don't want the cython code
# to crash when trying to cast it to dtype)
dtype, fill_value = arr.dtype, arr.dtype.type()
# at this point, it's guaranteed that dtype can hold both the arr values
# and the fill_value
out_shape = len(row_idx), len(col_idx)
out = np.empty(out_shape, dtype=dtype)
func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
if func is None and arr.dtype != out.dtype:
func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
if func is not None:
func = _convert_wrapper(func, out.dtype)
if func is not None:
func(arr, indexer, out=out, fill_value=fill_value)
else:
# test_reindex_multi
_take_2d_multi_object(
arr, indexer, out, fill_value=fill_value, mask_info=mask_info
)
return out
@functools.lru_cache
def _get_take_nd_function_cached(
ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: AxisInt
):
"""
Part of _get_take_nd_function below that doesn't need `mask_info` and thus
can be cached (mask_info potentially contains a numpy ndarray which is not
hashable and thus cannot be used as argument for cached function).
"""
tup = (arr_dtype.name, out_dtype.name)
if ndim == 1:
func = _take_1d_dict.get(tup, None)
elif ndim == 2:
if axis == 0:
func = _take_2d_axis0_dict.get(tup, None)
else:
func = _take_2d_axis1_dict.get(tup, None)
if func is not None:
return func
# We get here with string, uint, float16, and complex dtypes that could
# potentially be handled in algos_take_helper.
# Also a couple with (M8[ns], object) and (m8[ns], object)
tup = (out_dtype.name, out_dtype.name)
if ndim == 1:
func = _take_1d_dict.get(tup, None)
elif ndim == 2:
if axis == 0:
func = _take_2d_axis0_dict.get(tup, None)
else:
func = _take_2d_axis1_dict.get(tup, None)
if func is not None:
func = _convert_wrapper(func, out_dtype)
return func
return None
def _get_take_nd_function(
ndim: int,
arr_dtype: np.dtype,
out_dtype: np.dtype,
axis: AxisInt = 0,
mask_info=None,
):
"""
Get the appropriate "take" implementation for the given dimension, axis
and dtypes.
"""
func = None
if ndim <= 2:
# for this part we don't need `mask_info` -> use the cached algo lookup
func = _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis)
if func is None:
def func(arr, indexer, out, fill_value=np.nan) -> None:
indexer = ensure_platform_int(indexer)
_take_nd_object(
arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
)
return func
def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
def wrapper(
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
) -> None:
if arr_dtype is not None:
arr = arr.view(arr_dtype)
if out_dtype is not None:
out = out.view(out_dtype)
if fill_wrap is not None:
# FIXME: if we get here with dt64/td64 we need to be sure we have
# matching resos
if fill_value.dtype.kind == "m":
fill_value = fill_value.astype("m8[ns]")
else:
fill_value = fill_value.astype("M8[ns]")
fill_value = fill_wrap(fill_value)
f(arr, indexer, out, fill_value=fill_value)
return wrapper
def _convert_wrapper(f, conv_dtype):
def wrapper(
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
) -> None:
if conv_dtype == object:
# GH#39755 avoid casting dt64/td64 to integers
arr = ensure_wrapped_if_datetimelike(arr)
arr = arr.astype(conv_dtype)
f(arr, indexer, out, fill_value=fill_value)
return wrapper
_take_1d_dict = {
("int8", "int8"): libalgos.take_1d_int8_int8,
("int8", "int32"): libalgos.take_1d_int8_int32,
("int8", "int64"): libalgos.take_1d_int8_int64,
("int8", "float64"): libalgos.take_1d_int8_float64,
("int16", "int16"): libalgos.take_1d_int16_int16,
("int16", "int32"): libalgos.take_1d_int16_int32,
("int16", "int64"): libalgos.take_1d_int16_int64,
("int16", "float64"): libalgos.take_1d_int16_float64,
("int32", "int32"): libalgos.take_1d_int32_int32,
("int32", "int64"): libalgos.take_1d_int32_int64,
("int32", "float64"): libalgos.take_1d_int32_float64,
("int64", "int64"): libalgos.take_1d_int64_int64,
("int64", "float64"): libalgos.take_1d_int64_float64,
("float32", "float32"): libalgos.take_1d_float32_float32,
("float32", "float64"): libalgos.take_1d_float32_float64,
("float64", "float64"): libalgos.take_1d_float64_float64,
("object", "object"): libalgos.take_1d_object_object,
("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8),
("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
),
}
_take_2d_axis0_dict = {
("int8", "int8"): libalgos.take_2d_axis0_int8_int8,
("int8", "int32"): libalgos.take_2d_axis0_int8_int32,
("int8", "int64"): libalgos.take_2d_axis0_int8_int64,
("int8", "float64"): libalgos.take_2d_axis0_int8_float64,
("int16", "int16"): libalgos.take_2d_axis0_int16_int16,
("int16", "int32"): libalgos.take_2d_axis0_int16_int32,
("int16", "int64"): libalgos.take_2d_axis0_int16_int64,
("int16", "float64"): libalgos.take_2d_axis0_int16_float64,
("int32", "int32"): libalgos.take_2d_axis0_int32_int32,
("int32", "int64"): libalgos.take_2d_axis0_int32_int64,
("int32", "float64"): libalgos.take_2d_axis0_int32_float64,
("int64", "int64"): libalgos.take_2d_axis0_int64_int64,
("int64", "float64"): libalgos.take_2d_axis0_int64_float64,
("float32", "float32"): libalgos.take_2d_axis0_float32_float32,
("float32", "float64"): libalgos.take_2d_axis0_float32_float64,
("float64", "float64"): libalgos.take_2d_axis0_float64_float64,
("object", "object"): libalgos.take_2d_axis0_object_object,
("bool", "bool"): _view_wrapper(
libalgos.take_2d_axis0_bool_bool, np.uint8, np.uint8
),
("bool", "object"): _view_wrapper(
libalgos.take_2d_axis0_bool_object, np.uint8, None
),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
}
_take_2d_axis1_dict = {
("int8", "int8"): libalgos.take_2d_axis1_int8_int8,
("int8", "int32"): libalgos.take_2d_axis1_int8_int32,
("int8", "int64"): libalgos.take_2d_axis1_int8_int64,
("int8", "float64"): libalgos.take_2d_axis1_int8_float64,
("int16", "int16"): libalgos.take_2d_axis1_int16_int16,
("int16", "int32"): libalgos.take_2d_axis1_int16_int32,
("int16", "int64"): libalgos.take_2d_axis1_int16_int64,
("int16", "float64"): libalgos.take_2d_axis1_int16_float64,
("int32", "int32"): libalgos.take_2d_axis1_int32_int32,
("int32", "int64"): libalgos.take_2d_axis1_int32_int64,
("int32", "float64"): libalgos.take_2d_axis1_int32_float64,
("int64", "int64"): libalgos.take_2d_axis1_int64_int64,
("int64", "float64"): libalgos.take_2d_axis1_int64_float64,
("float32", "float32"): libalgos.take_2d_axis1_float32_float32,
("float32", "float64"): libalgos.take_2d_axis1_float32_float64,
("float64", "float64"): libalgos.take_2d_axis1_float64_float64,
("object", "object"): libalgos.take_2d_axis1_object_object,
("bool", "bool"): _view_wrapper(
libalgos.take_2d_axis1_bool_bool, np.uint8, np.uint8
),
("bool", "object"): _view_wrapper(
libalgos.take_2d_axis1_bool_object, np.uint8, None
),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
}
_take_2d_multi_dict = {
("int8", "int8"): libalgos.take_2d_multi_int8_int8,
("int8", "int32"): libalgos.take_2d_multi_int8_int32,
("int8", "int64"): libalgos.take_2d_multi_int8_int64,
("int8", "float64"): libalgos.take_2d_multi_int8_float64,
("int16", "int16"): libalgos.take_2d_multi_int16_int16,
("int16", "int32"): libalgos.take_2d_multi_int16_int32,
("int16", "int64"): libalgos.take_2d_multi_int16_int64,
("int16", "float64"): libalgos.take_2d_multi_int16_float64,
("int32", "int32"): libalgos.take_2d_multi_int32_int32,
("int32", "int64"): libalgos.take_2d_multi_int32_int64,
("int32", "float64"): libalgos.take_2d_multi_int32_float64,
("int64", "int64"): libalgos.take_2d_multi_int64_int64,
("int64", "float64"): libalgos.take_2d_multi_int64_float64,
("float32", "float32"): libalgos.take_2d_multi_float32_float32,
("float32", "float64"): libalgos.take_2d_multi_float32_float64,
("float64", "float64"): libalgos.take_2d_multi_float64_float64,
("object", "object"): libalgos.take_2d_multi_object_object,
("bool", "bool"): _view_wrapper(
libalgos.take_2d_multi_bool_bool, np.uint8, np.uint8
),
("bool", "object"): _view_wrapper(
libalgos.take_2d_multi_bool_object, np.uint8, None
),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
}
def _take_nd_object(
arr: np.ndarray,
indexer: npt.NDArray[np.intp],
out: np.ndarray,
axis: AxisInt,
fill_value,
mask_info,
) -> None:
if mask_info is not None:
mask, needs_masking = mask_info
else:
mask = indexer == -1
needs_masking = mask.any()
if arr.dtype != out.dtype:
arr = arr.astype(out.dtype)
if arr.shape[axis] > 0:
arr.take(indexer, axis=axis, out=out)
if needs_masking:
outindexer = [slice(None)] * arr.ndim
outindexer[axis] = mask
out[tuple(outindexer)] = fill_value
def _take_2d_multi_object(
arr: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value,
mask_info,
) -> None:
# this is not ideal, performance-wise, but it's better than raising
# an exception (best to optimize in Cython to avoid getting here)
row_idx, col_idx = indexer # both np.intp
if mask_info is not None:
(row_mask, col_mask), (row_needs, col_needs) = mask_info
else:
row_mask = row_idx == -1
col_mask = col_idx == -1
row_needs = row_mask.any()
col_needs = col_mask.any()
if fill_value is not None:
if row_needs:
out[row_mask, :] = fill_value
if col_needs:
out[:, col_mask] = fill_value
for i, u_ in enumerate(row_idx):
if u_ != -1:
for j, v in enumerate(col_idx):
if v != -1:
out[i, j] = arr[u_, v]
def _take_preprocess_indexer_and_fill_value(
arr: np.ndarray,
indexer: npt.NDArray[np.intp],
fill_value,
allow_fill: bool,
mask: npt.NDArray[np.bool_] | None = None,
):
mask_info: tuple[np.ndarray | None, bool] | None = None
if not allow_fill:
dtype, fill_value = arr.dtype, arr.dtype.type()
mask_info = None, False
else:
# check for promotion based on types only (do this first because
# it's faster than computing a mask)
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
if dtype != arr.dtype:
# check if promotion is actually required based on indexer
if mask is not None:
needs_masking = True
else:
mask = indexer == -1
needs_masking = bool(mask.any())
mask_info = mask, needs_masking
if not needs_masking:
# if not, then depromote, set fill_value to dummy
# (it won't be used but we don't want the cython code
# to crash when trying to cast it to dtype)
dtype, fill_value = arr.dtype, arr.dtype.type()
return dtype, fill_value, mask_info
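# A minimal usage sketch (hypothetical example) for take_2d_multi: rows and
# columns are reindexed in one pass, and -1 entries in either indexer are
# filled (here with the default np.nan, promoting int64 to float64).
if __name__ == "__main__":
    _arr2d = np.arange(6, dtype="int64").reshape(2, 3)
    _rows = np.array([1, -1], dtype=np.intp)
    _cols = np.array([0, 2], dtype=np.intp)
    _out2d = take_2d_multi(_arr2d, (_rows, _cols))
    # _out2d -> [[ 3.,  5.],
    #            [nan, nan]]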

View File

@@ -0,0 +1,50 @@
"""
transforms.py is for shape-preserving functions.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from pandas._typing import (
AxisInt,
Scalar,
)
def shift(
values: np.ndarray, periods: int, axis: AxisInt, fill_value: Scalar
) -> np.ndarray:
new_values = values
if periods == 0 or values.size == 0:
return new_values.copy()
# make sure array sent to np.roll is c_contiguous
f_ordered = values.flags.f_contiguous
if f_ordered:
new_values = new_values.T
axis = new_values.ndim - axis - 1
if new_values.size:
new_values = np.roll(
new_values,
np.intp(periods),
axis=axis,
)
axis_indexer = [slice(None)] * values.ndim
if periods > 0:
axis_indexer[axis] = slice(None, periods)
else:
axis_indexer[axis] = slice(periods, None)
new_values[tuple(axis_indexer)] = fill_value
# restore original order
if f_ordered:
new_values = new_values.T
return new_values
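# A minimal usage sketch (hypothetical example) for shift: values are rolled
# along the given axis and the vacated positions are filled with fill_value,
# mirroring Series.shift semantics.
if __name__ == "__main__":
    _vals = np.array([1.0, 2.0, 3.0, 4.0])
    print(shift(_vals, periods=1, axis=0, fill_value=np.nan))  # [nan 1. 2. 3.]
    print(shift(_vals, periods=-2, axis=0, fill_value=0.0))    # [3. 4. 0. 0.]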

View File

@@ -0,0 +1,530 @@
"""
Methods that can be shared by many array-like classes or subclasses:
Series
Index
ExtensionArray
"""
from __future__ import annotations
import operator
from typing import Any
import numpy as np
from pandas._libs import lib
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
from pandas.core.dtypes.generic import ABCNDFrame
from pandas.core import roperator
from pandas.core.construction import extract_array
from pandas.core.ops.common import unpack_zerodim_and_defer
REDUCTION_ALIASES = {
"maximum": "max",
"minimum": "min",
"add": "sum",
"multiply": "prod",
}
class OpsMixin:
# -------------------------------------------------------------
# Comparisons
def _cmp_method(self, other, op):
return NotImplemented
@unpack_zerodim_and_defer("__eq__")
def __eq__(self, other):
return self._cmp_method(other, operator.eq)
@unpack_zerodim_and_defer("__ne__")
def __ne__(self, other):
return self._cmp_method(other, operator.ne)
@unpack_zerodim_and_defer("__lt__")
def __lt__(self, other):
return self._cmp_method(other, operator.lt)
@unpack_zerodim_and_defer("__le__")
def __le__(self, other):
return self._cmp_method(other, operator.le)
@unpack_zerodim_and_defer("__gt__")
def __gt__(self, other):
return self._cmp_method(other, operator.gt)
@unpack_zerodim_and_defer("__ge__")
def __ge__(self, other):
return self._cmp_method(other, operator.ge)
# -------------------------------------------------------------
# Logical Methods
def _logical_method(self, other, op):
return NotImplemented
@unpack_zerodim_and_defer("__and__")
def __and__(self, other):
return self._logical_method(other, operator.and_)
@unpack_zerodim_and_defer("__rand__")
def __rand__(self, other):
return self._logical_method(other, roperator.rand_)
@unpack_zerodim_and_defer("__or__")
def __or__(self, other):
return self._logical_method(other, operator.or_)
@unpack_zerodim_and_defer("__ror__")
def __ror__(self, other):
return self._logical_method(other, roperator.ror_)
@unpack_zerodim_and_defer("__xor__")
def __xor__(self, other):
return self._logical_method(other, operator.xor)
@unpack_zerodim_and_defer("__rxor__")
def __rxor__(self, other):
return self._logical_method(other, roperator.rxor)
# -------------------------------------------------------------
# Arithmetic Methods
def _arith_method(self, other, op):
return NotImplemented
@unpack_zerodim_and_defer("__add__")
def __add__(self, other):
"""
Get Addition of DataFrame and other, column-wise.
Equivalent to ``DataFrame.add(other)``.
Parameters
----------
other : scalar, sequence, Series, dict or DataFrame
Object to be added to the DataFrame.
Returns
-------
DataFrame
The result of adding ``other`` to DataFrame.
See Also
--------
DataFrame.add : Add a DataFrame and another object, with option for index-
or column-oriented addition.
Examples
--------
>>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]},
... index=['elk', 'moose'])
>>> df
height weight
elk 1.5 500
moose 2.6 800
Adding a scalar affects all rows and columns.
>>> df[['height', 'weight']] + 1.5
height weight
elk 3.0 501.5
moose 4.1 801.5
Each element of a list is added to a column of the DataFrame, in order.
>>> df[['height', 'weight']] + [0.5, 1.5]
height weight
elk 2.0 501.5
moose 3.1 801.5
Keys of a dictionary are aligned to the DataFrame, based on column names;
each value in the dictionary is added to the corresponding column.
>>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5}
height weight
elk 2.0 501.5
moose 3.1 801.5
When `other` is a :class:`Series`, the index of `other` is aligned with the
columns of the DataFrame.
>>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height'])
>>> df[['height', 'weight']] + s1
height weight
elk 3.0 500.5
moose 4.1 800.5
Even when the index of `other` is the same as the index of the DataFrame,
the :class:`Series` will not be reoriented. If index-wise alignment is desired,
:meth:`DataFrame.add` should be used with `axis='index'`.
>>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose'])
>>> df[['height', 'weight']] + s2
elk height moose weight
elk NaN NaN NaN NaN
moose NaN NaN NaN NaN
>>> df[['height', 'weight']].add(s2, axis='index')
height weight
elk 2.0 500.5
moose 4.1 801.5
When `other` is a :class:`DataFrame`, both columns names and the
index are aligned.
>>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]},
... index=['elk', 'moose', 'deer'])
>>> df[['height', 'weight']] + other
height weight
deer NaN NaN
elk 1.7 NaN
moose 3.0 NaN
"""
return self._arith_method(other, operator.add)
@unpack_zerodim_and_defer("__radd__")
def __radd__(self, other):
return self._arith_method(other, roperator.radd)
@unpack_zerodim_and_defer("__sub__")
def __sub__(self, other):
return self._arith_method(other, operator.sub)
@unpack_zerodim_and_defer("__rsub__")
def __rsub__(self, other):
return self._arith_method(other, roperator.rsub)
@unpack_zerodim_and_defer("__mul__")
def __mul__(self, other):
return self._arith_method(other, operator.mul)
@unpack_zerodim_and_defer("__rmul__")
def __rmul__(self, other):
return self._arith_method(other, roperator.rmul)
@unpack_zerodim_and_defer("__truediv__")
def __truediv__(self, other):
return self._arith_method(other, operator.truediv)
@unpack_zerodim_and_defer("__rtruediv__")
def __rtruediv__(self, other):
return self._arith_method(other, roperator.rtruediv)
@unpack_zerodim_and_defer("__floordiv__")
def __floordiv__(self, other):
return self._arith_method(other, operator.floordiv)
@unpack_zerodim_and_defer("__rfloordiv")
def __rfloordiv__(self, other):
return self._arith_method(other, roperator.rfloordiv)
@unpack_zerodim_and_defer("__mod__")
def __mod__(self, other):
return self._arith_method(other, operator.mod)
@unpack_zerodim_and_defer("__rmod__")
def __rmod__(self, other):
return self._arith_method(other, roperator.rmod)
@unpack_zerodim_and_defer("__divmod__")
def __divmod__(self, other):
return self._arith_method(other, divmod)
@unpack_zerodim_and_defer("__rdivmod__")
def __rdivmod__(self, other):
return self._arith_method(other, roperator.rdivmod)
@unpack_zerodim_and_defer("__pow__")
def __pow__(self, other):
return self._arith_method(other, operator.pow)
@unpack_zerodim_and_defer("__rpow__")
def __rpow__(self, other):
return self._arith_method(other, roperator.rpow)
# -----------------------------------------------------------------------------
# Helpers to implement __array_ufunc__
def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any):
"""
Compatibility with numpy ufuncs.
See also
--------
numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__
"""
from pandas.core.frame import (
DataFrame,
Series,
)
from pandas.core.generic import NDFrame
from pandas.core.internals import (
ArrayManager,
BlockManager,
)
cls = type(self)
kwargs = _standardize_out_kwarg(**kwargs)
# for binary ops, use our custom dunder methods
result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs)
if result is not NotImplemented:
return result
# Determine if we should defer.
no_defer = (
np.ndarray.__array_ufunc__,
cls.__array_ufunc__,
)
for item in inputs:
higher_priority = (
hasattr(item, "__array_priority__")
and item.__array_priority__ > self.__array_priority__
)
has_array_ufunc = (
hasattr(item, "__array_ufunc__")
and type(item).__array_ufunc__ not in no_defer
and not isinstance(item, self._HANDLED_TYPES)
)
if higher_priority or has_array_ufunc:
return NotImplemented
# align all the inputs.
types = tuple(type(x) for x in inputs)
alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]
if len(alignable) > 1:
# This triggers alignment.
# At the moment, there aren't any ufuncs with more than two inputs
# so this ends up just being x1.index | x2.index, but we write
# it to handle *args.
set_types = set(types)
if len(set_types) > 1 and {DataFrame, Series}.issubset(set_types):
# We currently don't handle ufunc(DataFrame, Series)
# well. Previously this raised an internal ValueError. We might
# support it someday, so raise a NotImplementedError.
raise NotImplementedError(
f"Cannot apply ufunc {ufunc} to mixed DataFrame and Series inputs."
)
axes = self.axes
for obj in alignable[1:]:
# this relies on the fact that we aren't handling mixed
# series / frame ufuncs.
for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
axes[i] = ax1.union(ax2)
reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
inputs = tuple(
x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
for x, t in zip(inputs, types)
)
else:
reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
if self.ndim == 1:
names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
name = names[0] if len(set(names)) == 1 else None
reconstruct_kwargs = {"name": name}
else:
reconstruct_kwargs = {}
def reconstruct(result):
if ufunc.nout > 1:
# np.modf, np.frexp, np.divmod
return tuple(_reconstruct(x) for x in result)
return _reconstruct(result)
def _reconstruct(result):
if lib.is_scalar(result):
return result
if result.ndim != self.ndim:
if method == "outer":
raise NotImplementedError
return result
if isinstance(result, (BlockManager, ArrayManager)):
# we went through BlockManager.apply e.g. np.sqrt
result = self._constructor_from_mgr(result, axes=result.axes)
else:
# we converted an array, lost our axes
result = self._constructor(
result, **reconstruct_axes, **reconstruct_kwargs, copy=False
)
# TODO: When we support multiple values in __finalize__, this
# should pass alignable to `__finalize__` instead of self.
# Then `np.add(a, b)` would consider attrs from both a and b
# when a and b are NDFrames.
if len(alignable) == 1:
result = result.__finalize__(self)
return result
if "out" in kwargs:
# e.g. test_multiindex_get_loc
result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs)
return reconstruct(result)
if method == "reduce":
# e.g. test.series.test_ufunc.test_reduce
result = dispatch_reduction_ufunc(self, ufunc, method, *inputs, **kwargs)
if result is not NotImplemented:
return result
# We still get here with kwargs `axis` for e.g. np.maximum.accumulate
# and `dtype` and `keepdims` for np.ptp
if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1):
# Just give up on preserving types in the complex case.
            # In theory we could preserve the types in these cases:
# * nout>1 is doable if BlockManager.apply took nout and
# returned a Tuple[BlockManager].
# * len(inputs) > 1 is doable when we know that we have
# aligned blocks / dtypes.
# e.g. my_ufunc, modf, logaddexp, heaviside, subtract, add
inputs = tuple(np.asarray(x) for x in inputs)
# Note: we can't use default_array_ufunc here bc reindexing means
# that `self` may not be among `inputs`
result = getattr(ufunc, method)(*inputs, **kwargs)
elif self.ndim == 1:
# ufunc(series, ...)
inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
result = getattr(ufunc, method)(*inputs, **kwargs)
else:
# ufunc(dataframe)
if method == "__call__" and not kwargs:
# for np.<ufunc>(..) calls
# kwargs cannot necessarily be handled block-by-block, so only
# take this path if there are no kwargs
mgr = inputs[0]._mgr
result = mgr.apply(getattr(ufunc, method))
else:
# otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..))
# Those can have an axis keyword and thus can't be called block-by-block
result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)
# e.g. np.negative (only one reached), with "where" and "out" in kwargs
result = reconstruct(result)
return result
def _standardize_out_kwarg(**kwargs) -> dict:
"""
If kwargs contain "out1" and "out2", replace that with a tuple "out"
np.divmod, np.modf, np.frexp can have either `out=(out1, out2)` or
`out1=out1, out2=out2)`
"""
if "out" not in kwargs and "out1" in kwargs and "out2" in kwargs:
out1 = kwargs.pop("out1")
out2 = kwargs.pop("out2")
out = (out1, out2)
kwargs["out"] = out
return kwargs
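# A minimal illustration (hypothetical example) of the normalization above:
# both spellings numpy accepts for multi-output ufuncs collapse to a single
# "out" tuple, so dispatch_ufunc_with_out below only handles one form.
if __name__ == "__main__":
    _a, _b = np.empty(3), np.empty(3)
    _kwargs = _standardize_out_kwarg(out1=_a, out2=_b)
    assert list(_kwargs) == ["out"]
    assert _kwargs["out"][0] is _a and _kwargs["out"][1] is _b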
def dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
If we have an `out` keyword, then call the ufunc without `out` and then
set the result into the given `out`.
"""
# Note: we assume _standardize_out_kwarg has already been called.
out = kwargs.pop("out")
where = kwargs.pop("where", None)
result = getattr(ufunc, method)(*inputs, **kwargs)
if result is NotImplemented:
return NotImplemented
if isinstance(result, tuple):
# i.e. np.divmod, np.modf, np.frexp
if not isinstance(out, tuple) or len(out) != len(result):
raise NotImplementedError
for arr, res in zip(out, result):
_assign_where(arr, res, where)
return out
if isinstance(out, tuple):
if len(out) == 1:
out = out[0]
else:
raise NotImplementedError
_assign_where(out, result, where)
return out
def _assign_where(out, result, where) -> None:
"""
Set a ufunc result into 'out', masking with a 'where' argument if necessary.
"""
if where is None:
# no 'where' arg passed to ufunc
out[:] = result
else:
np.putmask(out, where, result)
def default_array_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
Fallback to the behavior we would get if we did not define __array_ufunc__.
Notes
-----
We are assuming that `self` is among `inputs`.
"""
if not any(x is self for x in inputs):
raise NotImplementedError
new_inputs = [x if x is not self else np.asarray(x) for x in inputs]
return getattr(ufunc, method)(*new_inputs, **kwargs)
def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
Dispatch ufunc reductions to self's reduction methods.
"""
assert method == "reduce"
if len(inputs) != 1 or inputs[0] is not self:
return NotImplemented
if ufunc.__name__ not in REDUCTION_ALIASES:
return NotImplemented
method_name = REDUCTION_ALIASES[ufunc.__name__]
# NB: we are assuming that min/max represent minimum/maximum methods,
# which would not be accurate for e.g. Timestamp.min
if not hasattr(self, method_name):
return NotImplemented
if self.ndim > 1:
if isinstance(self, ABCNDFrame):
# TODO: test cases where this doesn't hold, i.e. 2D DTA/TDA
kwargs["numeric_only"] = False
if "axis" not in kwargs:
# For DataFrame reductions we don't want the default axis=0
# Note: np.min is not a ufunc, but uses array_function_dispatch,
# so calls DataFrame.min (without ever getting here) with the np.min
# default of axis=None, which DataFrame.min catches and changes to axis=0.
# np.minimum.reduce(df) gets here bc axis is not in kwargs,
            # so we set axis=0 to match the behavior of np.minimum.reduce(df.values)
kwargs["axis"] = 0
# By default, numpy's reductions do not skip NaNs, so we have to
# pass skipna=False
return getattr(self, method_name)(skipna=False, **kwargs)
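# A minimal usage sketch (hypothetical example): when numpy calls a reduction
# such as np.add.reduce on a Series, array_ufunc routes it through
# dispatch_reduction_ufunc and REDUCTION_ALIASES to Series.sum(skipna=False),
# so NaNs are not skipped, matching numpy rather than pandas defaults.
if __name__ == "__main__":
    import pandas as pd

    _s = pd.Series([1.0, np.nan, 3.0])
    print(np.add.reduce(_s))  # nan: numpy semantics, NaN is not skipped
    print(_s.sum())           # 4.0: pandas default skipna=True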

View File

@@ -0,0 +1,43 @@
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.base import (
ExtensionArray,
ExtensionOpsMixin,
ExtensionScalarOpsMixin,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.datetimes import DatetimeArray
from pandas.core.arrays.floating import FloatingArray
from pandas.core.arrays.integer import IntegerArray
from pandas.core.arrays.interval import IntervalArray
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.arrays.period import (
PeriodArray,
period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray
__all__ = [
"ArrowExtensionArray",
"ExtensionArray",
"ExtensionOpsMixin",
"ExtensionScalarOpsMixin",
"ArrowStringArray",
"BaseMaskedArray",
"BooleanArray",
"Categorical",
"DatetimeArray",
"FloatingArray",
"IntegerArray",
"IntervalArray",
"NumpyExtensionArray",
"PeriodArray",
"period_array",
"SparseArray",
"StringArray",
"TimedeltaArray",
]

View File

@@ -0,0 +1,84 @@
from __future__ import annotations
from typing import Literal
import numpy as np
from pandas.compat import pa_version_under10p1
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
class ArrowStringArrayMixin:
_pa_array = None
def __init__(self, *args, **kwargs) -> None:
raise NotImplementedError
def _str_pad(
self,
width: int,
side: Literal["left", "right", "both"] = "left",
fillchar: str = " ",
):
if side == "left":
pa_pad = pc.utf8_lpad
elif side == "right":
pa_pad = pc.utf8_rpad
elif side == "both":
pa_pad = pc.utf8_center
else:
raise ValueError(
f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
)
return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
def _str_get(self, i: int):
lengths = pc.utf8_length(self._pa_array)
if i >= 0:
out_of_bounds = pc.greater_equal(i, lengths)
start = i
stop = i + 1
step = 1
else:
out_of_bounds = pc.greater(-i, lengths)
start = i
stop = i - 1
step = -1
not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
selected = pc.utf8_slice_codeunits(
self._pa_array, start=start, stop=stop, step=step
)
null_value = pa.scalar(
None, type=self._pa_array.type # type: ignore[attr-defined]
)
result = pc.if_else(not_out_of_bounds, selected, null_value)
return type(self)(result)
def _str_slice_replace(
self, start: int | None = None, stop: int | None = None, repl: str | None = None
):
if repl is None:
repl = ""
if start is None:
start = 0
if stop is None:
stop = np.iinfo(np.int64).max
return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
def _str_capitalize(self):
return type(self)(pc.utf8_capitalize(self._pa_array))
def _str_title(self):
return type(self)(pc.utf8_title(self._pa_array))
def _str_swapcase(self):
return type(self)(pc.utf8_swapcase(self._pa_array))
def _str_removesuffix(self, suffix: str):
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
result = pc.if_else(ends_with, removed, self._pa_array)
return type(self)(result)
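# A minimal sketch (hypothetical example) of the removesuffix logic above in
# plain pyarrow terms: slice off len(suffix) code units, but only where the
# string actually ends with the suffix.
if __name__ == "__main__" and not pa_version_under10p1:
    _arr = pa.chunked_array([["reading", "heading", "dog"]])
    _suffix = "ing"
    _ends = pc.ends_with(_arr, pattern=_suffix)
    _trimmed = pc.utf8_slice_codeunits(_arr, 0, stop=-len(_suffix))
    print(pc.if_else(_ends, _trimmed, _arr))  # ["read", "head", "dog"]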

View File

@@ -0,0 +1,547 @@
from __future__ import annotations
from functools import wraps
from typing import (
TYPE_CHECKING,
Any,
Literal,
cast,
overload,
)
import numpy as np
from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.tslibs import is_supported_dtype
from pandas._typing import (
ArrayLike,
AxisInt,
Dtype,
F,
FillnaOptions,
PositionalIndexer2D,
PositionalIndexerTuple,
ScalarIndexer,
Self,
SequenceIndexer,
Shape,
TakeIndexer,
npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
validate_insert_loc,
)
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
PeriodDtype,
)
from pandas.core.dtypes.missing import array_equivalent
from pandas.core import missing
from pandas.core.algorithms import (
take,
unique,
value_counts_internal as value_counts,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays.base import ExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.sorting import nargminmax
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
)
from pandas import Series
def ravel_compat(meth: F) -> F:
"""
Decorator to ravel a 2D array before passing it to a cython operation,
then reshape the result to our own shape.
"""
@wraps(meth)
def method(self, *args, **kwargs):
if self.ndim == 1:
return meth(self, *args, **kwargs)
flags = self._ndarray.flags
flat = self.ravel("K")
result = meth(flat, *args, **kwargs)
order = "F" if flags.f_contiguous else "C"
return result.reshape(self.shape, order=order)
return cast(F, method)
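# A minimal illustration (hypothetical example) of what ravel_compat buys us:
# a kernel written for 1D data can be called on a 2D backing array, and the
# wrapper flattens, applies the kernel, and restores the original shape.
# _Toy is a stand-in for an NDArrayBacked-style array, used only here.
if __name__ == "__main__":

    class _Toy:
        def __init__(self, arr) -> None:
            self._ndarray = np.asarray(arr)

        ndim = property(lambda self: self._ndarray.ndim)
        shape = property(lambda self: self._ndarray.shape)

        def ravel(self, order="C"):
            return _Toy(self._ndarray.ravel(order))

        @ravel_compat
        def doubled(self):
            # the wrapped kernel only ever sees 1D data
            return self._ndarray * 2

    _toy = _Toy(np.arange(6).reshape(2, 3))
    print(_toy.doubled())  # [[ 0  2  4] [ 6  8 10]]; the 2D shape is restored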
class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
"""
ExtensionArray that is backed by a single NumPy ndarray.
"""
_ndarray: np.ndarray
# scalar used to denote NA value inside our self._ndarray, e.g. -1
# for Categorical, iNaT for Period. Outside of object dtype,
# self.isna() should be exactly locations in self._ndarray with
# _internal_fill_value.
_internal_fill_value: Any
def _box_func(self, x):
"""
Wrap numpy type in our dtype.type if necessary.
"""
return x
def _validate_scalar(self, value):
# used by NDArrayBackedExtensionIndex.insert
raise AbstractMethodError(self)
# ------------------------------------------------------------------------
def view(self, dtype: Dtype | None = None) -> ArrayLike:
# We handle datetime64, datetime64tz, timedelta64, and period
# dtypes here. Everything else we pass through to the underlying
# ndarray.
if dtype is None or dtype is self.dtype:
return self._from_backing_data(self._ndarray)
if isinstance(dtype, type):
# we sometimes pass non-dtype objects, e.g np.ndarray;
# pass those through to the underlying ndarray
return self._ndarray.view(dtype)
dtype = pandas_dtype(dtype)
arr = self._ndarray
if isinstance(dtype, PeriodDtype):
cls = dtype.construct_array_type()
return cls(arr.view("i8"), dtype=dtype)
elif isinstance(dtype, DatetimeTZDtype):
dt_cls = dtype.construct_array_type()
dt64_values = arr.view(f"M8[{dtype.unit}]")
return dt_cls._simple_new(dt64_values, dtype=dtype)
elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
from pandas.core.arrays import DatetimeArray
dt64_values = arr.view(dtype)
return DatetimeArray._simple_new(dt64_values, dtype=dtype)
elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
from pandas.core.arrays import TimedeltaArray
td64_values = arr.view(dtype)
return TimedeltaArray._simple_new(td64_values, dtype=dtype)
# error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
# type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
# type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
# Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
return arr.view(dtype=dtype) # type: ignore[arg-type]
def take(
self,
indices: TakeIndexer,
*,
allow_fill: bool = False,
fill_value: Any = None,
axis: AxisInt = 0,
) -> Self:
if allow_fill:
fill_value = self._validate_scalar(fill_value)
new_data = take(
self._ndarray,
indices,
allow_fill=allow_fill,
fill_value=fill_value,
axis=axis,
)
return self._from_backing_data(new_data)
# ------------------------------------------------------------------------
def equals(self, other) -> bool:
if type(self) is not type(other):
return False
if self.dtype != other.dtype:
return False
return bool(array_equivalent(self._ndarray, other._ndarray, dtype_equal=True))
@classmethod
def _from_factorized(cls, values, original):
assert values.dtype == original._ndarray.dtype
return original._from_backing_data(values)
def _values_for_argsort(self) -> np.ndarray:
return self._ndarray
def _values_for_factorize(self):
return self._ndarray, self._internal_fill_value
def _hash_pandas_object(
self, *, encoding: str, hash_key: str, categorize: bool
) -> npt.NDArray[np.uint64]:
from pandas.core.util.hashing import hash_array
values = self._ndarray
return hash_array(
values, encoding=encoding, hash_key=hash_key, categorize=categorize
)
# Signature of "argmin" incompatible with supertype "ExtensionArray"
def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self._hasna:
raise NotImplementedError
return nargminmax(self, "argmin", axis=axis)
# Signature of "argmax" incompatible with supertype "ExtensionArray"
def argmax(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self._hasna:
raise NotImplementedError
return nargminmax(self, "argmax", axis=axis)
def unique(self) -> Self:
new_data = unique(self._ndarray)
return self._from_backing_data(new_data)
@classmethod
@doc(ExtensionArray._concat_same_type)
def _concat_same_type(
cls,
to_concat: Sequence[Self],
axis: AxisInt = 0,
) -> Self:
if not lib.dtypes_all_equal([x.dtype for x in to_concat]):
dtypes = {str(x.dtype) for x in to_concat}
raise ValueError("to_concat must have the same dtype", dtypes)
return super()._concat_same_type(to_concat, axis=axis)
@doc(ExtensionArray.searchsorted)
def searchsorted(
self,
value: NumpyValueArrayLike | ExtensionArray,
side: Literal["left", "right"] = "left",
sorter: NumpySorter | None = None,
) -> npt.NDArray[np.intp] | np.intp:
npvalue = self._validate_setitem_value(value)
return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)
@doc(ExtensionArray.shift)
def shift(self, periods: int = 1, fill_value=None):
# NB: shift is always along axis=0
axis = 0
fill_value = self._validate_scalar(fill_value)
new_values = shift(self._ndarray, periods, axis, fill_value)
return self._from_backing_data(new_values)
def __setitem__(self, key, value) -> None:
key = check_array_indexer(self, key)
value = self._validate_setitem_value(value)
self._ndarray[key] = value
def _validate_setitem_value(self, value):
return value
@overload
def __getitem__(self, key: ScalarIndexer) -> Any:
...
@overload
def __getitem__(
self,
key: SequenceIndexer | PositionalIndexerTuple,
) -> Self:
...
def __getitem__(
self,
key: PositionalIndexer2D,
) -> Self | Any:
if lib.is_integer(key):
# fast-path
result = self._ndarray[key]
if self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# error: Incompatible types in assignment (expression has type "ExtensionArray",
# variable has type "Union[int, slice, ndarray]")
key = extract_array(key, extract_numpy=True) # type: ignore[assignment]
key = check_array_indexer(self, key)
result = self._ndarray[key]
if lib.is_scalar(result):
return self._box_func(result)
result = self._from_backing_data(result)
return result
def _fill_mask_inplace(
self, method: str, limit: int | None, mask: npt.NDArray[np.bool_]
) -> None:
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
func(self._ndarray.T, limit=limit, mask=mask.T)
def _pad_or_backfill(
self,
*,
method: FillnaOptions,
limit: int | None = None,
limit_area: Literal["inside", "outside"] | None = None,
copy: bool = True,
) -> Self:
mask = self.isna()
if mask.any():
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T)
npvalues = npvalues.T
if copy:
new_values = self._from_backing_data(npvalues)
else:
new_values = self
else:
if copy:
new_values = self.copy()
else:
new_values = self
return new_values
@doc(ExtensionArray.fillna)
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
mask = self.isna()
# error: Argument 2 to "check_value_size" has incompatible type
# "ExtensionArray"; expected "ndarray"
value = missing.check_value_size(
value, mask, len(self) # type: ignore[arg-type]
)
if mask.any():
if method is not None:
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, mask=mask.T)
npvalues = npvalues.T
# TODO: NumpyExtensionArray previously did not copy; we need tests
# for this
new_values = self._from_backing_data(npvalues)
else:
# fill with value
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
# We validate the fill_value even if there is nothing to fill
if value is not None:
self._validate_setitem_value(value)
if not copy:
new_values = self[:]
else:
new_values = self.copy()
return new_values
# ------------------------------------------------------------------------
# Reductions
def _wrap_reduction_result(self, axis: AxisInt | None, result):
if axis is None or self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# ------------------------------------------------------------------------
# __array_function__ methods
def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
"""
Analogue to np.putmask(self, mask, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
np.putmask(self._ndarray, mask, value)
def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self:
"""
Analogue to np.where(mask, self, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
res_values = np.where(mask, self._ndarray, value)
if res_values.dtype != self._ndarray.dtype:
raise AssertionError(
# GH#56410
"Something has gone wrong, please report a bug at "
"github.com/pandas-dev/pandas/"
)
return self._from_backing_data(res_values)
# ------------------------------------------------------------------------
# Index compat methods
def insert(self, loc: int, item) -> Self:
"""
Make new ExtensionArray inserting new item at location. Follows
Python list.append semantics for negative values.
Parameters
----------
loc : int
item : object
Returns
-------
type(self)
"""
loc = validate_insert_loc(loc, len(self))
code = self._validate_scalar(item)
new_vals = np.concatenate(
(
self._ndarray[:loc],
np.asarray([code], dtype=self._ndarray.dtype),
self._ndarray[loc:],
)
)
return self._from_backing_data(new_vals)
# ------------------------------------------------------------------------
# Additional array methods
# These are not part of the EA API, but we implement them because
# pandas assumes they're there.
def value_counts(self, dropna: bool = True) -> Series:
"""
Return a Series containing counts of unique values.
Parameters
----------
dropna : bool, default True
Don't include counts of NA values.
Returns
-------
Series
"""
if self.ndim != 1:
raise NotImplementedError
from pandas import (
Index,
Series,
)
if dropna:
# error: Unsupported operand type for ~ ("ExtensionArray")
values = self[~self.isna()]._ndarray # type: ignore[operator]
else:
values = self._ndarray
result = value_counts(values, sort=False, dropna=dropna)
index_arr = self._from_backing_data(np.asarray(result.index._data))
index = Index(index_arr, name=result.index.name)
return Series(result._values, index=index, name=result.name, copy=False)
def _quantile(
self,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> Self:
# TODO: disable for Categorical if not ordered?
mask = np.asarray(self.isna())
arr = self._ndarray
fill_value = self._internal_fill_value
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
res_values = self._cast_quantile_result(res_values)
return self._from_backing_data(res_values)
# TODO: see if we can share this with other dispatch-wrapping methods
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
"""
Cast the result of quantile_with_mask to an appropriate dtype
to pass to _from_backing_data in _quantile.
"""
return res_values
# ------------------------------------------------------------------------
# numpy-like methods
@classmethod
def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
"""
Analogous to np.empty(shape, dtype=dtype)
Parameters
----------
shape : tuple[int]
dtype : ExtensionDtype
"""
# The base implementation uses a naive approach to find the dtype
# for the backing ndarray
arr = cls._from_sequence([], dtype=dtype)
backing = np.empty(shape, dtype=arr._ndarray.dtype)
return arr._from_backing_data(backing)
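# --- Illustrative usage sketch (not part of pandas) -------------------------
# The mixin above is inherited by concrete backed arrays such as DatetimeArray.
# The guarded snippet below assumes an installed pandas and only illustrates how
# take/insert/searchsorted defer to the backing ndarray via the hooks defined here.
if __name__ == "__main__":
    import pandas as pd

    arr = pd.array(pd.to_datetime(["2021-01-01", "2021-01-03"]))  # DatetimeArray
    # fill_value passes through _validate_scalar before the ndarray take
    filled = arr.take([0, -1, 1], allow_fill=True, fill_value=pd.NaT)
    # Index-compat insert concatenates on the backing i8 data
    inserted = arr.insert(1, pd.Timestamp("2021-01-02"))
    # searchsorted validates the value, then delegates to ndarray.searchsorted
    pos = arr.searchsorted(pd.Timestamp("2021-01-02"))
    print(len(filled), len(inserted), int(pos))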

View File

@@ -0,0 +1,207 @@
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas._libs.lib import i8max
from pandas._libs.tslibs import (
BaseOffset,
OutOfBoundsDatetime,
Timedelta,
Timestamp,
iNaT,
)
if TYPE_CHECKING:
from pandas._typing import npt
def generate_regular_range(
start: Timestamp | Timedelta | None,
end: Timestamp | Timedelta | None,
periods: int | None,
freq: BaseOffset,
unit: str = "ns",
) -> npt.NDArray[np.intp]:
"""
Generate a range of dates or timestamps with the spans between dates
described by the given `freq` DateOffset.
Parameters
----------
start : Timedelta, Timestamp or None
First point of produced date range.
end : Timedelta, Timestamp or None
Last point of produced date range.
periods : int or None
Number of periods in produced date range.
freq : Tick
Describes space between dates in produced date range.
unit : str, default "ns"
The resolution the output is meant to represent.
Returns
-------
ndarray[np.int64]
Representing the given resolution.
"""
istart = start._value if start is not None else None
iend = end._value if end is not None else None
freq.nanos # raises if non-fixed frequency
td = Timedelta(freq)
b: int
e: int
try:
td = td.as_unit(unit, round_ok=False)
except ValueError as err:
raise ValueError(
f"freq={freq} is incompatible with unit={unit}. "
"Use a lower freq or a higher unit instead."
) from err
stride = int(td._value)
if periods is None and istart is not None and iend is not None:
b = istart
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = b + (iend - b) // stride * stride + stride // 2 + 1
elif istart is not None and periods is not None:
b = istart
e = _generate_range_overflow_safe(b, periods, stride, side="start")
elif iend is not None and periods is not None:
e = iend + stride
b = _generate_range_overflow_safe(e, periods, stride, side="end")
else:
raise ValueError(
"at least 'start' or 'end' should be specified if a 'period' is given."
)
with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
# and incorrectly return an empty array if not caught.
try:
values = np.arange(b, e, stride, dtype=np.int64)
except FloatingPointError:
xdr = [b]
while xdr[-1] != e:
xdr.append(xdr[-1] + stride)
values = np.array(xdr[:-1], dtype=np.int64)
return values
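# Worked example (illustrative, not from the pandas docs): for start=0, end=10
# hours, freq="h", unit="ns", the endpoint arithmetic above reduces to integers:
#   stride = 3_600_000_000_000                      # one hour in nanoseconds
#   b = 0
#   e = b + (36_000_000_000_000 - b) // stride * stride + stride // 2 + 1
#     = 36_000_000_000_000 + 1_800_000_000_000 + 1
#   np.arange(b, e, stride) -> 11 evenly spaced i8 values covering hours 0..10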
def _generate_range_overflow_safe(
endpoint: int, periods: int, stride: int, side: str = "start"
) -> int:
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
as OutOfBoundsDatetime.
Parameters
----------
endpoint : int
nanosecond timestamp of the known endpoint of the desired range
periods : int
number of periods in the desired range
stride : int
nanoseconds between periods in the desired range
side : {'start', 'end'}
which end of the range `endpoint` refers to
Returns
-------
other_end : int
Raises
------
OutOfBoundsDatetime
"""
# GH#14187 raise instead of incorrectly wrapping around
assert side in ["start", "end"]
i64max = np.uint64(i8max)
msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
with np.errstate(over="raise"):
# if periods * strides cannot be multiplied within the *uint64* bounds,
# we cannot salvage the operation by recursing, so raise
try:
addend = np.uint64(periods) * np.uint64(np.abs(stride))
except FloatingPointError as err:
raise OutOfBoundsDatetime(msg) from err
if np.abs(addend) <= i64max:
# relatively easy case without casting concerns
return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
elif (endpoint > 0 and side == "start" and stride > 0) or (
endpoint < 0 < stride and side == "end"
):
# no chance of not-overflowing
raise OutOfBoundsDatetime(msg)
elif side == "end" and endpoint - stride <= i64max < endpoint:
# in generate_regular_range we added `stride`, thereby overflowing
# the bounds. Adjust to fix this.
return _generate_range_overflow_safe(
endpoint - stride, periods - 1, stride, side
)
# split into smaller pieces
mid_periods = periods // 2
remaining = periods - mid_periods
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side))
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
def _generate_range_overflow_safe_signed(
endpoint: int, periods: int, stride: int, side: str
) -> int:
"""
A special case for _generate_range_overflow_safe where `periods * stride`
can be calculated without overflowing int64 bounds.
"""
assert side in ["start", "end"]
if side == "end":
stride *= -1
with np.errstate(over="raise"):
addend = np.int64(periods) * np.int64(stride)
try:
# easy case with no overflows
result = np.int64(endpoint) + addend
if result == iNaT:
# Putting this into a DatetimeArray/TimedeltaArray
# would incorrectly be interpreted as NaT
raise OverflowError
return int(result)
except (FloatingPointError, OverflowError):
# with endpoint negative and addend positive we risk
# FloatingPointError; with the signs reversed we risk OverflowError
pass
# if stride and endpoint had opposite signs, then endpoint + addend
# should never overflow. so they must have the same signs
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
if stride > 0:
# watch out for very special case in which we just slightly
# exceed implementation bounds, but when passing the result to
# np.arange will get a result slightly within the bounds
uresult = np.uint64(endpoint) + np.uint64(addend)
i64max = np.uint64(i8max)
assert uresult > i64max
if uresult <= i64max + np.uint64(stride):
return int(uresult)
raise OutOfBoundsDatetime(
f"Cannot generate range with {side}={endpoint} and periods={periods}"
)
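if __name__ == "__main__":
    # Minimal sketch (not part of pandas): derive the far endpoint for an
    # hourly, 4-period range anchored at the epoch; the values stay well inside
    # int64 bounds, so only the simple signed path above is exercised.
    hour_ns = 3_600_000_000_000
    other_end = _generate_range_overflow_safe(0, periods=4, stride=hour_ns, side="start")
    assert other_end == 4 * hour_ns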

View File

@@ -0,0 +1,63 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._libs import lib
from pandas.errors import LossySetitemError
from pandas.core.dtypes.cast import np_can_hold_element
from pandas.core.dtypes.common import is_numeric_dtype
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
npt,
)
def to_numpy_dtype_inference(
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
) -> tuple[npt.DTypeLike, Any]:
if dtype is None and is_numeric_dtype(arr.dtype):
dtype_given = False
if hasna:
if arr.dtype.kind == "b":
dtype = np.dtype(np.object_)
else:
if arr.dtype.kind in "iu":
dtype = np.dtype(np.float64)
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
na_value = np.nan
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
elif dtype is not None:
dtype = np.dtype(dtype)
dtype_given = True
else:
dtype_given = True
if na_value is lib.no_default:
if dtype is None or not hasna:
na_value = arr.dtype.na_value
elif dtype.kind == "f": # type: ignore[union-attr]
na_value = np.nan
elif dtype.kind == "M": # type: ignore[union-attr]
na_value = np.datetime64("nat")
elif dtype.kind == "m": # type: ignore[union-attr]
na_value = np.timedelta64("nat")
else:
na_value = arr.dtype.na_value
if not dtype_given and hasna:
try:
np_can_hold_element(dtype, na_value) # type: ignore[arg-type]
except LossySetitemError:
dtype = np.dtype(np.object_)
return dtype, na_value
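if __name__ == "__main__":
    # Minimal sketch (not part of pandas): nullable Int64 data containing a
    # missing value is inferred to float64 with NaN as the fill, per the
    # branches above (integer kinds with NAs are upcast).
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    dtype, na_value = to_numpy_dtype_inference(arr, None, lib.no_default, hasna=True)
    assert dtype == np.dtype("float64") and np.isnan(na_value)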

View File

@@ -0,0 +1,7 @@
from pandas.core.arrays.arrow.accessors import (
ListAccessor,
StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray
__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]

View File

@@ -0,0 +1,66 @@
from __future__ import annotations
import warnings
import numpy as np
import pyarrow
from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level
def fallback_performancewarning(version: str | None = None) -> None:
"""
Raise a PerformanceWarning for falling back to ExtensionArray's
non-pyarrow method
"""
msg = "Falling back on a non-pyarrow code path which may decrease performance."
if version is not None:
msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
def pyarrow_array_to_numpy_and_mask(
arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
"""
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.
At the moment pyarrow.BooleanArray is not supported.
Parameters
----------
arr : pyarrow.Array
dtype : numpy.dtype
Returns
-------
(data, mask)
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
dtype = np.dtype(dtype)
if pyarrow.types.is_null(arr.type):
# No initialization of data is needed since everything is null
data = np.empty(len(arr), dtype=dtype)
mask = np.zeros(len(arr), dtype=bool)
return data, mask
buflist = arr.buffers()
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
# See also https://github.com/pandas-dev/pandas/issues/40896
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = buflist[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
)
mask = np.asarray(mask)
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask
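if __name__ == "__main__":
    # Minimal sketch (not part of pandas): split a primitive pyarrow array with
    # one null into its raw data and a validity mask (False marks the missing slot).
    arr = pyarrow.array([1, None, 3], type=pyarrow.int64())
    data, mask = pyarrow_array_to_numpy_and_mask(arr, np.dtype("int64"))
    assert mask.tolist() == [True, False, True]
    assert data[mask].tolist() == [1, 3]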

View File

@@ -0,0 +1,473 @@
"""Accessors for arrow-backed data."""
from __future__ import annotations
from abc import (
ABCMeta,
abstractmethod,
)
from typing import (
TYPE_CHECKING,
cast,
)
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
)
from pandas.core.dtypes.common import is_list_like
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
from pandas.core.dtypes.dtypes import ArrowDtype
if TYPE_CHECKING:
from collections.abc import Iterator
from pandas import (
DataFrame,
Series,
)
class ArrowAccessor(metaclass=ABCMeta):
@abstractmethod
def __init__(self, data, validation_msg: str) -> None:
self._data = data
self._validation_msg = validation_msg
self._validate(data)
@abstractmethod
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
pass
def _validate(self, data):
dtype = data.dtype
if not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
# Raise AttributeError so that inspect can handle invalid Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
@property
def _pa_array(self):
return self._data.array._pa_array
class ListAccessor(ArrowAccessor):
"""
Accessor object for list data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow list data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg="Can only use the '.list' accessor with "
"'list[pyarrow]' dtype, not {dtype}.",
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return (
pa.types.is_list(pyarrow_dtype)
or pa.types.is_fixed_size_list(pyarrow_dtype)
or pa.types.is_large_list(pyarrow_dtype)
)
def len(self) -> Series:
"""
Return the length of each list in the Series.
Returns
-------
pandas.Series
The length of each list.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.len()
0 3
1 1
dtype: int32[pyarrow]
"""
from pandas import Series
value_lengths = pc.list_value_length(self._pa_array)
return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
def __getitem__(self, key: int | slice) -> Series:
"""
Index or slice lists in the Series.
Parameters
----------
key : int | slice
Index or slice of indices to access from each list.
Returns
-------
pandas.Series
The list at requested index.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list[0]
0 1
1 3
dtype: int64[pyarrow]
"""
from pandas import Series
if isinstance(key, int):
# TODO: Support negative key but pyarrow does not allow
# element index to be an array.
# if key < 0:
# key = pc.add(key, pc.list_value_length(self._pa_array))
element = pc.list_element(self._pa_array, key)
return Series(element, dtype=ArrowDtype(element.type))
elif isinstance(key, slice):
if pa_version_under11p0:
raise NotImplementedError(
f"List slice not supported by pyarrow {pa.__version__}."
)
# TODO: Support negative start/stop/step, ideally this would be added
# upstream in pyarrow.
start, stop, step = key.start, key.stop, key.step
if start is None:
# TODO: When adding negative step support
# this should be set to the last element of the array
# when step is negative.
start = 0
if step is None:
step = 1
sliced = pc.list_slice(self._pa_array, start, stop, step)
return Series(sliced, dtype=ArrowDtype(sliced.type))
else:
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
def __iter__(self) -> Iterator:
raise TypeError(f"'{type(self).__name__}' object is not iterable")
def flatten(self) -> Series:
"""
Flatten list values.
Returns
-------
pandas.Series
The data from all lists in the series flattened.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.flatten()
0 1
1 2
2 3
3 3
dtype: int64[pyarrow]
"""
from pandas import Series
flattened = pc.list_flatten(self._pa_array)
return Series(flattened, dtype=ArrowDtype(flattened.type))
class StructAccessor(ArrowAccessor):
"""
Accessor object for structured data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow struct data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg=(
"Can only use the '.struct' accessor with 'struct[pyarrow]' "
"dtype, not {dtype}."
),
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return pa.types.is_struct(pyarrow_dtype)
@property
def dtypes(self) -> Series:
"""
Return the dtype object of each child field of the struct.
Returns
-------
pandas.Series
The data type of each child field.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.dtypes
version int64[pyarrow]
project string[pyarrow]
dtype: object
"""
from pandas import (
Index,
Series,
)
pa_type = self._data.dtype.pyarrow_dtype
types = [ArrowDtype(struct.type) for struct in pa_type]
names = [struct.name for struct in pa_type]
return Series(types, index=Index(names))
def field(
self,
name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
) -> Series:
"""
Extract a child field of a struct as a Series.
Parameters
----------
name_or_index : str | bytes | int | expression | list
Name or index of the child field to extract.
For list-like inputs, this will index into a nested
struct.
Returns
-------
pandas.Series
The data corresponding to the selected child field.
See Also
--------
Series.struct.explode : Return all child fields as a DataFrame.
Notes
-----
The name of the resulting Series will be set using the following
rules:
- For string, bytes, or integer `name_or_index` (or a list of these, for
a nested selection), the Series name is set to the selected
field's name.
- For a :class:`pyarrow.compute.Expression`, this is set to
the string form of the expression.
- For list-like `name_or_index`, the name will be set to the
name of the final field selected.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
Extract by field name.
>>> s.struct.field("project")
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
Extract by field index.
>>> s.struct.field(0)
0 1
1 2
2 1
Name: version, dtype: int64[pyarrow]
Or an expression
>>> import pyarrow.compute as pc
>>> s.struct.field(pc.field("project"))
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
For nested struct types, you can pass a list of values to index
multiple levels:
>>> version_type = pa.struct([
... ("major", pa.int64()),
... ("minor", pa.int64()),
... ])
>>> s = pd.Series(
... [
... {"version": {"major": 1, "minor": 5}, "project": "pandas"},
... {"version": {"major": 2, "minor": 1}, "project": "pandas"},
... {"version": {"major": 1, "minor": 26}, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", version_type), ("project", pa.string())]
... ))
... )
>>> s.struct.field(["version", "minor"])
0 5
1 1
2 26
Name: minor, dtype: int64[pyarrow]
>>> s.struct.field([0, 0])
0 1
1 2
2 1
Name: major, dtype: int64[pyarrow]
"""
from pandas import Series
def get_name(
level_name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
data: pa.ChunkedArray,
):
if isinstance(level_name_or_index, int):
name = data.type.field(level_name_or_index).name
elif isinstance(level_name_or_index, (str, bytes)):
name = level_name_or_index
elif isinstance(level_name_or_index, pc.Expression):
name = str(level_name_or_index)
elif is_list_like(level_name_or_index):
# For nested input like [2, 1, 2]
# iteratively get the struct and field name. The last
# one is used for the name of the index.
level_name_or_index = list(reversed(level_name_or_index))
selected = data
while level_name_or_index:
# we need the cast, otherwise mypy complains about
# getting ints, bytes, or str here, which isn't possible.
level_name_or_index = cast(list, level_name_or_index)
name_or_index = level_name_or_index.pop()
name = get_name(name_or_index, selected)
selected = selected.type.field(selected.type.get_field_index(name))
name = selected.name
else:
raise ValueError(
"name_or_index must be an int, str, bytes, "
"pyarrow.compute.Expression, or list of those"
)
return name
pa_arr = self._data.array._pa_array
name = get_name(name_or_index, pa_arr)
field_arr = pc.struct_field(pa_arr, name_or_index)
return Series(
field_arr,
dtype=ArrowDtype(field_arr.type),
index=self._data.index,
name=name,
)
def explode(self) -> DataFrame:
"""
Extract all child fields of a struct as a DataFrame.
Returns
-------
pandas.DataFrame
The data corresponding to all child fields.
See Also
--------
Series.struct.field : Return a single child field as a Series.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.explode()
version project
0 1 pandas
1 2 pandas
2 1 numpy
"""
from pandas import concat
pa_type = self._pa_array.type
return concat(
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
)
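if __name__ == "__main__":
    # Minimal sketch (not part of pandas, assumes pyarrow >= 11 is installed):
    # the list accessor also supports slicing, complementing the integer
    # indexing shown in the docstrings above.
    import pandas as pd

    s = pd.Series(
        [[1, 2, 3], [3]],
        dtype=pd.ArrowDtype(pa.list_(pa.int64())),
    )
    print(s.list[0:1])  # each row keeps only its first element: [[1], [3]]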

File diff suppressed because it is too large

View File

@@ -0,0 +1,174 @@
from __future__ import annotations
import json
from typing import TYPE_CHECKING
import pyarrow
from pandas.compat import pa_version_under14p1
from pandas.core.dtypes.dtypes import (
IntervalDtype,
PeriodDtype,
)
from pandas.core.arrays.interval import VALID_CLOSED
if TYPE_CHECKING:
from pandas._typing import IntervalClosedType
class ArrowPeriodType(pyarrow.ExtensionType):
def __init__(self, freq) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
self._freq = freq
pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")
@property
def freq(self):
return self._freq
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"freq": self.freq}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
metadata = json.loads(serialized.decode())
return ArrowPeriodType(metadata["freq"])
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return type(self) == type(other) and self.freq == other.freq
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), self.freq))
def to_pandas_dtype(self) -> PeriodDtype:
return PeriodDtype(freq=self.freq)
# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)
class ArrowIntervalType(pyarrow.ExtensionType):
def __init__(self, subtype, closed: IntervalClosedType) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
assert closed in VALID_CLOSED
self._closed: IntervalClosedType = closed
if not isinstance(subtype, pyarrow.DataType):
subtype = pyarrow.type_for_alias(str(subtype))
self._subtype = subtype
storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
@property
def subtype(self):
return self._subtype
@property
def closed(self) -> IntervalClosedType:
return self._closed
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"subtype": str(self.subtype), "closed": self.closed}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
metadata = json.loads(serialized.decode())
subtype = pyarrow.type_for_alias(metadata["subtype"])
closed = metadata["closed"]
return ArrowIntervalType(subtype, closed)
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return (
type(self) == type(other)
and self.subtype == other.subtype
and self.closed == other.closed
)
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), str(self.subtype), self.closed))
def to_pandas_dtype(self) -> IntervalDtype:
return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
"""
def patch_pyarrow():
# starting from pyarrow 14.0.1, it has its own mechanism
if not pa_version_under14p1:
return
# if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
if getattr(pyarrow, "_hotfix_installed", False):
return
class ForbiddenExtensionType(pyarrow.ExtensionType):
def __arrow_ext_serialize__(self):
return b""
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
import io
import pickletools
out = io.StringIO()
pickletools.dis(serialized, out)
raise RuntimeError(
_ERROR_MSG.format(
storage_type=storage_type,
serialized=serialized,
pickle_disassembly=out.getvalue(),
)
)
pyarrow.unregister_extension_type("arrow.py_extension_type")
pyarrow.register_extension_type(
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
)
pyarrow._hotfix_installed = True
patch_pyarrow()
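if __name__ == "__main__":
    # Minimal sketch (not part of pandas): the metadata roundtrip Arrow performs
    # when (de)serializing the interval extension type registered above.
    typ = ArrowIntervalType(pyarrow.int64(), "both")
    payload = typ.__arrow_ext_serialize__()
    restored = ArrowIntervalType.__arrow_ext_deserialize__(typ.storage_type, payload)
    assert restored == typ  # equality compares subtype and closed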

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff.