initial commit
@@ -0,0 +1,25 @@
def get_groupby_method_args(name, obj):
    """
    Get required arguments for a groupby method.

    When parametrizing a test over groupby methods (e.g. "sum", "mean", "fillna"),
    it is often the case that arguments are required for certain methods.

    Parameters
    ----------
    name : str
        Name of the method.
    obj : Series or DataFrame
        pandas object that is being grouped.

    Returns
    -------
    A tuple of required arguments for the method.
    """
    if name in ("nth", "fillna", "take"):
        return (0,)
    if name == "quantile":
        return (0.5,)
    if name == "corrwith":
        return (obj,)
    return ()
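
An illustrative sketch of how this helper is typically consumed in a parametrized test; the test name and frame below are hypothetical, not from the diff:

import pytest
from pandas import DataFrame

@pytest.mark.parametrize("method", ["sum", "quantile", "nth", "corrwith"])
def test_method_smoke(method):
    # hypothetical frame; any grouped object works
    df = DataFrame({"A": [1, 1, 2], "B": [3.0, 4.0, 5.0]})
    args = get_groupby_method_args(method, df)
    # "quantile" needs (0.5,), "nth" needs (0,), "corrwith" needs (df,),
    # the remaining methods take no positional arguments
    getattr(df.groupby("A"), method)(*args)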
File diff suppressed because it is too large
@@ -0,0 +1,435 @@
"""
test cython .agg behavior
"""

import numpy as np
import pytest

from pandas.core.dtypes.common import (
    is_float_dtype,
    is_integer_dtype,
)

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    NaT,
    Series,
    Timedelta,
    Timestamp,
    bdate_range,
)
import pandas._testing as tm
import pandas.core.common as com


@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    data = {
        "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
        "B": ["A", "B"] * 6,
        "C": np.random.default_rng(2).standard_normal(12),
    }
    df = DataFrame(data)
    df.loc[2:10:2, "C"] = np.nan

    op = lambda x: getattr(x, op_name)()

    # single column
    grouped = df.drop(["B"], axis=1).groupby("A")
    exp = {cat: op(group["C"]) for cat, group in grouped}
    exp = DataFrame({"C": exp})
    exp.index.name = "A"
    result = op(grouped)
    tm.assert_frame_equal(result, exp)

    # multiple columns
    grouped = df.groupby(["A", "B"])
    expd = {}
    for (cat1, cat2), group in grouped:
        expd.setdefault(cat1, {})[cat2] = op(group["C"])
    exp = DataFrame(expd).T.stack(future_stack=True)
    exp.index.names = ["A", "B"]
    exp.name = "C"

    result = op(grouped)["C"]
    if op_name in ["sum", "prod"]:
        tm.assert_series_equal(result, exp)


def test_cython_agg_boolean():
    frame = DataFrame(
        {
            "a": np.random.default_rng(2).integers(0, 5, 50),
            "b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"),
        }
    )
    result = frame.groupby("a")["b"].mean()
    msg = "using SeriesGroupBy.mean"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        expected = frame.groupby("a")["b"].agg(np.mean)

    tm.assert_series_equal(result, expected)


def test_cython_agg_nothing_to_agg():
    frame = DataFrame(
        {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
    )

    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    with pytest.raises(TypeError, match=msg):
        frame.groupby("a")["b"].mean(numeric_only=True)

    frame = DataFrame(
        {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
    )

    result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
    expected = DataFrame(
        [], index=frame["a"].sort_values().drop_duplicates(), columns=[]
    )
    tm.assert_frame_equal(result, expected)


def test_cython_agg_nothing_to_agg_with_dates():
    frame = DataFrame(
        {
            "a": np.random.default_rng(2).integers(0, 5, 50),
            "b": ["foo", "bar"] * 25,
            "dates": pd.date_range("now", periods=50, freq="min"),
        }
    )
    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    with pytest.raises(TypeError, match=msg):
        frame.groupby("b").dates.mean(numeric_only=True)


def test_cython_agg_frame_columns():
    # #2113
    df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(level=0, axis="columns").mean()
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(level=0, axis="columns").mean()
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(level=0, axis="columns").mean()
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(level=0, axis="columns").mean()


def test_cython_agg_return_dict():
    # GH 16741
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )

    ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict())
    expected = Series(
        [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(ts, expected)


def test_cython_fail_agg():
    dr = bdate_range("1/1/2000", periods=50)
    ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)

    grouped = ts.groupby(lambda x: x.month)
    summed = grouped.sum()
    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        expected = grouped.agg(np.sum)
    tm.assert_series_equal(summed, expected)


@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("sum", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    df = DataFrame(np.random.default_rng(2).standard_normal(1000))
    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)

    result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
    warn = FutureWarning if targop in com._cython_table else None
    msg = f"using DataFrameGroupBy.{op}"
    with tm.assert_produces_warning(warn, match=msg):
        # GH#53425
        expected = df.groupby(labels).agg(targop)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    df = DataFrame([11, 12, 13])
    grps = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    g = df.groupby(pd.cut(df[0], grps), observed=observed)
    result = g._cython_agg_general(op, alt=None, numeric_only=True)

    g = df.groupby(pd.cut(df[0], grps), observed=observed)
    expected = g.agg(lambda x: targop(x))
    tm.assert_frame_equal(result, expected)
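
The min_count distinction referenced in the comment above, shown through the public API; an illustrative sketch with a hypothetical frame, not from the diff:

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, 2.0, 3.0]})
# default min_count=0: every group, even a small one, gets a summed value
print(df.groupby("key")["val"].sum())             # a -> 3.0, b -> 3.0
# min_count=2: group "b" has only one row, so its result becomes NaN
print(df.groupby("key")["val"].sum(min_count=2))  # a -> 3.0, b -> NaN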


def test_cython_agg_empty_buckets_nanops(observed):
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = DataFrame([11, 12, 13], columns=["a"])
    grps = np.arange(0, 25, 5, dtype=int)
    # add / sum
    result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
        "sum", alt=None, numeric_only=True
    )
    intervals = pd.interval_range(0, 20, freq=5)
    expected = DataFrame(
        {"a": [0, 0, 36, 0]},
        index=pd.CategoricalIndex(intervals, name="a", ordered=True),
    )
    if observed:
        expected = expected[expected.a != 0]

    tm.assert_frame_equal(result, expected)

    # prod
    result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
        "prod", alt=None, numeric_only=True
    )
    expected = DataFrame(
        {"a": [1, 1, 1716, 1]},
        index=pd.CategoricalIndex(intervals, name="a", ordered=True),
    )
    if observed:
        expected = expected[expected.a != 1]

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    # https://github.com/pandas-dev/pandas/issues/19526
    df = DataFrame({"a": [0, 1], "b": [data, NaT]})
    index = Index([0, 1], name="a")

    # We will group by a and test the cython aggregations
    expected = DataFrame({"b": [data, NaT]}, index=index)

    result = df.groupby("a").aggregate(op)
    tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(
    "agg",
    [
        "min",
        "max",
        "count",
        "sum",
        "prod",
        "var",
        "mean",
        "median",
        "ohlc",
        "cumprod",
        "cumsum",
        "shift",
        "any",
        "all",
        "quantile",
        "first",
        "last",
        "rank",
        "cummin",
        "cummax",
    ],
)
def test_read_only_buffer_source_agg(agg):
    # https://github.com/pandas-dev/pandas/issues/36014
    df = DataFrame(
        {
            "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
            "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
        }
    )
    df._mgr.arrays[0].flags.writeable = False

    result = df.groupby(["species"]).agg({"sepal_length": agg})
    expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})

    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        "median",
        "prod",
        "min",
        "max",
    ],
)
def test_cython_agg_nullable_int(op_name):
    # ensure that the cython-based aggregations don't fail for nullable dtype
    # (eg https://github.com/pandas-dev/pandas/issues/37415)
    df = DataFrame(
        {
            "A": ["A", "B"] * 5,
            "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"),
        }
    )
    result = getattr(df.groupby("A")["B"], op_name)()
    df2 = df.assign(B=df["B"].astype("float64"))
    expected = getattr(df2.groupby("A")["B"], op_name)()
    if op_name in ("mean", "median"):
        convert_integer = False
    else:
        convert_integer = True
    expected = expected.convert_dtypes(convert_integer=convert_integer)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_count_masked_returns_masked_dtype(dtype):
    df = DataFrame(
        {
            "A": [1, 1],
            "B": pd.array([1, pd.NA], dtype=dtype),
            "C": pd.array([1, 1], dtype=dtype),
        }
    )
    result = df.groupby("A").count()
    expected = DataFrame(
        [[1, 2]], index=Index([1], name="A"), columns=["B", "C"], dtype="Int64"
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
    "op_name, action",
    [
        # ("count", "always_int"),
        ("sum", "large_int"),
        # ("std", "always_float"),
        ("var", "always_float"),
        # ("sem", "always_float"),
        ("mean", "always_float"),
        ("median", "always_float"),
        ("prod", "large_int"),
        ("min", "preserve"),
        ("max", "preserve"),
        ("first", "preserve"),
        ("last", "preserve"),
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        pd.array([1, 2, 3, 4], dtype="Int64"),
        pd.array([1, 2, 3, 4], dtype="Int8"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
        pd.array([True, True, False, False], dtype="boolean"),
    ],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    if with_na:
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # for any int/bool use Int64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        elif is_integer_dtype(data.dtype):
            # match the numpy dtype we'd get with the non-nullable analogue
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype

    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
@@ -0,0 +1,392 @@
import numpy as np
import pytest

from pandas.errors import NumbaUtilError

from pandas import (
    DataFrame,
    Index,
    NamedAgg,
    Series,
    option_context,
)
import pandas._testing as tm

pytestmark = pytest.mark.single_cpu


def test_correct_function_signature():
    pytest.importorskip("numba")

    def incorrect_function(x):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    with pytest.raises(NumbaUtilError, match="The first 2"):
        data.groupby("key").agg(incorrect_function, engine="numba")

    with pytest.raises(NumbaUtilError, match="The first 2"):
        data.groupby("key")["data"].agg(incorrect_function, engine="numba")


def test_check_nopython_kwargs():
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return sum(values) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    with pytest.raises(NumbaUtilError, match="numba does not support"):
        data.groupby("key").agg(incorrect_function, engine="numba", a=1)

    with pytest.raises(NumbaUtilError, match="numba does not support"):
        data.groupby("key")["data"].agg(incorrect_function, engine="numba", a=1)
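
The signature rule these two tests exercise: with engine="numba" a user-defined aggregation receives each group's values and index as its first two arguments, while the cython engine passes each group as a Series. An illustrative sketch, not from the diff:

import numpy as np
import pandas as pd

def grp_mean(values, index):
    # with engine="numba", the first two arguments must be values and index
    return np.mean(values)

df = pd.DataFrame({"key": ["a", "a", "b"], "data": [1.0, 2.0, 3.0]})
res = df.groupby("key")["data"].agg(grp_mean, engine="numba")
# cython-engine equivalent: the UDF sees each group as a Series
ref = df.groupby("key")["data"].agg(lambda x: np.mean(x), engine="cython")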


@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    pytest.importorskip("numba")

    def func_numba(values, index):
        return np.mean(values) * 2.7

    if jit:
        # Test accepted jitted functions
        import numba

        func_numba = numba.jit(func_numba)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0, as_index=as_index)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")

    tm.assert_equal(result, expected)


@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    def func_2(values, index):
        return np.mean(values) * 2.7

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
    tm.assert_equal(result, expected)

    # Add func_2 to the cache
    result = grouped.agg(func_2, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")
    tm.assert_equal(result, expected)

    # Retest func_1 which should use the cache
    result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
    tm.assert_equal(result, expected)


def test_use_global_config():
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)
    expected = grouped.agg(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        result = grouped.agg(func_1, engine=None)
    tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(
    "agg_kwargs",
    [
        {"func": ["min", "max"]},
        {"func": "min"},
        {"func": {1: ["min", "max"], 2: "sum"}},
        {"bmin": NamedAgg(column=1, aggfunc="min")},
    ],
)
def test_multifunc_numba_vs_cython_frame(agg_kwargs):
    pytest.importorskip("numba")
    data = DataFrame(
        {
            0: ["a", "a", "b", "b", "a"],
            1: [1.0, 2.0, 3.0, 4.0, 5.0],
            2: [1, 2, 3, 4, 5],
        },
        columns=[0, 1, 2],
    )
    grouped = data.groupby(0)
    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "agg_kwargs,expected_func",
    [
        ({"func": lambda values, index: values.sum()}, "sum"),
        # FIXME
        pytest.param(
            {
                "func": [
                    lambda values, index: values.sum(),
                    lambda values, index: values.min(),
                ]
            },
            ["sum", "min"],
            marks=pytest.mark.xfail(
                reason="This doesn't work yet! Fails in nopython pipeline!"
            ),
        ),
    ],
)
def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
    pytest.importorskip("numba")
    data = DataFrame(
        {
            0: ["a", "a", "b", "b", "a"],
            1: [1.0, 2.0, 3.0, 4.0, 5.0],
            2: [1, 2, 3, 4, 5],
        },
        columns=[0, 1, 2],
    )
    grouped = data.groupby(0)
    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(expected_func, engine="cython")
    # check_dtype can be removed if GH 44952 is addressed
    # Currently, UDFs still always return float64 while reductions can preserve dtype
    tm.assert_frame_equal(result, expected, check_dtype=False)


@pytest.mark.parametrize(
    "agg_kwargs",
    [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
)
def test_multifunc_numba_vs_cython_series(agg_kwargs):
    pytest.importorskip("numba")
    labels = ["a", "a", "b", "b", "a"]
    data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
    grouped = data.groupby(labels)
    agg_kwargs["engine"] = "numba"
    result = grouped.agg(**agg_kwargs)
    agg_kwargs["engine"] = "cython"
    expected = grouped.agg(**agg_kwargs)
    if isinstance(expected, DataFrame):
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)


@pytest.mark.single_cpu
@pytest.mark.parametrize(
    "data,agg_kwargs",
    [
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": ["min", "max"]},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": "min"},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": {1: ["min", "max"], 2: "sum"}},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"min_col": NamedAgg(column=1, aggfunc="min")},
        ),
    ],
)
def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
    pytest.importorskip("numba")
    labels = ["a", "a", "b", "b", "a"]
    grouped = data.groupby(labels)
    result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True})
    expected = grouped.agg(**agg_kwargs, engine="numba")
    if isinstance(expected, DataFrame):
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)


def test_args_not_cached():
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]
    result = grouped_x.agg(sum_last, 1, engine="numba")
    expected = Series([1.0] * 2, name="x", index=Index([0, 1], name="id"))
    tm.assert_series_equal(result, expected)

    result = grouped_x.agg(sum_last, 2, engine="numba")
    expected = Series([2.0] * 2, name="x", index=Index([0, 1], name="id"))
    tm.assert_series_equal(result, expected)


def test_index_data_correctly_passed():
    # GH 43133
    pytest.importorskip("numba")

    def f(values, index):
        return np.mean(index)

    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
    result = df.groupby("group").aggregate(f, engine="numba")
    expected = DataFrame(
        [-1.5, -3.0], columns=["v"], index=Index(["A", "B"], name="group")
    )
    tm.assert_frame_equal(result, expected)


def test_engine_kwargs_not_cached():
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        return nogil + parallel + nopython

    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    df = DataFrame({"value": [0, 0, 0]})
    result = df.groupby(level=0).aggregate(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame({"value": [2.0, 2.0, 2.0]})
    tm.assert_frame_equal(result, expected)

    nogil = False
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby(level=0).aggregate(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame({"value": [1.0, 1.0, 1.0]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby("A").agg(
        numba_func, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
    tm.assert_frame_equal(result, expected)


def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        df.groupby(["A", "B"]).agg(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )


def test_multilabel_numba_vs_cython(numba_supported_reductions):
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = df.groupby(["A", "B"])
    res_agg = gb.agg(reduction, engine="numba", **kwargs)
    expected_agg = gb.agg(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(res_agg, expected_agg)
    # Test that calling the aggregation directly also works
    direct_res = getattr(gb, reduction)(engine="numba", **kwargs)
    direct_expected = getattr(gb, reduction)(engine="cython", **kwargs)
    tm.assert_frame_equal(direct_res, direct_expected)


def test_multilabel_udf_numba_vs_cython():
    pytest.importorskip("numba")
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = df.groupby(["A", "B"])
    result = gb.agg(lambda values, index: values.min(), engine="numba")
    expected = gb.agg(lambda x: x.min(), engine="cython")
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,675 @@
"""
test all other .agg behavior
"""

import datetime as dt
from functools import partial

import numpy as np
import pytest

from pandas.errors import SpecificationError

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    PeriodIndex,
    Series,
    date_range,
    period_range,
)
import pandas._testing as tm

from pandas.io.formats.printing import pprint_thing


def test_agg_partial_failure_raises():
    # GH#43741

    df = DataFrame(
        {
            "data1": np.random.default_rng(2).standard_normal(5),
            "data2": np.random.default_rng(2).standard_normal(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    grouped = df.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    with pytest.raises(TypeError, match="unsupported operand type"):
        grouped.agg([peak_to_peak])

    with pytest.raises(TypeError, match="unsupported operand type"):
        grouped.agg(peak_to_peak)


def test_agg_datetimes_mixed():
    data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    df1 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    data = [
        [
            row[0],
            (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
            row[2],
        ]
        for row in data
    ]

    df2 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate("sum")

    df2["weights"] = df1["value"] / df1["value"].sum()
    gb2 = df2.groupby("date").aggregate("sum")

    assert len(gb1) == len(gb2)


def test_agg_period_index():
    prng = period_range("2012-1-1", freq="M", periods=3)
    df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)), index=prng)
    rs = df.groupby(level=0).sum()
    assert isinstance(rs.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    s1 = Series(np.random.default_rng(2).random(len(index)), index=index)
    s2 = Series(np.random.default_rng(2).random(len(index)), index=index)
    df = DataFrame.from_dict({"s1": s1, "s2": s2})
    grouped = df.groupby(df.index.month)
    list(grouped)


def test_agg_dict_parameter_cast_result_dtypes():
    # GH 12821

    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="h"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # test for `first` function
    exp = df.loc[[0, 3, 4, 6]].set_index("class")
    grouped = df.groupby("class")
    tm.assert_frame_equal(grouped.first(), exp)
    tm.assert_frame_equal(grouped.agg("first"), exp)
    tm.assert_frame_equal(grouped.agg({"time": "first"}), exp)
    tm.assert_series_equal(grouped.time.first(), exp["time"])
    tm.assert_series_equal(grouped.time.agg("first"), exp["time"])

    # test for `last` function
    exp = df.loc[[0, 3, 4, 7]].set_index("class")
    grouped = df.groupby("class")
    tm.assert_frame_equal(grouped.last(), exp)
    tm.assert_frame_equal(grouped.agg("last"), exp)
    tm.assert_frame_equal(grouped.agg({"time": "last"}), exp)
    tm.assert_series_equal(grouped.time.last(), exp["time"])
    tm.assert_series_equal(grouped.time.agg("last"), exp["time"])

    # count
    exp = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.agg(len), exp)
    tm.assert_series_equal(grouped.time.size(), exp)

    exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.count(), exp)


def test_agg_cast_results_dtypes():
    # similar to GH12821
    # xref #11444
    u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
    v = list("aaabbbbbbccd")
    df = DataFrame({"X": v, "Y": u})

    result = df.groupby("X")["Y"].agg(len)
    expected = df.groupby("X")["Y"].count()
    tm.assert_series_equal(result, expected)


def test_aggregate_float64_no_int64():
    # see gh-11199
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})

    expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a"]].mean()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a", "c"]].mean()
    tm.assert_frame_equal(result, expected)


def test_aggregate_api_consistency():
    # GH 9052
    # make sure that the aggregations via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg(["sum", "mean"])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped[["D", "C"]].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})


def test_agg_dict_renaming_deprecation():
    # 15931
    df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        df.groupby("A").agg(
            {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
        )

    msg = r"Column\(s\) \['ma'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        df.groupby("A")[["B", "C"]].agg({"ma": "max"})

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        df.groupby("A").B.agg({"foo": "count"})
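
For context on the nested-renamer errors above: named aggregation is the supported way to aggregate and rename in one step. An illustrative sketch, not from the diff:

import pandas as pd

df = pd.DataFrame({"A": [1, 1, 2], "B": [3, 4, 5]})
# supported: keyword names map to (column, aggfunc) pairs
res = df.groupby("A").agg(foo=("B", "sum"), bar=("B", "max"))
# equivalent spelling with pd.NamedAgg, as used elsewhere in this commit
res2 = df.groupby("A").agg(foo=pd.NamedAgg(column="B", aggfunc="sum"))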


def test_agg_compat():
    # GH 12334
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"C": ["sum", "std"]})

    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"C": "sum", "D": "std"})


def test_agg_nested_dicts():
    # API change for disallowing these types of nested dicts
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})

    with pytest.raises(SpecificationError, match=msg):
        g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})

    # same name as the original column
    # GH9052
    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"result1": np.sum, "result2": np.mean})

    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"D": np.sum, "result2": np.mean})


def test_agg_item_by_item_raise_typeerror():
    df = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))

    def raiseException(df):
        pprint_thing("----------------------------------------")
        pprint_thing(df.to_string())
        raise TypeError("test")

    with pytest.raises(TypeError, match="test"):
        df.groupby(0).agg(raiseException)


def test_series_agg_multikey():
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

    result = grouped.agg("sum")
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)


def test_series_agg_multi_pure_python():
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    def bad(x):
        assert len(x.values.base) > 0
        return "foo"

    result = data.groupby(["A", "B"]).agg(bad)
    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)


def test_agg_consistency():
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        return np.percentile(a.dropna(), q=1)

    df = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 11),
                dt.date(2013, 2, 11),
            ],
        }
    )

    g = df.groupby("date")

    expected = g.agg([P1])
    expected.columns = expected.columns.levels[0]

    result = g.agg(P1)
    tm.assert_frame_equal(result, expected)


def test_agg_callables():
    # GH 7929
    df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class fn_class:
        def __call__(self, x):
            return sum(x)

    equiv_callables = [
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        fn_class(),
    ]

    expected = df.groupby("foo").agg("sum")
    for ecall in equiv_callables:
        warn = FutureWarning if ecall is sum or ecall is np.sum else None
        msg = "using DataFrameGroupBy.sum"
        with tm.assert_produces_warning(warn, match=msg):
            result = df.groupby("foo").agg(ecall)
        tm.assert_frame_equal(result, expected)


def test_agg_over_numpy_arrays():
    # GH 3788
    df = DataFrame(
        [
            [1, np.array([10, 20, 30])],
            [1, np.array([40, 50, 60])],
            [2, np.array([20, 30, 40])],
        ],
        columns=["category", "arraydata"],
    )
    gb = df.groupby("category")

    expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
    expected_index = Index([1, 2], name="category")
    expected_column = ["arraydata"]
    expected = DataFrame(expected_data, index=expected_index, columns=expected_column)

    alt = gb.sum(numeric_only=False)
    tm.assert_frame_equal(alt, expected)

    result = gb.agg("sum", numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # FIXME: the original version of this test called `gb.agg(sum)`
    # and that raises TypeError if `numeric_only=False` is passed


@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
    # with function that is not dtype-preserving
    dti = date_range("2012-01-01", periods=4, tz="UTC")
    if as_period:
        dti = dti.tz_localize(None).to_period("D")

    df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
    gb = df.groupby("a")

    # Case that _does_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0])
    expected = Series(dti[::2], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # Cases that do _not_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0].year)
    expected = Series([2012, 2012], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
    expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
    expected.index.name = "a"
    if as_period:
        expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
        expected.index.name = "a"
    tm.assert_series_equal(result, expected)


def test_agg_timezone_round_trip():
    # GH 15426
    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
    df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})

    result1 = df.groupby("a")["b"].agg("min").iloc[0]
    result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
    result3 = df.groupby("a")["b"].min().iloc[0]

    assert result1 == ts
    assert result2 == ts
    assert result3 == ts

    dates = [
        pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
    ]
    df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
    grouped = df.groupby("A")

    ts = df["B"].iloc[0]
    assert ts == grouped.nth(0)["B"].iloc[0]
    assert ts == grouped.head(1)["B"].iloc[0]
    assert ts == grouped.first()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]

    ts = df["B"].iloc[2]
    assert ts == grouped.last()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]


def test_sum_uint64_overflow():
    # see gh-14758
    # Convert to uint64 and don't overflow
    df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    df = df + 9223372036854775807

    index = Index(
        [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
    )
    expected = DataFrame(
        {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
        index=index,
        dtype=object,
    )

    expected.index.name = 0
    result = df.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    result2 = df.groupby(0).sum(numeric_only=True)
    expected2 = expected[[]]
    tm.assert_frame_equal(result2, expected2)
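
The numeric_only behaviour the comment above relies on, as an illustrative standalone sketch with a hypothetical frame, not from the diff:

import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "num": [1.0, 2.0, 3.0], "obj": list("xyz")})
# object-dtype columns are excluded from the aggregation with numeric_only=True
print(df.groupby("key").sum(numeric_only=True))  # only the "num" column survives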


@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby(["A", "B"]).aggregate(structure)
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    # Issue #18079
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby("A")["C"].aggregate(structure)
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)


def test_agg_category_nansum(observed):
    categories = ["a", "b", "c"]
    df = DataFrame(
        {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
    )
    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.groupby("A", observed=observed).B.agg(np.nansum)
    expected = Series(
        [3, 3, 0],
        index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
        name="B",
    )
    if observed:
        expected = expected[expected != 0]
    tm.assert_series_equal(result, expected)


def test_agg_list_like_func():
    # GH 18473
    df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
    grouped = df.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})
    expected = DataFrame(
        {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
    )
    tm.assert_frame_equal(result, expected)


def test_agg_lambda_with_timezone():
    # GH 23683
    df = DataFrame(
        {
            "tag": [1, 1],
            "date": [
                pd.Timestamp("2018-01-01", tz="UTC"),
                pd.Timestamp("2018-01-02", tz="UTC"),
            ],
        }
    )
    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
    expected = DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "err_cls",
    [
        NotImplementedError,
        RuntimeError,
        KeyError,
        IndexError,
        OSError,
        ValueError,
        ArithmeticError,
        AttributeError,
    ],
)
def test_groupby_agg_err_catching(err_cls):
    # make sure we suppress anything other than TypeError or AssertionError
    # in _python_agg_general

    # Use a non-standard EA to make sure we don't go down ndarray paths
    from pandas.tests.extension.decimal.array import (
        DecimalArray,
        make_data,
        to_decimal,
    )

    data = make_data()[:5]
    df = DataFrame(
        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
    )

    expected = Series(to_decimal([data[0], data[3]]))

    def weird_func(x):
        # weird function that raises something other than TypeError or
        # AssertionError in _python_agg_general
        if len(x) == 0:
            raise err_cls
        return x.iloc[0]

    result = df["decimals"].groupby(df["id1"]).agg(weird_func)
    tm.assert_series_equal(result, expected, check_names=False)
@@ -0,0 +1,208 @@
import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
    date_range,
)
from pandas.core.groupby.base import (
    reduction_kernels,
    transformation_kernels,
)


@pytest.fixture(params=[True, False])
def sort(request):
    return request.param


@pytest.fixture(params=[True, False])
def as_index(request):
    return request.param


@pytest.fixture(params=[True, False])
def dropna(request):
    return request.param


@pytest.fixture(params=[True, False])
def observed(request):
    return request.param


@pytest.fixture
def df():
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )


@pytest.fixture
def ts():
    return Series(
        np.random.default_rng(2).standard_normal(30),
        index=date_range("2000-01-01", periods=30, freq="B"),
    )


@pytest.fixture
def tsframe():
    return DataFrame(
        np.random.default_rng(2).standard_normal((30, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=date_range("2000-01-01", periods=30, freq="B"),
    )


@pytest.fixture
def three_group():
    return DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )


@pytest.fixture()
def slice_test_df():
    data = [
        [0, "a", "a0_at_0"],
        [1, "b", "b0_at_1"],
        [2, "a", "a1_at_2"],
        [3, "b", "b1_at_3"],
        [4, "c", "c0_at_4"],
        [5, "a", "a2_at_5"],
        [6, "a", "a3_at_6"],
        [7, "a", "a4_at_7"],
    ]
    df = DataFrame(data, columns=["Index", "Group", "Value"])
    return df.set_index("Index")


@pytest.fixture()
def slice_test_grouped(slice_test_df):
    return slice_test_df.groupby("Group", as_index=False)


@pytest.fixture(params=sorted(reduction_kernels))
def reduction_func(request):
    """
    yields the string names of all groupby reduction functions, one at a time.
    """
    return request.param


@pytest.fixture(params=sorted(transformation_kernels))
def transformation_func(request):
    """yields the string names of all groupby transformation functions."""
    return request.param


@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels))
def groupby_func(request):
    """yields both aggregation and transformation functions."""
    return request.param
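
An illustrative sketch of how these kernel fixtures are typically consumed; the test below is hypothetical and assumes the get_groupby_method_args helper from the first file in this commit is importable:

def test_reduction_smoke(df, reduction_func):
    # runs once per reduction kernel; the helper supplies required positional args
    args = get_groupby_method_args(reduction_func, df)
    getattr(df.groupby("A"), reduction_func)(*args)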


@pytest.fixture(params=[True, False])
def parallel(request):
    """parallel keyword argument for numba.jit"""
    return request.param


# Can parameterize nogil & nopython over True | False, but limiting per
# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472


@pytest.fixture(params=[False])
def nogil(request):
    """nogil keyword argument for numba.jit"""
    return request.param


@pytest.fixture(params=[True])
def nopython(request):
    """nopython keyword argument for numba.jit"""
    return request.param


@pytest.fixture(
    params=[
        ("mean", {}),
        ("var", {"ddof": 1}),
        ("var", {"ddof": 0}),
        ("std", {"ddof": 1}),
        ("std", {"ddof": 0}),
        ("sum", {}),
        ("min", {}),
        ("max", {}),
        ("sum", {"min_count": 2}),
        ("min", {"min_count": 2}),
        ("max", {"min_count": 2}),
    ],
    ids=[
        "mean",
        "var_1",
        "var_0",
        "std_1",
        "std_0",
        "sum",
        "min",
        "max",
        "sum-min_count",
        "min-min_count",
        "max-min_count",
    ],
)
def numba_supported_reductions(request):
    """reductions supported with engine='numba'"""
    return request.param
@@ -0,0 +1,24 @@
import numpy as np

from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm


def test_corrwith_with_1_axis():
    # GH 47723
    df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
    gb = df.groupby("a")

    msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = gb.corrwith(df, axis=1)
    index = Index(
        data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
        name=("a", None),
    )
    expected = Series([np.nan] * 6, index=index)
    tm.assert_series_equal(result, expected)
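
For contrast with the deprecated axis=1 path above, a hedged sketch of the default column-wise form (the data is illustrative and nothing here is asserted by this commit):

import pandas as pd

df = pd.DataFrame(
    {"a": [1, 1, 2, 2], "b": [3.0, 7.0, 4.0, 9.0], "c": [1.0, 2.0, 3.0, 4.0]}
)
# axis=0 (the default) correlates each group's columns with the matching
# columns of `other`, yielding one row of correlations per group
per_group = df.groupby("a").corrwith(df[["b", "c"]])
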
@@ -0,0 +1,297 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm


def test_apply_describe_bug(multiindex_dataframe_random_data):
    grouped = multiindex_dataframe_random_data.groupby(level="first")
    grouped.describe()  # it works!


def test_series_describe_multikey():
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
    tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
    tm.assert_series_equal(result["min"], grouped.min(), check_names=False)


def test_series_describe_single():
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(lambda x: x.describe())
    expected = grouped.describe().stack(future_stack=True)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
    # GH#49256
    df = DataFrame(
        {
            "key1": ["one", "two", "two", "three", "two"],
            "key2": ["one", "two", "two", "three", "two"],
            "foo2": [1, 2, 4, 4, 6],
        }
    )
    gb = df.groupby(keys, as_index=as_index)["foo2"]
    result = gb.describe()
    expected = DataFrame(
        {
            "key1": ["one", "three", "two"],
            "count": [1.0, 1.0, 3.0],
            "mean": [1.0, 4.0, 4.0],
            "std": [np.nan, np.nan, 2.0],
            "min": [1.0, 4.0, 2.0],
            "25%": [1.0, 4.0, 3.0],
            "50%": [1.0, 4.0, 4.0],
            "75%": [1.0, 4.0, 5.0],
            "max": [1.0, 4.0, 6.0],
        }
    )
    if len(keys) == 2:
        expected.insert(1, "key2", expected["key1"])
    if as_index:
        expected = expected.set_index(keys)
    tm.assert_frame_equal(result, expected)


def test_frame_describe_multikey(tsframe):
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    desc_groups = []
    for col in tsframe:
        group = grouped[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        group_col = MultiIndex(
            levels=[[col], group.columns],
            codes=[[0] * len(group.columns), range(len(group.columns))],
        )
        group = DataFrame(group.values, columns=group_col, index=group.index)
        desc_groups.append(group)
    expected = pd.concat(desc_groups, axis=1)
    tm.assert_frame_equal(result, expected)

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
    result = groupedT.describe()
    expected = tsframe.describe().T
    # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
    expected.index = MultiIndex(
        levels=[[0, 1], expected.index],
        codes=[[0, 0, 1, 1], range(len(expected.index))],
    )
    tm.assert_frame_equal(result, expected)


def test_frame_describe_tupleindex():
    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame(
        {
            "x": [1, 2, 3, 4, 5] * 3,
            "y": [10, 20, 30, 40, 50] * 3,
            "z": [100, 200, 300, 400, 500] * 3,
        }
    )
    df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={"k": "key"})
    msg = "Names should be list-like for a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df1.groupby("k").describe()
    with pytest.raises(ValueError, match=msg):
        df2.groupby("key").describe()


def test_frame_describe_unstacked_format():
    # GH 4792
    prices = {
        Timestamp("2011-01-06 10:59:05", tz=None): 24990,
        Timestamp("2011-01-06 12:43:33", tz=None): 25499,
        Timestamp("2011-01-06 12:54:09", tz=None): 25499,
    }
    volumes = {
        Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
        Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
        Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
    }
    df = DataFrame({"PRICE": prices, "VOLUME": volumes})
    result = df.groupby("PRICE").VOLUME.describe()
    data = [
        df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
        df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
    ]
    expected = DataFrame(
        data,
        index=Index([24990, 25499], name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings(
    "ignore:"
    "indexing past lexsort depth may impact performance:"
    "pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
    # GH 35314
    df = DataFrame(
        {
            "a1": [99, 99, 99, 88, 88, 88],
            "a2": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        columns=["a1", "a2", "b", "b"],
        copy=False,
    )
    if keys == ["a1"]:
        df = df.drop(columns="a2")

    expected = (
        DataFrame.from_records(
            [
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
            ],
        )
        .set_index([0, 1])
        .T
    )
    expected.columns.names = [None, None]
    if len(keys) == 2:
        expected.index = MultiIndex(
            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
        )
    else:
        expected.index = Index([88, 99], name="a1")

    if not as_index:
        expected = expected.reset_index()

    result = df.groupby(keys, as_index=as_index).describe()

    tm.assert_frame_equal(result, expected)


def test_describe_duplicate_columns():
    # GH#50806
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]
    gb = df.groupby(df[1])
    result = gb.describe(percentiles=[])

    columns = ["count", "mean", "std", "min", "50%", "max"]
    frames = [
        DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
        for val in (0.0, 2.0, 3.0)
    ]
    expected = pd.concat(frames, axis=1)
    expected.columns = MultiIndex(
        levels=[[0, 2], columns],
        codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
    )
    expected.index.names = [1]
    tm.assert_frame_equal(result, expected)


class TestGroupByNonCythonPaths:
    # GH#5610 non-cython calls should not include the grouper
    # Tests for code not expected to go through cython paths.

    @pytest.fixture
    def df(self):
        df = DataFrame(
            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
            columns=["A", "B", "C"],
        )
        return df

    @pytest.fixture
    def gb(self, df):
        gb = df.groupby("A")
        return gb

    @pytest.fixture
    def gni(self, df):
        gni = df.groupby("A", as_index=False)
        return gni

    def test_describe(self, df, gb, gni):
        # describe
        expected_index = Index([1, 3], name="A")
        expected_col = MultiIndex(
            levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
            codes=[[0] * 8, list(range(8))],
        )
        expected = DataFrame(
            [
                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            ],
            index=expected_index,
            columns=expected_col,
        )
        result = gb.describe()
        tm.assert_frame_equal(result, expected)

        expected = expected.reset_index()
        result = gni.describe()
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
    ],
)
def test_groupby_empty_dataset(dtype, kwargs):
    # GH#41575
    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
    df["B"] = df["B"].astype(int)
    df["C"] = df["C"].astype(float)

    result = df.iloc[:0].groupby("A").describe(**kwargs)
    expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:0].groupby("A").B.describe(**kwargs)
    expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
    expected.index = Index([])
    tm.assert_frame_equal(result, expected)
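
The common thread in these tests is the shape of `describe` output: one row per group and a (column, statistic) MultiIndex across the columns. A minimal illustration, not part of the commit:

import pandas as pd

df = pd.DataFrame({"g": ["x", "x", "y"], "v": [1.0, 3.0, 5.0]})
desc = df.groupby("g").describe()
# columns form a MultiIndex such as ("v", "count"), ("v", "mean"), ...
v_means = desc[("v", "mean")]  # Series indexed by group label
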
@@ -0,0 +1,255 @@
import numpy as np
import pytest

from pandas import (
    DataFrame,
    NaT,
    Series,
    Timedelta,
    Timestamp,
    date_range,
)
import pandas._testing as tm


def test_group_shift_with_null_key():
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g._grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    df = DataFrame(
        [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
        dtype=float,
        columns=["A", "B", "Z"],
        index=None,
    )
    g = df.groupby(["A", "B"])

    expected = DataFrame(
        [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
        dtype=float,
        columns=["Z"],
        index=None,
    )
    result = g.shift(-1)

    tm.assert_frame_equal(result, expected)


def test_group_shift_with_fill_value():
    # GH #24128
    n_rows = 24
    df = DataFrame(
        [(i % 12, i % 3, i) for i in range(n_rows)],
        dtype=float,
        columns=["A", "B", "Z"],
        index=None,
    )
    g = df.groupby(["A", "B"])

    expected = DataFrame(
        [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
        dtype=float,
        columns=["Z"],
        index=None,
    )
    result = g.shift(-1, fill_value=0)

    tm.assert_frame_equal(result, expected)


def test_group_shift_lose_timezone():
    # GH 30134
    now_dt = Timestamp.utcnow().as_unit("ns")
    df = DataFrame({"a": [1, 1], "date": now_dt})
    result = df.groupby("a").shift(0).iloc[0]
    expected = Series({"date": now_dt}, name=result.name)
    tm.assert_series_equal(result, expected)


def test_group_diff_real_series(any_real_numpy_dtype):
    df = DataFrame(
        {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
        dtype=any_real_numpy_dtype,
    )
    result = df.groupby("a")["b"].diff()
    exp_dtype = "float"
    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
        exp_dtype = "float32"
    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
    tm.assert_series_equal(result, expected)


def test_group_diff_real_frame(any_real_numpy_dtype):
    df = DataFrame(
        {
            "a": [1, 2, 3, 3, 2],
            "b": [1, 2, 3, 4, 5],
            "c": [1, 2, 3, 4, 6],
        },
        dtype=any_real_numpy_dtype,
    )
    result = df.groupby("a").diff()
    exp_dtype = "float"
    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
        exp_dtype = "float32"
    expected = DataFrame(
        {
            "b": [np.nan, np.nan, np.nan, 1.0, 3.0],
            "c": [np.nan, np.nan, np.nan, 1.0, 4.0],
        },
        dtype=exp_dtype,
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        [
            Timestamp("2013-01-01"),
            Timestamp("2013-01-02"),
            Timestamp("2013-01-03"),
        ],
        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
    ],
)
def test_group_diff_datetimelike(data, unit):
    df = DataFrame({"a": [1, 2, 2], "b": data})
    df["b"] = df["b"].dt.as_unit(unit)
    result = df.groupby("a")["b"].diff()
    expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit)
    tm.assert_series_equal(result, expected)


def test_group_diff_bool():
    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
    result = df.groupby("a")["b"].diff()
    expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
    tm.assert_series_equal(result, expected)


def test_group_diff_object_raises(object_dtype):
    df = DataFrame(
        {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
    )
    with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
        df.groupby("a")["b"].diff()


def test_empty_shift_with_fill():
    # GH 41264, single-index check
    df = DataFrame(columns=["a", "b", "c"])
    shifted = df.groupby(["a"]).shift(1)
    shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0)
    tm.assert_frame_equal(shifted, shifted_with_fill)
    tm.assert_index_equal(shifted.index, shifted_with_fill.index)


def test_multindex_empty_shift_with_fill():
    # GH 41264, multi-index check
    df = DataFrame(columns=["a", "b", "c"])
    shifted = df.groupby(["a", "b"]).shift(1)
    shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0)
    tm.assert_frame_equal(shifted, shifted_with_fill)
    tm.assert_index_equal(shifted.index, shifted_with_fill.index)


def test_shift_periods_freq():
    # GH 54093
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    df = DataFrame(data, index=date_range(start="20100101", periods=6))
    result = df.groupby(df.index).shift(periods=-2, freq="D")
    expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6))
    tm.assert_frame_equal(result, expected)


def test_shift_deprecate_freq_and_fill_value():
    # GH 53832
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    df = DataFrame(data, index=date_range(start="20100101", periods=6))
    msg = (
        "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1")


def test_shift_disallow_suffix_if_periods_is_int():
    # GH#44424
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    df = DataFrame(data)
    msg = "Cannot specify `suffix` if `periods` is an int."
    with pytest.raises(ValueError, match=msg):
        df.groupby("b").shift(1, suffix="fails")


def test_group_shift_with_multiple_periods():
    # GH#44424
    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})

    shifted_df = df.groupby("b")[["a"]].shift([0, 1])
    expected_df = DataFrame(
        {"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]}
    )
    tm.assert_frame_equal(shifted_df, expected_df)

    # series
    shifted_series = df.groupby("b")["a"].shift([0, 1])
    tm.assert_frame_equal(shifted_series, expected_df)


def test_group_shift_with_multiple_periods_and_freq():
    # GH#44424
    df = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
        index=date_range("1/1/2000", periods=5, freq="h"),
    )
    shifted_df = df.groupby("b")[["a"]].shift(
        [0, 1],
        freq="h",
    )
    expected_df = DataFrame(
        {
            "a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan],
            "a_1": [
                np.nan,
                1.0,
                2.0,
                3.0,
                4.0,
                5.0,
            ],
        },
        index=date_range("1/1/2000", periods=6, freq="h"),
    )
    tm.assert_frame_equal(shifted_df, expected_df)


def test_group_shift_with_multiple_periods_and_fill_value():
    # GH#44424
    df = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
    )
    shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1)
    expected_df = DataFrame(
        {"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]},
    )
    tm.assert_frame_equal(shifted_df, expected_df)


def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
    # GH#44424
    df = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
        index=date_range("1/1/2000", periods=5, freq="h"),
    )
    msg = (
        "Passing a 'freq' together with a 'fill_value' silently ignores the "
        "fill_value"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
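
A compact reminder of the semantics exercised above (illustrative only, not part of the commit): `shift` and `diff` operate within each group, so values never leak across group boundaries.

import pandas as pd

ser = pd.Series([1, 2, 3, 4], index=["a", "a", "b", "b"])
shifted = ser.groupby(level=0).shift(1)  # [NaN, 1.0, NaN, 3.0]
diffed = ser.groupby(level=0).diff()     # [NaN, 1.0, NaN, 1.0]
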
@@ -0,0 +1,78 @@
import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm


@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly increasing (T), strictly decreasing (F),
        # abs val increasing (F), non-strictly increasing (T)
        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
            [True, False, True, False],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_increasing(in_vals, out_vals):
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }
    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_increasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly decreasing (T), strictly increasing (F),
        # abs val decreasing (F), non-strictly decreasing (T)
        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
            [True, True, False, True],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }

    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_decreasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)
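
Both properties reduce each group to a single boolean; a minimal illustration (data chosen here for clarity, not taken from the commit):

import pandas as pd

df = pd.DataFrame({"B": ["a", "a", "b", "b"], "C": [1, 2, 2, 1]})
flags = df.groupby("B")["C"].is_monotonic_increasing
# "a" -> True (1 <= 2), "b" -> False (2 > 1)
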
@@ -0,0 +1,115 @@
import numpy as np
import pytest

from pandas import (
    MultiIndex,
    Series,
    date_range,
)
import pandas._testing as tm


def test_nlargest():
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list("a" * 5 + "b" * 5))
    gb = a.groupby(b)
    r = gb.nlargest(3)
    e = Series(
        [7, 5, 3, 10, 9, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
    )
    tm.assert_series_equal(r, e)

    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series(
        [3, 2, 1, 3, 3, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
    )
    tm.assert_series_equal(gb.nlargest(3, keep="last"), e)


def test_nlargest_mi_grouper():
    # see gh-21411
    npr = np.random.default_rng(2)

    dts = date_range("20180101", periods=10)
    iterables = [dts, ["one", "two"]]

    idx = MultiIndex.from_product(iterables, names=["first", "second"])
    s = Series(npr.standard_normal(20), index=idx)

    result = s.groupby("first").nlargest(1)

    exp_idx = MultiIndex.from_tuples(
        [
            (dts[0], dts[0], "one"),
            (dts[1], dts[1], "one"),
            (dts[2], dts[2], "one"),
            (dts[3], dts[3], "two"),
            (dts[4], dts[4], "one"),
            (dts[5], dts[5], "one"),
            (dts[6], dts[6], "one"),
            (dts[7], dts[7], "one"),
            (dts[8], dts[8], "one"),
            (dts[9], dts[9], "one"),
        ],
        names=["first", "first", "second"],
    )

    exp_values = [
        0.18905338179353307,
        -0.41306354339189344,
        1.799707382720902,
        0.7738065867276614,
        0.28121066979764925,
        0.9775674511260357,
        -0.3288239040579627,
        0.45495807124085547,
        0.5452887139646817,
        0.12682784711186987,
    ]

    expected = Series(exp_values, index=exp_idx)
    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)


def test_nsmallest():
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list("a" * 5 + "b" * 5))
    gb = a.groupby(b)
    r = gb.nsmallest(3)
    e = Series(
        [1, 2, 3, 0, 4, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
    )
    tm.assert_series_equal(r, e)

    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series(
        [0, 1, 1, 0, 1, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
    )
    tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)


@pytest.mark.parametrize(
    "data, groups",
    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
def test_nlargest_and_smallest_noop(data, groups, dtype, method):
    # GH 15272, GH 16345, GH 29129
    # Test nlargest/smallest when it results in a noop,
    # i.e. input is sorted and group size <= n
    if dtype is not None:
        data = np.array(data, dtype=dtype)
    if method == "nlargest":
        data = list(reversed(data))
    ser = Series(data, name="a")
    result = getattr(ser.groupby(groups), method)(n=2)
    expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups
    expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a")
    tm.assert_series_equal(result, expected)
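
The expected indexes above all follow one rule: `nlargest`/`nsmallest` prepend the group key to the original label, giving a two-level index. A small illustration (data not taken from the commit):

import pandas as pd

ser = pd.Series([1, 3, 2, 5], index=[10, 11, 12, 13])
top = ser.groupby(["a", "a", "b", "b"]).nlargest(1)
# index pairs are (group key, original label): ("a", 11) -> 3, ("b", 13) -> 5
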
@@ -0,0 +1,921 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    isna,
)
import pandas._testing as tm


def test_first_last_nth(df):
    # tests for first / last / nth
    grouped = df.groupby("A")
    first = grouped.first()
    expected = df.loc[[1, 0], ["B", "C", "D"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(first, expected)

    nth = grouped.nth(0)
    expected = df.loc[[0, 1]]
    tm.assert_frame_equal(nth, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ["B", "C", "D"]]
    expected.index = Index(["bar", "foo"], name="A")
    tm.assert_frame_equal(last, expected)

    nth = grouped.nth(-1)
    expected = df.iloc[[5, 7]]
    tm.assert_frame_equal(nth, expected)

    nth = grouped.nth(1)
    expected = df.iloc[[2, 3]]
    tm.assert_frame_equal(nth, expected)

    # it works!
    grouped["B"].first()
    grouped["B"].last()
    grouped["B"].nth(0)

    df = df.copy()
    df.loc[df["A"] == "foo", "B"] = np.nan
    grouped = df.groupby("A")
    assert isna(grouped["B"].first()["foo"])
    assert isna(grouped["B"].last()["foo"])
    assert isna(grouped["B"].nth(0).iloc[0])

    # v0.14.0 whatsnew
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")
    result = g.first()
    expected = df.iloc[[1, 2]].set_index("A")
    tm.assert_frame_equal(result, expected)

    expected = df.iloc[[1, 2]]
    result = g.nth(0, dropna="any")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_na_object(method, nulls_fixture):
    # https://github.com/pandas-dev/pandas/issues/32123
    groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
    result = getattr(groups, method)()

    if method == "first":
        values = [1, 3]
    else:
        values = [2, 3]

    values = np.array(values, dtype=result["b"].dtype)
    idx = Index([1, 2], name="a")
    expected = DataFrame({"b": values}, index=idx)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index", [0, -1])
def test_nth_with_na_object(index, nulls_fixture):
    # https://github.com/pandas-dev/pandas/issues/32123
    df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
    groups = df.groupby("a")
    result = groups.nth(index)
    expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_None(method):
    # https://github.com/pandas-dev/pandas/issues/32800
    # None should be preserved as object dtype
    df = DataFrame.from_dict({"id": ["a"], "value": [None]})
    groups = df.groupby("id", as_index=False)
    result = getattr(groups, method)()

    tm.assert_frame_equal(result, df)


@pytest.mark.parametrize("method", ["first", "last"])
@pytest.mark.parametrize(
    "df, expected",
    [
        (
            DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
            DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
        ),
        (
            DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
            DataFrame({"value": [None]}, index=Index(["a"], name="id")),
        ),
    ],
)
def test_first_last_with_None_expanded(method, df, expected):
    # GH 32800, 38286
    result = getattr(df.groupby("id"), method)()
    tm.assert_frame_equal(result, expected)


def test_first_last_nth_dtypes():
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
        }
    )
    df["E"] = True
    df["F"] = 1

    # tests for first / last / nth
    grouped = df.groupby("A")
    first = grouped.first()
    expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(first, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(last, expected)

    nth = grouped.nth(1)
    expected = df.iloc[[2, 3]]
    tm.assert_frame_equal(nth, expected)


def test_first_last_nth_dtypes2():
    # GH 2763, first/last shifting dtypes
    idx = list(range(10))
    idx.append(9)
    ser = Series(data=range(11), index=idx, name="IntCol")
    assert ser.dtype == "int64"
    f = ser.groupby(level=0).first()
    assert f.dtype == "int64"


def test_first_last_nth_nan_dtype():
    # GH 33591
    df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
    grouped = df.groupby("data")

    expected = df.set_index("data").nans
    tm.assert_series_equal(grouped.nans.first(), expected)
    tm.assert_series_equal(grouped.nans.last(), expected)

    expected = df.nans
    tm.assert_series_equal(grouped.nans.nth(-1), expected)
    tm.assert_series_equal(grouped.nans.nth(0), expected)


def test_first_strings_timestamps():
    # GH 11244
    test = DataFrame(
        {
            Timestamp("2012-01-01 00:00:00"): ["a", "b"],
            Timestamp("2012-01-02 00:00:00"): ["c", "d"],
            "name": ["e", "e"],
            "aaaa": ["f", "g"],
        }
    )
    result = test.groupby("name").first()
    expected = DataFrame(
        [["a", "c", "f"]],
        columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
        index=Index(["e"], name="name"),
    )
    tm.assert_frame_equal(result, expected)


def test_nth():
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    gb = df.groupby("A")

    tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]])
    tm.assert_frame_equal(gb.nth(1), df.iloc[[1]])
    tm.assert_frame_equal(gb.nth(2), df.loc[[]])
    tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]])
    tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]])
    tm.assert_frame_equal(gb.nth(-3), df.loc[[]])
    tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
    tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
    tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])

    tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]])
    tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]])

    tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0])
    tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0])


def test_nth2():
    # out of bounds, regression from 0.13.1
    # GH 6621
    df = DataFrame(
        {
            "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
            "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
            "two": {
                0: 1.5456590000000001,
                1: -0.070345000000000005,
                2: -2.4004539999999999,
                3: 0.46206000000000003,
                4: 0.52350799999999997,
            },
            "one": {
                0: 0.56573799999999996,
                1: -0.9742360000000001,
                2: 1.033801,
                3: -0.78543499999999999,
                4: 0.70422799999999997,
            },
        }
    ).set_index(["color", "food"])

    result = df.groupby(level=0, as_index=False).nth(2)
    expected = df.iloc[[-1]]
    tm.assert_frame_equal(result, expected)

    result = df.groupby(level=0, as_index=False).nth(3)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)


def test_nth3():
    # GH 7559
    # from the vbench
    df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64")
    ser = df[1]
    gb = df[0]
    expected = ser.groupby(gb).first()
    expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0])
    tm.assert_series_equal(expected2, expected, check_names=False)
    assert expected.name == 1
    assert expected2.name == 1

    # validate first
    v = ser[gb == 1].iloc[0]
    assert expected.iloc[0] == v
    assert expected2.iloc[0] == v

    with pytest.raises(ValueError, match="For a DataFrame"):
        ser.groupby(gb, sort=False).nth(0, dropna=True)


def test_nth4():
    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    gb = df.groupby("A")
    result = gb.B.nth(0, dropna="all")
    expected = df.B.iloc[[1, 2]]
    tm.assert_series_equal(result, expected)


def test_nth5():
    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
    gb = df.groupby("A")

    tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]])
    tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]])
    tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]])
    tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]])
    tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]])
    tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]])
    tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]])
    tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]])
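
Before the calendar-based cases below, a sketch of the list form of `nth` driven by test_nth5 (illustrative data): positions are resolved per group and duplicates collapse, so [0, -1] on a one-row group returns that row once.

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1, 1, 5], "B": [np.nan, 4.0, 6.0]})
first_and_last = df.groupby("A").nth([0, -1])
# group 1 contributes rows 0 and 1; group 5 has a single row, counted once
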
def test_nth_bdays(unit):
    business_dates = pd.date_range(
        start="4/1/2014", end="6/30/2014", freq="B", unit=unit
    )
    df = DataFrame(1, index=business_dates, columns=["a", "b"])
    # get the first, fourth and last two business days for each month
    key = [df.index.year, df.index.month]
    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
    expected_dates = pd.to_datetime(
        [
            "2014/4/1",
            "2014/4/4",
            "2014/4/29",
            "2014/4/30",
            "2014/5/1",
            "2014/5/6",
            "2014/5/29",
            "2014/5/30",
            "2014/6/2",
            "2014/6/5",
            "2014/6/27",
            "2014/6/30",
        ]
    ).as_unit(unit)
    expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
    tm.assert_frame_equal(result, expected)


def test_nth_multi_grouper(three_group):
    # PR 9090, related to issue 8979
    # test nth on multiple groupers
    grouped = three_group.groupby(["A", "B"])
    result = grouped.nth(0)
    expected = three_group.iloc[[0, 3, 4, 7]]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data, expected_first, expected_last",
    [
        (
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
        ),
        (
            {
                "id": ["A", "B", "A"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                ],
                "foo": [1, 2, 3],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [1, 2],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [3, 2],
            },
        ),
    ],
)
def test_first_last_tz(data, expected_first, expected_last):
    # GH15884
    # Test that the timezone is retained when calling first
    # or last on groupby with as_index=False

    df = DataFrame(data)

    result = df.groupby("id", as_index=False).first()
    expected = DataFrame(expected_first)
    cols = ["id", "time", "foo"]
    tm.assert_frame_equal(result[cols], expected[cols])

    result = df.groupby("id", as_index=False)["time"].first()
    tm.assert_frame_equal(result, expected[["id", "time"]])

    result = df.groupby("id", as_index=False).last()
    expected = DataFrame(expected_last)
    cols = ["id", "time", "foo"]
    tm.assert_frame_equal(result[cols], expected[cols])

    result = df.groupby("id", as_index=False)["time"].last()
    tm.assert_frame_equal(result, expected[["id", "time"]])


@pytest.mark.parametrize(
    "method, ts, alpha",
    [
        ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
        ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
    ],
)
def test_first_last_tz_multi_column(method, ts, alpha, unit):
    # GH 21603
    category_string = Series(list("abc")).astype("category")
    dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
    df = DataFrame(
        {
            "group": [1, 1, 2],
            "category_string": category_string,
            "datetimetz": dti,
        }
    )
    result = getattr(df.groupby("group"), method)()
    expected = DataFrame(
        {
            "category_string": pd.Categorical(
                [alpha, "c"], dtype=category_string.dtype
            ),
            "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
        },
        index=Index([1, 2], name="group"),
    )
    expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "values",
    [
        pd.array([True, False], dtype="boolean"),
        pd.array([1, 2], dtype="Int64"),
        pd.to_datetime(["2020-01-01", "2020-02-01"]),
        pd.to_timedelta([1, 2], unit="D"),
    ],
)
@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
def test_first_last_extension_array_keeps_dtype(values, function):
    # https://github.com/pandas-dev/pandas/issues/33071
    # https://github.com/pandas-dev/pandas/issues/32194
    df = DataFrame({"a": [1, 2], "b": values})
    grouped = df.groupby("a")
    idx = Index([1, 2], name="a")
    expected_series = Series(values, name="b", index=idx)
    expected_frame = DataFrame({"b": values}, index=idx)

    result_series = getattr(grouped["b"], function)()
    tm.assert_series_equal(result_series, expected_series)

    result_frame = grouped.agg({"b": function})
    tm.assert_frame_equal(result_frame, expected_frame)


def test_nth_multi_index_as_expected():
    # PR 9090, related to issue 8979
    # test nth on MultiIndex
    three_group = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
        }
    )
    grouped = three_group.groupby(["A", "B"])
    result = grouped.nth(0)
    expected = three_group.iloc[[0, 3, 4, 7]]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "op, n, expected_rows",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A", as_index=as_index)
    expected = df.iloc[expected_rows]
    if columns is not None:
        g = g[columns]
        expected = expected[columns]
    result = getattr(g, op)(n)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "op, n, expected_cols",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
def test_groupby_head_tail_axis_1(op, n, expected_cols):
    # GH 9772
    df = DataFrame(
        [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
    )
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        g = df.groupby([0, 0, 1], axis=1)
    expected = df.iloc[:, expected_cols]
    result = getattr(g, op)(n)
    tm.assert_frame_equal(result, expected)


def test_group_selection_cache():
    # GH 12839 nth, head, and tail should return same result consistently
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    expected = df.iloc[[0, 2]]

    g = df.groupby("A")
    result1 = g.head(n=2)
    result2 = g.nth(0)
    tm.assert_frame_equal(result1, df)
    tm.assert_frame_equal(result2, expected)

    g = df.groupby("A")
    result1 = g.tail(n=2)
    result2 = g.nth(0)
    tm.assert_frame_equal(result1, df)
    tm.assert_frame_equal(result2, expected)

    g = df.groupby("A")
    result1 = g.nth(0)
    result2 = g.head(n=2)
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, df)

    g = df.groupby("A")
    result1 = g.nth(0)
    result2 = g.tail(n=2)
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, df)


def test_nth_empty():
    # GH 16064
    df = DataFrame(index=[0], columns=["a", "b", "c"])
    result = df.groupby("a").nth(10)
    expected = df.iloc[:0]
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["a", "b"]).nth(10)
    expected = df.iloc[:0]
    tm.assert_frame_equal(result, expected)


def test_nth_column_order():
    # GH 20760
    # Check that nth preserves column order
    df = DataFrame(
        [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
        columns=["A", "C", "B"],
    )
    result = df.groupby("A").nth(0)
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.groupby("A").nth(-1, dropna="any")
    expected = df.iloc[[1, 4]]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
    # GH 26011
    df = DataFrame(
        {
            "a": [np.nan, "a", np.nan, "b", np.nan],
            "b": [0, 2, 4, 6, 8],
            "c": [1, 3, 5, 7, 9],
        }
    )
    result = df.groupby("a").nth(0, dropna=dropna)
    expected = df.iloc[[1, 3]]

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper_series(dropna):
    # GH 26454
    df = DataFrame(
        {
            "a": [np.nan, "a", np.nan, "b", np.nan],
            "b": [0, 2, 4, 6, 8],
        }
    )
    result = df.groupby("a")["b"].nth(0, dropna=dropna)
    expected = df["b"].iloc[[1, 3]]

    tm.assert_series_equal(result, expected)


def test_first_categorical_and_datetime_data_nat():
    # GH 20520
    df = DataFrame(
        {
            "group": ["first", "first", "second", "third", "third"],
            "time": 5 * [np.datetime64("NaT")],
            "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
        }
    )
    result = df.groupby("group").first()
    expected = DataFrame(
        {
            "time": 3 * [np.datetime64("NaT")],
            "categories": Series(["a", "c", "a"]).astype(
                pd.CategoricalDtype(["a", "b", "c"])
            ),
        }
    )
    expected.index = Index(["first", "second", "third"], name="group")
    tm.assert_frame_equal(result, expected)


def test_first_multi_key_groupby_categorical():
    # GH 22512
    df = DataFrame(
        {
            "A": [1, 1, 1, 2, 2],
            "B": [100, 100, 200, 100, 100],
            "C": ["apple", "orange", "mango", "mango", "orange"],
            "D": ["jupiter", "mercury", "mars", "venus", "venus"],
        }
    )
    df = df.astype({"D": "category"})
    result = df.groupby(by=["A", "B"]).first()
    expected = DataFrame(
        {
            "C": ["apple", "mango", "mango"],
            "D": Series(["jupiter", "mars", "venus"]).astype(
                pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
            ),
        }
    )
    expected.index = MultiIndex.from_tuples(
        [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
    # GH29645
    expected = Series(["y"])
    data = Series(
        [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
        index=[0, 0, 0, 0, 0],
    ).groupby(level=0)

    if method == "nth":
        result = getattr(data, method)(3)
    else:
        result = getattr(data, method)()

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [slice(None, 3, 2), [0, 1, 4, 5]],
        [slice(None, -2), [0, 2, 5]],
        [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
        [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
    # Test slices GH #42947

    result = slice_test_grouped.nth[arg]
    equivalent = slice_test_grouped.nth(arg)
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(equivalent, expected)


def test_nth_indexed(slice_test_df, slice_test_grouped):
    # Test index notation GH #44688

    result = slice_test_grouped.nth[0, 1, -2:]
    equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(equivalent, expected)


def test_invalid_argument(slice_test_grouped):
    # Test for error on invalid argument

    with pytest.raises(TypeError, match="Invalid index"):
        slice_test_grouped.nth(3.14)


def test_negative_step(slice_test_grouped):
    # Test for error on negative slice step

    with pytest.raises(ValueError, match="Invalid step"):
        slice_test_grouped.nth(slice(None, None, -1))


def test_np_ints(slice_test_df, slice_test_grouped):
    # Test np ints work

    result = slice_test_grouped.nth(np.array([0, 1]))
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
    tm.assert_frame_equal(result, expected)


def test_groupby_nth_with_column_axis():
    # GH43926
    df = DataFrame(
        [
            [4, 5, 6],
            [8, 8, 7],
        ],
        index=["z", "y"],
        columns=["C", "B", "A"],
    )
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(df.iloc[1], axis=1)
    result = gb.nth(0)
    expected = df.iloc[:, [0, 2]]
    tm.assert_frame_equal(result, expected)


def test_groupby_nth_interval():
    # GH#24205
    idx_result = MultiIndex(
        [
            pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
            pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
        ],
        [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
    )
    df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result)
    result = df_result.groupby(level=[0, 1], observed=False).nth(0)
    val_expected = [0, 1, 3]
    idx_expected = MultiIndex(
        [
            pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
            pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
        ],
        [[0, 0, 1], [0, 1, 0]],
    )
    expected = DataFrame(val_expected, index=idx_expected, columns=["col"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "start, stop, expected_values, expected_columns",
    [
        (None, None, [0, 1, 2, 3, 4], list("ABCDE")),
        (None, 1, [0, 3], list("AD")),
        (None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
        (None, -1, [0, 1, 3], list("ABD")),
        (1, None, [1, 2, 4], list("BCE")),
        (1, -1, [1], list("B")),
        (-1, None, [2, 4], list("CE")),
        (-1, 2, [4], list("E")),
    ],
)
@pytest.mark.parametrize("method", ["call", "index"])
def test_nth_slices_with_column_axis(
    start, stop, expected_values, expected_columns, method
):
    df = DataFrame([range(5)], columns=[list("ABCDE")])
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby([5, 5, 5, 6, 6], axis=1)
    result = {
        "call": lambda start, stop: gb.nth(slice(start, stop)),
        "index": lambda start, stop: gb.nth[start:stop],
    }[method](start, stop)
    expected = DataFrame([expected_values], columns=[expected_columns])
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings(
    "ignore:invalid value encountered in remainder:RuntimeWarning"
)
def test_head_tail_dropna_true():
    # GH#45089
    df = DataFrame(
        [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
    )
    expected = DataFrame([["a", "z"]], columns=["X", "Y"])

    result = df.groupby(["X", "Y"]).head(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"]).tail(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"]).nth(n=0)
    tm.assert_frame_equal(result, expected)


def test_head_tail_dropna_false():
    # GH#45089
    df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
    expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])

    result = df.groupby(["X", "Y"], dropna=False).head(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
@pytest.mark.parametrize("dropna", ["any", "all", None])
def test_nth_after_selection(selection, dropna):
    # GH#11038, GH#53518
    df = DataFrame(
        {
            "a": [1, 1, 2],
            "b": [np.nan, 3, 4],
            "c": [5, 6, 7],
        }
    )
    gb = df.groupby("a")[selection]
    result = gb.nth(0, dropna=dropna)
    if dropna == "any" or (dropna == "all" and selection != ["b", "c"]):
        locs = [1, 2]
    else:
        locs = [0, 2]
    expected = df.loc[locs, selection]
    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        (
            Timestamp("2011-01-15 12:50:28.502376"),
            Timestamp("2011-01-20 12:50:28.593448"),
        ),
        (24650000000000001, 24650000000000002),
    ],
)
def test_groupby_nth_int_like_precision(data):
    # GH#6620, GH#9311
    df = DataFrame({"a": [1, 1], "b": data})

    grouped = df.groupby("a")
    result = grouped.nth(0)
    expected = DataFrame({"a": 1, "b": [data[0]]})

    tm.assert_frame_equal(result, expected)
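
One more orientation note (not part of the commit): the `.nth` indexer accepts slices, which is what the slice tests above drive through `slice_test_grouped`. A minimal illustration:

import pandas as pd

df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5)})
rest = df.groupby("A").nth[1:]  # drop the first row of every group
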
@@ -0,0 +1,496 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"a_vals,b_vals",
|
||||
[
|
||||
# Ints
|
||||
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
|
||||
([1, 2, 3, 4], [4, 3, 2, 1]),
|
||||
([1, 2, 3, 4, 5], [4, 3, 2, 1]),
|
||||
# Floats
|
||||
([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
|
||||
# Missing data
|
||||
([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
|
||||
([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
|
||||
# Timestamps
|
||||
(
|
||||
pd.date_range("1/1/18", freq="D", periods=5),
|
||||
pd.date_range("1/1/18", freq="D", periods=5)[::-1],
|
||||
),
|
||||
(
|
||||
pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
|
||||
pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
|
||||
),
|
||||
# All NA
|
||||
([np.nan] * 5, [np.nan] * 5),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
|
||||
def test_quantile(interpolation, a_vals, b_vals, q, request):
|
||||
if (
|
||||
interpolation == "nearest"
|
||||
and q == 0.5
|
||||
and isinstance(b_vals, list)
|
||||
and b_vals == [4, 3, 2, 1]
|
||||
):
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(
|
||||
reason="Unclear numpy expectation for nearest "
|
||||
"result with equidistant data"
|
||||
)
|
||||
)
|
||||
all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])
|
||||
|
||||
a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
|
||||
b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
|
||||
|
||||
df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals})
|
||||
|
||||
expected = DataFrame(
|
||||
[a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
|
||||
)
|
||||
if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
|
||||
# TODO(non-nano): this should be unnecessary once array_to_datetime
|
||||
# correctly infers non-nano from Timestamp.unit
|
||||
expected = expected.astype(all_vals.dtype)
|
||||
result = df.groupby("key").quantile(q, interpolation=interpolation)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array():
|
||||
# https://github.com/pandas-dev/pandas/issues/27526
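    # a list-valued q yields one row per (group, quantile) pair in a MultiIndex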
    df = DataFrame({"A": [0, 1, 2, 3, 4]})
    key = np.array([0, 0, 1, 1, 1], dtype=np.int64)
    result = df.groupby(key).quantile([0.25])

    index = pd.MultiIndex.from_product([[0, 1], [0.25]])
    expected = DataFrame({"A": [0.25, 2.50]}, index=index)
    tm.assert_frame_equal(result, expected)

    df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
    index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])

    key = np.array([0, 0, 1, 1], dtype=np.int64)
    result = df.groupby(key).quantile([0.25, 0.75])
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
    )
    tm.assert_frame_equal(result, expected)


def test_quantile_array2():
    # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
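    # expected values below are hard-coded from the seeded RNG draw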
    arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64)
    df = DataFrame(arr, columns=list("ABC"))
    result = df.groupby("A").quantile([0.3, 0.7])
    expected = DataFrame(
        {
            "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
            "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
        },
        index=pd.MultiIndex.from_product(
            [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
        ),
    )
    tm.assert_frame_equal(result, expected)


def test_quantile_array_no_sort():
    df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
    key = np.array([1, 0, 1], dtype=np.int64)
    result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75])
    expected = DataFrame(
        {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
    )
    tm.assert_frame_equal(result, expected)

    result = df.groupby(key, sort=False).quantile([0.75, 0.25])
    expected = DataFrame(
        {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
    )
    tm.assert_frame_equal(result, expected)


def test_quantile_array_multiple_levels():
    df = DataFrame(
        {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
    )
    result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
    index = pd.MultiIndex.from_tuples(
        [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
        names=["c", "d", None],
    )
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
@pytest.mark.parametrize("q", [[0.5, 0.6]])
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
    # GH30289
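    # every column equals the row position modulo 4, so grouping yields
    # min(nrow, 4) distinct groups with identical values within each group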
    nrow, ncol = frame_size
    df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))

    idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q]
    idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
        list(range(len(q))) * min(nrow, 4)
    ]
    expected_index = pd.MultiIndex(
        levels=idx_levels, codes=idx_codes, names=groupby + [None]
    )
    expected_values = [
        [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
    ]
    expected_columns = [x for x in range(ncol) if x not in groupby]
    expected = DataFrame(
        expected_values, index=expected_index, columns=expected_columns
    )
    result = df.groupby(groupby).quantile(q)

    tm.assert_frame_equal(result, expected)


def test_quantile_raises():
    df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

    with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
        df.groupby("key").quantile()


def test_quantile_out_of_bounds_q_raises():
    # https://github.com/pandas-dev/pandas/issues/27470
    df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
    g = df.groupby([0, 0, 0, 1, 1, 1])
    with pytest.raises(ValueError, match="Got '50.0' instead"):
        g.quantile(50)

    with pytest.raises(ValueError, match="Got '-1.0' instead"):
        g.quantile(-1)


def test_quantile_missing_group_values_no_segfaults():
    # GH 28662
    data = np.array([1.0, np.nan, 1.0])
    df = DataFrame({"key": data, "val": range(3)})

    # Segfaults were sporadic; repeating the call in a loop makes the
    # failure effectively deterministic if the bug regresses
    grp = df.groupby("key")
    for _ in range(100):
        grp.quantile()


@pytest.mark.parametrize(
    "key, val, expected_key, expected_val",
    [
        ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
        ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
        (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
        ([0], [42], [0], [42.0]),
        ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
    ],
)
def test_quantile_missing_group_values_correct_results(
    key, val, expected_key, expected_val
):
    # GH 28662, GH 33200, GH 33569
    df = DataFrame({"key": key, "val": val})

    expected = DataFrame(
        expected_val, index=Index(expected_key, name="key"), columns=["val"]
    )

    grp = df.groupby("key")

    result = grp.quantile(0.5)
    tm.assert_frame_equal(result, expected)

    result = grp.quantile()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "values",
    [
        pd.array([1, 0, None] * 2, dtype="Int64"),
        pd.array([True, False, None] * 2, dtype="boolean"),
    ],
)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_nullable_array(values, q):
    # https://github.com/pandas-dev/pandas/issues/33136
    df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
    result = df.groupby("a")["b"].quantile(q)

    if isinstance(q, list):
        idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
        true_quantiles = [0.0, 0.5, 1.0]
    else:
        idx = Index(["x", "y"], name="a")
        true_quantiles = [0.5]

    expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
    df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
    if numeric_only:
        result = df.groupby("a").quantile(q, numeric_only=numeric_only)
        expected = df.groupby("a")[["b"]].quantile(q)
        tm.assert_frame_equal(result, expected)
    else:
        with pytest.raises(
            TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
        ):
            df.groupby("a").quantile(q, numeric_only=numeric_only)


def test_groupby_quantile_NA_float(any_float_dtype):
    # GH#42849
    df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
    result = df.groupby("x")["y"].quantile(0.5)
    exp_index = Index([1.0], dtype=any_float_dtype, name="x")

    if any_float_dtype in ["Float32", "Float64"]:
        expected_dtype = any_float_dtype
    else:
        expected_dtype = None

    expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y")
    tm.assert_series_equal(result, expected)

    result = df.groupby("x")["y"].quantile([0.5, 0.75])
    expected = pd.Series(
        [0.2] * 2,
        index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
        name="y",
        dtype=expected_dtype,
    )
    tm.assert_series_equal(result, expected)


def test_groupby_quantile_NA_int(any_int_ea_dtype):
    # GH#42849
    df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
    result = df.groupby("x")["y"].quantile(0.5)
    expected = pd.Series(
        [3.5],
        dtype="Float64",
        index=Index([1], name="x", dtype=any_int_ea_dtype),
        name="y",
    )
    tm.assert_series_equal(expected, result)

    result = df.groupby("x").quantile(0.5)
    expected = DataFrame(
        {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype)
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
)
def test_groupby_quantile_all_na_group_masked(
    interpolation, val1, val2, any_numeric_ea_dtype
):
    # GH#37493
    df = DataFrame(
        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
    )
    result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation)
    expected = DataFrame(
        {"b": [val1, val2, pd.NA, pd.NA]},
        dtype=any_numeric_ea_dtype,
        index=pd.MultiIndex.from_arrays(
            [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]],
            names=["a", None],
        ),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
def test_groupby_quantile_all_na_group_masked_interp(
    interpolation, any_numeric_ea_dtype
):
    # GH#37493
    df = DataFrame(
        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
    )
    result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation)

    if any_numeric_ea_dtype == "Float32":
        expected_dtype = any_numeric_ea_dtype
    else:
        expected_dtype = "Float64"

    expected = DataFrame(
        {"b": [2.0, 2.5, pd.NA, pd.NA]},
        dtype=expected_dtype,
        index=pd.MultiIndex.from_arrays(
            [
                pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype),
                [0.5, 0.75, 0.5, 0.75],
            ],
            names=["a", None],
        ),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
def test_groupby_quantile_allNA_column(dtype):
    # GH#42849
    df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
    result = df.groupby("x")["y"].quantile(0.5)
    expected = pd.Series(
        [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y"
    )
    expected.index.name = "x"
    tm.assert_series_equal(expected, result)


def test_groupby_timedelta_quantile():
    # GH: 29485
    df = DataFrame(
        {"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]}
    )
    result = df.groupby("group").quantile(0.99)
    expected = DataFrame(
        {
            "value": [
                pd.Timedelta("0 days 00:00:00.990000"),
                pd.Timedelta("0 days 00:00:02.990000"),
            ]
        },
        index=Index([1, 2], name="group"),
    )
    tm.assert_frame_equal(result, expected)


def test_columns_groupby_quantile():
    # GH 33795
    df = DataFrame(
        np.arange(12).reshape(3, -1),
        index=list("XYZ"),
        columns=pd.Series(list("ABAB"), name="col"),
    )
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby("col", axis=1)
    result = gb.quantile(q=[0.8, 0.2])
    expected = DataFrame(
        [
            [1.6, 0.4, 2.6, 1.4],
            [5.6, 4.4, 6.6, 5.4],
            [9.6, 8.4, 10.6, 9.4],
        ],
        index=list("XYZ"),
        columns=pd.MultiIndex.from_tuples(
            [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
        ),
    )

    tm.assert_frame_equal(result, expected)


def test_timestamp_groupby_quantile(unit):
    # GH 33168
    dti = pd.date_range(
        start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit
    ).floor("1h")
    df = DataFrame(
        {
            "timestamp": dti,
            "category": list(range(1, 101)),
            "value": list(range(101, 201)),
        }
    )

    result = df.groupby("timestamp").quantile([0.2, 0.8])

    mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None))
    expected = DataFrame(
        [
            {"category": 12.8, "value": 112.8},
            {"category": 48.2, "value": 148.2},
            {"category": 68.8, "value": 168.8},
            {"category": 92.2, "value": 192.2},
        ],
        index=mi,
    )

    tm.assert_frame_equal(result, expected)


def test_groupby_quantile_dt64tz_period():
    # GH#51373
    dti = pd.date_range("2016-01-01", periods=1000)
    df = pd.Series(dti).to_frame().copy()
    df[1] = dti.tz_localize("US/Pacific")
    df[2] = dti.to_period("D")
    df[3] = dti - dti[0]
    df.iloc[-1] = pd.NaT

    by = np.tile(np.arange(5), 200)
    gb = df.groupby(by)

    result = gb.quantile(0.5)

    # Check that we match the group-by-group result
    exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)}
    expected = DataFrame(exp).T.infer_objects()
    expected.index = expected.index.astype(int)

    tm.assert_frame_equal(result, expected)


def test_groupby_quantile_nonmulti_levels_order():
    # Non-regression test for GH #53009
    ind = pd.MultiIndex.from_tuples(
        [
            (0, "a", "B"),
            (0, "a", "A"),
            (0, "b", "B"),
            (0, "b", "A"),
            (1, "a", "B"),
            (1, "a", "A"),
            (1, "b", "B"),
            (1, "b", "A"),
        ],
        names=["sample", "cat0", "cat1"],
    )
    ser = pd.Series(range(8), index=ind)
    result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8])

    qind = pd.MultiIndex.from_tuples(
        [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None]
    )
    expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind)

    tm.assert_series_equal(result, expected)

    # We need to check that index levels are not sorted
    expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]])
    tm.assert_equal(result.index.levels, expected_levels)
@@ -0,0 +1,721 @@
from datetime import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    NaT,
    Series,
    concat,
)
import pandas._testing as tm


def test_rank_unordered_categorical_typeerror():
    # GH#51034 should be TypeError, not NotImplementedError
    cat = pd.Categorical([], ordered=False)
    ser = Series(cat)
    df = ser.to_frame()

    msg = "Cannot perform rank with non-ordered Categorical"

    gb = ser.groupby(cat, observed=False)
    with pytest.raises(TypeError, match=msg):
        gb.rank()

    gb2 = df.groupby(cat, observed=False)
    with pytest.raises(TypeError, match=msg):
        gb2.rank()


def test_rank_apply():
    lev1 = np.array(["a" * 10] * 100, dtype=object)
    lev2 = np.array(["b" * 10] * 130, dtype=object)
    lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
    lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)

    df = DataFrame(
        {
            "value": np.random.default_rng(2).standard_normal(500),
            "key1": lev1.take(lab1),
            "key2": lev2.take(lab2),
        }
    )

    result = df.groupby(["key1", "key2"]).value.rank()

    expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])]
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)

    result = df.groupby(["key1", "key2"]).value.rank(pct=True)

    expected = [
        piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"])
    ]
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, 8, 2, 6], dtype=dtype)
        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
    ]
    + [
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
        ],
        [
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-08", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-06", tz="US/Pacific"),
        ],
        [
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
        ],
        [
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-08").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-06").to_period("D"),
        ],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,pct,exp",
    [
        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
    ],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    key = np.repeat(grps, len(vals))

    orig_vals = vals
    vals = list(vals) * len(grps)
    if isinstance(orig_vals, np.ndarray):
        vals = np.array(vals, dtype=orig_vals.dtype)

    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,exp",
    [
        ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
        ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
        ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
        ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
        ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
        ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
        ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
        ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
        ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
        ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
        ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
        ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
        ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
        ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
        ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
        ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
        ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
        ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
        ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
        ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
        ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
        ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
        ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
        ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
        ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
        ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
        ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
        ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
        ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
        ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
    ],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    # GH 20561
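    # with na_option "top"/"bottom", NaNs rank ahead of/behind everything,
    # while +/-inf participate in the ordinary ordering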
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option
    )
    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
        for dtype in ["f8", "f4", "f2"]
    ]
    + [
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            np.nan,
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            np.nan,
            pd.Timestamp("2018-01-08", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-06", tz="US/Pacific"),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            np.nan,
            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            np.nan,
            pd.Timestamp("2018-01-08").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-06").to_period("D"),
            np.nan,
            np.nan,
        ],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,pct,exp",
    [
        (
            "average",
            True,
            "keep",
            False,
            [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
        ),
        (
            "average",
            True,
            "keep",
            True,
            [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            False,
            [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            True,
            [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
        ),
        ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
        ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
        (
            "min",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
        ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
        ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
        (
            "max",
            False,
            "keep",
            False,
            [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
        (
            "first",
            True,
            "keep",
            False,
            [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
        ),
        (
            "first",
            True,
            "keep",
            True,
            [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            False,
            [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            True,
            [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            False,
            [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            True,
            [
                1.0 / 3.0,
                1.0 / 3.0,
                np.nan,
                3.0 / 3.0,
                1.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        (
            "dense",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            False,
            "keep",
            True,
            [
                3.0 / 3.0,
                3.0 / 3.0,
                np.nan,
                1.0 / 3.0,
                3.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
        (
            "average",
            True,
            "bottom",
            True,
            [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
        ),
        ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
        (
            "average",
            False,
            "bottom",
            True,
            [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
        ),
        ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
        (
            "min",
            True,
            "bottom",
            True,
            [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
        ),
        ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
        (
            "min",
            False,
            "bottom",
            True,
            [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
        ),
        ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
        ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
        ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
        (
            "max",
            False,
            "bottom",
            True,
            [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
        ),
        ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
        (
            "first",
            True,
            "bottom",
            True,
            [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
        ),
        ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
        (
            "first",
            False,
            "bottom",
            True,
            [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
        ),
        ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
        ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
        ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
        ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
    ],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
    key = np.repeat(grps, len(vals))

    orig_vals = vals
    vals = list(vals) * len(grps)
    if isinstance(orig_vals, np.ndarray):
        vals = np.array(vals, dtype=orig_vals.dtype)

    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize(
    "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
    df = DataFrame(
        {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
    )
    result = df.groupby("key").rank(pct=pct)
    exp_df = DataFrame(exp * 2, columns=["val"])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize(
    "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
)
@pytest.mark.parametrize("upper", [True, False])
def test_rank_avg_even_vals(dtype, upper):
    if upper:
        # use IntegerDtype/FloatingDtype
        dtype = dtype[0].upper() + dtype[1:]
        dtype = dtype.replace("Ui", "UI")
    df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
    df["val"] = df["val"].astype(dtype)
    assert df["val"].dtype == dtype

    result = df.groupby("key").rank()
    exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
    if upper:
        exp_df = exp_df.astype("Float64")
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    mask = df["val"].isna()

    gb = df.groupby("key")
    res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct)

    # construct our expected by using numeric values with the same ordering
    if mask.any():
        df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]})
    else:
        df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]})

    gb2 = df2.groupby("key")
    alt = gb2.rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    tm.assert_frame_equal(res, alt)


@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with pytest.raises(ValueError, match=msg):
        df.groupby("key").rank(
            method=ties_method, ascending=ascending, na_option=na_option, pct=pct
        )


def test_rank_empty_group():
    # see gh-22519
    column = "A"
    df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})

    result = df.groupby(column).B.rank(pct=True)
    expected = Series([0.5, np.nan, 1.0], name="B")
    tm.assert_series_equal(result, expected)

    result = df.groupby(column).rank(pct=True)
    expected = DataFrame({"B": [0.5, np.nan, 1.0]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    # GH 23666
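    # dense pct ranks divide by the number of distinct non-NA values in the
    # group; all-NA groups must not trigger a division by zero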
    df = DataFrame({"A": input_key, "B": input_value})

    result = df.groupby("A").rank(method="dense", pct=True)
    expected = DataFrame({"B": output_value})
    tm.assert_frame_equal(result, expected)


def test_rank_min_int():
    # GH-32859
    df = DataFrame(
        {
            "grp": [1, 1, 2],
            "int_col": [
                np.iinfo(np.int64).min,
                np.iinfo(np.int64).max,
                np.iinfo(np.int64).min,
            ],
            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
        }
    )

    result = df.groupby("grp").rank()
    expected = DataFrame(
        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
    )

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
    # GH#40518
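    # equal values that sit on either side of a group boundary must not be
    # treated as ties; each group's pct denominator is computed separately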
    fill_value = np.nan if use_nan else 3
    df = DataFrame(
        [
            [-1, 1],
            [-1, 2],
            [1, fill_value],
            [-1, fill_value],
        ],
        columns=["group", "val"],
    )
    result = df.groupby(["group"])["val"].rank(
        method="dense",
        pct=True,
    )
    if use_nan:
        expected = Series([0.5, 1, np.nan, np.nan], name="val")
    else:
        expected = Series([1 / 3, 2 / 3, 1, 1], name="val")

    tm.assert_series_equal(result, expected)


def test_rank_multiindex():
    # GH27721
    df = concat(
        {
            "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
            "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
        },
        axis=1,
    )

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=0, axis=1)
    msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = gb.rank(axis=1)

    expected = concat(
        [
            df["a"].rank(axis=1),
            df["b"].rank(axis=1),
        ],
        axis=1,
        keys=["a", "b"],
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_axis0_rank_axis1():
    # GH#41320
    df = DataFrame(
        {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
        index=["a", "a", "b", "b"],
    )
    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=0, axis=0)

    msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        res = gb.rank(axis=1)

    # This should match what we get when "manually" operating group-by-group
    expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0)
    tm.assert_frame_equal(res, expected)

    # check that we haven't accidentally written a case that coincidentally
    # matches rank(axis=0)
    msg = "The 'axis' keyword in DataFrameGroupBy.rank"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        alt = gb.rank(axis=0)
    assert not alt.equals(expected)


def test_groupby_axis0_cummax_axis1():
    # case where groupby axis is 0 and axis keyword in transform is 1

    # df has mixed dtype -> multiple blocks
    df = DataFrame(
        {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
        index=["a", "a", "b", "b"],
    )
    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=0, axis=0)

    msg = "DataFrameGroupBy.cummax with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        cmax = gb.cummax(axis=1)
    expected = df[[0, 1]].astype(np.float64)
    expected[2] = expected[1]
    tm.assert_frame_equal(cmax, expected)


def test_non_unique_index():
    # GH 16577
    df = DataFrame(
        {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
    )
    result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
    expected = Series(
        [1.0, 1.0, 1.0, np.nan],
        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
        name="value",
    )
    tm.assert_series_equal(result, expected)


def test_rank_categorical():
    cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
    cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)

    df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2})

    gb = df.groupby("col1")

    res = gb.rank()

    expected = df.astype(object).groupby("col1").rank()
    tm.assert_frame_equal(res, expected)


@pytest.mark.parametrize("na_option", ["top", "bottom"])
def test_groupby_op_with_nullables(na_option):
    # GH 54206
    df = DataFrame({"x": [None]}, dtype="Float64")
    result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option)
    expected = Series([1.0], dtype="Float64", name=result.name)
    tm.assert_series_equal(result, expected)
@@ -0,0 +1,154 @@
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm


@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
def test_groupby_sample_balanced_groups_shape(n, frac):
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(n=n, frac=frac)
    values = [1] * 2 + [2] * 2
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=n, frac=frac)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)


def test_groupby_sample_unbalanced_groups_shape():
    values = [1] * 10 + [2] * 20
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(n=5)
    values = [1] * 5 + [2] * 5
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=5)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)


def test_groupby_sample_index_value_spans_groups():
    values = [1] * 3 + [2] * 3
    df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2])

    result = df.groupby("a").sample(n=2)
    values = [1] * 2 + [2] * 2
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=2)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)


def test_groupby_sample_n_and_frac_raises():
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Please enter a value for `frac` OR `n`, not both"

    with pytest.raises(ValueError, match=msg):
        df.groupby("a").sample(n=1, frac=1.0)

    with pytest.raises(ValueError, match=msg):
        df.groupby("a")["b"].sample(n=1, frac=1.0)


def test_groupby_sample_frac_gt_one_without_replacement_raises():
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."

    with pytest.raises(ValueError, match=msg):
        df.groupby("a").sample(frac=1.5, replace=False)

    with pytest.raises(ValueError, match=msg):
        df.groupby("a")["b"].sample(frac=1.5, replace=False)


@pytest.mark.parametrize("n", [-1, 1.5])
def test_groupby_sample_invalid_n_raises(n):
    df = DataFrame({"a": [1, 2], "b": [1, 2]})

    if n < 0:
        msg = "A negative number of rows requested. Please provide `n` >= 0."
    else:
        msg = "Only integers accepted as `n` values"

    with pytest.raises(ValueError, match=msg):
        df.groupby("a").sample(n=n)

    with pytest.raises(ValueError, match=msg):
        df.groupby("a")["b"].sample(n=n)


def test_groupby_sample_oversample():
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(frac=2.0, replace=True)
    values = [1] * 20 + [2] * 20
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)


def test_groupby_sample_without_n_or_frac():
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(n=None, frac=None)
    expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=None, frac=None)
    expected = Series([1, 2], name="b", index=result.index)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "index, expected_index",
    [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
)
def test_groupby_sample_with_weights(index, expected_index):
    # GH 39927 - tests for integer index needed
    values = [1] * 2 + [2] * 2
    df = DataFrame({"a": values, "b": values}, index=Index(index))

    result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
    expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
    expected = Series(values, name="b", index=Index(expected_index))
    tm.assert_series_equal(result, expected)


def test_groupby_sample_with_selections():
    # GH 39928
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values, "c": values})

    result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
    expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
    tm.assert_frame_equal(result, expected)


def test_groupby_sample_with_empty_inputs():
    # GH48459
    df = DataFrame({"a": [], "b": []})
    groupby_df = df.groupby("a")

    result = groupby_df.sample()
    expected = df
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,130 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_integer_dtype

from pandas import (
    DataFrame,
    Index,
    PeriodIndex,
    Series,
)
import pandas._testing as tm


@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size(df, by):
    grouped = df.groupby(by=by)
    result = grouped.size()
    for key, group in grouped:
        assert result[key] == len(group)


@pytest.mark.parametrize(
    "by",
    [
        [0, 0, 0, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [0, None, None, None],
        pytest.param([None, None, None, None], marks=pytest.mark.xfail),
    ],
)
def test_size_axis_1(df, axis_1, by, sort, dropna):
    # GH#45715
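    # with axis=1 the grouping is over columns, so size() counts the number
    # of columns assigned to each key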
    counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)}
    if dropna:
        counts = {key: value for key, value in counts.items() if key is not None}
    expected = Series(counts, dtype="int64")
    if sort:
        expected = expected.sort_index()
    if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by):
        expected.index = expected.index.astype(int)

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna)
    result = grouped.size()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
@pytest.mark.parametrize("sort", [True, False])
def test_size_sort(sort, by):
    df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC"))
    left = df.groupby(by=by, sort=sort).size()
    right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])
    tm.assert_series_equal(left, right, check_names=False)


def test_size_series_dataframe():
    # https://github.com/pandas-dev/pandas/issues/11699
    df = DataFrame(columns=["A", "B"])
    out = Series(dtype="int64", index=Index([], name="A"))
    tm.assert_series_equal(df.groupby("A").size(), out)


def test_size_groupby_all_null():
    # https://github.com/pandas-dev/pandas/issues/23050
    # Assert no 'Value Error : Length of passed values is 2, index implies 0'
    df = DataFrame({"A": [None, None]})  # all-null groups
    result = df.groupby("A").size()
    expected = Series(dtype="int64", index=Index([], name="A"))
    tm.assert_series_equal(result, expected)


def test_size_period_index():
    # https://github.com/pandas-dev/pandas/issues/34010
    ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D"))
    grp = ser.groupby(level="A")
    result = grp.size()
    tm.assert_series_equal(result, ser)


@pytest.mark.parametrize("as_index", [True, False])
def test_size_on_categorical(as_index):
    df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
    df["A"] = df["A"].astype("category")
    result = df.groupby(["A", "B"], as_index=as_index, observed=False).size()

    expected = DataFrame(
        [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
    )
    expected["A"] = expected["A"].astype("category")
    if as_index:
        expected = expected.set_index(["A", "B"])["size"].rename(None)

    tm.assert_equal(result, expected)


@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_size_series_masked_type_returns_Int64(dtype):
    # GH 54132
    ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
    result = ser.groupby(level=0).size()
    expected = Series([2, 1], dtype="Int64", index=["a", "b"])
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "dtype",
    [
        object,
        pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
    ],
)
def test_size_strings(dtype):
    # GH#55627
    df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
    result = df.groupby("a")["b"].size()
    exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
    expected = Series(
        [2, 1],
        index=Index(["a", "b"], name="a", dtype=dtype),
        name="b",
        dtype=exp_dtype,
    )
    tm.assert_series_equal(result, expected)
@@ -0,0 +1,27 @@
import numpy as np

import pandas as pd
import pandas._testing as tm


def test_groupby_skew_equivalence():
    # Test that the groupby skew method (which uses libgroupby.group_skew)
    # matches the results of operating group-by-group (which uses nanops.nanskew)
    nrows = 1000
    ngroups = 3
    ncols = 2
    nan_frac = 0.05

    arr = np.random.default_rng(2).standard_normal((nrows, ncols))
    arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan

    df = pd.DataFrame(arr)
    grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
    gb = df.groupby(grps)

    result = gb.skew()

    grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
    expected = pd.concat(grpwise, axis=0)
    expected.index = expected.index.astype(result.index.dtype)  # 32bit builds
    tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large
@@ -0,0 +1,83 @@
"""
Tests that apply to all groupby operation methods.

The only tests that should appear here are those that use the `groupby_func` fixture.
Even if a test does use that fixture, prefer a more specific test file if one is
available, such as:

- test_categorical
- test_groupby_dropna
- test_groupby_subclass
- test_raises
"""

import pytest

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


def test_multiindex_group_all_columns_when_empty(groupby_func):
    # GH 32464
    df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"])
    gb = df.groupby(["a", "b", "c"], group_keys=False)
    method = getattr(gb, groupby_func)
    args = get_groupby_method_args(groupby_func, df)

    warn = FutureWarning if groupby_func == "fillna" else None
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=warn_msg):
        result = method(*args).index
    expected = df.index
    tm.assert_index_equal(result, expected)


def test_duplicate_columns(request, groupby_func, as_index):
    # GH#50806
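    # the frame has two 'b' columns; results are checked against the same
    # operation on a copy with unique labels, renamed back afterwards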
    if groupby_func == "corrwith":
        msg = "GH#50845 - corrwith fails when there are duplicate columns"
        request.applymarker(pytest.mark.xfail(reason=msg))
    df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
    args = get_groupby_method_args(groupby_func, df)
    gb = df.groupby("a", as_index=as_index)
    warn = FutureWarning if groupby_func == "fillna" else None
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=warn_msg):
        result = getattr(gb, groupby_func)(*args)

    expected_df = df.set_axis(["a", "b", "c"], axis=1)
    expected_args = get_groupby_method_args(groupby_func, expected_df)
    expected_gb = expected_df.groupby("a", as_index=as_index)
    warn = FutureWarning if groupby_func == "fillna" else None
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=warn_msg):
        expected = getattr(expected_gb, groupby_func)(*expected_args)
    if groupby_func not in ("size", "ngroup", "cumcount"):
        expected = expected.rename(columns={"c": "b"})
    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "idx",
    [
        pd.Index(["a", "a"], name="foo"),
        pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
    ],
)
def test_dup_labels_output_shape(groupby_func, idx):
    if groupby_func in {"size", "ngroup", "cumcount"}:
        pytest.skip(f"Not applicable for {groupby_func}")

    df = DataFrame([[1, 1]], columns=idx)
    grp_by = df.groupby([0])

    args = get_groupby_method_args(groupby_func, df)
    warn = FutureWarning if groupby_func == "fillna" else None
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=warn_msg):
        result = getattr(grp_by, groupby_func)(*args)

    assert result.shape == (1, 2)
    tm.assert_index_equal(result.columns, idx)
@@ -0,0 +1,265 @@
"""
Tests of the groupby API, including internal consistency and consistency with
other pandas objects.

Tests in this file should only check the existence, names, and arguments of groupby
methods. They should not test the results of any groupby operation.
"""

import inspect

import pytest

from pandas import (
    DataFrame,
    Series,
)
from pandas.core.groupby.base import (
    groupby_other_methods,
    reduction_kernels,
    transformation_kernels,
)
from pandas.core.groupby.generic import (
    DataFrameGroupBy,
    SeriesGroupBy,
)


def test_tab_completion(multiindex_dataframe_random_data):
    grp = multiindex_dataframe_random_data.groupby(level="second")
    results = {v for v in dir(grp) if not v.startswith("_")}
    expected = {
        "A",
        "B",
        "C",
        "agg",
        "aggregate",
        "apply",
        "boxplot",
        "filter",
        "first",
        "get_group",
        "groups",
        "hist",
        "indices",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "ngroups",
        "nth",
        "ohlc",
        "plot",
        "prod",
        "size",
        "std",
        "sum",
        "transform",
        "var",
        "sem",
        "count",
        "nunique",
        "head",
        "describe",
        "cummax",
        "quantile",
        "rank",
        "cumprod",
        "tail",
        "resample",
        "cummin",
        "fillna",
        "cumsum",
        "cumcount",
        "ngroup",
        "all",
        "shift",
        "skew",
        "take",
        "pct_change",
        "any",
        "corr",
        "corrwith",
        "cov",
        "dtypes",
        "ndim",
        "diff",
        "idxmax",
        "idxmin",
        "ffill",
        "bfill",
        "rolling",
        "expanding",
        "pipe",
        "sample",
        "ewm",
        "value_counts",
    }
    assert results == expected


def test_all_methods_categorized(multiindex_dataframe_random_data):
    grp = multiindex_dataframe_random_data.groupby(
        multiindex_dataframe_random_data.iloc[:, 0]
    )
    names = {_ for _ in dir(grp) if not _.startswith("_")} - set(
        multiindex_dataframe_random_data.columns
    )
    new_names = set(names)
    new_names -= reduction_kernels
    new_names -= transformation_kernels
    new_names -= groupby_other_methods

    assert not reduction_kernels & transformation_kernels
    assert not reduction_kernels & groupby_other_methods
    assert not transformation_kernels & groupby_other_methods

    # new public method?
    if new_names:
        msg = f"""
There are uncategorized methods defined on the Grouper class:
{new_names}.

Was a new method recently added?

Every public method on Grouper must appear in exactly one of the
following three lists defined in pandas.core.groupby.base:
- `reduction_kernels`
- `transformation_kernels`
- `groupby_other_methods`
See the comments in pandas/core/groupby/base.py for guidance on
how to fix this test.
"""
        raise AssertionError(msg)

    # removed a public method?
    all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
    if names != all_categorized:
        msg = f"""
Some methods which are supposed to be on the Grouper class
are missing:
{all_categorized - names}.

They're still defined in one of the lists that live in pandas/core/groupby/base.py.
If you removed a method, you should update those lists accordingly.
"""
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
def test_frame_consistency(groupby_func):
|
||||
# GH#48028
|
||||
if groupby_func in ("first", "last"):
|
||||
msg = "first and last are entirely different between frame and groupby"
|
||||
pytest.skip(reason=msg)
|
||||
|
||||
if groupby_func in ("cumcount", "ngroup"):
|
||||
assert not hasattr(DataFrame, groupby_func)
|
||||
return
|
||||
|
||||
frame_method = getattr(DataFrame, groupby_func)
|
||||
gb_method = getattr(DataFrameGroupBy, groupby_func)
|
||||
result = set(inspect.signature(gb_method).parameters)
|
||||
if groupby_func == "size":
|
||||
# "size" is a method on GroupBy but property on DataFrame:
|
||||
expected = {"self"}
|
||||
else:
|
||||
expected = set(inspect.signature(frame_method).parameters)
|
||||
|
||||
# Exclude certain arguments from result and expected depending on the operation
|
||||
# Some of these may be purposeful inconsistencies between the APIs
|
||||
exclude_expected, exclude_result = set(), set()
|
||||
if groupby_func in ("any", "all"):
|
||||
exclude_expected = {"kwargs", "bool_only", "axis"}
|
||||
elif groupby_func in ("count",):
|
||||
exclude_expected = {"numeric_only", "axis"}
|
||||
elif groupby_func in ("nunique",):
|
||||
exclude_expected = {"axis"}
|
||||
elif groupby_func in ("max", "min"):
|
||||
exclude_expected = {"axis", "kwargs", "skipna"}
|
||||
exclude_result = {"min_count", "engine", "engine_kwargs"}
|
||||
elif groupby_func in ("mean", "std", "sum", "var"):
|
||||
exclude_expected = {"axis", "kwargs", "skipna"}
|
||||
exclude_result = {"engine", "engine_kwargs"}
|
||||
elif groupby_func in ("median", "prod", "sem"):
|
||||
exclude_expected = {"axis", "kwargs", "skipna"}
|
||||
elif groupby_func in ("backfill", "bfill", "ffill", "pad"):
|
||||
exclude_expected = {"downcast", "inplace", "axis", "limit_area"}
|
||||
elif groupby_func in ("cummax", "cummin"):
|
||||
exclude_expected = {"skipna", "args"}
|
||||
exclude_result = {"numeric_only"}
|
||||
elif groupby_func in ("cumprod", "cumsum"):
|
||||
exclude_expected = {"skipna"}
|
||||
elif groupby_func in ("pct_change",):
|
||||
exclude_expected = {"kwargs"}
|
||||
exclude_result = {"axis"}
|
||||
elif groupby_func in ("rank",):
|
||||
exclude_expected = {"numeric_only"}
|
||||
elif groupby_func in ("quantile",):
|
||||
exclude_expected = {"method", "axis"}
|
||||
|
||||
# Ensure excluded arguments are actually in the signatures
|
||||
assert result & exclude_result == exclude_result
|
||||
assert expected & exclude_expected == exclude_expected
|
||||
|
||||
result -= exclude_result
|
||||
expected -= exclude_expected
|
||||
assert result == expected
|
||||
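

# Illustrative sketch (added commentary, not part of the original commit):
# the consistency checks above compare parameter-name sets obtained through
# stdlib introspection; for any callable, inspect.signature exposes them.
def _signature_params_demo(a, b=1, *args, **kwargs):
    """Hypothetical helper, present only to show the mechanism."""


assert set(inspect.signature(_signature_params_demo).parameters) == {
    "a",
    "b",
    "args",
    "kwargs",
}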


def test_series_consistency(request, groupby_func):
    # GH#48028
    if groupby_func in ("first", "last"):
        pytest.skip("first and last are entirely different between Series and groupby")

    if groupby_func in ("cumcount", "corrwith", "ngroup"):
        assert not hasattr(Series, groupby_func)
        return

    series_method = getattr(Series, groupby_func)
    gb_method = getattr(SeriesGroupBy, groupby_func)
    result = set(inspect.signature(gb_method).parameters)
    if groupby_func == "size":
        # "size" is a method on GroupBy but property on Series
        expected = {"self"}
    else:
        expected = set(inspect.signature(series_method).parameters)

    # Exclude certain arguments from result and expected depending on the operation
    # Some of these may be purposeful inconsistencies between the APIs
    exclude_expected, exclude_result = set(), set()
    if groupby_func in ("any", "all"):
        exclude_expected = {"kwargs", "bool_only", "axis"}
    elif groupby_func in ("diff",):
        exclude_result = {"axis"}
    elif groupby_func in ("max", "min"):
        exclude_expected = {"axis", "kwargs", "skipna"}
        exclude_result = {"min_count", "engine", "engine_kwargs"}
    elif groupby_func in ("mean", "std", "sum", "var"):
        exclude_expected = {"axis", "kwargs", "skipna"}
        exclude_result = {"engine", "engine_kwargs"}
    elif groupby_func in ("median", "prod", "sem"):
        exclude_expected = {"axis", "kwargs", "skipna"}
    elif groupby_func in ("backfill", "bfill", "ffill", "pad"):
        exclude_expected = {"downcast", "inplace", "axis", "limit_area"}
    elif groupby_func in ("cummax", "cummin"):
        exclude_expected = {"skipna", "args"}
        exclude_result = {"numeric_only"}
    elif groupby_func in ("cumprod", "cumsum"):
        exclude_expected = {"skipna"}
    elif groupby_func in ("pct_change",):
        exclude_expected = {"kwargs"}
        exclude_result = {"axis"}
    elif groupby_func in ("rank",):
        exclude_expected = {"numeric_only"}
    elif groupby_func in ("idxmin", "idxmax"):
        exclude_expected = {"args", "kwargs"}
    elif groupby_func in ("quantile",):
        exclude_result = {"numeric_only"}

    # Ensure excluded arguments are actually in the signatures
    assert result & exclude_result == exclude_result
    assert expected & exclude_expected == exclude_expected

    result -= exclude_result
    expected -= exclude_expected
    assert result == expected
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,163 @@
import numpy as np

import pandas as pd
import pandas._testing as tm


def test_group_by_copy():
    # GH#44803
    df = pd.DataFrame(
        {
            "name": ["Alice", "Bob", "Carl"],
            "age": [20, 21, 20],
        }
    ).set_index("name")

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        grp_by_same_value = df.groupby(["age"], group_keys=False).apply(
            lambda group: group
        )
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        grp_by_copy = df.groupby(["age"], group_keys=False).apply(
            lambda group: group.copy()
        )
    tm.assert_frame_equal(grp_by_same_value, grp_by_copy)
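
# Note (added commentary, not part of the original commit): the
# DeprecationWarning asserted above is about apply() operating on the
# grouping columns; in pandas 2.2+ it can be avoided by passing
# include_groups=False to GroupBy.apply, e.g.
#     df.groupby(["age"], group_keys=False).apply(
#         lambda group: group, include_groups=False
#     )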


def test_mutate_groups():
    # GH3380

    df = pd.DataFrame(
        {
            "cat1": ["a"] * 8 + ["b"] * 6,
            "cat2": ["c"] * 2
            + ["d"] * 2
            + ["e"] * 2
            + ["f"] * 2
            + ["c"] * 2
            + ["d"] * 2
            + ["e"] * 2,
            "cat3": [f"g{x}" for x in range(1, 15)],
            "val": np.random.default_rng(2).integers(100, size=14),
        }
    )

    def f_copy(x):
        x = x.copy()
        x["rank"] = x.val.rank(method="min")
        return x.groupby("cat2")["rank"].min()

    def f_no_copy(x):
        x["rank"] = x.val.rank(method="min")
        return x.groupby("cat2")["rank"].min()

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        grpby_copy = df.groupby("cat1").apply(f_copy)
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
    tm.assert_series_equal(grpby_copy, grpby_no_copy)


def test_no_mutate_but_looks_like():
    # GH 8467
    # first shows mutation indicator
    # second does not, but should yield the same results
    df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
    tm.assert_series_equal(result1, result2)


def test_apply_function_with_indexing(warn_copy_on_write):
    # GH: 33058
    df = pd.DataFrame(
        {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
    )

    def fn(x):
        x.loc[x.index[-1], "col2"] = 0
        return x.col2

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(
        DeprecationWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write
    ):
        result = df.groupby(["col1"], as_index=False).apply(fn)
    expected = pd.Series(
        [1, 2, 0, 4, 5, 0],
        index=pd.MultiIndex.from_tuples(
            [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
        ),
        name="col2",
    )
    tm.assert_series_equal(result, expected)


def test_apply_mutate_columns_multiindex():
    # GH 12652
    df = pd.DataFrame(
        {
            ("C", "julian"): [1, 2, 3],
            ("B", "geoffrey"): [1, 2, 3],
            ("A", "julian"): [1, 2, 3],
            ("B", "julian"): [1, 2, 3],
            ("A", "geoffrey"): [1, 2, 3],
            ("C", "geoffrey"): [1, 2, 3],
        },
        columns=pd.MultiIndex.from_tuples(
            [
                ("A", "julian"),
                ("A", "geoffrey"),
                ("B", "julian"),
                ("B", "geoffrey"),
                ("C", "julian"),
                ("C", "geoffrey"),
            ]
        ),
    )

    def add_column(grouped):
        name = grouped.columns[0][1]
        grouped["sum", name] = grouped.sum(axis=1)
        return grouped

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=1, axis=1)
    result = gb.apply(add_column)
    expected = pd.DataFrame(
        [
            [1, 1, 1, 3, 1, 1, 1, 3],
            [2, 2, 2, 6, 2, 2, 2, 6],
            [
                3,
                3,
                3,
                9,
                3,
                3,
                3,
                9,
            ],
        ],
        columns=pd.MultiIndex.from_tuples(
            [
                ("geoffrey", "A", "geoffrey"),
                ("geoffrey", "B", "geoffrey"),
                ("geoffrey", "C", "geoffrey"),
                ("geoffrey", "sum", "geoffrey"),
                ("julian", "A", "julian"),
                ("julian", "B", "julian"),
                ("julian", "C", "julian"),
                ("julian", "sum", "julian"),
            ]
        ),
    )
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,65 @@
import numpy as np
import pytest

from pandas._libs import lib
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm


def assert_block_lengths(x):
    assert len(x) == len(x._mgr.blocks[0].mgr_locs)
    return 0


def cumsum_max(x):
    x.cumsum().max()
    return 0


@pytest.mark.parametrize(
    "func",
    [
        cumsum_max,
        pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test),
    ],
)
def test_mgr_locs_updated(func):
    # https://github.com/pandas-dev/pandas/issues/31802
    # Some operations may require creating new blocks, which requires
    # valid mgr_locs
    df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]})
    result = df.groupby(["A", "B"]).agg(func)
    expected = pd.DataFrame(
        {"C": [0, 0]},
        index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "binner,closed,expected",
    [
        (
            np.array([0, 3, 6, 9], dtype=np.int64),
            "left",
            np.array([2, 5, 6], dtype=np.int64),
        ),
        (
            np.array([0, 3, 6, 9], dtype=np.int64),
            "right",
            np.array([3, 6, 6], dtype=np.int64),
        ),
        (np.array([0, 3, 6], dtype=np.int64), "left", np.array([2, 5], dtype=np.int64)),
        (
            np.array([0, 3, 6], dtype=np.int64),
            "right",
            np.array([3, 6], dtype=np.int64),
        ),
    ],
)
def test_generate_bins(binner, closed, expected):
    values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
    result = lib.generate_bins_dt64(values, binner, closed=closed)
    tm.assert_numpy_array_equal(result, expected)
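
# Illustrative equivalence (added commentary, not part of the original
# commit): for the sorted values above, the edge counts produced by
# generate_bins_dt64 match numpy's searchsorted on the inner bin edges.
_values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
_binner = np.array([0, 3, 6, 9], dtype=np.int64)
assert list(np.searchsorted(_values, _binner[1:], side="left")) == [2, 5, 6]
assert list(np.searchsorted(_values, _binner[1:], side="right")) == [3, 6, 6]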
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,394 @@
from itertools import product
from string import ascii_lowercase

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Period,
    Series,
    Timedelta,
    Timestamp,
    date_range,
)
import pandas._testing as tm


class TestCounting:
    def test_cumcount(self):
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3])

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_empty(self):
        ge = DataFrame().groupby(level=0)
        se = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype="int64")

        tm.assert_series_equal(e, ge.cumcount())
        tm.assert_series_equal(e, se.cumcount())

    def test_cumcount_dupe_index(self):
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=mi)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_groupby_not_col(self):
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_ngroup(self):
        df = DataFrame({"A": list("aaaba")})
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 0, 0, 1, 0])

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_distinct(self):
        df = DataFrame({"A": list("abcde")})
        g = df.groupby("A")
        sg = g.A

        expected = Series(range(5), dtype="int64")

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_one_group(self):
        df = DataFrame({"A": [0] * 5})
        g = df.groupby("A")
        sg = g.A

        expected = Series([0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_empty(self):
        ge = DataFrame().groupby(level=0)
        se = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype="int64")

        tm.assert_series_equal(e, ge.ngroup())
        tm.assert_series_equal(e, se.ngroup())

    def test_ngroup_series_matches_frame(self):
        df = DataFrame({"A": list("aaaba")})
        s = Series(list("aaaba"))

        tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())

    def test_ngroup_dupe_index(self):
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame({"A": list("aaaba")}, index=mi)
        g = df.groupby("A")
        sg = g.A
        expected = Series([0, 0, 0, 1, 0], index=mi)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_groupby_not_col(self):
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_descending(self):
        df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
        g = df.groupby(["A"])

        ascending = Series([0, 0, 1, 0, 1])
        descending = Series([1, 1, 0, 1, 0])

        tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
        tm.assert_series_equal(ascending, g.ngroup(ascending=True))
        tm.assert_series_equal(descending, g.ngroup(ascending=False))

    def test_ngroup_matches_cumcount(self):
        # verify one manually-worked out case works
        df = DataFrame(
            [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
            columns=["A", "X"],
        )
        g = df.groupby(["A", "X"])
        g_ngroup = g.ngroup()
        g_cumcount = g.cumcount()
        expected_ngroup = Series([0, 1, 2, 0, 3])
        expected_cumcount = Series([0, 0, 0, 1, 0])

        tm.assert_series_equal(g_ngroup, expected_ngroup)
        tm.assert_series_equal(g_cumcount, expected_cumcount)

    def test_ngroup_cumcount_pair(self):
        # brute force comparison for all small series
        for p in product(range(3), repeat=4):
            df = DataFrame({"a": p})
            g = df.groupby(["a"])

            order = sorted(set(p))
            ngroupd = [order.index(val) for val in p]
            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]

            tm.assert_series_equal(g.ngroup(), Series(ngroupd))
            tm.assert_series_equal(g.cumcount(), Series(cumcounted))
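
    # Worked instance (added commentary, not part of the original commit)
    # of the brute-force oracle above, for p = (2, 0, 2, 0):
    #   order = sorted(set(p)) == [0, 2]
    #   ngroupd == [order.index(v) for v in p] == [1, 0, 1, 0]
    #   cumcounted == [p[:i].count(v) for i, v in enumerate(p)] == [0, 0, 1, 1]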

    def test_ngroup_respects_groupby_order(self, sort):
        df = DataFrame({"a": np.random.default_rng(2).choice(list("abcdef"), 100)})
        g = df.groupby("a", sort=sort)
        df["group_id"] = -1
        df["group_index"] = -1

        for i, (_, group) in enumerate(g):
            df.loc[group.index, "group_id"] = i
            for j, ind in enumerate(group.index):
                df.loc[ind, "group_index"] = j

        tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
        tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())

    @pytest.mark.parametrize(
        "datetimelike",
        [
            [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
            [Timedelta(x, unit="h") for x in range(1, 4)],
            [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
        ],
    )
    def test_count_with_datetimelike(self, datetimelike):
        # test for #13393, where DataFrameGroupBy.count() fails
        # when counting a datetimelike column.

        df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
        res = df.groupby("x").count()
        expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
        expected.index.name = "x"
        tm.assert_frame_equal(expected, res)

    def test_count_with_only_nans_in_first_group(self):
        # GH21956
        df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
        result = df.groupby(["A", "B"]).C.count()
        mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
        expected = Series([], index=mi, dtype=np.int64, name="C")
        tm.assert_series_equal(result, expected, check_index_type=False)

    def test_count_groupby_column_with_nan_in_groupby_column(self):
        # https://github.com/pandas-dev/pandas/issues/32841
        df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]})
        res = df.groupby(["B"]).count()
        expected = DataFrame(
            index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
        )
        tm.assert_frame_equal(expected, res)

    def test_groupby_count_dateparseerror(self):
        dr = date_range(start="1/1/2012", freq="5min", periods=10)

        # BAD Example, datetimes first
        ser = Series(np.arange(10), index=[dr, np.arange(10)])
        grouped = ser.groupby(lambda x: x[1] % 2 == 0)
        result = grouped.count()

        ser = Series(np.arange(10), index=[np.arange(10), dr])
        grouped = ser.groupby(lambda x: x[0] % 2 == 0)
        expected = grouped.count()

        tm.assert_series_equal(result, expected)


def test_groupby_timedelta_cython_count():
    df = DataFrame(
        {"g": list("ab" * 2), "delta": np.arange(4).astype("timedelta64[ns]")}
    )
    expected = Series([2, 2], index=Index(["a", "b"], name="g"), name="delta")
    result = df.groupby("g").delta.count()
    tm.assert_series_equal(expected, result)


def test_count():
    n = 1 << 15
    dr = date_range("2015-08-30", periods=n // 10, freq="min")

    df = DataFrame(
        {
            "1st": np.random.default_rng(2).choice(list(ascii_lowercase), n),
            "2nd": np.random.default_rng(2).integers(0, 5, n),
            "3rd": np.random.default_rng(2).standard_normal(n).round(3),
            "4th": np.random.default_rng(2).integers(-10, 10, n),
            "5th": np.random.default_rng(2).choice(dr, n),
            "6th": np.random.default_rng(2).standard_normal(n).round(3),
            "7th": np.random.default_rng(2).standard_normal(n).round(3),
            "8th": np.random.default_rng(2).choice(dr, n)
            - np.random.default_rng(2).choice(dr, 1),
            "9th": np.random.default_rng(2).choice(list(ascii_lowercase), n),
        }
    )

    for col in df.columns.drop(["1st", "2nd", "4th"]):
        df.loc[np.random.default_rng(2).choice(n, n // 10), col] = np.nan

    df["9th"] = df["9th"].astype("category")

    for key in ["1st", "2nd", ["1st", "2nd"]]:
        left = df.groupby(key).count()
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
        tm.assert_frame_equal(left, right)


def test_count_non_nulls():
    # GH#5610
    # count counts non-nulls
    df = DataFrame(
        [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]],
        columns=["A", "B", "C"],
    )

    count_as = df.groupby("A").count()
    count_not_as = df.groupby("A", as_index=False).count()

    expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
    expected.index.name = "A"
    tm.assert_frame_equal(count_not_as, expected.reset_index())
    tm.assert_frame_equal(count_as, expected)

    count_B = df.groupby("A")["B"].count()
    tm.assert_series_equal(count_B, expected["B"])


def test_count_object():
    df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
    result = df.groupby("c").a.count()
    expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
    tm.assert_series_equal(result, expected)

    df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
    result = df.groupby("c").a.count()
    expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
    tm.assert_series_equal(result, expected)


def test_count_cross_type():
    # GH8169
    # Set float64 dtype to avoid upcast when setting nan below
    vals = np.hstack(
        (
            np.random.default_rng(2).integers(0, 5, (100, 2)),
            np.random.default_rng(2).integers(0, 2, (100, 2)),
        )
    ).astype("float64")

    df = DataFrame(vals, columns=["a", "b", "c", "d"])
    df[df == 2] = np.nan
    expected = df.groupby(["c", "d"]).count()

    for t in ["float32", "object"]:
        df["a"] = df["a"].astype(t)
        df["b"] = df["b"].astype(t)
        result = df.groupby(["c", "d"]).count()
        tm.assert_frame_equal(result, expected)


def test_lower_int_prec_count():
    df = DataFrame(
        {
            "a": np.array([0, 1, 2, 100], np.int8),
            "b": np.array([1, 2, 3, 6], np.uint32),
            "c": np.array([4, 5, 6, 8], np.int16),
            "grp": list("ab" * 2),
        }
    )
    result = df.groupby("grp").count()
    expected = DataFrame(
        {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=Index(list("ab"), name="grp")
    )
    tm.assert_frame_equal(result, expected)


def test_count_uses_size_on_exception():
    class RaisingObjectException(Exception):
        pass

    class RaisingObject:
        def __init__(self, msg="I will raise inside Cython") -> None:
            super().__init__()
            self.msg = msg

        def __eq__(self, other):
            # gets called in Cython to check that raising calls the method
            raise RaisingObjectException(self.msg)

    df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)})
    result = df.groupby("grp").count()
    expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp"))
    tm.assert_frame_equal(result, expected)


def test_count_arrow_string_array(any_string_dtype):
    # GH#54751
    pytest.importorskip("pyarrow")
    df = DataFrame(
        {"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)}
    )
    result = df.groupby("a").count()
    expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a"))
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,319 @@
import numpy as np
import pytest

from pandas.errors import UnsupportedFunctionCall
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame,
    Series,
)
import pandas._testing as tm


@pytest.fixture(
    params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
    ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
)
def dtypes_for_minmax(request):
    """
    Fixture of dtypes with min and max values used for testing
    cummin and cummax
    """
    dtype = request.param

    np_type = dtype
    if dtype == "Int64":
        np_type = np.int64
    elif dtype == "Float64":
        np_type = np.float64

    min_val = (
        np.iinfo(np_type).min
        if np.dtype(np_type).kind == "i"
        else np.finfo(np_type).min
    )
    max_val = (
        np.iinfo(np_type).max
        if np.dtype(np_type).kind == "i"
        else np.finfo(np_type).max
    )

    return (dtype, min_val, max_val)


def test_groupby_cumprod():
    # GH 4095
    df = DataFrame({"key": ["b"] * 10, "value": 2})

    actual = df.groupby("key")["value"].cumprod()
    expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
    expected.name = "value"
    tm.assert_series_equal(actual, expected)

    df = DataFrame({"key": ["b"] * 100, "value": 2})
    df["value"] = df["value"].astype(float)
    actual = df.groupby("key")["value"].cumprod()
    expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
    expected.name = "value"
    tm.assert_series_equal(actual, expected)


@pytest.mark.skip_ubsan
def test_groupby_cumprod_overflow():
    # GH#37493 if we overflow we return garbage consistent with numpy
    df = DataFrame({"key": ["b"] * 4, "value": 100_000})
    actual = df.groupby("key")["value"].cumprod()
    expected = Series(
        [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
        name="value",
    )
    tm.assert_series_equal(actual, expected)

    numpy_result = df.groupby("key", group_keys=False)["value"].apply(
        lambda x: x.cumprod()
    )
    numpy_result.name = "value"
    tm.assert_series_equal(actual, numpy_result)
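
# Worked check (added commentary, not part of the original commit): the
# "garbage" fourth element above is plain int64 wrap-around, consistent
# with numpy: 100_000 ** 4 == 10**20 exceeds 2**63 - 1, and reduced
# modulo 2**64 it lands exactly on the expected value.
assert (100_000**4) % (2**64) == 7766279631452241920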


def test_groupby_cumprod_nan_influences_other_columns():
    # GH#48064
    df = DataFrame(
        {
            "a": 1,
            "b": [1, np.nan, 2],
            "c": [1, 2, 3.0],
        }
    )
    result = df.groupby("a").cumprod(numeric_only=True, skipna=False)
    expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]})
    tm.assert_frame_equal(result, expected)


def test_cummin(dtypes_for_minmax):
    dtype = dtypes_for_minmax[0]
    min_val = dtypes_for_minmax[1]

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_mins}).astype(dtype)
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ min value for dtype
    df.loc[[2, 6], "B"] = min_val
    df.loc[[1, 5], "B"] = min_val + 1
    expected.loc[[2, 3, 6, 7], "B"] = min_val
    expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected, check_exact=True)
    expected = (
        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    )
    tm.assert_frame_equal(result, expected, check_exact=True)

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
    result = base_df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    expected = (
        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummin()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
    result = df.groupby("a").b.cummin()
    expected = Series([1, 2, 1], name="b")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
def test_cummin_max_all_nan_column(method, dtype):
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
    base_df["B"] = base_df["B"].astype(dtype)
    grouped = base_df.groupby("A")

    expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype)
    result = getattr(grouped, method)()
    tm.assert_frame_equal(expected, result)

    result = getattr(grouped["B"], method)().to_frame()
    tm.assert_frame_equal(expected, result)


def test_cummax(dtypes_for_minmax):
    dtype = dtypes_for_minmax[0]
    max_val = dtypes_for_minmax[2]

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_maxs}).astype(dtype)
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ max value for dtype
    df.loc[[2, 6], "B"] = max_val
    expected.loc[[2, 3, 6, 7], "B"] = max_val
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = (
        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
    result = base_df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = (
        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummax()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
    result = df.groupby("a").b.cummax()
    expected = Series([2, 1, 2], name="b")
    tm.assert_series_equal(result, expected)


def test_cummax_i8_at_implementation_bound():
    # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT
    # for int64 dtype GH#46382
    ser = Series([pd.NaT._value + n for n in range(5)])
    df = DataFrame({"A": 1, "B": ser, "C": ser._values.view("M8[ns]")})
    gb = df.groupby("A")

    res = gb.cummax()
    exp = df[["B", "C"]]
    tm.assert_frame_equal(res, exp)
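
# Context note (added commentary, not part of the original commit): the
# implementation bound referenced above is NPY_NAT, numpy's NaT sentinel,
# which equals iinfo(int64).min:
assert pd.NaT._value == np.iinfo(np.int64).min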


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
    "groups,expected_data",
    [
        ([1, 1, 1], [1, None, None]),
        ([1, 2, 3], [1, None, 2]),
        ([1, 3, 3], [1, None, None]),
    ],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
    # GH-34047
    df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
    orig = df.copy()
    gb = df.groupby(groups)["a"]

    result = getattr(gb, method)(skipna=False)
    expected = Series(expected_data, dtype=dtype, name="a")

    # check we didn't accidentally alter df
    tm.assert_frame_equal(df, orig)

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_max_skipna_multiple_cols(method):
    # Ensure missing value in "a" doesn't cause "b" to be nan-filled
    df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
    gb = df.groupby([1, 1, 1])[["a", "b"]]

    result = getattr(gb, method)(skipna=False)
    expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
    # see gh-12811
    df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    g = df.groupby("A")

    msg = "numpy operations are not valid with groupby"

    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(1, 2, 3)
    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(foo=1)


@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
    "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
)
def test_nullable_int_not_cast_as_float(method, dtype, val):
    data = [val, pd.NA]
    df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
    grouped = df.groupby("grp")

    result = grouped.transform(method)
    expected = DataFrame({"b": data}, dtype=dtype)

    tm.assert_frame_equal(result, expected)
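
# Worked note (added commentary, not part of the original commit):
# 2**53 + 1 is the first integer float64 cannot represent exactly, so an
# accidental cast to float would silently corrupt the value checked above.
assert float(2**53 + 1) == float(2**53)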


def test_cython_api2():
    # this takes the fast apply path

    # cumsum (GH5614)
    df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
    expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
    result = df.groupby("A").cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    result = df.groupby("A", as_index=False).cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 13994
    msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.groupby("A").cumsum(axis=1)
    expected = df.cumsum(axis=1)
    tm.assert_frame_equal(result, expected)

    msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.groupby("A").cumprod(axis=1)
    expected = df.cumprod(axis=1)
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,636 @@
from string import ascii_lowercase

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Series,
    Timestamp,
)
import pandas._testing as tm


def test_filter_series():
    s = Series([1, 3, 20, 5, 22, 24, 7])
    expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = Series([20, 22, 24], index=[2, 4, 5])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
    # Test dropna=False.
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(s.index),
    )
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(s.index),
    )
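
# Minimal sketch (added commentary, not part of the original commit) of
# the semantics exercised above: filter drops every row of a group whose
# aggregate fails the predicate, preserving the original index order.
_s = Series([1, 3, 20, 5])
_kept = _s.groupby(_s % 2).filter(lambda x: x.mean() < 10)
assert list(_kept) == [1, 3, 5]  # the even group ([20]) is dropped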


def test_filter_single_column_df():
    df = DataFrame([1, 3, 20, 5, 22, 24, 7])
    expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = DataFrame([20, 22, 24], index=[2, 4, 5])
    grouper = df[0].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
    tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
    # Test dropna=False.
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(df.index),
    )
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(df.index),
    )


def test_filter_multi_column_df():
    df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
    grouper = df["A"].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
    tm.assert_frame_equal(
        grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected
    )


def test_filter_mixed_df():
    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    grouper = df["A"].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
    tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected)


def test_filter_out_all_groups():
    s = Series([1, 3, 20, 5, 22, 24, 7])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    grouper = df["A"].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]])


def test_filter_out_no_groups():
    s = Series([1, 3, 20, 5, 22, 24, 7])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    filtered = grouped.filter(lambda x: x.mean() > 0)
    tm.assert_series_equal(filtered, s)
    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    grouper = df["A"].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    filtered = grouped.filter(lambda x: x["A"].mean() > 0)
    tm.assert_frame_equal(filtered, df)


def test_filter_out_all_groups_in_df():
    # GH12768
    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    res = df.groupby("a")
    res = res.filter(lambda x: x["b"].sum() > 5, dropna=False)
    expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
    tm.assert_frame_equal(expected, res)

    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    res = df.groupby("a")
    res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
    expected = DataFrame({"a": [], "b": []}, dtype="int64")
    tm.assert_frame_equal(expected, res)


def test_filter_condition_raises():
    def raise_if_sum_is_zero(x):
        if x.sum() == 0:
            raise ValueError
        return x.sum() > 0

    s = Series([-1, 0, 1, 2])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        grouped.filter(raise_if_sum_is_zero)


def test_filter_with_axis_in_groupby():
    # issue 11041
    index = pd.MultiIndex.from_product([range(10), [0, 1]])
    data = DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64")

    msg = "DataFrame.groupby with axis=1"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = data.groupby(level=0, axis=1)
    result = gb.filter(lambda x: x.iloc[0, 0] > 10)
    expected = data.iloc[:, 12:20]
    tm.assert_frame_equal(result, expected)


def test_filter_bad_shapes():
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    g_df = df.groupby("B")
    g_s = s.groupby(s)

    f = lambda x: x
    msg = "filter function returned a DataFrame, but expected a scalar bool"
    with pytest.raises(TypeError, match=msg):
        g_df.filter(f)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        g_s.filter(f)

    f = lambda x: x == 1
    msg = "filter function returned a DataFrame, but expected a scalar bool"
    with pytest.raises(TypeError, match=msg):
        g_df.filter(f)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        g_s.filter(f)

    f = lambda x: np.outer(x, x)
    msg = "can't multiply sequence by non-int of type 'str'"
    with pytest.raises(TypeError, match=msg):
        g_df.filter(f)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        g_s.filter(f)


def test_filter_nan_is_false():
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    g_df = df.groupby(df["B"])
    g_s = s.groupby(s)

    f = lambda x: np.nan
    tm.assert_frame_equal(g_df.filter(f), df.loc[[]])
    tm.assert_series_equal(g_s.filter(f), s[[]])


def test_filter_pdna_is_false():
    # in particular, don't raise in filter trying to call bool(pd.NA)
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
|
||||
ser = df["B"]
|
||||
g_df = df.groupby(df["B"])
|
||||
g_s = ser.groupby(ser)
|
||||
|
||||
func = lambda x: pd.NA
|
||||
res = g_df.filter(func)
|
||||
tm.assert_frame_equal(res, df.loc[[]])
|
||||
res = g_s.filter(func)
|
||||
tm.assert_series_equal(res, ser[[]])
|
||||
|
||||
|
||||
def test_filter_against_workaround_ints():
|
||||
# Series of ints
|
||||
s = Series(np.random.default_rng(2).integers(0, 100, 100))
|
||||
grouper = s.apply(lambda x: np.round(x, -1))
|
||||
grouped = s.groupby(grouper)
|
||||
f = lambda x: x.mean() > 10
|
||||
|
||||
old_way = s[grouped.transform(f).astype("bool")]
|
||||
new_way = grouped.filter(f)
|
||||
tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
|
||||
|
||||
|
||||
def test_filter_against_workaround_floats():
|
||||
# Series of floats
|
||||
s = 100 * Series(np.random.default_rng(2).random(100))
|
||||
grouper = s.apply(lambda x: np.round(x, -1))
|
||||
grouped = s.groupby(grouper)
|
||||
f = lambda x: x.mean() > 10
|
||||
old_way = s[grouped.transform(f).astype("bool")]
|
||||
new_way = grouped.filter(f)
|
||||
tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
|
||||
|
||||
|
||||
def test_filter_against_workaround_dataframe():
|
||||
# Set up DataFrame of ints, floats, strings.
|
||||
letters = np.array(list(ascii_lowercase))
|
||||
N = 100
|
||||
random_letters = letters.take(
|
||||
np.random.default_rng(2).integers(0, 26, N, dtype=int)
|
||||
)
|
||||
df = DataFrame(
|
||||
{
|
||||
"ints": Series(np.random.default_rng(2).integers(0, 100, N)),
|
||||
"floats": N / 10 * Series(np.random.default_rng(2).random(N)),
|
||||
"letters": Series(random_letters),
|
||||
}
|
||||
)
|
||||
|
||||
# Group by ints; filter on floats.
|
||||
grouped = df.groupby("ints")
|
||||
old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")]
|
||||
new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
|
||||
tm.assert_frame_equal(new_way, old_way)
|
||||
|
||||
# Group by floats (rounded); filter on strings.
|
||||
grouper = df.floats.apply(lambda x: np.round(x, -1))
|
||||
grouped = df.groupby(grouper)
|
||||
old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")]
|
||||
new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
|
||||
tm.assert_frame_equal(new_way, old_way)
|
||||
|
||||
# Group by strings; filter on ints.
|
||||
grouped = df.groupby("letters")
|
||||
old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")]
|
||||
new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
|
||||
tm.assert_frame_equal(new_way, old_way)
|
||||
|
||||
|
||||
def test_filter_using_len():
|
||||
# BUG GH4447
|
||||
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
|
||||
grouped = df.groupby("B")
|
||||
actual = grouped.filter(lambda x: len(x) > 2)
|
||||
expected = DataFrame(
|
||||
{"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
|
||||
index=np.arange(2, 6, dtype=np.int64),
|
||||
)
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
actual = grouped.filter(lambda x: len(x) > 4)
|
||||
expected = df.loc[[]]
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
# Series have always worked properly, but we'll test anyway.
|
||||
s = df["B"]
|
||||
grouped = s.groupby(s)
|
||||
actual = grouped.filter(lambda x: len(x) > 2)
|
||||
expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B")
|
||||
tm.assert_series_equal(actual, expected)
|
||||
|
||||
actual = grouped.filter(lambda x: len(x) > 4)
|
||||
expected = s[[]]
|
||||
tm.assert_series_equal(actual, expected)
|
||||
|
||||
|
||||
def test_filter_maintains_ordering():
|
||||
# Simple case: index is sequential. #4621
|
||||
df = DataFrame(
|
||||
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
|
||||
)
|
||||
s = df["pid"]
|
||||
grouped = df.groupby("tag")
|
||||
actual = grouped.filter(lambda x: len(x) > 1)
|
||||
expected = df.iloc[[1, 2, 4, 7]]
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
grouped = s.groupby(df["tag"])
|
||||
actual = grouped.filter(lambda x: len(x) > 1)
|
||||
expected = s.iloc[[1, 2, 4, 7]]
|
||||
tm.assert_series_equal(actual, expected)
|
||||
|
||||
# Now index is sequentially decreasing.
|
||||
df.index = np.arange(len(df) - 1, -1, -1)
|
||||
s = df["pid"]
|
||||
grouped = df.groupby("tag")
|
||||
actual = grouped.filter(lambda x: len(x) > 1)
|
||||
expected = df.iloc[[1, 2, 4, 7]]
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
grouped = s.groupby(df["tag"])
|
||||
actual = grouped.filter(lambda x: len(x) > 1)
|
||||
expected = s.iloc[[1, 2, 4, 7]]
|
||||
tm.assert_series_equal(actual, expected)
|
||||
|
||||
# Index is shuffled.
|
||||
SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
|
||||
df.index = df.index[SHUFFLED]
|
||||
s = df["pid"]
|
||||
grouped = df.groupby("tag")
|
||||
actual = grouped.filter(lambda x: len(x) > 1)
|
||||
expected = df.iloc[[1, 2, 4, 7]]
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
grouped = s.groupby(df["tag"])
|
||||
actual = grouped.filter(lambda x: len(x) > 1)
|
||||
expected = s.iloc[[1, 2, 4, 7]]
|
||||
tm.assert_series_equal(actual, expected)
|
||||
|
||||
|
||||
def test_filter_multiple_timestamp():
|
||||
# GH 10114
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": np.arange(5, dtype="int64"),
|
||||
"B": ["foo", "bar", "foo", "bar", "bar"],
|
||||
"C": Timestamp("20130101"),
|
||||
}
|
||||
)
|
||||
|
||||
grouped = df.groupby(["B", "C"])
|
||||
|
||||
result = grouped["A"].filter(lambda x: True)
|
||||
tm.assert_series_equal(df["A"], result)
|
||||
|
||||
result = grouped["A"].transform(len)
|
||||
expected = Series([2, 3, 2, 3, 3], name="A")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = grouped.filter(lambda x: True)
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
result = grouped.transform("sum")
|
||||
expected = DataFrame({"A": [2, 8, 2, 8, 8]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = grouped.transform(len)
|
||||
expected = DataFrame({"A": [2, 3, 2, 3, 3]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_int_index():
    # GH4620
    index = [1, 1, 1, 2, 1, 1, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to avoid upcast when setting nan below
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_multiple_non_unique_int_index():
    # GH4620
    index = [1, 1, 1, 2, 0, 0, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to avoid upcast when setting nan below
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_non_unique_float_index():
    # GH4620
    index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to avoid upcast when setting nan below
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_non_unique_timestamp_index():
    # GH4620
    t0 = Timestamp("2013-09-30 00:05:00")
    t1 = Timestamp("2013-10-30 00:05:00")
    t2 = Timestamp("2013-11-30 00:05:00")
    index = [t1, t1, t1, t2, t1, t1, t0, t1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to avoid upcast when setting nan below
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_non_unique_string_index():
    # GH4620
    index = list("bbbcbbab")
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to avoid upcast when setting nan below
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_has_access_to_grouped_cols():
    df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")
    # previously didn't have access to col A #????
    filt = g.filter(lambda x: x["A"].sum() == 2)
    tm.assert_frame_equal(filt, df.iloc[[0, 1]])


def test_filter_enforces_scalarness():
    df = DataFrame(
        [
            ["best", "a", "x"],
            ["worst", "b", "y"],
            ["best", "c", "x"],
            ["best", "d", "y"],
            ["worst", "d", "y"],
            ["worst", "d", "y"],
            ["best", "d", "z"],
        ],
        columns=["a", "b", "c"],
    )
    with pytest.raises(TypeError, match="filter function returned a.*"):
        df.groupby("c").filter(lambda g: g["a"] == "best")


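# As with the scalarness check above, the filter function must reduce each
# group to a single boolean; returning a non-boolean scalar (here a float from
# ``g.c.mean()``) raises TypeError. A passing variant would reduce explicitly,
# e.g. ``lambda g: g.c.mean() > 0``.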
def test_filter_non_bool_raises():
    df = DataFrame(
        [
            ["best", "a", 1],
            ["worst", "b", 1],
            ["best", "c", 1],
            ["best", "d", 1],
            ["worst", "d", 1],
            ["worst", "d", 1],
            ["best", "d", 1],
        ],
        columns=["a", "b", "c"],
    )
    with pytest.raises(TypeError, match="filter function returned a.*"):
        df.groupby("a").filter(lambda g: g.c.mean())


def test_filter_dropna_with_empty_groups():
    # GH 10780
    data = Series(np.random.default_rng(2).random(9), index=np.repeat([1, 2, 3], 3))
    grouped = data.groupby(level=0)
    result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
    expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
    tm.assert_series_equal(result_false, expected_false)

    result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
    expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64)
    tm.assert_series_equal(result_true, expected_true)


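# GH 17091: filter should return the same result before and after an unrelated
# aggregation on the same GroupBy object -- the intermediate ``sum()`` call
# must not change what a subsequent ``filter`` sees.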
def test_filter_consistent_result_before_after_agg_func():
    # GH 17091
    df = DataFrame({"data": range(6), "key": list("ABCABC")})
    grouper = df.groupby("key")
    result = grouper.filter(lambda x: True)
    expected = DataFrame({"data": range(6), "key": list("ABCABC")})
    tm.assert_frame_equal(result, expected)

    grouper.sum()
    result = grouper.filter(lambda x: True)
    tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large
@@ -0,0 +1,696 @@
import numpy as np
import pytest

from pandas.compat.pyarrow import pa_version_under10p1

from pandas.core.dtypes.missing import na_value_for_dtype

import pandas as pd
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


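# Hand-check of the dropna=True expectations below: rows 0 and 3 both key to
# ("A", "B"), so summing gives c=12+1=13.0, d=12+1=13.0, e=12+1=13.0; row 2 is
# the lone ("B", "A") row. With dropna=False the ("A", NA) row 1 additionally
# survives as its own group.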
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [13.0, 233.0, 123.0],
                "e": [13.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
    dropna, tuples, outputs, nulls_fixture
):
    # GH 3729: all the NA keys fall into a single group
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    grouped = df.groupby(["a", "b"], dropna=dropna).sum()

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # By default a MultiIndex drops NA from its levels when created via
    # `from_*`, so we need to add the NA back to the level manually afterwards.
    if not dropna:
        mi = mi.set_levels(["A", "B", np.nan], level="b")
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
            {
                "c": [12.0, 13.3, 123.23, 1.0],
                "d": [12.0, 234.0, 123.0, 1.0],
                "e": [12.0, 13.0, 1.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
    dropna, tuples, outputs, nulls_fixture, nulls_fixture2
):
    # GH 3729: NA keys land in two different groups, with two different
    # null representations
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        [nulls_fixture2, "B", 1, 1, 1.0],
        ["A", nulls_fixture2, 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    grouped = df.groupby(["a", "b"], dropna=dropna).sum()

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # By default a MultiIndex drops NA from its levels when created via
    # `from_*`, so we need to add the NA back to the levels manually afterwards.
    if not dropna:
        mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.parametrize(
    "dropna, idx, outputs",
    [
        (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
        (
            False,
            ["A", "B", np.nan],
            {
                "b": [123.23, 13.0, 12.3],
                "c": [123.0, 13.0, 233.0],
                "d": [1.0, 13.0, 12.0],
            },
        ),
    ],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
    # GH 3729
    df_list = [
        ["B", 12, 12, 12],
        [None, 12.3, 233.0, 12],
        ["A", 123.23, 123, 1],
        ["B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
    grouped = df.groupby("a", dropna=dropna).sum()

    expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.parametrize(
    "dropna, idx, expected",
    [
        (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
        (
            False,
            ["a", "a", "b", np.nan],
            pd.Series([3, 3, 3], index=["a", "b", np.nan]),
        ),
    ],
)
def test_groupby_dropna_series_level(dropna, idx, expected):
    ser = pd.Series([1, 2, 3, 3], index=idx)

    result = ser.groupby(level=0, dropna=dropna).sum()
    tm.assert_series_equal(result, expected)


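# Hand-check for the means below: by-labels ["a", "b", "a", np.nan] give
# a -> (390 + 30) / 2 = 210, b -> 350, and (with dropna=False) NaN -> 20.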
@pytest.mark.parametrize(
    "dropna, expected",
    [
        (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
        (
            False,
            pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
        ),
    ],
)
def test_groupby_dropna_series_by(dropna, expected):
    ser = pd.Series(
        [390.0, 350.0, 30.0, 20.0],
        index=["Falcon", "Falcon", "Parrot", "Parrot"],
        name="Max Speed",
    )

    result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("dropna", (False, True))
def test_grouper_dropna_propagation(dropna):
    # GH 36604
    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
    gb = df.groupby("A", dropna=dropna)
    assert gb._grouper.dropna == dropna


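# Note: the test below takes ``dropna`` without parametrizing it itself; it is
# presumably supplied by a shared True/False fixture from the suite's conftest.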
@pytest.mark.parametrize(
    "index",
    [
        pd.RangeIndex(0, 4),
        list("abcd"),
        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
    ],
)
def test_groupby_dataframe_slice_then_transform(dropna, index):
    # GH35014 & GH35612
    expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}

    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
    gb = df.groupby("A", dropna=dropna)

    result = gb.transform(len)
    expected = pd.DataFrame(expected_data, index=index)
    tm.assert_frame_equal(result, expected)

    result = gb[["B"]].transform(len)
    expected = pd.DataFrame(expected_data, index=index)
    tm.assert_frame_equal(result, expected)

    result = gb["B"].transform(len)
    expected = pd.Series(expected_data["B"], index=index, name="B")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [12.0, 233.0, 123.0],
                "e": [1.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
    # GH 3729
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", None, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    agg_dict = {"c": "sum", "d": "max", "e": "min"}
    grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict)

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # By default a MultiIndex drops NA from its levels when created via
    # `from_*`, so we need to add the NA back to the level manually afterwards.
    if not dropna:
        mi = mi.set_levels(["A", "B", np.nan], level="b")
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.arm_slow
@pytest.mark.parametrize(
    "datetime1, datetime2",
    [
        (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
        (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
        (pd.Period("2020-01-01"), pd.Period("2020-02-01")),
    ],
)
@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
    dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
    # GH 3729
    df = pd.DataFrame(
        {
            "values": [1, 2, 3, 4, 5, 6],
            "dt": [
                datetime1,
                unique_nulls_fixture,
                datetime2,
                unique_nulls_fixture2,
                datetime1,
                datetime1,
            ],
        }
    )

    if dropna:
        indexes = [datetime1, datetime2]
    else:
        indexes = [datetime1, datetime2, np.nan]

    grouped = df.groupby("dt", dropna=dropna).agg({"values": "sum"})
    expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.parametrize(
    "dropna, data, selected_data, levels",
    [
        pytest.param(
            False,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            ["a", "b", np.nan],
            id="dropna_false_has_nan",
        ),
        pytest.param(
            True,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0]},
            None,
            id="dropna_true_has_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should give the same result
            False,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_false_no_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should give the same result
            True,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_true_no_nan",
        ),
    ],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
    # GH 35889

    df = pd.DataFrame(data)
    gb = df.groupby("groups", dropna=dropna)
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))

    mi_tuples = tuple(zip(data["groups"], selected_data["values"]))
    mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
    # By default a MultiIndex drops NA from its levels when created via
    # `from_*`, so we need to add the NA back to the level manually afterwards.
    if not dropna and levels:
        mi = mi.set_levels(levels, level="groups")

    expected = pd.DataFrame(selected_data, index=mi)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]])
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
@pytest.mark.parametrize("series", [True, False])
def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
    # GH#46783
    obj = pd.DataFrame(
        {
            "a": [1, np.nan],
            "b": [1, 1],
            "c": [2, 3],
        }
    )

    expected = obj.set_index(keys)
    if series:
        expected = expected["c"]
    elif input_index == ["a", "b"] and keys == ["a"]:
        # Column b should not be aggregated
        expected = expected[["c"]]

    if input_index is not None:
        obj = obj.set_index(input_index)
    gb = obj.groupby(keys, dropna=False)
    if series:
        gb = gb["c"]
    result = gb.sum()

    tm.assert_equal(result, expected)


def test_groupby_nan_included():
    # GH 35646
    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
    df = pd.DataFrame(data)
    grouped = df.groupby("group", dropna=False)
    result = grouped.indices
    dtype = np.intp
    expected = {
        "g1": np.array([0, 2], dtype=dtype),
        "g2": np.array([3], dtype=dtype),
        np.nan: np.array([1, 4], dtype=dtype),
    }
    for result_values, expected_values in zip(result.values(), expected.values()):
        tm.assert_numpy_array_equal(result_values, expected_values)
    assert np.isnan(list(result.keys())[2])
    assert list(result.keys())[0:2] == ["g1", "g2"]


def test_groupby_drop_nan_with_multi_index():
    # GH 39895
    df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"])
    df = df.set_index(["a", "b"])
    result = df.groupby(["a", "b"], dropna=False).first()
    expected = df
    tm.assert_frame_equal(result, expected)


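# Decoding example for the comment below: sequence_index is read as four
# base-3 digits, least significant first, so 5 = 2 + 1*3 maps to digits
# [2, 1, 0, 0] and hence to the grouper sequence "zyxx".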
# sequence_index enumerates all strings made up of x, y, z of length 4
@pytest.mark.parametrize("sequence_index", range(3**4))
@pytest.mark.parametrize(
    "dtype",
    [
        None,
        "UInt8",
        "Int8",
        "UInt16",
        "Int16",
        "UInt32",
        "Int32",
        "UInt64",
        "Int64",
        "Float32",
        "Float64",
        "category",
        "string",
        pytest.param(
            "string[pyarrow]",
            marks=pytest.mark.skipif(
                pa_version_under10p1, reason="pyarrow is not installed"
            ),
        ),
        "datetime64[ns]",
        "period[d]",
        "Sparse[float]",
    ],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index):
    # GH#46584, GH#48794

    # Convert sequence_index into a string sequence, e.g. 5 becomes "zyxx"
    # This sequence is used for the grouper.
    sequence = "".join(
        [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
    )

    # Unique values to use for grouper, depends on dtype
    if dtype in ("string", "string[pyarrow]"):
        uniques = {"x": "x", "y": "y", "z": pd.NA}
    elif dtype in ("datetime64[ns]", "period[d]"):
        uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
    else:
        uniques = {"x": 1, "y": 2, "z": np.nan}

    df = pd.DataFrame(
        {
            "key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
            "a": [0, 1, 2, 3],
        }
    )
    gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False)
    if test_series:
        gb = gb["a"]
    result = gb.sum()

    # Manually compute the groupby sum, using the labels "x", "y", and "z" to
    # avoid issues with hashing np.nan
    summed = {}
    for idx, label in enumerate(sequence):
        summed[label] = summed.get(label, 0) + idx
    if dtype == "category":
        index = pd.CategoricalIndex(
            [uniques[e] for e in summed],
            df["key"].cat.categories,
            name="key",
        )
    elif isinstance(dtype, str) and dtype.startswith("Sparse"):
        index = pd.Index(
            pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
        )
    else:
        index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
    expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
    if not test_series:
        expected = expected.to_frame()
    if not as_index:
        expected = expected.reset_index()
        if dtype is not None and dtype.startswith("Sparse"):
            expected["key"] = expected["key"].astype(dtype)

    tm.assert_equal(result, expected)


@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("dtype", [object, None])
def test_null_is_null_for_dtype(
    sort, dtype, nulls_fixture, nulls_fixture2, test_series
):
    # GH#48506 - groups should always result in using the null for the dtype
    df = pd.DataFrame({"a": [1, 2]})
    groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
    obj = df["a"] if test_series else df
    gb = obj.groupby(groups, dropna=False, sort=sort)
    result = gb.sum()
    index = pd.Index([na_value_for_dtype(groups.dtype)])
    expected = pd.DataFrame({"a": [3]}, index=index)
    if test_series:
        tm.assert_series_equal(result, expected["a"])
    else:
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind):
    # Ensure there is at least one null value by appending to the end
    values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
    )

    # Strategy: Compare to dropna=True by filling null values with a new code
    df_filled = df.copy()
    df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)

    if index_kind == "range":
        keys = ["x"]
    elif index_kind == "single":
        keys = ["x"]
        df = df.set_index("x")
        df_filled = df_filled.set_index("x")
    else:
        keys = ["x", "x2"]
        df["x2"] = df["x"]
        df = df.set_index(["x", "x2"])
        df_filled["x2"] = df_filled["x"]
        df_filled = df_filled.set_index(["x", "x2"])
    args = get_groupby_method_args(reduction_func, df)
    args_filled = get_groupby_method_args(reduction_func, df_filled)
    if reduction_func == "corrwith" and index_kind == "range":
        # Don't include the grouping columns so we can call reset_index
        args = (args[0].drop(columns=keys),)
        args_filled = (args_filled[0].drop(columns=keys),)

    gb_keepna = df.groupby(
        keys, dropna=False, observed=observed, sort=sort, as_index=as_index
    )

    if not observed and reduction_func in ["idxmin", "idxmax"]:
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            getattr(gb_keepna, reduction_func)(*args)
        return

    gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
    expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()
    expected["x"] = expected["x"].cat.remove_categories([4])
    if index_kind == "multi":
        expected["x2"] = expected["x2"].cat.remove_categories([4])
    if as_index:
        if index_kind == "multi":
            expected = expected.set_index(["x", "x2"])
        else:
            expected = expected.set_index("x")
    elif index_kind != "range" and reduction_func != "size":
        # size, unlike other methods, has the desired behavior in GH#49519
        expected = expected.drop(columns="x")
        if index_kind == "multi":
            expected = expected.drop(columns="x2")
    if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
        # expected was computed with a RangeIndex; need to translate to index values
        values = expected["y"].values.tolist()
        if index_kind == "single":
            values = [np.nan if e == 4 else e for e in values]
            expected["y"] = pd.Categorical(values, categories=[1, 2, 3])
        else:
            values = [(np.nan, np.nan) if e == (4, 4) else e for e in values]
            expected["y"] = values
    if reduction_func == "size":
        # size, unlike other methods, has the desired behavior in GH#49519
        expected = expected.rename(columns={0: "size"})
        if as_index:
            expected = expected["size"].rename(None)

    if as_index or index_kind == "range" or reduction_func == "size":
        warn = None
    else:
        warn = FutureWarning
    msg = "A grouping .* was excluded from the result"
    with tm.assert_produces_warning(warn, match=msg):
        result = getattr(gb_keepna, reduction_func)(*args)

    # size will return a Series, others are DataFrame
    tm.assert_equal(result, expected)


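# Strategy for the transformer test below: run the transform with dropna=True,
# splice a hand-computed result into the rows of the null group, and compare
# that against the dropna=False output.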
def test_categorical_transformers(
    request, transformation_func, observed, sort, as_index
):
    # GH#36327
    if transformation_func == "fillna":
        msg = "GH#49651 fillna may incorrectly reorder results when dropna=False"
        request.applymarker(pytest.mark.xfail(reason=msg, strict=False))

    values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
    )
    args = get_groupby_method_args(transformation_func, df)

    # Compute result for null group
    null_group_values = df[df["x"].isnull()]["y"]
    if transformation_func == "cumcount":
        null_group_data = list(range(len(null_group_values)))
    elif transformation_func == "ngroup":
        if sort:
            if observed:
                na_group = df["x"].nunique(dropna=False) - 1
            else:
                # TODO: Should this be 3?
                na_group = df["x"].nunique(dropna=False) - 1
        else:
            na_group = df.iloc[: null_group_values.index[0]]["x"].nunique()
        null_group_data = len(null_group_values) * [na_group]
    else:
        null_group_data = getattr(null_group_values, transformation_func)(*args)
    null_group_result = pd.DataFrame({"y": null_group_data})

    gb_keepna = df.groupby(
        "x", dropna=False, observed=observed, sort=sort, as_index=as_index
    )
    gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort)

    msg = "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated"
    if transformation_func == "pct_change":
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = getattr(gb_keepna, "pct_change")(*args)
    else:
        result = getattr(gb_keepna, transformation_func)(*args)
    expected = getattr(gb_dropna, transformation_func)(*args)

    for iloc, value in zip(
        df[df["x"].isnull()].index.tolist(), null_group_result.values.ravel()
    ):
        if expected.ndim == 1:
            expected.iloc[iloc] = value
        else:
            expected.iloc[iloc, 0] = value
    if transformation_func == "ngroup":
        expected[df["x"].notnull() & expected.ge(na_group)] += 1
    if transformation_func not in ("rank", "diff", "pct_change", "shift"):
        expected = expected.astype("int64")

    tm.assert_equal(result, expected)


@pytest.mark.parametrize("method", ["head", "tail"])
def test_categorical_head_tail(method, observed, sort, as_index):
    # GH#36327
    values = np.random.default_rng(2).choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index)
    result = getattr(gb, method)()

    if method == "tail":
        values = values[::-1]
    # Take the top 5 values from each group
    mask = (
        ((values == 1) & ((values == 1).cumsum() <= 5))
        | ((values == 2) & ((values == 2).cumsum() <= 5))
        # flake8 doesn't like the vectorized check for None, thinks we should use `is`
        | ((values == None) & ((values == None).cumsum() <= 5))  # noqa: E711
    )
    if method == "tail":
        mask = mask[::-1]
    expected = df[mask]

    tm.assert_frame_equal(result, expected)


def test_categorical_agg():
    # GH#36327
    values = np.random.default_rng(2).choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=False)
    result = gb.agg(lambda x: x.sum())
    expected = gb.sum()
    tm.assert_frame_equal(result, expected)


def test_categorical_transform():
    # GH#36327
    values = np.random.default_rng(2).choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=False)
    result = gb.transform(lambda x: x.sum())
    expected = gb.transform("sum")
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,135 @@
from datetime import datetime

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
)


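# ``groupby_func`` below is not parametrized here; it is presumably a shared
# fixture that enumerates the groupby kernel names, so this one test exercises
# each kernel against both subclassed containers.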
@pytest.mark.parametrize(
    "obj",
    [
        tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
        tm.SubclassedSeries(np.arange(0, 10), name="A"),
    ],
)
def test_groupby_preserves_subclass(obj, groupby_func):
    # GH28330 -- preserve subclass through groupby operations

    if isinstance(obj, Series) and groupby_func in {"corrwith"}:
        pytest.skip(f"Not applicable for Series and {groupby_func}")

    grouped = obj.groupby(np.arange(0, 10))

    # Groups should preserve subclass type
    assert isinstance(grouped.get_group(0), type(obj))

    args = get_groupby_method_args(groupby_func, obj)

    warn = FutureWarning if groupby_func == "fillna" else None
    msg = f"{type(grouped).__name__}.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False):
        result1 = getattr(grouped, groupby_func)(*args)
    with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False):
        result2 = grouped.agg(groupby_func, *args)

    # Reduction or transformation kernels should preserve type
    slices = {"ngroup", "cumcount", "size"}
    if isinstance(obj, DataFrame) and groupby_func in slices:
        assert isinstance(result1, tm.SubclassedSeries)
    else:
        assert isinstance(result1, type(obj))

    # Confirm .agg() groupby operations return same results
    if isinstance(result1, DataFrame):
        tm.assert_frame_equal(result1, result2)
    else:
        tm.assert_series_equal(result1, result2)


def test_groupby_preserves_metadata():
    # GH-37343
    custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
    assert "testattr" in custom_df._metadata
    custom_df.testattr = "hello"
    for _, group_df in custom_df.groupby("c"):
        assert group_df.testattr == "hello"

    # GH-45314
    def func(group):
        assert isinstance(group, tm.SubclassedDataFrame)
        assert hasattr(group, "testattr")
        assert group.testattr == "hello"
        return group.testattr

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(
        DeprecationWarning,
        match=msg,
        raise_on_extra_warnings=False,
        check_stacklevel=False,
    ):
        result = custom_df.groupby("c").apply(func)
    expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c"))
    tm.assert_series_equal(result, expected)

    result = custom_df.groupby("c").apply(func, include_groups=False)
    tm.assert_series_equal(result, expected)

    # https://github.com/pandas-dev/pandas/pull/56761
    result = custom_df.groupby("c")[["a", "b"]].apply(func)
    tm.assert_series_equal(result, expected)

    def func2(group):
        assert isinstance(group, tm.SubclassedSeries)
        assert hasattr(group, "testattr")
        return group.testattr

    custom_series = tm.SubclassedSeries([1, 2, 3])
    custom_series.testattr = "hello"
    result = custom_series.groupby(custom_df["c"]).apply(func2)
    tm.assert_series_equal(result, expected)
    result = custom_series.groupby(custom_df["c"]).agg(func2)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame])
def test_groupby_resample_preserves_subclass(obj):
    # GH28330 -- preserve subclass through groupby.resample()

    df = obj(
        {
            "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": [
                datetime(2013, 9, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 3, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 9, 2, 14, 0),
            ],
        }
    )
    df = df.set_index("Date")

    # Confirm groupby.resample() preserves dataframe type
    msg = "DataFrameGroupBy.resample operated on the grouping columns"
    with tm.assert_produces_warning(
        DeprecationWarning,
        match=msg,
        raise_on_extra_warnings=False,
        check_stacklevel=False,
    ):
        result = df.groupby("Buyer").resample("5D").sum()
    assert isinstance(result, obj)
File diff suppressed because it is too large
@@ -0,0 +1,85 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm


@pytest.fixture(params=[["inner"], ["inner", "outer"]])
def frame(request):
    levels = request.param
    df = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    )
    if levels:
        df = df.set_index(levels)

    return df


@pytest.fixture()
def series():
    df = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    )
    s = df.set_index(["outer", "inner", "B"])["A"]

    return s


@pytest.mark.parametrize(
    "key_strs,groupers",
    [
        ("inner", pd.Grouper(level="inner")),  # Index name
        (["inner"], [pd.Grouper(level="inner")]),  # List of index name
        (["B", "inner"], ["B", pd.Grouper(level="inner")]),  # Column and index
        (["inner", "B"], [pd.Grouper(level="inner"), "B"]),  # Index and column
    ],
)
def test_grouper_index_level_as_string(frame, key_strs, groupers):
    if "B" not in key_strs or "outer" in frame.columns:
        result = frame.groupby(key_strs).mean(numeric_only=True)
        expected = frame.groupby(groupers).mean(numeric_only=True)
    else:
        result = frame.groupby(key_strs).mean()
        expected = frame.groupby(groupers).mean()
    tm.assert_frame_equal(result, expected)


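# Each string key below names an index level of the fixture Series, so
# grouping by the plain string should match grouping by the equivalent
# pd.Grouper(level=...) spelled out explicitly.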
@pytest.mark.parametrize(
    "levels",
    [
        "inner",
        "outer",
        "B",
        ["inner"],
        ["outer"],
        ["B"],
        ["inner", "outer"],
        ["outer", "inner"],
        ["inner", "outer", "B"],
        ["B", "outer", "inner"],
    ],
)
def test_grouper_index_level_as_string_series(series, levels):
    # Compute expected result
    if isinstance(levels, list):
        groupers = [pd.Grouper(level=lv) for lv in levels]
    else:
        groupers = pd.Grouper(level=levels)

    expected = series.groupby(groupers).mean()

    # Compute and check result
    result = series.groupby(levels).mean()
    tm.assert_series_equal(result, expected)
@@ -0,0 +1,333 @@
# Test GroupBy._positional_selector positional grouped indexing GH#42864

import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm


@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [2, [5]],
        [5, []],
        [-1, [3, 4, 7]],
        [-2, [1, 6]],
        [-6, []],
    ],
)
def test_int(slice_test_df, slice_test_grouped, arg, expected_rows):
    # Test single integer
    result = slice_test_grouped._positional_selector[arg]
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)


def test_slice(slice_test_df, slice_test_grouped):
    # Test single slice
    result = slice_test_grouped._positional_selector[0:3:2]
    expected = slice_test_df.iloc[[0, 1, 4, 5]]

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [[0, 2], [0, 1, 4, 5]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [range(0, 3, 2), [0, 1, 4, 5]],
        [{0, 2}, [0, 1, 4, 5]],
    ],
    ids=[
        "list",
        "negative",
        "range",
        "set",
    ],
)
def test_list(slice_test_df, slice_test_grouped, arg, expected_rows):
    # Test lists of integers and integer valued iterables
    result = slice_test_grouped._positional_selector[arg]
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)


def test_ints(slice_test_df, slice_test_grouped):
    # Test tuple of ints
    result = slice_test_grouped._positional_selector[0, 2, -1]
    expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]]

    tm.assert_frame_equal(result, expected)


def test_slices(slice_test_df, slice_test_grouped):
    # Test tuple of slices
    result = slice_test_grouped._positional_selector[:2, -2:]
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    tm.assert_frame_equal(result, expected)


def test_mix(slice_test_df, slice_test_grouped):
    # Test mixed tuple of ints and slices
    result = slice_test_grouped._positional_selector[0, 1, -2:]
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_as_index(slice_test_df, arg, expected_rows):
    # Test the default as_index behaviour
    result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg]
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)


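# Worked example for the documentation cases below: with groups a -> rows
# [0, 1, 2] and b -> rows [3, 4], the selector [1:2] keeps position 1 within
# each group, i.e. original rows 1 and 4.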
def test_doc_examples():
    # Test the examples in the documentation
    df = pd.DataFrame(
        [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"]
    )

    grouped = df.groupby("A", as_index=False)

    result = grouped._positional_selector[1:2]
    expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4])

    tm.assert_frame_equal(result, expected)

    result = grouped._positional_selector[1, -1]
    expected = pd.DataFrame(
        [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4]
    )

    tm.assert_frame_equal(result, expected)


@pytest.fixture()
def multiindex_data():
    rng = np.random.default_rng(2)
    ndates = 100
    nitems = 20
    dates = pd.date_range("20130101", periods=ndates, freq="D")
    items = [f"item {i}" for i in range(nitems)]

    data = {}
    for date in dates:
        nitems_for_date = nitems - rng.integers(0, 12)
        levels = [
            (item, rng.integers(0, 10000) / 100, rng.integers(0, 10000) / 100)
            for item in items[:nitems_for_date]
        ]
        levels.sort(key=lambda x: x[1])
        data[date] = levels

    return data


def _make_df_from_data(data):
    rows = {}
    for date in data:
        for level in data[date]:
            rows[(date, level[0])] = {"A": level[1], "B": level[2]}

    df = pd.DataFrame.from_dict(rows, orient="index")
    df.index.names = ("Date", "Item")
    return df


def test_multiindex(multiindex_data):
    # Test the multiindex mentioned as the use-case in the documentation
    df = _make_df_from_data(multiindex_data)
    result = df.groupby("Date", as_index=False).nth(slice(3, -3))

    sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data}
    expected = _make_df_from_data(sliced)

    tm.assert_frame_equal(result, expected)


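# Layout note for the comparison below: rows are interleaved so that row
# j * n_groups + i is the j-th row of group i; the "simulated" branch rebuilds
# head/tail from that formula instead of calling the groupby methods.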
@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("simulated", [True, False])
def test_against_head_and_tail(arg, method, simulated):
    # Test gives the same results as grouped head and tail
    n_groups = 100
    n_rows_per_group = 30

    data = {
        "group": [
            f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups)
        ],
        "value": [
            f"group {g} row {j}"
            for j in range(n_rows_per_group)
            for g in range(n_groups)
        ],
    }
    df = pd.DataFrame(data)
    grouped = df.groupby("group", as_index=False)
    size = arg if arg >= 0 else n_rows_per_group + arg

    if method == "head":
        result = grouped._positional_selector[:arg]

        if simulated:
            indices = [
                j * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if j * n_groups + i < n_groups * n_rows_per_group
            ]
            expected = df.iloc[indices]

        else:
            expected = grouped.head(arg)

    else:
        result = grouped._positional_selector[-arg:]

        if simulated:
            indices = [
                (n_rows_per_group + j - size) * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if (n_rows_per_group + j - size) * n_groups + i >= 0
            ]
            expected = df.iloc[indices]

        else:
            expected = grouped.tail(arg)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("step", [None, 1, 5])
def test_against_df_iloc(start, stop, step):
    # Test that a single group gives the same results as DataFrame.iloc
    n_rows = 30

    data = {
        "group": ["group 0"] * n_rows,
        "value": list(range(n_rows)),
    }
    df = pd.DataFrame(data)
    grouped = df.groupby("group", as_index=False)

    result = grouped._positional_selector[start:stop:step]
    expected = df.iloc[start:stop:step]

    tm.assert_frame_equal(result, expected)


def test_series():
    # Test grouped Series
    ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"])
    grouped = ser.groupby(level=0)
    result = grouped._positional_selector[1:2]
    expected = pd.Series([2, 5], index=["a", "b"])

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("step", [1, 2, 3, 4, 5])
def test_step(step):
    # Test slice with various step values
    data = [["x", f"x{i}"] for i in range(5)]
    data += [["y", f"y{i}"] for i in range(4)]
    data += [["z", f"z{i}"] for i in range(3)]
    df = pd.DataFrame(data, columns=["A", "B"])

    grouped = df.groupby("A", as_index=False)

    result = grouped._positional_selector[::step]

    data = [["x", f"x{i}"] for i in range(0, 5, step)]
    data += [["y", f"y{i}"] for i in range(0, 4, step)]
    data += [["z", f"z{i}"] for i in range(0, 3, step)]

    index = [0 + i for i in range(0, 5, step)]
    index += [5 + i for i in range(0, 4, step)]
    index += [9 + i for i in range(0, 3, step)]

    expected = pd.DataFrame(data, columns=["A", "B"], index=index)

    tm.assert_frame_equal(result, expected)


@pytest.fixture()
def column_group_df():
    return pd.DataFrame(
        [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]],
        columns=["A", "B", "C", "D", "E", "F", "G"],
    )


def test_column_axis(column_group_df):
    msg = "DataFrame.groupby with axis=1"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        g = column_group_df.groupby(column_group_df.iloc[1], axis=1)
    result = g._positional_selector[1:-1]
    expected = column_group_df.iloc[:, [1, 3]]

    tm.assert_frame_equal(result, expected)


def test_columns_on_iter():
    # GitHub issue #44821
    df = pd.DataFrame({k: range(10) for k in "ABC"})

    # Group-by and select columns
    cols = ["A", "B"]
    for _, dg in df.groupby(df.A < 4)[cols]:
        tm.assert_index_equal(dg.columns, pd.Index(cols))
        assert "C" not in dg.columns


@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array])
def test_groupby_duplicated_columns(func):
    # GH#44924
    df = pd.DataFrame(
        {
            "A": [1, 2],
            "B": [3, 3],
            "C": ["G", "G"],
        }
    )
    result = df.groupby("C")[func(["A", "B", "A"])].mean()
    expected = pd.DataFrame(
        [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C")
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_get_nonexisting_groups():
    # GH#32492
    df = pd.DataFrame(
        data={
            "A": ["a1", "a2", None],
            "B": ["b1", "b2", "b1"],
            "val": [1, 2, 3],
        }
    )
    grps = df.groupby(by=["A", "B"])

    msg = "('a2', 'b1')"
    with pytest.raises(KeyError, match=msg):
        grps.get_group(("a2", "b1"))
@@ -0,0 +1,331 @@
import numpy as np
import pytest

from pandas._libs import groupby as libgroupby
from pandas._libs.groupby import (
    group_cumprod,
    group_cumsum,
    group_mean,
    group_sum,
    group_var,
)

from pandas.core.dtypes.common import ensure_platform_int

from pandas import isna
import pandas._testing as tm


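# The mixin below drives the raw Cython kernel directly: ``algo(out, counts,
# values, labels)`` fills ``out`` and ``counts`` in place, with ``labels``
# mapping each input row to its group slot.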
class GroupVarTestMixin:
    def test_group_var_generic_1d(self):
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((5, 1))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.random((15, 1)).astype(self.dtype)
        labels = np.tile(np.arange(5), (3,)).astype("intp")

        expected_out = (
            np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
        )[:, np.newaxis]
        expected_counts = counts + 3

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_1d_flat_labels(self):
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((1, 1))).astype(self.dtype)
        counts = np.zeros(1, dtype="int64")
        values = 10 * prng.random((5, 1)).astype(self.dtype)
        labels = np.zeros(5, dtype="intp")

        expected_out = np.array([[values.std(ddof=1) ** 2]])
        expected_counts = counts + 5

        self.algo(out, counts, values, labels)

        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_all_finite(self):
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.random((10, 2)).astype(self.dtype)
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_some_nan(self):
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.random((10, 2)).astype(self.dtype)
        values[:, 1] = np.nan
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        expected_out = np.vstack(
            [
                values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
                np.nan * np.ones(5),
            ]
        ).T.astype(self.dtype)
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_constant(self):
        # Regression test from GH 10448.

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
        labels = np.zeros(3, dtype="intp")

        self.algo(out, counts, values, labels)

        assert counts[0] == 3
        assert out[0, 0] >= 0
        tm.assert_almost_equal(out[0, 0], 0.0)


class TestGroupVarFloat64(GroupVarTestMixin):
    __test__ = True

    algo = staticmethod(group_var)
    dtype = np.float64
    rtol = 1e-5

    def test_group_var_large_inputs(self):
        prng = np.random.default_rng(2)

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        values = (prng.random(10**6) + 10**12).astype(self.dtype)
        values.shape = (10**6, 1)
        labels = np.zeros(10**6, dtype="intp")

        self.algo(out, counts, values, labels)

        assert counts[0] == 10**6
        tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)


class TestGroupVarFloat32(GroupVarTestMixin):
    __test__ = True

    algo = staticmethod(group_var)
    dtype = np.float32
    rtol = 1e-2


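# For the OHLC test below, labels expands the bin edges [6, 12, 20] into run
# lengths np.diff([0, 6, 12, 20]) = [6, 6, 8], i.e. 6 rows in group 0, 6 in
# group 1 and 8 in group 2; each output row is (open, high, low, close).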
@pytest.mark.parametrize("dtype", ["float32", "float64"])
|
||||
def test_group_ohlc(dtype):
|
||||
obj = np.array(np.random.default_rng(2).standard_normal(20), dtype=dtype)
|
||||
|
||||
bins = np.array([6, 12, 20])
|
||||
out = np.zeros((3, 4), dtype)
|
||||
counts = np.zeros(len(out), dtype=np.int64)
|
||||
labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))
|
||||
|
||||
func = libgroupby.group_ohlc
|
||||
func(out, counts, obj[:, None], labels)
|
||||
|
||||
    def _ohlc(group):
        if isna(group).all():
            return np.repeat(np.nan, 4)
        return [group[0], group.max(), group.min(), group[-1]]

    expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])])

    tm.assert_almost_equal(out, expected)
    tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))

    obj[:6] = np.nan
    func(out, counts, obj[:, None], labels)
    expected[0] = np.nan
    tm.assert_almost_equal(out, expected)


def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
    """
    Check a group transform that executes a cumulative function.

    Parameters
    ----------
    pd_op : callable
        The pandas cumulative function.
    np_op : callable
        The analogous one in NumPy.
    dtype : type
        The specified dtype of the data.
    """
    is_datetimelike = False

    data = np.array([[1], [2], [3], [4]], dtype=dtype)
    answer = np.zeros_like(data)

    labels = np.array([0, 0, 0, 0], dtype=np.intp)
    ngroups = 1
    pd_op(answer, data, labels, ngroups, is_datetimelike)

    tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)


@pytest.mark.parametrize("np_dtype", ["int64", "uint64", "float32", "float64"])
def test_cython_group_transform_cumsum(np_dtype):
    # see gh-4095
    dtype = np.dtype(np_dtype).type
    pd_op, np_op = group_cumsum, np.cumsum
    _check_cython_group_transform_cumulative(pd_op, np_op, dtype)


def test_cython_group_transform_cumprod():
    # see gh-4095
    dtype = np.float64
    pd_op, np_op = group_cumprod, np.cumprod
    _check_cython_group_transform_cumulative(pd_op, np_op, dtype)


def test_cython_group_transform_algos():
    # see gh-4095
    is_datetimelike = False

    # with nans
    labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
    ngroups = 1

    data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
    actual = np.zeros_like(data)
    actual.fill(np.nan)
    group_cumprod(actual, data, labels, ngroups, is_datetimelike)
    expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
    tm.assert_numpy_array_equal(actual[:, 0], expected)

    actual = np.zeros_like(data)
    actual.fill(np.nan)
    group_cumsum(actual, data, labels, ngroups, is_datetimelike)
    expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
    tm.assert_numpy_array_equal(actual[:, 0], expected)

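    # Datetime-like values go through the kernel as their int64 representation
    # and are reinterpreted as m8[ns] afterwards, so cumsum on timedeltas is
    # exercised on the raw integer view.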
    # timedelta
    is_datetimelike = True
    data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
    actual = np.zeros_like(data, dtype="int64")
    group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
    expected = np.array(
        [
            np.timedelta64(1, "ns"),
            np.timedelta64(2, "ns"),
            np.timedelta64(3, "ns"),
            np.timedelta64(4, "ns"),
            np.timedelta64(5, "ns"),
        ]
    )
    tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)


def test_cython_group_mean_datetimelike():
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    data = (
        np.array(
            [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
            dtype="m8[ns]",
        )[:, None]
        .view("int64")
        .astype("float64")
    )
    labels = np.zeros(len(data), dtype=np.intp)

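    # With is_datetimelike=True the NaT sentinel is skipped, so the expected
    # mean is (2 + 4) / 2 = 3 nanoseconds.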
    group_mean(actual, counts, data, labels, is_datetimelike=True)

    tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))


def test_cython_group_mean_wrong_min_count():
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.zeros(1, dtype="int64")
    data = np.zeros(1, dtype="float64")[:, None]
    labels = np.zeros(1, dtype=np.intp)

    with pytest.raises(AssertionError, match="min_count"):
        group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)


def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    data = (
        np.array(
            [np.timedelta64("NaT"), np.timedelta64("NaT")],
            dtype="m8[ns]",
        )[:, None]
        .view("int64")
        .astype("float64")
    )
    labels = np.zeros(len(data), dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=False)

    tm.assert_numpy_array_equal(
        actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
    )


def test_cython_group_mean_Inf_at_beginning_and_end():
    # GH 50367
    actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64")
    counts = np.array([0, 0], dtype="int64")
    data = np.array(
        [[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]],
        dtype="float64",
    )
    labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=False)

    expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64")

    tm.assert_numpy_array_equal(
        actual,
        expected,
    )


@pytest.mark.parametrize(
    "values, out",
    [
        ([[np.inf], [np.inf], [np.inf]], [[np.inf], [np.inf]]),
        ([[np.inf], [np.inf], [-np.inf]], [[np.inf], [np.nan]]),
        ([[np.inf], [-np.inf], [np.inf]], [[np.inf], [np.nan]]),
        ([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]),
    ],
)
def test_cython_group_sum_Inf_at_beginning_and_end(values, out):
    # GH #53606
    actual = np.array([[np.nan], [np.nan]], dtype="float64")
    counts = np.array([0, 0], dtype="int64")
    data = np.array(values, dtype="float64")
    labels = np.array([0, 1, 1], dtype=np.intp)

    group_sum(actual, counts, data, labels, None, is_datetimelike=False)

    expected = np.array(out, dtype="float64")

    tm.assert_numpy_array_equal(
        actual,
        expected,
    )
@@ -0,0 +1,163 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    date_range,
)
import pandas._testing as tm


@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_column_index_name_lost_fill_funcs(func):
    # GH: 29764 groupby loses the column index name sometimes
    df = DataFrame(
        [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
        columns=Index(["type", "a", "b"], name="idx"),
    )
    df_grouped = df.groupby(["type"])[["a", "b"]]
    result = getattr(df_grouped, func)().columns
    expected = Index(["a", "b"], name="idx")
    tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_fill_duplicate_column_names(func):
    # GH: 25610 ValueError with duplicate column names
    df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
    df2 = DataFrame({"field1": [1, np.nan, 4]})
    df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
    expected = DataFrame(
        [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
    )
    result = getattr(df_grouped, func)()
    tm.assert_frame_equal(result, expected)


def test_ffill_missing_arguments():
    # GH 14955
    df = DataFrame({"a": [1, 2], "b": [1, 1]})
    msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        with pytest.raises(ValueError, match="Must specify a fill"):
            df.groupby("b").fillna()


@pytest.mark.parametrize(
    "method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])]
)
def test_fillna_with_string_dtype(method, expected):
    # GH 40250
    df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]})
    grp = df.groupby("b")
    msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = grp.fillna(method=method)
    expected = DataFrame({"a": pd.array(expected, dtype="string")})
    tm.assert_frame_equal(result, expected)


def test_fill_consistency():
    # GH9221
    # pass-thru keyword arguments to the generated wrapper
    # are set if the passed kw is None (only)
    df = DataFrame(
        index=pd.MultiIndex.from_product(
            [["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
        ),
        columns=Index(["1", "2"], name="id"),
    )
    df["1"] = [
        np.nan,
        1,
        np.nan,
        np.nan,
        11,
        np.nan,
        np.nan,
        2,
        np.nan,
        np.nan,
        22,
        np.nan,
    ]
    df["2"] = [
        np.nan,
        3,
        np.nan,
        np.nan,
        33,
        np.nan,
        np.nan,
        4,
        np.nan,
        np.nan,
        44,
        np.nan,
    ]

    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        expected = df.groupby(level=0, axis=0).fillna(method="ffill")

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["ffill", "bfill"])
@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("has_nan_group", [True, False])
def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
    # GH 34725

    df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)])

    ridx = [-1, 0, -1, -1, 1, -1]
    df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)

    group_b = np.nan if has_nan_group else "b"
    df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)

    grouped = df.groupby(by="group_col", dropna=dropna)
    result = getattr(grouped, method)(limit=None)

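    # Map (method, dropna, has_nan_group) to the row indices of the expected
    # frame; reindexing on the missing label -1 produces the all-NaN rows.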
    expected_rows = {
        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
    }

    ridx = expected_rows.get((method, dropna, has_nan_group))
    expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
    # columns are a 'take' on df.columns, which are object dtype
    expected.columns = expected.columns.astype(object)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
def test_min_count(func, min_count, value):
    # GH#37821
    df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
    result = getattr(df.groupby("a"), func)(min_count=min_count)
    expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
    tm.assert_frame_equal(result, expected)


def test_indices_with_missing():
    # GH 9304
    df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
    g = df.groupby(["a", "b"])
    result = g.indices
    expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
    assert result == expected
@@ -0,0 +1,80 @@
import pytest

from pandas import (
    DataFrame,
    Series,
    option_context,
)
import pandas._testing as tm

pytestmark = pytest.mark.single_cpu

pytest.importorskip("numba")

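# These tests compare the numba engine against the default Cython engine; for
# every supported reduction the two are expected to produce the same result.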
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
class TestEngine:
    def test_cython_vs_numba_frame(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        gb = df.groupby("a", sort=sort)
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        tm.assert_frame_equal(result, expected)

    def test_cython_vs_numba_getitem(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        gb = df.groupby("a", sort=sort)["c"]
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        tm.assert_series_equal(result, expected)

    def test_cython_vs_numba_series(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        func, kwargs = numba_supported_reductions
        ser = Series(range(3), index=[1, 2, 1], name="foo")
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        gb = ser.groupby(level=0, sort=sort)
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        tm.assert_series_equal(result, expected)

    def test_as_index_false_unsupported(self, numba_supported_reductions):
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        gb = df.groupby("a", as_index=False)
        with pytest.raises(NotImplementedError, match="as_index=False"):
            getattr(gb, func)(engine="numba", **kwargs)

    def test_axis_1_unsupported(self, numba_supported_reductions):
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        gb = df.groupby("a", axis=1)
        with pytest.raises(NotImplementedError, match="axis=1"):
            getattr(gb, func)(engine="numba", **kwargs)

    def test_no_engine_doesnt_raise(self):
        # GH55520
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        gb = df.groupby("a")
        # Make sure functions without an engine argument don't raise
        # when the global use_numba option is set
        with option_context("compute.use_numba", True):
            res = gb.agg({"b": "first"})
        expected = gb.agg({"b": "first"})
        tm.assert_frame_equal(res, expected)
@@ -0,0 +1,521 @@
import re

import numpy as np
import pytest

from pandas._libs import lib

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


class TestNumericOnly:
    # make sure that we are passing thru kwargs to our agg functions

    @pytest.fixture
    def df(self):
        # GH3668
        # GH5724
        df = DataFrame(
            {
                "group": [1, 1, 2],
                "int": [1, 2, 3],
                "float": [4.0, 5.0, 6.0],
                "string": list("abc"),
                "category_string": Series(list("abc")).astype("category"),
                "category_int": [7, 8, 9],
                "datetime": date_range("20130101", periods=3),
                "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
                "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
            },
            columns=[
                "group",
                "int",
                "float",
                "string",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ],
        )
        return df

    @pytest.mark.parametrize("method", ["mean", "median"])
    def test_averages(self, df, method):
        # mean / median
        expected_columns_numeric = Index(["int", "float", "category_int"])

        gb = df.groupby("group")
        expected = DataFrame(
            {
                "category_int": [7.5, 9],
                "float": [4.5, 6.0],
                "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
                "int": [1.5, 3],
                "datetime": [
                    Timestamp("2013-01-01 12:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                "datetimetz": [
                    Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
                    Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
                ],
            },
            index=Index([1, 2], name="group"),
            columns=[
                "int",
                "float",
                "category_int",
            ],
        )

        result = getattr(gb, method)(numeric_only=True)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        expected_columns = expected.columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_extrema(self, df, method):
        # TODO: min, max *should* handle
        # categorical (ordered) dtype

        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["first", "last"])
    def test_first_last(self, df, method):
        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["sum", "cumsum"])
    def test_sum_cumsum(self, df, method):
        expected_columns_numeric = Index(["int", "float", "category_int"])
        expected_columns = Index(
            ["int", "float", "string", "category_int", "timedelta"]
        )
        if method == "cumsum":
            # cumsum loses string
            expected_columns = Index(["int", "float", "category_int", "timedelta"])

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["prod", "cumprod"])
    def test_prod_cumprod(self, df, method):
        expected_columns = Index(["int", "float", "category_int"])
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["cummin", "cummax"])
    def test_cummin_cummax(self, df, method):
        # like min, max, but don't include strings
        expected_columns = Index(
            ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
        )

        # GH#15561: numeric_only=False set by default like min/max
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    def _check(self, df, method, expected_columns, expected_columns_numeric):
        gb = df.groupby("group")

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        exception = NotImplementedError if method.startswith("cum") else TypeError

        if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
            # The methods default to numeric_only=False and raise TypeError
            msg = "|".join(
                [
                    "Categorical is not ordered",
                    f"Cannot perform {method} with non-ordered Categorical",
                    re.escape(f"agg function failed [how->{method},dtype->object]"),
                    # cumsum/cummin/cummax/cumprod
                    "function is not implemented for this dtype",
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)()
        elif method in ("sum", "mean", "median", "prod"):
            msg = "|".join(
                [
                    "category type does not support sum operations",
                    re.escape(f"agg function failed [how->{method},dtype->object]"),
                    re.escape(f"agg function failed [how->{method},dtype->string]"),
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)()
        else:
            result = getattr(gb, method)()
            tm.assert_index_equal(result.columns, expected_columns_numeric)

        if method not in ("first", "last"):
            msg = "|".join(
                [
                    "Categorical is not ordered",
                    "category type does not support",
                    "function is not implemented for this dtype",
                    f"Cannot perform {method} with non-ordered Categorical",
                    re.escape(f"agg function failed [how->{method},dtype->object]"),
                    re.escape(f"agg function failed [how->{method},dtype->string]"),
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)(numeric_only=False)
        else:
            result = getattr(gb, method)(numeric_only=False)
            tm.assert_index_equal(result.columns, expected_columns)


@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string):
    if groupby_func in ("idxmax", "idxmin"):
        pytest.skip("idxmax and idxmin tested in test_idxmin_idxmax_axis1")
    if groupby_func in ("corrwith", "skew"):
        msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
        request.applymarker(pytest.mark.xfail(reason=msg))

    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"]
    )
    df["E"] = "x"
    groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
    gb = df.groupby(groups)
    method = getattr(gb, groupby_func)
    args = get_groupby_method_args(groupby_func, df)
    kwargs = {"axis": 1}
    if numeric_only is not None:
        # when numeric_only is None we don't pass any argument
        kwargs["numeric_only"] = numeric_only

    # Functions without numeric_only and axis args
    no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift")
    # Functions with axis args
    has_axis = (
        "cumprod",
        "cumsum",
        "diff",
        "pct_change",
        "rank",
        "shift",
        "cummax",
        "cummin",
        "idxmin",
        "idxmax",
        "fillna",
    )
    warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated"
    if numeric_only is not None and groupby_func in no_args:
        msg = "got an unexpected keyword argument 'numeric_only'"
        if groupby_func in ["cumprod", "cumsum"]:
            with pytest.raises(TypeError, match=msg):
                with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                    method(*args, **kwargs)
        else:
            with pytest.raises(TypeError, match=msg):
                method(*args, **kwargs)
    elif groupby_func not in has_axis:
        msg = "got an unexpected keyword argument 'axis'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
    # fillna and shift are successful even on object dtypes
    elif (numeric_only is None or not numeric_only) and groupby_func not in (
        "fillna",
        "shift",
    ):
        msgs = (
            # cummax, cummin, rank
            "not supported between instances of",
            # cumprod
            "can't multiply sequence by non-int of type 'float'",
            # cumsum, diff, pct_change
            "unsupported operand type",
            "has no kernel",
        )
        if using_infer_string:
            import pyarrow as pa

            errs = (TypeError, pa.lib.ArrowNotImplementedError)
        else:
            errs = TypeError
        with pytest.raises(errs, match=f"({'|'.join(msgs)})"):
            with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                method(*args, **kwargs)
    else:
        with tm.assert_produces_warning(FutureWarning, match=warn_msg):
            result = method(*args, **kwargs)

        df_expected = df.drop(columns="E").T if numeric_only else df.T
        expected = getattr(df_expected, groupby_func)(*args).T
        if groupby_func == "shift" and not numeric_only:
            # shift with axis=1 leaves the leftmost column as numeric
            # but transposing for expected gives us object dtype
            expected = expected.astype(float)

        tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "kernel, has_arg",
    [
        ("all", False),
        ("any", False),
        ("bfill", False),
        ("corr", True),
        ("corrwith", True),
        ("cov", True),
        ("cummax", True),
        ("cummin", True),
        ("cumprod", True),
        ("cumsum", True),
        ("diff", False),
        ("ffill", False),
        ("fillna", False),
        ("first", True),
        ("idxmax", True),
        ("idxmin", True),
        ("last", True),
        ("max", True),
        ("mean", True),
        ("median", True),
        ("min", True),
        ("nth", False),
        ("nunique", False),
        ("pct_change", False),
        ("prod", True),
        ("quantile", True),
        ("sem", True),
        ("skew", True),
        ("std", True),
        ("sum", True),
        ("var", True),
    ],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
    # GH#46072
    # has_arg: Whether the op has a numeric_only arg
    df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]})

    args = get_groupby_method_args(kernel, df)
    kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only}

    gb = df.groupby(keys)
    method = getattr(gb, kernel)
    if has_arg and numeric_only is True:
        # Cases where b does not appear in the result
        result = method(*args, **kwargs)
        assert "b" not in result.columns
    elif (
        # kernels that work on any dtype and have numeric_only arg
        kernel in ("first", "last")
        or (
            # kernels that work on any dtype and don't have numeric_only arg
            kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique")
            and numeric_only is lib.no_default
        )
    ):
        warn = FutureWarning if kernel == "fillna" else None
        msg = "DataFrameGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=msg):
            result = method(*args, **kwargs)
        assert "b" in result.columns
    elif has_arg:
        assert numeric_only is not True
        # kernels that are successful on any dtype were above; this will fail

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        exception = NotImplementedError if kernel.startswith("cum") else TypeError

        msg = "|".join(
            [
                "not allowed for this dtype",
                "cannot be performed against 'object' dtypes",
                # On PY39 message is "a number"; on PY310 and after is "a real number"
                "must be a string or a.* number",
                "unsupported operand type",
                "function is not implemented for this dtype",
                re.escape(f"agg function failed [how->{kernel},dtype->object]"),
            ]
        )
        if kernel == "idxmin":
            msg = "'<' not supported between instances of 'type' and 'type'"
        elif kernel == "idxmax":
            msg = "'>' not supported between instances of 'type' and 'type'"
        with pytest.raises(exception, match=msg):
            method(*args, **kwargs)
    elif not has_arg and numeric_only is not lib.no_default:
        with pytest.raises(
            TypeError, match="got an unexpected keyword argument 'numeric_only'"
        ):
            method(*args, **kwargs)
    else:
        assert kernel in ("diff", "pct_change")
        assert numeric_only is lib.no_default
        # Doesn't have numeric_only argument and fails on nuisance columns
        with pytest.raises(TypeError, match=r"unsupported operand type"):
            method(*args, **kwargs)


@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
    # GH#46560
    grouper = [0, 0, 1]

    ser = Series([1, 0, 0], dtype=dtype)
    gb = ser.groupby(grouper)

    if groupby_func == "corrwith":
        # corrwith is not implemented on SeriesGroupBy
        assert not hasattr(gb, groupby_func)
        return

    method = getattr(gb, groupby_func)

    expected_ser = Series([1, 0, 0])
    expected_gb = expected_ser.groupby(grouper)
    expected_method = getattr(expected_gb, groupby_func)

    args = get_groupby_method_args(groupby_func, ser)

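    # ops that raise on object dtype even when the values are numeric-coded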
    fails_on_numeric_object = (
        "corr",
        "cov",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "quantile",
    )
    # ops that give an object result on object input
    obj_result = (
        "first",
        "last",
        "nth",
        "bfill",
        "ffill",
        "shift",
        "sum",
        "diff",
        "pct_change",
        "var",
        "mean",
        "median",
        "min",
        "max",
        "prod",
        "skew",
    )

    # Test default behavior; kernels that fail may be enabled in the future but kernels
    # that succeed should not be allowed to fail (without deprecation, at least)
    if groupby_func in fails_on_numeric_object and dtype is object:
        if groupby_func == "quantile":
            msg = "cannot be performed against 'object' dtypes"
        else:
            msg = "is not supported for object dtype"
        warn = FutureWarning if groupby_func == "fillna" else None
        warn_msg = "DataFrameGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=warn_msg):
            with pytest.raises(TypeError, match=msg):
                method(*args)
    elif dtype is object:
        warn = FutureWarning if groupby_func == "fillna" else None
        warn_msg = "SeriesGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=warn_msg):
            result = method(*args)
        with tm.assert_produces_warning(warn, match=warn_msg):
            expected = expected_method(*args)
        if groupby_func in obj_result:
            expected = expected.astype(object)
        tm.assert_series_equal(result, expected)

    has_numeric_only = (
        "first",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "sum",
        "var",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
    )
    if groupby_func not in has_numeric_only:
        msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype is object:
        msg = "|".join(
            [
                "SeriesGroupBy.sem called with numeric_only=True and dtype object",
                "Series.skew does not allow numeric_only=True with non-numeric",
                "cum(sum|prod|min|max) is not supported for object dtype",
                r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype == bool and groupby_func == "quantile":
        msg = "Allowing bool dtype in SeriesGroupBy.quantile"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            # GH#51424
            result = method(*args, numeric_only=True)
            expected = method(*args, numeric_only=False)
            tm.assert_series_equal(result, expected)
    else:
        result = method(*args, numeric_only=True)
        expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
@@ -0,0 +1,80 @@
import numpy as np

import pandas as pd
from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm


def test_pipe():
    # Test the pipe method of DataFrameGroupBy.
    # Issue #17871

    random_state = np.random.default_rng(2)

    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": random_state.standard_normal(8),
            "C": random_state.standard_normal(8),
        }
    )

    def f(dfgb):
        return dfgb.B.max() - dfgb.C.min().min()

    def square(srs):
        return srs**2

    # Note that the transformations are
    # GroupBy -> Series
    # Series -> Series
    # This then chains the GroupBy.pipe and the
    # NDFrame.pipe methods
    result = df.groupby("A").pipe(f).pipe(square)

    index = Index(["bar", "foo"], dtype="object", name="A")
    expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)

    tm.assert_series_equal(expected, result)


def test_pipe_args():
    # Test passing args to the pipe method of DataFrameGroupBy.
    # Issue #17871

    df = DataFrame(
        {
            "group": ["A", "A", "B", "B", "C"],
            "x": [1.0, 2.0, 3.0, 2.0, 5.0],
            "y": [10.0, 100.0, 1000.0, -100.0, -1000.0],
        }
    )

    def f(dfgb, arg1):
        filtered = dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
        return filtered.groupby("group")

    def g(dfgb, arg2):
        return dfgb.sum() / dfgb.sum().sum() + arg2

    def h(df, arg3):
        return df.x + df.y - arg3

    result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100)

    # Assert the results here
    index = Index(["A", "B"], name="group")
    expected = pd.Series([-79.5160891089, -78.4839108911], index=index)

    tm.assert_series_equal(result, expected)

    # test SeriesGroupby.pipe
    ser = pd.Series([1, 1, 2, 2, 3, 3])
    result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())

    expected = pd.Series([4, 8, 12], index=Index([1, 2, 3], dtype=np.int64))

    tm.assert_series_equal(result, expected)
@@ -0,0 +1,716 @@
# Only tests that raise an error and have no better location should go here.
# Tests for specific groupby methods should go in their respective
# test file.

import datetime
import re

import numpy as np
import pytest

from pandas import (
    Categorical,
    DataFrame,
    Grouper,
    Series,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


@pytest.fixture(
    params=[
        "a",
        ["a"],
        ["a", "b"],
        Grouper(key="a"),
        lambda x: x % 2,
        [0, 0, 0, 1, 2, 2, 2, 3, 3],
        np.array([0, 0, 0, 1, 2, 2, 2, 3, 3]),
        dict(zip(range(9), [0, 0, 0, 1, 2, 2, 2, 3, 3])),
        Series([1, 1, 1, 1, 1, 2, 2, 2, 2]),
        [Series([1, 1, 1, 1, 1, 2, 2, 2, 2]), Series([3, 3, 4, 4, 4, 4, 4, 3, 3])],
    ]
)
def by(request):
    return request.param


@pytest.fixture(params=[True, False])
def groupby_series(request):
    return request.param


@pytest.fixture
def df_with_string_col():
    df = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": list("xyzwtyuio"),
        }
    )
    return df


@pytest.fixture
def df_with_datetime_col():
    df = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
        }
    )
    return df


@pytest.fixture
def df_with_timedelta_col():
    df = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": datetime.timedelta(days=1),
        }
    )
    return df


@pytest.fixture
def df_with_cat_col():
    df = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": Categorical(
                ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
                categories=["a", "b", "c", "d"],
                ordered=True,
            ),
        }
    )
    return df

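# Shared helper: klass is the expected exception (None means the call should
# succeed) and msg is the regex the error message must match; a non-empty
# warn_msg additionally asserts that a FutureWarning is emitted.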
def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""):
    warn_klass = None if warn_msg == "" else FutureWarning
    with tm.assert_produces_warning(warn_klass, match=warn_msg):
        if klass is None:
            if how == "method":
                getattr(gb, groupby_func)(*args)
            elif how == "agg":
                gb.agg(groupby_func, *args)
            else:
                gb.transform(groupby_func, *args)
        else:
            with pytest.raises(klass, match=msg):
                if how == "method":
                    getattr(gb, groupby_func)(*args)
                elif how == "agg":
                    gb.agg(groupby_func, *args)
                else:
                    gb.transform(groupby_func, *args)


@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_string(
    how, by, groupby_series, groupby_func, df_with_string_col
):
    df = df_with_string_col
    args = get_groupby_method_args(groupby_func, df)
    gb = df.groupby(by=by)

    if groupby_series:
        gb = gb["d"]

        if groupby_func == "corrwith":
            assert not hasattr(gb, "corrwith")
            return

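    # Map each groupby op to the exception class and message regex expected on
    # string data; (None, "") marks ops that should succeed.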
klass, msg = {
|
||||
"all": (None, ""),
|
||||
"any": (None, ""),
|
||||
"bfill": (None, ""),
|
||||
"corrwith": (TypeError, "Could not convert"),
|
||||
"count": (None, ""),
|
||||
"cumcount": (None, ""),
|
||||
"cummax": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(function|cummax) is not (implemented|supported) for (this|object) dtype",
|
||||
),
|
||||
"cummin": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(function|cummin) is not (implemented|supported) for (this|object) dtype",
|
||||
),
|
||||
"cumprod": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(function|cumprod) is not (implemented|supported) for (this|object) dtype",
|
||||
),
|
||||
"cumsum": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(function|cumsum) is not (implemented|supported) for (this|object) dtype",
|
||||
),
|
||||
"diff": (TypeError, "unsupported operand type"),
|
||||
"ffill": (None, ""),
|
||||
"fillna": (None, ""),
|
||||
"first": (None, ""),
|
||||
"idxmax": (None, ""),
|
||||
"idxmin": (None, ""),
|
||||
"last": (None, ""),
|
||||
"max": (None, ""),
|
||||
"mean": (
|
||||
TypeError,
|
||||
re.escape("agg function failed [how->mean,dtype->object]"),
|
||||
),
|
||||
"median": (
|
||||
TypeError,
|
||||
re.escape("agg function failed [how->median,dtype->object]"),
|
||||
),
|
||||
"min": (None, ""),
|
||||
"ngroup": (None, ""),
|
||||
"nunique": (None, ""),
|
||||
"pct_change": (TypeError, "unsupported operand type"),
|
||||
"prod": (
|
||||
TypeError,
|
||||
re.escape("agg function failed [how->prod,dtype->object]"),
|
||||
),
|
||||
"quantile": (TypeError, "cannot be performed against 'object' dtypes!"),
|
||||
"rank": (None, ""),
|
||||
"sem": (ValueError, "could not convert string to float"),
|
||||
"shift": (None, ""),
|
||||
"size": (None, ""),
|
||||
"skew": (ValueError, "could not convert string to float"),
|
||||
"std": (ValueError, "could not convert string to float"),
|
||||
"sum": (None, ""),
|
||||
"var": (
|
||||
TypeError,
|
||||
re.escape("agg function failed [how->var,dtype->"),
|
||||
),
|
||||
}[groupby_func]
|
||||
|
||||
if groupby_func == "fillna":
|
||||
kind = "Series" if groupby_series else "DataFrame"
|
||||
warn_msg = f"{kind}GroupBy.fillna is deprecated"
|
||||
else:
|
||||
warn_msg = ""
|
||||
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
def test_groupby_raises_string_udf(how, by, groupby_series, df_with_string_col):
|
||||
df = df_with_string_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
def func(x):
|
||||
raise TypeError("Test error message")
|
||||
|
||||
with pytest.raises(TypeError, match="Test error message"):
|
||||
getattr(gb, how)(func)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
|
||||
def test_groupby_raises_string_np(
|
||||
how, by, groupby_series, groupby_func_np, df_with_string_col
|
||||
):
|
||||
# GH#50749
|
||||
df = df_with_string_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
klass, msg = {
|
||||
np.sum: (None, ""),
|
||||
np.mean: (
|
||||
TypeError,
|
||||
re.escape("agg function failed [how->mean,dtype->object]"),
|
||||
),
|
||||
}[groupby_func_np]
|
||||
|
||||
if groupby_series:
|
||||
warn_msg = "using SeriesGroupBy.[sum|mean]"
|
||||
else:
|
||||
warn_msg = "using DataFrameGroupBy.[sum|mean]"
|
||||
_call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
|
||||
def test_groupby_raises_datetime(
|
||||
how, by, groupby_series, groupby_func, df_with_datetime_col
|
||||
):
|
||||
df = df_with_datetime_col
|
||||
args = get_groupby_method_args(groupby_func, df)
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
if groupby_func == "corrwith":
|
||||
assert not hasattr(gb, "corrwith")
|
||||
return
|
||||
|
||||
klass, msg = {
|
||||
"all": (None, ""),
|
||||
"any": (None, ""),
|
||||
"bfill": (None, ""),
|
||||
"corrwith": (TypeError, "cannot perform __mul__ with this index type"),
|
||||
"count": (None, ""),
|
||||
"cumcount": (None, ""),
|
||||
"cummax": (None, ""),
|
||||
"cummin": (None, ""),
|
||||
"cumprod": (TypeError, "datetime64 type does not support cumprod operations"),
|
||||
"cumsum": (TypeError, "datetime64 type does not support cumsum operations"),
|
||||
"diff": (None, ""),
|
||||
"ffill": (None, ""),
|
||||
"fillna": (None, ""),
|
||||
"first": (None, ""),
|
||||
"idxmax": (None, ""),
|
||||
"idxmin": (None, ""),
|
||||
"last": (None, ""),
|
||||
"max": (None, ""),
|
||||
"mean": (None, ""),
|
||||
"median": (None, ""),
|
||||
"min": (None, ""),
|
||||
"ngroup": (None, ""),
|
||||
"nunique": (None, ""),
|
||||
"pct_change": (TypeError, "cannot perform __truediv__ with this index type"),
|
||||
"prod": (TypeError, "datetime64 type does not support prod"),
|
||||
"quantile": (None, ""),
|
||||
"rank": (None, ""),
|
||||
"sem": (None, ""),
|
||||
"shift": (None, ""),
|
||||
"size": (None, ""),
|
||||
"skew": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
r"dtype datetime64\[ns\] does not support reduction",
|
||||
"datetime64 type does not support skew operations",
|
||||
]
|
||||
),
|
||||
),
|
||||
"std": (None, ""),
|
||||
"sum": (TypeError, "datetime64 type does not support sum operations"),
|
||||
"var": (TypeError, "datetime64 type does not support var operations"),
|
||||
}[groupby_func]
|
||||
|
||||
if groupby_func in ["any", "all"]:
|
||||
warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated"
|
||||
elif groupby_func == "fillna":
|
||||
kind = "Series" if groupby_series else "DataFrame"
|
||||
warn_msg = f"{kind}GroupBy.fillna is deprecated"
|
||||
else:
|
||||
warn_msg = ""
|
||||
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
def test_groupby_raises_datetime_udf(how, by, groupby_series, df_with_datetime_col):
|
||||
df = df_with_datetime_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
def func(x):
|
||||
raise TypeError("Test error message")
|
||||
|
||||
with pytest.raises(TypeError, match="Test error message"):
|
||||
getattr(gb, how)(func)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
|
||||
def test_groupby_raises_datetime_np(
|
||||
how, by, groupby_series, groupby_func_np, df_with_datetime_col
|
||||
):
|
||||
# GH#50749
|
||||
df = df_with_datetime_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
klass, msg = {
|
||||
np.sum: (TypeError, "datetime64 type does not support sum operations"),
|
||||
np.mean: (None, ""),
|
||||
}[groupby_func_np]
|
||||
|
||||
if groupby_series:
|
||||
warn_msg = "using SeriesGroupBy.[sum|mean]"
|
||||
else:
|
||||
warn_msg = "using DataFrameGroupBy.[sum|mean]"
|
||||
_call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"])
|
||||
def test_groupby_raises_timedelta(func, df_with_timedelta_col):
|
||||
df = df_with_timedelta_col
|
||||
gb = df.groupby(by="a")
|
||||
|
||||
_call_and_check(
|
||||
TypeError,
|
||||
"timedelta64 type does not support .* operations",
|
||||
"method",
|
||||
gb,
|
||||
func,
|
||||
[],
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
|
||||
def test_groupby_raises_category(
|
||||
how, by, groupby_series, groupby_func, using_copy_on_write, df_with_cat_col
|
||||
):
|
||||
# GH#50749
|
||||
df = df_with_cat_col
|
||||
args = get_groupby_method_args(groupby_func, df)
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
if groupby_func == "corrwith":
|
||||
assert not hasattr(gb, "corrwith")
|
||||
return
|
||||
|
||||
klass, msg = {
|
||||
"all": (None, ""),
|
||||
"any": (None, ""),
|
||||
"bfill": (None, ""),
|
||||
"corrwith": (
|
||||
TypeError,
|
||||
r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
|
||||
),
|
||||
"count": (None, ""),
|
||||
"cumcount": (None, ""),
|
||||
"cummax": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(category type does not support cummax operations|"
|
||||
"category dtype not supported|"
|
||||
"cummax is not supported for category dtype)",
|
||||
),
|
||||
"cummin": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(category type does not support cummin operations|"
|
||||
"category dtype not supported|"
|
||||
"cummin is not supported for category dtype)",
|
||||
),
|
||||
"cumprod": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(category type does not support cumprod operations|"
|
||||
"category dtype not supported|"
|
||||
"cumprod is not supported for category dtype)",
|
||||
),
|
||||
"cumsum": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(category type does not support cumsum operations|"
|
||||
"category dtype not supported|"
|
||||
"cumsum is not supported for category dtype)",
|
||||
),
|
||||
"diff": (
|
||||
TypeError,
|
||||
r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'",
|
||||
),
|
||||
"ffill": (None, ""),
|
||||
"fillna": (
|
||||
TypeError,
|
||||
r"Cannot setitem on a Categorical with a new category \(0\), "
|
||||
"set the categories first",
|
||||
)
|
||||
if not using_copy_on_write
|
||||
else (None, ""), # no-op with CoW
|
||||
"first": (None, ""),
|
||||
"idxmax": (None, ""),
|
||||
"idxmin": (None, ""),
|
||||
"last": (None, ""),
|
||||
"max": (None, ""),
|
||||
"mean": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'mean'",
|
||||
"category dtype does not support aggregation 'mean'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"median": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'median'",
|
||||
"category dtype does not support aggregation 'median'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"min": (None, ""),
|
||||
"ngroup": (None, ""),
|
||||
"nunique": (None, ""),
|
||||
"pct_change": (
|
||||
TypeError,
|
||||
r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'",
|
||||
),
|
||||
"prod": (TypeError, "category type does not support prod operations"),
|
||||
"quantile": (TypeError, "No matching signature found"),
|
||||
"rank": (None, ""),
|
||||
"sem": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'sem'",
|
||||
"category dtype does not support aggregation 'sem'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"shift": (None, ""),
|
||||
"size": (None, ""),
|
||||
"skew": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"dtype category does not support reduction 'skew'",
|
||||
"category type does not support skew operations",
|
||||
]
|
||||
),
|
||||
),
|
||||
"std": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'std'",
|
||||
"category dtype does not support aggregation 'std'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"sum": (TypeError, "category type does not support sum operations"),
|
||||
"var": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'var'",
|
||||
"category dtype does not support aggregation 'var'",
|
||||
]
|
||||
),
|
||||
),
|
||||
}[groupby_func]
|
||||
|
||||
if groupby_func == "fillna":
|
||||
kind = "Series" if groupby_series else "DataFrame"
|
||||
warn_msg = f"{kind}GroupBy.fillna is deprecated"
|
||||
else:
|
||||
warn_msg = ""
|
||||
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
def test_groupby_raises_category_udf(how, by, groupby_series, df_with_cat_col):
|
||||
# GH#50749
|
||||
df = df_with_cat_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
def func(x):
|
||||
raise TypeError("Test error message")
|
||||
|
||||
with pytest.raises(TypeError, match="Test error message"):
|
||||
getattr(gb, how)(func)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
|
||||
def test_groupby_raises_category_np(
|
||||
how, by, groupby_series, groupby_func_np, df_with_cat_col
|
||||
):
|
||||
# GH#50749
|
||||
df = df_with_cat_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
klass, msg = {
|
||||
np.sum: (TypeError, "category type does not support sum operations"),
|
||||
np.mean: (
|
||||
TypeError,
|
||||
"category dtype does not support aggregation 'mean'",
|
||||
),
|
||||
}[groupby_func_np]
|
||||
|
||||
if groupby_series:
|
||||
warn_msg = "using SeriesGroupBy.[sum|mean]"
|
||||
else:
|
||||
warn_msg = "using DataFrameGroupBy.[sum|mean]"
|
||||
_call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
|
||||
def test_groupby_raises_category_on_category(
|
||||
how,
|
||||
by,
|
||||
groupby_series,
|
||||
groupby_func,
|
||||
observed,
|
||||
using_copy_on_write,
|
||||
df_with_cat_col,
|
||||
):
|
||||
# GH#50749
|
||||
df = df_with_cat_col
|
||||
df["a"] = Categorical(
|
||||
["a", "a", "a", "a", "b", "b", "b", "b", "c"],
|
||||
categories=["a", "b", "c", "d"],
|
||||
ordered=True,
|
||||
)
|
||||
args = get_groupby_method_args(groupby_func, df)
|
||||
gb = df.groupby(by=by, observed=observed)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
if groupby_func == "corrwith":
|
||||
assert not hasattr(gb, "corrwith")
|
||||
return
|
||||
|
||||
empty_groups = not observed and any(group.empty for group in gb.groups.values())
|
||||
if (
|
||||
not observed
|
||||
and how != "transform"
|
||||
and isinstance(by, list)
|
||||
and isinstance(by[0], str)
|
||||
and by == ["a", "b"]
|
||||
):
|
||||
assert not empty_groups
|
||||
# TODO: empty_groups should be true due to unobserved categorical combinations
|
||||
empty_groups = True
|
||||
if how == "transform":
|
||||
# empty groups will be ignored
|
||||
empty_groups = False
|
||||
|
||||
klass, msg = {
|
||||
"all": (None, ""),
|
||||
"any": (None, ""),
|
||||
"bfill": (None, ""),
|
||||
"corrwith": (
|
||||
TypeError,
|
||||
r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
|
||||
),
|
||||
"count": (None, ""),
|
||||
"cumcount": (None, ""),
|
||||
"cummax": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(cummax is not supported for category dtype|"
|
||||
"category dtype not supported|"
|
||||
"category type does not support cummax operations)",
|
||||
),
|
||||
"cummin": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(cummin is not supported for category dtype|"
|
||||
"category dtype not supported|"
|
||||
"category type does not support cummin operations)",
|
||||
),
|
||||
"cumprod": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(cumprod is not supported for category dtype|"
|
||||
"category dtype not supported|"
|
||||
"category type does not support cumprod operations)",
|
||||
),
|
||||
"cumsum": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(cumsum is not supported for category dtype|"
|
||||
"category dtype not supported|"
|
||||
"category type does not support cumsum operations)",
|
||||
),
|
||||
"diff": (TypeError, "unsupported operand type"),
|
||||
"ffill": (None, ""),
|
||||
"fillna": (
|
||||
TypeError,
|
||||
r"Cannot setitem on a Categorical with a new category \(0\), "
|
||||
"set the categories first",
|
||||
)
|
||||
if not using_copy_on_write
|
||||
else (None, ""), # no-op with CoW
|
||||
"first": (None, ""),
|
||||
"idxmax": (ValueError, "empty group due to unobserved categories")
|
||||
if empty_groups
|
||||
else (None, ""),
|
||||
"idxmin": (ValueError, "empty group due to unobserved categories")
|
||||
if empty_groups
|
||||
else (None, ""),
|
||||
"last": (None, ""),
|
||||
"max": (None, ""),
|
||||
"mean": (TypeError, "category dtype does not support aggregation 'mean'"),
|
||||
"median": (TypeError, "category dtype does not support aggregation 'median'"),
|
||||
"min": (None, ""),
|
||||
"ngroup": (None, ""),
|
||||
"nunique": (None, ""),
|
||||
"pct_change": (TypeError, "unsupported operand type"),
|
||||
"prod": (TypeError, "category type does not support prod operations"),
|
||||
"quantile": (TypeError, ""),
|
||||
"rank": (None, ""),
|
||||
"sem": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'sem'",
|
||||
"category dtype does not support aggregation 'sem'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"shift": (None, ""),
|
||||
"size": (None, ""),
|
||||
"skew": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"category type does not support skew operations",
|
||||
"dtype category does not support reduction 'skew'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"std": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'std'",
|
||||
"category dtype does not support aggregation 'std'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"sum": (TypeError, "category type does not support sum operations"),
|
||||
"var": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'var'",
|
||||
"category dtype does not support aggregation 'var'",
|
||||
]
|
||||
),
|
||||
),
|
||||
}[groupby_func]
|
||||
|
||||
if groupby_func == "fillna":
|
||||
kind = "Series" if groupby_series else "DataFrame"
|
||||
warn_msg = f"{kind}GroupBy.fillna is deprecated"
|
||||
else:
|
||||
warn_msg = ""
|
||||
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg)
|
||||
|
||||
|
||||
def test_subsetting_columns_axis_1_raises():
|
||||
# GH 35443
|
||||
df = DataFrame({"a": [1], "b": [2], "c": [3]})
|
||||
msg = "DataFrame.groupby with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
gb = df.groupby("a", axis=1)
|
||||
with pytest.raises(ValueError, match="Cannot subset columns when using axis=1"):
|
||||
gb["b"]
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,963 @@
"""
test with the TimeGrouper / grouping with datetimes
"""

from datetime import (
    datetime,
    timedelta,
)

import numpy as np
import pytest
import pytz

import pandas as pd
from pandas import (
    DataFrame,
    DatetimeIndex,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
    offsets,
)
import pandas._testing as tm
from pandas.core.groupby.grouper import Grouper
from pandas.core.groupby.ops import BinGrouper


@pytest.fixture
def frame_for_truncated_bingrouper():
    """
    DataFrame used by groupby_with_truncated_bingrouper, made into
    a separate fixture for easier reuse in
    test_groupby_apply_timegrouper_with_nat_apply_squeeze
    """
    df = DataFrame(
        {
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": [
                Timestamp(2013, 9, 1, 13, 0),
                Timestamp(2013, 9, 1, 13, 5),
                Timestamp(2013, 10, 1, 20, 0),
                Timestamp(2013, 10, 3, 10, 0),
                pd.NaT,
                Timestamp(2013, 9, 2, 14, 0),
            ],
        }
    )
    return df


@pytest.fixture
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
    """
    GroupBy object such that gb._grouper is a BinGrouper and
    len(gb._grouper.result_index) < len(gb._grouper.group_keys_seq)

    Aggregations on this groupby should have

    dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")

    As either the index or an index level.
    """
    df = frame_for_truncated_bingrouper

    tdg = Grouper(key="Date", freq="5D")
    gb = df.groupby(tdg)

    # check we're testing the case we're interested in
    assert len(gb._grouper.result_index) != len(gb._grouper.group_keys_seq)

    return gb

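# Editor's note (sketch, not part of the original test file): a standalone
# illustration of the "truncated BinGrouper" case the fixtures above build.
# With a 5-day Grouper over 2013-09-01..2013-10-03 and one NaT row, the rows
# fall into 7 five-day bins and the NaT row belongs to no bin, which is why
# result_index and group_keys_seq can differ in length (``_grouper`` and
# friends are pandas internals and may change between versions):
#
#     import pandas as pd
#
#     df = pd.DataFrame(
#         {
#             "Quantity": [18, 3, 5, 1, 9, 3],
#             "Date": pd.to_datetime(
#                 ["2013-09-01", "2013-09-01", "2013-10-01",
#                  "2013-10-03", None, "2013-09-02"]
#             ),
#         }
#     )
#     gb = df.groupby(pd.Grouper(key="Date", freq="5D"))
#     print(gb.size())  # one row per 5-day bin; the NaT row is dropped
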
class TestGroupBy:
    def test_groupby_with_timegrouper(self):
        # GH 4161
        # TimeGrouper requires a sorted index
        # also verifies that the resultant index has the correct name
        df_original = DataFrame(
            {
                "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )

        # GH 6908 change target column's order
        df_reordered = df_original.sort_values(by="Quantity")

        for df in [df_original, df_reordered]:
            df = df.set_index(["Date"])

            exp_dti = date_range(
                "20130901",
                "20131205",
                freq="5D",
                name="Date",
                inclusive="left",
                unit=df.index.unit,
            )
            expected = DataFrame(
                {"Buyer": 0, "Quantity": 0},
                index=exp_dti,
            )
            # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
            expected = expected.astype({"Buyer": object})
            expected.iloc[0, 0] = "CarlCarlCarl"
            expected.iloc[6, 0] = "CarlCarl"
            expected.iloc[18, 0] = "Joe"
            expected.iloc[[0, 6, 18], 1] = np.array([24, 6, 9], dtype="int64")

            result1 = df.resample("5D").sum()
            tm.assert_frame_equal(result1, expected)

            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
            tm.assert_frame_equal(result2, expected)

            result3 = df.groupby(Grouper(freq="5D")).sum()
            tm.assert_frame_equal(result3, expected)

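    # Editor's note (sketch, not part of the original test): the three results
    # above rely on the identity that, for a frame indexed by a DatetimeIndex,
    # df.resample(freq).agg(...) and df.groupby(Grouper(freq=freq)).agg(...)
    # bin the rows identically. A minimal standalone illustration:
    #
    #     import pandas as pd
    #
    #     idx = pd.date_range("2013-09-01", periods=10, freq="D")
    #     frame = pd.DataFrame({"x": range(10)}, index=idx)
    #     pd.testing.assert_frame_equal(
    #         frame.resample("5D").sum(),
    #         frame.groupby(pd.Grouper(freq="5D")).sum(),
    #     )
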
@pytest.mark.parametrize("should_sort", [True, False])
|
||||
def test_groupby_with_timegrouper_methods(self, should_sort):
|
||||
# GH 3881
|
||||
# make sure API of timegrouper conforms
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"Branch": "A A A A A B".split(),
|
||||
"Buyer": "Carl Mark Carl Joe Joe Carl".split(),
|
||||
"Quantity": [1, 3, 5, 8, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 1, 1, 13, 0),
|
||||
datetime(2013, 1, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 12, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
if should_sort:
|
||||
df = df.sort_values(by="Quantity", ascending=False)
|
||||
|
||||
df = df.set_index("Date", drop=False)
|
||||
g = df.groupby(Grouper(freq="6ME"))
|
||||
assert g.group_keys
|
||||
|
||||
assert isinstance(g._grouper, BinGrouper)
|
||||
groups = g.groups
|
||||
assert isinstance(groups, dict)
|
||||
assert len(groups) == 3
|
||||
|
||||
def test_timegrouper_with_reg_groups(self):
|
||||
# GH 3794
|
||||
# allow combination of timegrouper/reg groups
|
||||
|
||||
df_original = DataFrame(
|
||||
{
|
||||
"Branch": "A A A A A A A B".split(),
|
||||
"Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
|
||||
"Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 1, 1, 13, 0),
|
||||
datetime(2013, 1, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 12, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
).set_index("Date")
|
||||
|
||||
df_sorted = df_original.sort_values(by="Quantity", ascending=False)
|
||||
|
||||
for df in [df_original, df_sorted]:
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Mark".split(),
|
||||
"Quantity": [10, 18, 3],
|
||||
"Date": [
|
||||
datetime(2013, 12, 31, 0, 0),
|
||||
datetime(2013, 12, 31, 0, 0),
|
||||
datetime(2013, 12, 31, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
|
||||
msg = "The default value of numeric_only"
|
||||
result = df.groupby([Grouper(freq="YE"), "Buyer"]).sum(numeric_only=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Mark Carl Joe".split(),
|
||||
"Quantity": [1, 3, 9, 18],
|
||||
"Date": [
|
||||
datetime(2013, 1, 1, 0, 0),
|
||||
datetime(2013, 1, 1, 0, 0),
|
||||
datetime(2013, 7, 1, 0, 0),
|
||||
datetime(2013, 7, 1, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df_original = DataFrame(
|
||||
{
|
||||
"Branch": "A A A A A A A B".split(),
|
||||
"Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
|
||||
"Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 10, 1, 13, 0),
|
||||
datetime(2013, 10, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 10, 2, 12, 0),
|
||||
datetime(2013, 10, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
).set_index("Date")
|
||||
|
||||
df_sorted = df_original.sort_values(by="Quantity", ascending=False)
|
||||
for df in [df_original, df_sorted]:
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Mark Carl Joe".split(),
|
||||
"Quantity": [6, 8, 3, 4, 10],
|
||||
"Date": [
|
||||
datetime(2013, 10, 1, 0, 0),
|
||||
datetime(2013, 10, 1, 0, 0),
|
||||
datetime(2013, 10, 1, 0, 0),
|
||||
datetime(2013, 10, 2, 0, 0),
|
||||
datetime(2013, 10, 2, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
|
||||
result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Mark".split(),
|
||||
"Quantity": [10, 18, 3],
|
||||
"Date": [
|
||||
datetime(2013, 10, 31, 0, 0),
|
||||
datetime(2013, 10, 31, 0, 0),
|
||||
datetime(2013, 10, 31, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# passing the name
|
||||
df = df.reset_index()
|
||||
result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
|
||||
numeric_only=True
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
|
||||
df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum()
|
||||
|
||||
# passing the level
|
||||
df = df.set_index("Date")
|
||||
result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum(
|
||||
numeric_only=True
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum(
|
||||
numeric_only=True
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with pytest.raises(ValueError, match="The level foo is not valid"):
|
||||
df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum()
|
||||
|
||||
# multi names
|
||||
df = df.copy()
|
||||
df["Date"] = df.index + offsets.MonthEnd(2)
|
||||
result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
|
||||
numeric_only=True
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Mark".split(),
|
||||
"Quantity": [10, 18, 3],
|
||||
"Date": [
|
||||
datetime(2013, 11, 30, 0, 0),
|
||||
datetime(2013, 11, 30, 0, 0),
|
||||
datetime(2013, 11, 30, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# error as we have both a level and a name!
|
||||
msg = "The Grouper cannot specify both a key and a level!"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.groupby(
|
||||
[Grouper(freq="1ME", key="Date", level="Date"), "Buyer"]
|
||||
).sum()
|
||||
|
||||
# single groupers
|
||||
expected = DataFrame(
|
||||
[[31]],
|
||||
columns=["Quantity"],
|
||||
index=DatetimeIndex(
|
||||
[datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
|
||||
),
|
||||
)
|
||||
result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected.index = expected.index.shift(1)
|
||||
assert expected.index.freq == offsets.MonthEnd()
|
||||
result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.groupby([Grouper(freq="1ME", key="Date")]).sum(
|
||||
numeric_only=True
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("freq", ["D", "ME", "YE", "QE-APR"])
|
||||
def test_timegrouper_with_reg_groups_freq(self, freq):
|
||||
# GH 6764 multiple grouping with/without sort
|
||||
df = DataFrame(
|
||||
{
|
||||
"date": pd.to_datetime(
|
||||
[
|
||||
"20121002",
|
||||
"20121007",
|
||||
"20130130",
|
||||
"20130202",
|
||||
"20130305",
|
||||
"20121002",
|
||||
"20121207",
|
||||
"20130130",
|
||||
"20130202",
|
||||
"20130305",
|
||||
"20130202",
|
||||
"20130305",
|
||||
]
|
||||
),
|
||||
"user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
|
||||
"whole_cost": [
|
||||
1790,
|
||||
364,
|
||||
280,
|
||||
259,
|
||||
201,
|
||||
623,
|
||||
90,
|
||||
312,
|
||||
359,
|
||||
301,
|
||||
359,
|
||||
801,
|
||||
],
|
||||
"cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
|
||||
}
|
||||
).set_index("date")
|
||||
|
||||
expected = (
|
||||
df.groupby("user_id")["whole_cost"]
|
||||
.resample(freq)
|
||||
.sum(min_count=1) # XXX
|
||||
.dropna()
|
||||
.reorder_levels(["date", "user_id"])
|
||||
.sort_index()
|
||||
.astype("int64")
|
||||
)
|
||||
expected.name = "whole_cost"
|
||||
|
||||
result1 = (
|
||||
df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
|
||||
)
|
||||
tm.assert_series_equal(result1, expected)
|
||||
|
||||
result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
|
||||
tm.assert_series_equal(result2, expected)
|
||||
|
||||
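    # Editor's note (sketch, not part of the original test): the equivalence
    # exercised above is that resampling "whole_cost" within each user_id group
    # and grouping by [Grouper(freq=freq), "user_id"] agree once the index
    # levels are reordered, e.g. (with df and freq as defined in the test):
    #
    #     lhs = df.groupby("user_id")["whole_cost"].resample(freq).sum(min_count=1)
    #     rhs = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
    #     # lhs is indexed (user_id, date) and rhs (date, user_id); they match
    #     # after lhs.dropna().reorder_levels(["date", "user_id"]).sort_index(),
    #     # up to the int64 cast performed in the test.
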
    def test_timegrouper_get_group(self):
        # GH 6914

        df_original = DataFrame(
            {
                "Buyer": "Carl Joe Joe Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )
        df_reordered = df_original.sort_values(by="Quantity")

        # single grouping
        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]
        dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(Grouper(freq="ME", key="Date"))
            for t, expected in zip(dt_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group(dt)
                tm.assert_frame_equal(result, expected)

        # multiple grouping
        expected_list = [
            df_original.iloc[[1]],
            df_original.iloc[[3]],
            df_original.iloc[[4]],
        ]
        g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")])
            for (b, t), expected in zip(g_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group((b, dt))
                tm.assert_frame_equal(result, expected)

        # with index
        df_original = df_original.set_index("Date")
        df_reordered = df_original.sort_values(by="Quantity")

        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(Grouper(freq="ME"))
            for t, expected in zip(dt_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group(dt)
                tm.assert_frame_equal(result, expected)

    def test_timegrouper_apply_return_type_series(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_series(x):
            return Series([x["value"].sum()], ("sum",))

        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series)
        tm.assert_frame_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_timegrouper_apply_return_type_value(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_value(x):
            return x.value.sum()

        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value)
        tm.assert_series_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_groupby_groups_datetimeindex(self):
        # GH#1430
        periods = 1000
        ind = date_range(start="2012/1/1", freq="5min", periods=periods)
        df = DataFrame(
            {"high": np.arange(periods), "low": np.arange(periods)}, index=ind
        )
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(next(iter(groups.keys())), datetime)

    def test_groupby_groups_datetimeindex2(self):
        # GH#11442
        index = date_range("2015/01/01", periods=5, name="date")
        df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level="date").groups
        dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
        expected = {
            Timestamp(date): DatetimeIndex([date], name="date") for date in dates
        }
        tm.assert_dict_equal(result, expected)

        grouped = df.groupby(level="date")
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, "A"], df.loc[date, "B"]]]
            expected_index = DatetimeIndex(
                [date], name="date", freq="D", dtype=index.dtype
            )
            expected = DataFrame(data, columns=list("AB"), index=expected_index)
            tm.assert_frame_equal(result, expected)

    def test_groupby_groups_datetimeindex_tz(self):
        # GH 3950
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "datetime": dates,
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )
        df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))

        exp_idx1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Pacific",
            name="datetime",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["datetime", "label"]).sum()
        tm.assert_frame_equal(result, expected)

        # by level
        didx = DatetimeIndex(dates, tz="Asia/Tokyo")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )

        exp_idx = DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            tz="Asia/Tokyo",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        tm.assert_frame_equal(result, expected)

    def test_frame_datetime64_handling_groupby(self):
        # it works!
        df = DataFrame(
            [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
            columns=["a", "date"],
        )
        result = df.groupby("a").first()
        assert result["date"][3] == Timestamp("2012-07-03")

    def test_groupby_multi_timezone(self):
        # combining multiple / different timezones yields UTC
        df = DataFrame(
            {
                "value": range(5),
                "date": [
                    "2000-01-28 16:47:00",
                    "2000-01-29 16:48:00",
                    "2000-01-30 16:49:00",
                    "2000-01-31 16:50:00",
                    "2000-01-01 16:50:00",
                ],
                "tz": [
                    "America/Chicago",
                    "America/Chicago",
                    "America/Los_Angeles",
                    "America/Chicago",
                    "America/New_York",
                ],
            }
        )

        result = df.groupby("tz", group_keys=False).date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
        )

        expected = Series(
            [
                Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
                Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
            ],
            name="date",
            dtype=object,
        )
        tm.assert_series_equal(result, expected)

        tz = "America/Chicago"
        res_values = df.groupby("tz").date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(
            ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
            index=[0, 1, 3],
            name="date",
        )
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        tm.assert_series_equal(result, expected)

    def test_groupby_groups_periods(self):
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "period": [pd.Period(d, freq="h") for d in dates],
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )

        exp_idx1 = pd.PeriodIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            freq="h",
            name="period",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["period", "label"]).sum()
        tm.assert_frame_equal(result, expected)

        # by level
        didx = pd.PeriodIndex(dates, freq="h")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )

        exp_idx = pd.PeriodIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            freq="h",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        tm.assert_frame_equal(result, expected)

    def test_groupby_first_datetime64(self):
        df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
        df[1] = df[1].astype("M8[ns]")

        assert issubclass(df[1].dtype.type, np.datetime64)

        result = df.groupby(level=0).first()
        got_dt = result[1].dtype
        assert issubclass(got_dt.type, np.datetime64)

        result = df[1].groupby(level=0).first()
        got_dt = result.dtype
        assert issubclass(got_dt.type, np.datetime64)

    def test_groupby_max_datetime64(self):
        # GH 5869
        # datetimelike dtype conversion from int
        df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
        # TODO: can we retain second reso in .apply here?
        expected = df.groupby("A")["A"].apply(lambda x: x.max()).astype("M8[s]")
        result = df.groupby("A")["A"].max()
        tm.assert_series_equal(result, expected)

    def test_groupby_datetime64_32_bit(self):
        # GH 6410 / numpy 4328
        # 32-bit under 1.9-dev indexing issue

        df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2})
        result = df.groupby("A")["B"].transform("min")
        expected = Series([Timestamp("2000-01-1")] * 2, name="B")
        tm.assert_series_equal(result, expected)

    def test_groupby_with_timezone_selection(self):
        # GH 11616
        # Test that column selection returns output in correct timezone.

        df = DataFrame(
            {
                "factor": np.random.default_rng(2).integers(0, 3, size=60),
                "time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"),
            }
        )
        df1 = df.groupby("factor").max()["time"]
        df2 = df.groupby("factor")["time"].max()
        tm.assert_series_equal(df1, df2)

    def test_timezone_info(self):
        # see gh-11682: Timezone info lost when broadcasting
        # scalar datetime to DataFrame

        df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]})
        assert df["b"][0].tzinfo == pytz.utc
        df = DataFrame({"a": [1, 2, 3]})
        df["b"] = datetime.now(pytz.utc)
        assert df["b"][0].tzinfo == pytz.utc

    def test_datetime_count(self):
        df = DataFrame(
            {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")}
        )
        result = df.groupby("a").dates.count()
        expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
        tm.assert_series_equal(result, expected)

    def test_first_last_max_min_on_time_data(self):
        # GH 10295
        # Verify that NaT is not in the result of max, min, first and last on
        # Dataframe with datetime or timedelta values.
        df_test = DataFrame(
            {
                "dt": [
                    np.nan,
                    "2015-07-24 10:10",
                    "2015-07-25 11:11",
                    "2015-07-23 12:12",
                    np.nan,
                ],
                "td": [
                    np.nan,
                    timedelta(days=1),
                    timedelta(days=2),
                    timedelta(days=3),
                    np.nan,
                ],
            }
        )
        df_test.dt = pd.to_datetime(df_test.dt)
        df_test["group"] = "A"
        df_ref = df_test[df_test.dt.notna()]

        grouped_test = df_test.groupby("group")
        grouped_ref = df_ref.groupby("group")

        tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
        tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
        tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
        tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())

    def test_nunique_with_timegrouper_and_nat(self):
        # GH 17575
        test = DataFrame(
            {
                "time": [
                    Timestamp("2016-06-28 09:35:35"),
                    pd.NaT,
                    Timestamp("2016-06-28 16:46:28"),
                ],
                "data": ["1", "2", "3"],
            }
        )

        grouper = Grouper(key="time", freq="h")
        result = test.groupby(grouper)["data"].nunique()
        expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
        expected.index = expected.index._with_freq(None)
        tm.assert_series_equal(result, expected)

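    # Editor's note (sketch, not part of the original test): a TimeGrouper never
    # assigns NaT to a bin, so rows whose key is NaT drop out of the result,
    # which is why the expected value above can be computed from the
    # NaT-filtered frame. Standalone illustration:
    #
    #     import pandas as pd
    #
    #     frame = pd.DataFrame(
    #         {"time": pd.to_datetime(["2016-06-28 09:35", None]), "data": ["1", "2"]}
    #     )
    #     out = frame.groupby(pd.Grouper(key="time", freq="h"))["data"].nunique()
    #     print(out)  # only the 09:00 bin appears; the NaT row is excluded
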
    def test_scalar_call_versus_list_call(self):
        # Issue: 17530
        data_frame = {
            "location": ["shanghai", "beijing", "shanghai"],
            "time": Series(
                ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
                dtype="datetime64[ns]",
            ),
            "value": [1, 2, 3],
        }
        data_frame = DataFrame(data_frame).set_index("time")
        grouper = Grouper(freq="D")

        grouped = data_frame.groupby(grouper)
        result = grouped.count()
        grouped = data_frame.groupby([grouper])
        expected = grouped.count()

        tm.assert_frame_equal(result, expected)

    def test_grouper_period_index(self):
        # GH 32108
        periods = 2
        index = pd.period_range(
            start="2018-01", periods=periods, freq="M", name="Month"
        )
        period_series = Series(range(periods), index=index)
        result = period_series.groupby(period_series.index.month).sum()

        expected = Series(
            range(periods), index=Index(range(1, periods + 1), name=index.name)
        )
        tm.assert_series_equal(result, expected)

    def test_groupby_apply_timegrouper_with_nat_dict_returns(
        self, groupby_with_truncated_bingrouper
    ):
        # GH#43500 case where gb._grouper.result_index and gb._grouper.group_keys_seq
        # have different lengths that goes through the `isinstance(values[0], dict)`
        # path
        gb = groupby_with_truncated_bingrouper

        res = gb["Quantity"].apply(lambda x: {"foo": len(x)})

        df = gb.obj
        unit = df["Date"]._values.unit
        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
        mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
        expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
        tm.assert_series_equal(res, expected)

    def test_groupby_apply_timegrouper_with_nat_scalar_returns(
        self, groupby_with_truncated_bingrouper
    ):
        # GH#43500 Previously raised ValueError bc used index with incorrect
        # length in wrap_applied_result
        gb = groupby_with_truncated_bingrouper

        res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)

        df = gb.obj
        unit = df["Date"]._values.unit
        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
        expected = Series(
            [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
            index=dti._with_freq(None),
            name="Quantity",
        )

        tm.assert_series_equal(res, expected)

    def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
        self, frame_for_truncated_bingrouper
    ):
        df = frame_for_truncated_bingrouper

        # We need to create a GroupBy object with only one non-NaT group,
        # so use a huge freq so that all non-NaT dates will be grouped together
        tdg = Grouper(key="Date", freq="100YE")
        gb = df.groupby(tdg)

        # check that we will go through the singular_series path
        # in _wrap_applied_output_series
        assert gb.ngroups == 1
        assert gb._selected_obj._get_axis(gb.axis).nlevels == 1

        # function that returns a Series
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            res = gb.apply(lambda x: x["Quantity"] * 2)

        dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date")
        expected = DataFrame(
            [[36, 6, 6, 10, 2]],
            index=dti,
            columns=Index([0, 1, 5, 2, 3], name="Quantity"),
        )
        tm.assert_frame_equal(res, expected)

    @pytest.mark.single_cpu
    def test_groupby_agg_numba_timegrouper_with_nat(
        self, groupby_with_truncated_bingrouper
    ):
        pytest.importorskip("numba")

        # See discussion in GH#43487
        gb = groupby_with_truncated_bingrouper

        result = gb["Quantity"].aggregate(
            lambda values, index: np.nanmean(values), engine="numba"
        )

        expected = gb["Quantity"].aggregate("mean")
        tm.assert_series_equal(result, expected)

        result_df = gb[["Quantity"]].aggregate(
            lambda values, index: np.nanmean(values), engine="numba"
        )
        expected_df = gb[["Quantity"]].aggregate("mean")
        tm.assert_frame_equal(result_df, expected_df)
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,284 @@
import numpy as np
import pytest

from pandas.errors import NumbaUtilError

from pandas import (
    DataFrame,
    Series,
    option_context,
)
import pandas._testing as tm

pytestmark = pytest.mark.single_cpu


def test_correct_function_signature():
    pytest.importorskip("numba")

    def incorrect_function(x):
        return x + 1

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    with pytest.raises(NumbaUtilError, match="The first 2"):
        data.groupby("key").transform(incorrect_function, engine="numba")

    with pytest.raises(NumbaUtilError, match="The first 2"):
        data.groupby("key")["data"].transform(incorrect_function, engine="numba")

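# Editor's note (sketch, not part of the original test): with engine="numba",
# a groupby UDF must accept the group's values and index as its first two
# positional arguments; incorrect_function above fails because it takes only
# one. A correctly-signed counterpart would look like:
#
#     def correct_function(values, index):
#         # values and index arrive as NumPy arrays inside the jitted kernel
#         return values + 1
#
#     data.groupby("key").transform(correct_function, engine="numba")
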
def test_check_nopython_kwargs():
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return values + 1

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    with pytest.raises(NumbaUtilError, match="numba does not support"):
        data.groupby("key").transform(incorrect_function, engine="numba", a=1)

    with pytest.raises(NumbaUtilError, match="numba does not support"):
        data.groupby("key")["data"].transform(incorrect_function, engine="numba", a=1)


@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    pytest.importorskip("numba")

    def func(values, index):
        return values + 1

    if jit:
        # Test accepted jitted functions
        import numba

        func = numba.jit(func)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0, as_index=as_index)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.transform(func, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x + 1, engine="cython")

    tm.assert_equal(result, expected)


@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    def func_2(values, index):
        return values * 5

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x + 1, engine="cython")
    tm.assert_equal(result, expected)

    result = grouped.transform(func_2, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x * 5, engine="cython")
    tm.assert_equal(result, expected)

    # Retest func_1 which should use the cache
    result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x + 1, engine="cython")
    tm.assert_equal(result, expected)


def test_use_global_config():
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)
    expected = grouped.transform(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        result = grouped.transform(func_1, engine=None)
    tm.assert_frame_equal(expected, result)

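# Editor's note (sketch, not part of the original test): as exercised above,
# enabling the global option makes engine=None (the default) dispatch to the
# numba engine, so callers need not pass engine="numba" at every call site:
#
#     with option_context("compute.use_numba", True):
#         grouped.transform(func_1)  # runs via numba
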
# TODO: Test more than just reductions (e.g. actually test transformations once we have
@pytest.mark.parametrize(
    "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}]
)
def test_string_cython_vs_numba(agg_func, numba_supported_reductions):
    pytest.importorskip("numba")
    agg_func, kwargs = numba_supported_reductions
    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    result = grouped.transform(agg_func, engine="numba", **kwargs)
    expected = grouped.transform(agg_func, engine="cython", **kwargs)
    tm.assert_frame_equal(result, expected)

    result = grouped[1].transform(agg_func, engine="numba", **kwargs)
    expected = grouped[1].transform(agg_func, engine="cython", **kwargs)
    tm.assert_series_equal(result, expected)


def test_args_not_cached():
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]
    result = grouped_x.transform(sum_last, 1, engine="numba")
    expected = Series([1.0] * 4, name="x")
    tm.assert_series_equal(result, expected)

    result = grouped_x.transform(sum_last, 2, engine="numba")
    expected = Series([2.0] * 4, name="x")
    tm.assert_series_equal(result, expected)


def test_index_data_correctly_passed():
    # GH 43133
    pytest.importorskip("numba")

    def f(values, index):
        return index - 1

    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
    result = df.groupby("group").transform(f, engine="numba")
    expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=[-1, -2, -3])
    tm.assert_frame_equal(result, expected)


def test_engine_kwargs_not_cached():
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        return nogil + parallel + nopython

    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    df = DataFrame({"value": [0, 0, 0]})
    result = df.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame({"value": [2.0, 2.0, 2.0]})
    tm.assert_frame_equal(result, expected)

    nogil = False
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame({"value": [1.0, 1.0, 1.0]})
    tm.assert_frame_equal(result, expected)

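# Editor's note (not part of the original test): func_kwargs above closes over
# the local nogil/parallel/nopython flags, and numba bakes the closed-over
# values in at compile time. Seeing 2.0 (True + False + True) and then 1.0
# (False + False + True) therefore proves that changing engine_kwargs forced a
# fresh JIT compilation rather than reusing the previously cached function.
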
@pytest.mark.filterwarnings("ignore")
|
||||
def test_multiindex_one_key(nogil, parallel, nopython):
|
||||
pytest.importorskip("numba")
|
||||
|
||||
def numba_func(values, index):
|
||||
return 1
|
||||
|
||||
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
|
||||
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
|
||||
result = df.groupby("A").transform(
|
||||
numba_func, engine="numba", engine_kwargs=engine_kwargs
|
||||
)
|
||||
expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
|
||||
pytest.importorskip("numba")
|
||||
|
||||
def numba_func(values, index):
|
||||
return 1
|
||||
|
||||
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
|
||||
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
|
||||
with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
|
||||
df.groupby(["A", "B"]).transform(
|
||||
numba_func, engine="numba", engine_kwargs=engine_kwargs
|
||||
)
|
||||
|
||||
|
||||
def test_multilabel_numba_vs_cython(numba_supported_reductions):
|
||||
pytest.importorskip("numba")
|
||||
reduction, kwargs = numba_supported_reductions
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
|
||||
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
|
||||
"C": np.random.default_rng(2).standard_normal(8),
|
||||
"D": np.random.default_rng(2).standard_normal(8),
|
||||
}
|
||||
)
|
||||
gb = df.groupby(["A", "B"])
|
||||
res_agg = gb.transform(reduction, engine="numba", **kwargs)
|
||||
expected_agg = gb.transform(reduction, engine="cython", **kwargs)
|
||||
tm.assert_frame_equal(res_agg, expected_agg)
|
||||
|
||||
|
||||
def test_multilabel_udf_numba_vs_cython():
|
||||
pytest.importorskip("numba")
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
|
||||
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
|
||||
"C": np.random.default_rng(2).standard_normal(8),
|
||||
"D": np.random.default_rng(2).standard_normal(8),
|
||||
}
|
||||
)
|
||||
gb = df.groupby(["A", "B"])
|
||||
result = gb.transform(
|
||||
lambda values, index: (values - values.min()) / (values.max() - values.min()),
|
||||
engine="numba",
|
||||
)
|
||||
expected = gb.transform(
|
||||
lambda x: (x - x.min()) / (x.max() - x.min()), engine="cython"
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff