done
This commit is contained in:
		
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -0,0 +1,437 @@ | ||||
| """ | ||||
| test cython .agg behavior | ||||
| """ | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.core.dtypes.common import ( | ||||
|     is_float_dtype, | ||||
|     is_integer_dtype, | ||||
| ) | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     NaT, | ||||
|     Series, | ||||
|     Timedelta, | ||||
|     Timestamp, | ||||
|     bdate_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| import pandas.core.common as com | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Check a groupby reduction against the same reduction applied per group.

    The single-key frame result is always compared; the two-key Series result
    is only compared for ``sum`` and ``prod``.
    """
    frame = DataFrame(
        {
            "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
            "B": ["A", "B"] * 6,
            "C": np.random.default_rng(2).standard_normal(12),
        }
    )
    frame.loc[2:10:2, "C"] = np.nan

    def apply_op(obj):
        # Call the reduction named by ``op_name`` with no arguments.
        return getattr(obj, op_name)()

    # single column
    by_a = frame.drop(["B"], axis=1).groupby("A")
    per_group = {}
    for key, chunk in by_a:
        per_group[key] = apply_op(chunk["C"])
    expected_frame = DataFrame({"C": per_group})
    expected_frame.index.name = "A"
    tm.assert_frame_equal(apply_op(by_a), expected_frame)

    # multiple columns
    by_a_b = frame.groupby(["A", "B"])
    nested = {}
    for (outer, inner), chunk in by_a_b:
        nested.setdefault(outer, {})[inner] = apply_op(chunk["C"])
    expected_series = DataFrame(nested).T.stack(future_stack=True)
    expected_series.index.names = ["A", "B"]
    expected_series.name = "C"

    actual_series = apply_op(by_a_b)["C"]
    if op_name in ("sum", "prod"):
        tm.assert_series_equal(actual_series, expected_series)
|  | ||||
|  | ||||
| def test_cython_agg_boolean(): | ||||
|     frame = DataFrame( | ||||
|         { | ||||
|             "a": np.random.default_rng(2).integers(0, 5, 50), | ||||
|             "b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"), | ||||
|         } | ||||
|     ) | ||||
|     result = frame.groupby("a")["b"].mean() | ||||
|     msg = "using SeriesGroupBy.mean" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         # GH#53425 | ||||
|         expected = frame.groupby("a")["b"].agg(np.mean) | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_cython_agg_nothing_to_agg():
    """``mean(numeric_only=True)`` on string data: Series raises, frame is empty."""

    def build_frame():
        # Fresh frame per check: integer keys "a" and a string column "b".
        return DataFrame(
            {
                "a": np.random.default_rng(2).integers(0, 5, 50),
                "b": ["foo", "bar"] * 25,
            }
        )

    frame = build_frame()
    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    with pytest.raises(TypeError, match=msg):
        frame.groupby("a")["b"].mean(numeric_only=True)

    frame = build_frame()
    result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)

    group_keys = frame["a"].sort_values().drop_duplicates()
    no_columns = Index([], dtype="str")
    expected = DataFrame([], index=group_keys, columns=no_columns)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_cython_agg_nothing_to_agg_with_dates():
    """``mean(numeric_only=True)`` on a grouped datetime column raises TypeError."""
    columns = {
        "a": np.random.default_rng(2).integers(0, 5, 50),
        "b": ["foo", "bar"] * 25,
        "dates": pd.date_range("now", periods=50, freq="min"),
    }
    frame = DataFrame(columns)

    with pytest.raises(
        TypeError,
        match=(
            "Cannot use numeric_only=True with SeriesGroupBy.mean "
            "and non-numeric dtypes"
        ),
    ):
        frame.groupby("b").dates.mean(numeric_only=True)
|  | ||||
|  | ||||
| def test_cython_agg_frame_columns(): | ||||
|     # #2113 | ||||
|     df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]}) | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(level=0, axis="columns").mean() | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(level=0, axis="columns").mean() | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(level=0, axis="columns").mean() | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(level=0, axis="columns").mean() | ||||
|  | ||||
|  | ||||
def test_cython_agg_return_dict():
    """An aggregation function may return a dict for each group."""
    # GH 16741
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )

    def count_values(column):
        return column.value_counts().to_dict()

    result = frame.groupby("A")["B"].agg(count_values)

    bar_counts = {"two": 1, "one": 1, "three": 1}
    foo_counts = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [bar_counts, foo_counts],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
| def test_cython_fail_agg(): | ||||
|     dr = bdate_range("1/1/2000", periods=50) | ||||
|     ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) | ||||
|  | ||||
|     grouped = ts.groupby(lambda x: x.month) | ||||
|     summed = grouped.sum() | ||||
|     msg = "using SeriesGroupBy.sum" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         # GH#53425 | ||||
|         expected = grouped.agg(np.sum).astype(object) | ||||
|     tm.assert_series_equal(summed, expected) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("sum", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """``_cython_agg_general(op)`` agrees with ``agg(targop)`` on random data."""
    frame = DataFrame(np.random.default_rng(2).standard_normal(1000))
    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)

    result = frame.groupby(labels)._cython_agg_general(
        op, alt=None, numeric_only=True
    )

    # GH#53425: a FutureWarning is expected only for callables that appear
    # in ``com._cython_table``.
    if targop in com._cython_table:
        expected_warning = FutureWarning
    else:
        expected_warning = None

    with tm.assert_produces_warning(
        expected_warning, match=f"using DataFrameGroupBy.{op}"
    ):
        expected = frame.groupby(labels).agg(targop)

    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """Cython aggregation over binned data with empty bins matches a Python UDF."""
    frame = DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    def group_by_bins():
        # A fresh groupby object for each side of the comparison.
        return frame.groupby(pd.cut(frame[0], bin_edges), observed=observed)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    result = group_by_bins()._cython_agg_general(op, alt=None, numeric_only=True)

    expected = group_by_bins().agg(lambda x: targop(x))
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_cython_agg_empty_buckets_nanops(observed):
    """``sum`` and ``prod`` over binned data give 0 and 1 respectively for empty bins."""
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    frame = DataFrame([11, 12, 13], columns=["a"])
    bin_edges = np.arange(0, 25, 5, dtype=int)
    intervals = pd.interval_range(0, 20, freq=5)

    # (operation, expected value per bin, value an empty bin produces)
    cases = [
        ("sum", [0, 0, 36, 0], 0),
        ("prod", [1, 1, 1716, 1], 1),
    ]
    for how, per_bin, empty_value in cases:
        binned = frame.groupby(pd.cut(frame["a"], bin_edges), observed=observed)
        result = binned._cython_agg_general(how, alt=None, numeric_only=True)

        expected = DataFrame(
            {"a": per_bin},
            index=pd.CategoricalIndex(intervals, name="a", ordered=True),
        )
        if observed:
            # with observed=True the empty bins are absent from the result
            expected = expected[expected.a != empty_value]

        tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Aggregating one-row groups returns the value, or NaT, unchanged."""
    # https://github.com/pandas-dev/pandas/issues/19526
    values = [data, NaT]
    frame = DataFrame({"a": [0, 1], "b": values})

    # We will group by a and test the cython aggregations
    result = frame.groupby("a").aggregate(op)

    expected = DataFrame({"b": values}, index=Index([0, 1], name="a"))
    tm.assert_frame_equal(expected, result)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "agg",
    [
        "min",
        "max",
        "count",
        "sum",
        "prod",
        "var",
        "mean",
        "median",
        "ohlc",
        "cumprod",
        "cumsum",
        "shift",
        "any",
        "all",
        "quantile",
        "first",
        "last",
        "rank",
        "cummin",
        "cummax",
    ],
)
def test_read_only_buffer_source_agg(agg):
    """Aggregating a frame with a read-only buffer matches aggregating a copy."""
    # https://github.com/pandas-dev/pandas/issues/36014
    measurements = [5.1, 4.9, 4.7, 4.6, 5.0]
    frame = DataFrame(
        {
            "sepal_length": measurements,
            "species": ["setosa"] * 5,
        }
    )
    # Mark the first array held by the manager as non-writeable.
    frame._mgr.arrays[0].flags.writeable = False

    result = frame.groupby(["species"]).agg({"sepal_length": agg})

    writable_copy = frame.copy()
    expected = writable_copy.groupby(["species"]).agg({"sepal_length": agg})

    tm.assert_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        "median",
        "prod",
        "min",
        "max",
    ],
)
def test_cython_agg_nullable_int(op_name):
    """Reductions on a nullable ``Int64`` column match the float64 result."""
    # ensure that the cython-based aggregations don't fail for nullable dtype
    # (eg https://github.com/pandas-dev/pandas/issues/37415)
    nullable_values = pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64")
    frame = DataFrame({"A": ["A", "B"] * 5, "B": nullable_values})

    result = getattr(frame.groupby("A")["B"], op_name)()

    as_float = frame.assign(B=frame["B"].astype("float64"))
    expected = getattr(as_float.groupby("A")["B"], op_name)()
    # The float64 result is converted to nullable dtypes for comparison;
    # integer conversion is switched off for mean and median.
    keep_integers = op_name not in ("mean", "median")
    expected = expected.convert_dtypes(convert_integer=keep_integers)

    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_count_masked_returns_masked_dtype(dtype):
    """``count`` on masked columns is expected to return ``Int64`` counts."""
    with_missing = pd.array([1, pd.NA], dtype=dtype)
    without_missing = pd.array([1, 1], dtype=dtype)
    frame = DataFrame({"A": [1, 1], "B": with_missing, "C": without_missing})

    result = frame.groupby("A").count()

    # One non-missing value in "B", two in "C", all in a single group.
    expected = DataFrame(
        [[1, 2]],
        index=Index([1], name="A"),
        columns=["B", "C"],
        dtype="Int64",
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
    "op_name, action",
    [
        # ("count", "always_int"),
        ("sum", "large_int"),
        # ("std", "always_float"),
        ("var", "always_float"),
        # ("sem", "always_float"),
        ("mean", "always_float"),
        ("median", "always_float"),
        ("prod", "large_int"),
        ("min", "preserve"),
        ("max", "preserve"),
        ("first", "preserve"),
        ("last", "preserve"),
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        pd.array([1, 2, 3, 4], dtype="Int64"),
        pd.array([1, 2, 3, 4], dtype="Int8"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
        pd.array([True, True, False, False], dtype="boolean"),
    ],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    """Check the result dtype of reductions on masked extension arrays.

    Parameters
    ----------
    data : masked extension array
        Values for the aggregated column.
    op_name : str
        Name of the groupby reduction to call.
    action : str
        Rule used to derive the expected dtype: ``"always_int"``,
        ``"large_int"``, ``"always_float"`` or ``"preserve"``.
    with_na : bool
        Whether the last element is replaced by ``pd.NA`` before grouping.

    Raises
    ------
    ValueError
        If ``action`` is not one of the rules listed above.
    """
    if with_na:
        # ``data`` is the array object created in the parametrize list, and
        # that object is reused for every other parameter combination.
        # Writing NA into it directly would leak the missing value into the
        # ``with_na=False`` cases, so work on a private copy.
        data = data.copy()
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # float and integer dtypes are preserved (matching the numpy dtype
        # we'd get with the non-nullable analogue); anything else, i.e. the
        # boolean input here, is expected to become Int64
        if is_float_dtype(data.dtype) or is_integer_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype
    else:
        # Fail loudly instead of hitting an unbound ``expected_dtype`` below.
        raise ValueError(f"unknown action: {action!r}")

    # The same dtype is expected from the method and from ``aggregate``,
    # on both the DataFrameGroupBy and the SeriesGroupBy.
    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
| @ -0,0 +1,402 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.compat import is_platform_arm | ||||
| from pandas.errors import NumbaUtilError | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     NamedAgg, | ||||
|     Series, | ||||
|     option_context, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.util.version import Version | ||||
|  | ||||
# Marks applied to every test collected from this module.
pytestmark = [pytest.mark.single_cpu]

# Skip the whole module when numba cannot be imported.
numba = pytest.importorskip("numba")

# Extend the module marks in place with a conditional skip that applies only
# to numba 0.61 on ARM platforms.
pytestmark += [
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    )
]
|  | ||||
|  | ||||
def test_correct_function_signature():
    """A one-parameter UDF raises ``NumbaUtilError`` under ``engine="numba"``."""
    pytest.importorskip("numba")

    def incorrect_function(x):
        return sum(x) * 2.7

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # Same check for the DataFrameGroupBy and then the SeriesGroupBy.
    targets = [frame.groupby("key"), frame.groupby("key")["data"]]
    for target in targets:
        with pytest.raises(NumbaUtilError, match="The first 2"):
            target.agg(incorrect_function, engine="numba")
|  | ||||
|  | ||||
def test_check_nopython_kwargs():
    """Passing an extra keyword argument with ``engine="numba"`` raises."""
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return sum(values) * 2.7

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # Same check for the DataFrameGroupBy and then the SeriesGroupBy.
    targets = [frame.groupby("key"), frame.groupby("key")["data"]]
    for target in targets:
        with pytest.raises(NumbaUtilError, match="numba does not support"):
            target.agg(incorrect_function, engine="numba", a=1)
|  | ||||
|  | ||||
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    """A UDF run with ``engine="numba"`` matches the equivalent cython-engine UDF."""
    numba_module = pytest.importorskip("numba")

    def func_numba(values, index):
        return np.mean(values) * 2.7

    if jit:
        # Test accepted jitted functions
        func_numba = numba_module.jit(func_numba)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = frame.groupby(0, as_index=as_index)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.agg(
        func_numba,
        engine="numba",
        engine_kwargs={"nogil": nogil, "parallel": parallel, "nopython": nopython},
    )
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")

    tm.assert_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Alternating between two UDFs keeps returning each UDF's own result."""
    # Test that the functions are cached correctly if we switch functions
    numba_module = pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    def func_2(values, index):
        return np.mean(values) * 2.7

    if jit:
        func_1 = numba_module.jit(func_1)
        func_2 = numba_module.jit(func_2)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = frame.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    rounds = [
        # first use of func_1
        (func_1, lambda x: np.mean(x) - 3.4),
        # Add func_2 to the cache
        (func_2, lambda x: np.mean(x) * 2.7),
        # Retest func_1 which should use the cache
        (func_1, lambda x: np.mean(x) - 3.4),
    ]
    for numba_udf, cython_udf in rounds:
        result = grouped.agg(numba_udf, engine="numba", engine_kwargs=engine_kwargs)
        expected = grouped.agg(cython_udf, engine="cython")
        tm.assert_equal(result, expected)
|  | ||||
|  | ||||
def test_use_global_config():
    """``compute.use_numba`` with ``engine=None`` matches an explicit numba engine."""
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = frame.groupby(0)

    explicit = grouped.agg(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        via_option = grouped.agg(func_1, engine=None)

    tm.assert_frame_equal(explicit, via_option)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "agg_kwargs",
    [
        {"func": ["min", "max"]},
        {"func": "min"},
        {"func": {1: ["min", "max"], 2: "sum"}},
        {"bmin": NamedAgg(column=1, aggfunc="min")},
    ],
)
def test_multifunc_numba_vs_cython_frame(agg_kwargs):
    """Multi-function frame aggregations agree between the numba and cython engines."""
    pytest.importorskip("numba")
    frame = DataFrame(
        {
            0: ["a", "a", "b", "b", "a"],
            1: [1.0, 2.0, 3.0, 4.0, 5.0],
            2: [1, 2, 3, 4, 5],
        },
        columns=[0, 1, 2],
    )
    grouped = frame.groupby(0)

    via_numba = grouped.agg(**agg_kwargs, engine="numba")
    via_cython = grouped.agg(**agg_kwargs, engine="cython")

    tm.assert_frame_equal(via_numba, via_cython)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "agg_kwargs,expected_func",
    [
        ({"func": lambda values, index: values.sum()}, "sum"),
        # FIXME
        pytest.param(
            {
                "func": [
                    lambda values, index: values.sum(),
                    lambda values, index: values.min(),
                ]
            },
            ["sum", "min"],
            marks=pytest.mark.xfail(
                reason="This doesn't work yet! Fails in nopython pipeline!"
            ),
        ),
    ],
)
def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
    """Numba UDF aggregations match the named cython reductions, ignoring dtype."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, 2.0, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    grouped = DataFrame(columns, columns=[0, 1, 2]).groupby(0)

    via_numba = grouped.agg(**agg_kwargs, engine="numba")
    via_cython = grouped.agg(expected_func, engine="cython")

    # check_dtype can be removed if GH 44952 is addressed
    # Currently, UDFs still always return float64 while reductions can preserve dtype
    tm.assert_frame_equal(via_numba, via_cython, check_dtype=False)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "agg_kwargs",
    [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
)
def test_multifunc_numba_vs_cython_series(agg_kwargs):
    """Multi-function Series aggregations agree between the numba and cython engines.

    Parameters
    ----------
    agg_kwargs : dict
        Keyword arguments forwarded to ``SeriesGroupBy.agg``; must not
        contain an ``"engine"`` key, since the engine is supplied here.
    """
    pytest.importorskip("numba")
    labels = ["a", "a", "b", "b", "a"]
    data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
    grouped = data.groupby(labels)

    # Supply ``engine`` as a separate keyword instead of writing it into
    # ``agg_kwargs``: the dict is the object from the parametrize list, so
    # mutating it would leave an "engine" entry behind after the test runs.
    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")

    # A list or named aggregation returns a DataFrame, a single function a Series.
    if isinstance(expected, DataFrame):
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.single_cpu
@pytest.mark.parametrize(
    "data,agg_kwargs",
    [
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": ["min", "max"]},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": "min"},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": {1: ["min", "max"], 2: "sum"}},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"min_col": NamedAgg(column=1, aggfunc="min")},
        ),
    ],
)
def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
    """Passing ``engine_kwargs={"parallel": True}`` leaves the results unchanged."""
    pytest.importorskip("numba")
    grouped = data.groupby(["a", "a", "b", "b", "a"])

    with_engine_kwargs = grouped.agg(
        **agg_kwargs, engine="numba", engine_kwargs={"parallel": True}
    )
    without_engine_kwargs = grouped.agg(**agg_kwargs, engine="numba")

    # Pick the comparison helper that fits the returned container.
    if isinstance(without_engine_kwargs, DataFrame):
        compare = tm.assert_frame_equal
    else:
        compare = tm.assert_series_equal
    compare(with_engine_kwargs, without_engine_kwargs)
|  | ||||
|  | ||||
def test_args_not_cached():
    """A second call with a different positional argument gives a new result."""
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    frame = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = frame.groupby("id")["x"]

    # Every value is 1, so summing the last ``window`` rows gives ``window``.
    for window, total in ((1, 1.0), (2, 2.0)):
        result = grouped_x.agg(sum_last, window, engine="numba")
        expected = Series([total] * 2, name="x", index=Index([0, 1], name="id"))
        tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_index_data_correctly_passed():
    """The ``index`` argument of a numba UDF carries each group's index labels."""
    # GH 43133
    pytest.importorskip("numba")

    def f(values, index):
        return np.mean(index)

    frame = DataFrame(
        {"group": ["A", "A", "B"], "v": [4, 5, 6]},
        index=[-1, -2, -3],
    )
    result = frame.groupby("group").aggregate(f, engine="numba")

    # Group "A" has labels -1 and -2 (mean -1.5); group "B" has only -3.
    expected = DataFrame(
        [-1.5, -3.0],
        columns=["v"],
        index=Index(["A", "B"], name="group"),
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_engine_kwargs_not_cached():
    """Changing ``engine_kwargs`` between calls must change the UDF's result."""
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        # Reads the enclosing flags, so the result reflects their current values.
        return nogil + parallel + nopython

    frame = DataFrame({"value": [0, 0, 0]})

    def aggregate_with_current_flags():
        # Build engine_kwargs from the flags as they are bound right now.
        flags = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
        return frame.groupby(level=0).aggregate(
            func_kwargs, engine="numba", engine_kwargs=flags
        )

    result = aggregate_with_current_flags()
    expected = DataFrame({"value": [2.0, 2.0, 2.0]})
    tm.assert_frame_equal(result, expected)

    # Flip one flag; the sum of the flags drops from 2 to 1.
    nogil = False
    result = aggregate_with_current_flags()
    expected = DataFrame({"value": [1.0, 1.0, 1.0]})
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Grouping a MultiIndexed frame by a single level works with the numba engine."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])

    result = frame.groupby("A").agg(
        numba_func,
        engine="numba",
        engine_kwargs={"nopython": nopython, "nogil": nogil, "parallel": parallel},
    )

    expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Grouping by two index levels with a numba UDF raises NotImplementedError."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    grouped = frame.groupby(["A", "B"])
    flags = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        grouped.agg(numba_func, engine="numba", engine_kwargs=flags)
|  | ||||
|  | ||||
def test_multilabel_numba_vs_cython(numba_supported_reductions):
    """Built-in reductions over two keys agree between numba and cython."""
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    engines = ("numba", "cython")

    frame = DataFrame(
        {
            "A": "foo bar foo bar foo bar foo foo".split(),
            "B": "one one two three two two one three".split(),
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = frame.groupby(["A", "B"])

    # Via the generic ``agg`` entry point
    via_agg = {
        engine: gb.agg(reduction, engine=engine, **kwargs) for engine in engines
    }
    tm.assert_frame_equal(via_agg["numba"], via_agg["cython"])

    # Test that calling the aggregation directly also works
    method = getattr(gb, reduction)
    via_method = {engine: method(engine=engine, **kwargs) for engine in engines}
    tm.assert_frame_equal(via_method["numba"], via_method["cython"])
|  | ||||
|  | ||||
def test_multilabel_udf_numba_vs_cython():
    """A ``min`` UDF over two keys gives the same frame under both engines."""
    pytest.importorskip("numba")
    frame = DataFrame(
        {
            "A": "foo bar foo bar foo bar foo foo".split(),
            "B": "one one two three two two one three".split(),
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = frame.groupby(["A", "B"])

    # The numba UDF takes (values, index); the cython-path UDF takes one argument.
    numba_result = gb.agg(lambda values, index: values.min(), engine="numba")
    cython_result = gb.agg(lambda x: x.min(), engine="cython")
    tm.assert_frame_equal(numba_result, cython_result)
| @ -0,0 +1,676 @@ | ||||
| """ | ||||
| test all other .agg behavior | ||||
| """ | ||||
|  | ||||
| import datetime as dt | ||||
| from functools import partial | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.errors import SpecificationError | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     PeriodIndex, | ||||
|     Series, | ||||
|     date_range, | ||||
|     period_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| from pandas.io.formats.printing import pprint_thing | ||||
|  | ||||
|  | ||||
def test_agg_partial_failure_raises():
    """A UDF that raises TypeError propagates for both list and bare specs."""
    # GH#43741
    frame = DataFrame(
        {
            "data1": np.random.default_rng(2).standard_normal(5),
            "data2": np.random.default_rng(2).standard_normal(5),
            "key1": list("aabba"),
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    by_key1 = frame.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    # Same expectation whether the function is wrapped in a list or passed bare.
    for spec in ([peak_to_peak], peak_to_peak):
        with pytest.raises(TypeError, match="unsupported operand type"):
            by_key1.agg(spec)
|  | ||||
|  | ||||
def test_agg_datetimes_mixed():
    """Grouping by string dates and by ``datetime.date`` yields as many groups.

    Two frames are built from the same records: one keeps the dates as
    strings, the other converts them to ``datetime.date`` objects (``None``
    stays ``None``). Both are grouped on ``"date"`` and summed, and the
    number of resulting groups must match.
    """
    records = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    def _to_frame(rows):
        # One column per position in each record.
        return DataFrame(
            {
                "key": [row[0] for row in rows],
                "date": [row[1] for row in rows],
                "value": [row[2] for row in rows],
            }
        )

    df1 = _to_frame(records)
    df2 = _to_frame(
        [
            [
                key,
                dt.datetime.strptime(day, "%Y-%m-%d").date() if day else None,
                value,
            ]
            for key, day, value in records
        ]
    )

    # Each frame derives its weights from its own "value" column; the
    # original computed df2's weights from df1 by mistake.
    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate("sum")

    df2["weights"] = df2["value"] / df2["value"].sum()
    gb2 = df2.groupby("date").aggregate("sum")

    assert len(gb1) == len(gb2)
|  | ||||
|  | ||||
def test_agg_period_index():
    """Summing by a PeriodIndex level keeps a PeriodIndex; groups can be iterated."""
    monthly = period_range("2012-1-1", freq="M", periods=3)
    values = np.random.default_rng(2).standard_normal((3, 2))
    summed = DataFrame(values, index=monthly).groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    columns = {
        name: Series(np.random.default_rng(2).random(len(index)), index=index)
        for name in ("s1", "s2")
    }
    frame = DataFrame.from_dict(columns)
    by_month = frame.groupby(frame.index.month)
    # Smoke check: materialising the groups must not raise.
    list(by_month)
|  | ||||
|  | ||||
def test_agg_dict_parameter_cast_result_dtypes():
    """``first``/``last``/size/count agree across call styles on a datetime column."""
    # GH 12821

    df = DataFrame(
        {
            "class": list("AABBCCDD"),
            "time": date_range("1/1/2011", periods=8, freq="h"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # For each reduction, the rows of ``df`` that form the expected result.
    for how, rows in (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7])):
        exp = df.loc[rows].set_index("class")
        grouped = df.groupby("class")
        tm.assert_frame_equal(getattr(grouped, how)(), exp)
        tm.assert_frame_equal(grouped.agg(how), exp)
        tm.assert_frame_equal(grouped.agg({"time": how}), exp)
        tm.assert_series_equal(getattr(grouped.time, how)(), exp["time"])
        tm.assert_series_equal(grouped.time.agg(how), exp["time"])

    # count
    class_index = Index(list("ABCD"), name="class")
    group_sizes = Series([2, 2, 2, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.agg(len), group_sizes)
    tm.assert_series_equal(grouped.time.size(), group_sizes)

    non_null = Series([0, 1, 1, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.count(), non_null)
|  | ||||
|  | ||||
def test_agg_cast_results_dtypes():
    """``agg(len)`` on a datetime column equals ``count()`` for the same groups."""
    # similar to GH12821
    # xref #11444
    month_starts = [dt.datetime(2015, month, 1) for month in range(1, 13)]
    labels = list("aaabbbbbbccd")
    frame = DataFrame({"X": labels, "Y": month_starts})

    via_len = frame.groupby("X")["Y"].agg(len)
    via_count = frame.groupby("X")["Y"].count()
    tm.assert_series_equal(via_len, via_count)
|  | ||||
|  | ||||
def test_aggregate_float64_no_int64():
    """Group means of integer columns match float expectations."""
    # see gh-11199
    frame = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}
    )
    group_means = [1, 2.5, 4, 5]

    # Columns "a" and "c" hold the same data, so they share expected means.
    for selected in (["a"], ["a", "c"]):
        expected = DataFrame(
            {col: group_means for col in selected}, index=[1, 2, 4, 5]
        )
        expected.index.name = "b"

        result = frame.groupby("b")[selected].mean()
        tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_aggregate_api_consistency():
    """Check that list, dict and dict-of-list ``agg`` specs agree.

    Each form is compared against a frame assembled by hand from the
    per-column ``mean``/``sum`` results. ``check_like=True`` is used so the
    comparison does not depend on row/column order. A dict naming columns
    that are not in the selection must raise ``KeyError``.
    """
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    # list of functions on a single column
    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of functions on every column
    result = grouped.agg(["sum", "mean"])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of functions on an explicit column selection
    result = grouped[["D", "C"]].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping column -> single function
    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping column -> list of functions
    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    # This comparison was missing: result/expected were built but never checked.
    tm.assert_frame_equal(result, expected, check_like=True)

    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})
|  | ||||
|  | ||||
def test_agg_dict_renaming_deprecation():
    """Nested-renamer dicts and unknown column keys are rejected by ``agg``."""
    # 15931
    frame = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
    nested_msg = r"nested renamer is not supported"

    # dict-of-dict on a DataFrameGroupBy
    nested_spec = {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
    with pytest.raises(SpecificationError, match=nested_msg):
        frame.groupby("A").agg(nested_spec)

    # key that is not one of the selected columns
    with pytest.raises(KeyError, match=r"Column\(s\) \['ma'\] do not exist"):
        frame.groupby("A")[["B", "C"]].agg({"ma": "max"})

    # dict passed to a SeriesGroupBy
    with pytest.raises(SpecificationError, match=nested_msg):
        frame.groupby("A").B.agg({"foo": "count"})
|  | ||||
|  | ||||
def test_agg_compat():
    """Dict specs passed to a SeriesGroupBy raise SpecificationError."""
    # GH 12334
    frame = DataFrame(
        {
            "A": "foo bar foo bar foo bar foo foo".split(),
            "B": "one one two two two two one two".split(),
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )
    g = frame.groupby(["A", "B"])

    rejected_specs = (
        {"C": ["sum", "std"]},
        {"C": "sum", "D": "std"},
    )
    for spec in rejected_specs:
        with pytest.raises(
            SpecificationError, match=r"nested renamer is not supported"
        ):
            g["D"].agg(spec)
|  | ||||
|  | ||||
def test_agg_nested_dicts():
    """Nested dicts and renaming dicts raise SpecificationError in ``agg``."""
    # API change for disallowing these types of nested dicts
    frame = DataFrame(
        {
            "A": "foo bar foo bar foo bar foo foo".split(),
            "B": "one one two two two two one two".split(),
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )
    g = frame.groupby(["A", "B"])
    msg = r"nested renamer is not supported"

    # (bound method to call, spec that must be rejected)
    frame_level_cases = [
        (
            g.aggregate,
            {"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}},
        ),
        (
            g.agg,
            {"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}},
        ),
    ]
    for method, spec in frame_level_cases:
        with pytest.raises(SpecificationError, match=msg):
            method(spec)

    # same name as the original column
    # GH9052
    series_level_specs = [
        {"result1": np.sum, "result2": np.mean},
        {"D": np.sum, "result2": np.mean},
    ]
    for spec in series_level_specs:
        with pytest.raises(SpecificationError, match=msg):
            g["D"].agg(spec)
|  | ||||
|  | ||||
def test_agg_item_by_item_raise_typeerror():
    """A TypeError raised inside the aggregation function reaches the caller."""
    frame = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))

    def print_then_raise(chunk):
        # Print whatever piece was handed in, then fail deliberately.
        pprint_thing("----------------------------------------")
        pprint_thing(chunk.to_string())
        raise TypeError("test")

    grouped = frame.groupby(0)
    with pytest.raises(TypeError, match="test"):
        grouped.agg(print_then_raise)
|  | ||||
|  | ||||
def test_series_agg_multikey():
    """``agg("sum")`` equals ``.sum()`` when grouping a Series by two callables."""
    daily = date_range("2020-01-01", periods=10)
    ts = Series(np.arange(10, dtype=np.float64), index=daily)

    # Two key functions applied to the index labels: year and month.
    key_funcs = [lambda x: x.year, lambda x: x.month]
    grouped = ts.groupby(key_funcs)

    via_agg = grouped.agg("sum")
    via_method = grouped.sum()
    tm.assert_series_equal(via_agg, via_method)
|  | ||||
|  | ||||
def test_series_agg_multi_pure_python():
    """A UDF returning a constant string matches the equivalent lambda."""
    data = DataFrame(
        {
            "A": ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3,
            "B": "one one one two one one one two two two one".split(),
            "C": (
                "dull dull shiny dull dull shiny shiny dull shiny shiny shiny"
            ).split(),
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    def bad(x):
        # For ndarray-backed input, require a non-empty ``.base`` before
        # returning the constant.
        values = x.values
        if isinstance(values, np.ndarray):
            assert len(values.base) > 0
        return "foo"

    keys = ["A", "B"]
    result = data.groupby(keys).agg(bad)
    expected = data.groupby(keys).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_agg_consistency():
    """``agg([func])`` and ``agg(func)`` give the same values for a UDF."""
    # agg with ([]) and () not consistent
    # GH 6715
    def first_percentile(a):
        return np.percentile(a.dropna(), q=1)

    feb_10 = dt.date(2013, 2, 10)
    feb_11 = dt.date(2013, 2, 11)
    frame = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [feb_10, feb_10, feb_11, feb_11],
        }
    )
    g = frame.groupby("date")

    # The list form yields two-level columns; keep only the first level so
    # the frame can be compared with the bare-function form.
    expected = g.agg([first_percentile])
    expected.columns = expected.columns.levels[0]

    result = g.agg(first_percentile)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_agg_callables(): | ||||
|     # GH 7929 | ||||
|     df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64) | ||||
|  | ||||
|     class fn_class: | ||||
|         def __call__(self, x): | ||||
|             return sum(x) | ||||
|  | ||||
|     equiv_callables = [ | ||||
|         sum, | ||||
|         np.sum, | ||||
|         lambda x: sum(x), | ||||
|         lambda x: x.sum(), | ||||
|         partial(sum), | ||||
|         fn_class(), | ||||
|     ] | ||||
|  | ||||
|     expected = df.groupby("foo").agg("sum") | ||||
|     for ecall in equiv_callables: | ||||
|         warn = FutureWarning if ecall is sum or ecall is np.sum else None | ||||
|         msg = "using DataFrameGroupBy.sum" | ||||
|         with tm.assert_produces_warning(warn, match=msg): | ||||
|             result = df.groupby("foo").agg(ecall) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_agg_over_numpy_arrays():
    """Summing a column of ndarrays adds the arrays within each group."""
    # GH 3788
    rows = [
        [1, np.array([10, 20, 30])],
        [1, np.array([40, 50, 60])],
        [2, np.array([20, 30, 40])],
    ]
    gb = DataFrame(rows, columns=["category", "arraydata"]).groupby("category")

    expected = DataFrame(
        [[np.array([50, 70, 90])], [np.array([20, 30, 40])]],
        index=Index([1, 2], name="category"),
        columns=["arraydata"],
    )

    # The direct method and the string alias must give the same frame.
    computed = (gb.sum(numeric_only=False), gb.agg("sum", numeric_only=False))
    for frame in computed:
        tm.assert_frame_equal(frame, expected)

    # FIXME: the original version of this test called `gb.agg(sum)`
    #  and that raises TypeError if `numeric_only=False` is passed
|  | ||||
|  | ||||
@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
    """UDF results on tz-aware/period data, dtype-preserving or not."""
    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
    #  with function that is not dtype-preserving
    values = date_range("2012-01-01", periods=4, tz="UTC")
    if as_period:
        values = values.tz_localize(None).to_period("D")

    gb = DataFrame({"a": [0, 0, 1, 1], "b": values}).groupby("a")

    def _named(data):
        # Expected Series: name "b", default index labelled "a".
        ser = Series(data, name="b")
        ser.index.name = "a"
        return ser

    # Case that _does_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0])
    tm.assert_series_equal(result, _named(values[::2]))

    # Cases that do _not_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0].year)
    tm.assert_series_equal(result, _named([2012, 2012]))

    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
    one_day = pd.offsets.Day(1) if as_period else pd.Timedelta(days=1)
    tm.assert_series_equal(result, _named([one_day, one_day]))
|  | ||||
|  | ||||
| def test_agg_timezone_round_trip(): | ||||
|     # GH 15426 | ||||
|     ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") | ||||
|     df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]}) | ||||
|  | ||||
|     result1 = df.groupby("a")["b"].agg("min").iloc[0] | ||||
|     result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0] | ||||
|     result3 = df.groupby("a")["b"].min().iloc[0] | ||||
|  | ||||
|     assert result1 == ts | ||||
|     assert result2 == ts | ||||
|     assert result3 == ts | ||||
|  | ||||
|     dates = [ | ||||
|         pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5) | ||||
|     ] | ||||
|     df = DataFrame({"A": ["a", "b"] * 2, "B": dates}) | ||||
|     grouped = df.groupby("A") | ||||
|  | ||||
|     ts = df["B"].iloc[0] | ||||
|     assert ts == grouped.nth(0)["B"].iloc[0] | ||||
|     assert ts == grouped.head(1)["B"].iloc[0] | ||||
|     assert ts == grouped.first()["B"].iloc[0] | ||||
|  | ||||
|     # GH#27110 applying iloc should return a DataFrame | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] | ||||
|  | ||||
|     ts = df["B"].iloc[2] | ||||
|     assert ts == grouped.last()["B"].iloc[0] | ||||
|  | ||||
|     # GH#27110 applying iloc should return a DataFrame | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] | ||||
|  | ||||
|  | ||||
def test_sum_uint64_overflow():
    """Object-dtype sums just above the int64 maximum keep their exact values."""
    # see gh-14758
    # Convert to uint64 and don't overflow
    int64_max = 2**63 - 1  # 9223372036854775807, as a plain Python int
    frame = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) + int64_max

    keys = Index(
        [int64_max + 1, int64_max + 3, int64_max + 5], dtype=np.uint64
    )
    expected = DataFrame(
        {1: [int64_max + 2, int64_max + 4, int64_max + 6]},
        index=keys,
        dtype=object,
    )
    expected.index.name = 0

    result = frame.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    numeric_only_result = frame.groupby(0).sum(numeric_only=True)
    tm.assert_frame_equal(numeric_only_result, expected[[]])
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        # ``container`` builds the expected cell values for each aggregator.
        (
            aggregator,
            DataFrame(
                {"C": {(1, 1): container([1, 1, 1]), (3, 4): container([3, 4, 4])}}
            ),
        )
        for aggregator, container in [
            (tuple, tuple),
            (list, list),
            (lambda x: tuple(x), tuple),
            (lambda x: list(x), list),
        ]
    ],
)
def test_agg_structs_dataframe(structure, expected):
    """Aggregating with ``tuple``/``list`` collects each group's values."""
    frame = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )
    keys = ["A", "B"]

    result = frame.groupby(keys).aggregate(structure)
    expected.index.names = keys
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        # ``container`` builds the expected element values for each aggregator.
        (
            aggregator,
            Series(
                [container([1, 1, 1]), container([3, 4, 4])], index=[1, 3], name="C"
            ),
        )
        for aggregator, container in [
            (tuple, tuple),
            (list, list),
            (lambda x: tuple(x), tuple),
            (lambda x: list(x), list),
        ]
    ],
)
def test_agg_structs_series(structure, expected):
    """SeriesGroupBy aggregation with ``tuple``/``list`` collects group values."""
    # Issue #18079
    frame = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = frame.groupby("A")["C"].aggregate(structure)
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_agg_category_nansum(observed):
    """``agg(np.nansum)`` over a categorical key, with and without ``observed``."""
    categories = ["a", "b", "c"]
    keys = pd.Categorical(["a", "a", "b"], categories=categories)
    frame = DataFrame({"A": keys, "B": [1, 2, 3]})

    with tm.assert_produces_warning(FutureWarning, match="using SeriesGroupBy.sum"):
        result = frame.groupby("A", observed=observed).B.agg(np.nansum)

    full_index = pd.CategoricalIndex(
        ["a", "b", "c"], categories=categories, name="A"
    )
    expected = Series([3, 3, 0], index=full_index, name="B")
    if observed:
        # Drop the zero-sum entry (category "c", which has no rows).
        expected = expected[expected != 0]
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_agg_list_like_func():
    """A dict-spec lambda returning a list yields one list per group."""
    # GH 18473
    frame = DataFrame({"A": list("012"), "B": list("012")})
    grouped = frame.groupby("A", as_index=False, sort=False)

    result = grouped.agg({"B": lambda x: list(x)})

    # Every key occurs once, so each group's list holds a single string.
    expected = DataFrame({"A": list("012"), "B": [[char] for char in "012"]})
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_agg_lambda_with_timezone():
    """A ``head(1)`` lambda on a tz-aware column returns the first timestamp."""
    # GH 23683
    first_day = pd.Timestamp("2018-01-01", tz="UTC")
    second_day = pd.Timestamp("2018-01-02", tz="UTC")
    frame = DataFrame({"tag": [1, 1], "date": [first_day, second_day]})

    result = frame.groupby("tag").agg({"date": lambda e: e.head(1)})

    expected = DataFrame(
        [first_day], index=Index([1], name="tag"), columns=["date"]
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "err_cls",
    [
        NotImplementedError,
        RuntimeError,
        KeyError,
        IndexError,
        OSError,
        ValueError,
        ArithmeticError,
        AttributeError,
    ],
)
def test_groupby_agg_err_catching(err_cls):
    """Aggregate a DecimalArray column with a UDF that raises on empty input."""
    # make sure we suppress anything other than TypeError or AssertionError
    #  in _python_agg_general

    # Use a non-standard EA to make sure we don't go down ndarray paths
    from pandas.tests.extension.decimal.array import (
        DecimalArray,
        make_data,
        to_decimal,
    )

    data = make_data()[:5]
    frame = DataFrame(
        {
            "id1": [0, 0, 0, 1, 1],
            "id2": [0, 1, 0, 1, 1],
            "decimals": DecimalArray(data),
        }
    )

    def weird_func(x):
        # Raises the parametrized error only for an empty input; otherwise
        # returns the first element.
        if len(x) == 0:
            raise err_cls
        return x.iloc[0]

    result = frame["decimals"].groupby(frame["id1"]).agg(weird_func)

    # First element of each id1 group: positions 0 and 3.
    expected = Series(to_decimal([data[0], data[3]]))
    tm.assert_series_equal(result, expected, check_names=False)
		Reference in New Issue
	
	Block a user