| @@ -0,0 +1,25 @@ | ||||
| def get_groupby_method_args(name, obj): | ||||
|     """ | ||||
|     Get required arguments for a groupby method. | ||||
|  | ||||
|     When parametrizing a test over groupby methods (e.g. "sum", "mean", "fillna"), | ||||
|     some methods require positional arguments. | ||||
|  | ||||
|     Parameters | ||||
|     ---------- | ||||
|     name : str | ||||
|         Name of the method. | ||||
|     obj : Series or DataFrame | ||||
|         pandas object that is being grouped. | ||||
|  | ||||
|     Returns | ||||
|     ------- | ||||
|     tuple | ||||
|         Required positional arguments for the method. | ||||
|     """ | ||||
|     if name in ("nth", "fillna", "take"): | ||||
|         return (0,) | ||||
|     if name == "quantile": | ||||
|         return (0.5,) | ||||
|     if name == "corrwith": | ||||
|         return (obj,) | ||||
|     return () | ||||
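|  | ||||
|  | ||||
| # A minimal usage sketch (the DataFrame and grouping key below are | ||||
| # illustrative, not part of the helper): | ||||
| # | ||||
| #     args = get_groupby_method_args("quantile", df)  # -> (0.5,) | ||||
| #     getattr(df.groupby("key"), "quantile")(*args) | ||||
| #     get_groupby_method_args("sum", df)              # -> () | ||||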
										
											
(File diff suppressed because it is too large)

| @@ -0,0 +1,437 @@ | ||||
| """ | ||||
| test cython .agg behavior | ||||
| """ | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.core.dtypes.common import ( | ||||
|     is_float_dtype, | ||||
|     is_integer_dtype, | ||||
| ) | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     NaT, | ||||
|     Series, | ||||
|     Timedelta, | ||||
|     Timestamp, | ||||
|     bdate_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| import pandas.core.common as com | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "op_name", | ||||
|     [ | ||||
|         "count", | ||||
|         "sum", | ||||
|         "std", | ||||
|         "var", | ||||
|         "sem", | ||||
|         "mean", | ||||
|         pytest.param( | ||||
|             "median", | ||||
|             # ignore mean of empty slice | ||||
|             # and all-NaN | ||||
|             marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")], | ||||
|         ), | ||||
|         "prod", | ||||
|         "min", | ||||
|         "max", | ||||
|     ], | ||||
| ) | ||||
| def test_cythonized_aggers(op_name): | ||||
|     data = { | ||||
|         "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan], | ||||
|         "B": ["A", "B"] * 6, | ||||
|         "C": np.random.default_rng(2).standard_normal(12), | ||||
|     } | ||||
|     df = DataFrame(data) | ||||
|     df.loc[2:10:2, "C"] = np.nan | ||||
|  | ||||
|     op = lambda x: getattr(x, op_name)() | ||||
|  | ||||
|     # single column | ||||
|     grouped = df.drop(["B"], axis=1).groupby("A") | ||||
|     exp = {cat: op(group["C"]) for cat, group in grouped} | ||||
|     exp = DataFrame({"C": exp}) | ||||
|     exp.index.name = "A" | ||||
|     result = op(grouped) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|     # multiple columns | ||||
|     grouped = df.groupby(["A", "B"]) | ||||
|     expd = {} | ||||
|     for (cat1, cat2), group in grouped: | ||||
|         expd.setdefault(cat1, {})[cat2] = op(group["C"]) | ||||
|     exp = DataFrame(expd).T.stack(future_stack=True) | ||||
|     exp.index.names = ["A", "B"] | ||||
|     exp.name = "C" | ||||
|  | ||||
|     result = op(grouped)["C"] | ||||
|     if op_name in ["sum", "prod"]: | ||||
|         tm.assert_series_equal(result, exp) | ||||
|  | ||||
|  | ||||
| def test_cython_agg_boolean(): | ||||
|     frame = DataFrame( | ||||
|         { | ||||
|             "a": np.random.default_rng(2).integers(0, 5, 50), | ||||
|             "b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"), | ||||
|         } | ||||
|     ) | ||||
|     result = frame.groupby("a")["b"].mean() | ||||
|     msg = "using SeriesGroupBy.mean" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         # GH#53425 | ||||
|         expected = frame.groupby("a")["b"].agg(np.mean) | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_cython_agg_nothing_to_agg(): | ||||
|     frame = DataFrame( | ||||
|         {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} | ||||
|     ) | ||||
|  | ||||
|     msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         frame.groupby("a")["b"].mean(numeric_only=True) | ||||
|  | ||||
|     frame = DataFrame( | ||||
|         {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} | ||||
|     ) | ||||
|  | ||||
|     result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) | ||||
|     expected = DataFrame( | ||||
|         [], | ||||
|         index=frame["a"].sort_values().drop_duplicates(), | ||||
|         columns=Index([], dtype="str"), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_cython_agg_nothing_to_agg_with_dates(): | ||||
|     frame = DataFrame( | ||||
|         { | ||||
|             "a": np.random.default_rng(2).integers(0, 5, 50), | ||||
|             "b": ["foo", "bar"] * 25, | ||||
|             "dates": pd.date_range("now", periods=50, freq="min"), | ||||
|         } | ||||
|     ) | ||||
|     msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         frame.groupby("b").dates.mean(numeric_only=True) | ||||
|  | ||||
|  | ||||
| def test_cython_agg_frame_columns(): | ||||
|     # #2113 | ||||
|     df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]}) | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(level=0, axis="columns").mean() | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(level=0, axis="columns").mean() | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(level=0, axis="columns").mean() | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(level=0, axis="columns").mean() | ||||
|  | ||||
|  | ||||
| def test_cython_agg_return_dict(): | ||||
|     # GH 16741 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "three", "two", "two", "one", "three"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8), | ||||
|             "D": np.random.default_rng(2).standard_normal(8), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict()) | ||||
|     expected = Series( | ||||
|         [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}], | ||||
|         index=Index(["bar", "foo"], name="A"), | ||||
|         name="B", | ||||
|     ) | ||||
|     tm.assert_series_equal(ts, expected) | ||||
|  | ||||
|  | ||||
| def test_cython_fail_agg(): | ||||
|     dr = bdate_range("1/1/2000", periods=50) | ||||
|     ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) | ||||
|  | ||||
|     grouped = ts.groupby(lambda x: x.month) | ||||
|     summed = grouped.sum() | ||||
|     msg = "using SeriesGroupBy.sum" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         # GH#53425 | ||||
|         expected = grouped.agg(np.sum).astype(object) | ||||
|     tm.assert_series_equal(summed, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "op, targop", | ||||
|     [ | ||||
|         ("mean", np.mean), | ||||
|         ("median", np.median), | ||||
|         ("var", np.var), | ||||
|         ("sum", np.sum), | ||||
|         ("prod", np.prod), | ||||
|         ("min", np.min), | ||||
|         ("max", np.max), | ||||
|         ("first", lambda x: x.iloc[0]), | ||||
|         ("last", lambda x: x.iloc[-1]), | ||||
|     ], | ||||
| ) | ||||
| def test__cython_agg_general(op, targop): | ||||
|     df = DataFrame(np.random.default_rng(2).standard_normal(1000)) | ||||
|     labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) | ||||
|  | ||||
|     result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True) | ||||
|     warn = FutureWarning if targop in com._cython_table else None | ||||
|     msg = f"using DataFrameGroupBy.{op}" | ||||
|     with tm.assert_produces_warning(warn, match=msg): | ||||
|         # GH#53425 | ||||
|         expected = df.groupby(labels).agg(targop) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "op, targop", | ||||
|     [ | ||||
|         ("mean", np.mean), | ||||
|         ("median", lambda x: np.median(x) if len(x) > 0 else np.nan), | ||||
|         ("var", lambda x: np.var(x, ddof=1)), | ||||
|         ("min", np.min), | ||||
|         ("max", np.max), | ||||
|     ], | ||||
| ) | ||||
| def test_cython_agg_empty_buckets(op, targop, observed): | ||||
|     df = DataFrame([11, 12, 13]) | ||||
|     grps = range(0, 55, 5) | ||||
|  | ||||
|     # Call _cython_agg_general directly rather than through the user API, | ||||
|     # which would set a different value for min_count. | ||||
|     g = df.groupby(pd.cut(df[0], grps), observed=observed) | ||||
|     result = g._cython_agg_general(op, alt=None, numeric_only=True) | ||||
|  | ||||
|     g = df.groupby(pd.cut(df[0], grps), observed=observed) | ||||
|     expected = g.agg(lambda x: targop(x)) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
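|  | ||||
|  | ||||
| # For context, a sketch of how min_count changes the public API's result | ||||
| # for empty buckets (calls are illustrative): | ||||
| # | ||||
| #     g = df.groupby(pd.cut(df[0], grps), observed=False) | ||||
| #     g.sum(min_count=0)  # empty buckets yield 0 | ||||
| #     g.sum(min_count=1)  # empty buckets yield NaN | ||||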
|  | ||||
|  | ||||
| def test_cython_agg_empty_buckets_nanops(observed): | ||||
|     # GH-18869 can't call nanops on empty groups, so hardcode expected | ||||
|     # for these | ||||
|     df = DataFrame([11, 12, 13], columns=["a"]) | ||||
|     grps = np.arange(0, 25, 5, dtype=int) | ||||
|     # add / sum | ||||
|     result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( | ||||
|         "sum", alt=None, numeric_only=True | ||||
|     ) | ||||
|     intervals = pd.interval_range(0, 20, freq=5) | ||||
|     expected = DataFrame( | ||||
|         {"a": [0, 0, 36, 0]}, | ||||
|         index=pd.CategoricalIndex(intervals, name="a", ordered=True), | ||||
|     ) | ||||
|     if observed: | ||||
|         expected = expected[expected.a != 0] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # prod | ||||
|     result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( | ||||
|         "prod", alt=None, numeric_only=True | ||||
|     ) | ||||
|     expected = DataFrame( | ||||
|         {"a": [1, 1, 1716, 1]}, | ||||
|         index=pd.CategoricalIndex(intervals, name="a", ordered=True), | ||||
|     ) | ||||
|     if observed: | ||||
|         expected = expected[expected.a != 1] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
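|  | ||||
|  | ||||
| # The nanops limitation above can be seen directly (a sketch; np.nansum of | ||||
| # an empty array is 0.0, while np.nanmean warns and returns nan, hence the | ||||
| # hardcoded expected frames): | ||||
| # | ||||
| #     np.nansum(np.array([]))   # 0.0 | ||||
| #     np.nanmean(np.array([]))  # RuntimeWarning: Mean of empty slice -> nan | ||||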
|  | ||||
|  | ||||
| @pytest.mark.parametrize("op", ["first", "last", "max", "min"]) | ||||
| @pytest.mark.parametrize( | ||||
|     "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")] | ||||
| ) | ||||
| def test_cython_with_timestamp_and_nat(op, data): | ||||
|     # https://github.com/pandas-dev/pandas/issues/19526 | ||||
|     df = DataFrame({"a": [0, 1], "b": [data, NaT]}) | ||||
|     index = Index([0, 1], name="a") | ||||
|  | ||||
|     # We will group by a and test the cython aggregations | ||||
|     expected = DataFrame({"b": [data, NaT]}, index=index) | ||||
|  | ||||
|     result = df.groupby("a").aggregate(op) | ||||
|     tm.assert_frame_equal(expected, result) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "agg", | ||||
|     [ | ||||
|         "min", | ||||
|         "max", | ||||
|         "count", | ||||
|         "sum", | ||||
|         "prod", | ||||
|         "var", | ||||
|         "mean", | ||||
|         "median", | ||||
|         "ohlc", | ||||
|         "cumprod", | ||||
|         "cumsum", | ||||
|         "shift", | ||||
|         "any", | ||||
|         "all", | ||||
|         "quantile", | ||||
|         "first", | ||||
|         "last", | ||||
|         "rank", | ||||
|         "cummin", | ||||
|         "cummax", | ||||
|     ], | ||||
| ) | ||||
| def test_read_only_buffer_source_agg(agg): | ||||
|     # https://github.com/pandas-dev/pandas/issues/36014 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0], | ||||
|             "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], | ||||
|         } | ||||
|     ) | ||||
|     df._mgr.arrays[0].flags.writeable = False | ||||
|  | ||||
|     result = df.groupby(["species"]).agg({"sepal_length": agg}) | ||||
|     expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) | ||||
|  | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "op_name", | ||||
|     [ | ||||
|         "count", | ||||
|         "sum", | ||||
|         "std", | ||||
|         "var", | ||||
|         "sem", | ||||
|         "mean", | ||||
|         "median", | ||||
|         "prod", | ||||
|         "min", | ||||
|         "max", | ||||
|     ], | ||||
| ) | ||||
| def test_cython_agg_nullable_int(op_name): | ||||
|     # ensure that the cython-based aggregations don't fail for nullable dtype | ||||
|     # (eg https://github.com/pandas-dev/pandas/issues/37415) | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["A", "B"] * 5, | ||||
|             "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"), | ||||
|         } | ||||
|     ) | ||||
|     result = getattr(df.groupby("A")["B"], op_name)() | ||||
|     df2 = df.assign(B=df["B"].astype("float64")) | ||||
|     expected = getattr(df2.groupby("A")["B"], op_name)() | ||||
|     if op_name in ("mean", "median"): | ||||
|         convert_integer = False | ||||
|     else: | ||||
|         convert_integer = True | ||||
|     expected = expected.convert_dtypes(convert_integer=convert_integer) | ||||
|     tm.assert_series_equal(result, expected) | ||||
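|  | ||||
|  | ||||
| # A sketch of why convert_integer is toggled above: convert_dtypes() casts | ||||
| # an integral-valued float result back to Int64 unless told not to. | ||||
| # | ||||
| #     pd.Series([3.0, 7.0]).convert_dtypes()                       # Int64 | ||||
| #     pd.Series([3.5, 7.0]).convert_dtypes()                       # Float64 | ||||
| #     pd.Series([3.0, 7.0]).convert_dtypes(convert_integer=False)  # Float64 | ||||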
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) | ||||
| def test_count_masked_returns_masked_dtype(dtype): | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": [1, 1], | ||||
|             "B": pd.array([1, pd.NA], dtype=dtype), | ||||
|             "C": pd.array([1, 1], dtype=dtype), | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("A").count() | ||||
|     expected = DataFrame( | ||||
|         [[1, 2]], index=Index([1], name="A"), columns=["B", "C"], dtype="Int64" | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("with_na", [True, False]) | ||||
| @pytest.mark.parametrize( | ||||
|     "op_name, action", | ||||
|     [ | ||||
|         # ("count", "always_int"), | ||||
|         ("sum", "large_int"), | ||||
|         # ("std", "always_float"), | ||||
|         ("var", "always_float"), | ||||
|         # ("sem", "always_float"), | ||||
|         ("mean", "always_float"), | ||||
|         ("median", "always_float"), | ||||
|         ("prod", "large_int"), | ||||
|         ("min", "preserve"), | ||||
|         ("max", "preserve"), | ||||
|         ("first", "preserve"), | ||||
|         ("last", "preserve"), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize( | ||||
|     "data", | ||||
|     [ | ||||
|         pd.array([1, 2, 3, 4], dtype="Int64"), | ||||
|         pd.array([1, 2, 3, 4], dtype="Int8"), | ||||
|         pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"), | ||||
|         pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"), | ||||
|         pd.array([True, True, False, False], dtype="boolean"), | ||||
|     ], | ||||
| ) | ||||
| def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): | ||||
|     if with_na: | ||||
|         data[3] = pd.NA | ||||
|  | ||||
|     df = DataFrame({"key": ["a", "a", "b", "b"], "col": data}) | ||||
|     grouped = df.groupby("key") | ||||
|  | ||||
|     if action == "always_int": | ||||
|         # always Int64 | ||||
|         expected_dtype = pd.Int64Dtype() | ||||
|     elif action == "large_int": | ||||
|         # for any int/bool use Int64, for float preserve dtype | ||||
|         if is_float_dtype(data.dtype): | ||||
|             expected_dtype = data.dtype | ||||
|         elif is_integer_dtype(data.dtype): | ||||
|             # match the numpy dtype we'd get with the non-nullable analogue | ||||
|             expected_dtype = data.dtype | ||||
|         else: | ||||
|             expected_dtype = pd.Int64Dtype() | ||||
|     elif action == "always_float": | ||||
|         # for any int/bool use Float64, for float preserve dtype | ||||
|         if is_float_dtype(data.dtype): | ||||
|             expected_dtype = data.dtype | ||||
|         else: | ||||
|             expected_dtype = pd.Float64Dtype() | ||||
|     elif action == "preserve": | ||||
|         expected_dtype = data.dtype | ||||
|  | ||||
|     result = getattr(grouped, op_name)() | ||||
|     assert result["col"].dtype == expected_dtype | ||||
|  | ||||
|     result = grouped.aggregate(op_name) | ||||
|     assert result["col"].dtype == expected_dtype | ||||
|  | ||||
|     result = getattr(grouped["col"], op_name)() | ||||
|     assert result.dtype == expected_dtype | ||||
|  | ||||
|     result = grouped["col"].aggregate(op_name) | ||||
|     assert result.dtype == expected_dtype | ||||

| @@ -0,0 +1,402 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.compat import is_platform_arm | ||||
| from pandas.errors import NumbaUtilError | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     NamedAgg, | ||||
|     Series, | ||||
|     option_context, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.util.version import Version | ||||
|  | ||||
| pytestmark = [pytest.mark.single_cpu] | ||||
|  | ||||
| numba = pytest.importorskip("numba") | ||||
| pytestmark.append( | ||||
|     pytest.mark.skipif( | ||||
|         Version(numba.__version__) == Version("0.61") and is_platform_arm(), | ||||
|         reason=f"Segfaults on ARM platforms with numba {numba.__version__}", | ||||
|     ) | ||||
| ) | ||||
|  | ||||
|  | ||||
| def test_correct_function_signature(): | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def incorrect_function(x): | ||||
|         return sum(x) * 2.7 | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, | ||||
|         columns=["key", "data"], | ||||
|     ) | ||||
|     with pytest.raises(NumbaUtilError, match="The first 2"): | ||||
|         data.groupby("key").agg(incorrect_function, engine="numba") | ||||
|  | ||||
|     with pytest.raises(NumbaUtilError, match="The first 2"): | ||||
|         data.groupby("key")["data"].agg(incorrect_function, engine="numba") | ||||
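|  | ||||
|  | ||||
| # For contrast, a UDF for engine="numba" must accept the group's values and | ||||
| # index as its first two arguments (a sketch): | ||||
| # | ||||
| #     def correct_function(values, index): | ||||
| #         return values.sum() * 2.7 | ||||
| # | ||||
| #     data.groupby("key").agg(correct_function, engine="numba") | ||||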
|  | ||||
|  | ||||
| def test_check_nopython_kwargs(): | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def incorrect_function(values, index): | ||||
|         return sum(values) * 2.7 | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, | ||||
|         columns=["key", "data"], | ||||
|     ) | ||||
|     with pytest.raises(NumbaUtilError, match="numba does not support"): | ||||
|         data.groupby("key").agg(incorrect_function, engine="numba", a=1) | ||||
|  | ||||
|     with pytest.raises(NumbaUtilError, match="numba does not support"): | ||||
|         data.groupby("key")["data"].agg(incorrect_function, engine="numba", a=1) | ||||
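|  | ||||
|  | ||||
| # Only keyword arguments are rejected; extra positional arguments are | ||||
| # forwarded to the UDF (see test_args_not_cached below), a sketch: | ||||
| # | ||||
| #     def sum_last(values, index, n): | ||||
| #         return values[-n:].sum() | ||||
| # | ||||
| #     data.groupby("key")["data"].agg(sum_last, 2, engine="numba") | ||||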
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings("ignore") | ||||
| # Filter warnings when parallel=True and the function can't be parallelized by Numba | ||||
| @pytest.mark.parametrize("jit", [True, False]) | ||||
| @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) | ||||
| @pytest.mark.parametrize("as_index", [True, False]) | ||||
| def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def func_numba(values, index): | ||||
|         return np.mean(values) * 2.7 | ||||
|  | ||||
|     if jit: | ||||
|         # Test accepted jitted functions | ||||
|         import numba | ||||
|  | ||||
|         func_numba = numba.jit(func_numba) | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] | ||||
|     ) | ||||
|     engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} | ||||
|     grouped = data.groupby(0, as_index=as_index) | ||||
|     if pandas_obj == "Series": | ||||
|         grouped = grouped[1] | ||||
|  | ||||
|     result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs) | ||||
|     expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython") | ||||
|  | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings("ignore") | ||||
| # Filter warnings when parallel=True and the function can't be parallelized by Numba | ||||
| @pytest.mark.parametrize("jit", [True, False]) | ||||
| @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) | ||||
| def test_cache(jit, pandas_obj, nogil, parallel, nopython): | ||||
|     # Test that the functions are cached correctly if we switch functions | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def func_1(values, index): | ||||
|         return np.mean(values) - 3.4 | ||||
|  | ||||
|     def func_2(values, index): | ||||
|         return np.mean(values) * 2.7 | ||||
|  | ||||
|     if jit: | ||||
|         import numba | ||||
|  | ||||
|         func_1 = numba.jit(func_1) | ||||
|         func_2 = numba.jit(func_2) | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] | ||||
|     ) | ||||
|     engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} | ||||
|     grouped = data.groupby(0) | ||||
|     if pandas_obj == "Series": | ||||
|         grouped = grouped[1] | ||||
|  | ||||
|     result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs) | ||||
|     expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython") | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|     # Add func_2 to the cache | ||||
|     result = grouped.agg(func_2, engine="numba", engine_kwargs=engine_kwargs) | ||||
|     expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython") | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|     # Retest func_1 which should use the cache | ||||
|     result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs) | ||||
|     expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython") | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_use_global_config(): | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def func_1(values, index): | ||||
|         return np.mean(values) - 3.4 | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] | ||||
|     ) | ||||
|     grouped = data.groupby(0) | ||||
|     expected = grouped.agg(func_1, engine="numba") | ||||
|     with option_context("compute.use_numba", True): | ||||
|         result = grouped.agg(func_1, engine=None) | ||||
|     tm.assert_frame_equal(expected, result) | ||||
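|  | ||||
|  | ||||
| # Note: with the "compute.use_numba" option enabled, engine=None resolves to | ||||
| # the numba engine; outside the option_context it falls back to "cython". | ||||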
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "agg_kwargs", | ||||
|     [ | ||||
|         {"func": ["min", "max"]}, | ||||
|         {"func": "min"}, | ||||
|         {"func": {1: ["min", "max"], 2: "sum"}}, | ||||
|         {"bmin": NamedAgg(column=1, aggfunc="min")}, | ||||
|     ], | ||||
| ) | ||||
| def test_multifunc_numba_vs_cython_frame(agg_kwargs): | ||||
|     pytest.importorskip("numba") | ||||
|     data = DataFrame( | ||||
|         { | ||||
|             0: ["a", "a", "b", "b", "a"], | ||||
|             1: [1.0, 2.0, 3.0, 4.0, 5.0], | ||||
|             2: [1, 2, 3, 4, 5], | ||||
|         }, | ||||
|         columns=[0, 1, 2], | ||||
|     ) | ||||
|     grouped = data.groupby(0) | ||||
|     result = grouped.agg(**agg_kwargs, engine="numba") | ||||
|     expected = grouped.agg(**agg_kwargs, engine="cython") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "agg_kwargs,expected_func", | ||||
|     [ | ||||
|         ({"func": lambda values, index: values.sum()}, "sum"), | ||||
|         # FIXME | ||||
|         pytest.param( | ||||
|             { | ||||
|                 "func": [ | ||||
|                     lambda values, index: values.sum(), | ||||
|                     lambda values, index: values.min(), | ||||
|                 ] | ||||
|             }, | ||||
|             ["sum", "min"], | ||||
|             marks=pytest.mark.xfail( | ||||
|                 reason="This doesn't work yet! Fails in nopython pipeline!" | ||||
|             ), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_multifunc_numba_udf_frame(agg_kwargs, expected_func): | ||||
|     pytest.importorskip("numba") | ||||
|     data = DataFrame( | ||||
|         { | ||||
|             0: ["a", "a", "b", "b", "a"], | ||||
|             1: [1.0, 2.0, 3.0, 4.0, 5.0], | ||||
|             2: [1, 2, 3, 4, 5], | ||||
|         }, | ||||
|         columns=[0, 1, 2], | ||||
|     ) | ||||
|     grouped = data.groupby(0) | ||||
|     result = grouped.agg(**agg_kwargs, engine="numba") | ||||
|     expected = grouped.agg(expected_func, engine="cython") | ||||
|     # check_dtype can be removed if GH 44952 is addressed | ||||
|     # Currently, UDFs still always return float64 while reductions can preserve dtype | ||||
|     tm.assert_frame_equal(result, expected, check_dtype=False) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "agg_kwargs", | ||||
|     [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}], | ||||
| ) | ||||
| def test_multifunc_numba_vs_cython_series(agg_kwargs): | ||||
|     pytest.importorskip("numba") | ||||
|     labels = ["a", "a", "b", "b", "a"] | ||||
|     data = Series([1.0, 2.0, 3.0, 4.0, 5.0]) | ||||
|     grouped = data.groupby(labels) | ||||
|     agg_kwargs["engine"] = "numba" | ||||
|     result = grouped.agg(**agg_kwargs) | ||||
|     agg_kwargs["engine"] = "cython" | ||||
|     expected = grouped.agg(**agg_kwargs) | ||||
|     if isinstance(expected, DataFrame): | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|     else: | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.single_cpu | ||||
| @pytest.mark.parametrize( | ||||
|     "data,agg_kwargs", | ||||
|     [ | ||||
|         (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}), | ||||
|         (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}), | ||||
|         ( | ||||
|             DataFrame( | ||||
|                 {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2] | ||||
|             ), | ||||
|             {"func": ["min", "max"]}, | ||||
|         ), | ||||
|         ( | ||||
|             DataFrame( | ||||
|                 {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2] | ||||
|             ), | ||||
|             {"func": "min"}, | ||||
|         ), | ||||
|         ( | ||||
|             DataFrame( | ||||
|                 {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2] | ||||
|             ), | ||||
|             {"func": {1: ["min", "max"], 2: "sum"}}, | ||||
|         ), | ||||
|         ( | ||||
|             DataFrame( | ||||
|                 {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2] | ||||
|             ), | ||||
|             {"min_col": NamedAgg(column=1, aggfunc="min")}, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_multifunc_numba_kwarg_propagation(data, agg_kwargs): | ||||
|     pytest.importorskip("numba") | ||||
|     labels = ["a", "a", "b", "b", "a"] | ||||
|     grouped = data.groupby(labels) | ||||
|     result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True}) | ||||
|     expected = grouped.agg(**agg_kwargs, engine="numba") | ||||
|     if isinstance(expected, DataFrame): | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|     else: | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_args_not_cached(): | ||||
|     # GH 41647 | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def sum_last(values, index, n): | ||||
|         return values[-n:].sum() | ||||
|  | ||||
|     df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]}) | ||||
|     grouped_x = df.groupby("id")["x"] | ||||
|     result = grouped_x.agg(sum_last, 1, engine="numba") | ||||
|     expected = Series([1.0] * 2, name="x", index=Index([0, 1], name="id")) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = grouped_x.agg(sum_last, 2, engine="numba") | ||||
|     expected = Series([2.0] * 2, name="x", index=Index([0, 1], name="id")) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_index_data_correctly_passed(): | ||||
|     # GH 43133 | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def f(values, index): | ||||
|         return np.mean(index) | ||||
|  | ||||
|     df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3]) | ||||
|     result = df.groupby("group").aggregate(f, engine="numba") | ||||
|     expected = DataFrame( | ||||
|         [-1.5, -3.0], columns=["v"], index=Index(["A", "B"], name="group") | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_engine_kwargs_not_cached(): | ||||
|     # If the user passes a different set of engine_kwargs don't return the same | ||||
|     # jitted function | ||||
|     pytest.importorskip("numba") | ||||
|     nogil = True | ||||
|     parallel = False | ||||
|     nopython = True | ||||
|  | ||||
|     def func_kwargs(values, index): | ||||
|         return nogil + parallel + nopython | ||||
|  | ||||
|     engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} | ||||
|     df = DataFrame({"value": [0, 0, 0]}) | ||||
|     result = df.groupby(level=0).aggregate( | ||||
|         func_kwargs, engine="numba", engine_kwargs=engine_kwargs | ||||
|     ) | ||||
|     expected = DataFrame({"value": [2.0, 2.0, 2.0]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     nogil = False | ||||
|     engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} | ||||
|     result = df.groupby(level=0).aggregate( | ||||
|         func_kwargs, engine="numba", engine_kwargs=engine_kwargs | ||||
|     ) | ||||
|     expected = DataFrame({"value": [1.0, 1.0, 1.0]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings("ignore") | ||||
| def test_multiindex_one_key(nogil, parallel, nopython): | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def numba_func(values, index): | ||||
|         return 1 | ||||
|  | ||||
|     df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) | ||||
|     engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} | ||||
|     result = df.groupby("A").agg( | ||||
|         numba_func, engine="numba", engine_kwargs=engine_kwargs | ||||
|     ) | ||||
|     expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def numba_func(values, index): | ||||
|         return 1 | ||||
|  | ||||
|     df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) | ||||
|     engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} | ||||
|     with pytest.raises(NotImplementedError, match="more than 1 grouping labels"): | ||||
|         df.groupby(["A", "B"]).agg( | ||||
|             numba_func, engine="numba", engine_kwargs=engine_kwargs | ||||
|         ) | ||||
|  | ||||
|  | ||||
| def test_multilabel_numba_vs_cython(numba_supported_reductions): | ||||
|     pytest.importorskip("numba") | ||||
|     reduction, kwargs = numba_supported_reductions | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "three", "two", "two", "one", "three"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8), | ||||
|             "D": np.random.default_rng(2).standard_normal(8), | ||||
|         } | ||||
|     ) | ||||
|     gb = df.groupby(["A", "B"]) | ||||
|     res_agg = gb.agg(reduction, engine="numba", **kwargs) | ||||
|     expected_agg = gb.agg(reduction, engine="cython", **kwargs) | ||||
|     tm.assert_frame_equal(res_agg, expected_agg) | ||||
|     # Test that calling the aggregation directly also works | ||||
|     direct_res = getattr(gb, reduction)(engine="numba", **kwargs) | ||||
|     direct_expected = getattr(gb, reduction)(engine="cython", **kwargs) | ||||
|     tm.assert_frame_equal(direct_res, direct_expected) | ||||
|  | ||||
|  | ||||
| def test_multilabel_udf_numba_vs_cython(): | ||||
|     pytest.importorskip("numba") | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "three", "two", "two", "one", "three"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8), | ||||
|             "D": np.random.default_rng(2).standard_normal(8), | ||||
|         } | ||||
|     ) | ||||
|     gb = df.groupby(["A", "B"]) | ||||
|     result = gb.agg(lambda values, index: values.min(), engine="numba") | ||||
|     expected = gb.agg(lambda x: x.min(), engine="cython") | ||||
|     tm.assert_frame_equal(result, expected) | ||||

| @@ -0,0 +1,676 @@ | ||||
| """ | ||||
| test all other .agg behavior | ||||
| """ | ||||
|  | ||||
| import datetime as dt | ||||
| from functools import partial | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.errors import SpecificationError | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     PeriodIndex, | ||||
|     Series, | ||||
|     date_range, | ||||
|     period_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| from pandas.io.formats.printing import pprint_thing | ||||
|  | ||||
|  | ||||
| def test_agg_partial_failure_raises(): | ||||
|     # GH#43741 | ||||
|  | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "data1": np.random.default_rng(2).standard_normal(5), | ||||
|             "data2": np.random.default_rng(2).standard_normal(5), | ||||
|             "key1": ["a", "a", "b", "b", "a"], | ||||
|             "key2": ["one", "two", "one", "two", "one"], | ||||
|         } | ||||
|     ) | ||||
|     grouped = df.groupby("key1") | ||||
|  | ||||
|     def peak_to_peak(arr): | ||||
|         return arr.max() - arr.min() | ||||
|  | ||||
|     with pytest.raises(TypeError, match="unsupported operand type"): | ||||
|         grouped.agg([peak_to_peak]) | ||||
|  | ||||
|     with pytest.raises(TypeError, match="unsupported operand type"): | ||||
|         grouped.agg(peak_to_peak) | ||||
|  | ||||
|  | ||||
| def test_agg_datetimes_mixed(): | ||||
|     data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]] | ||||
|  | ||||
|     df1 = DataFrame( | ||||
|         { | ||||
|             "key": [x[0] for x in data], | ||||
|             "date": [x[1] for x in data], | ||||
|             "value": [x[2] for x in data], | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     data = [ | ||||
|         [ | ||||
|             row[0], | ||||
|             (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None), | ||||
|             row[2], | ||||
|         ] | ||||
|         for row in data | ||||
|     ] | ||||
|  | ||||
|     df2 = DataFrame( | ||||
|         { | ||||
|             "key": [x[0] for x in data], | ||||
|             "date": [x[1] for x in data], | ||||
|             "value": [x[2] for x in data], | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     df1["weights"] = df1["value"] / df1["value"].sum() | ||||
|     gb1 = df1.groupby("date").aggregate("sum") | ||||
|  | ||||
|     df2["weights"] = df1["value"] / df1["value"].sum() | ||||
|     gb2 = df2.groupby("date").aggregate("sum") | ||||
|  | ||||
|     assert len(gb1) == len(gb2) | ||||
|  | ||||
|  | ||||
| def test_agg_period_index(): | ||||
|     prng = period_range("2012-1-1", freq="M", periods=3) | ||||
|     df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)), index=prng) | ||||
|     rs = df.groupby(level=0).sum() | ||||
|     assert isinstance(rs.index, PeriodIndex) | ||||
|  | ||||
|     # GH 3579 | ||||
|     index = period_range(start="1999-01", periods=5, freq="M") | ||||
|     s1 = Series(np.random.default_rng(2).random(len(index)), index=index) | ||||
|     s2 = Series(np.random.default_rng(2).random(len(index)), index=index) | ||||
|     df = DataFrame.from_dict({"s1": s1, "s2": s2}) | ||||
|     grouped = df.groupby(df.index.month) | ||||
|     list(grouped) | ||||
|  | ||||
|  | ||||
| def test_agg_dict_parameter_cast_result_dtypes(): | ||||
|     # GH 12821 | ||||
|  | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "class": ["A", "A", "B", "B", "C", "C", "D", "D"], | ||||
|             "time": date_range("1/1/2011", periods=8, freq="h"), | ||||
|         } | ||||
|     ) | ||||
|     df.loc[[0, 1, 2, 5], "time"] = None | ||||
|  | ||||
|     # test for `first` function | ||||
|     exp = df.loc[[0, 3, 4, 6]].set_index("class") | ||||
|     grouped = df.groupby("class") | ||||
|     tm.assert_frame_equal(grouped.first(), exp) | ||||
|     tm.assert_frame_equal(grouped.agg("first"), exp) | ||||
|     tm.assert_frame_equal(grouped.agg({"time": "first"}), exp) | ||||
|     tm.assert_series_equal(grouped.time.first(), exp["time"]) | ||||
|     tm.assert_series_equal(grouped.time.agg("first"), exp["time"]) | ||||
|  | ||||
|     # test for `last` function | ||||
|     exp = df.loc[[0, 3, 4, 7]].set_index("class") | ||||
|     grouped = df.groupby("class") | ||||
|     tm.assert_frame_equal(grouped.last(), exp) | ||||
|     tm.assert_frame_equal(grouped.agg("last"), exp) | ||||
|     tm.assert_frame_equal(grouped.agg({"time": "last"}), exp) | ||||
|     tm.assert_series_equal(grouped.time.last(), exp["time"]) | ||||
|     tm.assert_series_equal(grouped.time.agg("last"), exp["time"]) | ||||
|  | ||||
|     # count | ||||
|     exp = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time") | ||||
|     tm.assert_series_equal(grouped.time.agg(len), exp) | ||||
|     tm.assert_series_equal(grouped.time.size(), exp) | ||||
|  | ||||
|     exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time") | ||||
|     tm.assert_series_equal(grouped.time.count(), exp) | ||||
|  | ||||
|  | ||||
| def test_agg_cast_results_dtypes(): | ||||
|     # similar to GH12821 | ||||
|     # xref #11444 | ||||
|     u = [dt.datetime(2015, x + 1, 1) for x in range(12)] | ||||
|     v = list("aaabbbbbbccd") | ||||
|     df = DataFrame({"X": v, "Y": u}) | ||||
|  | ||||
|     result = df.groupby("X")["Y"].agg(len) | ||||
|     expected = df.groupby("X")["Y"].count() | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_aggregate_float64_no_int64(): | ||||
|     # see gh-11199 | ||||
|     df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}) | ||||
|  | ||||
|     expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) | ||||
|     expected.index.name = "b" | ||||
|  | ||||
|     result = df.groupby("b")[["a"]].mean() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) | ||||
|     expected.index.name = "b" | ||||
|  | ||||
|     result = df.groupby("b")[["a", "c"]].mean() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_aggregate_api_consistency(): | ||||
|     # GH 9052 | ||||
|     # make sure that the aggregates via dict | ||||
|     # are consistent | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "two", "two", "two", "one", "two"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8) + 1.0, | ||||
|             "D": np.arange(8), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     grouped = df.groupby(["A", "B"]) | ||||
|     c_mean = grouped["C"].mean() | ||||
|     c_sum = grouped["C"].sum() | ||||
|     d_mean = grouped["D"].mean() | ||||
|     d_sum = grouped["D"].sum() | ||||
|  | ||||
|     result = grouped["D"].agg(["sum", "mean"]) | ||||
|     expected = pd.concat([d_sum, d_mean], axis=1) | ||||
|     expected.columns = ["sum", "mean"] | ||||
|     tm.assert_frame_equal(result, expected, check_like=True) | ||||
|  | ||||
|     result = grouped.agg(["sum", "mean"]) | ||||
|     expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) | ||||
|     expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) | ||||
|     tm.assert_frame_equal(result, expected, check_like=True) | ||||
|  | ||||
|     result = grouped[["D", "C"]].agg(["sum", "mean"]) | ||||
|     expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) | ||||
|     expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) | ||||
|     tm.assert_frame_equal(result, expected, check_like=True) | ||||
|  | ||||
|     result = grouped.agg({"C": "mean", "D": "sum"}) | ||||
|     expected = pd.concat([d_sum, c_mean], axis=1) | ||||
|     tm.assert_frame_equal(result, expected, check_like=True) | ||||
|  | ||||
|     result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]}) | ||||
|     expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) | ||||
|     expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) | ||||
|  | ||||
|     msg = r"Column\(s\) \['r', 'r2'\] do not exist" | ||||
|     with pytest.raises(KeyError, match=msg): | ||||
|         grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"}) | ||||
|  | ||||
|  | ||||
| def test_agg_dict_renaming_deprecation(): | ||||
|     # 15931 | ||||
|     df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) | ||||
|  | ||||
|     msg = r"nested renamer is not supported" | ||||
|     with pytest.raises(SpecificationError, match=msg): | ||||
|         df.groupby("A").agg( | ||||
|             {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} | ||||
|         ) | ||||
|  | ||||
|     msg = r"Column\(s\) \['ma'\] do not exist" | ||||
|     with pytest.raises(KeyError, match=msg): | ||||
|         df.groupby("A")[["B", "C"]].agg({"ma": "max"}) | ||||
|  | ||||
|     msg = r"nested renamer is not supported" | ||||
|     with pytest.raises(SpecificationError, match=msg): | ||||
|         df.groupby("A").B.agg({"foo": "count"}) | ||||
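|  | ||||
|  | ||||
| # The supported alternative to nested renamers is named aggregation, | ||||
| # a sketch: | ||||
| # | ||||
| #     df.groupby("A").agg(foo=("B", "sum"), bar=("C", "min")) | ||||
| #     df.groupby("A").B.agg(foo="count") | ||||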
|  | ||||
|  | ||||
| def test_agg_compat(): | ||||
|     # GH 12334 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "two", "two", "two", "one", "two"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8) + 1.0, | ||||
|             "D": np.arange(8), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     g = df.groupby(["A", "B"]) | ||||
|  | ||||
|     msg = r"nested renamer is not supported" | ||||
|     with pytest.raises(SpecificationError, match=msg): | ||||
|         g["D"].agg({"C": ["sum", "std"]}) | ||||
|  | ||||
|     with pytest.raises(SpecificationError, match=msg): | ||||
|         g["D"].agg({"C": "sum", "D": "std"}) | ||||
|  | ||||
|  | ||||
| def test_agg_nested_dicts(): | ||||
|     # API change for disallowing these types of nested dicts | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "two", "two", "two", "one", "two"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8) + 1.0, | ||||
|             "D": np.arange(8), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     g = df.groupby(["A", "B"]) | ||||
|  | ||||
|     msg = r"nested renamer is not supported" | ||||
|     with pytest.raises(SpecificationError, match=msg): | ||||
|         g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}) | ||||
|  | ||||
|     with pytest.raises(SpecificationError, match=msg): | ||||
|         g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) | ||||
|  | ||||
|     # same name as the original column | ||||
|     # GH9052 | ||||
|     with pytest.raises(SpecificationError, match=msg): | ||||
|         g["D"].agg({"result1": np.sum, "result2": np.mean}) | ||||
|  | ||||
|     with pytest.raises(SpecificationError, match=msg): | ||||
|         g["D"].agg({"D": np.sum, "result2": np.mean}) | ||||
|  | ||||
|  | ||||
| def test_agg_item_by_item_raise_typeerror(): | ||||
|     df = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10))) | ||||
|  | ||||
|     def raiseException(df): | ||||
|         pprint_thing("----------------------------------------") | ||||
|         pprint_thing(df.to_string()) | ||||
|         raise TypeError("test") | ||||
|  | ||||
|     with pytest.raises(TypeError, match="test"): | ||||
|         df.groupby(0).agg(raiseException) | ||||
|  | ||||
|  | ||||
| def test_series_agg_multikey(): | ||||
|     ts = Series( | ||||
|         np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) | ||||
|     ) | ||||
|     grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) | ||||
|  | ||||
|     result = grouped.agg("sum") | ||||
|     expected = grouped.sum() | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_series_agg_multi_pure_python(): | ||||
|     data = DataFrame( | ||||
|         { | ||||
|             "A": [ | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|             ], | ||||
|             "B": [ | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "two", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "two", | ||||
|                 "two", | ||||
|                 "two", | ||||
|                 "one", | ||||
|             ], | ||||
|             "C": [ | ||||
|                 "dull", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "dull", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|             ], | ||||
|             "D": np.random.default_rng(2).standard_normal(11), | ||||
|             "E": np.random.default_rng(2).standard_normal(11), | ||||
|             "F": np.random.default_rng(2).standard_normal(11), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     def bad(x): | ||||
|         if isinstance(x.values, np.ndarray): | ||||
|             assert len(x.values.base) > 0 | ||||
|         return "foo" | ||||
|  | ||||
|     result = data.groupby(["A", "B"]).agg(bad) | ||||
|     expected = data.groupby(["A", "B"]).agg(lambda x: "foo") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_agg_consistency(): | ||||
|     # agg with ([]) and () not consistent | ||||
|     # GH 6715 | ||||
|     def P1(a): | ||||
|         return np.percentile(a.dropna(), q=1) | ||||
|  | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "col1": [1, 2, 3, 4], | ||||
|             "col2": [10, 25, 26, 31], | ||||
|             "date": [ | ||||
|                 dt.date(2013, 2, 10), | ||||
|                 dt.date(2013, 2, 10), | ||||
|                 dt.date(2013, 2, 11), | ||||
|                 dt.date(2013, 2, 11), | ||||
|             ], | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     g = df.groupby("date") | ||||
|  | ||||
|     expected = g.agg([P1]) | ||||
|     expected.columns = expected.columns.levels[0] | ||||
|  | ||||
|     result = g.agg(P1) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
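|  | ||||
|  | ||||
| # Note: agg with a list like [P1] produces MultiIndex columns such as | ||||
| # ("col1", "P1"); keeping only the first level makes the frame comparable | ||||
| # to the flat-column result of agg(P1). | ||||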
|  | ||||
|  | ||||
| def test_agg_callables(): | ||||
|     # GH 7929 | ||||
|     df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64) | ||||
|  | ||||
|     class fn_class: | ||||
|         def __call__(self, x): | ||||
|             return sum(x) | ||||
|  | ||||
|     equiv_callables = [ | ||||
|         sum, | ||||
|         np.sum, | ||||
|         lambda x: sum(x), | ||||
|         lambda x: x.sum(), | ||||
|         partial(sum), | ||||
|         fn_class(), | ||||
|     ] | ||||
|  | ||||
|     expected = df.groupby("foo").agg("sum") | ||||
|     for ecall in equiv_callables: | ||||
|         warn = FutureWarning if ecall is sum or ecall is np.sum else None | ||||
|         msg = "using DataFrameGroupBy.sum" | ||||
|         with tm.assert_produces_warning(warn, match=msg): | ||||
|             result = df.groupby("foo").agg(ecall) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_agg_over_numpy_arrays(): | ||||
|     # GH 3788 | ||||
|     df = DataFrame( | ||||
|         [ | ||||
|             [1, np.array([10, 20, 30])], | ||||
|             [1, np.array([40, 50, 60])], | ||||
|             [2, np.array([20, 30, 40])], | ||||
|         ], | ||||
|         columns=["category", "arraydata"], | ||||
|     ) | ||||
|     gb = df.groupby("category") | ||||
|  | ||||
|     expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] | ||||
|     expected_index = Index([1, 2], name="category") | ||||
|     expected_column = ["arraydata"] | ||||
|     expected = DataFrame(expected_data, index=expected_index, columns=expected_column) | ||||
|  | ||||
|     alt = gb.sum(numeric_only=False) | ||||
|     tm.assert_frame_equal(alt, expected) | ||||
|  | ||||
|     result = gb.agg("sum", numeric_only=False) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # FIXME: the original version of this test called `gb.agg(sum)` | ||||
|     #  and that raises TypeError if `numeric_only=False` is passed | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("as_period", [True, False]) | ||||
| def test_agg_tzaware_non_datetime_result(as_period): | ||||
|     # discussed in GH#29589, fixed in GH#29641, operating on tzaware values | ||||
|     #  with function that is not dtype-preserving | ||||
|     dti = date_range("2012-01-01", periods=4, tz="UTC") | ||||
|     if as_period: | ||||
|         dti = dti.tz_localize(None).to_period("D") | ||||
|  | ||||
|     df = DataFrame({"a": [0, 0, 1, 1], "b": dti}) | ||||
|     gb = df.groupby("a") | ||||
|  | ||||
|     # Case that _does_ preserve the dtype | ||||
|     result = gb["b"].agg(lambda x: x.iloc[0]) | ||||
|     expected = Series(dti[::2], name="b") | ||||
|     expected.index.name = "a" | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     # Cases that do _not_ preserve the dtype | ||||
|     result = gb["b"].agg(lambda x: x.iloc[0].year) | ||||
|     expected = Series([2012, 2012], name="b") | ||||
|     expected.index.name = "a" | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0]) | ||||
|     expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b") | ||||
|     expected.index.name = "a" | ||||
|     if as_period: | ||||
|         expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b") | ||||
|         expected.index.name = "a" | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_agg_timezone_round_trip(): | ||||
|     # GH 15426 | ||||
|     ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") | ||||
|     df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]}) | ||||
|  | ||||
|     result1 = df.groupby("a")["b"].agg("min").iloc[0] | ||||
|     result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0] | ||||
|     result3 = df.groupby("a")["b"].min().iloc[0] | ||||
|  | ||||
|     assert result1 == ts | ||||
|     assert result2 == ts | ||||
|     assert result3 == ts | ||||
|  | ||||
|     dates = [ | ||||
|         pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5) | ||||
|     ] | ||||
|     df = DataFrame({"A": ["a", "b"] * 2, "B": dates}) | ||||
|     grouped = df.groupby("A") | ||||
|  | ||||
|     ts = df["B"].iloc[0] | ||||
|     assert ts == grouped.nth(0)["B"].iloc[0] | ||||
|     assert ts == grouped.head(1)["B"].iloc[0] | ||||
|     assert ts == grouped.first()["B"].iloc[0] | ||||
|  | ||||
|     # GH#27110 applying iloc should return a DataFrame | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] | ||||
|  | ||||
|     ts = df["B"].iloc[2] | ||||
|     assert ts == grouped.last()["B"].iloc[0] | ||||
|  | ||||
|     # GH#27110 applying iloc should return a DataFrame | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] | ||||
|  | ||||
|  | ||||
| def test_sum_uint64_overflow(): | ||||
|     # see gh-14758 | ||||
|     # Convert to uint64 and don't overflow | ||||
|     df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) | ||||
|     df = df + 9223372036854775807 | ||||
|  | ||||
|     index = Index( | ||||
|         [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64 | ||||
|     ) | ||||
|     expected = DataFrame( | ||||
|         {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, | ||||
|         index=index, | ||||
|         dtype=object, | ||||
|     ) | ||||
|  | ||||
|     expected.index.name = 0 | ||||
|     result = df.groupby(0).sum(numeric_only=False) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # the remaining column is object dtype (non-numeric), so with | ||||
|     # numeric_only=True it is dropped | ||||
|     result2 = df.groupby(0).sum(numeric_only=True) | ||||
|     expected2 = expected[[]] | ||||
|     tm.assert_frame_equal(result2, expected2) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "structure, expected", | ||||
|     [ | ||||
|         (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), | ||||
|         (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), | ||||
|         ( | ||||
|             lambda x: tuple(x), | ||||
|             DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}), | ||||
|         ), | ||||
|         ( | ||||
|             lambda x: list(x), | ||||
|             DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_agg_structs_dataframe(structure, expected): | ||||
|     df = DataFrame( | ||||
|         {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} | ||||
|     ) | ||||
|  | ||||
|     result = df.groupby(["A", "B"]).aggregate(structure) | ||||
|     expected.index.names = ["A", "B"] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "structure, expected", | ||||
|     [ | ||||
|         (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), | ||||
|         (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), | ||||
|         (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), | ||||
|         (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), | ||||
|     ], | ||||
| ) | ||||
| def test_agg_structs_series(structure, expected): | ||||
|     # Issue #18079 | ||||
|     df = DataFrame( | ||||
|         {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} | ||||
|     ) | ||||
|  | ||||
|     result = df.groupby("A")["C"].aggregate(structure) | ||||
|     expected.index.name = "A" | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_agg_category_nansum(observed): | ||||
|     categories = ["a", "b", "c"] | ||||
|     df = DataFrame( | ||||
|         {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]} | ||||
|     ) | ||||
|     msg = "using SeriesGroupBy.sum" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = df.groupby("A", observed=observed).B.agg(np.nansum) | ||||
|     expected = Series( | ||||
|         [3, 3, 0], | ||||
|         index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"), | ||||
|         name="B", | ||||
|     ) | ||||
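|     # with observed=False the result is reindexed over all categories, so the | ||||
|     # unobserved "c" appears with a nansum of 0; observed=True keeps only the | ||||
|     # categories that actually occur, which the filter below reproduces | ||||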
|     if observed: | ||||
|         expected = expected[expected != 0] | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_agg_list_like_func(): | ||||
|     # GH 18473 | ||||
|     df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]}) | ||||
|     grouped = df.groupby("A", as_index=False, sort=False) | ||||
|     result = grouped.agg({"B": lambda x: list(x)}) | ||||
|     expected = DataFrame( | ||||
|         {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]} | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_agg_lambda_with_timezone(): | ||||
|     # GH 23683 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "tag": [1, 1], | ||||
|             "date": [ | ||||
|                 pd.Timestamp("2018-01-01", tz="UTC"), | ||||
|                 pd.Timestamp("2018-01-02", tz="UTC"), | ||||
|             ], | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("tag").agg({"date": lambda e: e.head(1)}) | ||||
|     expected = DataFrame( | ||||
|         [pd.Timestamp("2018-01-01", tz="UTC")], | ||||
|         index=Index([1], name="tag"), | ||||
|         columns=["date"], | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "err_cls", | ||||
|     [ | ||||
|         NotImplementedError, | ||||
|         RuntimeError, | ||||
|         KeyError, | ||||
|         IndexError, | ||||
|         OSError, | ||||
|         ValueError, | ||||
|         ArithmeticError, | ||||
|         AttributeError, | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_agg_err_catching(err_cls): | ||||
|     # make sure we suppress anything other than TypeError or AssertionError | ||||
|     #  in _python_agg_general | ||||
|  | ||||
|     # Use a non-standard EA to make sure we don't go down ndarray paths | ||||
|     from pandas.tests.extension.decimal.array import ( | ||||
|         DecimalArray, | ||||
|         make_data, | ||||
|         to_decimal, | ||||
|     ) | ||||
|  | ||||
|     data = make_data()[:5] | ||||
|     df = DataFrame( | ||||
|         {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)} | ||||
|     ) | ||||
|  | ||||
|     expected = Series(to_decimal([data[0], data[3]])) | ||||
|  | ||||
|     def weird_func(x): | ||||
|         # weird function that raises something other than TypeError or AssertionError | ||||
|         #  in _python_agg_general | ||||
|         if len(x) == 0: | ||||
|             raise err_cls | ||||
|         return x.iloc[0] | ||||
|  | ||||
|     result = df["decimals"].groupby(df["id1"]).agg(weird_func) | ||||
|     tm.assert_series_equal(result, expected, check_names=False) | ||||
lib/python3.11/site-packages/pandas/tests/groupby/conftest.py (new file, 208 lines)
							| @ -0,0 +1,208 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
|     date_range, | ||||
| ) | ||||
| from pandas.core.groupby.base import ( | ||||
|     reduction_kernels, | ||||
|     transformation_kernels, | ||||
| ) | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=[True, False]) | ||||
| def sort(request): | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=[True, False]) | ||||
| def as_index(request): | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=[True, False]) | ||||
| def dropna(request): | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=[True, False]) | ||||
| def observed(request): | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def df(): | ||||
|     return DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "three", "two", "two", "one", "three"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8), | ||||
|             "D": np.random.default_rng(2).standard_normal(8), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def ts(): | ||||
|     return Series( | ||||
|         np.random.default_rng(2).standard_normal(30), | ||||
|         index=date_range("2000-01-01", periods=30, freq="B"), | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def tsframe(): | ||||
|     return DataFrame( | ||||
|         np.random.default_rng(2).standard_normal((30, 4)), | ||||
|         columns=Index(list("ABCD"), dtype=object), | ||||
|         index=date_range("2000-01-01", periods=30, freq="B"), | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def three_group(): | ||||
|     return DataFrame( | ||||
|         { | ||||
|             "A": [ | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|             ], | ||||
|             "B": [ | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "two", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "two", | ||||
|                 "two", | ||||
|                 "two", | ||||
|                 "one", | ||||
|             ], | ||||
|             "C": [ | ||||
|                 "dull", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "dull", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|             ], | ||||
|             "D": np.random.default_rng(2).standard_normal(11), | ||||
|             "E": np.random.default_rng(2).standard_normal(11), | ||||
|             "F": np.random.default_rng(2).standard_normal(11), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @pytest.fixture() | ||||
| def slice_test_df(): | ||||
|     data = [ | ||||
|         [0, "a", "a0_at_0"], | ||||
|         [1, "b", "b0_at_1"], | ||||
|         [2, "a", "a1_at_2"], | ||||
|         [3, "b", "b1_at_3"], | ||||
|         [4, "c", "c0_at_4"], | ||||
|         [5, "a", "a2_at_5"], | ||||
|         [6, "a", "a3_at_6"], | ||||
|         [7, "a", "a4_at_7"], | ||||
|     ] | ||||
|     df = DataFrame(data, columns=["Index", "Group", "Value"]) | ||||
|     return df.set_index("Index") | ||||
|  | ||||
|  | ||||
| @pytest.fixture() | ||||
| def slice_test_grouped(slice_test_df): | ||||
|     return slice_test_df.groupby("Group", as_index=False) | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=sorted(reduction_kernels)) | ||||
| def reduction_func(request): | ||||
|     """ | ||||
|     yields the string names of all groupby reduction functions, one at a time. | ||||
|     """ | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=sorted(transformation_kernels)) | ||||
| def transformation_func(request): | ||||
|     """yields the string names of all groupby transformation functions.""" | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels)) | ||||
| def groupby_func(request): | ||||
|     """yields both aggregation and transformation functions.""" | ||||
|     return request.param | ||||
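|  | ||||
|  | ||||
| # A minimal usage sketch (hypothetical test, for illustration): a test that | ||||
| # accepts one of the kernel fixtures above as an argument is collected once | ||||
| # per kernel name, e.g. | ||||
| # | ||||
| #   def test_kernel_name_is_str(reduction_func): | ||||
| #       assert isinstance(reduction_func, str) | ||||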
|  | ||||
|  | ||||
| @pytest.fixture(params=[True, False]) | ||||
| def parallel(request): | ||||
|     """parallel keyword argument for numba.jit""" | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| # Can parameterize nogil & nopython over True | False, but limiting per | ||||
| # https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472 | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=[False]) | ||||
| def nogil(request): | ||||
|     """nogil keyword argument for numba.jit""" | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=[True]) | ||||
| def nopython(request): | ||||
|     """nopython keyword argument for numba.jit""" | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture( | ||||
|     params=[ | ||||
|         ("mean", {}), | ||||
|         ("var", {"ddof": 1}), | ||||
|         ("var", {"ddof": 0}), | ||||
|         ("std", {"ddof": 1}), | ||||
|         ("std", {"ddof": 0}), | ||||
|         ("sum", {}), | ||||
|         ("min", {}), | ||||
|         ("max", {}), | ||||
|         ("sum", {"min_count": 2}), | ||||
|         ("min", {"min_count": 2}), | ||||
|         ("max", {"min_count": 2}), | ||||
|     ], | ||||
|     ids=[ | ||||
|         "mean", | ||||
|         "var_1", | ||||
|         "var_0", | ||||
|         "std_1", | ||||
|         "std_0", | ||||
|         "sum", | ||||
|         "min", | ||||
|         "max", | ||||
|         "sum-min_count", | ||||
|         "min-min_count", | ||||
|         "max-min_count", | ||||
|     ], | ||||
| ) | ||||
| def numba_supported_reductions(request): | ||||
|     """reductions supported with engine='numba'""" | ||||
|     return request.param | ||||
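|  | ||||
|  | ||||
| # The fixture above yields ``(op_name, kwargs)`` pairs; a consuming test would | ||||
| # unpack them along these lines (illustrative sketch only): | ||||
| # | ||||
| #   op, kwargs = numba_supported_reductions | ||||
| #   getattr(df.groupby("a"), op)(engine="numba", **kwargs) | ||||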
| @ -0,0 +1,24 @@ | ||||
| import numpy as np | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_corrwith_with_1_axis(): | ||||
|     # GH 47723 | ||||
|     df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) | ||||
|     gb = df.groupby("a") | ||||
|  | ||||
|     msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = gb.corrwith(df, axis=1) | ||||
|     index = Index( | ||||
|         data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], | ||||
|         name=("a", None), | ||||
|     ) | ||||
|     expected = Series([np.nan] * 6, index=index) | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,301 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     Timestamp, | ||||
|     date_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_apply_describe_bug(multiindex_dataframe_random_data): | ||||
|     grouped = multiindex_dataframe_random_data.groupby(level="first") | ||||
|     grouped.describe()  # it works! | ||||
|  | ||||
|  | ||||
| def test_series_describe_multikey(): | ||||
|     ts = Series( | ||||
|         np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) | ||||
|     ) | ||||
|     grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) | ||||
|     result = grouped.describe() | ||||
|     tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) | ||||
|     tm.assert_series_equal(result["std"], grouped.std(), check_names=False) | ||||
|     tm.assert_series_equal(result["min"], grouped.min(), check_names=False) | ||||
|  | ||||
|  | ||||
| def test_series_describe_single(): | ||||
|     ts = Series( | ||||
|         np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) | ||||
|     ) | ||||
|     grouped = ts.groupby(lambda x: x.month) | ||||
|     result = grouped.apply(lambda x: x.describe()) | ||||
|     expected = grouped.describe().stack(future_stack=True) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) | ||||
| def test_series_describe_as_index(as_index, keys): | ||||
|     # GH#49256 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "key1": ["one", "two", "two", "three", "two"], | ||||
|             "key2": ["one", "two", "two", "three", "two"], | ||||
|             "foo2": [1, 2, 4, 4, 6], | ||||
|         } | ||||
|     ) | ||||
|     gb = df.groupby(keys, as_index=as_index)["foo2"] | ||||
|     result = gb.describe() | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "key1": ["one", "three", "two"], | ||||
|             "count": [1.0, 1.0, 3.0], | ||||
|             "mean": [1.0, 4.0, 4.0], | ||||
|             "std": [np.nan, np.nan, 2.0], | ||||
|             "min": [1.0, 4.0, 2.0], | ||||
|             "25%": [1.0, 4.0, 3.0], | ||||
|             "50%": [1.0, 4.0, 4.0], | ||||
|             "75%": [1.0, 4.0, 5.0], | ||||
|             "max": [1.0, 4.0, 6.0], | ||||
|         } | ||||
|     ) | ||||
|     if len(keys) == 2: | ||||
|         expected.insert(1, "key2", expected["key1"]) | ||||
|     if as_index: | ||||
|         expected = expected.set_index(keys) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_frame_describe_multikey(tsframe, using_infer_string): | ||||
|     grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) | ||||
|     result = grouped.describe() | ||||
|     desc_groups = [] | ||||
|     for col in tsframe: | ||||
|         group = grouped[col].describe() | ||||
|         # GH 17464 - Remove duplicate MultiIndex levels | ||||
|         group_col = MultiIndex( | ||||
|             levels=[Index([col], dtype=tsframe.columns.dtype), group.columns], | ||||
|             codes=[[0] * len(group.columns), range(len(group.columns))], | ||||
|         ) | ||||
|         group = DataFrame(group.values, columns=group_col, index=group.index) | ||||
|         desc_groups.append(group) | ||||
|     expected = pd.concat(desc_groups, axis=1) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # the remainder of this test fails with string dtype but exercises deprecated behaviour | ||||
|     if using_infer_string: | ||||
|         return | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) | ||||
|     result = groupedT.describe() | ||||
|     expected = tsframe.describe().T | ||||
|     # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ | ||||
|     expected.index = MultiIndex( | ||||
|         levels=[[0, 1], expected.index], | ||||
|         codes=[[0, 0, 1, 1], range(len(expected.index))], | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_frame_describe_tupleindex(): | ||||
|     # GH 14848 - regression from 0.19.0 to 0.19.1 | ||||
|     df1 = DataFrame( | ||||
|         { | ||||
|             "x": [1, 2, 3, 4, 5] * 3, | ||||
|             "y": [10, 20, 30, 40, 50] * 3, | ||||
|             "z": [100, 200, 300, 400, 500] * 3, | ||||
|         } | ||||
|     ) | ||||
|     df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 | ||||
|     df2 = df1.rename(columns={"k": "key"}) | ||||
|     msg = "Names should be list-like for a MultiIndex" | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df1.groupby("k").describe() | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df2.groupby("key").describe() | ||||
|  | ||||
|  | ||||
| def test_frame_describe_unstacked_format(): | ||||
|     # GH 4792 | ||||
|     prices = { | ||||
|         Timestamp("2011-01-06 10:59:05", tz=None): 24990, | ||||
|         Timestamp("2011-01-06 12:43:33", tz=None): 25499, | ||||
|         Timestamp("2011-01-06 12:54:09", tz=None): 25499, | ||||
|     } | ||||
|     volumes = { | ||||
|         Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, | ||||
|         Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, | ||||
|         Timestamp("2011-01-06 12:54:09", tz=None): 100000000, | ||||
|     } | ||||
|     df = DataFrame({"PRICE": prices, "VOLUME": volumes}) | ||||
|     result = df.groupby("PRICE").VOLUME.describe() | ||||
|     data = [ | ||||
|         df[df.PRICE == 24990].VOLUME.describe().values.tolist(), | ||||
|         df[df.PRICE == 25499].VOLUME.describe().values.tolist(), | ||||
|     ] | ||||
|     expected = DataFrame( | ||||
|         data, | ||||
|         index=Index([24990, 25499], name="PRICE"), | ||||
|         columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings( | ||||
|     "ignore:" | ||||
|     "indexing past lexsort depth may impact performance:" | ||||
|     "pandas.errors.PerformanceWarning" | ||||
| ) | ||||
| @pytest.mark.parametrize("as_index", [True, False]) | ||||
| @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) | ||||
| def test_describe_with_duplicate_output_column_names(as_index, keys): | ||||
|     # GH 35314 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a1": [99, 99, 99, 88, 88, 88], | ||||
|             "a2": [99, 99, 99, 88, 88, 88], | ||||
|             "b": [1, 2, 3, 4, 5, 6], | ||||
|             "c": [10, 20, 30, 40, 50, 60], | ||||
|         }, | ||||
|         columns=["a1", "a2", "b", "b"], | ||||
|         copy=False, | ||||
|     ) | ||||
|     if keys == ["a1"]: | ||||
|         df = df.drop(columns="a2") | ||||
|  | ||||
|     expected = ( | ||||
|         DataFrame.from_records( | ||||
|             [ | ||||
|                 ("b", "count", 3.0, 3.0), | ||||
|                 ("b", "mean", 5.0, 2.0), | ||||
|                 ("b", "std", 1.0, 1.0), | ||||
|                 ("b", "min", 4.0, 1.0), | ||||
|                 ("b", "25%", 4.5, 1.5), | ||||
|                 ("b", "50%", 5.0, 2.0), | ||||
|                 ("b", "75%", 5.5, 2.5), | ||||
|                 ("b", "max", 6.0, 3.0), | ||||
|                 ("b", "count", 3.0, 3.0), | ||||
|                 ("b", "mean", 5.0, 2.0), | ||||
|                 ("b", "std", 1.0, 1.0), | ||||
|                 ("b", "min", 4.0, 1.0), | ||||
|                 ("b", "25%", 4.5, 1.5), | ||||
|                 ("b", "50%", 5.0, 2.0), | ||||
|                 ("b", "75%", 5.5, 2.5), | ||||
|                 ("b", "max", 6.0, 3.0), | ||||
|             ], | ||||
|         ) | ||||
|         .set_index([0, 1]) | ||||
|         .T | ||||
|     ) | ||||
|     expected.columns.names = [None, None] | ||||
|     if len(keys) == 2: | ||||
|         expected.index = MultiIndex( | ||||
|             levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] | ||||
|         ) | ||||
|     else: | ||||
|         expected.index = Index([88, 99], name="a1") | ||||
|  | ||||
|     if not as_index: | ||||
|         expected = expected.reset_index() | ||||
|  | ||||
|     result = df.groupby(keys, as_index=as_index).describe() | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_describe_duplicate_columns(): | ||||
|     # GH#50806 | ||||
|     df = DataFrame([[0, 1, 2, 3]]) | ||||
|     df.columns = [0, 1, 2, 0] | ||||
|     gb = df.groupby(df[1]) | ||||
|     result = gb.describe(percentiles=[]) | ||||
|  | ||||
|     columns = ["count", "mean", "std", "min", "50%", "max"] | ||||
|     frames = [ | ||||
|         DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) | ||||
|         for val in (0.0, 2.0, 3.0) | ||||
|     ] | ||||
|     expected = pd.concat(frames, axis=1) | ||||
|     expected.columns = MultiIndex( | ||||
|         levels=[[0, 2], columns], | ||||
|         codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], | ||||
|     ) | ||||
|     expected.index.names = [1] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| class TestGroupByNonCythonPaths: | ||||
|     # GH#5610 non-cython calls should not include the grouper | ||||
|     # Tests for code not expected to go through cython paths. | ||||
|  | ||||
|     @pytest.fixture | ||||
|     def df(self): | ||||
|         df = DataFrame( | ||||
|             [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], | ||||
|             columns=["A", "B", "C"], | ||||
|         ) | ||||
|         return df | ||||
|  | ||||
|     @pytest.fixture | ||||
|     def gb(self, df): | ||||
|         gb = df.groupby("A") | ||||
|         return gb | ||||
|  | ||||
|     @pytest.fixture | ||||
|     def gni(self, df): | ||||
|         gni = df.groupby("A", as_index=False) | ||||
|         return gni | ||||
|  | ||||
|     def test_describe(self, df, gb, gni): | ||||
|         # describe | ||||
|         expected_index = Index([1, 3], name="A") | ||||
|         expected_col = MultiIndex( | ||||
|             levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], | ||||
|             codes=[[0] * 8, list(range(8))], | ||||
|         ) | ||||
|         expected = DataFrame( | ||||
|             [ | ||||
|                 [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], | ||||
|                 [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], | ||||
|             ], | ||||
|             index=expected_index, | ||||
|             columns=expected_col, | ||||
|         ) | ||||
|         result = gb.describe() | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         expected = expected.reset_index() | ||||
|         result = gni.describe() | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dtype", [int, float, object]) | ||||
| @pytest.mark.parametrize( | ||||
|     "kwargs", | ||||
|     [ | ||||
|         {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None}, | ||||
|         {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]}, | ||||
|         {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None}, | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_empty_dataset(dtype, kwargs): | ||||
|     # GH#41575 | ||||
|     df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype) | ||||
|     df["B"] = df["B"].astype(int) | ||||
|     df["C"] = df["C"].astype(float) | ||||
|  | ||||
|     result = df.iloc[:0].groupby("A").describe(**kwargs) | ||||
|     expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.iloc[:0].groupby("A").B.describe(**kwargs) | ||||
|     expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] | ||||
|     expected.index = Index([], dtype=df.columns.dtype) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,255 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     NaT, | ||||
|     Series, | ||||
|     Timedelta, | ||||
|     Timestamp, | ||||
|     date_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_group_shift_with_null_key(): | ||||
|     # This test is designed to replicate the segfault in issue #13813. | ||||
|     n_rows = 1200 | ||||
|  | ||||
|     # Generate a moderately large dataframe with occasional missing | ||||
|     # values in column `B`, and then group by [`A`, `B`]. This should | ||||
|     # force `-1` in `labels` array of `g._grouper.group_info` exactly | ||||
|     # at those places where the group-by key is partially missing. | ||||
|     df = DataFrame( | ||||
|         [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], | ||||
|         dtype=float, | ||||
|         columns=["A", "B", "Z"], | ||||
|         index=None, | ||||
|     ) | ||||
|     g = df.groupby(["A", "B"]) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], | ||||
|         dtype=float, | ||||
|         columns=["Z"], | ||||
|         index=None, | ||||
|     ) | ||||
|     result = g.shift(-1) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
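|  | ||||
|     # For illustration, a minimal reproduction of the same behaviour: | ||||
|     #   DataFrame({"k": [1.0, np.nan, 1.0], "v": [1, 2, 3]}).groupby("k").shift(-1) | ||||
|     # yields v == [3.0, NaN, NaN]; the NaN-keyed middle row belongs to no group, | ||||
|     # so it is excluded from the shift and comes back as NaN. | ||||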
|  | ||||
|  | ||||
| def test_group_shift_with_fill_value(): | ||||
|     # GH #24128 | ||||
|     n_rows = 24 | ||||
|     df = DataFrame( | ||||
|         [(i % 12, i % 3, i) for i in range(n_rows)], | ||||
|         dtype=float, | ||||
|         columns=["A", "B", "Z"], | ||||
|         index=None, | ||||
|     ) | ||||
|     g = df.groupby(["A", "B"]) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], | ||||
|         dtype=float, | ||||
|         columns=["Z"], | ||||
|         index=None, | ||||
|     ) | ||||
|     result = g.shift(-1, fill_value=0) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_group_shift_lose_timezone(): | ||||
|     # GH 30134 | ||||
|     now_dt = Timestamp.utcnow().as_unit("ns") | ||||
|     df = DataFrame({"a": [1, 1], "date": now_dt}) | ||||
|     result = df.groupby("a").shift(0).iloc[0] | ||||
|     expected = Series({"date": now_dt}, name=result.name) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_group_diff_real_series(any_real_numpy_dtype): | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, | ||||
|         dtype=any_real_numpy_dtype, | ||||
|     ) | ||||
|     result = df.groupby("a")["b"].diff() | ||||
|     exp_dtype = "float" | ||||
|     if any_real_numpy_dtype in ["int8", "int16", "float32"]: | ||||
|         exp_dtype = "float32" | ||||
|     expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_group_diff_real_frame(any_real_numpy_dtype): | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [1, 2, 3, 3, 2], | ||||
|             "b": [1, 2, 3, 4, 5], | ||||
|             "c": [1, 2, 3, 4, 6], | ||||
|         }, | ||||
|         dtype=any_real_numpy_dtype, | ||||
|     ) | ||||
|     result = df.groupby("a").diff() | ||||
|     exp_dtype = "float" | ||||
|     if any_real_numpy_dtype in ["int8", "int16", "float32"]: | ||||
|         exp_dtype = "float32" | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "b": [np.nan, np.nan, np.nan, 1.0, 3.0], | ||||
|             "c": [np.nan, np.nan, np.nan, 1.0, 4.0], | ||||
|         }, | ||||
|         dtype=exp_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "data", | ||||
|     [ | ||||
|         [ | ||||
|             Timestamp("2013-01-01"), | ||||
|             Timestamp("2013-01-02"), | ||||
|             Timestamp("2013-01-03"), | ||||
|         ], | ||||
|         [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")], | ||||
|     ], | ||||
| ) | ||||
| def test_group_diff_datetimelike(data, unit): | ||||
|     df = DataFrame({"a": [1, 2, 2], "b": data}) | ||||
|     df["b"] = df["b"].dt.as_unit(unit) | ||||
|     result = df.groupby("a")["b"].diff() | ||||
|     expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_group_diff_bool(): | ||||
|     df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) | ||||
|     result = df.groupby("a")["b"].diff() | ||||
|     expected = Series([np.nan, np.nan, np.nan, False, False], name="b") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_group_diff_object_raises(object_dtype): | ||||
|     df = DataFrame( | ||||
|         {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype | ||||
|     ) | ||||
|     with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"): | ||||
|         df.groupby("a")["b"].diff() | ||||
|  | ||||
|  | ||||
| def test_empty_shift_with_fill(): | ||||
|     # GH 41264, single-index check | ||||
|     df = DataFrame(columns=["a", "b", "c"]) | ||||
|     shifted = df.groupby(["a"]).shift(1) | ||||
|     shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0) | ||||
|     tm.assert_frame_equal(shifted, shifted_with_fill) | ||||
|     tm.assert_index_equal(shifted.index, shifted_with_fill.index) | ||||
|  | ||||
|  | ||||
| def test_multindex_empty_shift_with_fill(): | ||||
|     # GH 41264, multi-index check | ||||
|     df = DataFrame(columns=["a", "b", "c"]) | ||||
|     shifted = df.groupby(["a", "b"]).shift(1) | ||||
|     shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0) | ||||
|     tm.assert_frame_equal(shifted, shifted_with_fill) | ||||
|     tm.assert_index_equal(shifted.index, shifted_with_fill.index) | ||||
|  | ||||
|  | ||||
| def test_shift_periods_freq(): | ||||
|     # GH 54093 | ||||
|     data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} | ||||
|     df = DataFrame(data, index=date_range(start="20100101", periods=6)) | ||||
|     result = df.groupby(df.index).shift(periods=-2, freq="D") | ||||
|     expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6)) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
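|  | ||||
|     # With ``freq`` given, the values stay in place and the index itself is | ||||
|     # shifted by ``periods * freq``; here that is two days back, hence the | ||||
|     # expected frame holds the same data under an index starting 2009-12-30. | ||||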
|  | ||||
|  | ||||
| def test_shift_deprecate_freq_and_fill_value(): | ||||
|     # GH 53832 | ||||
|     data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} | ||||
|     df = DataFrame(data, index=date_range(start="20100101", periods=6)) | ||||
|     msg = ( | ||||
|         "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value" | ||||
|     ) | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1") | ||||
|  | ||||
|  | ||||
| def test_shift_disallow_suffix_if_periods_is_int(): | ||||
|     # GH#44424 | ||||
|     data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} | ||||
|     df = DataFrame(data) | ||||
|     msg = "Cannot specify `suffix` if `periods` is an int." | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.groupby("b").shift(1, suffix="fails") | ||||
|  | ||||
|  | ||||
| def test_group_shift_with_multiple_periods(): | ||||
|     # GH#44424 | ||||
|     df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) | ||||
|  | ||||
|     shifted_df = df.groupby("b")[["a"]].shift([0, 1]) | ||||
|     expected_df = DataFrame( | ||||
|         {"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]} | ||||
|     ) | ||||
|     tm.assert_frame_equal(shifted_df, expected_df) | ||||
|  | ||||
|     # series | ||||
|     shifted_series = df.groupby("b")["a"].shift([0, 1]) | ||||
|     tm.assert_frame_equal(shifted_series, expected_df) | ||||
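|  | ||||
|     # Passing a list of periods returns a DataFrame with one column per period, | ||||
|     # suffixed ``_<period>`` by default (``a_0``, ``a_1`` above); ``suffix`` is | ||||
|     # only valid in this list case, see | ||||
|     # test_shift_disallow_suffix_if_periods_is_int above. | ||||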
|  | ||||
|  | ||||
| def test_group_shift_with_multiple_periods_and_freq(): | ||||
|     # GH#44424 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, | ||||
|         index=date_range("1/1/2000", periods=5, freq="h"), | ||||
|     ) | ||||
|     shifted_df = df.groupby("b")[["a"]].shift( | ||||
|         [0, 1], | ||||
|         freq="h", | ||||
|     ) | ||||
|     expected_df = DataFrame( | ||||
|         { | ||||
|             "a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan], | ||||
|             "a_1": [ | ||||
|                 np.nan, | ||||
|                 1.0, | ||||
|                 2.0, | ||||
|                 3.0, | ||||
|                 4.0, | ||||
|                 5.0, | ||||
|             ], | ||||
|         }, | ||||
|         index=date_range("1/1/2000", periods=6, freq="h"), | ||||
|     ) | ||||
|     tm.assert_frame_equal(shifted_df, expected_df) | ||||
|  | ||||
|  | ||||
| def test_group_shift_with_multiple_periods_and_fill_value(): | ||||
|     # GH#44424 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, | ||||
|     ) | ||||
|     shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1) | ||||
|     expected_df = DataFrame( | ||||
|         {"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]}, | ||||
|     ) | ||||
|     tm.assert_frame_equal(shifted_df, expected_df) | ||||
|  | ||||
|  | ||||
| def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): | ||||
|     # GH#44424 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, | ||||
|         index=date_range("1/1/2000", periods=5, freq="h"), | ||||
|     ) | ||||
|     msg = ( | ||||
|         "Passing a 'freq' together with a 'fill_value' silently ignores the " | ||||
|         "fill_value" | ||||
|     ) | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h") | ||||
| @ -0,0 +1,78 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "in_vals, out_vals", | ||||
|     [ | ||||
|         # Basics: strictly increasing (T), strictly decreasing (F), | ||||
|         # abs val increasing (F), non-strictly increasing (T) | ||||
|         ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), | ||||
|         # Test with inf vals | ||||
|         ( | ||||
|             [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], | ||||
|             [True, False, True, False], | ||||
|         ), | ||||
|         # Test with nan vals; should always be False | ||||
|         ( | ||||
|             [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], | ||||
|             [False, False, False, False], | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_is_monotonic_increasing(in_vals, out_vals): | ||||
|     # GH 17015 | ||||
|     source_dict = { | ||||
|         "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], | ||||
|         "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], | ||||
|         "C": in_vals, | ||||
|     } | ||||
|     df = DataFrame(source_dict) | ||||
|     result = df.groupby("B").C.is_monotonic_increasing | ||||
|     index = Index(list("abcd"), name="B") | ||||
|     expected = Series(index=index, data=out_vals, name="C") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     # Also check result equal to manually taking x.is_monotonic_increasing. | ||||
|     expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "in_vals, out_vals", | ||||
|     [ | ||||
|         # Basics: strictly decreasing (T), strictly increasing (F), | ||||
|         # abs val decreasing (F), non-strictly decreasing (T) | ||||
|         ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), | ||||
|         # Test with inf vals | ||||
|         ( | ||||
|             [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], | ||||
|             [True, True, False, True], | ||||
|         ), | ||||
|         # Test with nan vals; should always be False | ||||
|         ( | ||||
|             [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], | ||||
|             [False, False, False, False], | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_is_monotonic_decreasing(in_vals, out_vals): | ||||
|     # GH 17015 | ||||
|     source_dict = { | ||||
|         "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], | ||||
|         "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], | ||||
|         "C": in_vals, | ||||
|     } | ||||
|  | ||||
|     df = DataFrame(source_dict) | ||||
|     result = df.groupby("B").C.is_monotonic_decreasing | ||||
|     index = Index(list("abcd"), name="B") | ||||
|     expected = Series(index=index, data=out_vals, name="C") | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,115 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     date_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_nlargest(): | ||||
|     a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) | ||||
|     b = Series(list("a" * 5 + "b" * 5)) | ||||
|     gb = a.groupby(b) | ||||
|     r = gb.nlargest(3) | ||||
|     e = Series( | ||||
|         [7, 5, 3, 10, 9, 6], | ||||
|         index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), | ||||
|     ) | ||||
|     tm.assert_series_equal(r, e) | ||||
|  | ||||
|     a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) | ||||
|     gb = a.groupby(b) | ||||
|     e = Series( | ||||
|         [3, 2, 1, 3, 3, 2], | ||||
|         index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), | ||||
|     ) | ||||
|     tm.assert_series_equal(gb.nlargest(3, keep="last"), e) | ||||
|  | ||||
|  | ||||
| def test_nlargest_mi_grouper(): | ||||
|     # see gh-21411 | ||||
|     npr = np.random.default_rng(2) | ||||
|  | ||||
|     dts = date_range("20180101", periods=10) | ||||
|     iterables = [dts, ["one", "two"]] | ||||
|  | ||||
|     idx = MultiIndex.from_product(iterables, names=["first", "second"]) | ||||
|     s = Series(npr.standard_normal(20), index=idx) | ||||
|  | ||||
|     result = s.groupby("first").nlargest(1) | ||||
|  | ||||
|     exp_idx = MultiIndex.from_tuples( | ||||
|         [ | ||||
|             (dts[0], dts[0], "one"), | ||||
|             (dts[1], dts[1], "one"), | ||||
|             (dts[2], dts[2], "one"), | ||||
|             (dts[3], dts[3], "two"), | ||||
|             (dts[4], dts[4], "one"), | ||||
|             (dts[5], dts[5], "one"), | ||||
|             (dts[6], dts[6], "one"), | ||||
|             (dts[7], dts[7], "one"), | ||||
|             (dts[8], dts[8], "one"), | ||||
|             (dts[9], dts[9], "one"), | ||||
|         ], | ||||
|         names=["first", "first", "second"], | ||||
|     ) | ||||
|  | ||||
|     exp_values = [ | ||||
|         0.18905338179353307, | ||||
|         -0.41306354339189344, | ||||
|         1.799707382720902, | ||||
|         0.7738065867276614, | ||||
|         0.28121066979764925, | ||||
|         0.9775674511260357, | ||||
|         -0.3288239040579627, | ||||
|         0.45495807124085547, | ||||
|         0.5452887139646817, | ||||
|         0.12682784711186987, | ||||
|     ] | ||||
|  | ||||
|     expected = Series(exp_values, index=exp_idx) | ||||
|     tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) | ||||
|  | ||||
|  | ||||
| def test_nsmallest(): | ||||
|     a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) | ||||
|     b = Series(list("a" * 5 + "b" * 5)) | ||||
|     gb = a.groupby(b) | ||||
|     r = gb.nsmallest(3) | ||||
|     e = Series( | ||||
|         [1, 2, 3, 0, 4, 6], | ||||
|         index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), | ||||
|     ) | ||||
|     tm.assert_series_equal(r, e) | ||||
|  | ||||
|     a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) | ||||
|     gb = a.groupby(b) | ||||
|     e = Series( | ||||
|         [0, 1, 1, 0, 1, 2], | ||||
|         index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), | ||||
|     ) | ||||
|     tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "data, groups", | ||||
|     [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], | ||||
| ) | ||||
| @pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) | ||||
| @pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) | ||||
| def test_nlargest_and_smallest_noop(data, groups, dtype, method): | ||||
|     # GH 15272, GH 16345, GH 29129 | ||||
|     # Test nlargest/smallest when it results in a noop, | ||||
|     # i.e. input is sorted and group size <= n | ||||
|     if dtype is not None: | ||||
|         data = np.array(data, dtype=dtype) | ||||
|     if method == "nlargest": | ||||
|         data = list(reversed(data)) | ||||
|     ser = Series(data, name="a") | ||||
|     result = getattr(ser.groupby(groups), method)(n=2) | ||||
|     expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups | ||||
|     expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,922 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     Timestamp, | ||||
|     isna, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_first_last_nth(df): | ||||
|     # tests for first / last / nth | ||||
|     grouped = df.groupby("A") | ||||
|     first = grouped.first() | ||||
|     expected = df.loc[[1, 0], ["B", "C", "D"]] | ||||
|     expected.index = Index(["bar", "foo"], name="A") | ||||
|     expected = expected.sort_index() | ||||
|     tm.assert_frame_equal(first, expected) | ||||
|  | ||||
|     nth = grouped.nth(0) | ||||
|     expected = df.loc[[0, 1]] | ||||
|     tm.assert_frame_equal(nth, expected) | ||||
|  | ||||
|     last = grouped.last() | ||||
|     expected = df.loc[[5, 7], ["B", "C", "D"]] | ||||
|     expected.index = Index(["bar", "foo"], name="A") | ||||
|     tm.assert_frame_equal(last, expected) | ||||
|  | ||||
|     nth = grouped.nth(-1) | ||||
|     expected = df.iloc[[5, 7]] | ||||
|     tm.assert_frame_equal(nth, expected) | ||||
|  | ||||
|     nth = grouped.nth(1) | ||||
|     expected = df.iloc[[2, 3]] | ||||
|     tm.assert_frame_equal(nth, expected) | ||||
|  | ||||
|     # it works! | ||||
|     grouped["B"].first() | ||||
|     grouped["B"].last() | ||||
|     grouped["B"].nth(0) | ||||
|  | ||||
|     df = df.copy() | ||||
|     df.loc[df["A"] == "foo", "B"] = np.nan | ||||
|     grouped = df.groupby("A") | ||||
|     assert isna(grouped["B"].first()["foo"]) | ||||
|     assert isna(grouped["B"].last()["foo"]) | ||||
|     assert isna(grouped["B"].nth(0).iloc[0]) | ||||
|  | ||||
|     # v0.14.0 whatsnew | ||||
|     df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) | ||||
|     g = df.groupby("A") | ||||
|     result = g.first() | ||||
|     expected = df.iloc[[1, 2]].set_index("A") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     expected = df.iloc[[1, 2]] | ||||
|     result = g.nth(0, dropna="any") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["first", "last"]) | ||||
| def test_first_last_with_na_object(method, nulls_fixture): | ||||
|     # https://github.com/pandas-dev/pandas/issues/32123 | ||||
|     groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a") | ||||
|     result = getattr(groups, method)() | ||||
|  | ||||
|     if method == "first": | ||||
|         values = [1, 3] | ||||
|     else: | ||||
|         values = [2, 3] | ||||
|  | ||||
|     values = np.array(values, dtype=result["b"].dtype) | ||||
|     idx = Index([1, 2], name="a") | ||||
|     expected = DataFrame({"b": values}, index=idx) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("index", [0, -1]) | ||||
| def test_nth_with_na_object(index, nulls_fixture): | ||||
|     # https://github.com/pandas-dev/pandas/issues/32123 | ||||
|     df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}) | ||||
|     groups = df.groupby("a") | ||||
|     result = groups.nth(index) | ||||
|     expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["first", "last"]) | ||||
| def test_first_last_with_None(method): | ||||
|     # https://github.com/pandas-dev/pandas/issues/32800 | ||||
|     # None should be preserved as object dtype | ||||
|     df = DataFrame.from_dict({"id": ["a"], "value": [None]}) | ||||
|     groups = df.groupby("id", as_index=False) | ||||
|     result = getattr(groups, method)() | ||||
|  | ||||
|     tm.assert_frame_equal(result, df) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["first", "last"]) | ||||
| @pytest.mark.parametrize( | ||||
|     "df, expected", | ||||
|     [ | ||||
|         ( | ||||
|             DataFrame({"id": "a", "value": [None, "foo", np.nan]}), | ||||
|             DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")), | ||||
|         ), | ||||
|         ( | ||||
|             DataFrame({"id": "a", "value": [np.nan]}, dtype=object), | ||||
|             DataFrame({"value": [None]}, index=Index(["a"], name="id")), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_first_last_with_None_expanded(method, df, expected): | ||||
|     # GH 32800, 38286 | ||||
|     result = getattr(df.groupby("id"), method)() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_first_last_nth_dtypes(): | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "three", "two", "two", "one", "three"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8), | ||||
|             "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), | ||||
|         } | ||||
|     ) | ||||
|     df["E"] = True | ||||
|     df["F"] = 1 | ||||
|  | ||||
|     # tests for first / last / nth | ||||
|     grouped = df.groupby("A") | ||||
|     first = grouped.first() | ||||
|     expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]] | ||||
|     expected.index = Index(["bar", "foo"], name="A") | ||||
|     expected = expected.sort_index() | ||||
|     tm.assert_frame_equal(first, expected) | ||||
|  | ||||
|     last = grouped.last() | ||||
|     expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]] | ||||
|     expected.index = Index(["bar", "foo"], name="A") | ||||
|     expected = expected.sort_index() | ||||
|     tm.assert_frame_equal(last, expected) | ||||
|  | ||||
|     nth = grouped.nth(1) | ||||
|     expected = df.iloc[[2, 3]] | ||||
|     tm.assert_frame_equal(nth, expected) | ||||
|  | ||||
|  | ||||
| def test_first_last_nth_dtypes2(): | ||||
|     # GH 2763, first/last shifting dtypes | ||||
|     idx = list(range(10)) | ||||
|     idx.append(9) | ||||
|     ser = Series(data=range(11), index=idx, name="IntCol") | ||||
|     assert ser.dtype == "int64" | ||||
|     f = ser.groupby(level=0).first() | ||||
|     assert f.dtype == "int64" | ||||
|  | ||||
|  | ||||
| def test_first_last_nth_nan_dtype(): | ||||
|     # GH 33591 | ||||
|     df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)}) | ||||
|     grouped = df.groupby("data") | ||||
|  | ||||
|     expected = df.set_index("data").nans | ||||
|     tm.assert_series_equal(grouped.nans.first(), expected) | ||||
|     tm.assert_series_equal(grouped.nans.last(), expected) | ||||
|  | ||||
|     expected = df.nans | ||||
|     tm.assert_series_equal(grouped.nans.nth(-1), expected) | ||||
|     tm.assert_series_equal(grouped.nans.nth(0), expected) | ||||
|  | ||||
|  | ||||
| def test_first_strings_timestamps(): | ||||
|     # GH 11244 | ||||
|     test = DataFrame( | ||||
|         { | ||||
|             Timestamp("2012-01-01 00:00:00"): ["a", "b"], | ||||
|             Timestamp("2012-01-02 00:00:00"): ["c", "d"], | ||||
|             "name": ["e", "e"], | ||||
|             "aaaa": ["f", "g"], | ||||
|         } | ||||
|     ) | ||||
|     result = test.groupby("name").first() | ||||
|     expected = DataFrame( | ||||
|         [["a", "c", "f"]], | ||||
|         columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), | ||||
|         index=Index(["e"], name="name"), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth(): | ||||
|     df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) | ||||
|     gb = df.groupby("A") | ||||
|  | ||||
|     tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]]) | ||||
|     tm.assert_frame_equal(gb.nth(1), df.iloc[[1]]) | ||||
|     tm.assert_frame_equal(gb.nth(2), df.loc[[]]) | ||||
|     tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]]) | ||||
|     tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]]) | ||||
|     tm.assert_frame_equal(gb.nth(-3), df.loc[[]]) | ||||
|     tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]]) | ||||
|     tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]]) | ||||
|     tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]]) | ||||
|  | ||||
|     tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]]) | ||||
|     tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]]) | ||||
|  | ||||
|     tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0]) | ||||
|     tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0]) | ||||
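|  | ||||
|     # Note that ``nth`` acts as a filter here: it returns the selected rows with | ||||
|     # their original index (hence the ``df.iloc[...]`` expectations), whereas | ||||
|     # ``first``/``last`` aggregate onto the group keys. | ||||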
|  | ||||
|  | ||||
| def test_nth2(): | ||||
|     # out of bounds, regression from 0.13.1 | ||||
|     # GH 6621 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"}, | ||||
|             "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"}, | ||||
|             "two": { | ||||
|                 0: 1.5456590000000001, | ||||
|                 1: -0.070345000000000005, | ||||
|                 2: -2.4004539999999999, | ||||
|                 3: 0.46206000000000003, | ||||
|                 4: 0.52350799999999997, | ||||
|             }, | ||||
|             "one": { | ||||
|                 0: 0.56573799999999996, | ||||
|                 1: -0.9742360000000001, | ||||
|                 2: 1.033801, | ||||
|                 3: -0.78543499999999999, | ||||
|                 4: 0.70422799999999997, | ||||
|             }, | ||||
|         } | ||||
|     ).set_index(["color", "food"]) | ||||
|  | ||||
|     result = df.groupby(level=0, as_index=False).nth(2) | ||||
|     expected = df.iloc[[-1]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(level=0, as_index=False).nth(3) | ||||
|     expected = df.loc[[]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth3(): | ||||
|     # GH 7559 | ||||
|     # from the vbench | ||||
|     df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64") | ||||
|     ser = df[1] | ||||
|     gb = df[0] | ||||
|     expected = ser.groupby(gb).first() | ||||
|     expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0]) | ||||
|     tm.assert_series_equal(expected2, expected, check_names=False) | ||||
|     assert expected.name == 1 | ||||
|     assert expected2.name == 1 | ||||
|  | ||||
|     # validate first | ||||
|     v = ser[gb == 1].iloc[0] | ||||
|     assert expected.iloc[0] == v | ||||
|     assert expected2.iloc[0] == v | ||||
|  | ||||
|     with pytest.raises(ValueError, match="For a DataFrame"): | ||||
|         ser.groupby(gb, sort=False).nth(0, dropna=True) | ||||
|  | ||||
|  | ||||
| def test_nth4(): | ||||
|     # doc example | ||||
|     df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) | ||||
|     gb = df.groupby("A") | ||||
|     result = gb.B.nth(0, dropna="all") | ||||
|     expected = df.B.iloc[[1, 2]] | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth5(): | ||||
|     # test multiple nth values | ||||
|     df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) | ||||
|     gb = df.groupby("A") | ||||
|  | ||||
|     tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]]) | ||||
|     tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]]) | ||||
|     tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]]) | ||||
|     tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]]) | ||||
|     tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]]) | ||||
|     tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]]) | ||||
|     tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]]) | ||||
|     tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]]) | ||||
|  | ||||
|  | ||||
| def test_nth_bdays(unit): | ||||
|     business_dates = pd.date_range( | ||||
|         start="4/1/2014", end="6/30/2014", freq="B", unit=unit | ||||
|     ) | ||||
|     df = DataFrame(1, index=business_dates, columns=["a", "b"]) | ||||
|     # get the first, fourth and last two business days for each month | ||||
|     key = [df.index.year, df.index.month] | ||||
|     result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) | ||||
|     expected_dates = pd.to_datetime( | ||||
|         [ | ||||
|             "2014/4/1", | ||||
|             "2014/4/4", | ||||
|             "2014/4/29", | ||||
|             "2014/4/30", | ||||
|             "2014/5/1", | ||||
|             "2014/5/6", | ||||
|             "2014/5/29", | ||||
|             "2014/5/30", | ||||
|             "2014/6/2", | ||||
|             "2014/6/5", | ||||
|             "2014/6/27", | ||||
|             "2014/6/30", | ||||
|         ] | ||||
|     ).as_unit(unit) | ||||
|     expected = DataFrame(1, columns=["a", "b"], index=expected_dates) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth_multi_grouper(three_group): | ||||
|     # PR 9090, related to issue 8979 | ||||
|     # test nth on multiple groupers | ||||
|     grouped = three_group.groupby(["A", "B"]) | ||||
|     result = grouped.nth(0) | ||||
|     expected = three_group.iloc[[0, 3, 4, 7]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "data, expected_first, expected_last", | ||||
|     [ | ||||
|         ( | ||||
|             { | ||||
|                 "id": ["A"], | ||||
|                 "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 "foo": [1], | ||||
|             }, | ||||
|             { | ||||
|                 "id": ["A"], | ||||
|                 "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 "foo": [1], | ||||
|             }, | ||||
|             { | ||||
|                 "id": ["A"], | ||||
|                 "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 "foo": [1], | ||||
|             }, | ||||
|         ), | ||||
|         ( | ||||
|             { | ||||
|                 "id": ["A", "B", "A"], | ||||
|                 "time": [ | ||||
|                     Timestamp("2012-01-01 13:00:00", tz="America/New_York"), | ||||
|                     Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                     Timestamp("2012-03-01 12:00:00", tz="Europe/London"), | ||||
|                 ], | ||||
|                 "foo": [1, 2, 3], | ||||
|             }, | ||||
|             { | ||||
|                 "id": ["A", "B"], | ||||
|                 "time": [ | ||||
|                     Timestamp("2012-01-01 13:00:00", tz="America/New_York"), | ||||
|                     Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 ], | ||||
|                 "foo": [1, 2], | ||||
|             }, | ||||
|             { | ||||
|                 "id": ["A", "B"], | ||||
|                 "time": [ | ||||
|                     Timestamp("2012-03-01 12:00:00", tz="Europe/London"), | ||||
|                     Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 ], | ||||
|                 "foo": [3, 2], | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_first_last_tz(data, expected_first, expected_last): | ||||
|     # GH15884 | ||||
|     # Test that the timezone is retained when calling first | ||||
|     # or last on groupby with as_index=False | ||||
|  | ||||
|     df = DataFrame(data) | ||||
|  | ||||
|     result = df.groupby("id", as_index=False).first() | ||||
|     expected = DataFrame(expected_first) | ||||
|     cols = ["id", "time", "foo"] | ||||
|     tm.assert_frame_equal(result[cols], expected[cols]) | ||||
|  | ||||
|     result = df.groupby("id", as_index=False)["time"].first() | ||||
|     tm.assert_frame_equal(result, expected[["id", "time"]]) | ||||
|  | ||||
|     result = df.groupby("id", as_index=False).last() | ||||
|     expected = DataFrame(expected_last) | ||||
|     cols = ["id", "time", "foo"] | ||||
|     tm.assert_frame_equal(result[cols], expected[cols]) | ||||
|  | ||||
|     result = df.groupby("id", as_index=False)["time"].last() | ||||
|     tm.assert_frame_equal(result, expected[["id", "time"]]) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "method, ts, alpha", | ||||
|     [ | ||||
|         ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"], | ||||
|         ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], | ||||
|     ], | ||||
| ) | ||||
| def test_first_last_tz_multi_column(method, ts, alpha, unit): | ||||
|     # GH 21603 | ||||
|     category_string = Series(list("abc")).astype("category") | ||||
|     dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit) | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "group": [1, 1, 2], | ||||
|             "category_string": category_string, | ||||
|             "datetimetz": dti, | ||||
|         } | ||||
|     ) | ||||
|     result = getattr(df.groupby("group"), method)() | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "category_string": pd.Categorical( | ||||
|                 [alpha, "c"], dtype=category_string.dtype | ||||
|             ), | ||||
|             "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], | ||||
|         }, | ||||
|         index=Index([1, 2], name="group"), | ||||
|     ) | ||||
|     expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "values", | ||||
|     [ | ||||
|         pd.array([True, False], dtype="boolean"), | ||||
|         pd.array([1, 2], dtype="Int64"), | ||||
|         pd.to_datetime(["2020-01-01", "2020-02-01"]), | ||||
|         pd.to_timedelta([1, 2], unit="D"), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("function", ["first", "last", "min", "max"]) | ||||
| def test_first_last_extension_array_keeps_dtype(values, function): | ||||
|     # https://github.com/pandas-dev/pandas/issues/33071 | ||||
|     # https://github.com/pandas-dev/pandas/issues/32194 | ||||
|     df = DataFrame({"a": [1, 2], "b": values}) | ||||
|     grouped = df.groupby("a") | ||||
|     idx = Index([1, 2], name="a") | ||||
|     expected_series = Series(values, name="b", index=idx) | ||||
|     expected_frame = DataFrame({"b": values}, index=idx) | ||||
|  | ||||
|     result_series = getattr(grouped["b"], function)() | ||||
|     tm.assert_series_equal(result_series, expected_series) | ||||
|  | ||||
|     result_frame = grouped.agg({"b": function}) | ||||
|     tm.assert_frame_equal(result_frame, expected_frame) | ||||
|  | ||||
|  | ||||
| def test_nth_multi_index_as_expected(): | ||||
|     # PR 9090, related to issue 8979 | ||||
|     # test nth with multiple grouping keys | ||||
|     three_group = DataFrame( | ||||
|         { | ||||
|             "A": [ | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|             ], | ||||
|             "B": [ | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "two", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "two", | ||||
|                 "two", | ||||
|                 "two", | ||||
|                 "one", | ||||
|             ], | ||||
|             "C": [ | ||||
|                 "dull", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "dull", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|             ], | ||||
|         } | ||||
|     ) | ||||
|     grouped = three_group.groupby(["A", "B"]) | ||||
|     result = grouped.nth(0) | ||||
|     expected = three_group.iloc[[0, 3, 4, 7]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "op, n, expected_rows", | ||||
|     [ | ||||
|         ("head", -1, [0]), | ||||
|         ("head", 0, []), | ||||
|         ("head", 1, [0, 2]), | ||||
|         ("head", 7, [0, 1, 2]), | ||||
|         ("tail", -1, [1]), | ||||
|         ("tail", 0, []), | ||||
|         ("tail", 1, [1, 2]), | ||||
|         ("tail", 7, [0, 1, 2]), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]]) | ||||
| @pytest.mark.parametrize("as_index", [True, False]) | ||||
| def test_groupby_head_tail(op, n, expected_rows, columns, as_index): | ||||
|     df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) | ||||
|     g = df.groupby("A", as_index=as_index) | ||||
|     expected = df.iloc[expected_rows] | ||||
|     if columns is not None: | ||||
|         g = g[columns] | ||||
|         expected = expected[columns] | ||||
|     result = getattr(g, op)(n) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "op, n, expected_cols", | ||||
|     [ | ||||
|         ("head", -1, [0]), | ||||
|         ("head", 0, []), | ||||
|         ("head", 1, [0, 2]), | ||||
|         ("head", 7, [0, 1, 2]), | ||||
|         ("tail", -1, [1]), | ||||
|         ("tail", 0, []), | ||||
|         ("tail", 1, [1, 2]), | ||||
|         ("tail", 7, [0, 1, 2]), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_head_tail_axis_1(op, n, expected_cols): | ||||
|     # GH 9772 | ||||
|     df = DataFrame( | ||||
|         [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"] | ||||
|     ) | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         g = df.groupby([0, 0, 1], axis=1) | ||||
|     expected = df.iloc[:, expected_cols] | ||||
|     result = getattr(g, op)(n) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_group_selection_cache(): | ||||
|     # GH 12839 nth, head, and tail should return same result consistently | ||||
|     df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) | ||||
|     expected = df.iloc[[0, 2]] | ||||
|  | ||||
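|     # Exercise head/tail/nth in every ordering so a cached group | ||||
|     # selection from one call cannot leak into the next. | ||||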
|     g = df.groupby("A") | ||||
|     result1 = g.head(n=2) | ||||
|     result2 = g.nth(0) | ||||
|     tm.assert_frame_equal(result1, df) | ||||
|     tm.assert_frame_equal(result2, expected) | ||||
|  | ||||
|     g = df.groupby("A") | ||||
|     result1 = g.tail(n=2) | ||||
|     result2 = g.nth(0) | ||||
|     tm.assert_frame_equal(result1, df) | ||||
|     tm.assert_frame_equal(result2, expected) | ||||
|  | ||||
|     g = df.groupby("A") | ||||
|     result1 = g.nth(0) | ||||
|     result2 = g.head(n=2) | ||||
|     tm.assert_frame_equal(result1, expected) | ||||
|     tm.assert_frame_equal(result2, df) | ||||
|  | ||||
|     g = df.groupby("A") | ||||
|     result1 = g.nth(0) | ||||
|     result2 = g.tail(n=2) | ||||
|     tm.assert_frame_equal(result1, expected) | ||||
|     tm.assert_frame_equal(result2, df) | ||||
|  | ||||
|  | ||||
| def test_nth_empty(): | ||||
|     # GH 16064 | ||||
|     df = DataFrame(index=[0], columns=["a", "b", "c"]) | ||||
|     result = df.groupby("a").nth(10) | ||||
|     expected = df.iloc[:0] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(["a", "b"]).nth(10) | ||||
|     expected = df.iloc[:0] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth_column_order(): | ||||
|     # GH 20760 | ||||
|     # Check that nth preserves column order | ||||
|     df = DataFrame( | ||||
|         [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]], | ||||
|         columns=["A", "C", "B"], | ||||
|     ) | ||||
|     result = df.groupby("A").nth(0) | ||||
|     expected = df.iloc[[0, 3]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("A").nth(-1, dropna="any") | ||||
|     expected = df.iloc[[1, 4]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dropna", [None, "any", "all"]) | ||||
| def test_nth_nan_in_grouper(dropna): | ||||
|     # GH 26011 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [np.nan, "a", np.nan, "b", np.nan], | ||||
|             "b": [0, 2, 4, 6, 8], | ||||
|             "c": [1, 3, 5, 7, 9], | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("a").nth(0, dropna=dropna) | ||||
|     expected = df.iloc[[1, 3]] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dropna", [None, "any", "all"]) | ||||
| def test_nth_nan_in_grouper_series(dropna): | ||||
|     # GH 26454 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [np.nan, "a", np.nan, "b", np.nan], | ||||
|             "b": [0, 2, 4, 6, 8], | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("a")["b"].nth(0, dropna=dropna) | ||||
|     expected = df["b"].iloc[[1, 3]] | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_first_categorical_and_datetime_data_nat(): | ||||
|     # GH 20520 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "group": ["first", "first", "second", "third", "third"], | ||||
|             "time": 5 * [np.datetime64("NaT")], | ||||
|             "categories": Series(["a", "b", "c", "a", "b"], dtype="category"), | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("group").first() | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "time": 3 * [np.datetime64("NaT")], | ||||
|             "categories": Series(["a", "c", "a"]).astype( | ||||
|                 pd.CategoricalDtype(["a", "b", "c"]) | ||||
|             ), | ||||
|         } | ||||
|     ) | ||||
|     expected.index = Index(["first", "second", "third"], name="group") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_first_multi_key_groupby_categorical(): | ||||
|     # GH 22512 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": [1, 1, 1, 2, 2], | ||||
|             "B": [100, 100, 200, 100, 100], | ||||
|             "C": ["apple", "orange", "mango", "mango", "orange"], | ||||
|             "D": ["jupiter", "mercury", "mars", "venus", "venus"], | ||||
|         } | ||||
|     ) | ||||
|     df = df.astype({"D": "category"}) | ||||
|     result = df.groupby(by=["A", "B"]).first() | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "C": ["apple", "mango", "mango"], | ||||
|             "D": Series(["jupiter", "mars", "venus"]).astype( | ||||
|                 pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"]) | ||||
|             ), | ||||
|         } | ||||
|     ) | ||||
|     expected.index = MultiIndex.from_tuples( | ||||
|         [(1, 100), (1, 200), (2, 100)], names=["A", "B"] | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["first", "last", "nth"]) | ||||
| def test_groupby_last_first_nth_with_none(method, nulls_fixture): | ||||
|     # GH29645 | ||||
|     expected = Series(["y"], dtype=object) | ||||
|     data = Series( | ||||
|         [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], | ||||
|         index=[0, 0, 0, 0, 0], | ||||
|         dtype=object, | ||||
|     ).groupby(level=0) | ||||
|  | ||||
|     if method == "nth": | ||||
|         result = getattr(data, method)(3) | ||||
|     else: | ||||
|         result = getattr(data, method)() | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "arg, expected_rows", | ||||
|     [ | ||||
|         [slice(None, 3, 2), [0, 1, 4, 5]], | ||||
|         [slice(None, -2), [0, 2, 5]], | ||||
|         [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], | ||||
|         [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], | ||||
|     ], | ||||
| ) | ||||
| def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): | ||||
|     # Test slices, GH #42947 | ||||
|  | ||||
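|     # slice_test_df and slice_test_grouped are shared fixtures provided | ||||
|     # by the test suite's conftest. | ||||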
|     result = slice_test_grouped.nth[arg] | ||||
|     equivalent = slice_test_grouped.nth(arg) | ||||
|     expected = slice_test_df.iloc[expected_rows] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|     tm.assert_frame_equal(equivalent, expected) | ||||
|  | ||||
|  | ||||
| def test_nth_indexed(slice_test_df, slice_test_grouped): | ||||
|     # Test index notation, GH #44688 | ||||
|  | ||||
|     result = slice_test_grouped.nth[0, 1, -2:] | ||||
|     equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)]) | ||||
|     expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|     tm.assert_frame_equal(equivalent, expected) | ||||
|  | ||||
|  | ||||
| def test_invalid_argument(slice_test_grouped): | ||||
|     # Test for error on invalid argument | ||||
|  | ||||
|     with pytest.raises(TypeError, match="Invalid index"): | ||||
|         slice_test_grouped.nth(3.14) | ||||
|  | ||||
|  | ||||
| def test_negative_step(slice_test_grouped): | ||||
|     # Test for error on negative slice step | ||||
|  | ||||
|     with pytest.raises(ValueError, match="Invalid step"): | ||||
|         slice_test_grouped.nth(slice(None, None, -1)) | ||||
|  | ||||
|  | ||||
| def test_np_ints(slice_test_df, slice_test_grouped): | ||||
|     # Test that numpy integer positions work | ||||
|  | ||||
|     result = slice_test_grouped.nth(np.array([0, 1])) | ||||
|     expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_nth_with_column_axis(): | ||||
|     # GH43926 | ||||
|     df = DataFrame( | ||||
|         [ | ||||
|             [4, 5, 6], | ||||
|             [8, 8, 7], | ||||
|         ], | ||||
|         index=["z", "y"], | ||||
|         columns=["C", "B", "A"], | ||||
|     ) | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby(df.iloc[1], axis=1) | ||||
|     result = gb.nth(0) | ||||
|     expected = df.iloc[:, [0, 2]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_nth_interval(): | ||||
|     # GH#24205 | ||||
|     idx_result = MultiIndex( | ||||
|         [ | ||||
|             pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), | ||||
|             pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), | ||||
|         ], | ||||
|         [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]], | ||||
|     ) | ||||
|     df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result) | ||||
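|     # In the second level's codes, -1 marks a missing label, so row 4 has | ||||
|     # no complete group key and is dropped; nth(0) thus takes rows 0, 1 | ||||
|     # and 3. | ||||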
|     result = df_result.groupby(level=[0, 1], observed=False).nth(0) | ||||
|     val_expected = [0, 1, 3] | ||||
|     idx_expected = MultiIndex( | ||||
|         [ | ||||
|             pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), | ||||
|             pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), | ||||
|         ], | ||||
|         [[0, 0, 1], [0, 1, 0]], | ||||
|     ) | ||||
|     expected = DataFrame(val_expected, index=idx_expected, columns=["col"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "start, stop, expected_values, expected_columns", | ||||
|     [ | ||||
|         (None, None, [0, 1, 2, 3, 4], list("ABCDE")), | ||||
|         (None, 1, [0, 3], list("AD")), | ||||
|         (None, 9, [0, 1, 2, 3, 4], list("ABCDE")), | ||||
|         (None, -1, [0, 1, 3], list("ABD")), | ||||
|         (1, None, [1, 2, 4], list("BCE")), | ||||
|         (1, -1, [1], list("B")), | ||||
|         (-1, None, [2, 4], list("CE")), | ||||
|         (-1, 2, [4], list("E")), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("method", ["call", "index"]) | ||||
| def test_nth_slices_with_column_axis( | ||||
|     start, stop, expected_values, expected_columns, method | ||||
| ): | ||||
|     df = DataFrame([range(5)], columns=[list("ABCDE")]) | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby([5, 5, 5, 6, 6], axis=1) | ||||
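|     # Exercise both spellings: the positional nth(slice(...)) call and | ||||
|     # the nth[start:stop] indexer, which must agree. | ||||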
|     result = { | ||||
|         "call": lambda start, stop: gb.nth(slice(start, stop)), | ||||
|         "index": lambda start, stop: gb.nth[start:stop], | ||||
|     }[method](start, stop) | ||||
|     expected = DataFrame([expected_values], columns=[expected_columns]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings( | ||||
|     "ignore:invalid value encountered in remainder:RuntimeWarning" | ||||
| ) | ||||
| def test_head_tail_dropna_true(): | ||||
|     # GH#45089 | ||||
|     df = DataFrame( | ||||
|         [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"] | ||||
|     ) | ||||
|     expected = DataFrame([["a", "z"]], columns=["X", "Y"]) | ||||
|  | ||||
|     result = df.groupby(["X", "Y"]).head(n=1) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(["X", "Y"]).tail(n=1) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(["X", "Y"]).nth(n=0) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_head_tail_dropna_false(): | ||||
|     # GH#45089 | ||||
|     df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"]) | ||||
|     expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"]) | ||||
|  | ||||
|     result = df.groupby(["X", "Y"], dropna=False).head(n=1) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(["X", "Y"], dropna=False).tail(n=1) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(["X", "Y"], dropna=False).nth(n=0) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"])) | ||||
| @pytest.mark.parametrize("dropna", ["any", "all", None]) | ||||
| def test_nth_after_selection(selection, dropna): | ||||
|     # GH#11038, GH#53518 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [1, 1, 2], | ||||
|             "b": [np.nan, 3, 4], | ||||
|             "c": [5, 6, 7], | ||||
|         } | ||||
|     ) | ||||
|     gb = df.groupby("a")[selection] | ||||
|     result = gb.nth(0, dropna=dropna) | ||||
|     if dropna == "any" or (dropna == "all" and selection != ["b", "c"]): | ||||
|         locs = [1, 2] | ||||
|     else: | ||||
|         locs = [0, 2] | ||||
|     expected = df.loc[locs, selection] | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "data", | ||||
|     [ | ||||
|         ( | ||||
|             Timestamp("2011-01-15 12:50:28.502376"), | ||||
|             Timestamp("2011-01-20 12:50:28.593448"), | ||||
|         ), | ||||
|         (24650000000000001, 24650000000000002), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_nth_int_like_precision(data): | ||||
|     # GH#6620, GH#9311 | ||||
|     df = DataFrame({"a": [1, 1], "b": data}) | ||||
|  | ||||
|     grouped = df.groupby("a") | ||||
|     result = grouped.nth(0) | ||||
|     expected = DataFrame({"a": 1, "b": [data[0]]}) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,496 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] | ||||
| ) | ||||
| @pytest.mark.parametrize( | ||||
|     "a_vals,b_vals", | ||||
|     [ | ||||
|         # Ints | ||||
|         ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), | ||||
|         ([1, 2, 3, 4], [4, 3, 2, 1]), | ||||
|         ([1, 2, 3, 4, 5], [4, 3, 2, 1]), | ||||
|         # Floats | ||||
|         ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]), | ||||
|         # Missing data | ||||
|         ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]), | ||||
|         ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), | ||||
|         # Timestamps | ||||
|         ( | ||||
|             pd.date_range("1/1/18", freq="D", periods=5), | ||||
|             pd.date_range("1/1/18", freq="D", periods=5)[::-1], | ||||
|         ), | ||||
|         ( | ||||
|             pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"), | ||||
|             pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"), | ||||
|         ), | ||||
|         # All NA | ||||
|         ([np.nan] * 5, [np.nan] * 5), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1]) | ||||
| def test_quantile(interpolation, a_vals, b_vals, q, request): | ||||
|     if ( | ||||
|         interpolation == "nearest" | ||||
|         and q == 0.5 | ||||
|         and isinstance(b_vals, list) | ||||
|         and b_vals == [4, 3, 2, 1] | ||||
|     ): | ||||
|         request.applymarker( | ||||
|             pytest.mark.xfail( | ||||
|                 reason="Unclear numpy expectation for nearest " | ||||
|                 "result with equidistant data" | ||||
|             ) | ||||
|         ) | ||||
|     all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)]) | ||||
|  | ||||
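|     # Compute the expected answer per group with Series.quantile, so the | ||||
|     # grouped result is checked against the ungrouped implementation. | ||||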
|     a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) | ||||
|     b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) | ||||
|  | ||||
|     df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals}) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key") | ||||
|     ) | ||||
|     if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M": | ||||
|         # TODO(non-nano): this should be unnecessary once array_to_datetime | ||||
|         #  correctly infers non-nano from Timestamp.unit | ||||
|         expected = expected.astype(all_vals.dtype) | ||||
|     result = df.groupby("key").quantile(q, interpolation=interpolation) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_quantile_array(): | ||||
|     # https://github.com/pandas-dev/pandas/issues/27526 | ||||
|     df = DataFrame({"A": [0, 1, 2, 3, 4]}) | ||||
|     key = np.array([0, 0, 1, 1, 1], dtype=np.int64) | ||||
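|     # A list-like q yields one row per (group, quantile) pair, indexed | ||||
|     # by a MultiIndex. | ||||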
|     result = df.groupby(key).quantile([0.25]) | ||||
|  | ||||
|     index = pd.MultiIndex.from_product([[0, 1], [0.25]]) | ||||
|     expected = DataFrame({"A": [0.25, 2.50]}, index=index) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) | ||||
|     index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) | ||||
|  | ||||
|     key = np.array([0, 0, 1, 1], dtype=np.int64) | ||||
|     result = df.groupby(key).quantile([0.25, 0.75]) | ||||
|     expected = DataFrame( | ||||
|         {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_quantile_array2(): | ||||
|     # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 | ||||
|     arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64) | ||||
|     df = DataFrame(arr, columns=list("ABC")) | ||||
|     result = df.groupby("A").quantile([0.3, 0.7]) | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7], | ||||
|             "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8], | ||||
|         }, | ||||
|         index=pd.MultiIndex.from_product( | ||||
|             [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] | ||||
|         ), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_quantile_array_no_sort(): | ||||
|     df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) | ||||
|     key = np.array([1, 0, 1], dtype=np.int64) | ||||
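|     # With sort=False the group order follows first appearance in the | ||||
|     # key (1 before 0). | ||||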
|     result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75]) | ||||
|     expected = DataFrame( | ||||
|         {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, | ||||
|         index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(key, sort=False).quantile([0.75, 0.25]) | ||||
|     expected = DataFrame( | ||||
|         {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, | ||||
|         index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_quantile_array_multiple_levels(): | ||||
|     df = DataFrame( | ||||
|         {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} | ||||
|     ) | ||||
|     result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) | ||||
|     index = pd.MultiIndex.from_tuples( | ||||
|         [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], | ||||
|         names=["c", "d", None], | ||||
|     ) | ||||
|     expected = DataFrame( | ||||
|         {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) | ||||
| @pytest.mark.parametrize("groupby", [[0], [0, 1]]) | ||||
| @pytest.mark.parametrize("q", [[0.5, 0.6]]) | ||||
| def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): | ||||
|     # GH30289 | ||||
|     nrow, ncol = frame_size | ||||
|     df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol)) | ||||
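|     # Every row is the constant value row_number % 4, so grouping by | ||||
|     # column 0 yields min(nrow, 4) groups in which each quantile equals | ||||
|     # the group's own value. | ||||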
|  | ||||
|     idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q] | ||||
|     idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ | ||||
|         list(range(len(q))) * min(nrow, 4) | ||||
|     ] | ||||
|     expected_index = pd.MultiIndex( | ||||
|         levels=idx_levels, codes=idx_codes, names=groupby + [None] | ||||
|     ) | ||||
|     expected_values = [ | ||||
|         [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q | ||||
|     ] | ||||
|     expected_columns = [x for x in range(ncol) if x not in groupby] | ||||
|     expected = DataFrame( | ||||
|         expected_values, index=expected_index, columns=expected_columns | ||||
|     ) | ||||
|     result = df.groupby(groupby).quantile(q) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_quantile_raises(): | ||||
|     df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) | ||||
|  | ||||
|     msg = "dtype '(object|str)' does not support operation 'quantile'" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         df.groupby("key").quantile() | ||||
|  | ||||
|  | ||||
| def test_quantile_out_of_bounds_q_raises(): | ||||
|     # https://github.com/pandas-dev/pandas/issues/27470 | ||||
|     df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)}) | ||||
|     g = df.groupby([0, 0, 0, 1, 1, 1]) | ||||
|     with pytest.raises(ValueError, match="Got '50.0' instead"): | ||||
|         g.quantile(50) | ||||
|  | ||||
|     with pytest.raises(ValueError, match="Got '-1.0' instead"): | ||||
|         g.quantile(-1) | ||||
|  | ||||
|  | ||||
| def test_quantile_missing_group_values_no_segfaults(): | ||||
|     # GH 28662 | ||||
|     data = np.array([1.0, np.nan, 1.0]) | ||||
|     df = DataFrame({"key": data, "val": range(3)}) | ||||
|  | ||||
|     # The segfault was intermittent; repeating the call in a loop makes | ||||
|     # any regression all but guaranteed to reproduce | ||||
|     grp = df.groupby("key") | ||||
|     for _ in range(100): | ||||
|         grp.quantile() | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "key, val, expected_key, expected_val", | ||||
|     [ | ||||
|         ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]), | ||||
|         ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]), | ||||
|         (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), | ||||
|         ([0], [42], [0], [42.0]), | ||||
|         ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")), | ||||
|     ], | ||||
| ) | ||||
| def test_quantile_missing_group_values_correct_results( | ||||
|     key, val, expected_key, expected_val | ||||
| ): | ||||
|     # GH 28662, GH 33200, GH 33569 | ||||
|     df = DataFrame({"key": key, "val": val}) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         expected_val, index=Index(expected_key, name="key"), columns=["val"] | ||||
|     ) | ||||
|  | ||||
|     grp = df.groupby("key") | ||||
|  | ||||
|     result = grp.quantile(0.5) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = grp.quantile() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "values", | ||||
|     [ | ||||
|         pd.array([1, 0, None] * 2, dtype="Int64"), | ||||
|         pd.array([True, False, None] * 2, dtype="boolean"), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) | ||||
| def test_groupby_quantile_nullable_array(values, q): | ||||
|     # https://github.com/pandas-dev/pandas/issues/33136 | ||||
|     df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) | ||||
|     result = df.groupby("a")["b"].quantile(q) | ||||
|  | ||||
|     if isinstance(q, list): | ||||
|         idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) | ||||
|         true_quantiles = [0.0, 0.5, 1.0] | ||||
|     else: | ||||
|         idx = Index(["x", "y"], name="a") | ||||
|         true_quantiles = [0.5] | ||||
|  | ||||
|     expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) | ||||
| @pytest.mark.parametrize("numeric_only", [True, False]) | ||||
| def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): | ||||
|     df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) | ||||
|     if numeric_only: | ||||
|         result = df.groupby("a").quantile(q, numeric_only=numeric_only) | ||||
|         expected = df.groupby("a")[["b"]].quantile(q) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|     else: | ||||
|         msg = "dtype '.*' does not support operation 'quantile'" | ||||
|         with pytest.raises(TypeError, match=msg): | ||||
|             df.groupby("a").quantile(q, numeric_only=numeric_only) | ||||
|  | ||||
|  | ||||
| def test_groupby_quantile_NA_float(any_float_dtype): | ||||
|     # GH#42849 | ||||
|     df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) | ||||
|     result = df.groupby("x")["y"].quantile(0.5) | ||||
|     exp_index = Index([1.0], dtype=any_float_dtype, name="x") | ||||
|  | ||||
|     if any_float_dtype in ["Float32", "Float64"]: | ||||
|         expected_dtype = any_float_dtype | ||||
|     else: | ||||
|         expected_dtype = None | ||||
|  | ||||
|     expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("x")["y"].quantile([0.5, 0.75]) | ||||
|     expected = pd.Series( | ||||
|         [0.2] * 2, | ||||
|         index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]), | ||||
|         name="y", | ||||
|         dtype=expected_dtype, | ||||
|     ) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_quantile_NA_int(any_int_ea_dtype): | ||||
|     # GH#42849 | ||||
|     df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) | ||||
|     result = df.groupby("x")["y"].quantile(0.5) | ||||
|     expected = pd.Series( | ||||
|         [3.5], | ||||
|         dtype="Float64", | ||||
|         index=Index([1], name="x", dtype=any_int_ea_dtype), | ||||
|         name="y", | ||||
|     ) | ||||
|     tm.assert_series_equal(expected, result) | ||||
|  | ||||
|     result = df.groupby("x").quantile(0.5) | ||||
|     expected = DataFrame( | ||||
|         {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype) | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)] | ||||
| ) | ||||
| def test_groupby_quantile_all_na_group_masked( | ||||
|     interpolation, val1, val2, any_numeric_ea_dtype | ||||
| ): | ||||
|     # GH#37493 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype | ||||
|     ) | ||||
|     result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation) | ||||
|     expected = DataFrame( | ||||
|         {"b": [val1, val2, pd.NA, pd.NA]}, | ||||
|         dtype=any_numeric_ea_dtype, | ||||
|         index=pd.MultiIndex.from_arrays( | ||||
|             [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]], | ||||
|             names=["a", None], | ||||
|         ), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("interpolation", ["midpoint", "linear"]) | ||||
| def test_groupby_quantile_all_na_group_masked_interp( | ||||
|     interpolation, any_numeric_ea_dtype | ||||
| ): | ||||
|     # GH#37493 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype | ||||
|     ) | ||||
|     result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation) | ||||
|  | ||||
|     if any_numeric_ea_dtype == "Float32": | ||||
|         expected_dtype = any_numeric_ea_dtype | ||||
|     else: | ||||
|         expected_dtype = "Float64" | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         {"b": [2.0, 2.5, pd.NA, pd.NA]}, | ||||
|         dtype=expected_dtype, | ||||
|         index=pd.MultiIndex.from_arrays( | ||||
|             [ | ||||
|                 pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), | ||||
|                 [0.5, 0.75, 0.5, 0.75], | ||||
|             ], | ||||
|             names=["a", None], | ||||
|         ), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dtype", ["Float64", "Float32"]) | ||||
| def test_groupby_quantile_allNA_column(dtype): | ||||
|     # GH#42849 | ||||
|     df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) | ||||
|     result = df.groupby("x")["y"].quantile(0.5) | ||||
|     expected = pd.Series( | ||||
|         [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y" | ||||
|     ) | ||||
|     expected.index.name = "x" | ||||
|     tm.assert_series_equal(expected, result) | ||||
|  | ||||
|  | ||||
| def test_groupby_timedelta_quantile(): | ||||
|     # GH: 29485 | ||||
|     df = DataFrame( | ||||
|         {"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]} | ||||
|     ) | ||||
|     result = df.groupby("group").quantile(0.99) | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "value": [ | ||||
|                 pd.Timedelta("0 days 00:00:00.990000"), | ||||
|                 pd.Timedelta("0 days 00:00:02.990000"), | ||||
|             ] | ||||
|         }, | ||||
|         index=Index([1, 2], name="group"), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_columns_groupby_quantile(): | ||||
|     # GH 33795 | ||||
|     df = DataFrame( | ||||
|         np.arange(12).reshape(3, -1), | ||||
|         index=list("XYZ"), | ||||
|         columns=pd.Series(list("ABAB"), name="col"), | ||||
|     ) | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby("col", axis=1) | ||||
|     result = gb.quantile(q=[0.8, 0.2]) | ||||
|     expected = DataFrame( | ||||
|         [ | ||||
|             [1.6, 0.4, 2.6, 1.4], | ||||
|             [5.6, 4.4, 6.6, 5.4], | ||||
|             [9.6, 8.4, 10.6, 9.4], | ||||
|         ], | ||||
|         index=list("XYZ"), | ||||
|         columns=pd.MultiIndex.from_tuples( | ||||
|             [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None] | ||||
|         ), | ||||
|     ) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_timestamp_groupby_quantile(unit): | ||||
|     # GH 33168 | ||||
|     dti = pd.date_range( | ||||
|         start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit | ||||
|     ).floor("1h") | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "timestamp": dti, | ||||
|             "category": list(range(1, 101)), | ||||
|             "value": list(range(101, 201)), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
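|     # Flooring the minute range to hours leaves two distinct keys (00:00 | ||||
|     # and 01:00 UTC); dti[::99] below selects one representative | ||||
|     # timestamp per group. | ||||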
|     result = df.groupby("timestamp").quantile([0.2, 0.8]) | ||||
|  | ||||
|     mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None)) | ||||
|     expected = DataFrame( | ||||
|         [ | ||||
|             {"category": 12.8, "value": 112.8}, | ||||
|             {"category": 48.2, "value": 148.2}, | ||||
|             {"category": 68.8, "value": 168.8}, | ||||
|             {"category": 92.2, "value": 192.2}, | ||||
|         ], | ||||
|         index=mi, | ||||
|     ) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_quantile_dt64tz_period(): | ||||
|     # GH#51373 | ||||
|     dti = pd.date_range("2016-01-01", periods=1000) | ||||
|     df = pd.Series(dti).to_frame().copy() | ||||
|     df[1] = dti.tz_localize("US/Pacific") | ||||
|     df[2] = dti.to_period("D") | ||||
|     df[3] = dti - dti[0] | ||||
|     df.iloc[-1] = pd.NaT | ||||
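|     # Columns cover naive datetime64, tz-aware, Period and timedelta | ||||
|     # dtypes, with the last row set to NaT across all of them. | ||||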
|  | ||||
|     by = np.tile(np.arange(5), 200) | ||||
|     gb = df.groupby(by) | ||||
|  | ||||
|     result = gb.quantile(0.5) | ||||
|  | ||||
|     # Check that we match the group-by-group result | ||||
|     exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} | ||||
|     expected = DataFrame(exp).T.infer_objects() | ||||
|     expected.index = expected.index.astype(int) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_quantile_nonmulti_levels_order(): | ||||
|     # Non-regression test for GH #53009 | ||||
|     ind = pd.MultiIndex.from_tuples( | ||||
|         [ | ||||
|             (0, "a", "B"), | ||||
|             (0, "a", "A"), | ||||
|             (0, "b", "B"), | ||||
|             (0, "b", "A"), | ||||
|             (1, "a", "B"), | ||||
|             (1, "a", "A"), | ||||
|             (1, "b", "B"), | ||||
|             (1, "b", "A"), | ||||
|         ], | ||||
|         names=["sample", "cat0", "cat1"], | ||||
|     ) | ||||
|     ser = pd.Series(range(8), index=ind) | ||||
|     result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8]) | ||||
|  | ||||
|     qind = pd.MultiIndex.from_tuples( | ||||
|         [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None] | ||||
|     ) | ||||
|     expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind) | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     # Also check that the index levels themselves were not sorted | ||||
|     expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]]) | ||||
|     tm.assert_equal(result.index.levels, expected_levels) | ||||
| @ -0,0 +1,721 @@ | ||||
| from datetime import datetime | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     NaT, | ||||
|     Series, | ||||
|     concat, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_rank_unordered_categorical_typeerror(): | ||||
|     # GH#51034 should be TypeError, not NotImplementedError | ||||
|     cat = pd.Categorical([], ordered=False) | ||||
|     ser = Series(cat) | ||||
|     df = ser.to_frame() | ||||
|  | ||||
|     msg = "Cannot perform rank with non-ordered Categorical" | ||||
|  | ||||
|     gb = ser.groupby(cat, observed=False) | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         gb.rank() | ||||
|  | ||||
|     gb2 = df.groupby(cat, observed=False) | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         gb2.rank() | ||||
|  | ||||
|  | ||||
| def test_rank_apply(): | ||||
|     lev1 = np.array(["a" * 10] * 100, dtype=object) | ||||
|     lev2 = np.array(["b" * 10] * 130, dtype=object) | ||||
|     lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int) | ||||
|     lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int) | ||||
|  | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "value": np.random.default_rng(2).standard_normal(500), | ||||
|             "key1": lev1.take(lab1), | ||||
|             "key2": lev2.take(lab2), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     result = df.groupby(["key1", "key2"]).value.rank() | ||||
|  | ||||
|     expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])] | ||||
|     expected = concat(expected, axis=0) | ||||
|     expected = expected.reindex(result.index) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(["key1", "key2"]).value.rank(pct=True) | ||||
|  | ||||
|     expected = [ | ||||
|         piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"]) | ||||
|     ] | ||||
|     expected = concat(expected, axis=0) | ||||
|     expected = expected.reindex(result.index) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) | ||||
| @pytest.mark.parametrize( | ||||
|     "vals", | ||||
|     [ | ||||
|         np.array([2, 2, 8, 2, 6], dtype=dtype) | ||||
|         for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"] | ||||
|     ] | ||||
|     + [ | ||||
|         [ | ||||
|             pd.Timestamp("2018-01-02"), | ||||
|             pd.Timestamp("2018-01-02"), | ||||
|             pd.Timestamp("2018-01-08"), | ||||
|             pd.Timestamp("2018-01-02"), | ||||
|             pd.Timestamp("2018-01-06"), | ||||
|         ], | ||||
|         [ | ||||
|             pd.Timestamp("2018-01-02", tz="US/Pacific"), | ||||
|             pd.Timestamp("2018-01-02", tz="US/Pacific"), | ||||
|             pd.Timestamp("2018-01-08", tz="US/Pacific"), | ||||
|             pd.Timestamp("2018-01-02", tz="US/Pacific"), | ||||
|             pd.Timestamp("2018-01-06", tz="US/Pacific"), | ||||
|         ], | ||||
|         [ | ||||
|             pd.Timestamp("2018-01-02") - pd.Timestamp(0), | ||||
|             pd.Timestamp("2018-01-02") - pd.Timestamp(0), | ||||
|             pd.Timestamp("2018-01-08") - pd.Timestamp(0), | ||||
|             pd.Timestamp("2018-01-02") - pd.Timestamp(0), | ||||
|             pd.Timestamp("2018-01-06") - pd.Timestamp(0), | ||||
|         ], | ||||
|         [ | ||||
|             pd.Timestamp("2018-01-02").to_period("D"), | ||||
|             pd.Timestamp("2018-01-02").to_period("D"), | ||||
|             pd.Timestamp("2018-01-08").to_period("D"), | ||||
|             pd.Timestamp("2018-01-02").to_period("D"), | ||||
|             pd.Timestamp("2018-01-06").to_period("D"), | ||||
|         ], | ||||
|     ], | ||||
|     ids=lambda x: type(x[0]), | ||||
| ) | ||||
| @pytest.mark.parametrize( | ||||
|     "ties_method,ascending,pct,exp", | ||||
|     [ | ||||
|         ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]), | ||||
|         ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), | ||||
|         ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]), | ||||
|         ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]), | ||||
|         ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]), | ||||
|         ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), | ||||
|         ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), | ||||
|         ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]), | ||||
|         ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]), | ||||
|         ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), | ||||
|         ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]), | ||||
|         ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]), | ||||
|         ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]), | ||||
|         ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), | ||||
|         ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]), | ||||
|         ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]), | ||||
|         ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]), | ||||
|         ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]), | ||||
|         ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), | ||||
|         ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]), | ||||
|     ], | ||||
| ) | ||||
| def test_rank_args(grps, vals, ties_method, ascending, pct, exp): | ||||
|     key = np.repeat(grps, len(vals)) | ||||
|  | ||||
|     orig_vals = vals | ||||
|     vals = list(vals) * len(grps) | ||||
|     if isinstance(orig_vals, np.ndarray): | ||||
|         vals = np.array(vals, dtype=orig_vals.dtype) | ||||
|  | ||||
|     df = DataFrame({"key": key, "val": vals}) | ||||
|     result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct) | ||||
|  | ||||
|     exp_df = DataFrame(exp * len(grps), columns=["val"]) | ||||
|     tm.assert_frame_equal(result, exp_df) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) | ||||
| @pytest.mark.parametrize( | ||||
|     "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]] | ||||
| ) | ||||
| @pytest.mark.parametrize( | ||||
|     "ties_method,ascending,na_option,exp", | ||||
|     [ | ||||
|         ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), | ||||
|         ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]), | ||||
|         ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]), | ||||
|         ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), | ||||
|         ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]), | ||||
|         ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]), | ||||
|         ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]), | ||||
|         ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]), | ||||
|         ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]), | ||||
|         ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]), | ||||
|         ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]), | ||||
|         ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]), | ||||
|         ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]), | ||||
|         ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]), | ||||
|         ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]), | ||||
|         ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]), | ||||
|         ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]), | ||||
|         ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]), | ||||
|         ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]), | ||||
|         ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]), | ||||
|         ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]), | ||||
|         ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]), | ||||
|         ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]), | ||||
|         ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]), | ||||
|         ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]), | ||||
|         ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]), | ||||
|         ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]), | ||||
|         ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]), | ||||
|         ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]), | ||||
|         ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]), | ||||
|     ], | ||||
| ) | ||||
| def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): | ||||
|     # GH 20561 | ||||
|     key = np.repeat(grps, len(vals)) | ||||
|     vals = vals * len(grps) | ||||
|     df = DataFrame({"key": key, "val": vals}) | ||||
|     result = df.groupby("key").rank( | ||||
|         method=ties_method, ascending=ascending, na_option=na_option | ||||
|     ) | ||||
|     exp_df = DataFrame(exp * len(grps), columns=["val"]) | ||||
|     tm.assert_frame_equal(result, exp_df) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) | ||||
| @pytest.mark.parametrize( | ||||
|     "vals", | ||||
|     [ | ||||
|         np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype) | ||||
|         for dtype in ["f8", "f4", "f2"] | ||||
|     ] | ||||
|     + [ | ||||
|         [ | ||||
|             pd.Timestamp("2018-01-02"), | ||||
|             pd.Timestamp("2018-01-02"), | ||||
|             np.nan, | ||||
|             pd.Timestamp("2018-01-08"), | ||||
|             pd.Timestamp("2018-01-02"), | ||||
|             pd.Timestamp("2018-01-06"), | ||||
|             np.nan, | ||||
|             np.nan, | ||||
|         ], | ||||
|         [ | ||||
|             pd.Timestamp("2018-01-02", tz="US/Pacific"), | ||||
|             pd.Timestamp("2018-01-02", tz="US/Pacific"), | ||||
|             np.nan, | ||||
|             pd.Timestamp("2018-01-08", tz="US/Pacific"), | ||||
|             pd.Timestamp("2018-01-02", tz="US/Pacific"), | ||||
|             pd.Timestamp("2018-01-06", tz="US/Pacific"), | ||||
|             np.nan, | ||||
|             np.nan, | ||||
|         ], | ||||
|         [ | ||||
|             pd.Timestamp("2018-01-02") - pd.Timestamp(0), | ||||
|             pd.Timestamp("2018-01-02") - pd.Timestamp(0), | ||||
|             np.nan, | ||||
|             pd.Timestamp("2018-01-08") - pd.Timestamp(0), | ||||
|             pd.Timestamp("2018-01-02") - pd.Timestamp(0), | ||||
|             pd.Timestamp("2018-01-06") - pd.Timestamp(0), | ||||
|             np.nan, | ||||
|             np.nan, | ||||
|         ], | ||||
|         [ | ||||
|             pd.Timestamp("2018-01-02").to_period("D"), | ||||
|             pd.Timestamp("2018-01-02").to_period("D"), | ||||
|             np.nan, | ||||
|             pd.Timestamp("2018-01-08").to_period("D"), | ||||
|             pd.Timestamp("2018-01-02").to_period("D"), | ||||
|             pd.Timestamp("2018-01-06").to_period("D"), | ||||
|             np.nan, | ||||
|             np.nan, | ||||
|         ], | ||||
|     ], | ||||
|     ids=lambda x: type(x[0]), | ||||
| ) | ||||
| @pytest.mark.parametrize( | ||||
|     "ties_method,ascending,na_option,pct,exp", | ||||
|     [ | ||||
|         ( | ||||
|             "average", | ||||
|             True, | ||||
|             "keep", | ||||
|             False, | ||||
|             [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan], | ||||
|         ), | ||||
|         ( | ||||
|             "average", | ||||
|             True, | ||||
|             "keep", | ||||
|             True, | ||||
|             [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan], | ||||
|         ), | ||||
|         ( | ||||
|             "average", | ||||
|             False, | ||||
|             "keep", | ||||
|             False, | ||||
|             [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan], | ||||
|         ), | ||||
|         ( | ||||
|             "average", | ||||
|             False, | ||||
|             "keep", | ||||
|             True, | ||||
|             [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan], | ||||
|         ), | ||||
|         ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]), | ||||
|         ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), | ||||
|         ( | ||||
|             "min", | ||||
|             False, | ||||
|             "keep", | ||||
|             False, | ||||
|             [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], | ||||
|         ), | ||||
|         ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), | ||||
|         ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]), | ||||
|         ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), | ||||
|         ( | ||||
|             "max", | ||||
|             False, | ||||
|             "keep", | ||||
|             False, | ||||
|             [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], | ||||
|         ), | ||||
|         ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]), | ||||
|         ( | ||||
|             "first", | ||||
|             True, | ||||
|             "keep", | ||||
|             False, | ||||
|             [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan], | ||||
|         ), | ||||
|         ( | ||||
|             "first", | ||||
|             True, | ||||
|             "keep", | ||||
|             True, | ||||
|             [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], | ||||
|         ), | ||||
|         ( | ||||
|             "first", | ||||
|             False, | ||||
|             "keep", | ||||
|             False, | ||||
|             [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], | ||||
|         ), | ||||
|         ( | ||||
|             "first", | ||||
|             False, | ||||
|             "keep", | ||||
|             True, | ||||
|             [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan], | ||||
|         ), | ||||
|         ( | ||||
|             "dense", | ||||
|             True, | ||||
|             "keep", | ||||
|             False, | ||||
|             [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan], | ||||
|         ), | ||||
|         ( | ||||
|             "dense", | ||||
|             True, | ||||
|             "keep", | ||||
|             True, | ||||
|             [ | ||||
|                 1.0 / 3.0, | ||||
|                 1.0 / 3.0, | ||||
|                 np.nan, | ||||
|                 3.0 / 3.0, | ||||
|                 1.0 / 3.0, | ||||
|                 2.0 / 3.0, | ||||
|                 np.nan, | ||||
|                 np.nan, | ||||
|             ], | ||||
|         ), | ||||
|         ( | ||||
|             "dense", | ||||
|             False, | ||||
|             "keep", | ||||
|             False, | ||||
|             [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], | ||||
|         ), | ||||
|         ( | ||||
|             "dense", | ||||
|             False, | ||||
|             "keep", | ||||
|             True, | ||||
|             [ | ||||
|                 3.0 / 3.0, | ||||
|                 3.0 / 3.0, | ||||
|                 np.nan, | ||||
|                 1.0 / 3.0, | ||||
|                 3.0 / 3.0, | ||||
|                 2.0 / 3.0, | ||||
|                 np.nan, | ||||
|                 np.nan, | ||||
|             ], | ||||
|         ), | ||||
|         ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]), | ||||
|         ( | ||||
|             "average", | ||||
|             True, | ||||
|             "bottom", | ||||
|             True, | ||||
|             [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875], | ||||
|         ), | ||||
|         ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]), | ||||
|         ( | ||||
|             "average", | ||||
|             False, | ||||
|             "bottom", | ||||
|             True, | ||||
|             [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], | ||||
|         ), | ||||
|         ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]), | ||||
|         ( | ||||
|             "min", | ||||
|             True, | ||||
|             "bottom", | ||||
|             True, | ||||
|             [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75], | ||||
|         ), | ||||
|         ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]), | ||||
|         ( | ||||
|             "min", | ||||
|             False, | ||||
|             "bottom", | ||||
|             True, | ||||
|             [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75], | ||||
|         ), | ||||
|         ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]), | ||||
|         ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]), | ||||
|         ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]), | ||||
|         ( | ||||
|             "max", | ||||
|             False, | ||||
|             "bottom", | ||||
|             True, | ||||
|             [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0], | ||||
|         ), | ||||
|         ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]), | ||||
|         ( | ||||
|             "first", | ||||
|             True, | ||||
|             "bottom", | ||||
|             True, | ||||
|             [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0], | ||||
|         ), | ||||
|         ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]), | ||||
|         ( | ||||
|             "first", | ||||
|             False, | ||||
|             "bottom", | ||||
|             True, | ||||
|             [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0], | ||||
|         ), | ||||
|         ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]), | ||||
|         ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]), | ||||
|         ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]), | ||||
|         ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]), | ||||
|     ], | ||||
| ) | ||||
| def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): | ||||
|     key = np.repeat(grps, len(vals)) | ||||
|  | ||||
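|     # tile the values once per group (the keys were repeated above), restoring | ||||
|     # the original ndarray dtype where applicable | ||||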
|     orig_vals = vals | ||||
|     vals = list(vals) * len(grps) | ||||
|     if isinstance(orig_vals, np.ndarray): | ||||
|         vals = np.array(vals, dtype=orig_vals.dtype) | ||||
|  | ||||
|     df = DataFrame({"key": key, "val": vals}) | ||||
|     result = df.groupby("key").rank( | ||||
|         method=ties_method, ascending=ascending, na_option=na_option, pct=pct | ||||
|     ) | ||||
|  | ||||
|     exp_df = DataFrame(exp * len(grps), columns=["val"]) | ||||
|     tm.assert_frame_equal(result, exp_df) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])] | ||||
| ) | ||||
| def test_rank_resets_each_group(pct, exp): | ||||
|     df = DataFrame( | ||||
|         {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10} | ||||
|     ) | ||||
|     result = df.groupby("key").rank(pct=pct) | ||||
|     exp_df = DataFrame(exp * 2, columns=["val"]) | ||||
|     tm.assert_frame_equal(result, exp_df) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"] | ||||
| ) | ||||
| @pytest.mark.parametrize("upper", [True, False]) | ||||
| def test_rank_avg_even_vals(dtype, upper): | ||||
|     if upper: | ||||
|         # use IntegerDtype/FloatingDtype | ||||
|         dtype = dtype[0].upper() + dtype[1:] | ||||
|         dtype = dtype.replace("Ui", "UI") | ||||
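|         # e.g. "int64" -> "Int64" and "uint64" -> "Uint64" -> "UInt64" | ||||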
|     df = DataFrame({"key": ["a"] * 4, "val": [1] * 4}) | ||||
|     df["val"] = df["val"].astype(dtype) | ||||
|     assert df["val"].dtype == dtype | ||||
|  | ||||
|     result = df.groupby("key").rank() | ||||
|     exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"]) | ||||
|     if upper: | ||||
|         exp_df = exp_df.astype("Float64") | ||||
|     tm.assert_frame_equal(result, exp_df) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) | ||||
| @pytest.mark.parametrize("ascending", [True, False]) | ||||
| @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) | ||||
| @pytest.mark.parametrize("pct", [True, False]) | ||||
| @pytest.mark.parametrize( | ||||
|     "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]] | ||||
| ) | ||||
| def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals): | ||||
|     df = DataFrame({"key": ["foo"] * 5, "val": vals}) | ||||
|     mask = df["val"].isna() | ||||
|  | ||||
|     gb = df.groupby("key") | ||||
|     res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) | ||||
|  | ||||
|     # construct our expected by using numeric values with the same ordering | ||||
|     if mask.any(): | ||||
|         df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]}) | ||||
|     else: | ||||
|         df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]}) | ||||
|  | ||||
|     gb2 = df2.groupby("key") | ||||
|     alt = gb2.rank( | ||||
|         method=ties_method, ascending=ascending, na_option=na_option, pct=pct | ||||
|     ) | ||||
|  | ||||
|     tm.assert_frame_equal(res, alt) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("na_option", [True, "bad", 1]) | ||||
| @pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) | ||||
| @pytest.mark.parametrize("ascending", [True, False]) | ||||
| @pytest.mark.parametrize("pct", [True, False]) | ||||
| @pytest.mark.parametrize( | ||||
|     "vals", | ||||
|     [ | ||||
|         ["bar", "bar", "foo", "bar", "baz"], | ||||
|         ["bar", np.nan, "foo", np.nan, "baz"], | ||||
|         [1, np.nan, 2, np.nan, 3], | ||||
|     ], | ||||
| ) | ||||
| def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals): | ||||
|     df = DataFrame({"key": ["foo"] * 5, "val": vals}) | ||||
|     msg = "na_option must be one of 'keep', 'top', or 'bottom'" | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.groupby("key").rank( | ||||
|             method=ties_method, ascending=ascending, na_option=na_option, pct=pct | ||||
|         ) | ||||
|  | ||||
|  | ||||
| def test_rank_empty_group(): | ||||
|     # see gh-22519 | ||||
|     column = "A" | ||||
|     df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]}) | ||||
|  | ||||
|     result = df.groupby(column).B.rank(pct=True) | ||||
|     expected = Series([0.5, np.nan, 1.0], name="B") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(column).rank(pct=True) | ||||
|     expected = DataFrame({"B": [0.5, np.nan, 1.0]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "input_key,input_value,output_value", | ||||
|     [ | ||||
|         ([1, 2], [1, 1], [1.0, 1.0]), | ||||
|         ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), | ||||
|         ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), | ||||
|         ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]), | ||||
|     ], | ||||
| ) | ||||
| def test_rank_zero_div(input_key, input_value, output_value): | ||||
|     # GH 23666 | ||||
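|     # dense pct rank must not divide by zero when a group has no non-NA values | ||||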
|     df = DataFrame({"A": input_key, "B": input_value}) | ||||
|  | ||||
|     result = df.groupby("A").rank(method="dense", pct=True) | ||||
|     expected = DataFrame({"B": output_value}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_rank_min_int(): | ||||
|     # GH-32859 | ||||
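|     # np.iinfo(np.int64).min matches the internal NaT sentinel; rank must still | ||||
|     # treat it as a real integer, not as missing | ||||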
|     df = DataFrame( | ||||
|         { | ||||
|             "grp": [1, 1, 2], | ||||
|             "int_col": [ | ||||
|                 np.iinfo(np.int64).min, | ||||
|                 np.iinfo(np.int64).max, | ||||
|                 np.iinfo(np.int64).min, | ||||
|             ], | ||||
|             "datetimelike": [NaT, datetime(2001, 1, 1), NaT], | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     result = df.groupby("grp").rank() | ||||
|     expected = DataFrame( | ||||
|         {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]} | ||||
|     ) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("use_nan", [True, False]) | ||||
| def test_rank_pct_equal_values_on_group_transition(use_nan): | ||||
|     # GH#40518 | ||||
|     fill_value = np.nan if use_nan else 3 | ||||
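|     # the filled rows hold equal values but belong to different groups, right | ||||
|     # at the group transition | ||||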
|     df = DataFrame( | ||||
|         [ | ||||
|             [-1, 1], | ||||
|             [-1, 2], | ||||
|             [1, fill_value], | ||||
|             [-1, fill_value], | ||||
|         ], | ||||
|         columns=["group", "val"], | ||||
|     ) | ||||
|     result = df.groupby(["group"])["val"].rank( | ||||
|         method="dense", | ||||
|         pct=True, | ||||
|     ) | ||||
|     if use_nan: | ||||
|         expected = Series([0.5, 1, np.nan, np.nan], name="val") | ||||
|     else: | ||||
|         expected = Series([1 / 3, 2 / 3, 1, 1], name="val") | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_rank_multiindex(): | ||||
|     # GH27721 | ||||
|     df = concat( | ||||
|         { | ||||
|             "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}), | ||||
|             "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}), | ||||
|         }, | ||||
|         axis=1, | ||||
|     ) | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby(level=0, axis=1) | ||||
|     msg = "DataFrameGroupBy.rank with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = gb.rank(axis=1) | ||||
|  | ||||
|     expected = concat( | ||||
|         [ | ||||
|             df["a"].rank(axis=1), | ||||
|             df["b"].rank(axis=1), | ||||
|         ], | ||||
|         axis=1, | ||||
|         keys=["a", "b"], | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_axis0_rank_axis1(): | ||||
|     # GH#41320 | ||||
|     df = DataFrame( | ||||
|         {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, | ||||
|         index=["a", "a", "b", "b"], | ||||
|     ) | ||||
|     msg = "The 'axis' keyword in DataFrame.groupby is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby(level=0, axis=0) | ||||
|  | ||||
|     msg = "DataFrameGroupBy.rank with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         res = gb.rank(axis=1) | ||||
|  | ||||
|     # This should match what we get when "manually" operating group-by-group | ||||
|     expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0) | ||||
|     tm.assert_frame_equal(res, expected) | ||||
|  | ||||
|     # check that we haven't accidentally written a case that coincidentally | ||||
|     # matches rank(axis=0) | ||||
|     msg = "The 'axis' keyword in DataFrameGroupBy.rank" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         alt = gb.rank(axis=0) | ||||
|     assert not alt.equals(expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_axis0_cummax_axis1(): | ||||
|     # case where groupby axis is 0 and axis keyword in transform is 1 | ||||
|  | ||||
|     # df has mixed dtype -> multiple blocks | ||||
|     df = DataFrame( | ||||
|         {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, | ||||
|         index=["a", "a", "b", "b"], | ||||
|     ) | ||||
|     msg = "The 'axis' keyword in DataFrame.groupby is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby(level=0, axis=0) | ||||
|  | ||||
|     msg = "DataFrameGroupBy.cummax with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         cmax = gb.cummax(axis=1) | ||||
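|     # each row looks like (x, x + 1, x + 0.5), so the row-wise running max is | ||||
|     # (col0, col1, col1) | ||||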
|     expected = df[[0, 1]].astype(np.float64) | ||||
|     expected[2] = expected[1] | ||||
|     tm.assert_frame_equal(cmax, expected) | ||||
|  | ||||
|  | ||||
| def test_non_unique_index(): | ||||
|     # GH 16577 | ||||
|     df = DataFrame( | ||||
|         {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0}, | ||||
|         index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, | ||||
|     ) | ||||
|     result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True) | ||||
|     expected = Series( | ||||
|         [1.0, 1.0, 1.0, np.nan], | ||||
|         index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, | ||||
|         name="value", | ||||
|     ) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_rank_categorical(): | ||||
|     cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True) | ||||
|     cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True) | ||||
|  | ||||
|     df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2}) | ||||
|  | ||||
|     gb = df.groupby("col1") | ||||
|  | ||||
|     res = gb.rank() | ||||
|  | ||||
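|     # ordered categoricals should rank the same as their underlying values | ||||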
|     expected = df.astype(object).groupby("col1").rank() | ||||
|     tm.assert_frame_equal(res, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("na_option", ["top", "bottom"]) | ||||
| def test_groupby_op_with_nullables(na_option): | ||||
|     # GH 54206 | ||||
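|     # an all-NA masked column still gets rank 1 when NAs are ranked top/bottom | ||||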
|     df = DataFrame({"x": [None]}, dtype="Float64") | ||||
|     result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option) | ||||
|     expected = Series([1.0], dtype="Float64", name=result.name) | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,154 @@ | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)]) | ||||
| def test_groupby_sample_balanced_groups_shape(n, frac): | ||||
|     values = [1] * 10 + [2] * 10 | ||||
|     df = DataFrame({"a": values, "b": values}) | ||||
|  | ||||
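|     # with ten rows per group, n=2 and frac=0.2 both draw two rows per group | ||||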
|     result = df.groupby("a").sample(n=n, frac=frac) | ||||
|     values = [1] * 2 + [2] * 2 | ||||
|     expected = DataFrame({"a": values, "b": values}, index=result.index) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("a")["b"].sample(n=n, frac=frac) | ||||
|     expected = Series(values, name="b", index=result.index) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_sample_unbalanced_groups_shape(): | ||||
|     values = [1] * 10 + [2] * 20 | ||||
|     df = DataFrame({"a": values, "b": values}) | ||||
|  | ||||
|     result = df.groupby("a").sample(n=5) | ||||
|     values = [1] * 5 + [2] * 5 | ||||
|     expected = DataFrame({"a": values, "b": values}, index=result.index) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("a")["b"].sample(n=5) | ||||
|     expected = Series(values, name="b", index=result.index) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_sample_index_value_spans_groups(): | ||||
|     values = [1] * 3 + [2] * 3 | ||||
|     df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2]) | ||||
|  | ||||
|     result = df.groupby("a").sample(n=2) | ||||
|     values = [1] * 2 + [2] * 2 | ||||
|     expected = DataFrame({"a": values, "b": values}, index=result.index) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("a")["b"].sample(n=2) | ||||
|     expected = Series(values, name="b", index=result.index) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_sample_n_and_frac_raises(): | ||||
|     df = DataFrame({"a": [1, 2], "b": [1, 2]}) | ||||
|     msg = "Please enter a value for `frac` OR `n`, not both" | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.groupby("a").sample(n=1, frac=1.0) | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.groupby("a")["b"].sample(n=1, frac=1.0) | ||||
|  | ||||
|  | ||||
| def test_groupby_sample_frac_gt_one_without_replacement_raises(): | ||||
|     df = DataFrame({"a": [1, 2], "b": [1, 2]}) | ||||
|     msg = "Replace has to be set to `True` when upsampling the population `frac` > 1." | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.groupby("a").sample(frac=1.5, replace=False) | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.groupby("a")["b"].sample(frac=1.5, replace=False) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("n", [-1, 1.5]) | ||||
| def test_groupby_sample_invalid_n_raises(n): | ||||
|     df = DataFrame({"a": [1, 2], "b": [1, 2]}) | ||||
|  | ||||
|     if n < 0: | ||||
|         msg = "A negative number of rows requested. Please provide `n` >= 0." | ||||
|     else: | ||||
|         msg = "Only integers accepted as `n` values" | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.groupby("a").sample(n=n) | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.groupby("a")["b"].sample(n=n) | ||||
|  | ||||
|  | ||||
| def test_groupby_sample_oversample(): | ||||
|     values = [1] * 10 + [2] * 10 | ||||
|     df = DataFrame({"a": values, "b": values}) | ||||
|  | ||||
|     result = df.groupby("a").sample(frac=2.0, replace=True) | ||||
|     values = [1] * 20 + [2] * 20 | ||||
|     expected = DataFrame({"a": values, "b": values}, index=result.index) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("a")["b"].sample(frac=2.0, replace=True) | ||||
|     expected = Series(values, name="b", index=result.index) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_sample_without_n_or_frac(): | ||||
|     values = [1] * 10 + [2] * 10 | ||||
|     df = DataFrame({"a": values, "b": values}) | ||||
|  | ||||
|     result = df.groupby("a").sample(n=None, frac=None) | ||||
|     expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("a")["b"].sample(n=None, frac=None) | ||||
|     expected = Series([1, 2], name="b", index=result.index) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "index, expected_index", | ||||
|     [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])], | ||||
| ) | ||||
| def test_groupby_sample_with_weights(index, expected_index): | ||||
|     # GH 39927 - tests for integer index needed | ||||
|     values = [1] * 2 + [2] * 2 | ||||
|     df = DataFrame({"a": values, "b": values}, index=Index(index)) | ||||
|  | ||||
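|     # zero weights exclude every second row, so sampling with replacement can | ||||
|     # only repeat each group's first row (hence the repeated index labels) | ||||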
|     result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0]) | ||||
|     expected = DataFrame({"a": values, "b": values}, index=Index(expected_index)) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0]) | ||||
|     expected = Series(values, name="b", index=Index(expected_index)) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_sample_with_selections(): | ||||
|     # GH 39928 | ||||
|     values = [1] * 10 + [2] * 10 | ||||
|     df = DataFrame({"a": values, "b": values, "c": values}) | ||||
|  | ||||
|     result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None) | ||||
|     expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_sample_with_empty_inputs(): | ||||
|     # GH48459 | ||||
|     df = DataFrame({"a": [], "b": []}) | ||||
|     groupby_df = df.groupby("a") | ||||
|  | ||||
|     result = groupby_df.sample() | ||||
|     expected = df | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,122 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.core.dtypes.common import is_integer_dtype | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     PeriodIndex, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) | ||||
| def test_size(df, by): | ||||
|     grouped = df.groupby(by=by) | ||||
|     result = grouped.size() | ||||
|     for key, group in grouped: | ||||
|         assert result[key] == len(group) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "by", | ||||
|     [ | ||||
|         [0, 0, 0, 0], | ||||
|         [0, 1, 1, 1], | ||||
|         [1, 0, 1, 1], | ||||
|         [0, None, None, None], | ||||
|         pytest.param([None, None, None, None], marks=pytest.mark.xfail), | ||||
|     ], | ||||
| ) | ||||
| def test_size_axis_1(df, axis_1, by, sort, dropna): | ||||
|     # GH#45715 | ||||
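|     # dict.fromkeys dedups the grouping keys while preserving first-seen order | ||||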
|     counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)} | ||||
|     if dropna: | ||||
|         counts = {key: value for key, value in counts.items() if key is not None} | ||||
|     expected = Series(counts, dtype="int64") | ||||
|     if sort: | ||||
|         expected = expected.sort_index() | ||||
|     if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): | ||||
|         expected.index = expected.index.astype(int) | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna) | ||||
|     result = grouped.size() | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) | ||||
| @pytest.mark.parametrize("sort", [True, False]) | ||||
| def test_size_sort(sort, by): | ||||
|     df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC")) | ||||
|     left = df.groupby(by=by, sort=sort).size() | ||||
|     right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0]) | ||||
|     tm.assert_series_equal(left, right, check_names=False) | ||||
|  | ||||
|  | ||||
| def test_size_series_dataframe(): | ||||
|     # https://github.com/pandas-dev/pandas/issues/11699 | ||||
|     df = DataFrame(columns=["A", "B"]) | ||||
|     out = Series(dtype="int64", index=Index([], name="A")) | ||||
|     tm.assert_series_equal(df.groupby("A").size(), out) | ||||
|  | ||||
|  | ||||
| def test_size_groupby_all_null(): | ||||
|     # https://github.com/pandas-dev/pandas/issues/23050 | ||||
|     # Assert no 'ValueError: Length of passed values is 2, index implies 0' | ||||
|     df = DataFrame({"A": [None, None]})  # all-null groups | ||||
|     result = df.groupby("A").size() | ||||
|     expected = Series(dtype="int64", index=Index([], name="A")) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_size_period_index(): | ||||
|     # https://github.com/pandas-dev/pandas/issues/34010 | ||||
|     ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D")) | ||||
|     grp = ser.groupby(level="A") | ||||
|     result = grp.size() | ||||
|     tm.assert_series_equal(result, ser) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("as_index", [True, False]) | ||||
| def test_size_on_categorical(as_index): | ||||
|     df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) | ||||
|     df["A"] = df["A"].astype("category") | ||||
|     result = df.groupby(["A", "B"], as_index=as_index, observed=False).size() | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"] | ||||
|     ) | ||||
|     expected["A"] = expected["A"].astype("category") | ||||
|     if as_index: | ||||
|         expected = expected.set_index(["A", "B"])["size"].rename(None) | ||||
|  | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) | ||||
| def test_size_series_masked_type_returns_Int64(dtype): | ||||
|     # GH 54132 | ||||
|     ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype) | ||||
|     result = ser.groupby(level=0).size() | ||||
|     expected = Series([2, 1], dtype="Int64", index=["a", "b"]) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_size_strings(any_string_dtype, using_infer_string): | ||||
|     # GH#55627 | ||||
|     dtype = any_string_dtype | ||||
|     df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) | ||||
|     result = df.groupby("a")["b"].size() | ||||
|     exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" | ||||
|     exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype | ||||
|     expected = Series( | ||||
|         [2, 1], | ||||
|         index=Index(["a", "b"], name="a", dtype=exp_index_dtype), | ||||
|         name="b", | ||||
|         dtype=exp_dtype, | ||||
|     ) | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,27 @@ | ||||
| import numpy as np | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_groupby_skew_equivalence(): | ||||
|     # Test that the groupby skew method (which uses libgroupby.group_skew) | ||||
|     #  matches the results of operating group-by-group (which uses nanops.nanskew) | ||||
|     nrows = 1000 | ||||
|     ngroups = 3 | ||||
|     ncols = 2 | ||||
|     nan_frac = 0.05 | ||||
|  | ||||
|     arr = np.random.default_rng(2).standard_normal((nrows, ncols)) | ||||
|     arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan | ||||
|  | ||||
|     df = pd.DataFrame(arr) | ||||
|     grps = np.random.default_rng(2).integers(0, ngroups, size=nrows) | ||||
|     gb = df.groupby(grps) | ||||
|  | ||||
|     result = gb.skew() | ||||
|  | ||||
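|     # recompute per group with DataFrame.skew (nanops.nanskew), one row per label | ||||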
|     grpwise = [grp.skew().to_frame(i).T for i, grp in gb] | ||||
|     expected = pd.concat(grpwise, axis=0) | ||||
|     expected.index = expected.index.astype(result.index.dtype)  # 32bit builds | ||||
|     tm.assert_frame_equal(result, expected) | ||||
										
											
File diff suppressed because it is too large
| @ -0,0 +1,27 @@ | ||||
| """ | ||||
| Tests that apply to all groupby operation methods. | ||||
|  | ||||
| The only tests that should appear here are those that use the `groupby_func` fixture. | ||||
|  Even if it does use that fixture, prefer a more specific test file if one is | ||||
|  available, such as: | ||||
|  | ||||
|  - test_categorical | ||||
|  - test_groupby_dropna | ||||
|  - test_groupby_subclass | ||||
|  - test_raises | ||||
| """ | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import DataFrame | ||||
| import pandas._testing as tm | ||||
| from pandas.tests.groupby import get_groupby_method_args | ||||
|  | ||||
|  | ||||
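| # A rough sketch (illustrative only, not part of this change) of how the | ||||
| # `groupby_func` fixture consumed below is typically provided by conftest.py; | ||||
| # the parameter list here is an assumed subset of the real kernel list: | ||||
| # | ||||
| #     @pytest.fixture(params=["sum", "mean", "rank", "fillna", "cumsum"]) | ||||
| #     def groupby_func(request): | ||||
| #         return request.param | ||||
|  | ||||
|  | ||||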
| def test_multiindex_group_all_columns_when_empty(groupby_func): | ||||
|     # GH 32464 | ||||
|     df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) | ||||
|     gb = df.groupby(["a", "b", "c"], group_keys=False) | ||||
|     method = getattr(gb, groupby_func) | ||||
|     args = get_groupby_method_args(groupby_func, df) | ||||
|  | ||||
|     warn = FutureWarning if groupby_func == "fillna" else None | ||||
|     warn_msg = "DataFrameGroupBy.fillna is deprecated" | ||||
|     with tm.assert_produces_warning(warn, match=warn_msg): | ||||
|         result = method(*args).index | ||||
|     expected = df.index | ||||
|     tm.assert_index_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_duplicate_columns(request, groupby_func, as_index): | ||||
|     # GH#50806 | ||||
|     if groupby_func == "corrwith": | ||||
|         msg = "GH#50845 - corrwith fails when there are duplicate columns" | ||||
|         request.applymarker(pytest.mark.xfail(reason=msg)) | ||||
|     df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) | ||||
|     args = get_groupby_method_args(groupby_func, df) | ||||
|     gb = df.groupby("a", as_index=as_index) | ||||
|     warn = FutureWarning if groupby_func == "fillna" else None | ||||
|     warn_msg = "DataFrameGroupBy.fillna is deprecated" | ||||
|     with tm.assert_produces_warning(warn, match=warn_msg): | ||||
|         result = getattr(gb, groupby_func)(*args) | ||||
|  | ||||
|     expected_df = df.set_axis(["a", "b", "c"], axis=1) | ||||
|     expected_args = get_groupby_method_args(groupby_func, expected_df) | ||||
|     expected_gb = expected_df.groupby("a", as_index=as_index) | ||||
|     warn = FutureWarning if groupby_func == "fillna" else None | ||||
|     warn_msg = "DataFrameGroupBy.fillna is deprecated" | ||||
|     with tm.assert_produces_warning(warn, match=warn_msg): | ||||
|         expected = getattr(expected_gb, groupby_func)(*expected_args) | ||||
|     if groupby_func not in ("size", "ngroup", "cumcount"): | ||||
|         expected = expected.rename(columns={"c": "b"}) | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "idx", | ||||
|     [ | ||||
|         pd.Index(["a", "a"], name="foo"), | ||||
|         pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]), | ||||
|     ], | ||||
| ) | ||||
| def test_dup_labels_output_shape(groupby_func, idx): | ||||
|     if groupby_func in {"size", "ngroup", "cumcount"}: | ||||
|         pytest.skip(f"Not applicable for {groupby_func}") | ||||
|  | ||||
|     df = DataFrame([[1, 1]], columns=idx) | ||||
|     grp_by = df.groupby([0]) | ||||
|  | ||||
|     args = get_groupby_method_args(groupby_func, df) | ||||
|     warn = FutureWarning if groupby_func == "fillna" else None | ||||
|     warn_msg = "DataFrameGroupBy.fillna is deprecated" | ||||
|     with tm.assert_produces_warning(warn, match=warn_msg): | ||||
|         result = getattr(grp_by, groupby_func)(*args) | ||||
|  | ||||
|     assert result.shape == (1, 2) | ||||
|     tm.assert_index_equal(result.columns, idx) | ||||
							
								
								
									
265  lib/python3.11/site-packages/pandas/tests/groupby/test_api.py  Normal file
| @ -0,0 +1,265 @@ | ||||
| """ | ||||
| Tests of the groupby API, including internal consistency and with other pandas objects. | ||||
|  | ||||
| Tests in this file should only check the existence, names, and arguments of groupby | ||||
| methods. It should not test the results of any groupby operation. | ||||
| """ | ||||
|  | ||||
| import inspect | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Series, | ||||
| ) | ||||
| from pandas.core.groupby.base import ( | ||||
|     groupby_other_methods, | ||||
|     reduction_kernels, | ||||
|     transformation_kernels, | ||||
| ) | ||||
| from pandas.core.groupby.generic import ( | ||||
|     DataFrameGroupBy, | ||||
|     SeriesGroupBy, | ||||
| ) | ||||
|  | ||||
|  | ||||
| def test_tab_completion(multiindex_dataframe_random_data): | ||||
|     grp = multiindex_dataframe_random_data.groupby(level="second") | ||||
|     results = {v for v in dir(grp) if not v.startswith("_")} | ||||
|     expected = { | ||||
|         "A", | ||||
|         "B", | ||||
|         "C", | ||||
|         "agg", | ||||
|         "aggregate", | ||||
|         "apply", | ||||
|         "boxplot", | ||||
|         "filter", | ||||
|         "first", | ||||
|         "get_group", | ||||
|         "groups", | ||||
|         "hist", | ||||
|         "indices", | ||||
|         "last", | ||||
|         "max", | ||||
|         "mean", | ||||
|         "median", | ||||
|         "min", | ||||
|         "ngroups", | ||||
|         "nth", | ||||
|         "ohlc", | ||||
|         "plot", | ||||
|         "prod", | ||||
|         "size", | ||||
|         "std", | ||||
|         "sum", | ||||
|         "transform", | ||||
|         "var", | ||||
|         "sem", | ||||
|         "count", | ||||
|         "nunique", | ||||
|         "head", | ||||
|         "describe", | ||||
|         "cummax", | ||||
|         "quantile", | ||||
|         "rank", | ||||
|         "cumprod", | ||||
|         "tail", | ||||
|         "resample", | ||||
|         "cummin", | ||||
|         "fillna", | ||||
|         "cumsum", | ||||
|         "cumcount", | ||||
|         "ngroup", | ||||
|         "all", | ||||
|         "shift", | ||||
|         "skew", | ||||
|         "take", | ||||
|         "pct_change", | ||||
|         "any", | ||||
|         "corr", | ||||
|         "corrwith", | ||||
|         "cov", | ||||
|         "dtypes", | ||||
|         "ndim", | ||||
|         "diff", | ||||
|         "idxmax", | ||||
|         "idxmin", | ||||
|         "ffill", | ||||
|         "bfill", | ||||
|         "rolling", | ||||
|         "expanding", | ||||
|         "pipe", | ||||
|         "sample", | ||||
|         "ewm", | ||||
|         "value_counts", | ||||
|     } | ||||
|     assert results == expected | ||||
|  | ||||
|  | ||||
| def test_all_methods_categorized(multiindex_dataframe_random_data): | ||||
|     grp = multiindex_dataframe_random_data.groupby( | ||||
|         multiindex_dataframe_random_data.iloc[:, 0] | ||||
|     ) | ||||
|     names = {_ for _ in dir(grp) if not _.startswith("_")} - set( | ||||
|         multiindex_dataframe_random_data.columns | ||||
|     ) | ||||
|     new_names = set(names) | ||||
|     new_names -= reduction_kernels | ||||
|     new_names -= transformation_kernels | ||||
|     new_names -= groupby_other_methods | ||||
|  | ||||
|     assert not reduction_kernels & transformation_kernels | ||||
|     assert not reduction_kernels & groupby_other_methods | ||||
|     assert not transformation_kernels & groupby_other_methods | ||||
|  | ||||
|     # new public method? | ||||
|     if new_names: | ||||
|         msg = f""" | ||||
| There are uncategorized methods defined on the Grouper class: | ||||
| {new_names}. | ||||
|  | ||||
| Was a new method recently added? | ||||
|  | ||||
| Every public method on Grouper must appear in exactly one of the | ||||
| following three lists defined in pandas.core.groupby.base: | ||||
| - `reduction_kernels` | ||||
| - `transformation_kernels` | ||||
| - `groupby_other_methods` | ||||
| see the comments in pandas/core/groupby/base.py for guidance on | ||||
| how to fix this test. | ||||
|         """ | ||||
|         raise AssertionError(msg) | ||||
|  | ||||
|     # removed a public method? | ||||
|     all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods | ||||
|     if names != all_categorized: | ||||
|         msg = f""" | ||||
| Some methods which are supposed to be on the Grouper class | ||||
| are missing: | ||||
| {all_categorized - names}. | ||||
|  | ||||
| They're still defined in one of the lists that live in pandas/core/groupby/base.py. | ||||
| If you removed a method, remove it from those lists as well. | ||||
| """ | ||||
|         raise AssertionError(msg) | ||||
|  | ||||
|  | ||||
| def test_frame_consistency(groupby_func): | ||||
|     # GH#48028 | ||||
|     if groupby_func in ("first", "last"): | ||||
|         msg = "first and last are entirely different between frame and groupby" | ||||
|         pytest.skip(reason=msg) | ||||
|  | ||||
|     if groupby_func in ("cumcount", "ngroup"): | ||||
|         assert not hasattr(DataFrame, groupby_func) | ||||
|         return | ||||
|  | ||||
|     frame_method = getattr(DataFrame, groupby_func) | ||||
|     gb_method = getattr(DataFrameGroupBy, groupby_func) | ||||
|     result = set(inspect.signature(gb_method).parameters) | ||||
|     if groupby_func == "size": | ||||
|         # "size" is a method on GroupBy but property on DataFrame: | ||||
|         expected = {"self"} | ||||
|     else: | ||||
|         expected = set(inspect.signature(frame_method).parameters) | ||||
|  | ||||
|     # Exclude certain arguments from result and expected depending on the operation | ||||
|     # Some of these may be purposeful inconsistencies between the APIs | ||||
|     exclude_expected, exclude_result = set(), set() | ||||
|     if groupby_func in ("any", "all"): | ||||
|         exclude_expected = {"kwargs", "bool_only", "axis"} | ||||
|     elif groupby_func in ("count",): | ||||
|         exclude_expected = {"numeric_only", "axis"} | ||||
|     elif groupby_func in ("nunique",): | ||||
|         exclude_expected = {"axis"} | ||||
|     elif groupby_func in ("max", "min"): | ||||
|         exclude_expected = {"axis", "kwargs", "skipna"} | ||||
|         exclude_result = {"min_count", "engine", "engine_kwargs"} | ||||
|     elif groupby_func in ("mean", "std", "sum", "var"): | ||||
|         exclude_expected = {"axis", "kwargs", "skipna"} | ||||
|         exclude_result = {"engine", "engine_kwargs"} | ||||
|     elif groupby_func in ("median", "prod", "sem"): | ||||
|         exclude_expected = {"axis", "kwargs", "skipna"} | ||||
|     elif groupby_func in ("backfill", "bfill", "ffill", "pad"): | ||||
|         exclude_expected = {"downcast", "inplace", "axis", "limit_area"} | ||||
|     elif groupby_func in ("cummax", "cummin"): | ||||
|         exclude_expected = {"skipna", "args"} | ||||
|         exclude_result = {"numeric_only"} | ||||
|     elif groupby_func in ("cumprod", "cumsum"): | ||||
|         exclude_expected = {"skipna"} | ||||
|     elif groupby_func in ("pct_change",): | ||||
|         exclude_expected = {"kwargs"} | ||||
|         exclude_result = {"axis"} | ||||
|     elif groupby_func in ("rank",): | ||||
|         exclude_expected = {"numeric_only"} | ||||
|     elif groupby_func in ("quantile",): | ||||
|         exclude_expected = {"method", "axis"} | ||||
|  | ||||
|     # Ensure excluded arguments are actually in the signatures | ||||
|     assert result & exclude_result == exclude_result | ||||
|     assert expected & exclude_expected == exclude_expected | ||||
|  | ||||
|     result -= exclude_result | ||||
|     expected -= exclude_expected | ||||
|     assert result == expected | ||||
|  | ||||
|  | ||||
| def test_series_consistency(request, groupby_func): | ||||
|     # GH#48028 | ||||
|     if groupby_func in ("first", "last"): | ||||
|         pytest.skip("first and last are entirely different between Series and groupby") | ||||
|  | ||||
|     if groupby_func in ("cumcount", "corrwith", "ngroup"): | ||||
|         assert not hasattr(Series, groupby_func) | ||||
|         return | ||||
|  | ||||
|     series_method = getattr(Series, groupby_func) | ||||
|     gb_method = getattr(SeriesGroupBy, groupby_func) | ||||
|     result = set(inspect.signature(gb_method).parameters) | ||||
|     if groupby_func == "size": | ||||
|         # "size" is a method on GroupBy but property on Series | ||||
|         expected = {"self"} | ||||
|     else: | ||||
|         expected = set(inspect.signature(series_method).parameters) | ||||
|  | ||||
|     # Exclude certain arguments from result and expected depending on the operation | ||||
|     # Some of these may be purposeful inconsistencies between the APIs | ||||
|     exclude_expected, exclude_result = set(), set() | ||||
|     if groupby_func in ("any", "all"): | ||||
|         exclude_expected = {"kwargs", "bool_only", "axis"} | ||||
|     elif groupby_func in ("diff",): | ||||
|         exclude_result = {"axis"} | ||||
|     elif groupby_func in ("max", "min"): | ||||
|         exclude_expected = {"axis", "kwargs", "skipna"} | ||||
|         exclude_result = {"min_count", "engine", "engine_kwargs"} | ||||
|     elif groupby_func in ("mean", "std", "sum", "var"): | ||||
|         exclude_expected = {"axis", "kwargs", "skipna"} | ||||
|         exclude_result = {"engine", "engine_kwargs"} | ||||
|     elif groupby_func in ("median", "prod", "sem"): | ||||
|         exclude_expected = {"axis", "kwargs", "skipna"} | ||||
|     elif groupby_func in ("backfill", "bfill", "ffill", "pad"): | ||||
|         exclude_expected = {"downcast", "inplace", "axis", "limit_area"} | ||||
|     elif groupby_func in ("cummax", "cummin"): | ||||
|         exclude_expected = {"skipna", "args"} | ||||
|         exclude_result = {"numeric_only"} | ||||
|     elif groupby_func in ("cumprod", "cumsum"): | ||||
|         exclude_expected = {"skipna"} | ||||
|     elif groupby_func in ("pct_change",): | ||||
|         exclude_expected = {"kwargs"} | ||||
|         exclude_result = {"axis"} | ||||
|     elif groupby_func in ("rank",): | ||||
|         exclude_expected = {"numeric_only"} | ||||
|     elif groupby_func in ("idxmin", "idxmax"): | ||||
|         exclude_expected = {"args", "kwargs"} | ||||
|     elif groupby_func in ("quantile",): | ||||
|         exclude_result = {"numeric_only"} | ||||
|  | ||||
|     # Ensure excluded arguments are actually in the signatures | ||||
|     assert result & exclude_result == exclude_result | ||||
|     assert expected & exclude_expected == exclude_expected | ||||
|  | ||||
|     result -= exclude_result | ||||
|     expected -= exclude_expected | ||||
|     assert result == expected | ||||
							
								
								
									
1605  lib/python3.11/site-packages/pandas/tests/groupby/test_apply.py  Normal file
File diff suppressed because it is too large
							| @ -0,0 +1,163 @@ | ||||
| import numpy as np | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_group_by_copy(): | ||||
|     # GH#44803 | ||||
|     df = pd.DataFrame( | ||||
|         { | ||||
|             "name": ["Alice", "Bob", "Carl"], | ||||
|             "age": [20, 21, 20], | ||||
|         } | ||||
|     ).set_index("name") | ||||
|  | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         grp_by_same_value = df.groupby(["age"], group_keys=False).apply( | ||||
|             lambda group: group | ||||
|         ) | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         grp_by_copy = df.groupby(["age"], group_keys=False).apply( | ||||
|             lambda group: group.copy() | ||||
|         ) | ||||
|     tm.assert_frame_equal(grp_by_same_value, grp_by_copy) | ||||
|  | ||||
|  | ||||
| def test_mutate_groups(): | ||||
|     # GH3380 | ||||
|  | ||||
|     df = pd.DataFrame( | ||||
|         { | ||||
|             "cat1": ["a"] * 8 + ["b"] * 6, | ||||
|             "cat2": ["c"] * 2 | ||||
|             + ["d"] * 2 | ||||
|             + ["e"] * 2 | ||||
|             + ["f"] * 2 | ||||
|             + ["c"] * 2 | ||||
|             + ["d"] * 2 | ||||
|             + ["e"] * 2, | ||||
|             "cat3": [f"g{x}" for x in range(1, 15)], | ||||
|             "val": np.random.default_rng(2).integers(100, size=14), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     def f_copy(x): | ||||
|         x = x.copy() | ||||
|         x["rank"] = x.val.rank(method="min") | ||||
|         return x.groupby("cat2")["rank"].min() | ||||
|  | ||||
|     def f_no_copy(x): | ||||
|         x["rank"] = x.val.rank(method="min") | ||||
|         return x.groupby("cat2")["rank"].min() | ||||
|  | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         grpby_copy = df.groupby("cat1").apply(f_copy) | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         grpby_no_copy = df.groupby("cat1").apply(f_no_copy) | ||||
|     tm.assert_series_equal(grpby_copy, grpby_no_copy) | ||||
|  | ||||
|  | ||||
| def test_no_mutate_but_looks_like(): | ||||
|     # GH 8467 | ||||
|     # the first apply shows the mutation warning; the second does not, | ||||
|     # but both should yield the same results | ||||
|     df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) | ||||
|  | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) | ||||
|     tm.assert_series_equal(result1, result2) | ||||
|  | ||||
|  | ||||
| def test_apply_function_with_indexing(warn_copy_on_write): | ||||
|     # GH: 33058 | ||||
|     df = pd.DataFrame( | ||||
|         {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} | ||||
|     ) | ||||
|  | ||||
|     def fn(x): | ||||
|         x.loc[x.index[-1], "col2"] = 0 | ||||
|         return x.col2 | ||||
|  | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning( | ||||
|         FutureWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write | ||||
|     ): | ||||
|         result = df.groupby(["col1"], as_index=False).apply(fn) | ||||
|     expected = pd.Series( | ||||
|         [1, 2, 0, 4, 5, 0], | ||||
|         index=pd.MultiIndex.from_tuples( | ||||
|             [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)] | ||||
|         ), | ||||
|         name="col2", | ||||
|     ) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_apply_mutate_columns_multiindex(): | ||||
|     # GH 12652 | ||||
|     df = pd.DataFrame( | ||||
|         { | ||||
|             ("C", "julian"): [1, 2, 3], | ||||
|             ("B", "geoffrey"): [1, 2, 3], | ||||
|             ("A", "julian"): [1, 2, 3], | ||||
|             ("B", "julian"): [1, 2, 3], | ||||
|             ("A", "geoffrey"): [1, 2, 3], | ||||
|             ("C", "geoffrey"): [1, 2, 3], | ||||
|         }, | ||||
|         columns=pd.MultiIndex.from_tuples( | ||||
|             [ | ||||
|                 ("A", "julian"), | ||||
|                 ("A", "geoffrey"), | ||||
|                 ("B", "julian"), | ||||
|                 ("B", "geoffrey"), | ||||
|                 ("C", "julian"), | ||||
|                 ("C", "geoffrey"), | ||||
|             ] | ||||
|         ), | ||||
|     ) | ||||
|  | ||||
|     def add_column(grouped): | ||||
|         name = grouped.columns[0][1] | ||||
|         grouped["sum", name] = grouped.sum(axis=1) | ||||
|         return grouped | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby(level=1, axis=1) | ||||
|     result = gb.apply(add_column) | ||||
|     expected = pd.DataFrame( | ||||
|         [ | ||||
|             [1, 1, 1, 3, 1, 1, 1, 3], | ||||
|             [2, 2, 2, 6, 2, 2, 2, 6], | ||||
|             [ | ||||
|                 3, | ||||
|                 3, | ||||
|                 3, | ||||
|                 9, | ||||
|                 3, | ||||
|                 3, | ||||
|                 3, | ||||
|                 9, | ||||
|             ], | ||||
|         ], | ||||
|         columns=pd.MultiIndex.from_tuples( | ||||
|             [ | ||||
|                 ("geoffrey", "A", "geoffrey"), | ||||
|                 ("geoffrey", "B", "geoffrey"), | ||||
|                 ("geoffrey", "C", "geoffrey"), | ||||
|                 ("geoffrey", "sum", "geoffrey"), | ||||
|                 ("julian", "A", "julian"), | ||||
|                 ("julian", "B", "julian"), | ||||
|                 ("julian", "C", "julian"), | ||||
|                 ("julian", "sum", "julian"), | ||||
|             ] | ||||
|         ), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,65 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas._libs import lib | ||||
| import pandas.util._test_decorators as td | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def assert_block_lengths(x): | ||||
|     assert len(x) == len(x._mgr.blocks[0].mgr_locs) | ||||
|     return 0 | ||||
|  | ||||
|  | ||||
| def cumsum_max(x): | ||||
|     x.cumsum().max() | ||||
|     return 0 | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "func", | ||||
|     [ | ||||
|         cumsum_max, | ||||
|         pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test), | ||||
|     ], | ||||
| ) | ||||
| def test_mgr_locs_updated(func): | ||||
|     # https://github.com/pandas-dev/pandas/issues/31802 | ||||
|     # Some operations may require creating new blocks, which requires | ||||
|     # valid mgr_locs | ||||
|     df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]}) | ||||
|     result = df.groupby(["A", "B"]).agg(func) | ||||
|     expected = pd.DataFrame( | ||||
|         {"C": [0, 0]}, | ||||
|         index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "binner,closed,expected", | ||||
|     [ | ||||
|         ( | ||||
|             np.array([0, 3, 6, 9], dtype=np.int64), | ||||
|             "left", | ||||
|             np.array([2, 5, 6], dtype=np.int64), | ||||
|         ), | ||||
|         ( | ||||
|             np.array([0, 3, 6, 9], dtype=np.int64), | ||||
|             "right", | ||||
|             np.array([3, 6, 6], dtype=np.int64), | ||||
|         ), | ||||
|         (np.array([0, 3, 6], dtype=np.int64), "left", np.array([2, 5], dtype=np.int64)), | ||||
|         ( | ||||
|             np.array([0, 3, 6], dtype=np.int64), | ||||
|             "right", | ||||
|             np.array([3, 6], dtype=np.int64), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_generate_bins(binner, closed, expected): | ||||
|     values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) | ||||
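|     # each output entry is the position one past the last value that falls in | ||||
|     # the corresponding bin, with bin edges closed on the given side | ||||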
|     result = lib.generate_bins_dt64(values, binner, closed=closed) | ||||
|     tm.assert_numpy_array_equal(result, expected) | ||||
										
											
File diff suppressed because it is too large
							| @ -0,0 +1,394 @@ | ||||
| from itertools import product | ||||
| from string import ascii_lowercase | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Period, | ||||
|     Series, | ||||
|     Timedelta, | ||||
|     Timestamp, | ||||
|     date_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| class TestCounting: | ||||
|     def test_cumcount(self): | ||||
|         df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"]) | ||||
|         g = df.groupby("A") | ||||
|         sg = g.A | ||||
|  | ||||
|         expected = Series([0, 1, 2, 0, 3]) | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.cumcount()) | ||||
|         tm.assert_series_equal(expected, sg.cumcount()) | ||||
|  | ||||
|     def test_cumcount_empty(self): | ||||
|         ge = DataFrame().groupby(level=0) | ||||
|         se = Series(dtype=object).groupby(level=0) | ||||
|  | ||||
|         # edge case, as this is usually considered float | ||||
|         e = Series(dtype="int64") | ||||
|  | ||||
|         tm.assert_series_equal(e, ge.cumcount()) | ||||
|         tm.assert_series_equal(e, se.cumcount()) | ||||
|  | ||||
|     def test_cumcount_dupe_index(self): | ||||
|         df = DataFrame( | ||||
|             [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5 | ||||
|         ) | ||||
|         g = df.groupby("A") | ||||
|         sg = g.A | ||||
|  | ||||
|         expected = Series([0, 1, 2, 0, 3], index=[0] * 5) | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.cumcount()) | ||||
|         tm.assert_series_equal(expected, sg.cumcount()) | ||||
|  | ||||
|     def test_cumcount_mi(self): | ||||
|         mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) | ||||
|         df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi) | ||||
|         g = df.groupby("A") | ||||
|         sg = g.A | ||||
|  | ||||
|         expected = Series([0, 1, 2, 0, 3], index=mi) | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.cumcount()) | ||||
|         tm.assert_series_equal(expected, sg.cumcount()) | ||||
|  | ||||
|     def test_cumcount_groupby_not_col(self): | ||||
|         df = DataFrame( | ||||
|             [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5 | ||||
|         ) | ||||
|         g = df.groupby([0, 0, 0, 1, 0]) | ||||
|         sg = g.A | ||||
|  | ||||
|         expected = Series([0, 1, 2, 0, 3], index=[0] * 5) | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.cumcount()) | ||||
|         tm.assert_series_equal(expected, sg.cumcount()) | ||||
|  | ||||
|     def test_ngroup(self): | ||||
|         df = DataFrame({"A": list("aaaba")}) | ||||
|         g = df.groupby("A") | ||||
|         sg = g.A | ||||
|  | ||||
|         expected = Series([0, 0, 0, 1, 0]) | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.ngroup()) | ||||
|         tm.assert_series_equal(expected, sg.ngroup()) | ||||
|  | ||||
|     def test_ngroup_distinct(self): | ||||
|         df = DataFrame({"A": list("abcde")}) | ||||
|         g = df.groupby("A") | ||||
|         sg = g.A | ||||
|  | ||||
|         expected = Series(range(5), dtype="int64") | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.ngroup()) | ||||
|         tm.assert_series_equal(expected, sg.ngroup()) | ||||
|  | ||||
|     def test_ngroup_one_group(self): | ||||
|         df = DataFrame({"A": [0] * 5}) | ||||
|         g = df.groupby("A") | ||||
|         sg = g.A | ||||
|  | ||||
|         expected = Series([0] * 5) | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.ngroup()) | ||||
|         tm.assert_series_equal(expected, sg.ngroup()) | ||||
|  | ||||
|     def test_ngroup_empty(self): | ||||
|         ge = DataFrame().groupby(level=0) | ||||
|         se = Series(dtype=object).groupby(level=0) | ||||
|  | ||||
|         # edge case, as this is usually considered float | ||||
|         e = Series(dtype="int64") | ||||
|  | ||||
|         tm.assert_series_equal(e, ge.ngroup()) | ||||
|         tm.assert_series_equal(e, se.ngroup()) | ||||
|  | ||||
|     def test_ngroup_series_matches_frame(self): | ||||
|         df = DataFrame({"A": list("aaaba")}) | ||||
|         s = Series(list("aaaba")) | ||||
|  | ||||
|         tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup()) | ||||
|  | ||||
|     def test_ngroup_dupe_index(self): | ||||
|         df = DataFrame({"A": list("aaaba")}, index=[0] * 5) | ||||
|         g = df.groupby("A") | ||||
|         sg = g.A | ||||
|  | ||||
|         expected = Series([0, 0, 0, 1, 0], index=[0] * 5) | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.ngroup()) | ||||
|         tm.assert_series_equal(expected, sg.ngroup()) | ||||
|  | ||||
|     def test_ngroup_mi(self): | ||||
|         mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) | ||||
|         df = DataFrame({"A": list("aaaba")}, index=mi) | ||||
|         g = df.groupby("A") | ||||
|         sg = g.A | ||||
|         expected = Series([0, 0, 0, 1, 0], index=mi) | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.ngroup()) | ||||
|         tm.assert_series_equal(expected, sg.ngroup()) | ||||
|  | ||||
|     def test_ngroup_groupby_not_col(self): | ||||
|         df = DataFrame({"A": list("aaaba")}, index=[0] * 5) | ||||
|         g = df.groupby([0, 0, 0, 1, 0]) | ||||
|         sg = g.A | ||||
|  | ||||
|         expected = Series([0, 0, 0, 1, 0], index=[0] * 5) | ||||
|  | ||||
|         tm.assert_series_equal(expected, g.ngroup()) | ||||
|         tm.assert_series_equal(expected, sg.ngroup()) | ||||
|  | ||||
|     def test_ngroup_descending(self): | ||||
|         df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"]) | ||||
|         g = df.groupby(["A"]) | ||||
|  | ||||
|         ascending = Series([0, 0, 1, 0, 1]) | ||||
|         descending = Series([1, 1, 0, 1, 0]) | ||||
|  | ||||
|         tm.assert_series_equal(descending, (g.ngroups - 1) - ascending) | ||||
|         tm.assert_series_equal(ascending, g.ngroup(ascending=True)) | ||||
|         tm.assert_series_equal(descending, g.ngroup(ascending=False)) | ||||
|  | ||||
|     def test_ngroup_matches_cumcount(self): | ||||
|         # verify that one manually worked-out case behaves as expected | ||||
|         df = DataFrame( | ||||
|             [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]], | ||||
|             columns=["A", "X"], | ||||
|         ) | ||||
|         g = df.groupby(["A", "X"]) | ||||
|         g_ngroup = g.ngroup() | ||||
|         g_cumcount = g.cumcount() | ||||
|         expected_ngroup = Series([0, 1, 2, 0, 3]) | ||||
|         expected_cumcount = Series([0, 0, 0, 1, 0]) | ||||
|  | ||||
|         tm.assert_series_equal(g_ngroup, expected_ngroup) | ||||
|         tm.assert_series_equal(g_cumcount, expected_cumcount) | ||||
|  | ||||
|     def test_ngroup_cumcount_pair(self): | ||||
|         # brute force comparison for all small series | ||||
|         for p in product(range(3), repeat=4): | ||||
|             df = DataFrame({"a": p}) | ||||
|             g = df.groupby(["a"]) | ||||
|  | ||||
|             order = sorted(set(p)) | ||||
|             ngroupd = [order.index(val) for val in p] | ||||
|             cumcounted = [p[:i].count(val) for i, val in enumerate(p)] | ||||
|  | ||||
|             tm.assert_series_equal(g.ngroup(), Series(ngroupd)) | ||||
|             tm.assert_series_equal(g.cumcount(), Series(cumcounted)) | ||||
|  | ||||
|     def test_ngroup_respects_groupby_order(self, sort): | ||||
|         df = DataFrame({"a": np.random.default_rng(2).choice(list("abcdef"), 100)}) | ||||
|         g = df.groupby("a", sort=sort) | ||||
|         df["group_id"] = -1 | ||||
|         df["group_index"] = -1 | ||||
|  | ||||
|         for i, (_, group) in enumerate(g): | ||||
|             df.loc[group.index, "group_id"] = i | ||||
|             for j, ind in enumerate(group.index): | ||||
|                 df.loc[ind, "group_index"] = j | ||||
|  | ||||
|         tm.assert_series_equal(Series(df["group_id"].values), g.ngroup()) | ||||
|         tm.assert_series_equal(Series(df["group_index"].values), g.cumcount()) | ||||
|  | ||||
|     @pytest.mark.parametrize( | ||||
|         "datetimelike", | ||||
|         [ | ||||
|             [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)], | ||||
|             [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)], | ||||
|             [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)], | ||||
|             [Timedelta(x, unit="h") for x in range(1, 4)], | ||||
|             [Period(freq="2W", year=2017, month=x) for x in range(1, 4)], | ||||
|         ], | ||||
|     ) | ||||
|     def test_count_with_datetimelike(self, datetimelike): | ||||
|         # test for #13393, where DataFrameGroupBy.count() fails | ||||
|         # when counting a datetimelike column. | ||||
|  | ||||
|         df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike}) | ||||
|         res = df.groupby("x").count() | ||||
|         expected = DataFrame({"y": [2, 1]}, index=["a", "b"]) | ||||
|         expected.index.name = "x" | ||||
|         tm.assert_frame_equal(expected, res) | ||||
|  | ||||
|     def test_count_with_only_nans_in_first_group(self): | ||||
|         # GH21956 | ||||
|         df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]}) | ||||
|         result = df.groupby(["A", "B"]).C.count() | ||||
|         mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"]) | ||||
|         expected = Series([], index=mi, dtype=np.int64, name="C") | ||||
|         tm.assert_series_equal(result, expected, check_index_type=False) | ||||
|  | ||||
|     def test_count_groupby_column_with_nan_in_groupby_column(self): | ||||
|         # https://github.com/pandas-dev/pandas/issues/32841 | ||||
|         df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]}) | ||||
|         res = df.groupby(["B"]).count() | ||||
|         expected = DataFrame( | ||||
|             index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]} | ||||
|         ) | ||||
|         tm.assert_frame_equal(expected, res) | ||||
|  | ||||
|     def test_groupby_count_dateparseerror(self): | ||||
|         dr = date_range(start="1/1/2012", freq="5min", periods=10) | ||||
|  | ||||
|         # the problematic ordering: datetimes in the first index level | ||||
|         ser = Series(np.arange(10), index=[dr, np.arange(10)]) | ||||
|         grouped = ser.groupby(lambda x: x[1] % 2 == 0) | ||||
|         result = grouped.count() | ||||
|  | ||||
|         ser = Series(np.arange(10), index=[np.arange(10), dr]) | ||||
|         grouped = ser.groupby(lambda x: x[0] % 2 == 0) | ||||
|         expected = grouped.count() | ||||
|  | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_timedelta_cython_count(): | ||||
|     df = DataFrame( | ||||
|         {"g": list("ab" * 2), "delta": np.arange(4).astype("timedelta64[ns]")} | ||||
|     ) | ||||
|     expected = Series([2, 2], index=Index(["a", "b"], name="g"), name="delta") | ||||
|     result = df.groupby("g").delta.count() | ||||
|     tm.assert_series_equal(expected, result) | ||||
|  | ||||
|  | ||||
| def test_count(): | ||||
|     n = 1 << 15 | ||||
|     dr = date_range("2015-08-30", periods=n // 10, freq="min") | ||||
|  | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "1st": np.random.default_rng(2).choice(list(ascii_lowercase), n), | ||||
|             "2nd": np.random.default_rng(2).integers(0, 5, n), | ||||
|             "3rd": np.random.default_rng(2).standard_normal(n).round(3), | ||||
|             "4th": np.random.default_rng(2).integers(-10, 10, n), | ||||
|             "5th": np.random.default_rng(2).choice(dr, n), | ||||
|             "6th": np.random.default_rng(2).standard_normal(n).round(3), | ||||
|             "7th": np.random.default_rng(2).standard_normal(n).round(3), | ||||
|             "8th": np.random.default_rng(2).choice(dr, n) | ||||
|             - np.random.default_rng(2).choice(dr, 1), | ||||
|             "9th": np.random.default_rng(2).choice(list(ascii_lowercase), n), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     for col in df.columns.drop(["1st", "2nd", "4th"]): | ||||
|         df.loc[np.random.default_rng(2).choice(n, n // 10), col] = np.nan | ||||
|  | ||||
|     df["9th"] = df["9th"].astype("category") | ||||
|  | ||||
|     for key in ["1st", "2nd", ["1st", "2nd"]]: | ||||
|         left = df.groupby(key).count() | ||||
|         msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|         with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|             right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) | ||||
|         tm.assert_frame_equal(left, right) | ||||
|  | ||||
|  | ||||
| def test_count_non_nulls(): | ||||
|     # GH#5610 | ||||
|     # count counts non-nulls | ||||
|     df = DataFrame( | ||||
|         [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]], | ||||
|         columns=["A", "B", "C"], | ||||
|     ) | ||||
|  | ||||
|     count_as = df.groupby("A").count() | ||||
|     count_not_as = df.groupby("A", as_index=False).count() | ||||
|  | ||||
|     expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3]) | ||||
|     expected.index.name = "A" | ||||
|     tm.assert_frame_equal(count_not_as, expected.reset_index()) | ||||
|     tm.assert_frame_equal(count_as, expected) | ||||
|  | ||||
|     count_B = df.groupby("A")["B"].count() | ||||
|     tm.assert_series_equal(count_B, expected["B"]) | ||||
|  | ||||
|  | ||||
| def test_count_object(): | ||||
|     df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3}) | ||||
|     result = df.groupby("c").a.count() | ||||
|     expected = Series([3, 3], index=Index([2, 3], name="c"), name="a") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) | ||||
|     result = df.groupby("c").a.count() | ||||
|     expected = Series([1, 3], index=Index([2, 3], name="c"), name="a") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_count_cross_type(): | ||||
|     # GH8169 | ||||
|     # Set float64 dtype to avoid upcast when setting nan below | ||||
|     vals = np.hstack( | ||||
|         ( | ||||
|             np.random.default_rng(2).integers(0, 5, (100, 2)), | ||||
|             np.random.default_rng(2).integers(0, 2, (100, 2)), | ||||
|         ) | ||||
|     ).astype("float64") | ||||
|  | ||||
|     df = DataFrame(vals, columns=["a", "b", "c", "d"]) | ||||
|     df[df == 2] = np.nan | ||||
|     expected = df.groupby(["c", "d"]).count() | ||||
|  | ||||
|     for t in ["float32", "object"]: | ||||
|         df["a"] = df["a"].astype(t) | ||||
|         df["b"] = df["b"].astype(t) | ||||
|         result = df.groupby(["c", "d"]).count() | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_lower_int_prec_count(): | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": np.array([0, 1, 2, 100], np.int8), | ||||
|             "b": np.array([1, 2, 3, 6], np.uint32), | ||||
|             "c": np.array([4, 5, 6, 8], np.int16), | ||||
|             "grp": list("ab" * 2), | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("grp").count() | ||||
|     expected = DataFrame( | ||||
|         {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=Index(list("ab"), name="grp") | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_count_uses_size_on_exception(): | ||||
|     class RaisingObjectException(Exception): | ||||
|         pass | ||||
|  | ||||
|     class RaisingObject: | ||||
|         def __init__(self, msg="I will raise inside Cython") -> None: | ||||
|             super().__init__() | ||||
|             self.msg = msg | ||||
|  | ||||
|         def __eq__(self, other): | ||||
|             # called from within the Cython count path; raising here | ||||
|             # verifies that count falls back to a size-based computation | ||||
|             raise RaisingObjectException(self.msg) | ||||
|  | ||||
|     df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)}) | ||||
|     result = df.groupby("grp").count() | ||||
|     expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp")) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_count_arrow_string_array(any_string_dtype): | ||||
|     # GH#54751 | ||||
|     pytest.importorskip("pyarrow") | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)} | ||||
|     ) | ||||
|     result = df.groupby("a").count() | ||||
|     expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a")) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,319 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.errors import UnsupportedFunctionCall | ||||
| import pandas.util._test_decorators as td | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| @pytest.fixture( | ||||
|     params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"], | ||||
|     ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], | ||||
| ) | ||||
| def dtypes_for_minmax(request): | ||||
|     """ | ||||
|     Fixture of dtypes with min and max values used for testing | ||||
|     cummin and cummax | ||||
|     """ | ||||
|     dtype = request.param | ||||
|  | ||||
|     np_type = dtype | ||||
|     if dtype == "Int64": | ||||
|         np_type = np.int64 | ||||
|     elif dtype == "Float64": | ||||
|         np_type = np.float64 | ||||
|  | ||||
|     min_val = ( | ||||
|         np.iinfo(np_type).min | ||||
|         if np.dtype(np_type).kind == "i" | ||||
|         else np.finfo(np_type).min | ||||
|     ) | ||||
|     max_val = ( | ||||
|         np.iinfo(np_type).max | ||||
|         if np.dtype(np_type).kind == "i" | ||||
|         else np.finfo(np_type).max | ||||
|     ) | ||||
|  | ||||
|     return (dtype, min_val, max_val) | ||||
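|  | ||||
|  | ||||
| # Editorial note: the dtype boundary values are interesting presumably | ||||
| # because the cumulative kernels seed their running extreme with a sentinel | ||||
| # at the dtype limit; test_cummin and test_cummax below assert that genuine | ||||
| # data at or next to the limit (min_val, min_val + 1) survives unchanged. | ||||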
|  | ||||
|  | ||||
| def test_groupby_cumprod(): | ||||
|     # GH 4095 | ||||
|     df = DataFrame({"key": ["b"] * 10, "value": 2}) | ||||
|  | ||||
|     actual = df.groupby("key")["value"].cumprod() | ||||
|     expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) | ||||
|     expected.name = "value" | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     df = DataFrame({"key": ["b"] * 100, "value": 2}) | ||||
|     df["value"] = df["value"].astype(float) | ||||
|     actual = df.groupby("key")["value"].cumprod() | ||||
|     expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) | ||||
|     expected.name = "value" | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.skip_ubsan | ||||
| def test_groupby_cumprod_overflow(): | ||||
|     # GH#37493 if we overflow we return garbage consistent with numpy | ||||
|     df = DataFrame({"key": ["b"] * 4, "value": 100_000}) | ||||
|     actual = df.groupby("key")["value"].cumprod() | ||||
|     expected = Series( | ||||
|         [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], | ||||
|         name="value", | ||||
|     ) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     numpy_result = df.groupby("key", group_keys=False)["value"].apply( | ||||
|         lambda x: x.cumprod() | ||||
|     ) | ||||
|     numpy_result.name = "value" | ||||
|     tm.assert_series_equal(actual, numpy_result) | ||||
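|  | ||||
|  | ||||
| # Arithmetic check on the expected wraparound value (editorial note): | ||||
| # 100_000 ** 4 == 10 ** 20, which reduced modulo 2 ** 64 gives | ||||
| # 10 ** 20 - 5 * 2 ** 64 == 7766279631452241920, the value asserted above; | ||||
| # it is below 2 ** 63 and therefore still positive as a signed int64. | ||||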
|  | ||||
|  | ||||
| def test_groupby_cumprod_nan_influences_other_columns(): | ||||
|     # GH#48064 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": 1, | ||||
|             "b": [1, np.nan, 2], | ||||
|             "c": [1, 2, 3.0], | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("a").cumprod(numeric_only=True, skipna=False) | ||||
|     expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_cummin(dtypes_for_minmax): | ||||
|     dtype = dtypes_for_minmax[0] | ||||
|     min_val = dtypes_for_minmax[1] | ||||
|  | ||||
|     # GH 15048 | ||||
|     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) | ||||
|     expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] | ||||
|  | ||||
|     df = base_df.astype(dtype) | ||||
|  | ||||
|     expected = DataFrame({"B": expected_mins}).astype(dtype) | ||||
|     result = df.groupby("A").cummin() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|     result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # Test w/ min value for dtype | ||||
|     df.loc[[2, 6], "B"] = min_val | ||||
|     df.loc[[1, 5], "B"] = min_val + 1 | ||||
|     expected.loc[[2, 3, 6, 7], "B"] = min_val | ||||
|     expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val | ||||
|     result = df.groupby("A").cummin() | ||||
|     tm.assert_frame_equal(result, expected, check_exact=True) | ||||
|     expected = ( | ||||
|         df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected, check_exact=True) | ||||
|  | ||||
|     # Test nan in some values | ||||
|     # Explicit cast to float to avoid implicit cast when setting nan | ||||
|     base_df = base_df.astype({"B": "float"}) | ||||
|     base_df.loc[[0, 2, 4, 6], "B"] = np.nan | ||||
|     expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) | ||||
|     result = base_df.groupby("A").cummin() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|     expected = ( | ||||
|         base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # GH 15561 | ||||
|     df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) | ||||
|     expected = Series(pd.to_datetime("2001"), index=[0], name="b") | ||||
|  | ||||
|     result = df.groupby("a")["b"].cummin() | ||||
|     tm.assert_series_equal(expected, result) | ||||
|  | ||||
|     # GH 15635 | ||||
|     df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) | ||||
|     result = df.groupby("a").b.cummin() | ||||
|     expected = Series([1, 2, 1], name="b") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["cummin", "cummax"]) | ||||
| @pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) | ||||
| def test_cummin_max_all_nan_column(method, dtype): | ||||
|     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) | ||||
|     base_df["B"] = base_df["B"].astype(dtype) | ||||
|     grouped = base_df.groupby("A") | ||||
|  | ||||
|     expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) | ||||
|     result = getattr(grouped, method)() | ||||
|     tm.assert_frame_equal(expected, result) | ||||
|  | ||||
|     result = getattr(grouped["B"], method)().to_frame() | ||||
|     tm.assert_frame_equal(expected, result) | ||||
|  | ||||
|  | ||||
| def test_cummax(dtypes_for_minmax): | ||||
|     dtype = dtypes_for_minmax[0] | ||||
|     max_val = dtypes_for_minmax[2] | ||||
|  | ||||
|     # GH 15048 | ||||
|     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) | ||||
|     expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] | ||||
|  | ||||
|     df = base_df.astype(dtype) | ||||
|  | ||||
|     expected = DataFrame({"B": expected_maxs}).astype(dtype) | ||||
|     result = df.groupby("A").cummax() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|     result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # Test w/ max value for dtype | ||||
|     df.loc[[2, 6], "B"] = max_val | ||||
|     expected.loc[[2, 3, 6, 7], "B"] = max_val | ||||
|     result = df.groupby("A").cummax() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|     expected = ( | ||||
|         df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # Test nan in some values | ||||
|     # Explicit cast to float to avoid implicit cast when setting nan | ||||
|     base_df = base_df.astype({"B": "float"}) | ||||
|     base_df.loc[[0, 2, 4, 6], "B"] = np.nan | ||||
|     expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) | ||||
|     result = base_df.groupby("A").cummax() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|     expected = ( | ||||
|         base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # GH 15561 | ||||
|     df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) | ||||
|     expected = Series(pd.to_datetime("2001"), index=[0], name="b") | ||||
|  | ||||
|     result = df.groupby("a")["b"].cummax() | ||||
|     tm.assert_series_equal(expected, result) | ||||
|  | ||||
|     # GH 15635 | ||||
|     df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) | ||||
|     result = df.groupby("a").b.cummax() | ||||
|     expected = Series([2, 1, 2], name="b") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_cummax_i8_at_implementation_bound(): | ||||
|     # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT | ||||
|     #  for int64 dtype GH#46382 | ||||
|     ser = Series([pd.NaT._value + n for n in range(5)]) | ||||
|     df = DataFrame({"A": 1, "B": ser, "C": ser._values.view("M8[ns]")}) | ||||
|     gb = df.groupby("A") | ||||
|  | ||||
|     res = gb.cummax() | ||||
|     exp = df[["B", "C"]] | ||||
|     tm.assert_frame_equal(res, exp) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["cummin", "cummax"]) | ||||
| @pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) | ||||
| @pytest.mark.parametrize( | ||||
|     "groups,expected_data", | ||||
|     [ | ||||
|         ([1, 1, 1], [1, None, None]), | ||||
|         ([1, 2, 3], [1, None, 2]), | ||||
|         ([1, 3, 3], [1, None, None]), | ||||
|     ], | ||||
| ) | ||||
| def test_cummin_max_skipna(method, dtype, groups, expected_data): | ||||
|     # GH-34047 | ||||
|     df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) | ||||
|     orig = df.copy() | ||||
|     gb = df.groupby(groups)["a"] | ||||
|  | ||||
|     result = getattr(gb, method)(skipna=False) | ||||
|     expected = Series(expected_data, dtype=dtype, name="a") | ||||
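|     # With skipna=False a missing value poisons the running min/max for the | ||||
|     # rest of its group: [1, None, 2] grouped as [1, 1, 1] or [1, 3, 3] | ||||
|     # loses the trailing 2, while fully distinct groups ([1, 2, 3]) confine | ||||
|     # the NA to its own row. | ||||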
|  | ||||
|     # check we didn't accidentally alter df | ||||
|     tm.assert_frame_equal(df, orig) | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["cummin", "cummax"]) | ||||
| def test_cummin_max_skipna_multiple_cols(method): | ||||
|     # Ensure missing value in "a" doesn't cause "b" to be nan-filled | ||||
|     df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) | ||||
|     gb = df.groupby([1, 1, 1])[["a", "b"]] | ||||
|  | ||||
|     result = getattr(gb, method)(skipna=False) | ||||
|     expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("func", ["cumprod", "cumsum"]) | ||||
| def test_numpy_compat(func): | ||||
|     # see gh-12811 | ||||
|     df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) | ||||
|     g = df.groupby("A") | ||||
|  | ||||
|     msg = "numpy operations are not valid with groupby" | ||||
|  | ||||
|     with pytest.raises(UnsupportedFunctionCall, match=msg): | ||||
|         getattr(g, func)(1, 2, 3) | ||||
|     with pytest.raises(UnsupportedFunctionCall, match=msg): | ||||
|         getattr(g, func)(foo=1) | ||||
|  | ||||
|  | ||||
| @td.skip_if_32bit | ||||
| @pytest.mark.parametrize("method", ["cummin", "cummax"]) | ||||
| @pytest.mark.parametrize( | ||||
|     "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)] | ||||
| ) | ||||
| def test_nullable_int_not_cast_as_float(method, dtype, val): | ||||
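|     # 2**53 + 1 is the smallest positive integer that is not exactly | ||||
|     # representable as float64, so a lossy cast to float would corrupt it; | ||||
|     # likewise uint64's max exceeds the exact float64 (and int64) range. | ||||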
|     data = [val, pd.NA] | ||||
|     df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) | ||||
|     grouped = df.groupby("grp") | ||||
|  | ||||
|     result = grouped.transform(method) | ||||
|     expected = DataFrame({"b": data}, dtype=dtype) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_cython_api2(): | ||||
|     # this takes the fast apply path | ||||
|  | ||||
|     # cumsum (GH5614) | ||||
|     df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) | ||||
|     expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) | ||||
|     result = df.groupby("A").cumsum() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # GH 5755 - cumsum is a transformer and should ignore as_index | ||||
|     result = df.groupby("A", as_index=False).cumsum() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # GH 13994 | ||||
|     msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = df.groupby("A").cumsum(axis=1) | ||||
|     expected = df.cumsum(axis=1) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = df.groupby("A").cumprod(axis=1) | ||||
|     expected = df.cumprod(axis=1) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,636 @@ | ||||
| from string import ascii_lowercase | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Series, | ||||
|     Timestamp, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_filter_series(): | ||||
|     s = Series([1, 3, 20, 5, 22, 24, 7]) | ||||
|     expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6]) | ||||
|     expected_even = Series([20, 22, 24], index=[2, 4, 5]) | ||||
|     grouper = s.apply(lambda x: x % 2) | ||||
|     grouped = s.groupby(grouper) | ||||
|     tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) | ||||
|     tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) | ||||
|     # Test dropna=False. | ||||
|     tm.assert_series_equal( | ||||
|         grouped.filter(lambda x: x.mean() < 10, dropna=False), | ||||
|         expected_odd.reindex(s.index), | ||||
|     ) | ||||
|     tm.assert_series_equal( | ||||
|         grouped.filter(lambda x: x.mean() > 10, dropna=False), | ||||
|         expected_even.reindex(s.index), | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def test_filter_single_column_df(): | ||||
|     df = DataFrame([1, 3, 20, 5, 22, 24, 7]) | ||||
|     expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) | ||||
|     expected_even = DataFrame([20, 22, 24], index=[2, 4, 5]) | ||||
|     grouper = df[0].apply(lambda x: x % 2) | ||||
|     grouped = df.groupby(grouper) | ||||
|     tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) | ||||
|     tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) | ||||
|     # Test dropna=False. | ||||
|     tm.assert_frame_equal( | ||||
|         grouped.filter(lambda x: x.mean() < 10, dropna=False), | ||||
|         expected_odd.reindex(df.index), | ||||
|     ) | ||||
|     tm.assert_frame_equal( | ||||
|         grouped.filter(lambda x: x.mean() > 10, dropna=False), | ||||
|         expected_even.reindex(df.index), | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def test_filter_multi_column_df(): | ||||
|     df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]}) | ||||
|     grouper = df["A"].apply(lambda x: x % 2) | ||||
|     grouped = df.groupby(grouper) | ||||
|     expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2]) | ||||
|     tm.assert_frame_equal( | ||||
|         grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def test_filter_mixed_df(): | ||||
|     df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) | ||||
|     grouper = df["A"].apply(lambda x: x % 2) | ||||
|     grouped = df.groupby(grouper) | ||||
|     expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2]) | ||||
|     tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected) | ||||
|  | ||||
|  | ||||
| def test_filter_out_all_groups(): | ||||
|     s = Series([1, 3, 20, 5, 22, 24, 7]) | ||||
|     grouper = s.apply(lambda x: x % 2) | ||||
|     grouped = s.groupby(grouper) | ||||
|     tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) | ||||
|     df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) | ||||
|     grouper = df["A"].apply(lambda x: x % 2) | ||||
|     grouped = df.groupby(grouper) | ||||
|     tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]]) | ||||
|  | ||||
|  | ||||
| def test_filter_out_no_groups(): | ||||
|     s = Series([1, 3, 20, 5, 22, 24, 7]) | ||||
|     grouper = s.apply(lambda x: x % 2) | ||||
|     grouped = s.groupby(grouper) | ||||
|     filtered = grouped.filter(lambda x: x.mean() > 0) | ||||
|     tm.assert_series_equal(filtered, s) | ||||
|     df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) | ||||
|     grouper = df["A"].apply(lambda x: x % 2) | ||||
|     grouped = df.groupby(grouper) | ||||
|     filtered = grouped.filter(lambda x: x["A"].mean() > 0) | ||||
|     tm.assert_frame_equal(filtered, df) | ||||
|  | ||||
|  | ||||
| def test_filter_out_all_groups_in_df(): | ||||
|     # GH12768 | ||||
|     df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) | ||||
|     res = df.groupby("a") | ||||
|     res = res.filter(lambda x: x["b"].sum() > 5, dropna=False) | ||||
|     expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) | ||||
|     tm.assert_frame_equal(expected, res) | ||||
|  | ||||
|     df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) | ||||
|     res = df.groupby("a") | ||||
|     res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) | ||||
|     expected = DataFrame({"a": [], "b": []}, dtype="int64") | ||||
|     tm.assert_frame_equal(expected, res) | ||||
|  | ||||
|  | ||||
| def test_filter_condition_raises(): | ||||
|     def raise_if_sum_is_zero(x): | ||||
|         if x.sum() == 0: | ||||
|             raise ValueError | ||||
|         return x.sum() > 0 | ||||
|  | ||||
|     s = Series([-1, 0, 1, 2]) | ||||
|     grouper = s.apply(lambda x: x % 2) | ||||
|     grouped = s.groupby(grouper) | ||||
|     msg = "the filter must return a boolean result" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         grouped.filter(raise_if_sum_is_zero) | ||||
|  | ||||
|  | ||||
| def test_filter_with_axis_in_groupby(): | ||||
|     # issue 11041 | ||||
|     index = pd.MultiIndex.from_product([range(10), [0, 1]]) | ||||
|     data = DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64") | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = data.groupby(level=0, axis=1) | ||||
|     result = gb.filter(lambda x: x.iloc[0, 0] > 10) | ||||
|     expected = data.iloc[:, 12:20] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_filter_bad_shapes(): | ||||
|     df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) | ||||
|     s = df["B"] | ||||
|     g_df = df.groupby("B") | ||||
|     g_s = s.groupby(s) | ||||
|  | ||||
|     f = lambda x: x | ||||
|     msg = "filter function returned a DataFrame, but expected a scalar bool" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         g_df.filter(f) | ||||
|     msg = "the filter must return a boolean result" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         g_s.filter(f) | ||||
|  | ||||
|     f = lambda x: x == 1 | ||||
|     msg = "filter function returned a DataFrame, but expected a scalar bool" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         g_df.filter(f) | ||||
|     msg = "the filter must return a boolean result" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         g_s.filter(f) | ||||
|  | ||||
|     f = lambda x: np.outer(x, x) | ||||
|     msg = "can't multiply sequence by non-int of type 'str'" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         g_df.filter(f) | ||||
|     msg = "the filter must return a boolean result" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         g_s.filter(f) | ||||
|  | ||||
|  | ||||
| def test_filter_nan_is_false(): | ||||
|     df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) | ||||
|     s = df["B"] | ||||
|     g_df = df.groupby(df["B"]) | ||||
|     g_s = s.groupby(s) | ||||
|  | ||||
|     f = lambda x: np.nan | ||||
|     tm.assert_frame_equal(g_df.filter(f), df.loc[[]]) | ||||
|     tm.assert_series_equal(g_s.filter(f), s[[]]) | ||||
|  | ||||
|  | ||||
| def test_filter_pdna_is_false(): | ||||
|     # in particular, don't raise in filter trying to call bool(pd.NA) | ||||
|     df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) | ||||
|     ser = df["B"] | ||||
|     g_df = df.groupby(df["B"]) | ||||
|     g_s = ser.groupby(ser) | ||||
|  | ||||
|     func = lambda x: pd.NA | ||||
|     res = g_df.filter(func) | ||||
|     tm.assert_frame_equal(res, df.loc[[]]) | ||||
|     res = g_s.filter(func) | ||||
|     tm.assert_series_equal(res, ser[[]]) | ||||
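|  | ||||
|  | ||||
| # The two tests above pin down the same contract: a filter predicate that | ||||
| # evaluates to NaN or pd.NA is treated as False, so the group is dropped | ||||
| # rather than raising on bool(NA). | ||||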
|  | ||||
|  | ||||
| def test_filter_against_workaround_ints(): | ||||
|     # Series of ints | ||||
|     s = Series(np.random.default_rng(2).integers(0, 100, 100)) | ||||
|     grouper = s.apply(lambda x: np.round(x, -1)) | ||||
|     grouped = s.groupby(grouper) | ||||
|     f = lambda x: x.mean() > 10 | ||||
|  | ||||
|     old_way = s[grouped.transform(f).astype("bool")] | ||||
|     new_way = grouped.filter(f) | ||||
|     tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) | ||||
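|  | ||||
|  | ||||
| # The "workaround" compared against is the pre-filter idiom: build a boolean | ||||
| # mask with transform and index the Series with it; filter(f) must select | ||||
| # the same rows (order aside, hence sort_values before comparing). | ||||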
|  | ||||
|  | ||||
| def test_filter_against_workaround_floats(): | ||||
|     # Series of floats | ||||
|     s = 100 * Series(np.random.default_rng(2).random(100)) | ||||
|     grouper = s.apply(lambda x: np.round(x, -1)) | ||||
|     grouped = s.groupby(grouper) | ||||
|     f = lambda x: x.mean() > 10 | ||||
|     old_way = s[grouped.transform(f).astype("bool")] | ||||
|     new_way = grouped.filter(f) | ||||
|     tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) | ||||
|  | ||||
|  | ||||
| def test_filter_against_workaround_dataframe(): | ||||
|     # Set up DataFrame of ints, floats, strings. | ||||
|     letters = np.array(list(ascii_lowercase)) | ||||
|     N = 100 | ||||
|     random_letters = letters.take( | ||||
|         np.random.default_rng(2).integers(0, 26, N, dtype=int) | ||||
|     ) | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "ints": Series(np.random.default_rng(2).integers(0, 100, N)), | ||||
|             "floats": N / 10 * Series(np.random.default_rng(2).random(N)), | ||||
|             "letters": Series(random_letters), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     # Group by ints; filter on floats. | ||||
|     grouped = df.groupby("ints") | ||||
|     old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")] | ||||
|     new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20) | ||||
|     tm.assert_frame_equal(new_way, old_way) | ||||
|  | ||||
|     # Group by floats (rounded); filter on strings. | ||||
|     grouper = df.floats.apply(lambda x: np.round(x, -1)) | ||||
|     grouped = df.groupby(grouper) | ||||
|     old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")] | ||||
|     new_way = grouped.filter(lambda x: len(x.letters) < N / 10) | ||||
|     tm.assert_frame_equal(new_way, old_way) | ||||
|  | ||||
|     # Group by strings; filter on ints. | ||||
|     grouped = df.groupby("letters") | ||||
|     old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")] | ||||
|     new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20) | ||||
|     tm.assert_frame_equal(new_way, old_way) | ||||
|  | ||||
|  | ||||
| def test_filter_using_len(): | ||||
|     # BUG GH4447 | ||||
|     df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) | ||||
|     grouped = df.groupby("B") | ||||
|     actual = grouped.filter(lambda x: len(x) > 2) | ||||
|     expected = DataFrame( | ||||
|         {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)}, | ||||
|         index=np.arange(2, 6, dtype=np.int64), | ||||
|     ) | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped.filter(lambda x: len(x) > 4) | ||||
|     expected = df.loc[[]] | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     # Series have always worked properly, but we'll test anyway. | ||||
|     s = df["B"] | ||||
|     grouped = s.groupby(s) | ||||
|     actual = grouped.filter(lambda x: len(x) > 2) | ||||
|     expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B") | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped.filter(lambda x: len(x) > 4) | ||||
|     expected = s[[]] | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|  | ||||
| def test_filter_maintains_ordering(): | ||||
|     # Simple case: index is sequential. #4621 | ||||
|     df = DataFrame( | ||||
|         {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]} | ||||
|     ) | ||||
|     s = df["pid"] | ||||
|     grouped = df.groupby("tag") | ||||
|     actual = grouped.filter(lambda x: len(x) > 1) | ||||
|     expected = df.iloc[[1, 2, 4, 7]] | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     grouped = s.groupby(df["tag"]) | ||||
|     actual = grouped.filter(lambda x: len(x) > 1) | ||||
|     expected = s.iloc[[1, 2, 4, 7]] | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Now index is sequentially decreasing. | ||||
|     df.index = np.arange(len(df) - 1, -1, -1) | ||||
|     s = df["pid"] | ||||
|     grouped = df.groupby("tag") | ||||
|     actual = grouped.filter(lambda x: len(x) > 1) | ||||
|     expected = df.iloc[[1, 2, 4, 7]] | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     grouped = s.groupby(df["tag"]) | ||||
|     actual = grouped.filter(lambda x: len(x) > 1) | ||||
|     expected = s.iloc[[1, 2, 4, 7]] | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Index is shuffled. | ||||
|     SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] | ||||
|     df.index = df.index[SHUFFLED] | ||||
|     s = df["pid"] | ||||
|     grouped = df.groupby("tag") | ||||
|     actual = grouped.filter(lambda x: len(x) > 1) | ||||
|     expected = df.iloc[[1, 2, 4, 7]] | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     grouped = s.groupby(df["tag"]) | ||||
|     actual = grouped.filter(lambda x: len(x) > 1) | ||||
|     expected = s.iloc[[1, 2, 4, 7]] | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|  | ||||
| def test_filter_multiple_timestamp(): | ||||
|     # GH 10114 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": np.arange(5, dtype="int64"), | ||||
|             "B": ["foo", "bar", "foo", "bar", "bar"], | ||||
|             "C": Timestamp("20130101"), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     grouped = df.groupby(["B", "C"]) | ||||
|  | ||||
|     result = grouped["A"].filter(lambda x: True) | ||||
|     tm.assert_series_equal(df["A"], result) | ||||
|  | ||||
|     result = grouped["A"].transform(len) | ||||
|     expected = Series([2, 3, 2, 3, 3], name="A") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = grouped.filter(lambda x: True) | ||||
|     tm.assert_frame_equal(df, result) | ||||
|  | ||||
|     result = grouped.transform("sum") | ||||
|     expected = DataFrame({"A": [2, 8, 2, 8, 8]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = grouped.transform(len) | ||||
|     expected = DataFrame({"A": [2, 3, 2, 3, 3]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_filter_and_transform_with_non_unique_int_index(): | ||||
|     # GH4620 | ||||
|     index = [1, 1, 1, 2, 1, 1, 0, 1] | ||||
|     df = DataFrame( | ||||
|         {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, | ||||
|         index=index, | ||||
|     ) | ||||
|     grouped_df = df.groupby("tag") | ||||
|     ser = df["pid"] | ||||
|     grouped_ser = ser.groupby(df["tag"]) | ||||
|     expected_indexes = [1, 2, 4, 7] | ||||
|  | ||||
|     # Filter DataFrame | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1) | ||||
|     expected = df.iloc[expected_indexes] | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     # Cast to avoid upcast when setting nan below | ||||
|     expected = df.copy().astype("float64") | ||||
|     expected.iloc[[0, 3, 5, 6]] = np.nan | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     # Filter Series | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1) | ||||
|     expected = ser.take(expected_indexes) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid") | ||||
|     # ^ made manually because this can get confusing! | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform Series | ||||
|     actual = grouped_ser.transform(len) | ||||
|     expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform (a column from) DataFrameGroupBy | ||||
|     actual = grouped_df.pid.transform(len) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|  | ||||
| def test_filter_and_transform_with_multiple_non_unique_int_index(): | ||||
|     # GH4620 | ||||
|     index = [1, 1, 1, 2, 0, 0, 0, 1] | ||||
|     df = DataFrame( | ||||
|         {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, | ||||
|         index=index, | ||||
|     ) | ||||
|     grouped_df = df.groupby("tag") | ||||
|     ser = df["pid"] | ||||
|     grouped_ser = ser.groupby(df["tag"]) | ||||
|     expected_indexes = [1, 2, 4, 7] | ||||
|  | ||||
|     # Filter DataFrame | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1) | ||||
|     expected = df.iloc[expected_indexes] | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     # Cast to avoid upcast when setting nan below | ||||
|     expected = df.copy().astype("float64") | ||||
|     expected.iloc[[0, 3, 5, 6]] = np.nan | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     # Filter Series | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1) | ||||
|     expected = ser.take(expected_indexes) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid") | ||||
|     # ^ made manually because this can get confusing! | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform Series | ||||
|     actual = grouped_ser.transform(len) | ||||
|     expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform (a column from) DataFrameGroupBy | ||||
|     actual = grouped_df.pid.transform(len) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|  | ||||
| def test_filter_and_transform_with_non_unique_float_index(): | ||||
|     # GH4620 | ||||
|     index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) | ||||
|     df = DataFrame( | ||||
|         {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, | ||||
|         index=index, | ||||
|     ) | ||||
|     grouped_df = df.groupby("tag") | ||||
|     ser = df["pid"] | ||||
|     grouped_ser = ser.groupby(df["tag"]) | ||||
|     expected_indexes = [1, 2, 4, 7] | ||||
|  | ||||
|     # Filter DataFrame | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1) | ||||
|     expected = df.iloc[expected_indexes] | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     # Cast to avoid upcast when setting nan below | ||||
|     expected = df.copy().astype("float64") | ||||
|     expected.iloc[[0, 3, 5, 6]] = np.nan | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     # Filter Series | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1) | ||||
|     expected = ser.take(expected_indexes) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid") | ||||
|     # ^ made manually because this can get confusing! | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform Series | ||||
|     actual = grouped_ser.transform(len) | ||||
|     expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform (a column from) DataFrameGroupBy | ||||
|     actual = grouped_df.pid.transform(len) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|  | ||||
| def test_filter_and_transform_with_non_unique_timestamp_index(): | ||||
|     # GH4620 | ||||
|     t0 = Timestamp("2013-09-30 00:05:00") | ||||
|     t1 = Timestamp("2013-10-30 00:05:00") | ||||
|     t2 = Timestamp("2013-11-30 00:05:00") | ||||
|     index = [t1, t1, t1, t2, t1, t1, t0, t1] | ||||
|     df = DataFrame( | ||||
|         {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, | ||||
|         index=index, | ||||
|     ) | ||||
|     grouped_df = df.groupby("tag") | ||||
|     ser = df["pid"] | ||||
|     grouped_ser = ser.groupby(df["tag"]) | ||||
|     expected_indexes = [1, 2, 4, 7] | ||||
|  | ||||
|     # Filter DataFrame | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1) | ||||
|     expected = df.iloc[expected_indexes] | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     # Cast to avoid upcast when setting nan below | ||||
|     expected = df.copy().astype("float64") | ||||
|     expected.iloc[[0, 3, 5, 6]] = np.nan | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     # Filter Series | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1) | ||||
|     expected = ser.take(expected_indexes) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid") | ||||
|     # ^ made manually because this can get confusing! | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform Series | ||||
|     actual = grouped_ser.transform(len) | ||||
|     expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform (a column from) DataFrameGroupBy | ||||
|     actual = grouped_df.pid.transform(len) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|  | ||||
| def test_filter_and_transform_with_non_unique_string_index(): | ||||
|     # GH4620 | ||||
|     index = list("bbbcbbab") | ||||
|     df = DataFrame( | ||||
|         {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, | ||||
|         index=index, | ||||
|     ) | ||||
|     grouped_df = df.groupby("tag") | ||||
|     ser = df["pid"] | ||||
|     grouped_ser = ser.groupby(df["tag"]) | ||||
|     expected_indexes = [1, 2, 4, 7] | ||||
|  | ||||
|     # Filter DataFrame | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1) | ||||
|     expected = df.iloc[expected_indexes] | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     # Cast to avoid upcast when setting nan below | ||||
|     expected = df.copy().astype("float64") | ||||
|     expected.iloc[[0, 3, 5, 6]] = np.nan | ||||
|     tm.assert_frame_equal(actual, expected) | ||||
|  | ||||
|     # Filter Series | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1) | ||||
|     expected = ser.take(expected_indexes) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) | ||||
|     expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid") | ||||
|     # ^ made manually because this can get confusing! | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform Series | ||||
|     actual = grouped_ser.transform(len) | ||||
|     expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|     # Transform (a column from) DataFrameGroupBy | ||||
|     actual = grouped_df.pid.transform(len) | ||||
|     tm.assert_series_equal(actual, expected) | ||||
|  | ||||
|  | ||||
| def test_filter_has_access_to_grouped_cols(): | ||||
|     df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"]) | ||||
|     g = df.groupby("A") | ||||
|     # previously didn't have access to col A #???? | ||||
|     filt = g.filter(lambda x: x["A"].sum() == 2) | ||||
|     tm.assert_frame_equal(filt, df.iloc[[0, 1]]) | ||||
|  | ||||
|  | ||||
| def test_filter_enforces_scalarness(): | ||||
|     df = DataFrame( | ||||
|         [ | ||||
|             ["best", "a", "x"], | ||||
|             ["worst", "b", "y"], | ||||
|             ["best", "c", "x"], | ||||
|             ["best", "d", "y"], | ||||
|             ["worst", "d", "y"], | ||||
|             ["worst", "d", "y"], | ||||
|             ["best", "d", "z"], | ||||
|         ], | ||||
|         columns=["a", "b", "c"], | ||||
|     ) | ||||
|     with pytest.raises(TypeError, match="filter function returned a.*"): | ||||
|         df.groupby("c").filter(lambda g: g["a"] == "best") | ||||
|  | ||||
|  | ||||
| def test_filter_non_bool_raises(): | ||||
|     df = DataFrame( | ||||
|         [ | ||||
|             ["best", "a", 1], | ||||
|             ["worst", "b", 1], | ||||
|             ["best", "c", 1], | ||||
|             ["best", "d", 1], | ||||
|             ["worst", "d", 1], | ||||
|             ["worst", "d", 1], | ||||
|             ["best", "d", 1], | ||||
|         ], | ||||
|         columns=["a", "b", "c"], | ||||
|     ) | ||||
|     with pytest.raises(TypeError, match="filter function returned a.*"): | ||||
|         df.groupby("a").filter(lambda g: g.c.mean()) | ||||
|  | ||||
|  | ||||
| def test_filter_dropna_with_empty_groups(): | ||||
|     # GH 10780 | ||||
|     data = Series(np.random.default_rng(2).random(9), index=np.repeat([1, 2, 3], 3)) | ||||
|     grouped = data.groupby(level=0) | ||||
|     result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False) | ||||
|     expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3)) | ||||
|     tm.assert_series_equal(result_false, expected_false) | ||||
|  | ||||
|     result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True) | ||||
|     expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64) | ||||
|     tm.assert_series_equal(result_true, expected_true) | ||||
|  | ||||
|  | ||||
| def test_filter_consistent_result_before_after_agg_func(): | ||||
|     # GH 17091 | ||||
|     df = DataFrame({"data": range(6), "key": list("ABCABC")}) | ||||
|     grouper = df.groupby("key") | ||||
|     result = grouper.filter(lambda x: True) | ||||
|     expected = DataFrame({"data": range(6), "key": list("ABCABC")}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     grouper.sum() | ||||
|     result = grouper.filter(lambda x: True) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
3363  lib/python3.11/site-packages/pandas/tests/groupby/test_groupby.py  Normal file  (File diff suppressed because it is too large)
							| @ -0,0 +1,696 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.compat.pyarrow import pa_version_under10p1 | ||||
|  | ||||
| from pandas.core.dtypes.missing import na_value_for_dtype | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
| from pandas.tests.groupby import get_groupby_method_args | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "dropna, tuples, outputs", | ||||
|     [ | ||||
|         ( | ||||
|             True, | ||||
|             [["A", "B"], ["B", "A"]], | ||||
|             {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]}, | ||||
|         ), | ||||
|         ( | ||||
|             False, | ||||
|             [["A", "B"], ["A", np.nan], ["B", "A"]], | ||||
|             { | ||||
|                 "c": [13.0, 12.3, 123.23], | ||||
|                 "d": [13.0, 233.0, 123.0], | ||||
|                 "e": [13.0, 12.0, 1.0], | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( | ||||
|     dropna, tuples, outputs, nulls_fixture | ||||
| ): | ||||
|     # GH 3729 this is to test the case where NA appears in only one group | ||||
|     df_list = [ | ||||
|         ["A", "B", 12, 12, 12], | ||||
|         ["A", nulls_fixture, 12.3, 233.0, 12], | ||||
|         ["B", "A", 123.23, 123, 1], | ||||
|         ["A", "B", 1, 1, 1.0], | ||||
|     ] | ||||
|     df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) | ||||
|     grouped = df.groupby(["a", "b"], dropna=dropna).sum() | ||||
|  | ||||
|     mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) | ||||
|  | ||||
|     # By default, a MultiIndex created via `from_*` drops NA from its levels, | ||||
|     # so we need to add NA back to the level manually afterwards. | ||||
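|     # e.g. pd.MultiIndex.from_tuples([("A", np.nan)]) keeps the NaN only in | ||||
|     # the codes; the level itself omits NA, hence the explicit set_levels below | ||||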
|     if not dropna: | ||||
|         mi = mi.set_levels(["A", "B", np.nan], level="b") | ||||
|     expected = pd.DataFrame(outputs, index=mi) | ||||
|  | ||||
|     tm.assert_frame_equal(grouped, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "dropna, tuples, outputs", | ||||
|     [ | ||||
|         ( | ||||
|             True, | ||||
|             [["A", "B"], ["B", "A"]], | ||||
|             {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]}, | ||||
|         ), | ||||
|         ( | ||||
|             False, | ||||
|             [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]], | ||||
|             { | ||||
|                 "c": [12.0, 13.3, 123.23, 1.0], | ||||
|                 "d": [12.0, 234.0, 123.0, 1.0], | ||||
|                 "e": [12.0, 13.0, 1.0, 1.0], | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( | ||||
|     dropna, tuples, outputs, nulls_fixture, nulls_fixture2 | ||||
| ): | ||||
|     # GH 3729 this is to test NA values appearing in different groups with different representations | ||||
|     df_list = [ | ||||
|         ["A", "B", 12, 12, 12], | ||||
|         ["A", nulls_fixture, 12.3, 233.0, 12], | ||||
|         ["B", "A", 123.23, 123, 1], | ||||
|         [nulls_fixture2, "B", 1, 1, 1.0], | ||||
|         ["A", nulls_fixture2, 1, 1, 1.0], | ||||
|     ] | ||||
|     df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) | ||||
|     grouped = df.groupby(["a", "b"], dropna=dropna).sum() | ||||
|  | ||||
|     mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) | ||||
|  | ||||
|     # By default, a MultiIndex created via `from_*` drops NA from its levels, | ||||
|     # so we need to add NA back to the level manually afterwards. | ||||
|     if not dropna: | ||||
|         mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]]) | ||||
|     expected = pd.DataFrame(outputs, index=mi) | ||||
|  | ||||
|     tm.assert_frame_equal(grouped, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "dropna, idx, outputs", | ||||
|     [ | ||||
|         (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}), | ||||
|         ( | ||||
|             False, | ||||
|             ["A", "B", np.nan], | ||||
|             { | ||||
|                 "b": [123.23, 13.0, 12.3], | ||||
|                 "c": [123.0, 13.0, 233.0], | ||||
|                 "d": [1.0, 13.0, 12.0], | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): | ||||
|     # GH 3729 | ||||
|     df_list = [ | ||||
|         ["B", 12, 12, 12], | ||||
|         [None, 12.3, 233.0, 12], | ||||
|         ["A", 123.23, 123, 1], | ||||
|         ["B", 1, 1, 1.0], | ||||
|     ] | ||||
|     df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) | ||||
|     grouped = df.groupby("a", dropna=dropna).sum() | ||||
|  | ||||
|     expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a")) | ||||
|  | ||||
|     tm.assert_frame_equal(grouped, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "dropna, idx, expected", | ||||
|     [ | ||||
|         (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])), | ||||
|         ( | ||||
|             False, | ||||
|             ["a", "a", "b", np.nan], | ||||
|             pd.Series([3, 3, 3], index=["a", "b", np.nan]), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_dropna_series_level(dropna, idx, expected): | ||||
|     ser = pd.Series([1, 2, 3, 3], index=idx) | ||||
|  | ||||
|     result = ser.groupby(level=0, dropna=dropna).sum() | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "dropna, expected", | ||||
|     [ | ||||
|         (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")), | ||||
|         ( | ||||
|             False, | ||||
|             pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_dropna_series_by(dropna, expected): | ||||
|     ser = pd.Series( | ||||
|         [390.0, 350.0, 30.0, 20.0], | ||||
|         index=["Falcon", "Falcon", "Parrot", "Parrot"], | ||||
|         name="Max Speed", | ||||
|     ) | ||||
|  | ||||
|     result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean() | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dropna", (False, True)) | ||||
| def test_grouper_dropna_propagation(dropna): | ||||
|     # GH 36604 | ||||
|     df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) | ||||
|     gb = df.groupby("A", dropna=dropna) | ||||
|     assert gb._grouper.dropna == dropna | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "index", | ||||
|     [ | ||||
|         pd.RangeIndex(0, 4), | ||||
|         list("abcd"), | ||||
|         pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_dataframe_slice_then_transform(dropna, index): | ||||
|     # GH35014 & GH35612 | ||||
|     expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]} | ||||
|  | ||||
|     df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index) | ||||
|     gb = df.groupby("A", dropna=dropna) | ||||
|  | ||||
|     result = gb.transform(len) | ||||
|     expected = pd.DataFrame(expected_data, index=index) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = gb[["B"]].transform(len) | ||||
|     expected = pd.DataFrame(expected_data, index=index) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = gb["B"].transform(len) | ||||
|     expected = pd.Series(expected_data["B"], index=index, name="B") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "dropna, tuples, outputs", | ||||
|     [ | ||||
|         ( | ||||
|             True, | ||||
|             [["A", "B"], ["B", "A"]], | ||||
|             {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]}, | ||||
|         ), | ||||
|         ( | ||||
|             False, | ||||
|             [["A", "B"], ["A", np.nan], ["B", "A"]], | ||||
|             { | ||||
|                 "c": [13.0, 12.3, 123.23], | ||||
|                 "d": [12.0, 233.0, 123.0], | ||||
|                 "e": [1.0, 12.0, 1.0], | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): | ||||
|     # GH 3729 | ||||
|     df_list = [ | ||||
|         ["A", "B", 12, 12, 12], | ||||
|         ["A", None, 12.3, 233.0, 12], | ||||
|         ["B", "A", 123.23, 123, 1], | ||||
|         ["A", "B", 1, 1, 1.0], | ||||
|     ] | ||||
|     df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) | ||||
|     agg_dict = {"c": "sum", "d": "max", "e": "min"} | ||||
|     grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict) | ||||
|  | ||||
|     mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) | ||||
|  | ||||
|     # By default, a MultiIndex created via `from_*` drops NA from its levels, | ||||
|     # so we need to add NA back to the level manually afterwards. | ||||
|     if not dropna: | ||||
|         mi = mi.set_levels(["A", "B", np.nan], level="b") | ||||
|     expected = pd.DataFrame(outputs, index=mi) | ||||
|  | ||||
|     tm.assert_frame_equal(grouped, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.arm_slow | ||||
| @pytest.mark.parametrize( | ||||
|     "datetime1, datetime2", | ||||
|     [ | ||||
|         (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")), | ||||
|         (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")), | ||||
|         (pd.Period("2020-01-01"), pd.Period("2020-02-01")), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])]) | ||||
| def test_groupby_dropna_datetime_like_data( | ||||
|     dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2 | ||||
| ): | ||||
|     # GH 3729 | ||||
|     df = pd.DataFrame( | ||||
|         { | ||||
|             "values": [1, 2, 3, 4, 5, 6], | ||||
|             "dt": [ | ||||
|                 datetime1, | ||||
|                 unique_nulls_fixture, | ||||
|                 datetime2, | ||||
|                 unique_nulls_fixture2, | ||||
|                 datetime1, | ||||
|                 datetime1, | ||||
|             ], | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     if dropna: | ||||
|         indexes = [datetime1, datetime2] | ||||
|     else: | ||||
|         indexes = [datetime1, datetime2, np.nan] | ||||
|  | ||||
|     grouped = df.groupby("dt", dropna=dropna).agg({"values": "sum"}) | ||||
|     expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) | ||||
|  | ||||
|     tm.assert_frame_equal(grouped, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "dropna, data, selected_data, levels", | ||||
|     [ | ||||
|         pytest.param( | ||||
|             False, | ||||
|             {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, | ||||
|             {"values": [0, 1, 0, 0]}, | ||||
|             ["a", "b", np.nan], | ||||
|             id="dropna_false_has_nan", | ||||
|         ), | ||||
|         pytest.param( | ||||
|             True, | ||||
|             {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, | ||||
|             {"values": [0, 1, 0]}, | ||||
|             None, | ||||
|             id="dropna_true_has_nan", | ||||
|         ), | ||||
|         pytest.param( | ||||
|             # no nan in "groups"; dropna=True|False should be same. | ||||
|             False, | ||||
|             {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, | ||||
|             {"values": [0, 1, 0, 0]}, | ||||
|             None, | ||||
|             id="dropna_false_no_nan", | ||||
|         ), | ||||
|         pytest.param( | ||||
|             # no nan in "groups"; dropna=True|False should be same. | ||||
|             True, | ||||
|             {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, | ||||
|             {"values": [0, 1, 0, 0]}, | ||||
|             None, | ||||
|             id="dropna_true_no_nan", | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels): | ||||
|     # GH 35889 | ||||
|  | ||||
|     df = pd.DataFrame(data) | ||||
|     gb = df.groupby("groups", dropna=dropna) | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) | ||||
|  | ||||
|     mi_tuples = tuple(zip(data["groups"], selected_data["values"])) | ||||
|     mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) | ||||
|     # By default, a MultiIndex created via `from_*` drops NA from its levels, | ||||
|     # so we need to add NA back to the level manually afterwards. | ||||
|     if not dropna and levels: | ||||
|         mi = mi.set_levels(levels, level="groups") | ||||
|  | ||||
|     expected = pd.DataFrame(selected_data, index=mi) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]]) | ||||
| @pytest.mark.parametrize("keys", [["a"], ["a", "b"]]) | ||||
| @pytest.mark.parametrize("series", [True, False]) | ||||
| def test_groupby_dropna_with_multiindex_input(input_index, keys, series): | ||||
|     # GH#46783 | ||||
|     obj = pd.DataFrame( | ||||
|         { | ||||
|             "a": [1, np.nan], | ||||
|             "b": [1, 1], | ||||
|             "c": [2, 3], | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     expected = obj.set_index(keys) | ||||
|     if series: | ||||
|         expected = expected["c"] | ||||
|     elif input_index == ["a", "b"] and keys == ["a"]: | ||||
|         # Column b should not be aggregated | ||||
|         expected = expected[["c"]] | ||||
|  | ||||
|     if input_index is not None: | ||||
|         obj = obj.set_index(input_index) | ||||
|     gb = obj.groupby(keys, dropna=False) | ||||
|     if series: | ||||
|         gb = gb["c"] | ||||
|     result = gb.sum() | ||||
|  | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_nan_included(): | ||||
|     # GH 35646 | ||||
|     data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} | ||||
|     df = pd.DataFrame(data) | ||||
|     grouped = df.groupby("group", dropna=False) | ||||
|     result = grouped.indices | ||||
|     dtype = np.intp | ||||
|     expected = { | ||||
|         "g1": np.array([0, 2], dtype=dtype), | ||||
|         "g2": np.array([3], dtype=dtype), | ||||
|         np.nan: np.array([1, 4], dtype=dtype), | ||||
|     } | ||||
|     for result_values, expected_values in zip(result.values(), expected.values()): | ||||
|         tm.assert_numpy_array_equal(result_values, expected_values) | ||||
|     assert np.isnan(list(result.keys())[2]) | ||||
|     assert list(result.keys())[0:2] == ["g1", "g2"] | ||||
|  | ||||
|  | ||||
| def test_groupby_drop_nan_with_multi_index(): | ||||
|     # GH 39895 | ||||
|     df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"]) | ||||
|     df = df.set_index(["a", "b"]) | ||||
|     result = df.groupby(["a", "b"], dropna=False).first() | ||||
|     expected = df | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| # sequence_index enumerates all strings made up of x, y, z of length 4 | ||||
| @pytest.mark.parametrize("sequence_index", range(3**4)) | ||||
| @pytest.mark.parametrize( | ||||
|     "dtype", | ||||
|     [ | ||||
|         None, | ||||
|         "UInt8", | ||||
|         "Int8", | ||||
|         "UInt16", | ||||
|         "Int16", | ||||
|         "UInt32", | ||||
|         "Int32", | ||||
|         "UInt64", | ||||
|         "Int64", | ||||
|         "Float32", | ||||
|         "Int64", | ||||
|         "Float64", | ||||
|         "category", | ||||
|         "string", | ||||
|         pytest.param( | ||||
|             "string[pyarrow]", | ||||
|             marks=pytest.mark.skipif( | ||||
|                 pa_version_under10p1, reason="pyarrow is not installed" | ||||
|             ), | ||||
|         ), | ||||
|         "datetime64[ns]", | ||||
|         "period[d]", | ||||
|         "Sparse[float]", | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("test_series", [True, False]) | ||||
| def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index): | ||||
|     # GH#46584, GH#48794 | ||||
|  | ||||
|     # Convert sequence_index into a string sequence, e.g. 5 becomes "zyxx" | ||||
|     # This sequence is used for the grouper. | ||||
|     sequence = "".join( | ||||
|         [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)] | ||||
|     ) | ||||
|  | ||||
|     # Unique values to use for grouper, depends on dtype | ||||
|     if dtype in ("string", "string[pyarrow]"): | ||||
|         uniques = {"x": "x", "y": "y", "z": pd.NA} | ||||
|     elif dtype in ("datetime64[ns]", "period[d]"): | ||||
|         uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA} | ||||
|     else: | ||||
|         uniques = {"x": 1, "y": 2, "z": np.nan} | ||||
|  | ||||
|     df = pd.DataFrame( | ||||
|         { | ||||
|             "key": pd.Series([uniques[label] for label in sequence], dtype=dtype), | ||||
|             "a": [0, 1, 2, 3], | ||||
|         } | ||||
|     ) | ||||
|     gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False) | ||||
|     if test_series: | ||||
|         gb = gb["a"] | ||||
|     result = gb.sum() | ||||
|  | ||||
|     # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid | ||||
|     # issues with hashing np.nan | ||||
|     summed = {} | ||||
|     for idx, label in enumerate(sequence): | ||||
|         summed[label] = summed.get(label, 0) + idx | ||||
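|     # column "a" equals the row position, so each group's expected sum is | ||||
|     # just the sum of the positions at which its label occurs | ||||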
|     if dtype == "category": | ||||
|         index = pd.CategoricalIndex( | ||||
|             [uniques[e] for e in summed], | ||||
|             df["key"].cat.categories, | ||||
|             name="key", | ||||
|         ) | ||||
|     elif isinstance(dtype, str) and dtype.startswith("Sparse"): | ||||
|         index = pd.Index( | ||||
|             pd.array([uniques[label] for label in summed], dtype=dtype), name="key" | ||||
|         ) | ||||
|     else: | ||||
|         index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key") | ||||
|     expected = pd.Series(summed.values(), index=index, name="a", dtype=None) | ||||
|     if not test_series: | ||||
|         expected = expected.to_frame() | ||||
|     if not as_index: | ||||
|         expected = expected.reset_index() | ||||
|         if dtype is not None and dtype.startswith("Sparse"): | ||||
|             expected["key"] = expected["key"].astype(dtype) | ||||
|  | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("test_series", [True, False]) | ||||
| @pytest.mark.parametrize("dtype", [object, None]) | ||||
| def test_null_is_null_for_dtype( | ||||
|     sort, dtype, nulls_fixture, nulls_fixture2, test_series | ||||
| ): | ||||
|     # GH#48506 - null groups should always be keyed by the dtype's own null value | ||||
|     df = pd.DataFrame({"a": [1, 2]}) | ||||
|     groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype) | ||||
|     obj = df["a"] if test_series else df | ||||
|     gb = obj.groupby(groups, dropna=False, sort=sort) | ||||
|     result = gb.sum() | ||||
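|     # na_value_for_dtype gives the dtype's canonical missing value (e.g. | ||||
|     # np.nan for float/object), into which both null flavours collapse | ||||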
|     index = pd.Index([na_value_for_dtype(groups.dtype)]) | ||||
|     expected = pd.DataFrame({"a": [3]}, index=index) | ||||
|     if test_series: | ||||
|         tm.assert_series_equal(result, expected["a"]) | ||||
|     else: | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) | ||||
| def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind): | ||||
|     # Ensure there is at least one null value by appending to the end | ||||
|     values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None) | ||||
|     df = pd.DataFrame( | ||||
|         {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)} | ||||
|     ) | ||||
|  | ||||
|     # Strategy: Compare to dropna=True by filling null values with a new code | ||||
|     df_filled = df.copy() | ||||
|     df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4) | ||||
|  | ||||
|     if index_kind == "range": | ||||
|         keys = ["x"] | ||||
|     elif index_kind == "single": | ||||
|         keys = ["x"] | ||||
|         df = df.set_index("x") | ||||
|         df_filled = df_filled.set_index("x") | ||||
|     else: | ||||
|         keys = ["x", "x2"] | ||||
|         df["x2"] = df["x"] | ||||
|         df = df.set_index(["x", "x2"]) | ||||
|         df_filled["x2"] = df_filled["x"] | ||||
|         df_filled = df_filled.set_index(["x", "x2"]) | ||||
|     args = get_groupby_method_args(reduction_func, df) | ||||
|     args_filled = get_groupby_method_args(reduction_func, df_filled) | ||||
|     if reduction_func == "corrwith" and index_kind == "range": | ||||
|         # Don't include the grouping columns so we can call reset_index | ||||
|         args = (args[0].drop(columns=keys),) | ||||
|         args_filled = (args_filled[0].drop(columns=keys),) | ||||
|  | ||||
|     gb_keepna = df.groupby( | ||||
|         keys, dropna=False, observed=observed, sort=sort, as_index=as_index | ||||
|     ) | ||||
|  | ||||
|     if not observed and reduction_func in ["idxmin", "idxmax"]: | ||||
|         with pytest.raises( | ||||
|             ValueError, match="empty group due to unobserved categories" | ||||
|         ): | ||||
|             getattr(gb_keepna, reduction_func)(*args) | ||||
|         return | ||||
|  | ||||
|     gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) | ||||
|     expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() | ||||
|     expected["x"] = expected["x"].cat.remove_categories([4]) | ||||
|     if index_kind == "multi": | ||||
|         expected["x2"] = expected["x2"].cat.remove_categories([4]) | ||||
|     if as_index: | ||||
|         if index_kind == "multi": | ||||
|             expected = expected.set_index(["x", "x2"]) | ||||
|         else: | ||||
|             expected = expected.set_index("x") | ||||
|     elif index_kind != "range" and reduction_func != "size": | ||||
|         # size, unlike other methods, has the desired behavior in GH#49519 | ||||
|         expected = expected.drop(columns="x") | ||||
|         if index_kind == "multi": | ||||
|             expected = expected.drop(columns="x2") | ||||
|     if reduction_func in ("idxmax", "idxmin") and index_kind != "range": | ||||
|         # expected was computed with a RangeIndex; need to translate to index values | ||||
|         values = expected["y"].values.tolist() | ||||
|         if index_kind == "single": | ||||
|             values = [np.nan if e == 4 else e for e in values] | ||||
|             expected["y"] = pd.Categorical(values, categories=[1, 2, 3]) | ||||
|         else: | ||||
|             values = [(np.nan, np.nan) if e == (4, 4) else e for e in values] | ||||
|             expected["y"] = values | ||||
|     if reduction_func == "size": | ||||
|         # size, unlike other methods, has the desired behavior in GH#49519 | ||||
|         expected = expected.rename(columns={0: "size"}) | ||||
|         if as_index: | ||||
|             expected = expected["size"].rename(None) | ||||
|  | ||||
|     if as_index or index_kind == "range" or reduction_func == "size": | ||||
|         warn = None | ||||
|     else: | ||||
|         warn = FutureWarning | ||||
|     msg = "A grouping .* was excluded from the result" | ||||
|     with tm.assert_produces_warning(warn, match=msg): | ||||
|         result = getattr(gb_keepna, reduction_func)(*args) | ||||
|  | ||||
|     # size will return a Series, others are DataFrame | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_categorical_transformers( | ||||
|     request, transformation_func, observed, sort, as_index | ||||
| ): | ||||
|     # GH#36327 | ||||
|     if transformation_func == "fillna": | ||||
|         msg = "GH#49651 fillna may incorrectly reorders results when dropna=False" | ||||
|         request.applymarker(pytest.mark.xfail(reason=msg, strict=False)) | ||||
|  | ||||
|     values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None) | ||||
|     df = pd.DataFrame( | ||||
|         {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)} | ||||
|     ) | ||||
|     args = get_groupby_method_args(transformation_func, df) | ||||
|  | ||||
|     # Compute result for null group | ||||
|     null_group_values = df[df["x"].isnull()]["y"] | ||||
|     if transformation_func == "cumcount": | ||||
|         null_group_data = list(range(len(null_group_values))) | ||||
|     elif transformation_func == "ngroup": | ||||
|         if sort: | ||||
|             if observed: | ||||
|                 na_group = df["x"].nunique(dropna=False) - 1 | ||||
|             else: | ||||
|                 # TODO: Should this be 3? | ||||
|                 na_group = df["x"].nunique(dropna=False) - 1 | ||||
|         else: | ||||
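|             # with sort=False groups are numbered in order of appearance, so | ||||
|             # the null group's id is the count of distinct keys seen earlier | ||||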
|             na_group = df.iloc[: null_group_values.index[0]]["x"].nunique() | ||||
|         null_group_data = len(null_group_values) * [na_group] | ||||
|     else: | ||||
|         null_group_data = getattr(null_group_values, transformation_func)(*args) | ||||
|     null_group_result = pd.DataFrame({"y": null_group_data}) | ||||
|  | ||||
|     gb_keepna = df.groupby( | ||||
|         "x", dropna=False, observed=observed, sort=sort, as_index=as_index | ||||
|     ) | ||||
|     gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort) | ||||
|  | ||||
|     msg = "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated" | ||||
|     if transformation_func == "pct_change": | ||||
|         with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|             result = getattr(gb_keepna, "pct_change")(*args) | ||||
|     else: | ||||
|         result = getattr(gb_keepna, transformation_func)(*args) | ||||
|     expected = getattr(gb_dropna, transformation_func)(*args) | ||||
|  | ||||
|     for iloc, value in zip( | ||||
|         df[df["x"].isnull()].index.tolist(), null_group_result.values.ravel() | ||||
|     ): | ||||
|         if expected.ndim == 1: | ||||
|             expected.iloc[iloc] = value | ||||
|         else: | ||||
|             expected.iloc[iloc, 0] = value | ||||
|     if transformation_func == "ngroup": | ||||
|         expected[df["x"].notnull() & expected.ge(na_group)] += 1 | ||||
|     if transformation_func not in ("rank", "diff", "pct_change", "shift"): | ||||
|         expected = expected.astype("int64") | ||||
|  | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["head", "tail"]) | ||||
| def test_categorical_head_tail(method, observed, sort, as_index): | ||||
|     # GH#36327 | ||||
|     values = np.random.default_rng(2).choice([1, 2, None], 30) | ||||
|     df = pd.DataFrame( | ||||
|         {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} | ||||
|     ) | ||||
|     gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index) | ||||
|     result = getattr(gb, method)() | ||||
|  | ||||
|     if method == "tail": | ||||
|         values = values[::-1] | ||||
|     # Take the top 5 values from each group | ||||
|     mask = ( | ||||
|         ((values == 1) & ((values == 1).cumsum() <= 5)) | ||||
|         | ((values == 2) & ((values == 2).cumsum() <= 5)) | ||||
|         # flake8 doesn't like the vectorized check for None, thinks we should use `is` | ||||
|         | ((values == None) & ((values == None).cumsum() <= 5))  # noqa: E711 | ||||
|     ) | ||||
|     if method == "tail": | ||||
|         mask = mask[::-1] | ||||
|     expected = df[mask] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_categorical_agg(): | ||||
|     # GH#36327 | ||||
|     values = np.random.default_rng(2).choice([1, 2, None], 30) | ||||
|     df = pd.DataFrame( | ||||
|         {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} | ||||
|     ) | ||||
|     gb = df.groupby("x", dropna=False, observed=False) | ||||
|     result = gb.agg(lambda x: x.sum()) | ||||
|     expected = gb.sum() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_categorical_transform(): | ||||
|     # GH#36327 | ||||
|     values = np.random.default_rng(2).choice([1, 2, None], 30) | ||||
|     df = pd.DataFrame( | ||||
|         {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} | ||||
|     ) | ||||
|     gb = df.groupby("x", dropna=False, observed=False) | ||||
|     result = gb.transform(lambda x: x.sum()) | ||||
|     expected = gb.transform("sum") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,135 @@ | ||||
| from datetime import datetime | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.tests.groupby import get_groupby_method_args | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "obj", | ||||
|     [ | ||||
|         tm.SubclassedDataFrame({"A": np.arange(0, 10)}), | ||||
|         tm.SubclassedSeries(np.arange(0, 10), name="A"), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_preserves_subclass(obj, groupby_func): | ||||
|     # GH28330 -- preserve subclass through groupby operations | ||||
|  | ||||
|     if isinstance(obj, Series) and groupby_func in {"corrwith"}: | ||||
|         pytest.skip(f"Not applicable for Series and {groupby_func}") | ||||
|  | ||||
|     grouped = obj.groupby(np.arange(0, 10)) | ||||
|  | ||||
|     # Groups should preserve subclass type | ||||
|     assert isinstance(grouped.get_group(0), type(obj)) | ||||
|  | ||||
|     args = get_groupby_method_args(groupby_func, obj) | ||||
|  | ||||
|     warn = FutureWarning if groupby_func == "fillna" else None | ||||
|     msg = f"{type(grouped).__name__}.fillna is deprecated" | ||||
|     with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): | ||||
|         result1 = getattr(grouped, groupby_func)(*args) | ||||
|     with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): | ||||
|         result2 = grouped.agg(groupby_func, *args) | ||||
|  | ||||
|     # Reduction or transformation kernels should preserve type | ||||
|     slices = {"ngroup", "cumcount", "size"} | ||||
|     if isinstance(obj, DataFrame) and groupby_func in slices: | ||||
|         assert isinstance(result1, tm.SubclassedSeries) | ||||
|     else: | ||||
|         assert isinstance(result1, type(obj)) | ||||
|  | ||||
|     # Confirm .agg() groupby operations return same results | ||||
|     if isinstance(result1, DataFrame): | ||||
|         tm.assert_frame_equal(result1, result2) | ||||
|     else: | ||||
|         tm.assert_series_equal(result1, result2) | ||||
|  | ||||
|  | ||||
| def test_groupby_preserves_metadata(): | ||||
|     # GH-37343 | ||||
|     custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]}) | ||||
|     assert "testattr" in custom_df._metadata | ||||
|     custom_df.testattr = "hello" | ||||
|     for _, group_df in custom_df.groupby("c"): | ||||
|         assert group_df.testattr == "hello" | ||||
|  | ||||
|     # GH-45314 | ||||
|     def func(group): | ||||
|         assert isinstance(group, tm.SubclassedDataFrame) | ||||
|         assert hasattr(group, "testattr") | ||||
|         assert group.testattr == "hello" | ||||
|         return group.testattr | ||||
|  | ||||
|     msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|     with tm.assert_produces_warning( | ||||
|         FutureWarning, | ||||
|         match=msg, | ||||
|         raise_on_extra_warnings=False, | ||||
|         check_stacklevel=False, | ||||
|     ): | ||||
|         result = custom_df.groupby("c").apply(func) | ||||
|     expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = custom_df.groupby("c").apply(func, include_groups=False) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     # https://github.com/pandas-dev/pandas/pull/56761 | ||||
|     result = custom_df.groupby("c")[["a", "b"]].apply(func) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     def func2(group): | ||||
|         assert isinstance(group, tm.SubclassedSeries) | ||||
|         assert hasattr(group, "testattr") | ||||
|         return group.testattr | ||||
|  | ||||
|     custom_series = tm.SubclassedSeries([1, 2, 3]) | ||||
|     custom_series.testattr = "hello" | ||||
|     result = custom_series.groupby(custom_df["c"]).apply(func2) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|     result = custom_series.groupby(custom_df["c"]).agg(func2) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame]) | ||||
| def test_groupby_resample_preserves_subclass(obj): | ||||
|     # GH28330 -- preserve subclass through groupby.resample() | ||||
|  | ||||
|     df = obj( | ||||
|         { | ||||
|             "Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object), | ||||
|             "Quantity": [18, 3, 5, 1, 9, 3], | ||||
|             "Date": [ | ||||
|                 datetime(2013, 9, 1, 13, 0), | ||||
|                 datetime(2013, 9, 1, 13, 5), | ||||
|                 datetime(2013, 10, 1, 20, 0), | ||||
|                 datetime(2013, 10, 3, 10, 0), | ||||
|                 datetime(2013, 12, 2, 12, 0), | ||||
|                 datetime(2013, 9, 2, 14, 0), | ||||
|             ], | ||||
|         } | ||||
|     ) | ||||
|     df = df.set_index("Date") | ||||
|  | ||||
|     # Confirm groupby.resample() preserves dataframe type | ||||
|     msg = "DataFrameGroupBy.resample operated on the grouping columns" | ||||
|     with tm.assert_produces_warning( | ||||
|         FutureWarning, | ||||
|         match=msg, | ||||
|         raise_on_extra_warnings=False, | ||||
|         check_stacklevel=False, | ||||
|     ): | ||||
|         result = df.groupby("Buyer").resample("5D").sum() | ||||
|     assert isinstance(result, obj) | ||||
1238  lib/python3.11/site-packages/pandas/tests/groupby/test_grouping.py  Normal file  (File diff suppressed because it is too large)
							| @ -0,0 +1,85 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=[["inner"], ["inner", "outer"]]) | ||||
| def frame(request): | ||||
|     levels = request.param | ||||
|     df = pd.DataFrame( | ||||
|         { | ||||
|             "outer": ["a", "a", "a", "b", "b", "b"], | ||||
|             "inner": [1, 2, 3, 1, 2, 3], | ||||
|             "A": np.arange(6), | ||||
|             "B": ["one", "one", "two", "two", "one", "one"], | ||||
|         } | ||||
|     ) | ||||
|     if levels: | ||||
|         df = df.set_index(levels) | ||||
|  | ||||
|     return df | ||||
|  | ||||
|  | ||||
| @pytest.fixture() | ||||
| def series(): | ||||
|     df = pd.DataFrame( | ||||
|         { | ||||
|             "outer": ["a", "a", "a", "b", "b", "b"], | ||||
|             "inner": [1, 2, 3, 1, 2, 3], | ||||
|             "A": np.arange(6), | ||||
|             "B": ["one", "one", "two", "two", "one", "one"], | ||||
|         } | ||||
|     ) | ||||
|     s = df.set_index(["outer", "inner", "B"])["A"] | ||||
|  | ||||
|     return s | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "key_strs,groupers", | ||||
|     [ | ||||
|         ("inner", pd.Grouper(level="inner")),  # Index name | ||||
|         (["inner"], [pd.Grouper(level="inner")]),  # List of index name | ||||
|         (["B", "inner"], ["B", pd.Grouper(level="inner")]),  # Column and index | ||||
|         (["inner", "B"], [pd.Grouper(level="inner"), "B"]),  # Index and column | ||||
|     ], | ||||
| ) | ||||
| def test_grouper_index_level_as_string(frame, key_strs, groupers): | ||||
|     if "B" not in key_strs or "outer" in frame.columns: | ||||
|         result = frame.groupby(key_strs).mean(numeric_only=True) | ||||
|         expected = frame.groupby(groupers).mean(numeric_only=True) | ||||
|     else: | ||||
|         result = frame.groupby(key_strs).mean() | ||||
|         expected = frame.groupby(groupers).mean() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "levels", | ||||
|     [ | ||||
|         "inner", | ||||
|         "outer", | ||||
|         "B", | ||||
|         ["inner"], | ||||
|         ["outer"], | ||||
|         ["B"], | ||||
|         ["inner", "outer"], | ||||
|         ["outer", "inner"], | ||||
|         ["inner", "outer", "B"], | ||||
|         ["B", "outer", "inner"], | ||||
|     ], | ||||
| ) | ||||
| def test_grouper_index_level_as_string_series(series, levels): | ||||
|     # Compute expected result | ||||
|     if isinstance(levels, list): | ||||
|         groupers = [pd.Grouper(level=lv) for lv in levels] | ||||
|     else: | ||||
|         groupers = pd.Grouper(level=levels) | ||||
|  | ||||
|     expected = series.groupby(groupers).mean() | ||||
|  | ||||
|     # Compute and check result | ||||
|     result = series.groupby(levels).mean() | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,333 @@ | ||||
| # Test GroupBy._positional_selector positional grouped indexing GH#42864 | ||||
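| # _positional_selector[arg] selects rows by their position within each | ||||
| # group: [0] keeps each group's first row, [-1] its last (see test_int) | ||||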
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "arg, expected_rows", | ||||
|     [ | ||||
|         [0, [0, 1, 4]], | ||||
|         [2, [5]], | ||||
|         [5, []], | ||||
|         [-1, [3, 4, 7]], | ||||
|         [-2, [1, 6]], | ||||
|         [-6, []], | ||||
|     ], | ||||
| ) | ||||
| def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): | ||||
|     # Test single integer | ||||
|     result = slice_test_grouped._positional_selector[arg] | ||||
|     expected = slice_test_df.iloc[expected_rows] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_slice(slice_test_df, slice_test_grouped): | ||||
|     # Test single slice | ||||
|     result = slice_test_grouped._positional_selector[0:3:2] | ||||
|     expected = slice_test_df.iloc[[0, 1, 4, 5]] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "arg, expected_rows", | ||||
|     [ | ||||
|         [[0, 2], [0, 1, 4, 5]], | ||||
|         [[0, 2, -1], [0, 1, 3, 4, 5, 7]], | ||||
|         [range(0, 3, 2), [0, 1, 4, 5]], | ||||
|         [{0, 2}, [0, 1, 4, 5]], | ||||
|     ], | ||||
|     ids=[ | ||||
|         "list", | ||||
|         "negative", | ||||
|         "range", | ||||
|         "set", | ||||
|     ], | ||||
| ) | ||||
| def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): | ||||
|     # Test lists of integers and integer valued iterables | ||||
|     result = slice_test_grouped._positional_selector[arg] | ||||
|     expected = slice_test_df.iloc[expected_rows] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_ints(slice_test_df, slice_test_grouped): | ||||
|     # Test tuple of ints | ||||
|     result = slice_test_grouped._positional_selector[0, 2, -1] | ||||
|     expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_slices(slice_test_df, slice_test_grouped): | ||||
|     # Test tuple of slices | ||||
|     result = slice_test_grouped._positional_selector[:2, -2:] | ||||
|     expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_mix(slice_test_df, slice_test_grouped): | ||||
|     # Test mixed tuple of ints and slices | ||||
|     result = slice_test_grouped._positional_selector[0, 1, -2:] | ||||
|     expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "arg, expected_rows", | ||||
|     [ | ||||
|         [0, [0, 1, 4]], | ||||
|         [[0, 2, -1], [0, 1, 3, 4, 5, 7]], | ||||
|         [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]], | ||||
|     ], | ||||
| ) | ||||
| def test_as_index(slice_test_df, arg, expected_rows): | ||||
|     # Test the default as_index behaviour | ||||
|     result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg] | ||||
|     expected = slice_test_df.iloc[expected_rows] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_doc_examples(): | ||||
|     # Test the examples in the documentation | ||||
|     df = pd.DataFrame( | ||||
|         [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] | ||||
|     ) | ||||
|  | ||||
|     grouped = df.groupby("A", as_index=False) | ||||
|  | ||||
|     result = grouped._positional_selector[1:2] | ||||
|     expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = grouped._positional_selector[1, -1] | ||||
|     expected = pd.DataFrame( | ||||
|         [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4] | ||||
|     ) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.fixture() | ||||
| def multiindex_data(): | ||||
|     rng = np.random.default_rng(2) | ||||
|     ndates = 100 | ||||
|     nitems = 20 | ||||
|     dates = pd.date_range("20130101", periods=ndates, freq="D") | ||||
|     items = [f"item {i}" for i in range(nitems)] | ||||
|  | ||||
|     data = {} | ||||
|     for date in dates: | ||||
|         nitems_for_date = nitems - rng.integers(0, 12) | ||||
|         levels = [ | ||||
|             (item, rng.integers(0, 10000) / 100, rng.integers(0, 10000) / 100) | ||||
|             for item in items[:nitems_for_date] | ||||
|         ] | ||||
|         levels.sort(key=lambda x: x[1]) | ||||
|         data[date] = levels | ||||
|  | ||||
|     return data | ||||
|  | ||||
|  | ||||
| def _make_df_from_data(data): | ||||
|     rows = {} | ||||
|     for date in data: | ||||
|         for level in data[date]: | ||||
|             rows[(date, level[0])] = {"A": level[1], "B": level[2]} | ||||
|  | ||||
|     df = pd.DataFrame.from_dict(rows, orient="index") | ||||
|     df.index.names = ("Date", "Item") | ||||
|     return df | ||||
|  | ||||
|  | ||||
| def test_multiindex(multiindex_data): | ||||
|     # Test the multiindex mentioned as the use-case in the documentation | ||||
|     df = _make_df_from_data(multiindex_data) | ||||
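|     # nth accepts a slice: keep positions 3 up to (but excluding) the last | ||||
|     # three rows within each Date group | ||||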
|     result = df.groupby("Date", as_index=False).nth(slice(3, -3)) | ||||
|  | ||||
|     sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data} | ||||
|     expected = _make_df_from_data(sliced) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000]) | ||||
| @pytest.mark.parametrize("method", ["head", "tail"]) | ||||
| @pytest.mark.parametrize("simulated", [True, False]) | ||||
| def test_against_head_and_tail(arg, method, simulated): | ||||
|     # Test that this gives the same results as grouped head and tail | ||||
|     n_groups = 100 | ||||
|     n_rows_per_group = 30 | ||||
|  | ||||
|     data = { | ||||
|         "group": [ | ||||
|             f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups) | ||||
|         ], | ||||
|         "value": [ | ||||
|             f"group {g} row {j}" | ||||
|             for j in range(n_rows_per_group) | ||||
|             for g in range(n_groups) | ||||
|         ], | ||||
|     } | ||||
|     df = pd.DataFrame(data) | ||||
|     grouped = df.groupby("group", as_index=False) | ||||
|     size = arg if arg >= 0 else n_rows_per_group + arg | ||||
|  | ||||
|     if method == "head": | ||||
|         result = grouped._positional_selector[:arg] | ||||
|  | ||||
|         if simulated: | ||||
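|             # rows are interleaved group-major: row j of group i sits at | ||||
|             # global position j * n_groups + i, so these are the first | ||||
|             # `size` rows of every group | ||||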
|             indices = [ | ||||
|                 j * n_groups + i | ||||
|                 for j in range(size) | ||||
|                 for i in range(n_groups) | ||||
|                 if j * n_groups + i < n_groups * n_rows_per_group | ||||
|             ] | ||||
|             expected = df.iloc[indices] | ||||
|  | ||||
|         else: | ||||
|             expected = grouped.head(arg) | ||||
|  | ||||
|     else: | ||||
|         result = grouped._positional_selector[-arg:] | ||||
|  | ||||
|         if simulated: | ||||
|             indices = [ | ||||
|                 (n_rows_per_group + j - size) * n_groups + i | ||||
|                 for j in range(size) | ||||
|                 for i in range(n_groups) | ||||
|                 if (n_rows_per_group + j - size) * n_groups + i >= 0 | ||||
|             ] | ||||
|             expected = df.iloc[indices] | ||||
|  | ||||
|         else: | ||||
|             expected = grouped.tail(arg) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10]) | ||||
| @pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10]) | ||||
| @pytest.mark.parametrize("step", [None, 1, 5]) | ||||
| def test_against_df_iloc(start, stop, step): | ||||
|     # Test that a single group gives the same results as DataFrame.iloc | ||||
|     n_rows = 30 | ||||
|  | ||||
|     data = { | ||||
|         "group": ["group 0"] * n_rows, | ||||
|         "value": list(range(n_rows)), | ||||
|     } | ||||
|     df = pd.DataFrame(data) | ||||
|     grouped = df.groupby("group", as_index=False) | ||||
|  | ||||
|     result = grouped._positional_selector[start:stop:step] | ||||
|     expected = df.iloc[start:stop:step] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_series(): | ||||
|     # Test grouped Series | ||||
|     ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) | ||||
|     grouped = ser.groupby(level=0) | ||||
|     result = grouped._positional_selector[1:2] | ||||
|     expected = pd.Series([2, 5], index=["a", "b"]) | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("step", [1, 2, 3, 4, 5]) | ||||
| def test_step(step): | ||||
|     # Test slice with various step values | ||||
|     data = [["x", f"x{i}"] for i in range(5)] | ||||
|     data += [["y", f"y{i}"] for i in range(4)] | ||||
|     data += [["z", f"z{i}"] for i in range(3)] | ||||
|     df = pd.DataFrame(data, columns=["A", "B"]) | ||||
|  | ||||
|     grouped = df.groupby("A", as_index=False) | ||||
|  | ||||
|     result = grouped._positional_selector[::step] | ||||
|  | ||||
|     data = [["x", f"x{i}"] for i in range(0, 5, step)] | ||||
|     data += [["y", f"y{i}"] for i in range(0, 4, step)] | ||||
|     data += [["z", f"z{i}"] for i in range(0, 3, step)] | ||||
|  | ||||
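|     # groups x, y and z occupy absolute rows 0-4, 5-8 and 9-11, so offset | ||||
|     # each group's stepped positions by its first row | ||||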
|     index = [0 + i for i in range(0, 5, step)] | ||||
|     index += [5 + i for i in range(0, 4, step)] | ||||
|     index += [9 + i for i in range(0, 3, step)] | ||||
|  | ||||
|     expected = pd.DataFrame(data, columns=["A", "B"], index=index) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.fixture() | ||||
| def column_group_df(): | ||||
|     return pd.DataFrame( | ||||
|         [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]], | ||||
|         columns=["A", "B", "C", "D", "E", "F", "G"], | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def test_column_axis(column_group_df): | ||||
|     msg = "DataFrame.groupby with axis=1" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         g = column_group_df.groupby(column_group_df.iloc[1], axis=1) | ||||
|     result = g._positional_selector[1:-1] | ||||
|     expected = column_group_df.iloc[:, [1, 3]] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_columns_on_iter(): | ||||
|     # GitHub issue #44821 | ||||
|     df = pd.DataFrame({k: range(10) for k in "ABC"}) | ||||
|  | ||||
|     # Group-by and select columns | ||||
|     cols = ["A", "B"] | ||||
|     for _, dg in df.groupby(df.A < 4)[cols]: | ||||
|         tm.assert_index_equal(dg.columns, pd.Index(cols)) | ||||
|         assert "C" not in dg.columns | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array]) | ||||
| def test_groupby_duplicated_columns(func): | ||||
|     # GH#44924 | ||||
|     df = pd.DataFrame( | ||||
|         { | ||||
|             "A": [1, 2], | ||||
|             "B": [3, 3], | ||||
|             "C": ["G", "G"], | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("C")[func(["A", "B", "A"])].mean() | ||||
|     expected = pd.DataFrame( | ||||
|         [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C") | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_get_nonexisting_groups(): | ||||
|     # GH#32492 | ||||
|     df = pd.DataFrame( | ||||
|         data={ | ||||
|             "A": ["a1", "a2", None], | ||||
|             "B": ["b1", "b2", "b1"], | ||||
|             "val": [1, 2, 3], | ||||
|         } | ||||
|     ) | ||||
|     grps = df.groupby(by=["A", "B"]) | ||||
|  | ||||
|     msg = "('a2', 'b1')" | ||||
|     with pytest.raises(KeyError, match=msg): | ||||
|         grps.get_group(("a2", "b1")) | ||||
| @ -0,0 +1,331 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas._libs import groupby as libgroupby | ||||
| from pandas._libs.groupby import ( | ||||
|     group_cumprod, | ||||
|     group_cumsum, | ||||
|     group_mean, | ||||
|     group_sum, | ||||
|     group_var, | ||||
| ) | ||||
|  | ||||
| from pandas.core.dtypes.common import ensure_platform_int | ||||
|  | ||||
| from pandas import isna | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| class GroupVarTestMixin: | ||||
|     def test_group_var_generic_1d(self): | ||||
|         prng = np.random.default_rng(2) | ||||
|  | ||||
|         out = (np.nan * np.ones((5, 1))).astype(self.dtype) | ||||
|         counts = np.zeros(5, dtype="int64") | ||||
|         values = 10 * prng.random((15, 1)).astype(self.dtype) | ||||
|         labels = np.tile(np.arange(5), (3,)).astype("intp") | ||||
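|         # labels tile 0..4 three times over, so each of the 5 groups gets | ||||
|         # exactly 3 observations | ||||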
|  | ||||
|         expected_out = ( | ||||
|             np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2 | ||||
|         )[:, np.newaxis] | ||||
|         expected_counts = counts + 3 | ||||
|  | ||||
|         self.algo(out, counts, values, labels) | ||||
|         assert np.allclose(out, expected_out, self.rtol) | ||||
|         tm.assert_numpy_array_equal(counts, expected_counts) | ||||
|  | ||||
|     def test_group_var_generic_1d_flat_labels(self): | ||||
|         prng = np.random.default_rng(2) | ||||
|  | ||||
|         out = (np.nan * np.ones((1, 1))).astype(self.dtype) | ||||
|         counts = np.zeros(1, dtype="int64") | ||||
|         values = 10 * prng.random((5, 1)).astype(self.dtype) | ||||
|         labels = np.zeros(5, dtype="intp") | ||||
|  | ||||
|         expected_out = np.array([[values.std(ddof=1) ** 2]]) | ||||
|         expected_counts = counts + 5 | ||||
|  | ||||
|         self.algo(out, counts, values, labels) | ||||
|  | ||||
|         assert np.allclose(out, expected_out, self.rtol) | ||||
|         tm.assert_numpy_array_equal(counts, expected_counts) | ||||
|  | ||||
|     def test_group_var_generic_2d_all_finite(self): | ||||
|         prng = np.random.default_rng(2) | ||||
|  | ||||
|         out = (np.nan * np.ones((5, 2))).astype(self.dtype) | ||||
|         counts = np.zeros(5, dtype="int64") | ||||
|         values = 10 * prng.random((10, 2)).astype(self.dtype) | ||||
|         labels = np.tile(np.arange(5), (2,)).astype("intp") | ||||
|  | ||||
|         expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 | ||||
|         expected_counts = counts + 2 | ||||
|  | ||||
|         self.algo(out, counts, values, labels) | ||||
|         assert np.allclose(out, expected_out, self.rtol) | ||||
|         tm.assert_numpy_array_equal(counts, expected_counts) | ||||
|  | ||||
|     def test_group_var_generic_2d_some_nan(self): | ||||
|         prng = np.random.default_rng(2) | ||||
|  | ||||
|         out = (np.nan * np.ones((5, 2))).astype(self.dtype) | ||||
|         counts = np.zeros(5, dtype="int64") | ||||
|         values = 10 * prng.random((10, 2)).astype(self.dtype) | ||||
|         values[:, 1] = np.nan | ||||
|         labels = np.tile(np.arange(5), (2,)).astype("intp") | ||||
|  | ||||
|         expected_out = np.vstack( | ||||
|             [ | ||||
|                 values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2, | ||||
|                 np.nan * np.ones(5), | ||||
|             ] | ||||
|         ).T.astype(self.dtype) | ||||
|         expected_counts = counts + 2 | ||||
|  | ||||
|         self.algo(out, counts, values, labels) | ||||
|         tm.assert_almost_equal(out, expected_out, rtol=0.5e-06) | ||||
|         tm.assert_numpy_array_equal(counts, expected_counts) | ||||
|  | ||||
|     def test_group_var_constant(self): | ||||
|         # Regression test from GH 10448. | ||||
|  | ||||
|         out = np.array([[np.nan]], dtype=self.dtype) | ||||
|         counts = np.array([0], dtype="int64") | ||||
|         values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) | ||||
|         labels = np.zeros(3, dtype="intp") | ||||
|  | ||||
|         self.algo(out, counts, values, labels) | ||||
|  | ||||
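|         # a constant column has variance exactly 0; the >= 0 check guards | ||||
|         # against floating-point error producing a small negative variance | ||||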
|         assert counts[0] == 3 | ||||
|         assert out[0, 0] >= 0 | ||||
|         tm.assert_almost_equal(out[0, 0], 0.0) | ||||
|  | ||||
|  | ||||
| class TestGroupVarFloat64(GroupVarTestMixin): | ||||
|     __test__ = True | ||||
|  | ||||
|     algo = staticmethod(group_var) | ||||
|     dtype = np.float64 | ||||
|     rtol = 1e-5 | ||||
|  | ||||
|     def test_group_var_large_inputs(self): | ||||
|         prng = np.random.default_rng(2) | ||||
|  | ||||
|         out = np.array([[np.nan]], dtype=self.dtype) | ||||
|         counts = np.array([0], dtype="int64") | ||||
|         values = (prng.random(10**6) + 10**12).astype(self.dtype) | ||||
|         values.shape = (10**6, 1) | ||||
|         labels = np.zeros(10**6, dtype="intp") | ||||
|  | ||||
|         self.algo(out, counts, values, labels) | ||||
|  | ||||
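|         # Var(Uniform(0, 1)) == 1/12; the 10**12 offset makes this a numerical | ||||
|         # stability check, as a naive E[x^2] - E[x]^2 computation would lose | ||||
|         # all precision at this scale | ||||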
|         assert counts[0] == 10**6 | ||||
|         tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3) | ||||
|  | ||||
|  | ||||
| class TestGroupVarFloat32(GroupVarTestMixin): | ||||
|     __test__ = True | ||||
|  | ||||
|     algo = staticmethod(group_var) | ||||
|     dtype = np.float32 | ||||
|     rtol = 1e-2 | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dtype", ["float32", "float64"]) | ||||
| def test_group_ohlc(dtype): | ||||
|     obj = np.array(np.random.default_rng(2).standard_normal(20), dtype=dtype) | ||||
|  | ||||
|     bins = np.array([6, 12, 20]) | ||||
|     out = np.zeros((3, 4), dtype) | ||||
|     counts = np.zeros(len(out), dtype=np.int64) | ||||
|     labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) | ||||
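|     # labels == [0]*6 + [1]*6 + [2]*8, i.e. bin membership for each observation | ||||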
|  | ||||
|     func = libgroupby.group_ohlc | ||||
|     func(out, counts, obj[:, None], labels) | ||||
|  | ||||
|     def _ohlc(group): | ||||
|         if isna(group).all(): | ||||
|             return np.repeat(np.nan, 4) | ||||
|         return [group[0], group.max(), group.min(), group[-1]] | ||||
|  | ||||
|     expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])]) | ||||
|  | ||||
|     tm.assert_almost_equal(out, expected) | ||||
|     tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64)) | ||||
|  | ||||
|     obj[:6] = np.nan | ||||
|     func(out, counts, obj[:, None], labels) | ||||
|     expected[0] = np.nan | ||||
|     tm.assert_almost_equal(out, expected) | ||||
|  | ||||
|  | ||||
| def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): | ||||
|     """ | ||||
|     Check a group transform that executes a cumulative function. | ||||
|  | ||||
|     Parameters | ||||
|     ---------- | ||||
|     pd_op : callable | ||||
|         The pandas cumulative function. | ||||
|     np_op : callable | ||||
|         The analogous one in NumPy. | ||||
|     dtype : type | ||||
|         The specified dtype of the data. | ||||
|     """ | ||||
|     is_datetimelike = False | ||||
|  | ||||
|     data = np.array([[1], [2], [3], [4]], dtype=dtype) | ||||
|     answer = np.zeros_like(data) | ||||
|  | ||||
|     labels = np.array([0, 0, 0, 0], dtype=np.intp) | ||||
|     ngroups = 1 | ||||
|     pd_op(answer, data, labels, ngroups, is_datetimelike) | ||||
|  | ||||
|     tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False) | ||||
|  | ||||
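| # A minimal sketch of what the helper verifies (values taken from the tests | ||||
| # below, assuming a single group spanning all rows): | ||||
| # | ||||
| #     data   = np.array([[1], [2], [3], [4]], dtype=np.float64) | ||||
| #     labels = np.array([0, 0, 0, 0], dtype=np.intp) | ||||
| #     group_cumsum(out, data, labels, 1, False)  # out[:, 0] -> [1, 3, 6, 10] | ||||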
|  | ||||
| @pytest.mark.parametrize("np_dtype", ["int64", "uint64", "float32", "float64"]) | ||||
| def test_cython_group_transform_cumsum(np_dtype): | ||||
|     # see gh-4095 | ||||
|     dtype = np.dtype(np_dtype).type | ||||
|     pd_op, np_op = group_cumsum, np.cumsum | ||||
|     _check_cython_group_transform_cumulative(pd_op, np_op, dtype) | ||||
|  | ||||
|  | ||||
| def test_cython_group_transform_cumprod(): | ||||
|     # see gh-4095 | ||||
|     dtype = np.float64 | ||||
|     pd_op, np_op = group_cumprod, np.cumprod | ||||
|     _check_cython_group_transform_cumulative(pd_op, np_op, dtype) | ||||
|  | ||||
|  | ||||
| def test_cython_group_transform_algos(): | ||||
|     # see gh-4095 | ||||
|     is_datetimelike = False | ||||
|  | ||||
|     # with nans | ||||
|     labels = np.array([0, 0, 0, 0, 0], dtype=np.intp) | ||||
|     ngroups = 1 | ||||
|  | ||||
|     data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") | ||||
|     actual = np.zeros_like(data) | ||||
|     actual.fill(np.nan) | ||||
|     group_cumprod(actual, data, labels, ngroups, is_datetimelike) | ||||
|     expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") | ||||
|     tm.assert_numpy_array_equal(actual[:, 0], expected) | ||||
|  | ||||
|     actual = np.zeros_like(data) | ||||
|     actual.fill(np.nan) | ||||
|     group_cumsum(actual, data, labels, ngroups, is_datetimelike) | ||||
|     expected = np.array([1, 3, 6, np.nan, 10], dtype="float64") | ||||
|     tm.assert_numpy_array_equal(actual[:, 0], expected) | ||||
|  | ||||
|     # timedelta | ||||
|     is_datetimelike = True | ||||
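|     # timedelta values are handed to the kernel as their int64 nanosecond | ||||
|     # representation, hence the .view("int64") below | ||||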
|     data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None] | ||||
|     actual = np.zeros_like(data, dtype="int64") | ||||
|     group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) | ||||
|     expected = np.array( | ||||
|         [ | ||||
|             np.timedelta64(1, "ns"), | ||||
|             np.timedelta64(2, "ns"), | ||||
|             np.timedelta64(3, "ns"), | ||||
|             np.timedelta64(4, "ns"), | ||||
|             np.timedelta64(5, "ns"), | ||||
|         ] | ||||
|     ) | ||||
|     tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) | ||||
|  | ||||
|  | ||||
| def test_cython_group_mean_datetimelike(): | ||||
|     actual = np.zeros(shape=(1, 1), dtype="float64") | ||||
|     counts = np.array([0], dtype="int64") | ||||
|     data = ( | ||||
|         np.array( | ||||
|             [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")], | ||||
|             dtype="m8[ns]", | ||||
|         )[:, None] | ||||
|         .view("int64") | ||||
|         .astype("float64") | ||||
|     ) | ||||
|     labels = np.zeros(len(data), dtype=np.intp) | ||||
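|     # with is_datetimelike=True the NaT entry counts as missing, so the | ||||
|     # expected group mean is (2 + 4) / 2 == 3 | ||||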
|  | ||||
|     group_mean(actual, counts, data, labels, is_datetimelike=True) | ||||
|  | ||||
|     tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64")) | ||||
|  | ||||
|  | ||||
| def test_cython_group_mean_wrong_min_count(): | ||||
|     actual = np.zeros(shape=(1, 1), dtype="float64") | ||||
|     counts = np.zeros(1, dtype="int64") | ||||
|     data = np.zeros(1, dtype="float64")[:, None] | ||||
|     labels = np.zeros(1, dtype=np.intp) | ||||
|  | ||||
|     with pytest.raises(AssertionError, match="min_count"): | ||||
|         group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0) | ||||
|  | ||||
|  | ||||
| def test_cython_group_mean_not_datetimelike_but_has_NaT_values(): | ||||
|     actual = np.zeros(shape=(1, 1), dtype="float64") | ||||
|     counts = np.array([0], dtype="int64") | ||||
|     data = ( | ||||
|         np.array( | ||||
|             [np.timedelta64("NaT"), np.timedelta64("NaT")], | ||||
|             dtype="m8[ns]", | ||||
|         )[:, None] | ||||
|         .view("int64") | ||||
|         .astype("float64") | ||||
|     ) | ||||
|     labels = np.zeros(len(data), dtype=np.intp) | ||||
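|     # with is_datetimelike=False the NaT sentinel is not treated as missing; | ||||
|     # it is just a large negative float that gets averaged like any other value | ||||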
|  | ||||
|     group_mean(actual, counts, data, labels, is_datetimelike=False) | ||||
|  | ||||
|     tm.assert_numpy_array_equal( | ||||
|         actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64") | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def test_cython_group_mean_Inf_at_beginning_and_end(): | ||||
|     # GH 50367 | ||||
|     actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64") | ||||
|     counts = np.array([0, 0], dtype="int64") | ||||
|     data = np.array( | ||||
|         [[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]], | ||||
|         dtype="float64", | ||||
|     ) | ||||
|     labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp) | ||||
|  | ||||
|     group_mean(actual, counts, data, labels, is_datetimelike=False) | ||||
|  | ||||
|     expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64") | ||||
|  | ||||
|     tm.assert_numpy_array_equal( | ||||
|         actual, | ||||
|         expected, | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "values, out", | ||||
|     [ | ||||
|         ([[np.inf], [np.inf], [np.inf]], [[np.inf], [np.inf]]), | ||||
|         ([[np.inf], [np.inf], [-np.inf]], [[np.inf], [np.nan]]), | ||||
|         ([[np.inf], [-np.inf], [np.inf]], [[np.inf], [np.nan]]), | ||||
|         ([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]), | ||||
|     ], | ||||
| ) | ||||
| def test_cython_group_sum_Inf_at_beginning_and_end(values, out): | ||||
|     # GH #53606 | ||||
|     actual = np.array([[np.nan], [np.nan]], dtype="float64") | ||||
|     counts = np.array([0, 0], dtype="int64") | ||||
|     data = np.array(values, dtype="float64") | ||||
|     labels = np.array([0, 1, 1], dtype=np.intp) | ||||
|  | ||||
|     group_sum(actual, counts, data, labels, None, is_datetimelike=False) | ||||
|  | ||||
|     expected = np.array(out, dtype="float64") | ||||
|  | ||||
|     tm.assert_numpy_array_equal( | ||||
|         actual, | ||||
|         expected, | ||||
|     ) | ||||
| @ -0,0 +1,163 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     date_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("func", ["ffill", "bfill"]) | ||||
| def test_groupby_column_index_name_lost_fill_funcs(func): | ||||
|     # GH: 29764 groupby loses index sometimes | ||||
|     df = DataFrame( | ||||
|         [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]], | ||||
|         columns=Index(["type", "a", "b"], name="idx"), | ||||
|     ) | ||||
|     df_grouped = df.groupby(["type"])[["a", "b"]] | ||||
|     result = getattr(df_grouped, func)().columns | ||||
|     expected = Index(["a", "b"], name="idx") | ||||
|     tm.assert_index_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("func", ["ffill", "bfill"]) | ||||
| def test_groupby_fill_duplicate_column_names(func): | ||||
|     # GH: 25610 ValueError with duplicate column names | ||||
|     df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]}) | ||||
|     df2 = DataFrame({"field1": [1, np.nan, 4]}) | ||||
|     df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"]) | ||||
|     expected = DataFrame( | ||||
|         [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"] | ||||
|     ) | ||||
|     result = getattr(df_grouped, func)() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_ffill_missing_arguments(): | ||||
|     # GH 14955 | ||||
|     df = DataFrame({"a": [1, 2], "b": [1, 1]}) | ||||
|     msg = "DataFrameGroupBy.fillna is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         with pytest.raises(ValueError, match="Must specify a fill"): | ||||
|             df.groupby("b").fillna() | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])] | ||||
| ) | ||||
| def test_fillna_with_string_dtype(method, expected): | ||||
|     # GH 40250 | ||||
|     df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]}) | ||||
|     grp = df.groupby("b") | ||||
|     msg = "DataFrameGroupBy.fillna is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = grp.fillna(method=method) | ||||
|     expected = DataFrame({"a": pd.array(expected, dtype="string")}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_fill_consistency(): | ||||
|     # GH9221 | ||||
|     # keyword arguments passed through to the generated wrapper | ||||
|     # are only set when the passed kwarg is None | ||||
|     df = DataFrame( | ||||
|         index=pd.MultiIndex.from_product( | ||||
|             [["value1", "value2"], date_range("2014-01-01", "2014-01-06")] | ||||
|         ), | ||||
|         columns=Index(["1", "2"], name="id"), | ||||
|     ) | ||||
|     df["1"] = [ | ||||
|         np.nan, | ||||
|         1, | ||||
|         np.nan, | ||||
|         np.nan, | ||||
|         11, | ||||
|         np.nan, | ||||
|         np.nan, | ||||
|         2, | ||||
|         np.nan, | ||||
|         np.nan, | ||||
|         22, | ||||
|         np.nan, | ||||
|     ] | ||||
|     df["2"] = [ | ||||
|         np.nan, | ||||
|         3, | ||||
|         np.nan, | ||||
|         np.nan, | ||||
|         33, | ||||
|         np.nan, | ||||
|         np.nan, | ||||
|         4, | ||||
|         np.nan, | ||||
|         np.nan, | ||||
|         44, | ||||
|         np.nan, | ||||
|     ] | ||||
|  | ||||
|     msg = "The 'axis' keyword in DataFrame.groupby is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         expected = df.groupby(level=0, axis=0).fillna(method="ffill") | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["ffill", "bfill"]) | ||||
| @pytest.mark.parametrize("dropna", [True, False]) | ||||
| @pytest.mark.parametrize("has_nan_group", [True, False]) | ||||
| def test_ffill_handles_nan_groups(dropna, method, has_nan_group): | ||||
|     # GH 34725 | ||||
|  | ||||
|     df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)]) | ||||
|  | ||||
|     ridx = [-1, 0, -1, -1, 1, -1] | ||||
|     df = df_without_nan_rows.reindex(ridx).reset_index(drop=True) | ||||
|  | ||||
|     group_b = np.nan if has_nan_group else "b" | ||||
|     df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3) | ||||
|  | ||||
|     grouped = df.groupby(by="group_col", dropna=dropna) | ||||
|     result = getattr(grouped, method)(limit=None) | ||||
|  | ||||
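|     # keys are (method, dropna, has_nan_group); a -1 position reindexes to a | ||||
|     # missing label, i.e. a row that remains all-NaN | ||||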
|     expected_rows = { | ||||
|         ("ffill", True, True): [-1, 0, 0, -1, -1, -1], | ||||
|         ("ffill", True, False): [-1, 0, 0, -1, 1, 1], | ||||
|         ("ffill", False, True): [-1, 0, 0, -1, 1, 1], | ||||
|         ("ffill", False, False): [-1, 0, 0, -1, 1, 1], | ||||
|         ("bfill", True, True): [0, 0, -1, -1, -1, -1], | ||||
|         ("bfill", True, False): [0, 0, -1, 1, 1, -1], | ||||
|         ("bfill", False, True): [0, 0, -1, 1, 1, -1], | ||||
|         ("bfill", False, False): [0, 0, -1, 1, 1, -1], | ||||
|     } | ||||
|  | ||||
|     ridx = expected_rows.get((method, dropna, has_nan_group)) | ||||
|     expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True) | ||||
|     # columns are a 'take' on df.columns, which are object dtype | ||||
|     expected.columns = expected.columns.astype(object) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)]) | ||||
| @pytest.mark.parametrize("func", ["first", "last", "max", "min"]) | ||||
| def test_min_count(func, min_count, value): | ||||
|     # GH#37821 | ||||
|     df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3}) | ||||
|     result = getattr(df.groupby("a"), func)(min_count=min_count) | ||||
|     expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a")) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_indices_with_missing(): | ||||
|     # GH 9304 | ||||
|     df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]}) | ||||
|     g = df.groupby(["a", "b"]) | ||||
|     result = g.indices | ||||
|     expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])} | ||||
|     assert result == expected | ||||
| @ -0,0 +1,89 @@ | ||||
| import pytest | ||||
|  | ||||
| from pandas.compat import is_platform_arm | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Series, | ||||
|     option_context, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.util.version import Version | ||||
|  | ||||
| pytestmark = [pytest.mark.single_cpu] | ||||
|  | ||||
| numba = pytest.importorskip("numba") | ||||
| pytestmark.append( | ||||
|     pytest.mark.skipif( | ||||
|         Version(numba.__version__) == Version("0.61") and is_platform_arm(), | ||||
|         reason=f"Segfaults on ARM platforms with numba {numba.__version__}", | ||||
|     ) | ||||
| ) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings("ignore") | ||||
| # Filter warnings when parallel=True and the function can't be parallelized by Numba | ||||
| class TestEngine: | ||||
|     def test_cython_vs_numba_frame( | ||||
|         self, sort, nogil, parallel, nopython, numba_supported_reductions | ||||
|     ): | ||||
|         func, kwargs = numba_supported_reductions | ||||
|         df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) | ||||
|         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} | ||||
|         gb = df.groupby("a", sort=sort) | ||||
|         result = getattr(gb, func)( | ||||
|             engine="numba", engine_kwargs=engine_kwargs, **kwargs | ||||
|         ) | ||||
|         expected = getattr(gb, func)(**kwargs) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_cython_vs_numba_getitem( | ||||
|         self, sort, nogil, parallel, nopython, numba_supported_reductions | ||||
|     ): | ||||
|         func, kwargs = numba_supported_reductions | ||||
|         df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) | ||||
|         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} | ||||
|         gb = df.groupby("a", sort=sort)["c"] | ||||
|         result = getattr(gb, func)( | ||||
|             engine="numba", engine_kwargs=engine_kwargs, **kwargs | ||||
|         ) | ||||
|         expected = getattr(gb, func)(**kwargs) | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     def test_cython_vs_numba_series( | ||||
|         self, sort, nogil, parallel, nopython, numba_supported_reductions | ||||
|     ): | ||||
|         func, kwargs = numba_supported_reductions | ||||
|         ser = Series(range(3), index=[1, 2, 1], name="foo") | ||||
|         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} | ||||
|         gb = ser.groupby(level=0, sort=sort) | ||||
|         result = getattr(gb, func)( | ||||
|             engine="numba", engine_kwargs=engine_kwargs, **kwargs | ||||
|         ) | ||||
|         expected = getattr(gb, func)(**kwargs) | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     def test_as_index_false_unsupported(self, numba_supported_reductions): | ||||
|         func, kwargs = numba_supported_reductions | ||||
|         df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) | ||||
|         gb = df.groupby("a", as_index=False) | ||||
|         with pytest.raises(NotImplementedError, match="as_index=False"): | ||||
|             getattr(gb, func)(engine="numba", **kwargs) | ||||
|  | ||||
|     def test_axis_1_unsupported(self, numba_supported_reductions): | ||||
|         func, kwargs = numba_supported_reductions | ||||
|         df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) | ||||
|         gb = df.groupby("a", axis=1) | ||||
|         with pytest.raises(NotImplementedError, match="axis=1"): | ||||
|             getattr(gb, func)(engine="numba", **kwargs) | ||||
|  | ||||
|     def test_no_engine_doesnt_raise(self): | ||||
|         # GH55520 | ||||
|         df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) | ||||
|         gb = df.groupby("a") | ||||
|         # Make sure behavior of functions w/out engine argument don't raise | ||||
|         # when the global use_numba option is set | ||||
|         with option_context("compute.use_numba", True): | ||||
|             res = gb.agg({"b": "first"}) | ||||
|         expected = gb.agg({"b": "first"}) | ||||
|         tm.assert_frame_equal(res, expected) | ||||
| @ -0,0 +1,532 @@ | ||||
| import re | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas._libs import lib | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
|     Timestamp, | ||||
|     date_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.tests.groupby import get_groupby_method_args | ||||
|  | ||||
|  | ||||
| class TestNumericOnly: | ||||
|     # make sure that we are passing through kwargs to our agg functions | ||||
|  | ||||
|     @pytest.fixture | ||||
|     def df(self): | ||||
|         # GH3668 | ||||
|         # GH5724 | ||||
|         df = DataFrame( | ||||
|             { | ||||
|                 "group": [1, 1, 2], | ||||
|                 "int": [1, 2, 3], | ||||
|                 "float": [4.0, 5.0, 6.0], | ||||
|                 "string": Series(["a", "b", "c"], dtype="str"), | ||||
|                 "object": Series(["a", "b", "c"], dtype=object), | ||||
|                 "category_string": Series(list("abc")).astype("category"), | ||||
|                 "category_int": [7, 8, 9], | ||||
|                 "datetime": date_range("20130101", periods=3), | ||||
|                 "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"), | ||||
|                 "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), | ||||
|             }, | ||||
|             columns=[ | ||||
|                 "group", | ||||
|                 "int", | ||||
|                 "float", | ||||
|                 "string", | ||||
|                 "object", | ||||
|                 "category_string", | ||||
|                 "category_int", | ||||
|                 "datetime", | ||||
|                 "datetimetz", | ||||
|                 "timedelta", | ||||
|             ], | ||||
|         ) | ||||
|         return df | ||||
|  | ||||
|     @pytest.mark.parametrize("method", ["mean", "median"]) | ||||
|     def test_averages(self, df, method): | ||||
|         # mean / median | ||||
|         expected_columns_numeric = Index(["int", "float", "category_int"]) | ||||
|  | ||||
|         gb = df.groupby("group") | ||||
|         expected = DataFrame( | ||||
|             { | ||||
|                 "category_int": [7.5, 9], | ||||
|                 "float": [4.5, 6.0], | ||||
|                 "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")], | ||||
|                 "int": [1.5, 3], | ||||
|                 "datetime": [ | ||||
|                     Timestamp("2013-01-01 12:00:00"), | ||||
|                     Timestamp("2013-01-03 00:00:00"), | ||||
|                 ], | ||||
|                 "datetimetz": [ | ||||
|                     Timestamp("2013-01-01 12:00:00", tz="US/Eastern"), | ||||
|                     Timestamp("2013-01-03 00:00:00", tz="US/Eastern"), | ||||
|                 ], | ||||
|             }, | ||||
|             index=Index([1, 2], name="group"), | ||||
|             columns=[ | ||||
|                 "int", | ||||
|                 "float", | ||||
|                 "category_int", | ||||
|             ], | ||||
|         ) | ||||
|  | ||||
|         result = getattr(gb, method)(numeric_only=True) | ||||
|         tm.assert_frame_equal(result.reindex_like(expected), expected) | ||||
|  | ||||
|         expected_columns = expected.columns | ||||
|  | ||||
|         self._check(df, method, expected_columns, expected_columns_numeric) | ||||
|  | ||||
|     @pytest.mark.parametrize("method", ["min", "max"]) | ||||
|     def test_extrema(self, df, method): | ||||
|         # TODO: min, max *should* handle | ||||
|         # categorical (ordered) dtype | ||||
|  | ||||
|         expected_columns = Index( | ||||
|             [ | ||||
|                 "int", | ||||
|                 "float", | ||||
|                 "string", | ||||
|                 "category_int", | ||||
|                 "datetime", | ||||
|                 "datetimetz", | ||||
|                 "timedelta", | ||||
|             ] | ||||
|         ) | ||||
|         expected_columns_numeric = expected_columns | ||||
|  | ||||
|         self._check(df, method, expected_columns, expected_columns_numeric) | ||||
|  | ||||
|     @pytest.mark.parametrize("method", ["first", "last"]) | ||||
|     def test_first_last(self, df, method): | ||||
|         expected_columns = Index( | ||||
|             [ | ||||
|                 "int", | ||||
|                 "float", | ||||
|                 "string", | ||||
|                 "object", | ||||
|                 "category_string", | ||||
|                 "category_int", | ||||
|                 "datetime", | ||||
|                 "datetimetz", | ||||
|                 "timedelta", | ||||
|             ] | ||||
|         ) | ||||
|         expected_columns_numeric = expected_columns | ||||
|  | ||||
|         self._check(df, method, expected_columns, expected_columns_numeric) | ||||
|  | ||||
|     @pytest.mark.parametrize("method", ["sum", "cumsum"]) | ||||
|     def test_sum_cumsum(self, df, method): | ||||
|         expected_columns_numeric = Index(["int", "float", "category_int"]) | ||||
|         expected_columns = Index( | ||||
|             ["int", "float", "string", "category_int", "timedelta"] | ||||
|         ) | ||||
|         if method == "cumsum": | ||||
|             # cumsum loses string | ||||
|             expected_columns = Index(["int", "float", "category_int", "timedelta"]) | ||||
|  | ||||
|         self._check(df, method, expected_columns, expected_columns_numeric) | ||||
|  | ||||
|     @pytest.mark.parametrize("method", ["prod", "cumprod"]) | ||||
|     def test_prod_cumprod(self, df, method): | ||||
|         expected_columns = Index(["int", "float", "category_int"]) | ||||
|         expected_columns_numeric = expected_columns | ||||
|  | ||||
|         self._check(df, method, expected_columns, expected_columns_numeric) | ||||
|  | ||||
|     @pytest.mark.parametrize("method", ["cummin", "cummax"]) | ||||
|     def test_cummin_cummax(self, df, method): | ||||
|         # like min, max, but don't include strings | ||||
|         expected_columns = Index( | ||||
|             ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"] | ||||
|         ) | ||||
|  | ||||
|         # GH#15561: numeric_only=False set by default like min/max | ||||
|         expected_columns_numeric = expected_columns | ||||
|  | ||||
|         self._check(df, method, expected_columns, expected_columns_numeric) | ||||
|  | ||||
|     def _check(self, df, method, expected_columns, expected_columns_numeric): | ||||
|         gb = df.groupby("group") | ||||
|  | ||||
|         # object dtypes for transformations are not implemented in Cython and | ||||
|         # have no Python fallback | ||||
|         exception = ( | ||||
|             (NotImplementedError, TypeError) if method.startswith("cum") else TypeError | ||||
|         ) | ||||
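|         # e.g. a transform like gb.cumsum() on the object column has no Cython | ||||
|         # kernel ("function is not implemented for this dtype") and no Python | ||||
|         # fallback, while the reductions below fail with TypeError instead | ||||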
|  | ||||
|         if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): | ||||
|             # The methods default to numeric_only=False and raise TypeError | ||||
|             msg = "|".join( | ||||
|                 [ | ||||
|                     "Categorical is not ordered", | ||||
|                     f"Cannot perform {method} with non-ordered Categorical", | ||||
|                     re.escape(f"agg function failed [how->{method},dtype->object]"), | ||||
|                     # cumsum/cummin/cummax/cumprod | ||||
|                     "function is not implemented for this dtype", | ||||
|                     f"dtype 'str' does not support operation '{method}'", | ||||
|                 ] | ||||
|             ) | ||||
|             with pytest.raises(exception, match=msg): | ||||
|                 getattr(gb, method)() | ||||
|         elif method in ("sum", "mean", "median", "prod"): | ||||
|             msg = "|".join( | ||||
|                 [ | ||||
|                     "category type does not support sum operations", | ||||
|                     re.escape(f"agg function failed [how->{method},dtype->object]"), | ||||
|                     re.escape(f"agg function failed [how->{method},dtype->string]"), | ||||
|                     f"dtype 'str' does not support operation '{method}'", | ||||
|                 ] | ||||
|             ) | ||||
|             with pytest.raises(exception, match=msg): | ||||
|                 getattr(gb, method)() | ||||
|         else: | ||||
|             result = getattr(gb, method)() | ||||
|             tm.assert_index_equal(result.columns, expected_columns_numeric) | ||||
|  | ||||
|         if method not in ("first", "last"): | ||||
|             msg = "|".join( | ||||
|                 [ | ||||
|                     "Categorical is not ordered", | ||||
|                     "category type does not support", | ||||
|                     "function is not implemented for this dtype", | ||||
|                     f"Cannot perform {method} with non-ordered Categorical", | ||||
|                     re.escape(f"agg function failed [how->{method},dtype->object]"), | ||||
|                     re.escape(f"agg function failed [how->{method},dtype->string]"), | ||||
|                     f"dtype 'str' does not support operation '{method}'", | ||||
|                 ] | ||||
|             ) | ||||
|             with pytest.raises(exception, match=msg): | ||||
|                 getattr(gb, method)(numeric_only=False) | ||||
|         else: | ||||
|             result = getattr(gb, method)(numeric_only=False) | ||||
|             tm.assert_index_equal(result.columns, expected_columns) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("numeric_only", [True, False, None]) | ||||
| def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string): | ||||
|     if groupby_func in ("idxmax", "idxmin"): | ||||
|         pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") | ||||
|     if groupby_func in ("corrwith", "skew"): | ||||
|         msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1" | ||||
|         request.applymarker(pytest.mark.xfail(reason=msg)) | ||||
|  | ||||
|     df = DataFrame( | ||||
|         np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] | ||||
|     ) | ||||
|     df["E"] = "x" | ||||
|     groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] | ||||
|     gb = df.groupby(groups) | ||||
|     method = getattr(gb, groupby_func) | ||||
|     args = get_groupby_method_args(groupby_func, df) | ||||
|     kwargs = {"axis": 1} | ||||
|     if numeric_only is not None: | ||||
|         # when numeric_only is None we don't pass any argument | ||||
|         kwargs["numeric_only"] = numeric_only | ||||
|  | ||||
|     # Functions without a numeric_only argument | ||||
|     no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift") | ||||
|     # Functions with axis args | ||||
|     has_axis = ( | ||||
|         "cumprod", | ||||
|         "cumsum", | ||||
|         "diff", | ||||
|         "pct_change", | ||||
|         "rank", | ||||
|         "shift", | ||||
|         "cummax", | ||||
|         "cummin", | ||||
|         "idxmin", | ||||
|         "idxmax", | ||||
|         "fillna", | ||||
|     ) | ||||
|     warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated" | ||||
|     if numeric_only is not None and groupby_func in no_args: | ||||
|         msg = "got an unexpected keyword argument 'numeric_only'" | ||||
|         if groupby_func in ["cumprod", "cumsum"]: | ||||
|             with pytest.raises(TypeError, match=msg): | ||||
|                 with tm.assert_produces_warning(FutureWarning, match=warn_msg): | ||||
|                     method(*args, **kwargs) | ||||
|         else: | ||||
|             with pytest.raises(TypeError, match=msg): | ||||
|                 method(*args, **kwargs) | ||||
|     elif groupby_func not in has_axis: | ||||
|         msg = "got an unexpected keyword argument 'axis'" | ||||
|         with pytest.raises(TypeError, match=msg): | ||||
|             method(*args, **kwargs) | ||||
|     # fillna and shift are successful even on object dtypes | ||||
|     elif (numeric_only is None or not numeric_only) and groupby_func not in ( | ||||
|         "fillna", | ||||
|         "shift", | ||||
|     ): | ||||
|         msgs = ( | ||||
|             # cummax, cummin, rank | ||||
|             "not supported between instances of", | ||||
|             # cumprod | ||||
|             "can't multiply sequence by non-int of type 'float'", | ||||
|             # cumsum, diff, pct_change | ||||
|             "unsupported operand type", | ||||
|             "has no kernel", | ||||
|             "operation 'sub' not supported for dtype 'str' with dtype 'float64'", | ||||
|         ) | ||||
|         if using_infer_string: | ||||
|             pa = pytest.importorskip("pyarrow") | ||||
|  | ||||
|             errs = (TypeError, pa.lib.ArrowNotImplementedError) | ||||
|         else: | ||||
|             errs = TypeError | ||||
|         with pytest.raises(errs, match=f"({'|'.join(msgs)})"): | ||||
|             with tm.assert_produces_warning(FutureWarning, match=warn_msg): | ||||
|                 method(*args, **kwargs) | ||||
|     else: | ||||
|         with tm.assert_produces_warning(FutureWarning, match=warn_msg): | ||||
|             result = method(*args, **kwargs) | ||||
|  | ||||
|         df_expected = df.drop(columns="E").T if numeric_only else df.T | ||||
|         expected = getattr(df_expected, groupby_func)(*args).T | ||||
|         if groupby_func == "shift" and not numeric_only: | ||||
|             # shift with axis=1 leaves the leftmost column as numeric | ||||
|             # but transposing for expected gives us object dtype | ||||
|             expected = expected.astype(float) | ||||
|  | ||||
|         tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "kernel, has_arg", | ||||
|     [ | ||||
|         ("all", False), | ||||
|         ("any", False), | ||||
|         ("bfill", False), | ||||
|         ("corr", True), | ||||
|         ("corrwith", True), | ||||
|         ("cov", True), | ||||
|         ("cummax", True), | ||||
|         ("cummin", True), | ||||
|         ("cumprod", True), | ||||
|         ("cumsum", True), | ||||
|         ("diff", False), | ||||
|         ("ffill", False), | ||||
|         ("fillna", False), | ||||
|         ("first", True), | ||||
|         ("idxmax", True), | ||||
|         ("idxmin", True), | ||||
|         ("last", True), | ||||
|         ("max", True), | ||||
|         ("mean", True), | ||||
|         ("median", True), | ||||
|         ("min", True), | ||||
|         ("nth", False), | ||||
|         ("nunique", False), | ||||
|         ("pct_change", False), | ||||
|         ("prod", True), | ||||
|         ("quantile", True), | ||||
|         ("sem", True), | ||||
|         ("skew", True), | ||||
|         ("std", True), | ||||
|         ("sum", True), | ||||
|         ("var", True), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) | ||||
| @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) | ||||
| def test_numeric_only(kernel, has_arg, numeric_only, keys): | ||||
|     # GH#46072 | ||||
|     # has_arg: Whether the op has a numeric_only arg | ||||
|     df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]}) | ||||
|  | ||||
|     args = get_groupby_method_args(kernel, df) | ||||
|     kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only} | ||||
|  | ||||
|     gb = df.groupby(keys) | ||||
|     method = getattr(gb, kernel) | ||||
|     if has_arg and numeric_only is True: | ||||
|         # Cases where b does not appear in the result | ||||
|         result = method(*args, **kwargs) | ||||
|         assert "b" not in result.columns | ||||
|     elif ( | ||||
|         # kernels that work on any dtype and have numeric_only arg | ||||
|         kernel in ("first", "last") | ||||
|         or ( | ||||
|             # kernels that work on any dtype and don't have numeric_only arg | ||||
|             kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") | ||||
|             and numeric_only is lib.no_default | ||||
|         ) | ||||
|     ): | ||||
|         warn = FutureWarning if kernel == "fillna" else None | ||||
|         msg = "DataFrameGroupBy.fillna is deprecated" | ||||
|         with tm.assert_produces_warning(warn, match=msg): | ||||
|             result = method(*args, **kwargs) | ||||
|         assert "b" in result.columns | ||||
|     elif has_arg: | ||||
|         assert numeric_only is not True | ||||
|         # kernels that are successful on any dtype were above; this will fail | ||||
|  | ||||
|         # object dtypes for transformations are not implemented in Cython and | ||||
|         # have no Python fallback | ||||
|         exception = NotImplementedError if kernel.startswith("cum") else TypeError | ||||
|  | ||||
|         msg = "|".join( | ||||
|             [ | ||||
|                 "not allowed for this dtype", | ||||
|                 "cannot be performed against 'object' dtypes", | ||||
|                 # On PY39 message is "a number"; on PY310 and after is "a real number" | ||||
|                 "must be a string or a.* number", | ||||
|                 "unsupported operand type", | ||||
|                 "function is not implemented for this dtype", | ||||
|                 re.escape(f"agg function failed [how->{kernel},dtype->object]"), | ||||
|             ] | ||||
|         ) | ||||
|         if kernel == "quantile": | ||||
|             msg = "dtype 'object' does not support operation 'quantile'" | ||||
|         elif kernel == "idxmin": | ||||
|             msg = "'<' not supported between instances of 'type' and 'type'" | ||||
|         elif kernel == "idxmax": | ||||
|             msg = "'>' not supported between instances of 'type' and 'type'" | ||||
|         with pytest.raises(exception, match=msg): | ||||
|             method(*args, **kwargs) | ||||
|     elif not has_arg and numeric_only is not lib.no_default: | ||||
|         with pytest.raises( | ||||
|             TypeError, match="got an unexpected keyword argument 'numeric_only'" | ||||
|         ): | ||||
|             method(*args, **kwargs) | ||||
|     else: | ||||
|         assert kernel in ("diff", "pct_change") | ||||
|         assert numeric_only is lib.no_default | ||||
|         # Doesn't have numeric_only argument and fails on nuisance columns | ||||
|         with pytest.raises(TypeError, match=r"unsupported operand type"): | ||||
|             method(*args, **kwargs) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") | ||||
| @pytest.mark.parametrize("dtype", [bool, int, float, object]) | ||||
| def test_deprecate_numeric_only_series(dtype, groupby_func, request): | ||||
|     # GH#46560 | ||||
|     grouper = [0, 0, 1] | ||||
|  | ||||
|     ser = Series([1, 0, 0], dtype=dtype) | ||||
|     gb = ser.groupby(grouper) | ||||
|  | ||||
|     if groupby_func == "corrwith": | ||||
|         # corrwith is not implemented on SeriesGroupBy | ||||
|         assert not hasattr(gb, groupby_func) | ||||
|         return | ||||
|  | ||||
|     method = getattr(gb, groupby_func) | ||||
|  | ||||
|     expected_ser = Series([1, 0, 0]) | ||||
|     expected_gb = expected_ser.groupby(grouper) | ||||
|     expected_method = getattr(expected_gb, groupby_func) | ||||
|  | ||||
|     args = get_groupby_method_args(groupby_func, ser) | ||||
|  | ||||
|     fails_on_numeric_object = ( | ||||
|         "corr", | ||||
|         "cov", | ||||
|         "cummax", | ||||
|         "cummin", | ||||
|         "cumprod", | ||||
|         "cumsum", | ||||
|         "quantile", | ||||
|     ) | ||||
|     # ops that give an object result on object input | ||||
|     obj_result = ( | ||||
|         "first", | ||||
|         "last", | ||||
|         "nth", | ||||
|         "bfill", | ||||
|         "ffill", | ||||
|         "shift", | ||||
|         "sum", | ||||
|         "diff", | ||||
|         "pct_change", | ||||
|         "var", | ||||
|         "mean", | ||||
|         "median", | ||||
|         "min", | ||||
|         "max", | ||||
|         "prod", | ||||
|         "skew", | ||||
|     ) | ||||
|  | ||||
|     # Test default behavior; kernels that fail may be enabled in the future but kernels | ||||
|     # that succeed should not be allowed to fail (without deprecation, at least) | ||||
|     if groupby_func in fails_on_numeric_object and dtype is object: | ||||
|         if groupby_func == "quantile": | ||||
|             msg = "dtype 'object' does not support operation 'quantile'" | ||||
|         else: | ||||
|             msg = "is not supported for object dtype" | ||||
|         warn = FutureWarning if groupby_func == "fillna" else None | ||||
|         warn_msg = "DataFrameGroupBy.fillna is deprecated" | ||||
|         with tm.assert_produces_warning(warn, match=warn_msg): | ||||
|             with pytest.raises(TypeError, match=msg): | ||||
|                 method(*args) | ||||
|     elif dtype is object: | ||||
|         warn = FutureWarning if groupby_func == "fillna" else None | ||||
|         warn_msg = "SeriesGroupBy.fillna is deprecated" | ||||
|         with tm.assert_produces_warning(warn, match=warn_msg): | ||||
|             result = method(*args) | ||||
|         with tm.assert_produces_warning(warn, match=warn_msg): | ||||
|             expected = expected_method(*args) | ||||
|         if groupby_func in obj_result: | ||||
|             expected = expected.astype(object) | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     has_numeric_only = ( | ||||
|         "first", | ||||
|         "last", | ||||
|         "max", | ||||
|         "mean", | ||||
|         "median", | ||||
|         "min", | ||||
|         "prod", | ||||
|         "quantile", | ||||
|         "sem", | ||||
|         "skew", | ||||
|         "std", | ||||
|         "sum", | ||||
|         "var", | ||||
|         "cummax", | ||||
|         "cummin", | ||||
|         "cumprod", | ||||
|         "cumsum", | ||||
|     ) | ||||
|     if groupby_func not in has_numeric_only: | ||||
|         msg = "got an unexpected keyword argument 'numeric_only'" | ||||
|         with pytest.raises(TypeError, match=msg): | ||||
|             method(*args, numeric_only=True) | ||||
|     elif dtype is object: | ||||
|         msg = "|".join( | ||||
|             [ | ||||
|                 "SeriesGroupBy.sem called with numeric_only=True and dtype object", | ||||
|                 "Series.skew does not allow numeric_only=True with non-numeric", | ||||
|                 "cum(sum|prod|min|max) is not supported for object dtype", | ||||
|                 r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric", | ||||
|             ] | ||||
|         ) | ||||
|         with pytest.raises(TypeError, match=msg): | ||||
|             method(*args, numeric_only=True) | ||||
|     elif dtype == bool and groupby_func == "quantile": | ||||
|         msg = "Allowing bool dtype in SeriesGroupBy.quantile" | ||||
|         with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|             # GH#51424 | ||||
|             result = method(*args, numeric_only=True) | ||||
|             expected = method(*args, numeric_only=False) | ||||
|         tm.assert_series_equal(result, expected) | ||||
|     else: | ||||
|         result = method(*args, numeric_only=True) | ||||
|         expected = method(*args, numeric_only=False) | ||||
|         tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,80 @@ | ||||
| import numpy as np | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_pipe(): | ||||
|     # Test the pipe method of DataFrameGroupBy. | ||||
|     # Issue #17871 | ||||
|  | ||||
|     random_state = np.random.default_rng(2) | ||||
|  | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": random_state.standard_normal(8), | ||||
|             "C": random_state.standard_normal(8), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     def f(dfgb): | ||||
|         return dfgb.B.max() - dfgb.C.min().min() | ||||
|  | ||||
|     def square(srs): | ||||
|         return srs**2 | ||||
|  | ||||
|     # Note that the transformations are | ||||
|     # GroupBy -> Series | ||||
|     # Series -> Series | ||||
|     # This then chains the GroupBy.pipe and the | ||||
|     # NDFrame.pipe methods | ||||
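|     # i.e. (sketch): df.groupby("A").pipe(f) is equivalent to f(df.groupby("A")), | ||||
|     # and the resulting Series' .pipe(square) is equivalent to square(f(...)) | ||||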
|     result = df.groupby("A").pipe(f).pipe(square) | ||||
|  | ||||
|     index = Index(["bar", "foo"], name="A") | ||||
|     expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) | ||||
|  | ||||
|     tm.assert_series_equal(expected, result) | ||||
|  | ||||
|  | ||||
| def test_pipe_args(): | ||||
|     # Test passing args to the pipe method of DataFrameGroupBy. | ||||
|     # Issue #17871 | ||||
|  | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "group": ["A", "A", "B", "B", "C"], | ||||
|             "x": [1.0, 2.0, 3.0, 2.0, 5.0], | ||||
|             "y": [10.0, 100.0, 1000.0, -100.0, -1000.0], | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     def f(dfgb, arg1): | ||||
|         filtered = dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False) | ||||
|         return filtered.groupby("group") | ||||
|  | ||||
|     def g(dfgb, arg2): | ||||
|         return dfgb.sum() / dfgb.sum().sum() + arg2 | ||||
|  | ||||
|     def h(df, arg3): | ||||
|         return df.x + df.y - arg3 | ||||
|  | ||||
|     result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100) | ||||
|  | ||||
|     # Assert the results here | ||||
|     index = Index(["A", "B"], name="group") | ||||
|     expected = pd.Series([-79.5160891089, -78.4839108911], index=index) | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     # test SeriesGroupby.pipe | ||||
|     ser = pd.Series([1, 1, 2, 2, 3, 3]) | ||||
|     result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) | ||||
|  | ||||
|     expected = pd.Series([4, 8, 12], index=Index([1, 2, 3], dtype=np.int64)) | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
lib/python3.11/site-packages/pandas/tests/groupby/test_raises.py (new file, 757 lines)
							| @ -0,0 +1,757 @@ | ||||
| # Only tests that raise an error and have no better location should go here. | ||||
| # Tests for specific groupby methods should go in their respective | ||||
| # test file. | ||||
|  | ||||
| import datetime | ||||
| import re | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     Categorical, | ||||
|     DataFrame, | ||||
|     Grouper, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.tests.groupby import get_groupby_method_args | ||||
|  | ||||
|  | ||||
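| # The `by` fixture exercises every supported grouper type: column label(s), | ||||
| # a Grouper, a callable, explicit list/array/dict keys, and Series key(s). | ||||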
| @pytest.fixture( | ||||
|     params=[ | ||||
|         "a", | ||||
|         ["a"], | ||||
|         ["a", "b"], | ||||
|         Grouper(key="a"), | ||||
|         lambda x: x % 2, | ||||
|         [0, 0, 0, 1, 2, 2, 2, 3, 3], | ||||
|         np.array([0, 0, 0, 1, 2, 2, 2, 3, 3]), | ||||
|         dict(zip(range(9), [0, 0, 0, 1, 2, 2, 2, 3, 3])), | ||||
|         Series([1, 1, 1, 1, 1, 2, 2, 2, 2]), | ||||
|         [Series([1, 1, 1, 1, 1, 2, 2, 2, 2]), Series([3, 3, 4, 4, 4, 4, 4, 3, 3])], | ||||
|     ] | ||||
| ) | ||||
| def by(request): | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture(params=[True, False]) | ||||
| def groupby_series(request): | ||||
|     return request.param | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def df_with_string_col(): | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [1, 1, 1, 1, 1, 2, 2, 2, 2], | ||||
|             "b": [3, 3, 4, 4, 4, 4, 4, 3, 3], | ||||
|             "c": range(9), | ||||
|             "d": list("xyzwtyuio"), | ||||
|         } | ||||
|     ) | ||||
|     return df | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def df_with_datetime_col(): | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [1, 1, 1, 1, 1, 2, 2, 2, 2], | ||||
|             "b": [3, 3, 4, 4, 4, 4, 4, 3, 3], | ||||
|             "c": range(9), | ||||
|             "d": datetime.datetime(2005, 1, 1, 10, 30, 23, 540000), | ||||
|         } | ||||
|     ) | ||||
|     return df | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def df_with_timedelta_col(): | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [1, 1, 1, 1, 1, 2, 2, 2, 2], | ||||
|             "b": [3, 3, 4, 4, 4, 4, 4, 3, 3], | ||||
|             "c": range(9), | ||||
|             "d": datetime.timedelta(days=1), | ||||
|         } | ||||
|     ) | ||||
|     return df | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def df_with_cat_col(): | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [1, 1, 1, 1, 1, 2, 2, 2, 2], | ||||
|             "b": [3, 3, 4, 4, 4, 4, 4, 3, 3], | ||||
|             "c": range(9), | ||||
|             "d": Categorical( | ||||
|                 ["a", "a", "a", "a", "b", "b", "b", "b", "c"], | ||||
|                 categories=["a", "b", "c", "d"], | ||||
|                 ordered=True, | ||||
|             ), | ||||
|         } | ||||
|     ) | ||||
|     return df | ||||
|  | ||||
|  | ||||
| def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): | ||||
|     warn_klass = None if warn_msg == "" else FutureWarning | ||||
|     with tm.assert_produces_warning(warn_klass, match=warn_msg): | ||||
|         if klass is None: | ||||
|             if how == "method": | ||||
|                 getattr(gb, groupby_func)(*args) | ||||
|             elif how == "agg": | ||||
|                 gb.agg(groupby_func, *args) | ||||
|             else: | ||||
|                 gb.transform(groupby_func, *args) | ||||
|         else: | ||||
|             with pytest.raises(klass, match=msg): | ||||
|                 if how == "method": | ||||
|                     getattr(gb, groupby_func)(*args) | ||||
|                 elif how == "agg": | ||||
|                     gb.agg(groupby_func, *args) | ||||
|                 else: | ||||
|                     gb.transform(groupby_func, *args) | ||||
|  | ||||
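| # Usage sketch: _call_and_check(TypeError, "msg", "agg", gb, "mean", ()) | ||||
| # asserts that gb.agg("mean") raises TypeError matching "msg"; passing | ||||
| # klass=None instead asserts the call completes without raising. | ||||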
|  | ||||
| @pytest.mark.parametrize("how", ["method", "agg", "transform"]) | ||||
| def test_groupby_raises_string( | ||||
|     how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string | ||||
| ): | ||||
|     df = df_with_string_col | ||||
|     args = get_groupby_method_args(groupby_func, df) | ||||
|     gb = df.groupby(by=by) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
|         if groupby_func == "corrwith": | ||||
|             assert not hasattr(gb, "corrwith") | ||||
|             return | ||||
|  | ||||
|     klass, msg = { | ||||
|         "all": (None, ""), | ||||
|         "any": (None, ""), | ||||
|         "bfill": (None, ""), | ||||
|         "corrwith": (TypeError, "Could not convert"), | ||||
|         "count": (None, ""), | ||||
|         "cumcount": (None, ""), | ||||
|         "cummax": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(function|cummax) is not (implemented|supported) for (this|object) dtype", | ||||
|         ), | ||||
|         "cummin": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(function|cummin) is not (implemented|supported) for (this|object) dtype", | ||||
|         ), | ||||
|         "cumprod": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(function|cumprod) is not (implemented|supported) for (this|object) dtype", | ||||
|         ), | ||||
|         "cumsum": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(function|cumsum) is not (implemented|supported) for (this|object) dtype", | ||||
|         ), | ||||
|         "diff": (TypeError, "unsupported operand type"), | ||||
|         "ffill": (None, ""), | ||||
|         "fillna": (None, ""), | ||||
|         "first": (None, ""), | ||||
|         "idxmax": (None, ""), | ||||
|         "idxmin": (None, ""), | ||||
|         "last": (None, ""), | ||||
|         "max": (None, ""), | ||||
|         "mean": ( | ||||
|             TypeError, | ||||
|             re.escape("agg function failed [how->mean,dtype->object]"), | ||||
|         ), | ||||
|         "median": ( | ||||
|             TypeError, | ||||
|             re.escape("agg function failed [how->median,dtype->object]"), | ||||
|         ), | ||||
|         "min": (None, ""), | ||||
|         "ngroup": (None, ""), | ||||
|         "nunique": (None, ""), | ||||
|         "pct_change": (TypeError, "unsupported operand type"), | ||||
|         "prod": ( | ||||
|             TypeError, | ||||
|             re.escape("agg function failed [how->prod,dtype->object]"), | ||||
|         ), | ||||
|         "quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"), | ||||
|         "rank": (None, ""), | ||||
|         "sem": (ValueError, "could not convert string to float"), | ||||
|         "shift": (None, ""), | ||||
|         "size": (None, ""), | ||||
|         "skew": (ValueError, "could not convert string to float"), | ||||
|         "std": (ValueError, "could not convert string to float"), | ||||
|         "sum": (None, ""), | ||||
|         "var": ( | ||||
|             TypeError, | ||||
|             re.escape("agg function failed [how->var,dtype->"), | ||||
|         ), | ||||
|     }[groupby_func] | ||||
|  | ||||
|     if using_infer_string: | ||||
|         if groupby_func in [ | ||||
|             "prod", | ||||
|             "mean", | ||||
|             "median", | ||||
|             "cumsum", | ||||
|             "cumprod", | ||||
|             "std", | ||||
|             "sem", | ||||
|             "var", | ||||
|             "skew", | ||||
|             "quantile", | ||||
|         ]: | ||||
|             msg = f"dtype 'str' does not support operation '{groupby_func}'" | ||||
|             if groupby_func in ["sem", "std", "skew"]: | ||||
|                 # The object-dtype raises ValueError when trying to convert to numeric. | ||||
|                 klass = TypeError | ||||
|         elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": | ||||
|             # This doesn't go through EA._groupby_op so the message isn't controlled | ||||
|             #  there. | ||||
|             msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'" | ||||
|         elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow": | ||||
|             # This doesn't go through EA._groupby_op so the message isn't controlled | ||||
|             #  there. | ||||
|             msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'" | ||||
|  | ||||
|         elif groupby_func in ["cummin", "cummax"]: | ||||
|             msg = msg.replace("object", "str") | ||||
|         elif groupby_func == "corrwith": | ||||
|             msg = "Cannot perform reduction 'mean' with string dtype" | ||||
|  | ||||
|     if groupby_func == "fillna": | ||||
|         kind = "Series" if groupby_series else "DataFrame" | ||||
|         warn_msg = f"{kind}GroupBy.fillna is deprecated" | ||||
|     else: | ||||
|         warn_msg = "" | ||||
|     _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("how", ["agg", "transform"]) | ||||
| def test_groupby_raises_string_udf(how, by, groupby_series, df_with_string_col): | ||||
|     df = df_with_string_col | ||||
|     gb = df.groupby(by=by) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
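|     # A UDF that raises should have its exception propagated unchanged. | ||||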
|     def func(x): | ||||
|         raise TypeError("Test error message") | ||||
|  | ||||
|     with pytest.raises(TypeError, match="Test error message"): | ||||
|         getattr(gb, how)(func) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("how", ["agg", "transform"]) | ||||
| @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) | ||||
| def test_groupby_raises_string_np( | ||||
|     how, | ||||
|     by, | ||||
|     groupby_series, | ||||
|     groupby_func_np, | ||||
|     df_with_string_col, | ||||
|     using_infer_string, | ||||
| ): | ||||
|     # GH#50749 | ||||
|     df = df_with_string_col | ||||
|     gb = df.groupby(by=by) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
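|     # sum of strings concatenates and succeeds; mean is undefined for strings | ||||
|     #  and raises. | ||||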
|     klass, msg = { | ||||
|         np.sum: (None, ""), | ||||
|         np.mean: ( | ||||
|             TypeError, | ||||
|             "agg function failed|Cannot perform reduction 'mean' with string dtype", | ||||
|         ), | ||||
|     }[groupby_func_np] | ||||
|  | ||||
|     if using_infer_string: | ||||
|         if groupby_func_np is np.mean: | ||||
|             klass = TypeError | ||||
|         msg = "dtype 'str' does not support operation 'mean'" | ||||
|  | ||||
|     if groupby_series: | ||||
|         warn_msg = "using SeriesGroupBy.[sum|mean]" | ||||
|     else: | ||||
|         warn_msg = "using DataFrameGroupBy.[sum|mean]" | ||||
|     _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("how", ["method", "agg", "transform"]) | ||||
| def test_groupby_raises_datetime( | ||||
|     how, by, groupby_series, groupby_func, df_with_datetime_col | ||||
| ): | ||||
|     df = df_with_datetime_col | ||||
|     args = get_groupby_method_args(groupby_func, df) | ||||
|     gb = df.groupby(by=by) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
|         if groupby_func == "corrwith": | ||||
|             assert not hasattr(gb, "corrwith") | ||||
|             return | ||||
|  | ||||
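|     # Expected exception (if any) for each groupby method on a datetime64 column. | ||||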
|     klass, msg = { | ||||
|         "all": (None, ""), | ||||
|         "any": (None, ""), | ||||
|         "bfill": (None, ""), | ||||
|         "corrwith": (TypeError, "cannot perform __mul__ with this index type"), | ||||
|         "count": (None, ""), | ||||
|         "cumcount": (None, ""), | ||||
|         "cummax": (None, ""), | ||||
|         "cummin": (None, ""), | ||||
|         "cumprod": (TypeError, "datetime64 type does not support cumprod operations"), | ||||
|         "cumsum": (TypeError, "datetime64 type does not support cumsum operations"), | ||||
|         "diff": (None, ""), | ||||
|         "ffill": (None, ""), | ||||
|         "fillna": (None, ""), | ||||
|         "first": (None, ""), | ||||
|         "idxmax": (None, ""), | ||||
|         "idxmin": (None, ""), | ||||
|         "last": (None, ""), | ||||
|         "max": (None, ""), | ||||
|         "mean": (None, ""), | ||||
|         "median": (None, ""), | ||||
|         "min": (None, ""), | ||||
|         "ngroup": (None, ""), | ||||
|         "nunique": (None, ""), | ||||
|         "pct_change": (TypeError, "cannot perform __truediv__ with this index type"), | ||||
|         "prod": (TypeError, "datetime64 type does not support prod"), | ||||
|         "quantile": (None, ""), | ||||
|         "rank": (None, ""), | ||||
|         "sem": (None, ""), | ||||
|         "shift": (None, ""), | ||||
|         "size": (None, ""), | ||||
|         "skew": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     r"dtype datetime64\[ns\] does not support reduction", | ||||
|                     "datetime64 type does not support skew operations", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|         "std": (None, ""), | ||||
|         "sum": (TypeError, "datetime64 type does not support sum operations"), | ||||
|         "var": (TypeError, "datetime64 type does not support var operations"), | ||||
|     }[groupby_func] | ||||
|  | ||||
|     if groupby_func in ["any", "all"]: | ||||
|         warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" | ||||
|     elif groupby_func == "fillna": | ||||
|         kind = "Series" if groupby_series else "DataFrame" | ||||
|         warn_msg = f"{kind}GroupBy.fillna is deprecated" | ||||
|     else: | ||||
|         warn_msg = "" | ||||
|     _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("how", ["agg", "transform"]) | ||||
| def test_groupby_raises_datetime_udf(how, by, groupby_series, df_with_datetime_col): | ||||
|     df = df_with_datetime_col | ||||
|     gb = df.groupby(by=by) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
|     def func(x): | ||||
|         raise TypeError("Test error message") | ||||
|  | ||||
|     with pytest.raises(TypeError, match="Test error message"): | ||||
|         getattr(gb, how)(func) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("how", ["agg", "transform"]) | ||||
| @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) | ||||
| def test_groupby_raises_datetime_np( | ||||
|     how, by, groupby_series, groupby_func_np, df_with_datetime_col | ||||
| ): | ||||
|     # GH#50749 | ||||
|     df = df_with_datetime_col | ||||
|     gb = df.groupby(by=by) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
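|     # sum is undefined for datetime64 (adding timestamps is meaningless), | ||||
|     #  while mean is well-defined. | ||||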
|     klass, msg = { | ||||
|         np.sum: (TypeError, "datetime64 type does not support sum operations"), | ||||
|         np.mean: (None, ""), | ||||
|     }[groupby_func_np] | ||||
|  | ||||
|     if groupby_series: | ||||
|         warn_msg = "using SeriesGroupBy.[sum|mean]" | ||||
|     else: | ||||
|         warn_msg = "using DataFrameGroupBy.[sum|mean]" | ||||
|     _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) | ||||
| def test_groupby_raises_timedelta(func, df_with_timedelta_col): | ||||
|     df = df_with_timedelta_col | ||||
|     gb = df.groupby(by="a") | ||||
|  | ||||
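|     # prod, cumprod, skew and var are undefined for timedelta64 and should raise. | ||||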
|     _call_and_check( | ||||
|         TypeError, | ||||
|         "timedelta64 type does not support .* operations", | ||||
|         "method", | ||||
|         gb, | ||||
|         func, | ||||
|         [], | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("how", ["method", "agg", "transform"]) | ||||
| def test_groupby_raises_category( | ||||
|     how, by, groupby_series, groupby_func, using_copy_on_write, df_with_cat_col | ||||
| ): | ||||
|     # GH#50749 | ||||
|     df = df_with_cat_col | ||||
|     args = get_groupby_method_args(groupby_func, df) | ||||
|     gb = df.groupby(by=by) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
|         if groupby_func == "corrwith": | ||||
|             assert not hasattr(gb, "corrwith") | ||||
|             return | ||||
|  | ||||
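|     # Expected exception (if any) for each groupby method on a Categorical column. | ||||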
|     klass, msg = { | ||||
|         "all": (None, ""), | ||||
|         "any": (None, ""), | ||||
|         "bfill": (None, ""), | ||||
|         "corrwith": ( | ||||
|             TypeError, | ||||
|             r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'", | ||||
|         ), | ||||
|         "count": (None, ""), | ||||
|         "cumcount": (None, ""), | ||||
|         "cummax": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(category type does not support cummax operations|" | ||||
|             "category dtype not supported|" | ||||
|             "cummax is not supported for category dtype)", | ||||
|         ), | ||||
|         "cummin": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(category type does not support cummin operations|" | ||||
|             "category dtype not supported|" | ||||
|             "cummin is not supported for category dtype)", | ||||
|         ), | ||||
|         "cumprod": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(category type does not support cumprod operations|" | ||||
|             "category dtype not supported|" | ||||
|             "cumprod is not supported for category dtype)", | ||||
|         ), | ||||
|         "cumsum": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(category type does not support cumsum operations|" | ||||
|             "category dtype not supported|" | ||||
|             "cumsum is not supported for category dtype)", | ||||
|         ), | ||||
|         "diff": ( | ||||
|             TypeError, | ||||
|             r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'", | ||||
|         ), | ||||
|         "ffill": (None, ""), | ||||
|         "fillna": ( | ||||
|             TypeError, | ||||
|             r"Cannot setitem on a Categorical with a new category \(0\), " | ||||
|             "set the categories first", | ||||
|         ) | ||||
|         if not using_copy_on_write | ||||
|         else (None, ""),  # no-op with CoW | ||||
|         "first": (None, ""), | ||||
|         "idxmax": (None, ""), | ||||
|         "idxmin": (None, ""), | ||||
|         "last": (None, ""), | ||||
|         "max": (None, ""), | ||||
|         "mean": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "'Categorical' .* does not support reduction 'mean'", | ||||
|                     "category dtype does not support aggregation 'mean'", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|         "median": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "'Categorical' .* does not support reduction 'median'", | ||||
|                     "category dtype does not support aggregation 'median'", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|         "min": (None, ""), | ||||
|         "ngroup": (None, ""), | ||||
|         "nunique": (None, ""), | ||||
|         "pct_change": ( | ||||
|             TypeError, | ||||
|             r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'", | ||||
|         ), | ||||
|         "prod": (TypeError, "category type does not support prod operations"), | ||||
|         "quantile": (TypeError, "No matching signature found"), | ||||
|         "rank": (None, ""), | ||||
|         "sem": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "'Categorical' .* does not support reduction 'sem'", | ||||
|                     "category dtype does not support aggregation 'sem'", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|         "shift": (None, ""), | ||||
|         "size": (None, ""), | ||||
|         "skew": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "dtype category does not support reduction 'skew'", | ||||
|                     "category type does not support skew operations", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|         "std": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "'Categorical' .* does not support reduction 'std'", | ||||
|                     "category dtype does not support aggregation 'std'", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|         "sum": (TypeError, "category type does not support sum operations"), | ||||
|         "var": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "'Categorical' .* does not support reduction 'var'", | ||||
|                     "category dtype does not support aggregation 'var'", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|     }[groupby_func] | ||||
|  | ||||
|     if groupby_func == "fillna": | ||||
|         kind = "Series" if groupby_series else "DataFrame" | ||||
|         warn_msg = f"{kind}GroupBy.fillna is deprecated" | ||||
|     else: | ||||
|         warn_msg = "" | ||||
|     _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("how", ["agg", "transform"]) | ||||
| def test_groupby_raises_category_udf(how, by, groupby_series, df_with_cat_col): | ||||
|     # GH#50749 | ||||
|     df = df_with_cat_col | ||||
|     gb = df.groupby(by=by) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
|     def func(x): | ||||
|         raise TypeError("Test error message") | ||||
|  | ||||
|     with pytest.raises(TypeError, match="Test error message"): | ||||
|         getattr(gb, how)(func) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("how", ["agg", "transform"]) | ||||
| @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) | ||||
| def test_groupby_raises_category_np( | ||||
|     how, by, groupby_series, groupby_func_np, df_with_cat_col | ||||
| ): | ||||
|     # GH#50749 | ||||
|     df = df_with_cat_col | ||||
|     gb = df.groupby(by=by) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
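|     # Categorical supports neither sum nor mean reductions. | ||||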
|     klass, msg = { | ||||
|         np.sum: (TypeError, "category type does not support sum operations"), | ||||
|         np.mean: ( | ||||
|             TypeError, | ||||
|             "category dtype does not support aggregation 'mean'", | ||||
|         ), | ||||
|     }[groupby_func_np] | ||||
|  | ||||
|     if groupby_series: | ||||
|         warn_msg = "using SeriesGroupBy.[sum|mean]" | ||||
|     else: | ||||
|         warn_msg = "using DataFrameGroupBy.[sum|mean]" | ||||
|     _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("how", ["method", "agg", "transform"]) | ||||
| def test_groupby_raises_category_on_category( | ||||
|     how, | ||||
|     by, | ||||
|     groupby_series, | ||||
|     groupby_func, | ||||
|     observed, | ||||
|     using_copy_on_write, | ||||
|     df_with_cat_col, | ||||
| ): | ||||
|     # GH#50749 | ||||
|     df = df_with_cat_col | ||||
|     df["a"] = Categorical( | ||||
|         ["a", "a", "a", "a", "b", "b", "b", "b", "c"], | ||||
|         categories=["a", "b", "c", "d"], | ||||
|         ordered=True, | ||||
|     ) | ||||
|     args = get_groupby_method_args(groupby_func, df) | ||||
|     gb = df.groupby(by=by, observed=observed) | ||||
|  | ||||
|     if groupby_series: | ||||
|         gb = gb["d"] | ||||
|  | ||||
|         if groupby_func == "corrwith": | ||||
|             assert not hasattr(gb, "corrwith") | ||||
|             return | ||||
|  | ||||
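|     # With observed=False, unobserved categories can yield empty groups, which | ||||
|     #  makes idxmax/idxmin raise below. | ||||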
|     empty_groups = not observed and any(group.empty for group in gb.groups.values()) | ||||
|     if ( | ||||
|         not observed | ||||
|         and how != "transform" | ||||
|         and isinstance(by, list) | ||||
|         and isinstance(by[0], str) | ||||
|         and by == ["a", "b"] | ||||
|     ): | ||||
|         assert not empty_groups | ||||
|         # TODO: empty_groups should be true due to unobserved categorical combinations | ||||
|         empty_groups = True | ||||
|     if how == "transform": | ||||
|         # empty groups will be ignored | ||||
|         empty_groups = False | ||||
|  | ||||
|     klass, msg = { | ||||
|         "all": (None, ""), | ||||
|         "any": (None, ""), | ||||
|         "bfill": (None, ""), | ||||
|         "corrwith": ( | ||||
|             TypeError, | ||||
|             r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'", | ||||
|         ), | ||||
|         "count": (None, ""), | ||||
|         "cumcount": (None, ""), | ||||
|         "cummax": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(cummax is not supported for category dtype|" | ||||
|             "category dtype not supported|" | ||||
|             "category type does not support cummax operations)", | ||||
|         ), | ||||
|         "cummin": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(cummin is not supported for category dtype|" | ||||
|             "category dtype not supported|" | ||||
|             "category type does not support cummin operations)", | ||||
|         ), | ||||
|         "cumprod": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(cumprod is not supported for category dtype|" | ||||
|             "category dtype not supported|" | ||||
|             "category type does not support cumprod operations)", | ||||
|         ), | ||||
|         "cumsum": ( | ||||
|             (NotImplementedError, TypeError), | ||||
|             "(cumsum is not supported for category dtype|" | ||||
|             "category dtype not supported|" | ||||
|             "category type does not support cumsum operations)", | ||||
|         ), | ||||
|         "diff": (TypeError, "unsupported operand type"), | ||||
|         "ffill": (None, ""), | ||||
|         "fillna": ( | ||||
|             TypeError, | ||||
|             r"Cannot setitem on a Categorical with a new category \(0\), " | ||||
|             "set the categories first", | ||||
|         ) | ||||
|         if not using_copy_on_write | ||||
|         else (None, ""),  # no-op with CoW | ||||
|         "first": (None, ""), | ||||
|         "idxmax": (ValueError, "empty group due to unobserved categories") | ||||
|         if empty_groups | ||||
|         else (None, ""), | ||||
|         "idxmin": (ValueError, "empty group due to unobserved categories") | ||||
|         if empty_groups | ||||
|         else (None, ""), | ||||
|         "last": (None, ""), | ||||
|         "max": (None, ""), | ||||
|         "mean": (TypeError, "category dtype does not support aggregation 'mean'"), | ||||
|         "median": (TypeError, "category dtype does not support aggregation 'median'"), | ||||
|         "min": (None, ""), | ||||
|         "ngroup": (None, ""), | ||||
|         "nunique": (None, ""), | ||||
|         "pct_change": (TypeError, "unsupported operand type"), | ||||
|         "prod": (TypeError, "category type does not support prod operations"), | ||||
|         "quantile": (TypeError, "No matching signature found"), | ||||
|         "rank": (None, ""), | ||||
|         "sem": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "'Categorical' .* does not support reduction 'sem'", | ||||
|                     "category dtype does not support aggregation 'sem'", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|         "shift": (None, ""), | ||||
|         "size": (None, ""), | ||||
|         "skew": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "category type does not support skew operations", | ||||
|                     "dtype category does not support reduction 'skew'", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|         "std": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "'Categorical' .* does not support reduction 'std'", | ||||
|                     "category dtype does not support aggregation 'std'", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|         "sum": (TypeError, "category type does not support sum operations"), | ||||
|         "var": ( | ||||
|             TypeError, | ||||
|             "|".join( | ||||
|                 [ | ||||
|                     "'Categorical' .* does not support reduction 'var'", | ||||
|                     "category dtype does not support aggregation 'var'", | ||||
|                 ] | ||||
|             ), | ||||
|         ), | ||||
|     }[groupby_func] | ||||
|  | ||||
|     if groupby_func == "fillna": | ||||
|         kind = "Series" if groupby_series else "DataFrame" | ||||
|         warn_msg = f"{kind}GroupBy.fillna is deprecated" | ||||
|     else: | ||||
|         warn_msg = "" | ||||
|     _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) | ||||
|  | ||||
|  | ||||
| def test_subsetting_columns_axis_1_raises(): | ||||
|     # GH 35443 | ||||
|     df = DataFrame({"a": [1], "b": [2], "c": [3]}) | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby("a", axis=1) | ||||
|     with pytest.raises(ValueError, match="Cannot subset columns when using axis=1"): | ||||
|         gb["b"] | ||||
							
								
								
									
								1277
								lib/python3.11/site-packages/pandas/tests/groupby/test_reductions.py
								Normal file
								File diff suppressed because it is too large
								Load Diff
							| @ -0,0 +1,968 @@ | ||||
| """ | ||||
| test with the TimeGrouper / grouping with datetimes | ||||
| """ | ||||
| from datetime import ( | ||||
|     datetime, | ||||
|     timedelta, | ||||
| ) | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
| import pytz | ||||
|  | ||||
| from pandas._config import using_string_dtype | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     DatetimeIndex, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     Timestamp, | ||||
|     date_range, | ||||
|     offsets, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.core.groupby.grouper import Grouper | ||||
| from pandas.core.groupby.ops import BinGrouper | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def frame_for_truncated_bingrouper(): | ||||
|     """ | ||||
|     DataFrame used by groupby_with_truncated_bingrouper, made into | ||||
|     a separate fixture for easier reuse in | ||||
|     test_groupby_apply_timegrouper_with_nat_apply_squeeze | ||||
|     """ | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "Quantity": [18, 3, 5, 1, 9, 3], | ||||
|             "Date": [ | ||||
|                 Timestamp(2013, 9, 1, 13, 0), | ||||
|                 Timestamp(2013, 9, 1, 13, 5), | ||||
|                 Timestamp(2013, 10, 1, 20, 0), | ||||
|                 Timestamp(2013, 10, 3, 10, 0), | ||||
|                 pd.NaT, | ||||
|                 Timestamp(2013, 9, 2, 14, 0), | ||||
|             ], | ||||
|         } | ||||
|     ) | ||||
|     return df | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): | ||||
|     """ | ||||
|     GroupBy object such that gb._grouper is a BinGrouper and | ||||
|     len(gb._grouper.result_index) < len(gb._grouper.group_keys_seq) | ||||
|  | ||||
|     Aggregations on this groupby should have | ||||
|  | ||||
|         dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date") | ||||
|  | ||||
|     as either the index or an index level. | ||||
|     """ | ||||
|     df = frame_for_truncated_bingrouper | ||||
|  | ||||
|     tdg = Grouper(key="Date", freq="5D") | ||||
|     gb = df.groupby(tdg) | ||||
|  | ||||
|     # check we're testing the case we're interested in | ||||
|     assert len(gb._grouper.result_index) != len(gb._grouper.group_keys_seq) | ||||
|  | ||||
|     return gb | ||||
|  | ||||
|  | ||||
| class TestGroupBy: | ||||
|     # TODO(infer_string) resample sum introduces 0's | ||||
|     # https://github.com/pandas-dev/pandas/issues/60229 | ||||
|     @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") | ||||
|     def test_groupby_with_timegrouper(self): | ||||
|         # GH 4161 | ||||
|         # TimeGrouper requires a sorted index | ||||
|         # also verifies that the resultant index has the correct name | ||||
|         df_original = DataFrame( | ||||
|             { | ||||
|                 "Buyer": "Carl Carl Carl Carl Joe Carl".split(), | ||||
|                 "Quantity": [18, 3, 5, 1, 9, 3], | ||||
|                 "Date": [ | ||||
|                     datetime(2013, 9, 1, 13, 0), | ||||
|                     datetime(2013, 9, 1, 13, 5), | ||||
|                     datetime(2013, 10, 1, 20, 0), | ||||
|                     datetime(2013, 10, 3, 10, 0), | ||||
|                     datetime(2013, 12, 2, 12, 0), | ||||
|                     datetime(2013, 9, 2, 14, 0), | ||||
|                 ], | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|         # GH 6908 change target column's order | ||||
|         df_reordered = df_original.sort_values(by="Quantity") | ||||
|  | ||||
|         for df in [df_original, df_reordered]: | ||||
|             df = df.set_index(["Date"]) | ||||
|  | ||||
|             exp_dti = date_range( | ||||
|                 "20130901", | ||||
|                 "20131205", | ||||
|                 freq="5D", | ||||
|                 name="Date", | ||||
|                 inclusive="left", | ||||
|                 unit=df.index.unit, | ||||
|             ) | ||||
|             expected = DataFrame( | ||||
|                 {"Buyer": 0, "Quantity": 0}, | ||||
|                 index=exp_dti, | ||||
|             ) | ||||
|             # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" | ||||
|             expected = expected.astype({"Buyer": object}) | ||||
|             expected.iloc[0, 0] = "CarlCarlCarl" | ||||
|             expected.iloc[6, 0] = "CarlCarl" | ||||
|             expected.iloc[18, 0] = "Joe" | ||||
|             expected.iloc[[0, 6, 18], 1] = np.array([24, 6, 9], dtype="int64") | ||||
|  | ||||
|             result1 = df.resample("5D").sum() | ||||
|             tm.assert_frame_equal(result1, expected) | ||||
|  | ||||
|             df_sorted = df.sort_index() | ||||
|             result2 = df_sorted.groupby(Grouper(freq="5D")).sum() | ||||
|             tm.assert_frame_equal(result2, expected) | ||||
|  | ||||
|             result3 = df.groupby(Grouper(freq="5D")).sum() | ||||
|             tm.assert_frame_equal(result3, expected) | ||||
|  | ||||
|     @pytest.mark.parametrize("should_sort", [True, False]) | ||||
|     def test_groupby_with_timegrouper_methods(self, should_sort): | ||||
|         # GH 3881 | ||||
|         # make sure API of timegrouper conforms | ||||
|  | ||||
|         df = DataFrame( | ||||
|             { | ||||
|                 "Branch": "A A A A A B".split(), | ||||
|                 "Buyer": "Carl Mark Carl Joe Joe Carl".split(), | ||||
|                 "Quantity": [1, 3, 5, 8, 9, 3], | ||||
|                 "Date": [ | ||||
|                     datetime(2013, 1, 1, 13, 0), | ||||
|                     datetime(2013, 1, 1, 13, 5), | ||||
|                     datetime(2013, 10, 1, 20, 0), | ||||
|                     datetime(2013, 10, 2, 10, 0), | ||||
|                     datetime(2013, 12, 2, 12, 0), | ||||
|                     datetime(2013, 12, 2, 14, 0), | ||||
|                 ], | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|         if should_sort: | ||||
|             df = df.sort_values(by="Quantity", ascending=False) | ||||
|  | ||||
|         df = df.set_index("Date", drop=False) | ||||
|         g = df.groupby(Grouper(freq="6ME")) | ||||
|         assert g.group_keys | ||||
|  | ||||
|         assert isinstance(g._grouper, BinGrouper) | ||||
|         groups = g.groups | ||||
|         assert isinstance(groups, dict) | ||||
|         assert len(groups) == 3 | ||||
|  | ||||
|     def test_timegrouper_with_reg_groups(self): | ||||
|         # GH 3794 | ||||
|         # allow combination of timegrouper/reg groups | ||||
|  | ||||
|         df_original = DataFrame( | ||||
|             { | ||||
|                 "Branch": "A A A A A A A B".split(), | ||||
|                 "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), | ||||
|                 "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], | ||||
|                 "Date": [ | ||||
|                     datetime(2013, 1, 1, 13, 0), | ||||
|                     datetime(2013, 1, 1, 13, 5), | ||||
|                     datetime(2013, 10, 1, 20, 0), | ||||
|                     datetime(2013, 10, 2, 10, 0), | ||||
|                     datetime(2013, 10, 1, 20, 0), | ||||
|                     datetime(2013, 10, 2, 10, 0), | ||||
|                     datetime(2013, 12, 2, 12, 0), | ||||
|                     datetime(2013, 12, 2, 14, 0), | ||||
|                 ], | ||||
|             } | ||||
|         ).set_index("Date") | ||||
|  | ||||
|         df_sorted = df_original.sort_values(by="Quantity", ascending=False) | ||||
|  | ||||
|         for df in [df_original, df_sorted]: | ||||
|             expected = DataFrame( | ||||
|                 { | ||||
|                     "Buyer": "Carl Joe Mark".split(), | ||||
|                     "Quantity": [10, 18, 3], | ||||
|                     "Date": [ | ||||
|                         datetime(2013, 12, 31, 0, 0), | ||||
|                         datetime(2013, 12, 31, 0, 0), | ||||
|                         datetime(2013, 12, 31, 0, 0), | ||||
|                     ], | ||||
|                 } | ||||
|             ).set_index(["Date", "Buyer"]) | ||||
|  | ||||
|             result = df.groupby([Grouper(freq="YE"), "Buyer"]).sum(numeric_only=True) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|             expected = DataFrame( | ||||
|                 { | ||||
|                     "Buyer": "Carl Mark Carl Joe".split(), | ||||
|                     "Quantity": [1, 3, 9, 18], | ||||
|                     "Date": [ | ||||
|                         datetime(2013, 1, 1, 0, 0), | ||||
|                         datetime(2013, 1, 1, 0, 0), | ||||
|                         datetime(2013, 7, 1, 0, 0), | ||||
|                         datetime(2013, 7, 1, 0, 0), | ||||
|                     ], | ||||
|                 } | ||||
|             ).set_index(["Date", "Buyer"]) | ||||
|             result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         df_original = DataFrame( | ||||
|             { | ||||
|                 "Branch": "A A A A A A A B".split(), | ||||
|                 "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), | ||||
|                 "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], | ||||
|                 "Date": [ | ||||
|                     datetime(2013, 10, 1, 13, 0), | ||||
|                     datetime(2013, 10, 1, 13, 5), | ||||
|                     datetime(2013, 10, 1, 20, 0), | ||||
|                     datetime(2013, 10, 2, 10, 0), | ||||
|                     datetime(2013, 10, 1, 20, 0), | ||||
|                     datetime(2013, 10, 2, 10, 0), | ||||
|                     datetime(2013, 10, 2, 12, 0), | ||||
|                     datetime(2013, 10, 2, 14, 0), | ||||
|                 ], | ||||
|             } | ||||
|         ).set_index("Date") | ||||
|  | ||||
|         df_sorted = df_original.sort_values(by="Quantity", ascending=False) | ||||
|         for df in [df_original, df_sorted]: | ||||
|             expected = DataFrame( | ||||
|                 { | ||||
|                     "Buyer": "Carl Joe Mark Carl Joe".split(), | ||||
|                     "Quantity": [6, 8, 3, 4, 10], | ||||
|                     "Date": [ | ||||
|                         datetime(2013, 10, 1, 0, 0), | ||||
|                         datetime(2013, 10, 1, 0, 0), | ||||
|                         datetime(2013, 10, 1, 0, 0), | ||||
|                         datetime(2013, 10, 2, 0, 0), | ||||
|                         datetime(2013, 10, 2, 0, 0), | ||||
|                     ], | ||||
|                 } | ||||
|             ).set_index(["Date", "Buyer"]) | ||||
|  | ||||
|             result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|             result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True) | ||||
|             expected = DataFrame( | ||||
|                 { | ||||
|                     "Buyer": "Carl Joe Mark".split(), | ||||
|                     "Quantity": [10, 18, 3], | ||||
|                     "Date": [ | ||||
|                         datetime(2013, 10, 31, 0, 0), | ||||
|                         datetime(2013, 10, 31, 0, 0), | ||||
|                         datetime(2013, 10, 31, 0, 0), | ||||
|                     ], | ||||
|                 } | ||||
|             ).set_index(["Date", "Buyer"]) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|             # passing the name | ||||
|             df = df.reset_index() | ||||
|             result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum( | ||||
|                 numeric_only=True | ||||
|             ) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|             with pytest.raises(KeyError, match="'The grouper name foo is not found'"): | ||||
|                 df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum() | ||||
|  | ||||
|             # passing the level | ||||
|             df = df.set_index("Date") | ||||
|             result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum( | ||||
|                 numeric_only=True | ||||
|             ) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|             result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum( | ||||
|                 numeric_only=True | ||||
|             ) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|             with pytest.raises(ValueError, match="The level foo is not valid"): | ||||
|                 df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum() | ||||
|  | ||||
|             # multi names | ||||
|             df = df.copy() | ||||
|             df["Date"] = df.index + offsets.MonthEnd(2) | ||||
|             result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum( | ||||
|                 numeric_only=True | ||||
|             ) | ||||
|             expected = DataFrame( | ||||
|                 { | ||||
|                     "Buyer": "Carl Joe Mark".split(), | ||||
|                     "Quantity": [10, 18, 3], | ||||
|                     "Date": [ | ||||
|                         datetime(2013, 11, 30, 0, 0), | ||||
|                         datetime(2013, 11, 30, 0, 0), | ||||
|                         datetime(2013, 11, 30, 0, 0), | ||||
|                     ], | ||||
|                 } | ||||
|             ).set_index(["Date", "Buyer"]) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|             # error as we have both a level and a name! | ||||
|             msg = "The Grouper cannot specify both a key and a level!" | ||||
|             with pytest.raises(ValueError, match=msg): | ||||
|                 df.groupby( | ||||
|                     [Grouper(freq="1ME", key="Date", level="Date"), "Buyer"] | ||||
|                 ).sum() | ||||
|  | ||||
|             # single groupers | ||||
|             expected = DataFrame( | ||||
|                 [[31]], | ||||
|                 columns=["Quantity"], | ||||
|                 index=DatetimeIndex( | ||||
|                     [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date" | ||||
|                 ), | ||||
|             ) | ||||
|             result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|             result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
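|             # grouping on the "Date" column (index + MonthEnd(2)) lands one | ||||
|             #  month-end later than grouping on the index | ||||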
|             expected.index = expected.index.shift(1) | ||||
|             assert expected.index.freq == offsets.MonthEnd() | ||||
|             result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|             result = df.groupby([Grouper(freq="1ME", key="Date")]).sum( | ||||
|                 numeric_only=True | ||||
|             ) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     @pytest.mark.parametrize("freq", ["D", "ME", "YE", "QE-APR"]) | ||||
|     def test_timegrouper_with_reg_groups_freq(self, freq): | ||||
|         # GH 6764 multiple grouping with/without sort | ||||
|         df = DataFrame( | ||||
|             { | ||||
|                 "date": pd.to_datetime( | ||||
|                     [ | ||||
|                         "20121002", | ||||
|                         "20121007", | ||||
|                         "20130130", | ||||
|                         "20130202", | ||||
|                         "20130305", | ||||
|                         "20121002", | ||||
|                         "20121207", | ||||
|                         "20130130", | ||||
|                         "20130202", | ||||
|                         "20130305", | ||||
|                         "20130202", | ||||
|                         "20130305", | ||||
|                     ] | ||||
|                 ), | ||||
|                 "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], | ||||
|                 "whole_cost": [ | ||||
|                     1790, | ||||
|                     364, | ||||
|                     280, | ||||
|                     259, | ||||
|                     201, | ||||
|                     623, | ||||
|                     90, | ||||
|                     312, | ||||
|                     359, | ||||
|                     301, | ||||
|                     359, | ||||
|                     801, | ||||
|                 ], | ||||
|                 "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12], | ||||
|             } | ||||
|         ).set_index("date") | ||||
|  | ||||
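|         # Build the expected result by resampling per user, then compare against | ||||
|         #  grouping by [Grouper(freq), "user_id"] directly. | ||||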
|         expected = ( | ||||
|             df.groupby("user_id")["whole_cost"] | ||||
|             .resample(freq) | ||||
|             .sum(min_count=1)  # min_count=1 keeps all-NaN bins as NaN for dropna() | ||||
|             .dropna() | ||||
|             .reorder_levels(["date", "user_id"]) | ||||
|             .sort_index() | ||||
|             .astype("int64") | ||||
|         ) | ||||
|         expected.name = "whole_cost" | ||||
|  | ||||
|         result1 = ( | ||||
|             df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum() | ||||
|         ) | ||||
|         tm.assert_series_equal(result1, expected) | ||||
|  | ||||
|         result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum() | ||||
|         tm.assert_series_equal(result2, expected) | ||||
|  | ||||
|     def test_timegrouper_get_group(self): | ||||
|         # GH 6914 | ||||
|  | ||||
|         df_original = DataFrame( | ||||
|             { | ||||
|                 "Buyer": "Carl Joe Joe Carl Joe Carl".split(), | ||||
|                 "Quantity": [18, 3, 5, 1, 9, 3], | ||||
|                 "Date": [ | ||||
|                     datetime(2013, 9, 1, 13, 0), | ||||
|                     datetime(2013, 9, 1, 13, 5), | ||||
|                     datetime(2013, 10, 1, 20, 0), | ||||
|                     datetime(2013, 10, 3, 10, 0), | ||||
|                     datetime(2013, 12, 2, 12, 0), | ||||
|                     datetime(2013, 9, 2, 14, 0), | ||||
|                 ], | ||||
|             } | ||||
|         ) | ||||
|         df_reordered = df_original.sort_values(by="Quantity") | ||||
|  | ||||
|         # single grouping | ||||
|         expected_list = [ | ||||
|             df_original.iloc[[0, 1, 5]], | ||||
|             df_original.iloc[[2, 3]], | ||||
|             df_original.iloc[[4]], | ||||
|         ] | ||||
|         dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"] | ||||
|  | ||||
|         for df in [df_original, df_reordered]: | ||||
|             grouped = df.groupby(Grouper(freq="ME", key="Date")) | ||||
|             for t, expected in zip(dt_list, expected_list): | ||||
|                 dt = Timestamp(t) | ||||
|                 result = grouped.get_group(dt) | ||||
|                 tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         # multiple grouping | ||||
|         expected_list = [ | ||||
|             df_original.iloc[[1]], | ||||
|             df_original.iloc[[3]], | ||||
|             df_original.iloc[[4]], | ||||
|         ] | ||||
|         g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")] | ||||
|  | ||||
|         for df in [df_original, df_reordered]: | ||||
|             grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")]) | ||||
|             for (b, t), expected in zip(g_list, expected_list): | ||||
|                 dt = Timestamp(t) | ||||
|                 result = grouped.get_group((b, dt)) | ||||
|                 tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         # with index | ||||
|         df_original = df_original.set_index("Date") | ||||
|         df_reordered = df_original.sort_values(by="Quantity") | ||||
|  | ||||
|         expected_list = [ | ||||
|             df_original.iloc[[0, 1, 5]], | ||||
|             df_original.iloc[[2, 3]], | ||||
|             df_original.iloc[[4]], | ||||
|         ] | ||||
|  | ||||
|         for df in [df_original, df_reordered]: | ||||
|             grouped = df.groupby(Grouper(freq="ME")) | ||||
|             for t, expected in zip(dt_list, expected_list): | ||||
|                 dt = Timestamp(t) | ||||
|                 result = grouped.get_group(dt) | ||||
|                 tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_timegrouper_apply_return_type_series(self): | ||||
|         # Using `apply` with the `TimeGrouper` should give the | ||||
|         # same return type as an `apply` with a `Grouper`. | ||||
|         # Issue #11742 | ||||
|         df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) | ||||
|         df_dt = df.copy() | ||||
|         df_dt["date"] = pd.to_datetime(df_dt["date"]) | ||||
|  | ||||
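|         # Returning a Series from the UDF should produce a DataFrame for both | ||||
|         #  the plain Grouper and the TimeGrouper. | ||||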
|         def sumfunc_series(x): | ||||
|             return Series([x["value"].sum()], ("sum",)) | ||||
|  | ||||
|         msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|         with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|             expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) | ||||
|         msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|         with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|             result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) | ||||
|         tm.assert_frame_equal( | ||||
|             result.reset_index(drop=True), expected.reset_index(drop=True) | ||||
|         ) | ||||
|  | ||||
|     def test_timegrouper_apply_return_type_value(self): | ||||
|         # Using `apply` with the `TimeGrouper` should give the | ||||
|         # same return type as an `apply` with a `Grouper`. | ||||
|         # Issue #11742 | ||||
|         df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) | ||||
|         df_dt = df.copy() | ||||
|         df_dt["date"] = pd.to_datetime(df_dt["date"]) | ||||
|  | ||||
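|         # Returning a scalar from the UDF should produce a Series for both groupers. | ||||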
|         def sumfunc_value(x): | ||||
|             return x.value.sum() | ||||
|  | ||||
|         msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|         with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|             expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) | ||||
|         with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|             result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) | ||||
|         tm.assert_series_equal( | ||||
|             result.reset_index(drop=True), expected.reset_index(drop=True) | ||||
|         ) | ||||
|  | ||||
|     def test_groupby_groups_datetimeindex(self): | ||||
|         # GH#1430 | ||||
|         periods = 1000 | ||||
|         ind = date_range(start="2012/1/1", freq="5min", periods=periods) | ||||
|         df = DataFrame( | ||||
|             {"high": np.arange(periods), "low": np.arange(periods)}, index=ind | ||||
|         ) | ||||
|         grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) | ||||
|  | ||||
|         # it works! | ||||
|         groups = grouped.groups | ||||
|         assert isinstance(next(iter(groups.keys())), datetime) | ||||
|  | ||||
|     def test_groupby_groups_datetimeindex2(self): | ||||
|         # GH#11442 | ||||
|         index = date_range("2015/01/01", periods=5, name="date") | ||||
|         df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index) | ||||
|         result = df.groupby(level="date").groups | ||||
|         dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"] | ||||
|         expected = { | ||||
|             Timestamp(date): DatetimeIndex([date], name="date") for date in dates | ||||
|         } | ||||
|         tm.assert_dict_equal(result, expected) | ||||
|  | ||||
|         grouped = df.groupby(level="date") | ||||
|         for date in dates: | ||||
|             result = grouped.get_group(date) | ||||
|             data = [[df.loc[date, "A"], df.loc[date, "B"]]] | ||||
|             expected_index = DatetimeIndex( | ||||
|                 [date], name="date", freq="D", dtype=index.dtype | ||||
|             ) | ||||
|             expected = DataFrame(data, columns=list("AB"), index=expected_index) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_groupby_groups_datetimeindex_tz(self): | ||||
|         # GH 3950 | ||||
|         dates = [ | ||||
|             "2011-07-19 07:00:00", | ||||
|             "2011-07-19 08:00:00", | ||||
|             "2011-07-19 09:00:00", | ||||
|             "2011-07-19 07:00:00", | ||||
|             "2011-07-19 08:00:00", | ||||
|             "2011-07-19 09:00:00", | ||||
|         ] | ||||
|         df = DataFrame( | ||||
|             { | ||||
|                 "label": ["a", "a", "a", "b", "b", "b"], | ||||
|                 "datetime": dates, | ||||
|                 "value1": np.arange(6, dtype="int64"), | ||||
|                 "value2": [1, 2] * 3, | ||||
|             } | ||||
|         ) | ||||
|         df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific")) | ||||
|  | ||||
|         exp_idx1 = DatetimeIndex( | ||||
|             [ | ||||
|                 "2011-07-19 07:00:00", | ||||
|                 "2011-07-19 07:00:00", | ||||
|                 "2011-07-19 08:00:00", | ||||
|                 "2011-07-19 08:00:00", | ||||
|                 "2011-07-19 09:00:00", | ||||
|                 "2011-07-19 09:00:00", | ||||
|             ], | ||||
|             tz="US/Pacific", | ||||
|             name="datetime", | ||||
|         ) | ||||
|         exp_idx2 = Index(["a", "b"] * 3, name="label") | ||||
|         exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) | ||||
|         expected = DataFrame( | ||||
|             {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]}, | ||||
|             index=exp_idx, | ||||
|             columns=["value1", "value2"], | ||||
|         ) | ||||
|  | ||||
|         result = df.groupby(["datetime", "label"]).sum() | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         # by level | ||||
|         didx = DatetimeIndex(dates, tz="Asia/Tokyo") | ||||
|         df = DataFrame( | ||||
|             {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, | ||||
|             index=didx, | ||||
|         ) | ||||
|  | ||||
|         exp_idx = DatetimeIndex( | ||||
|             ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], | ||||
|             tz="Asia/Tokyo", | ||||
|         ) | ||||
|         expected = DataFrame( | ||||
|             {"value1": [3, 5, 7], "value2": [2, 4, 6]}, | ||||
|             index=exp_idx, | ||||
|             columns=["value1", "value2"], | ||||
|         ) | ||||
|  | ||||
|         result = df.groupby(level=0).sum() | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_frame_datetime64_handling_groupby(self): | ||||
|         # it works! | ||||
|         df = DataFrame( | ||||
|             [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))], | ||||
|             columns=["a", "date"], | ||||
|         ) | ||||
|         result = df.groupby("a").first() | ||||
|         assert result["date"][3] == Timestamp("2012-07-03") | ||||
|  | ||||
|     def test_groupby_multi_timezone(self): | ||||
|         # combining multiple / different timezones yields UTC | ||||
|         df = DataFrame( | ||||
|             { | ||||
|                 "value": range(5), | ||||
|                 "date": [ | ||||
|                     "2000-01-28 16:47:00", | ||||
|                     "2000-01-29 16:48:00", | ||||
|                     "2000-01-30 16:49:00", | ||||
|                     "2000-01-31 16:50:00", | ||||
|                     "2000-01-01 16:50:00", | ||||
|                 ], | ||||
|                 "tz": [ | ||||
|                     "America/Chicago", | ||||
|                     "America/Chicago", | ||||
|                     "America/Los_Angeles", | ||||
|                     "America/Chicago", | ||||
|                     "America/New_York", | ||||
|                 ], | ||||
|             } | ||||
|         ) | ||||
|  | ||||
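|         # localize each group's dates using the group key (the timezone string) | ||||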
|         result = df.groupby("tz", group_keys=False).date.apply( | ||||
|             lambda x: pd.to_datetime(x).dt.tz_localize(x.name) | ||||
|         ) | ||||
|  | ||||
|         expected = Series( | ||||
|             [ | ||||
|                 Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"), | ||||
|                 Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"), | ||||
|                 Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"), | ||||
|                 Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"), | ||||
|                 Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"), | ||||
|             ], | ||||
|             name="date", | ||||
|             dtype=object, | ||||
|         ) | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|         tz = "America/Chicago" | ||||
|         res_values = df.groupby("tz").date.get_group(tz) | ||||
|         result = pd.to_datetime(res_values).dt.tz_localize(tz) | ||||
|         exp_values = Series( | ||||
|             ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"], | ||||
|             index=[0, 1, 3], | ||||
|             name="date", | ||||
|         ) | ||||
|         expected = pd.to_datetime(exp_values).dt.tz_localize(tz) | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     def test_groupby_groups_periods(self): | ||||
|         dates = [ | ||||
|             "2011-07-19 07:00:00", | ||||
|             "2011-07-19 08:00:00", | ||||
|             "2011-07-19 09:00:00", | ||||
|             "2011-07-19 07:00:00", | ||||
|             "2011-07-19 08:00:00", | ||||
|             "2011-07-19 09:00:00", | ||||
|         ] | ||||
|         df = DataFrame( | ||||
|             { | ||||
|                 "label": ["a", "a", "a", "b", "b", "b"], | ||||
|                 "period": [pd.Period(d, freq="h") for d in dates], | ||||
|                 "value1": np.arange(6, dtype="int64"), | ||||
|                 "value2": [1, 2] * 3, | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|         exp_idx1 = pd.PeriodIndex( | ||||
|             [ | ||||
|                 "2011-07-19 07:00:00", | ||||
|                 "2011-07-19 07:00:00", | ||||
|                 "2011-07-19 08:00:00", | ||||
|                 "2011-07-19 08:00:00", | ||||
|                 "2011-07-19 09:00:00", | ||||
|                 "2011-07-19 09:00:00", | ||||
|             ], | ||||
|             freq="h", | ||||
|             name="period", | ||||
|         ) | ||||
|         exp_idx2 = Index(["a", "b"] * 3, name="label") | ||||
|         exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) | ||||
|         expected = DataFrame( | ||||
|             {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]}, | ||||
|             index=exp_idx, | ||||
|             columns=["value1", "value2"], | ||||
|         ) | ||||
|  | ||||
|         result = df.groupby(["period", "label"]).sum() | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         # by level | ||||
|         didx = pd.PeriodIndex(dates, freq="h") | ||||
|         df = DataFrame( | ||||
|             {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, | ||||
|             index=didx, | ||||
|         ) | ||||
|  | ||||
|         exp_idx = pd.PeriodIndex( | ||||
|             ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], | ||||
|             freq="h", | ||||
|         ) | ||||
|         expected = DataFrame( | ||||
|             {"value1": [3, 5, 7], "value2": [2, 4, 6]}, | ||||
|             index=exp_idx, | ||||
|             columns=["value1", "value2"], | ||||
|         ) | ||||
|  | ||||
|         result = df.groupby(level=0).sum() | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_groupby_first_datetime64(self): | ||||
|         df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) | ||||
|         df[1] = df[1].astype("M8[ns]") | ||||
|  | ||||
|         assert issubclass(df[1].dtype.type, np.datetime64) | ||||
|  | ||||
|         result = df.groupby(level=0).first() | ||||
|         got_dt = result[1].dtype | ||||
|         assert issubclass(got_dt.type, np.datetime64) | ||||
|  | ||||
|         result = df[1].groupby(level=0).first() | ||||
|         got_dt = result.dtype | ||||
|         assert issubclass(got_dt.type, np.datetime64) | ||||
|  | ||||
|     def test_groupby_max_datetime64(self): | ||||
|         # GH 5869 | ||||
|         # datetimelike dtype conversion from int | ||||
|         df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)}) | ||||
|         # TODO: can we retain second resolution in .apply here? | ||||
|         expected = df.groupby("A")["A"].apply(lambda x: x.max()).astype("M8[s]") | ||||
|         result = df.groupby("A")["A"].max() | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     def test_groupby_datetime64_32_bit(self): | ||||
|         # GH 6410 / numpy 4328 | ||||
|         # 32-bit under 1.9-dev indexing issue | ||||
|  | ||||
|         df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2}) | ||||
|         result = df.groupby("A")["B"].transform("min") | ||||
|         expected = Series([Timestamp("2000-01-1")] * 2, name="B") | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     def test_groupby_with_timezone_selection(self): | ||||
|         # GH 11616 | ||||
|         # Test that column selection returns output in correct timezone. | ||||
|  | ||||
|         df = DataFrame( | ||||
|             { | ||||
|                 "factor": np.random.default_rng(2).integers(0, 3, size=60), | ||||
|                 "time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"), | ||||
|             } | ||||
|         ) | ||||
|         df1 = df.groupby("factor").max()["time"] | ||||
|         df2 = df.groupby("factor")["time"].max() | ||||
|         tm.assert_series_equal(df1, df2) | ||||
|  | ||||
|     def test_timezone_info(self): | ||||
|         # see gh-11682: Timezone info lost when broadcasting | ||||
|         # scalar datetime to DataFrame | ||||
|  | ||||
|         df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) | ||||
|         assert df["b"][0].tzinfo == pytz.utc | ||||
|         df = DataFrame({"a": [1, 2, 3]}) | ||||
|         df["b"] = datetime.now(pytz.utc) | ||||
|         assert df["b"][0].tzinfo == pytz.utc | ||||
|  | ||||
|     def test_datetime_count(self): | ||||
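|         # .count() on a datetime column counts non-null values per group | ||||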
|         df = DataFrame( | ||||
|             {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")} | ||||
|         ) | ||||
|         result = df.groupby("a").dates.count() | ||||
|         expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates") | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     def test_first_last_max_min_on_time_data(self): | ||||
|         # GH 10295 | ||||
|         # Verify that NaT is not in the result of max, min, first, and last | ||||
|         #  on a DataFrame with datetime or timedelta values. | ||||
|         df_test = DataFrame( | ||||
|             { | ||||
|                 "dt": [ | ||||
|                     np.nan, | ||||
|                     "2015-07-24 10:10", | ||||
|                     "2015-07-25 11:11", | ||||
|                     "2015-07-23 12:12", | ||||
|                     np.nan, | ||||
|                 ], | ||||
|                 "td": [ | ||||
|                     np.nan, | ||||
|                     timedelta(days=1), | ||||
|                     timedelta(days=2), | ||||
|                     timedelta(days=3), | ||||
|                     np.nan, | ||||
|                 ], | ||||
|             } | ||||
|         ) | ||||
|         df_test.dt = pd.to_datetime(df_test.dt) | ||||
|         df_test["group"] = "A" | ||||
|         df_ref = df_test[df_test.dt.notna()] | ||||
|  | ||||
|         grouped_test = df_test.groupby("group") | ||||
|         grouped_ref = df_ref.groupby("group") | ||||
|  | ||||
|         tm.assert_frame_equal(grouped_ref.max(), grouped_test.max()) | ||||
|         tm.assert_frame_equal(grouped_ref.min(), grouped_test.min()) | ||||
|         tm.assert_frame_equal(grouped_ref.first(), grouped_test.first()) | ||||
|         tm.assert_frame_equal(grouped_ref.last(), grouped_test.last()) | ||||
|  | ||||
|     def test_nunique_with_timegrouper_and_nat(self): | ||||
|         # GH 17575 | ||||
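|         # rows where the Grouper key is NaT are dropped, so the result | ||||
|         #  must match nunique computed on the NaT-filtered frame | ||||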
|         test = DataFrame( | ||||
|             { | ||||
|                 "time": [ | ||||
|                     Timestamp("2016-06-28 09:35:35"), | ||||
|                     pd.NaT, | ||||
|                     Timestamp("2016-06-28 16:46:28"), | ||||
|                 ], | ||||
|                 "data": ["1", "2", "3"], | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|         grouper = Grouper(key="time", freq="h") | ||||
|         result = test.groupby(grouper)["data"].nunique() | ||||
|         expected = test[test.time.notnull()].groupby(grouper)["data"].nunique() | ||||
|         expected.index = expected.index._with_freq(None) | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     def test_scalar_call_versus_list_call(self): | ||||
|         # GH 17530 | ||||
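|         # a scalar Grouper and a one-element list of Groupers should | ||||
|         #  give identical results | ||||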
|         data_frame = { | ||||
|             "location": ["shanghai", "beijing", "shanghai"], | ||||
|             "time": Series( | ||||
|                 ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"], | ||||
|                 dtype="datetime64[ns]", | ||||
|             ), | ||||
|             "value": [1, 2, 3], | ||||
|         } | ||||
|         data_frame = DataFrame(data_frame).set_index("time") | ||||
|         grouper = Grouper(freq="D") | ||||
|  | ||||
|         grouped = data_frame.groupby(grouper) | ||||
|         result = grouped.count() | ||||
|         grouped = data_frame.groupby([grouper]) | ||||
|         expected = grouped.count() | ||||
|  | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_grouper_period_index(self): | ||||
|         # GH 32108 | ||||
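|         # grouping by index.month should yield an integer month-number | ||||
|         #  index, not a PeriodIndex | ||||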
|         periods = 2 | ||||
|         index = pd.period_range( | ||||
|             start="2018-01", periods=periods, freq="M", name="Month" | ||||
|         ) | ||||
|         period_series = Series(range(periods), index=index) | ||||
|         result = period_series.groupby(period_series.index.month).sum() | ||||
|  | ||||
|         expected = Series( | ||||
|             range(periods), index=Index(range(1, periods + 1), name=index.name) | ||||
|         ) | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     def test_groupby_apply_timegrouper_with_nat_dict_returns( | ||||
|         self, groupby_with_truncated_bingrouper | ||||
|     ): | ||||
|         # GH#43500 case where gb._grouper.result_index and gb._grouper.group_keys_seq | ||||
|         #  have different lengths, which goes through the | ||||
|         #  `isinstance(values[0], dict)` path | ||||
|         gb = groupby_with_truncated_bingrouper | ||||
|  | ||||
|         res = gb["Quantity"].apply(lambda x: {"foo": len(x)}) | ||||
|  | ||||
|         df = gb.obj | ||||
|         unit = df["Date"]._values.unit | ||||
|         dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit) | ||||
|         mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)]) | ||||
|         expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity") | ||||
|         tm.assert_series_equal(res, expected) | ||||
|  | ||||
|     def test_groupby_apply_timegrouper_with_nat_scalar_returns( | ||||
|         self, groupby_with_truncated_bingrouper | ||||
|     ): | ||||
|         # GH#43500 Previously raised ValueError because it used an index with | ||||
|         #  incorrect length in wrap_applied_result | ||||
|         gb = groupby_with_truncated_bingrouper | ||||
|  | ||||
|         res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan) | ||||
|  | ||||
|         df = gb.obj | ||||
|         unit = df["Date"]._values.unit | ||||
|         dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit) | ||||
|         expected = Series( | ||||
|             [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5], | ||||
|             index=dti._with_freq(None), | ||||
|             name="Quantity", | ||||
|         ) | ||||
|  | ||||
|         tm.assert_series_equal(res, expected) | ||||
|  | ||||
|     def test_groupby_apply_timegrouper_with_nat_apply_squeeze( | ||||
|         self, frame_for_truncated_bingrouper | ||||
|     ): | ||||
|         df = frame_for_truncated_bingrouper | ||||
|  | ||||
|         # We need to create a GroupBy object with only one non-NaT group, | ||||
|         #  so use a huge freq to group all non-NaT dates together | ||||
|         tdg = Grouper(key="Date", freq="100YE") | ||||
|         gb = df.groupby(tdg) | ||||
|  | ||||
|         # check that we will go through the singular_series path | ||||
|         #  in _wrap_applied_output_series | ||||
|         assert gb.ngroups == 1 | ||||
|         assert gb._selected_obj._get_axis(gb.axis).nlevels == 1 | ||||
|  | ||||
|         # function that returns a Series | ||||
|         msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||||
|         with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|             res = gb.apply(lambda x: x["Quantity"] * 2) | ||||
|  | ||||
|         dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") | ||||
|         expected = DataFrame( | ||||
|             [[36, 6, 6, 10, 2]], | ||||
|             index=dti, | ||||
|             columns=Index([0, 1, 5, 2, 3], name="Quantity"), | ||||
|         ) | ||||
|         tm.assert_frame_equal(res, expected) | ||||
|  | ||||
|     @pytest.mark.single_cpu | ||||
|     def test_groupby_agg_numba_timegrouper_with_nat( | ||||
|         self, groupby_with_truncated_bingrouper | ||||
|     ): | ||||
|         pytest.importorskip("numba") | ||||
|  | ||||
|         # See discussion in GH#43487 | ||||
|         gb = groupby_with_truncated_bingrouper | ||||
|  | ||||
|         result = gb["Quantity"].aggregate( | ||||
|             lambda values, index: np.nanmean(values), engine="numba" | ||||
|         ) | ||||
|  | ||||
|         expected = gb["Quantity"].aggregate("mean") | ||||
|         tm.assert_series_equal(result, expected) | ||||
|  | ||||
|         result_df = gb[["Quantity"]].aggregate( | ||||
|             lambda values, index: np.nanmean(values), engine="numba" | ||||
|         ) | ||||
|         expected_df = gb[["Quantity"]].aggregate("mean") | ||||
|         tm.assert_frame_equal(result_df, expected_df) | ||||
| @ -0,0 +1,294 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.compat import is_platform_arm | ||||
| from pandas.errors import NumbaUtilError | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Series, | ||||
|     option_context, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.util.version import Version | ||||
|  | ||||
| pytestmark = [pytest.mark.single_cpu] | ||||
|  | ||||
| numba = pytest.importorskip("numba") | ||||
| pytestmark.append( | ||||
|     pytest.mark.skipif( | ||||
|         Version(numba.__version__) == Version("0.61") and is_platform_arm(), | ||||
|         reason=f"Segfaults on ARM platforms with numba {numba.__version__}", | ||||
|     ) | ||||
| ) | ||||
|  | ||||
|  | ||||
| def test_correct_function_signature(): | ||||
|     pytest.importorskip("numba") | ||||
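|     # numba-engine UDFs must take (values, index) as their first two | ||||
|     #  arguments; a one-argument function should raise NumbaUtilError | ||||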
|  | ||||
|     def incorrect_function(x): | ||||
|         return x + 1 | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, | ||||
|         columns=["key", "data"], | ||||
|     ) | ||||
|     with pytest.raises(NumbaUtilError, match="The first 2"): | ||||
|         data.groupby("key").transform(incorrect_function, engine="numba") | ||||
|  | ||||
|     with pytest.raises(NumbaUtilError, match="The first 2"): | ||||
|         data.groupby("key")["data"].transform(incorrect_function, engine="numba") | ||||
|  | ||||
|  | ||||
| def test_check_nopython_kwargs(): | ||||
|     pytest.importorskip("numba") | ||||
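|     # keyword arguments for the UDF are not supported by the numba engine | ||||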
|  | ||||
|     def incorrect_function(values, index): | ||||
|         return values + 1 | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, | ||||
|         columns=["key", "data"], | ||||
|     ) | ||||
|     with pytest.raises(NumbaUtilError, match="numba does not support"): | ||||
|         data.groupby("key").transform(incorrect_function, engine="numba", a=1) | ||||
|  | ||||
|     with pytest.raises(NumbaUtilError, match="numba does not support"): | ||||
|         data.groupby("key")["data"].transform(incorrect_function, engine="numba", a=1) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings("ignore") | ||||
| # Filter warnings when parallel=True and the function can't be parallelized by Numba | ||||
| @pytest.mark.parametrize("jit", [True, False]) | ||||
| @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) | ||||
| @pytest.mark.parametrize("as_index", [True, False]) | ||||
| def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): | ||||
|     pytest.importorskip("numba") | ||||
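|     # a simple +1 UDF should match the cython engine exactly, whether or | ||||
|     #  not the function is pre-jitted | ||||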
|  | ||||
|     def func(values, index): | ||||
|         return values + 1 | ||||
|  | ||||
|     if jit: | ||||
|         # Test accepted jitted functions | ||||
|         import numba | ||||
|  | ||||
|         func = numba.jit(func) | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] | ||||
|     ) | ||||
|     engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} | ||||
|     grouped = data.groupby(0, as_index=as_index) | ||||
|     if pandas_obj == "Series": | ||||
|         grouped = grouped[1] | ||||
|  | ||||
|     result = grouped.transform(func, engine="numba", engine_kwargs=engine_kwargs) | ||||
|     expected = grouped.transform(lambda x: x + 1, engine="cython") | ||||
|  | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings("ignore") | ||||
| # Filter warnings when parallel=True and the function can't be parallelized by Numba | ||||
| @pytest.mark.parametrize("jit", [True, False]) | ||||
| @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) | ||||
| def test_cache(jit, pandas_obj, nogil, parallel, nopython): | ||||
|     # Test that the functions are cached correctly if we switch functions | ||||
|     pytest.importorskip("numba") | ||||
|  | ||||
|     def func_1(values, index): | ||||
|         return values + 1 | ||||
|  | ||||
|     def func_2(values, index): | ||||
|         return values * 5 | ||||
|  | ||||
|     if jit: | ||||
|         import numba | ||||
|  | ||||
|         func_1 = numba.jit(func_1) | ||||
|         func_2 = numba.jit(func_2) | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] | ||||
|     ) | ||||
|     engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} | ||||
|     grouped = data.groupby(0) | ||||
|     if pandas_obj == "Series": | ||||
|         grouped = grouped[1] | ||||
|  | ||||
|     result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs) | ||||
|     expected = grouped.transform(lambda x: x + 1, engine="cython") | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|     result = grouped.transform(func_2, engine="numba", engine_kwargs=engine_kwargs) | ||||
|     expected = grouped.transform(lambda x: x * 5, engine="cython") | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|     # Retest func_1 which should use the cache | ||||
|     result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs) | ||||
|     expected = grouped.transform(lambda x: x + 1, engine="cython") | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_use_global_config(): | ||||
|     pytest.importorskip("numba") | ||||
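|     # with compute.use_numba enabled, engine=None should dispatch to numba | ||||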
|  | ||||
|     def func_1(values, index): | ||||
|         return values + 1 | ||||
|  | ||||
|     data = DataFrame( | ||||
|         {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] | ||||
|     ) | ||||
|     grouped = data.groupby(0) | ||||
|     expected = grouped.transform(func_1, engine="numba") | ||||
|     with option_context("compute.use_numba", True): | ||||
|         result = grouped.transform(func_1, engine=None) | ||||
|     tm.assert_frame_equal(expected, result) | ||||
|  | ||||
|  | ||||
| # TODO: Test more than just reductions (e.g. actually test transformations once we | ||||
| #  have a numba-engine implementation for them) | ||||
| def test_string_cython_vs_numba(numba_supported_reductions): | ||||
|     pytest.importorskip("numba") | ||||
|     agg_func, kwargs = numba_supported_reductions | ||||
|     data = DataFrame( | ||||
|         {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] | ||||
|     ) | ||||
|     grouped = data.groupby(0) | ||||
|  | ||||
|     result = grouped.transform(agg_func, engine="numba", **kwargs) | ||||
|     expected = grouped.transform(agg_func, engine="cython", **kwargs) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = grouped[1].transform(agg_func, engine="numba", **kwargs) | ||||
|     expected = grouped[1].transform(agg_func, engine="cython", **kwargs) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_args_not_cached(): | ||||
|     # GH 41647 | ||||
|     pytest.importorskip("numba") | ||||
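|     # extra positional args must be part of the cache key: changing n | ||||
|     #  from 1 to 2 has to recompute rather than reuse the cached result | ||||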
|  | ||||
|     def sum_last(values, index, n): | ||||
|         return values[-n:].sum() | ||||
|  | ||||
|     df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]}) | ||||
|     grouped_x = df.groupby("id")["x"] | ||||
|     result = grouped_x.transform(sum_last, 1, engine="numba") | ||||
|     expected = Series([1.0] * 4, name="x") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = grouped_x.transform(sum_last, 2, engine="numba") | ||||
|     expected = Series([2.0] * 4, name="x") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_index_data_correctly_passed(): | ||||
|     # GH 43133 | ||||
|     pytest.importorskip("numba") | ||||
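|     # the UDF should receive each group's actual index values (negative | ||||
|     #  here), not default positional integers | ||||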
|  | ||||
|     def f(values, index): | ||||
|         return index - 1 | ||||
|  | ||||
|     df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3]) | ||||
|     result = df.groupby("group").transform(f, engine="numba") | ||||
|     expected = DataFrame([-2.0, -3.0, -4.0], columns=["v"], index=[-1, -2, -3]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_engine_kwargs_not_cached(): | ||||
|     # If the user passes a different set of engine_kwargs, don't return the | ||||
|     #  same jitted function | ||||
|     pytest.importorskip("numba") | ||||
|     nogil = True | ||||
|     parallel = False | ||||
|     nopython = True | ||||
|  | ||||
|     def func_kwargs(values, index): | ||||
|         return nogil + parallel + nopython | ||||
|  | ||||
|     engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} | ||||
|     df = DataFrame({"value": [0, 0, 0]}) | ||||
|     result = df.groupby(level=0).transform( | ||||
|         func_kwargs, engine="numba", engine_kwargs=engine_kwargs | ||||
|     ) | ||||
|     expected = DataFrame({"value": [2.0, 2.0, 2.0]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     nogil = False | ||||
|     engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} | ||||
|     result = df.groupby(level=0).transform( | ||||
|         func_kwargs, engine="numba", engine_kwargs=engine_kwargs | ||||
|     ) | ||||
|     expected = DataFrame({"value": [1.0, 1.0, 1.0]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings("ignore") | ||||
| def test_multiindex_one_key(nogil, parallel, nopython): | ||||
|     pytest.importorskip("numba") | ||||
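|     # grouping by a single level of a MultiIndex works with the numba engine | ||||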
|  | ||||
|     def numba_func(values, index): | ||||
|         return 1 | ||||
|  | ||||
|     df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) | ||||
|     engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} | ||||
|     result = df.groupby("A").transform( | ||||
|         numba_func, engine="numba", engine_kwargs=engine_kwargs | ||||
|     ) | ||||
|     expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): | ||||
|     pytest.importorskip("numba") | ||||
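|     # transform with the numba engine supports only one grouping label | ||||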
|  | ||||
|     def numba_func(values, index): | ||||
|         return 1 | ||||
|  | ||||
|     df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) | ||||
|     engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} | ||||
|     with pytest.raises(NotImplementedError, match="more than 1 grouping labels"): | ||||
|         df.groupby(["A", "B"]).transform( | ||||
|             numba_func, engine="numba", engine_kwargs=engine_kwargs | ||||
|         ) | ||||
|  | ||||
|  | ||||
| def test_multilabel_numba_vs_cython(numba_supported_reductions): | ||||
|     pytest.importorskip("numba") | ||||
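|     # string-alias reductions over a two-label grouping should agree | ||||
|     #  across engines | ||||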
|     reduction, kwargs = numba_supported_reductions | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "three", "two", "two", "one", "three"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8), | ||||
|             "D": np.random.default_rng(2).standard_normal(8), | ||||
|         } | ||||
|     ) | ||||
|     gb = df.groupby(["A", "B"]) | ||||
|     res_agg = gb.transform(reduction, engine="numba", **kwargs) | ||||
|     expected_agg = gb.transform(reduction, engine="cython", **kwargs) | ||||
|     tm.assert_frame_equal(res_agg, expected_agg) | ||||
|  | ||||
|  | ||||
| def test_multilabel_udf_numba_vs_cython(): | ||||
|     pytest.importorskip("numba") | ||||
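|     # a min-max scaling UDF should produce identical results under both engines | ||||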
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "three", "two", "two", "one", "three"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8), | ||||
|             "D": np.random.default_rng(2).standard_normal(8), | ||||
|         } | ||||
|     ) | ||||
|     gb = df.groupby(["A", "B"]) | ||||
|     result = gb.transform( | ||||
|         lambda values, index: (values - values.min()) / (values.max() - values.min()), | ||||
|         engine="numba", | ||||
|     ) | ||||
|     expected = gb.transform( | ||||
|         lambda x: (x - x.min()) / (x.max() - x.min()), engine="cython" | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||