done
This commit is contained in:
		| @ -0,0 +1,24 @@ | ||||
| import numpy as np | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_corrwith_with_1_axis(): | ||||
|     # GH 47723 | ||||
|     df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) | ||||
|     gb = df.groupby("a") | ||||
|  | ||||
|     msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = gb.corrwith(df, axis=1) | ||||
|     index = Index( | ||||
|         data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], | ||||
|         name=("a", None), | ||||
|     ) | ||||
|     expected = Series([np.nan] * 6, index=index) | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,301 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     Timestamp, | ||||
|     date_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
def test_apply_describe_bug(multiindex_dataframe_random_data):
    # Smoke test: describing a groupby over the "first" index level only has
    # to complete without raising; the output itself is not inspected.
    by_first_level = multiindex_dataframe_random_data.groupby(level="first")
    by_first_level.describe()
|  | ||||
|  | ||||
def test_series_describe_multikey():
    # The "mean", "std" and "min" columns of a grouped describe() must agree
    # with the corresponding groupby reductions (names are not compared).
    values = np.arange(10, dtype=np.float64)
    ser = Series(values, index=date_range("2020-01-01", periods=10))
    key_funcs = [lambda stamp: stamp.year, lambda stamp: stamp.month]
    by_year_month = ser.groupby(key_funcs)

    described = by_year_month.describe()
    for stat in ("mean", "std", "min"):
        reduced = getattr(by_year_month, stat)()
        tm.assert_series_equal(described[stat], reduced, check_names=False)
|  | ||||
|  | ||||
| def test_series_describe_single(): | ||||
|     ts = Series( | ||||
|         np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) | ||||
|     ) | ||||
|     grouped = ts.groupby(lambda x: x.month) | ||||
|     result = grouped.apply(lambda x: x.describe()) | ||||
|     expected = grouped.describe().stack(future_stack=True) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
    # GH#49256
    frame = DataFrame(
        {
            "key1": ["one", "two", "two", "three", "two"],
            "key2": ["one", "two", "two", "three", "two"],
            "foo2": [1, 2, 4, 4, 6],
        }
    )
    result = frame.groupby(keys, as_index=as_index)["foo2"].describe()

    # Statistics of "foo2" for the groups "one", "three" and "two".
    stats = {
        "count": [1.0, 1.0, 3.0],
        "mean": [1.0, 4.0, 4.0],
        "std": [np.nan, np.nan, 2.0],
        "min": [1.0, 4.0, 2.0],
        "25%": [1.0, 4.0, 3.0],
        "50%": [1.0, 4.0, 4.0],
        "75%": [1.0, 4.0, 5.0],
        "max": [1.0, 4.0, 6.0],
    }
    expected = DataFrame({"key1": ["one", "three", "two"], **stats})

    # `keys` is either the string "key1" (length 4) or a two-element list, so
    # a length of 2 identifies the two-key case; "key2" then mirrors "key1".
    if len(keys) == 2:
        expected.insert(1, "key2", expected["key1"])
    if as_index:
        expected = expected.set_index(keys)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_frame_describe_multikey(tsframe, using_infer_string):
    # A frame-level grouped describe() must equal the per-column grouped
    # describe() results concatenated side by side.
    by_year_month = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = by_year_month.describe()

    def _describe_column(col):
        # Describe one column and nest its statistics under that column name.
        described = by_year_month[col].describe()
        n_stats = len(described.columns)
        # GH 17464 - Remove duplicate MultiIndex levels
        nested_columns = MultiIndex(
            levels=[Index([col], dtype=tsframe.columns.dtype), described.columns],
            codes=[[0] * n_stats, range(n_stats)],
        )
        return DataFrame(
            described.values, columns=nested_columns, index=described.index
        )

    expected = pd.concat([_describe_column(col) for col in tsframe], axis=1)
    tm.assert_frame_equal(result, expected)

    # remainder of the tests fails with string dtype but is testing deprecated behaviour
    if using_infer_string:
        return

    msg = "DataFrame.groupby with axis=1 is deprecated"
    column_groups = {"A": 0, "B": 0, "C": 1, "D": 1}
    with tm.assert_produces_warning(FutureWarning, match=msg):
        groupedT = tsframe.groupby(column_groups, axis=1)
    result = groupedT.describe()

    expected = tsframe.describe().T
    # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
    stat_index = expected.index
    expected.index = MultiIndex(
        levels=[[0, 1], stat_index],
        codes=[[0, 0, 1, 1], range(len(stat_index))],
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_frame_describe_tupleindex():
    # GH 14848 - regression from 0.19.0 to 0.19.1
    # Grouping on a column of tuples and calling describe() must raise,
    # whatever the name of the key column.
    tuple_keyed = DataFrame(
        {
            "x": [1, 2, 3, 4, 5] * 3,
            "y": [10, 20, 30, 40, 50] * 3,
            "z": [100, 200, 300, 400, 500] * 3,
        }
    )
    tuple_keyed["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    renamed = tuple_keyed.rename(columns={"k": "key"})

    msg = "Names should be list-like for a MultiIndex"
    for frame, key in ((tuple_keyed, "k"), (renamed, "key")):
        with pytest.raises(ValueError, match=msg):
            frame.groupby(key).describe()
|  | ||||
|  | ||||
def test_frame_describe_unstacked_format():
    # GH 4792
    # A grouped describe() on one column returns one row per group and one
    # column per statistic.
    stamps = [
        Timestamp("2011-01-06 10:59:05", tz=None),
        Timestamp("2011-01-06 12:43:33", tz=None),
        Timestamp("2011-01-06 12:54:09", tz=None),
    ]
    prices = dict(zip(stamps, [24990, 25499, 25499]))
    volumes = dict(zip(stamps, [1500000000, 5000000000, 100000000]))
    df = DataFrame({"PRICE": prices, "VOLUME": volumes})

    result = df.groupby("PRICE").VOLUME.describe()

    # Build the expectation by describing each price level separately.
    price_levels = [24990, 25499]
    rows = [
        df[df.PRICE == price].VOLUME.describe().values.tolist()
        for price in price_levels
    ]
    expected = DataFrame(
        rows,
        index=Index(price_levels, name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.filterwarnings(
    "ignore:"
    "indexing past lexsort depth may impact performance:"
    "pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
    # GH 35314
    # ``columns`` requests "b" twice and never selects "c"; the expected
    # output below therefore contains two identical "b" blocks.
    df = DataFrame(
        {
            "a1": [99, 99, 99, 88, 88, 88],
            "a2": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        columns=["a1", "a2", "b", "b"],
        copy=False,
    )
    if keys == ["a1"]:
        df = df.drop(columns="a2")

    result = df.groupby(keys, as_index=as_index).describe()

    # (column, statistic, value for group 88, value for group 99)
    per_column_stats = [
        ("b", "count", 3.0, 3.0),
        ("b", "mean", 5.0, 2.0),
        ("b", "std", 1.0, 1.0),
        ("b", "min", 4.0, 1.0),
        ("b", "25%", 4.5, 1.5),
        ("b", "50%", 5.0, 2.0),
        ("b", "75%", 5.5, 2.5),
        ("b", "max", 6.0, 3.0),
    ]
    # One block of statistics for each of the two "b" columns.
    expected = DataFrame.from_records(per_column_stats * 2).set_index([0, 1]).T
    expected.columns.names = [None, None]

    if len(keys) == 2:
        expected.index = MultiIndex(
            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
        )
    else:
        expected.index = Index([88, 99], name="a1")

    if not as_index:
        expected = expected.reset_index()

    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_describe_duplicate_columns():
    # GH#50806
    # Column label 0 appears twice; describe() must report a block of
    # statistics for every non-key column, duplicates included.
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]
    result = df.groupby(df[1]).describe(percentiles=[])

    stat_names = ["count", "mean", "std", "min", "50%", "max"]
    blocks = []
    for val in (0.0, 2.0, 3.0):
        row = [1.0, val, np.nan, val, val, val]
        blocks.append(DataFrame([row], index=[1], columns=stat_names))
    expected = pd.concat(blocks, axis=1)

    # Outer level codes follow the original column order: 0, 2, 0.
    outer_codes = [0] * 6 + [1] * 6 + [0] * 6
    inner_codes = list(range(6)) * 3
    expected.columns = MultiIndex(
        levels=[[0, 2], stat_names],
        codes=[outer_codes, inner_codes],
    )
    expected.index.names = [1]
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
class TestGroupByNonCythonPaths:
    # GH#5610 non-cython calls should not include the grouper
    # Tests for code not expected to go through cython paths.

    @pytest.fixture
    def df(self):
        # Three rows keyed by "A"; "B" contains NaNs and "C" contains strings.
        return DataFrame(
            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
            columns=["A", "B", "C"],
        )

    @pytest.fixture
    def gb(self, df):
        # Groupby on "A" with the default as_index setting.
        return df.groupby("A")

    @pytest.fixture
    def gni(self, df):
        # Groupby on "A" that keeps the key as a regular column.
        return df.groupby("A", as_index=False)

    def test_describe(self, df, gb, gni):
        # describe
        stat_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
        expected = DataFrame(
            [
                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            ],
            index=Index([1, 3], name="A"),
            columns=MultiIndex(
                levels=[["B"], stat_names],
                codes=[[0] * 8, list(range(8))],
            ),
        )
        tm.assert_frame_equal(gb.describe(), expected)

        # With as_index=False the group key comes back as a column instead.
        tm.assert_frame_equal(gni.describe(), expected.reset_index())
|  | ||||
|  | ||||
@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
    ],
)
def test_groupby_empty_dataset(dtype, kwargs):
    # GH#41575
    # describe() on an empty groupby must match the non-empty result reduced
    # to zero rows.
    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
    df["B"] = df["B"].astype(int)
    df["C"] = df["C"].astype(float)
    empty = df.iloc[:0]

    # Frame-level describe.
    expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
    result = empty.groupby("A").describe(**kwargs)
    tm.assert_frame_equal(result, expected)

    # Single-column describe; the expected index takes the dtype of the
    # column labels.
    expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
    expected.index = Index([], dtype=df.columns.dtype)
    result = empty.groupby("A").B.describe(**kwargs)
    tm.assert_frame_equal(result, expected)
| @ -0,0 +1,255 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     NaT, | ||||
|     Series, | ||||
|     Timedelta, | ||||
|     Timestamp, | ||||
|     date_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
def test_group_shift_with_null_key():
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Build a moderately large frame in which column `B` is missing on every
    # third row, then group by [`A`, `B`]. This should force `-1` in the
    # `labels` array of `g._grouper.group_info` exactly at those places, where
    # the group-by key is partially missing.
    records = [(i % 12, (i % 3) or np.nan, i) for i in range(n_rows)]
    df = DataFrame(records, dtype=float, columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])
    result = g.shift(-1)

    # Rows with a complete key pick up the value 12 rows further down; rows
    # with a missing key, or within the final 12 rows, are expected to be NaN.
    shifted = [
        i + 12 if (i % 3 != 0 and i < n_rows - 12) else np.nan for i in range(n_rows)
    ]
    expected = DataFrame(shifted, dtype=float, columns=["Z"], index=None)

    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_group_shift_with_fill_value():
    # GH #24128
    n_rows = 24
    records = [(i % 12, i % 3, i) for i in range(n_rows)]
    df = DataFrame(records, dtype=float, columns=["A", "B", "Z"], index=None)
    result = df.groupby(["A", "B"]).shift(-1, fill_value=0)

    # The first 12 rows pick up the value 12 rows further down; the remaining
    # rows are expected to receive the fill value.
    shifted = [i + 12 if i < n_rows - 12 else 0 for i in range(n_rows)]
    expected = DataFrame(shifted, dtype=float, columns=["Z"], index=None)

    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_group_shift_lose_timezone():
    # GH 30134
    # A zero-period grouped shift must hand back the tz-aware value unchanged.
    # Timestamp.now(tz="UTC") is used rather than the deprecated
    # Timestamp.utcnow(); both produce a UTC-aware timestamp.
    now_dt = Timestamp.now(tz="UTC").as_unit("ns")
    df = DataFrame({"a": [1, 1], "date": now_dt})
    result = df.groupby("a").shift(0).iloc[0]
    # The first shifted row should still carry the original tz-aware value.
    expected = Series({"date": now_dt}, name=result.name)
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_group_diff_real_series(any_real_numpy_dtype):
    # Grouped diff on a real-valued column: the first row of every group is
    # NaN and the result dtype is float32 for the narrow input dtypes listed
    # below, "float" otherwise.
    frame = DataFrame(
        {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
        dtype=any_real_numpy_dtype,
    )
    result = frame.groupby("a")["b"].diff()

    is_narrow = any_real_numpy_dtype in ["int8", "int16", "float32"]
    exp_dtype = "float32" if is_narrow else "float"
    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_group_diff_real_frame(any_real_numpy_dtype):
    # Frame counterpart of the grouped diff check: every non-key column is
    # differenced within its group and shares the expected float dtype.
    frame = DataFrame(
        {
            "a": [1, 2, 3, 3, 2],
            "b": [1, 2, 3, 4, 5],
            "c": [1, 2, 3, 4, 6],
        },
        dtype=any_real_numpy_dtype,
    )
    result = frame.groupby("a").diff()

    is_narrow = any_real_numpy_dtype in ["int8", "int16", "float32"]
    exp_dtype = "float32" if is_narrow else "float"
    expected = DataFrame(
        {
            "b": [np.nan, np.nan, np.nan, 1.0, 3.0],
            "c": [np.nan, np.nan, np.nan, 1.0, 4.0],
        },
        dtype=exp_dtype,
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "data",
    [
        [
            Timestamp("2013-01-01"),
            Timestamp("2013-01-02"),
            Timestamp("2013-01-03"),
        ],
        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
    ],
)
def test_group_diff_datetimelike(data, unit):
    # Grouped diff of datetime or timedelta values is expected to be a
    # timedelta Series in the same unit as the input column.
    frame = DataFrame({"a": [1, 2, 2], "b": data})
    frame["b"] = frame["b"].dt.as_unit(unit)

    expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit)
    result = frame.groupby("a")["b"].diff()
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_group_diff_bool():
    # Grouped diff on booleans: NaN for the first row of each group and a
    # boolean difference for the later rows.
    frame = DataFrame(
        {"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}
    )
    expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
    result = frame.groupby("a")["b"].diff()
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_group_diff_object_raises(object_dtype):
    # Strings cannot be subtracted, so a grouped diff on them must raise.
    frame = DataFrame(
        {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]},
        dtype=object_dtype,
    )
    grouped_b = frame.groupby("a")["b"]
    with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
        grouped_b.diff()
|  | ||||
|  | ||||
def test_empty_shift_with_fill():
    # GH 41264, single-index check
    # On an empty frame a fill_value must not change the shifted result.
    empty = DataFrame(columns=["a", "b", "c"])
    plain = empty.groupby(["a"]).shift(1)
    filled = empty.groupby(["a"]).shift(1, fill_value=0)
    tm.assert_frame_equal(plain, filled)
    tm.assert_index_equal(plain.index, filled.index)
|  | ||||
|  | ||||
def test_multindex_empty_shift_with_fill():
    # GH 41264, multi-index check
    # Same as the single-key check, grouping on two columns instead.
    empty = DataFrame(columns=["a", "b", "c"])
    group_keys = ["a", "b"]
    plain = empty.groupby(group_keys).shift(1)
    filled = empty.groupby(group_keys).shift(1, fill_value=0)
    tm.assert_frame_equal(plain, filled)
    tm.assert_index_equal(plain.index, filled.index)
|  | ||||
|  | ||||
def test_shift_periods_freq():
    # GH 54093
    # Shifting by -2 periods with freq="D" is expected to move the index back
    # two days while leaving the values untouched.
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    original_index = date_range(start="20100101", periods=6)
    shifted_index = date_range(start="2009-12-30", periods=6)

    df = DataFrame(data, index=original_index)
    result = df.groupby(df.index).shift(periods=-2, freq="D")
    expected = DataFrame(data, index=shifted_index)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_shift_deprecate_freq_and_fill_value(): | ||||
|     # GH 53832 | ||||
|     data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} | ||||
|     df = DataFrame(data, index=date_range(start="20100101", periods=6)) | ||||
|     msg = ( | ||||
|         "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value" | ||||
|     ) | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1") | ||||
|  | ||||
|  | ||||
def test_shift_disallow_suffix_if_periods_is_int():
    # GH#44424
    # `suffix` is rejected when `periods` is a single int.
    df = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]})
    grouped = df.groupby("b")
    msg = "Cannot specify `suffix` if `periods` is an int."
    with pytest.raises(ValueError, match=msg):
        grouped.shift(1, suffix="fails")
|  | ||||
|  | ||||
| def test_group_shift_with_multiple_periods(): | ||||
|     # GH#44424 | ||||
|     df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) | ||||
|  | ||||
|     shifted_df = df.groupby("b")[["a"]].shift([0, 1]) | ||||
|     expected_df = DataFrame( | ||||
|         {"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]} | ||||
|     ) | ||||
|     tm.assert_frame_equal(shifted_df, expected_df) | ||||
|  | ||||
|     # series | ||||
|     shifted_series = df.groupby("b")["a"].shift([0, 1]) | ||||
|     tm.assert_frame_equal(shifted_series, expected_df) | ||||
|  | ||||
|  | ||||
| def test_group_shift_with_multiple_periods_and_freq(): | ||||
|     # GH#44424 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, | ||||
|         index=date_range("1/1/2000", periods=5, freq="h"), | ||||
|     ) | ||||
|     shifted_df = df.groupby("b")[["a"]].shift( | ||||
|         [0, 1], | ||||
|         freq="h", | ||||
|     ) | ||||
|     expected_df = DataFrame( | ||||
|         { | ||||
|             "a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan], | ||||
|             "a_1": [ | ||||
|                 np.nan, | ||||
|                 1.0, | ||||
|                 2.0, | ||||
|                 3.0, | ||||
|                 4.0, | ||||
|                 5.0, | ||||
|             ], | ||||
|         }, | ||||
|         index=date_range("1/1/2000", periods=6, freq="h"), | ||||
|     ) | ||||
|     tm.assert_frame_equal(shifted_df, expected_df) | ||||
|  | ||||
|  | ||||
| def test_group_shift_with_multiple_periods_and_fill_value(): | ||||
|     # GH#44424 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, | ||||
|     ) | ||||
|     shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1) | ||||
|     expected_df = DataFrame( | ||||
|         {"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]}, | ||||
|     ) | ||||
|     tm.assert_frame_equal(shifted_df, expected_df) | ||||
|  | ||||
|  | ||||
| def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): | ||||
|     # GH#44424 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, | ||||
|         index=date_range("1/1/2000", periods=5, freq="h"), | ||||
|     ) | ||||
|     msg = ( | ||||
|         "Passing a 'freq' together with a 'fill_value' silently ignores the " | ||||
|         "fill_value" | ||||
|     ) | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h") | ||||
| @ -0,0 +1,78 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly increasing (T), strictly decreasing (F),
        # abs val increasing (F), non-strictly increasing (T)
        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
            [True, False, True, False],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_increasing(in_vals, out_vals):
    # GH 17015
    # Eleven rows split into groups a, b, c (three rows each) and d (two rows).
    frame = DataFrame(
        {
            "A": [str(i) for i in range(1, 12)],
            "B": list("aaabbbcccdd"),
            "C": in_vals,
        }
    )
    result = frame.groupby("B").C.is_monotonic_increasing

    group_index = Index(list("abcd"), name="B")
    expected = Series(out_vals, index=group_index, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    by_apply = frame.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
    tm.assert_series_equal(result, by_apply)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly decreasing (T), strictly increasing (F),
        # abs val decreasing (F), non-strictly decreasing (T)
        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
            [True, True, False, True],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
    # GH 17015
    # Eleven rows split into groups a, b, c (three rows each) and d (two rows);
    # `out_vals` holds the expected flag for each group in that order.
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }

    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_decreasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_decreasing,
    # mirroring the cross-check done in the increasing counterpart.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_decreasing)
    tm.assert_series_equal(result, expected)
| @ -0,0 +1,115 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     date_range, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
def test_nlargest():
    # Two groups of five rows each, labelled "a" and "b".
    keys = Series(list("aaaaabbbbb"))

    # Distinct values: the three largest per group, in descending order.
    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    expected = Series(
        [7, 5, 3, 10, 9, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
    )
    tm.assert_series_equal(values.groupby(keys).nlargest(3), expected)

    # Tied values with keep="last"; the expected index labels pin which of the
    # tied rows are retained.
    values = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    expected = Series(
        [3, 2, 1, 3, 3, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
    )
    tm.assert_series_equal(values.groupby(keys).nlargest(3, keep="last"), expected)
|  | ||||
|  | ||||
def test_nlargest_mi_grouper():
    # see gh-21411
    # nlargest on a Series grouped by one level of its MultiIndex: the group
    # key is prepended to the original index entries.
    npr = np.random.default_rng(2)

    dts = date_range("20180101", periods=10)
    idx = MultiIndex.from_product([dts, ["one", "two"]], names=["first", "second"])
    s = Series(npr.standard_normal(20), index=idx)

    result = s.groupby("first").nlargest(1)

    # Second-level label of the largest value for each date (seeded data).
    winners = ["one", "one", "one", "two", "one", "one", "one", "one", "one", "one"]
    exp_idx = MultiIndex.from_tuples(
        [(dt, dt, label) for dt, label in zip(dts, winners)],
        names=["first", "first", "second"],
    )
    exp_values = [
        0.18905338179353307,
        -0.41306354339189344,
        1.799707382720902,
        0.7738065867276614,
        0.28121066979764925,
        0.9775674511260357,
        -0.3288239040579627,
        0.45495807124085547,
        0.5452887139646817,
        0.12682784711186987,
    ]
    expected = Series(exp_values, index=exp_idx)

    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
|  | ||||
|  | ||||
def test_nsmallest():
    # Two groups of five rows each, labelled "a" and "b".
    keys = Series(list("aaaaabbbbb"))

    # Distinct values: the three smallest per group, in ascending order.
    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    expected = Series(
        [1, 2, 3, 0, 4, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
    )
    tm.assert_series_equal(values.groupby(keys).nsmallest(3), expected)

    # Tied values with keep="last"; the expected index labels pin which of the
    # tied rows are retained.
    values = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    expected = Series(
        [0, 1, 1, 0, 1, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
    )
    tm.assert_series_equal(values.groupby(keys).nsmallest(3, keep="last"), expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "data, groups",
    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
def test_nlargest_and_smallest_noop(data, groups, dtype, method):
    # GH 15272, GH 16345, GH 29129
    # Test nlargest/smallest when it results in a noop,
    # i.e. input is sorted and group size <= n
    if dtype is not None:
        data = np.array(data, dtype=dtype)
    if method == "nlargest":
        # Reverse the ascending data so it is already in nlargest order.
        data = list(reversed(data))

    ser = Series(data, name="a")
    selector = getattr(ser.groupby(groups), method)
    result = selector(n=2)

    if isinstance(groups, list):
        expidx = np.array(groups, dtype=int)
    else:
        expidx = groups
    expected_index = MultiIndex.from_arrays([expidx, ser.index])
    expected = Series(data, index=expected_index, name="a")
    tm.assert_series_equal(result, expected)
| @ -0,0 +1,922 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     Timestamp, | ||||
|     isna, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
def test_first_last_nth(df):
    # tests for first / last / nth
    by_a = df.groupby("A")
    value_cols = ["B", "C", "D"]

    # first() is indexed by the group key ...
    expected = df.loc[[1, 0], value_cols]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(by_a.first(), expected)

    # ... whereas nth(0) returns the original rows untouched.
    tm.assert_frame_equal(by_a.nth(0), df.loc[[0, 1]])

    expected = df.loc[[5, 7], value_cols]
    expected.index = Index(["bar", "foo"], name="A")
    tm.assert_frame_equal(by_a.last(), expected)

    for position, rows in ((-1, [5, 7]), (1, [2, 3])):
        tm.assert_frame_equal(by_a.nth(position), df.iloc[rows])

    # it works!
    selected_b = by_a["B"]
    selected_b.first()
    selected_b.last()
    selected_b.nth(0)

    # With every "foo" value of "B" missing, first/last/nth report NaN.
    masked = df.copy()
    masked.loc[masked["A"] == "foo", "B"] = np.nan
    masked_b = masked.groupby("A")["B"]
    assert isna(masked_b.first()["foo"])
    assert isna(masked_b.last()["foo"])
    assert isna(masked_b.nth(0).iloc[0])

    # v0.14.0 whatsnew
    small = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = small.groupby("A")
    non_null_rows = small.iloc[[1, 2]]

    tm.assert_frame_equal(g.first(), non_null_rows.set_index("A"))
    tm.assert_frame_equal(g.nth(0, dropna="any"), small.iloc[[1, 2]])
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["first", "last"]) | ||||
| def test_first_last_with_na_object(method, nulls_fixture): | ||||
|     # https://github.com/pandas-dev/pandas/issues/32123 | ||||
|     groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a") | ||||
|     result = getattr(groups, method)() | ||||
|  | ||||
|     if method == "first": | ||||
|         values = [1, 3] | ||||
|     else: | ||||
|         values = [2, 3] | ||||
|  | ||||
|     values = np.array(values, dtype=result["b"].dtype) | ||||
|     idx = Index([1, 2], name="a") | ||||
|     expected = DataFrame({"b": values}, index=idx) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("index", [0, -1]) | ||||
| def test_nth_with_na_object(index, nulls_fixture): | ||||
|     # https://github.com/pandas-dev/pandas/issues/32123 | ||||
|     df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}) | ||||
|     groups = df.groupby("a") | ||||
|     result = groups.nth(index) | ||||
|     expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["first", "last"]) | ||||
| def test_first_last_with_None(method): | ||||
|     # https://github.com/pandas-dev/pandas/issues/32800 | ||||
|     # None should be preserved as object dtype | ||||
|     df = DataFrame.from_dict({"id": ["a"], "value": [None]}) | ||||
|     groups = df.groupby("id", as_index=False) | ||||
|     result = getattr(groups, method)() | ||||
|  | ||||
|     tm.assert_frame_equal(result, df) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["first", "last"]) | ||||
| @pytest.mark.parametrize( | ||||
|     "df, expected", | ||||
|     [ | ||||
|         ( | ||||
|             DataFrame({"id": "a", "value": [None, "foo", np.nan]}), | ||||
|             DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")), | ||||
|         ), | ||||
|         ( | ||||
|             DataFrame({"id": "a", "value": [np.nan]}, dtype=object), | ||||
|             DataFrame({"value": [None]}, index=Index(["a"], name="id")), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_first_last_with_None_expanded(method, df, expected): | ||||
|     # GH 32800, 38286 | ||||
|     result = getattr(df.groupby("id"), method)() | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_first_last_nth_dtypes(): | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], | ||||
|             "B": ["one", "one", "two", "three", "two", "two", "one", "three"], | ||||
|             "C": np.random.default_rng(2).standard_normal(8), | ||||
|             "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), | ||||
|         } | ||||
|     ) | ||||
|     df["E"] = True | ||||
|     df["F"] = 1 | ||||
|  | ||||
|     # tests for first / last / nth | ||||
|     grouped = df.groupby("A") | ||||
|     first = grouped.first() | ||||
|     expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]] | ||||
|     expected.index = Index(["bar", "foo"], name="A") | ||||
|     expected = expected.sort_index() | ||||
|     tm.assert_frame_equal(first, expected) | ||||
|  | ||||
|     last = grouped.last() | ||||
|     expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]] | ||||
|     expected.index = Index(["bar", "foo"], name="A") | ||||
|     expected = expected.sort_index() | ||||
|     tm.assert_frame_equal(last, expected) | ||||
|  | ||||
|     nth = grouped.nth(1) | ||||
|     expected = df.iloc[[2, 3]] | ||||
|     tm.assert_frame_equal(nth, expected) | ||||
|  | ||||
|  | ||||
| def test_first_last_nth_dtypes2(): | ||||
|     # GH 2763, first/last shifting dtypes | ||||
|     idx = list(range(10)) | ||||
|     idx.append(9) | ||||
|     ser = Series(data=range(11), index=idx, name="IntCol") | ||||
|     assert ser.dtype == "int64" | ||||
|     f = ser.groupby(level=0).first() | ||||
|     assert f.dtype == "int64" | ||||
|  | ||||
|  | ||||
| def test_first_last_nth_nan_dtype(): | ||||
|     # GH 33591 | ||||
|     df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)}) | ||||
|     grouped = df.groupby("data") | ||||
|  | ||||
|     expected = df.set_index("data").nans | ||||
|     tm.assert_series_equal(grouped.nans.first(), expected) | ||||
|     tm.assert_series_equal(grouped.nans.last(), expected) | ||||
|  | ||||
|     expected = df.nans | ||||
|     tm.assert_series_equal(grouped.nans.nth(-1), expected) | ||||
|     tm.assert_series_equal(grouped.nans.nth(0), expected) | ||||
|  | ||||
|  | ||||
| def test_first_strings_timestamps(): | ||||
|     # GH 11244 | ||||
|     test = DataFrame( | ||||
|         { | ||||
|             Timestamp("2012-01-01 00:00:00"): ["a", "b"], | ||||
|             Timestamp("2012-01-02 00:00:00"): ["c", "d"], | ||||
|             "name": ["e", "e"], | ||||
|             "aaaa": ["f", "g"], | ||||
|         } | ||||
|     ) | ||||
|     result = test.groupby("name").first() | ||||
|     expected = DataFrame( | ||||
|         [["a", "c", "f"]], | ||||
|         columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), | ||||
|         index=Index(["e"], name="name"), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth(): | ||||
|     df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) | ||||
|     gb = df.groupby("A") | ||||
|  | ||||
|     tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]]) | ||||
|     tm.assert_frame_equal(gb.nth(1), df.iloc[[1]]) | ||||
|     tm.assert_frame_equal(gb.nth(2), df.loc[[]]) | ||||
|     tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]]) | ||||
|     tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]]) | ||||
|     tm.assert_frame_equal(gb.nth(-3), df.loc[[]]) | ||||
|     tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]]) | ||||
|     tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]]) | ||||
|     tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]]) | ||||
|  | ||||
|     tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]]) | ||||
|     tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]]) | ||||
|  | ||||
|     tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0]) | ||||
|     tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0]) | ||||
|  | ||||
|  | ||||
| def test_nth2(): | ||||
|     # out of bounds, regression from 0.13.1 | ||||
|     # GH 6621 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"}, | ||||
|             "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"}, | ||||
|             "two": { | ||||
|                 0: 1.5456590000000001, | ||||
|                 1: -0.070345000000000005, | ||||
|                 2: -2.4004539999999999, | ||||
|                 3: 0.46206000000000003, | ||||
|                 4: 0.52350799999999997, | ||||
|             }, | ||||
|             "one": { | ||||
|                 0: 0.56573799999999996, | ||||
|                 1: -0.9742360000000001, | ||||
|                 2: 1.033801, | ||||
|                 3: -0.78543499999999999, | ||||
|                 4: 0.70422799999999997, | ||||
|             }, | ||||
|         } | ||||
|     ).set_index(["color", "food"]) | ||||
|  | ||||
|     result = df.groupby(level=0, as_index=False).nth(2) | ||||
|     expected = df.iloc[[-1]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(level=0, as_index=False).nth(3) | ||||
|     expected = df.loc[[]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth3(): | ||||
|     # GH 7559 | ||||
|     # from the vbench | ||||
|     df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64") | ||||
|     ser = df[1] | ||||
|     gb = df[0] | ||||
|     expected = ser.groupby(gb).first() | ||||
|     expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0]) | ||||
|     tm.assert_series_equal(expected2, expected, check_names=False) | ||||
|     assert expected.name == 1 | ||||
|     assert expected2.name == 1 | ||||
|  | ||||
|     # validate first | ||||
|     v = ser[gb == 1].iloc[0] | ||||
|     assert expected.iloc[0] == v | ||||
|     assert expected2.iloc[0] == v | ||||
|  | ||||
|     with pytest.raises(ValueError, match="For a DataFrame"): | ||||
|         ser.groupby(gb, sort=False).nth(0, dropna=True) | ||||
|  | ||||
|  | ||||
| def test_nth4(): | ||||
|     # doc example | ||||
|     df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) | ||||
|     gb = df.groupby("A") | ||||
|     result = gb.B.nth(0, dropna="all") | ||||
|     expected = df.B.iloc[[1, 2]] | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth5(): | ||||
|     # test multiple nth values | ||||
|     df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) | ||||
|     gb = df.groupby("A") | ||||
|  | ||||
|     tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]]) | ||||
|     tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]]) | ||||
|     tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]]) | ||||
|     tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]]) | ||||
|     tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]]) | ||||
|     tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]]) | ||||
|     tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]]) | ||||
|     tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]]) | ||||
|  | ||||
|  | ||||
| def test_nth_bdays(unit): | ||||
|     business_dates = pd.date_range( | ||||
|         start="4/1/2014", end="6/30/2014", freq="B", unit=unit | ||||
|     ) | ||||
|     df = DataFrame(1, index=business_dates, columns=["a", "b"]) | ||||
|     # get the first, fourth and last two business days for each month | ||||
|     key = [df.index.year, df.index.month] | ||||
|     result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) | ||||
|     expected_dates = pd.to_datetime( | ||||
|         [ | ||||
|             "2014/4/1", | ||||
|             "2014/4/4", | ||||
|             "2014/4/29", | ||||
|             "2014/4/30", | ||||
|             "2014/5/1", | ||||
|             "2014/5/6", | ||||
|             "2014/5/29", | ||||
|             "2014/5/30", | ||||
|             "2014/6/2", | ||||
|             "2014/6/5", | ||||
|             "2014/6/27", | ||||
|             "2014/6/30", | ||||
|         ] | ||||
|     ).as_unit(unit) | ||||
|     expected = DataFrame(1, columns=["a", "b"], index=expected_dates) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth_multi_grouper(three_group): | ||||
|     # PR 9090, related to issue 8979 | ||||
|     # test nth on multiple groupers | ||||
|     grouped = three_group.groupby(["A", "B"]) | ||||
|     result = grouped.nth(0) | ||||
|     expected = three_group.iloc[[0, 3, 4, 7]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "data, expected_first, expected_last", | ||||
|     [ | ||||
|         ( | ||||
|             { | ||||
|                 "id": ["A"], | ||||
|                 "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 "foo": [1], | ||||
|             }, | ||||
|             { | ||||
|                 "id": ["A"], | ||||
|                 "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 "foo": [1], | ||||
|             }, | ||||
|             { | ||||
|                 "id": ["A"], | ||||
|                 "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 "foo": [1], | ||||
|             }, | ||||
|         ), | ||||
|         ( | ||||
|             { | ||||
|                 "id": ["A", "B", "A"], | ||||
|                 "time": [ | ||||
|                     Timestamp("2012-01-01 13:00:00", tz="America/New_York"), | ||||
|                     Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                     Timestamp("2012-03-01 12:00:00", tz="Europe/London"), | ||||
|                 ], | ||||
|                 "foo": [1, 2, 3], | ||||
|             }, | ||||
|             { | ||||
|                 "id": ["A", "B"], | ||||
|                 "time": [ | ||||
|                     Timestamp("2012-01-01 13:00:00", tz="America/New_York"), | ||||
|                     Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 ], | ||||
|                 "foo": [1, 2], | ||||
|             }, | ||||
|             { | ||||
|                 "id": ["A", "B"], | ||||
|                 "time": [ | ||||
|                     Timestamp("2012-03-01 12:00:00", tz="Europe/London"), | ||||
|                     Timestamp("2012-02-01 14:00:00", tz="US/Central"), | ||||
|                 ], | ||||
|                 "foo": [3, 2], | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_first_last_tz(data, expected_first, expected_last): | ||||
|     # GH15884 | ||||
|     # Test that the timezone is retained when calling first | ||||
|     # or last on groupby with as_index=False | ||||
|  | ||||
|     df = DataFrame(data) | ||||
|  | ||||
|     result = df.groupby("id", as_index=False).first() | ||||
|     expected = DataFrame(expected_first) | ||||
|     cols = ["id", "time", "foo"] | ||||
|     tm.assert_frame_equal(result[cols], expected[cols]) | ||||
|  | ||||
|     result = df.groupby("id", as_index=False)["time"].first() | ||||
|     tm.assert_frame_equal(result, expected[["id", "time"]]) | ||||
|  | ||||
|     result = df.groupby("id", as_index=False).last() | ||||
|     expected = DataFrame(expected_last) | ||||
|     cols = ["id", "time", "foo"] | ||||
|     tm.assert_frame_equal(result[cols], expected[cols]) | ||||
|  | ||||
|     result = df.groupby("id", as_index=False)["time"].last() | ||||
|     tm.assert_frame_equal(result, expected[["id", "time"]]) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "method, ts, alpha", | ||||
|     [ | ||||
|         ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"], | ||||
|         ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], | ||||
|     ], | ||||
| ) | ||||
| def test_first_last_tz_multi_column(method, ts, alpha, unit): | ||||
|     # GH 21603 | ||||
|     category_string = Series(list("abc")).astype("category") | ||||
|     dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit) | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "group": [1, 1, 2], | ||||
|             "category_string": category_string, | ||||
|             "datetimetz": dti, | ||||
|         } | ||||
|     ) | ||||
|     result = getattr(df.groupby("group"), method)() | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "category_string": pd.Categorical( | ||||
|                 [alpha, "c"], dtype=category_string.dtype | ||||
|             ), | ||||
|             "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], | ||||
|         }, | ||||
|         index=Index([1, 2], name="group"), | ||||
|     ) | ||||
|     expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "values", | ||||
|     [ | ||||
|         pd.array([True, False], dtype="boolean"), | ||||
|         pd.array([1, 2], dtype="Int64"), | ||||
|         pd.to_datetime(["2020-01-01", "2020-02-01"]), | ||||
|         pd.to_timedelta([1, 2], unit="D"), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("function", ["first", "last", "min", "max"]) | ||||
| def test_first_last_extension_array_keeps_dtype(values, function): | ||||
|     # https://github.com/pandas-dev/pandas/issues/33071 | ||||
|     # https://github.com/pandas-dev/pandas/issues/32194 | ||||
|     df = DataFrame({"a": [1, 2], "b": values}) | ||||
|     grouped = df.groupby("a") | ||||
|     idx = Index([1, 2], name="a") | ||||
|     expected_series = Series(values, name="b", index=idx) | ||||
|     expected_frame = DataFrame({"b": values}, index=idx) | ||||
|  | ||||
|     result_series = getattr(grouped["b"], function)() | ||||
|     tm.assert_series_equal(result_series, expected_series) | ||||
|  | ||||
|     result_frame = grouped.agg({"b": function}) | ||||
|     tm.assert_frame_equal(result_frame, expected_frame) | ||||
|  | ||||
|  | ||||
| def test_nth_multi_index_as_expected(): | ||||
|     # PR 9090, related to issue 8979 | ||||
|     # test nth on MultiIndex | ||||
|     three_group = DataFrame( | ||||
|         { | ||||
|             "A": [ | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "bar", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|                 "foo", | ||||
|             ], | ||||
|             "B": [ | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "two", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "one", | ||||
|                 "two", | ||||
|                 "two", | ||||
|                 "two", | ||||
|                 "one", | ||||
|             ], | ||||
|             "C": [ | ||||
|                 "dull", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "dull", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|                 "dull", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|                 "shiny", | ||||
|             ], | ||||
|         } | ||||
|     ) | ||||
|     grouped = three_group.groupby(["A", "B"]) | ||||
|     result = grouped.nth(0) | ||||
|     expected = three_group.iloc[[0, 3, 4, 7]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "op, n, expected_rows", | ||||
|     [ | ||||
|         ("head", -1, [0]), | ||||
|         ("head", 0, []), | ||||
|         ("head", 1, [0, 2]), | ||||
|         ("head", 7, [0, 1, 2]), | ||||
|         ("tail", -1, [1]), | ||||
|         ("tail", 0, []), | ||||
|         ("tail", 1, [1, 2]), | ||||
|         ("tail", 7, [0, 1, 2]), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]]) | ||||
| @pytest.mark.parametrize("as_index", [True, False]) | ||||
| def test_groupby_head_tail(op, n, expected_rows, columns, as_index): | ||||
|     df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) | ||||
|     g = df.groupby("A", as_index=as_index) | ||||
|     expected = df.iloc[expected_rows] | ||||
|     if columns is not None: | ||||
|         g = g[columns] | ||||
|         expected = expected[columns] | ||||
|     result = getattr(g, op)(n) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "op, n, expected_cols", | ||||
|     [ | ||||
|         ("head", -1, [0]), | ||||
|         ("head", 0, []), | ||||
|         ("head", 1, [0, 2]), | ||||
|         ("head", 7, [0, 1, 2]), | ||||
|         ("tail", -1, [1]), | ||||
|         ("tail", 0, []), | ||||
|         ("tail", 1, [1, 2]), | ||||
|         ("tail", 7, [0, 1, 2]), | ||||
|     ], | ||||
| ) | ||||
| def test_groupby_head_tail_axis_1(op, n, expected_cols): | ||||
|     # GH 9772 | ||||
|     df = DataFrame( | ||||
|         [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"] | ||||
|     ) | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         g = df.groupby([0, 0, 1], axis=1) | ||||
|     expected = df.iloc[:, expected_cols] | ||||
|     result = getattr(g, op)(n) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_group_selection_cache(): | ||||
|     # GH 12839 nth, head, and tail should return same result consistently | ||||
|     # Each scenario below builds a fresh groupby object and calls two | ||||
|     # selection methods on it back to back; the order of the two calls is | ||||
|     # what differs between scenarios, so do not reorder them. | ||||
|     df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) | ||||
|     # rows expected from nth(0): the first row of each group | ||||
|     expected = df.iloc[[0, 2]] | ||||
|  | ||||
|     # head, then nth (n=2 covers every group, so head returns all of df) | ||||
|     g = df.groupby("A") | ||||
|     result1 = g.head(n=2) | ||||
|     result2 = g.nth(0) | ||||
|     tm.assert_frame_equal(result1, df) | ||||
|     tm.assert_frame_equal(result2, expected) | ||||
|  | ||||
|     # tail, then nth | ||||
|     g = df.groupby("A") | ||||
|     result1 = g.tail(n=2) | ||||
|     result2 = g.nth(0) | ||||
|     tm.assert_frame_equal(result1, df) | ||||
|     tm.assert_frame_equal(result2, expected) | ||||
|  | ||||
|     # nth, then head | ||||
|     g = df.groupby("A") | ||||
|     result1 = g.nth(0) | ||||
|     result2 = g.head(n=2) | ||||
|     tm.assert_frame_equal(result1, expected) | ||||
|     tm.assert_frame_equal(result2, df) | ||||
|  | ||||
|     # nth, then tail | ||||
|     g = df.groupby("A") | ||||
|     result1 = g.nth(0) | ||||
|     result2 = g.tail(n=2) | ||||
|     tm.assert_frame_equal(result1, expected) | ||||
|     tm.assert_frame_equal(result2, df) | ||||
|  | ||||
|  | ||||
| def test_nth_empty(): | ||||
|     # GH 16064 | ||||
|     df = DataFrame(index=[0], columns=["a", "b", "c"]) | ||||
|     result = df.groupby("a").nth(10) | ||||
|     expected = df.iloc[:0] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby(["a", "b"]).nth(10) | ||||
|     expected = df.iloc[:0] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_nth_column_order(): | ||||
|     # GH 20760 | ||||
|     # Check that nth preserves column order | ||||
|     df = DataFrame( | ||||
|         [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]], | ||||
|         columns=["A", "C", "B"], | ||||
|     ) | ||||
|     result = df.groupby("A").nth(0) | ||||
|     expected = df.iloc[[0, 3]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("A").nth(-1, dropna="any") | ||||
|     expected = df.iloc[[1, 4]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dropna", [None, "any", "all"]) | ||||
| def test_nth_nan_in_grouper(dropna): | ||||
|     # GH 26011 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [np.nan, "a", np.nan, "b", np.nan], | ||||
|             "b": [0, 2, 4, 6, 8], | ||||
|             "c": [1, 3, 5, 7, 9], | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("a").nth(0, dropna=dropna) | ||||
|     expected = df.iloc[[1, 3]] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dropna", [None, "any", "all"]) | ||||
| def test_nth_nan_in_grouper_series(dropna): | ||||
|     # GH 26454 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [np.nan, "a", np.nan, "b", np.nan], | ||||
|             "b": [0, 2, 4, 6, 8], | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("a")["b"].nth(0, dropna=dropna) | ||||
|     expected = df["b"].iloc[[1, 3]] | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_first_categorical_and_datetime_data_nat(): | ||||
|     # GH 20520 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "group": ["first", "first", "second", "third", "third"], | ||||
|             "time": 5 * [np.datetime64("NaT")], | ||||
|             "categories": Series(["a", "b", "c", "a", "b"], dtype="category"), | ||||
|         } | ||||
|     ) | ||||
|     result = df.groupby("group").first() | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "time": 3 * [np.datetime64("NaT")], | ||||
|             "categories": Series(["a", "c", "a"]).astype( | ||||
|                 pd.CategoricalDtype(["a", "b", "c"]) | ||||
|             ), | ||||
|         } | ||||
|     ) | ||||
|     expected.index = Index(["first", "second", "third"], name="group") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_first_multi_key_groupby_categorical(): | ||||
|     # GH 22512 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "A": [1, 1, 1, 2, 2], | ||||
|             "B": [100, 100, 200, 100, 100], | ||||
|             "C": ["apple", "orange", "mango", "mango", "orange"], | ||||
|             "D": ["jupiter", "mercury", "mars", "venus", "venus"], | ||||
|         } | ||||
|     ) | ||||
|     df = df.astype({"D": "category"}) | ||||
|     result = df.groupby(by=["A", "B"]).first() | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "C": ["apple", "mango", "mango"], | ||||
|             "D": Series(["jupiter", "mars", "venus"]).astype( | ||||
|                 pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"]) | ||||
|             ), | ||||
|         } | ||||
|     ) | ||||
|     expected.index = MultiIndex.from_tuples( | ||||
|         [(1, 100), (1, 200), (2, 100)], names=["A", "B"] | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["first", "last", "nth"]) | ||||
| def test_groupby_last_first_nth_with_none(method, nulls_fixture): | ||||
|     # GH29645 | ||||
|     expected = Series(["y"], dtype=object) | ||||
|     data = Series( | ||||
|         [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], | ||||
|         index=[0, 0, 0, 0, 0], | ||||
|         dtype=object, | ||||
|     ).groupby(level=0) | ||||
|  | ||||
|     if method == "nth": | ||||
|         result = getattr(data, method)(3) | ||||
|     else: | ||||
|         result = getattr(data, method)() | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "arg, expected_rows", | ||||
|     [ | ||||
|         [slice(None, 3, 2), [0, 1, 4, 5]], | ||||
|         [slice(None, -2), [0, 2, 5]], | ||||
|         [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], | ||||
|         [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], | ||||
|     ], | ||||
| ) | ||||
| def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): | ||||
|     # Test slices     GH #42947 | ||||
|  | ||||
|     result = slice_test_grouped.nth[arg] | ||||
|     equivalent = slice_test_grouped.nth(arg) | ||||
|     expected = slice_test_df.iloc[expected_rows] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|     tm.assert_frame_equal(equivalent, expected) | ||||
|  | ||||
|  | ||||
| def test_nth_indexed(slice_test_df, slice_test_grouped): | ||||
|     # Test index notation     GH #44688 | ||||
|  | ||||
|     result = slice_test_grouped.nth[0, 1, -2:] | ||||
|     equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)]) | ||||
|     expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|     tm.assert_frame_equal(equivalent, expected) | ||||
|  | ||||
|  | ||||
| def test_invalid_argument(slice_test_grouped): | ||||
|     # Test for error on invalid argument | ||||
|  | ||||
|     with pytest.raises(TypeError, match="Invalid index"): | ||||
|         slice_test_grouped.nth(3.14) | ||||
|  | ||||
|  | ||||
| def test_negative_step(slice_test_grouped): | ||||
|     # Test for error on negative slice step | ||||
|  | ||||
|     with pytest.raises(ValueError, match="Invalid step"): | ||||
|         slice_test_grouped.nth(slice(None, None, -1)) | ||||
|  | ||||
|  | ||||
| def test_np_ints(slice_test_df, slice_test_grouped): | ||||
|     # Test np ints work | ||||
|  | ||||
|     result = slice_test_grouped.nth(np.array([0, 1])) | ||||
|     expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_nth_with_column_axis(): | ||||
|     # GH43926 | ||||
|     df = DataFrame( | ||||
|         [ | ||||
|             [4, 5, 6], | ||||
|             [8, 8, 7], | ||||
|         ], | ||||
|         index=["z", "y"], | ||||
|         columns=["C", "B", "A"], | ||||
|     ) | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby(df.iloc[1], axis=1) | ||||
|     result = gb.nth(0) | ||||
|     expected = df.iloc[:, [0, 2]] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_nth_interval(): | ||||
|     # GH#24205 | ||||
|     # MultiIndex is built positionally from (levels, codes): two categorical | ||||
|     # interval levels, then one code list per level. | ||||
|     idx_result = MultiIndex( | ||||
|         [ | ||||
|             pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), | ||||
|             pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), | ||||
|         ], | ||||
|         # the -1 code marks a missing value in the second level of the last row | ||||
|         [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]], | ||||
|     ) | ||||
|     df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result) | ||||
|     result = df_result.groupby(level=[0, 1], observed=False).nth(0) | ||||
|     # Expected: rows 0, 1 and 3, i.e. the first row of each distinct code | ||||
|     # pair; the row with the missing key (row 4) is not expected back. | ||||
|     val_expected = [0, 1, 3] | ||||
|     idx_expected = MultiIndex( | ||||
|         [ | ||||
|             pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), | ||||
|             pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), | ||||
|         ], | ||||
|         [[0, 0, 1], [0, 1, 0]], | ||||
|     ) | ||||
|     expected = DataFrame(val_expected, index=idx_expected, columns=["col"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "start, stop, expected_values, expected_columns", | ||||
|     [ | ||||
|         (None, None, [0, 1, 2, 3, 4], list("ABCDE")), | ||||
|         (None, 1, [0, 3], list("AD")), | ||||
|         (None, 9, [0, 1, 2, 3, 4], list("ABCDE")), | ||||
|         (None, -1, [0, 1, 3], list("ABD")), | ||||
|         (1, None, [1, 2, 4], list("BCE")), | ||||
|         (1, -1, [1], list("B")), | ||||
|         (-1, None, [2, 4], list("CE")), | ||||
|         (-1, 2, [4], list("E")), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("method", ["call", "index"]) | ||||
| def test_nth_slices_with_column_axis( | ||||
|     start, stop, expected_values, expected_columns, method | ||||
| ): | ||||
|     df = DataFrame([range(5)], columns=[list("ABCDE")]) | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby([5, 5, 5, 6, 6], axis=1) | ||||
|     result = { | ||||
|         "call": lambda start, stop: gb.nth(slice(start, stop)), | ||||
|         "index": lambda start, stop: gb.nth[start:stop], | ||||
|     }[method](start, stop) | ||||
|     expected = DataFrame([expected_values], columns=[expected_columns]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
@pytest.mark.filterwarnings(
    "ignore:invalid value encountered in remainder:RuntimeWarning"
)
def test_head_tail_dropna_true():
    """head/tail/nth with default ``dropna`` return only the all-non-NaN key row."""
    # GH#45089
    frame = DataFrame(
        [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
    )
    expected = DataFrame([["a", "z"]], columns=["X", "Y"])

    selectors = (
        lambda gb: gb.head(n=1),
        lambda gb: gb.tail(n=1),
        lambda gb: gb.nth(n=0),
    )
    for select in selectors:
        # A fresh groupby per selector, mirroring three independent calls.
        tm.assert_frame_equal(select(frame.groupby(["X", "Y"])), expected)
|  | ||||
|  | ||||
def test_head_tail_dropna_false():
    """With ``dropna=False`` head/tail/nth keep rows whose key contains NaN."""
    # GH#45089
    rows = [["a", "z"], ["b", np.nan], ["c", np.nan]]
    frame = DataFrame(rows, columns=["X", "Y"])
    expected = DataFrame(
        [["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"]
    )

    selectors = (
        lambda gb: gb.head(n=1),
        lambda gb: gb.tail(n=1),
        lambda gb: gb.nth(n=0),
    )
    for select in selectors:
        grouped = frame.groupby(["X", "Y"], dropna=False)
        tm.assert_frame_equal(select(grouped), expected)
|  | ||||
|  | ||||
| @pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"])) | ||||
| @pytest.mark.parametrize("dropna", ["any", "all", None]) | ||||
| def test_nth_after_selection(selection, dropna): | ||||
|     # GH#11038, GH#53518 | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "a": [1, 1, 2], | ||||
|             "b": [np.nan, 3, 4], | ||||
|             "c": [5, 6, 7], | ||||
|         } | ||||
|     ) | ||||
|     gb = df.groupby("a")[selection] | ||||
|     result = gb.nth(0, dropna=dropna) | ||||
|     if dropna == "any" or (dropna == "all" and selection != ["b", "c"]): | ||||
|         locs = [1, 2] | ||||
|     else: | ||||
|         locs = [0, 2] | ||||
|     expected = df.loc[locs, selection] | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "data",
    [
        (
            Timestamp("2011-01-15 12:50:28.502376"),
            Timestamp("2011-01-20 12:50:28.593448"),
        ),
        (24650000000000001, 24650000000000002),
    ],
)
def test_groupby_nth_int_like_precision(data):
    """``nth(0)`` returns the first int-like value without losing precision."""
    # GH#6620, GH#9311
    frame = DataFrame({"a": [1, 1], "b": data})
    first_rows = frame.groupby("a").nth(0)

    expected = DataFrame({"a": 1, "b": [data[0]]})
    tm.assert_frame_equal(first_rows, expected)
| @ -0,0 +1,496 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize(
    "a_vals,b_vals",
    [
        # Ints
        ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
        ([1, 2, 3, 4], [4, 3, 2, 1]),
        ([1, 2, 3, 4, 5], [4, 3, 2, 1]),
        # Floats
        ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
        # Missing data
        ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
        ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
        # Timestamps
        (
            pd.date_range("1/1/18", freq="D", periods=5),
            pd.date_range("1/1/18", freq="D", periods=5)[::-1],
        ),
        (
            pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
            pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
        ),
        # All NA
        ([np.nan] * 5, [np.nan] * 5),
    ],
)
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
def test_quantile(interpolation, a_vals, b_vals, q, request):
    """Grouped quantile agrees with ``Series.quantile`` computed per group."""
    equidistant_nearest = (
        interpolation == "nearest"
        and q == 0.5
        and isinstance(b_vals, list)
        and b_vals == [4, 3, 2, 1]
    )
    if equidistant_nearest:
        xfail_mark = pytest.mark.xfail(
            reason="Unclear numpy expectation for nearest "
            "result with equidistant data"
        )
        request.applymarker(xfail_mark)

    ser_a = pd.Series(a_vals)
    ser_b = pd.Series(b_vals)
    all_vals = pd.concat([ser_a, ser_b])

    # Reference values: one ungrouped quantile per group.
    per_group = [
        ser_a.quantile(q, interpolation=interpolation),
        ser_b.quantile(q, interpolation=interpolation),
    ]
    expected = DataFrame(
        per_group, columns=["val"], index=Index(["a", "b"], name="key")
    )
    if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
        # TODO(non-nano): this should be unnecessary once array_to_datetime
        #  correctly infers non-nano from Timestamp.unit
        expected = expected.astype(all_vals.dtype)

    keys = ["a"] * len(a_vals) + ["b"] * len(b_vals)
    frame = DataFrame({"key": keys, "val": all_vals})
    result = frame.groupby("key").quantile(q, interpolation=interpolation)

    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_quantile_array():
    """A list-like ``q`` adds the quantile as the innermost index level."""
    # https://github.com/pandas-dev/pandas/issues/27526
    one_col = DataFrame({"A": [0, 1, 2, 3, 4]})
    labels = np.array([0, 0, 1, 1, 1], dtype=np.int64)
    expected = DataFrame(
        {"A": [0.25, 2.50]}, index=pd.MultiIndex.from_product([[0, 1], [0.25]])
    )
    tm.assert_frame_equal(one_col.groupby(labels).quantile([0.25]), expected)

    two_col = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
    labels = np.array([0, 0, 1, 1], dtype=np.int64)
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]},
        index=pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]),
    )
    tm.assert_frame_equal(two_col.groupby(labels).quantile([0.25, 0.75]), expected)
|  | ||||
|  | ||||
def test_quantile_array2():
    """List-like ``q`` on seeded random integer data grouped by column "A"."""
    # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
    rng = np.random.default_rng(2)
    frame = DataFrame(
        rng.integers(0, 5, size=(10, 3), dtype=np.int64), columns=list("ABC")
    )
    result = frame.groupby("A").quantile([0.3, 0.7])

    expected_index = pd.MultiIndex.from_product(
        [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
    )
    expected = DataFrame(
        {
            "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
            "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
        },
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_quantile_array_no_sort():
    """With ``sort=False`` groups and quantiles keep their given order."""
    frame = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
    labels = np.array([1, 0, 1], dtype=np.int64)

    ascending_q = [0.25, 0.5, 0.75]
    expected = DataFrame(
        {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
    )
    result = frame.groupby(labels, sort=False).quantile(ascending_q)
    tm.assert_frame_equal(result, expected)

    # Quantiles requested out of order must come back in the requested order.
    unordered_q = [0.75, 0.25]
    expected = DataFrame(
        {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
    )
    result = frame.groupby(labels, sort=False).quantile(unordered_q)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_quantile_array_multiple_levels():
    """List-like ``q`` with two grouping keys yields a three-level index."""
    frame = DataFrame(
        {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
    )
    result = frame.groupby(["c", "d"]).quantile([0.25, 0.75])

    expected_index = pd.MultiIndex.from_tuples(
        [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
        names=["c", "d", None],
    )
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]},
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
@pytest.mark.parametrize("q", [[0.5, 0.6]])
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
    """List-like ``q`` works when column labels (and group keys) are integers."""
    # GH30289
    nrow, ncol = frame_size
    # Every column of a row holds the same value ``row % 4``.
    values = np.array([[row % 4] * ncol for row in range(nrow)])
    frame = DataFrame(values, columns=range(ncol))

    n_groups = min(nrow, 4)
    n_keys = len(groupby)

    group_codes = [g for g in range(n_groups) for _ in q]
    quantile_codes = list(range(len(q))) * n_groups
    expected_index = pd.MultiIndex(
        levels=[np.arange(n_groups)] * n_keys + [q],
        codes=[group_codes] * n_keys + [quantile_codes],
        names=groupby + [None],
    )
    expected = DataFrame(
        [[float(g)] * (ncol - n_keys) for g in range(n_groups) for _ in q],
        index=expected_index,
        columns=[col for col in range(ncol) if col not in groupby],
    )

    result = frame.groupby(groupby).quantile(q)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_quantile_raises(): | ||||
|     df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) | ||||
|  | ||||
|     msg = "dtype '(object|str)' does not support operation 'quantile'" | ||||
|     with pytest.raises(TypeError, match=msg): | ||||
|         df.groupby("key").quantile() | ||||
|  | ||||
|  | ||||
def test_quantile_out_of_bounds_q_raises():
    """A ``q`` outside the accepted range raises ``ValueError`` echoing the value."""
    # https://github.com/pandas-dev/pandas/issues/27470
    frame = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
    grouped = frame.groupby([0, 0, 0, 1, 1, 1])

    bad_cases = ((50, "Got '50.0' instead"), (-1, "Got '-1.0' instead"))
    for bad_q, expected_msg in bad_cases:
        with pytest.raises(ValueError, match=expected_msg):
            grouped.quantile(bad_q)
|  | ||||
|  | ||||
def test_quantile_missing_group_values_no_segfaults():
    """Repeated grouped quantile with a NaN group key must not crash."""
    # GH 28662
    keys = np.array([1.0, np.nan, 1.0])
    grouped = DataFrame({"key": keys, "val": range(3)}).groupby("key")

    # Random segfaults; would have been guaranteed in loop
    attempts = 100
    for _ in range(attempts):
        grouped.quantile()
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "key, val, expected_key, expected_val",
    [
        ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
        ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
        (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
        ([0], [42], [0], [42.0]),
        ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
    ],
)
def test_quantile_missing_group_values_correct_results(
    key, val, expected_key, expected_val
):
    """Rows with a NaN key are excluded; explicit 0.5 and default ``q`` agree."""
    # GH 28662, GH 33200, GH 33569
    grouped = DataFrame({"key": key, "val": val}).groupby("key")
    expected = DataFrame(
        expected_val, index=Index(expected_key, name="key"), columns=["val"]
    )

    # Explicit median first, then the default ``q``.
    tm.assert_frame_equal(grouped.quantile(0.5), expected)
    tm.assert_frame_equal(grouped.quantile(), expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "values",
    [
        pd.array([1, 0, None] * 2, dtype="Int64"),
        pd.array([True, False, None] * 2, dtype="boolean"),
    ],
)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_nullable_array(values, q):
    """Grouped quantile on masked Int64/boolean data returns Float64."""
    # https://github.com/pandas-dev/pandas/issues/33136
    frame = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
    result = frame.groupby("a")["b"].quantile(q)

    if not isinstance(q, list):
        idx = Index(["x", "y"], name="a")
        per_group = [0.5]
    else:
        idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
        per_group = [0.0, 0.5, 1.0]

    # Both groups hold the same values, so the per-group answers repeat.
    expected = pd.Series(per_group * 2, index=idx, name="b", dtype="Float64")
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
| @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) | ||||
| @pytest.mark.parametrize("numeric_only", [True, False]) | ||||
| def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): | ||||
|     df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) | ||||
|     if numeric_only: | ||||
|         result = df.groupby("a").quantile(q, numeric_only=numeric_only) | ||||
|         expected = df.groupby("a")[["b"]].quantile(q) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|     else: | ||||
|         msg = "dtype '.*' does not support operation 'quantile'" | ||||
|         with pytest.raises(TypeError, match=msg): | ||||
|             df.groupby("a").quantile(q, numeric_only=numeric_only) | ||||
|  | ||||
|  | ||||
| def test_groupby_quantile_NA_float(any_float_dtype): | ||||
|     # GH#42849 | ||||
|     df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) | ||||
|     result = df.groupby("x")["y"].quantile(0.5) | ||||
|     exp_index = Index([1.0], dtype=any_float_dtype, name="x") | ||||
|  | ||||
|     if any_float_dtype in ["Float32", "Float64"]: | ||||
|         expected_dtype = any_float_dtype | ||||
|     else: | ||||
|         expected_dtype = None | ||||
|  | ||||
|     expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = df.groupby("x")["y"].quantile([0.5, 0.75]) | ||||
|     expected = pd.Series( | ||||
|         [0.2] * 2, | ||||
|         index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]), | ||||
|         name="y", | ||||
|         dtype=expected_dtype, | ||||
|     ) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_quantile_NA_int(any_int_ea_dtype): | ||||
|     # GH#42849 | ||||
|     df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) | ||||
|     result = df.groupby("x")["y"].quantile(0.5) | ||||
|     expected = pd.Series( | ||||
|         [3.5], | ||||
|         dtype="Float64", | ||||
|         index=Index([1], name="x", dtype=any_int_ea_dtype), | ||||
|         name="y", | ||||
|     ) | ||||
|     tm.assert_series_equal(expected, result) | ||||
|  | ||||
|     result = df.groupby("x").quantile(0.5) | ||||
|     expected = DataFrame( | ||||
|         {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype) | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)] | ||||
| ) | ||||
| def test_groupby_quantile_all_na_group_masked( | ||||
|     interpolation, val1, val2, any_numeric_ea_dtype | ||||
| ): | ||||
|     # GH#37493 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype | ||||
|     ) | ||||
|     result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation) | ||||
|     expected = DataFrame( | ||||
|         {"b": [val1, val2, pd.NA, pd.NA]}, | ||||
|         dtype=any_numeric_ea_dtype, | ||||
|         index=pd.MultiIndex.from_arrays( | ||||
|             [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]], | ||||
|             names=["a", None], | ||||
|         ), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("interpolation", ["midpoint", "linear"]) | ||||
| def test_groupby_quantile_all_na_group_masked_interp( | ||||
|     interpolation, any_numeric_ea_dtype | ||||
| ): | ||||
|     # GH#37493 | ||||
|     df = DataFrame( | ||||
|         {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype | ||||
|     ) | ||||
|     result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation) | ||||
|  | ||||
|     if any_numeric_ea_dtype == "Float32": | ||||
|         expected_dtype = any_numeric_ea_dtype | ||||
|     else: | ||||
|         expected_dtype = "Float64" | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         {"b": [2.0, 2.5, pd.NA, pd.NA]}, | ||||
|         dtype=expected_dtype, | ||||
|         index=pd.MultiIndex.from_arrays( | ||||
|             [ | ||||
|                 pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), | ||||
|                 [0.5, 0.75, 0.5, 0.75], | ||||
|             ], | ||||
|             names=["a", None], | ||||
|         ), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
def test_groupby_quantile_allNA_column(dtype):
    """Grouped quantile of an all-NA masked float column is NA in that dtype."""
    # GH#42849
    df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
    result = df.groupby("x")["y"].quantile(0.5)
    expected = pd.Series(
        [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype, name="x"), name="y"
    )
    # (result, expected) order so a failure labels left/right correctly.
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_groupby_timedelta_quantile():
    """Grouped quantile interpolates timedelta values."""
    # GH: 29485
    durations = pd.to_timedelta(np.arange(4), unit="s")
    frame = DataFrame({"value": durations, "group": [1, 1, 2, 2]})
    result = frame.groupby("group").quantile(0.99)

    expected_values = [
        pd.Timedelta("0 days 00:00:00.990000"),
        pd.Timedelta("0 days 00:00:02.990000"),
    ]
    expected = DataFrame(
        {"value": expected_values}, index=Index([1, 2], name="group")
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_columns_groupby_quantile(): | ||||
|     # GH 33795 | ||||
|     df = DataFrame( | ||||
|         np.arange(12).reshape(3, -1), | ||||
|         index=list("XYZ"), | ||||
|         columns=pd.Series(list("ABAB"), name="col"), | ||||
|     ) | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby("col", axis=1) | ||||
|     result = gb.quantile(q=[0.8, 0.2]) | ||||
|     expected = DataFrame( | ||||
|         [ | ||||
|             [1.6, 0.4, 2.6, 1.4], | ||||
|             [5.6, 4.4, 6.6, 5.4], | ||||
|             [9.6, 8.4, 10.6, 9.4], | ||||
|         ], | ||||
|         index=list("XYZ"), | ||||
|         columns=pd.MultiIndex.from_tuples( | ||||
|             [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None] | ||||
|         ), | ||||
|     ) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_timestamp_groupby_quantile(unit): | ||||
|     # GH 33168 | ||||
|     dti = pd.date_range( | ||||
|         start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit | ||||
|     ).floor("1h") | ||||
|     df = DataFrame( | ||||
|         { | ||||
|             "timestamp": dti, | ||||
|             "category": list(range(1, 101)), | ||||
|             "value": list(range(101, 201)), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|     result = df.groupby("timestamp").quantile([0.2, 0.8]) | ||||
|  | ||||
|     mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None)) | ||||
|     expected = DataFrame( | ||||
|         [ | ||||
|             {"category": 12.8, "value": 112.8}, | ||||
|             {"category": 48.2, "value": 148.2}, | ||||
|             {"category": 68.8, "value": 168.8}, | ||||
|             {"category": 92.2, "value": 192.2}, | ||||
|         ], | ||||
|         index=mi, | ||||
|     ) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_groupby_quantile_dt64tz_period():
    """Grouped quantile on datetime-like columns matches per-group quantile."""
    # GH#51373
    dti = pd.date_range("2016-01-01", periods=1000)
    frame = pd.Series(dti).to_frame().copy()
    frame[1] = dti.tz_localize("US/Pacific")
    frame[2] = dti.to_period("D")
    frame[3] = dti - dti[0]
    frame.iloc[-1] = pd.NaT

    n_groups = 5
    labels = np.tile(np.arange(n_groups), 200)
    result = frame.groupby(labels).quantile(0.5)

    # Check that we match the group-by-group result
    per_group = {
        label: frame.iloc[label::n_groups].quantile(0.5) for label in range(n_groups)
    }
    expected = DataFrame(per_group).T.infer_objects()
    expected.index = expected.index.astype(int)

    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_groupby_quantile_nonmulti_levels_order(): | ||||
|     # Non-regression test for GH #53009 | ||||
|     ind = pd.MultiIndex.from_tuples( | ||||
|         [ | ||||
|             (0, "a", "B"), | ||||
|             (0, "a", "A"), | ||||
|             (0, "b", "B"), | ||||
|             (0, "b", "A"), | ||||
|             (1, "a", "B"), | ||||
|             (1, "a", "A"), | ||||
|             (1, "b", "B"), | ||||
|             (1, "b", "A"), | ||||
|         ], | ||||
|         names=["sample", "cat0", "cat1"], | ||||
|     ) | ||||
|     ser = pd.Series(range(8), index=ind) | ||||
|     result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8]) | ||||
|  | ||||
|     qind = pd.MultiIndex.from_tuples( | ||||
|         [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None] | ||||
|     ) | ||||
|     expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind) | ||||
|  | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     # We need to check that index levels are not sorted | ||||
|     expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]]) | ||||
|     tm.assert_equal(result.index.levels, expected_levels) | ||||
| @ -0,0 +1,721 @@ | ||||
| from datetime import datetime | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     NaT, | ||||
|     Series, | ||||
|     concat, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
def test_rank_unordered_categorical_typeerror():
    """Ranking an unordered categorical raises ``TypeError`` for Series and frame."""
    # GH#51034 should be TypeError, not NotImplementedError
    cat = pd.Categorical([], ordered=False)
    ser = Series(cat)
    frame = ser.to_frame()

    expected_msg = "Cannot perform rank with non-ordered Categorical"

    # Same check for the SeriesGroupBy and the DataFrameGroupBy.
    for obj in (ser, frame):
        grouped = obj.groupby(cat, observed=False)
        with pytest.raises(TypeError, match=expected_msg):
            grouped.rank()
|  | ||||
|  | ||||
def test_rank_apply():
    """Grouped ``rank`` equals ranking each group separately, with and without pct."""
    lev1 = np.array(["a" * 10] * 100, dtype=object)
    lev2 = np.array(["b" * 10] * 130, dtype=object)
    lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
    lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)

    frame = DataFrame(
        {
            "value": np.random.default_rng(2).standard_normal(500),
            "key1": lev1.take(lab1),
            "key2": lev2.take(lab2),
        }
    )
    keys = ["key1", "key2"]

    # First the default ranking, then the percentile form.
    for rank_kwargs in ({}, {"pct": True}):
        result = frame.groupby(keys).value.rank(**rank_kwargs)

        pieces = [
            piece.value.rank(**rank_kwargs) for _, piece in frame.groupby(keys)
        ]
        expected = concat(pieces, axis=0).reindex(result.index)
        tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, 8, 2, 6], dtype=dtype)
        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
    ]
    + [
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
        ],
        [
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-08", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-06", tz="US/Pacific"),
        ],
        [
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
        ],
        [
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-08").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-06").to_period("D"),
        ],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,pct,exp",
    [
        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
    ],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    """Grouped ``rank`` honours ``method``, ``ascending`` and ``pct`` per group."""
    n_groups = len(grps)
    keys = np.repeat(grps, len(vals))

    # Repeat the same values once per group; for ndarray input rebuild the
    # array with the parametrized dtype.
    tiled = list(vals) * n_groups
    if isinstance(vals, np.ndarray):
        tiled = np.array(tiled, dtype=vals.dtype)

    frame = DataFrame({"key": keys, "val": tiled})
    ranked = frame.groupby("key").rank(
        method=ties_method, ascending=ascending, pct=pct
    )

    expected = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(ranked, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,exp",
    [
        # method="average"
        ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
        ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
        ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
        ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
        ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
        ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
        # method="min"
        ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
        ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
        ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
        ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
        ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
        ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
        # method="max"
        ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
        ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
        ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
        ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
        ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
        ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
        # method="first"
        ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
        ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
        ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
        ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
        ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
        ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
        # method="dense"
        ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
        ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
        ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
        ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
        ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
        ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
    ],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    """Check per-group ranks of data mixing -inf, +inf and NaN (GH 20561).

    ``vals`` is repeated once per entry of ``grps`` and ``exp`` is the
    expected ranking of one such repetition.
    """
    # GH 20561
    n_groups = len(grps)
    group_labels = np.repeat(grps, len(vals))
    frame = DataFrame({"key": group_labels, "val": vals * n_groups})

    ranked = frame.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option
    )

    expected = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(ranked, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
        for dtype in ["f8", "f4", "f2"]
    ]
    + [
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            np.nan,
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            np.nan,
            pd.Timestamp("2018-01-08", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-06", tz="US/Pacific"),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            np.nan,
            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            np.nan,
            pd.Timestamp("2018-01-08").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-06").to_period("D"),
            np.nan,
            np.nan,
        ],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,pct,exp",
    [
        (
            "average",
            True,
            "keep",
            False,
            [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
        ),
        (
            "average",
            True,
            "keep",
            True,
            [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            False,
            [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            True,
            [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
        ),
        ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
        ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
        (
            "min",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
        ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
        ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
        (
            "max",
            False,
            "keep",
            False,
            [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
        (
            "first",
            True,
            "keep",
            False,
            [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
        ),
        (
            "first",
            True,
            "keep",
            True,
            [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            False,
            [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            True,
            [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            False,
            [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            True,
            [
                1.0 / 3.0,
                1.0 / 3.0,
                np.nan,
                3.0 / 3.0,
                1.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        (
            "dense",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            False,
            "keep",
            True,
            [
                3.0 / 3.0,
                3.0 / 3.0,
                np.nan,
                1.0 / 3.0,
                3.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
        (
            "average",
            True,
            "bottom",
            True,
            [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
        ),
        ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
        (
            "average",
            False,
            "bottom",
            True,
            [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
        ),
        ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
        (
            "min",
            True,
            "bottom",
            True,
            [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
        ),
        ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
        (
            "min",
            False,
            "bottom",
            True,
            [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
        ),
        ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
        ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
        ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
        (
            "max",
            False,
            "bottom",
            True,
            [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
        ),
        ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
        (
            "first",
            True,
            "bottom",
            True,
            [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
        ),
        ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
        (
            "first",
            False,
            "bottom",
            True,
            [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
        ),
        ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
        ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
        ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
        ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
    ],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
    """Check per-group ranks of data containing missing values.

    ``vals`` is tiled once per entry of ``grps``; ``exp`` is the expected
    ranking of a single tile for the given ``rank`` keyword combination.
    """
    n_groups = len(grps)
    group_labels = np.repeat(grps, len(vals))

    tiled = list(vals) * n_groups
    if isinstance(vals, np.ndarray):
        # rebuild an ndarray so the parametrized float dtype is retained
        tiled = np.array(tiled, dtype=vals.dtype)

    frame = DataFrame({"key": group_labels, "val": tiled})
    ranked = frame.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    expected = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(ranked, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
    """Two groups of five identical values must each yield the same ranks."""
    group_labels = ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"]
    frame = DataFrame({"key": group_labels, "val": [1] * 10})

    ranked = frame.groupby("key").rank(pct=pct)

    # ``exp`` describes one group; both groups are expected to match it
    expected = DataFrame(exp * 2, columns=["val"])
    tm.assert_frame_equal(ranked, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
)
@pytest.mark.parametrize("upper", [True, False])
def test_rank_avg_even_vals(dtype, upper):
    """Four tied values in one group all receive the average rank 2.5."""
    if upper:
        # use IntegerDtype/FloatingDtype
        dtype = (dtype[0].upper() + dtype[1:]).replace("Ui", "UI")

    frame = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
    frame["val"] = frame["val"].astype(dtype)
    assert frame["val"].dtype == dtype

    expected = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
    if upper:
        expected = expected.astype("Float64")

    ranked = frame.groupby("key").rank()
    tm.assert_frame_equal(ranked, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
    """Ranking strings must agree with ranking equally-ordered numbers."""
    string_frame = DataFrame({"key": ["foo"] * 5, "val": vals})
    has_missing = string_frame["val"].isna().any()

    res = string_frame.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    # construct our expected by using numeric values with the same ordering
    numeric_vals = [0, np.nan, 2, np.nan, 1] if has_missing else [0, 0, 2, 0, 1]
    numeric_frame = DataFrame({"key": ["foo"] * 5, "val": numeric_vals})
    alt = numeric_frame.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    tm.assert_frame_equal(res, alt)
|  | ||||
|  | ||||
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    """An unsupported ``na_option`` must make groupby ``rank`` raise ValueError."""
    frame = DataFrame({"key": ["foo"] * 5, "val": vals})
    rank_kwargs = {
        "method": ties_method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }

    with pytest.raises(
        ValueError, match="na_option must be one of 'keep', 'top', or 'bottom'"
    ):
        frame.groupby("key").rank(**rank_kwargs)
|  | ||||
|  | ||||
def test_rank_empty_group():
    """Percentage ranks when one group holds only a NaN value (gh-22519)."""
    # see gh-22519
    group_col = "A"
    frame = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
    expected_ranks = [0.5, np.nan, 1.0]

    # Series path
    series_result = frame.groupby(group_col).B.rank(pct=True)
    tm.assert_series_equal(series_result, Series(expected_ranks, name="B"))

    # DataFrame path
    frame_result = frame.groupby(group_col).rank(pct=True)
    tm.assert_frame_equal(frame_result, DataFrame({"B": expected_ranks}))
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    """Dense percentage ranks for small groups, some containing NaN (GH 23666)."""
    # GH 23666
    frame = DataFrame({"A": input_key, "B": input_value})
    expected = DataFrame({"B": output_value})

    ranked = frame.groupby("A").rank(method="dense", pct=True)
    tm.assert_frame_equal(ranked, expected)
|  | ||||
|  | ||||
def test_rank_min_int():
    """Rank int64 extremes alongside a datetime column holding NaT (GH-32859)."""
    # GH-32859
    int64_bounds = np.iinfo(np.int64)
    lowest, highest = int64_bounds.min, int64_bounds.max
    frame = DataFrame(
        {
            "grp": [1, 1, 2],
            "int_col": [lowest, highest, lowest],
            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
        }
    )
    expected = DataFrame(
        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
    )

    ranked = frame.groupby("grp").rank()
    tm.assert_frame_equal(ranked, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
    """Equal values in adjacent rows of different groups rank independently."""
    # GH#40518
    fill_value = np.nan if use_nan else 3
    rows = [
        [-1, 1],
        [-1, 2],
        [1, fill_value],
        [-1, fill_value],
    ]
    frame = DataFrame(rows, columns=["group", "val"])

    ranked = frame.groupby(["group"])["val"].rank(method="dense", pct=True)

    expected_ranks = [0.5, 1, np.nan, np.nan] if use_nan else [1 / 3, 2 / 3, 1, 1]
    tm.assert_series_equal(ranked, Series(expected_ranks, name="val"))
|  | ||||
|  | ||||
| def test_rank_multiindex(): | ||||
|     # GH27721 | ||||
|     df = concat( | ||||
|         { | ||||
|             "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}), | ||||
|             "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}), | ||||
|         }, | ||||
|         axis=1, | ||||
|     ) | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby(level=0, axis=1) | ||||
|     msg = "DataFrameGroupBy.rank with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         result = gb.rank(axis=1) | ||||
|  | ||||
|     expected = concat( | ||||
|         [ | ||||
|             df["a"].rank(axis=1), | ||||
|             df["b"].rank(axis=1), | ||||
|         ], | ||||
|         axis=1, | ||||
|         keys=["a", "b"], | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_axis0_rank_axis1(): | ||||
|     # GH#41320 | ||||
|     df = DataFrame( | ||||
|         {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, | ||||
|         index=["a", "a", "b", "b"], | ||||
|     ) | ||||
|     msg = "The 'axis' keyword in DataFrame.groupby is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby(level=0, axis=0) | ||||
|  | ||||
|     msg = "DataFrameGroupBy.rank with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         res = gb.rank(axis=1) | ||||
|  | ||||
|     # This should match what we get when "manually" operating group-by-group | ||||
|     expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0) | ||||
|     tm.assert_frame_equal(res, expected) | ||||
|  | ||||
|     # check that we haven't accidentally written a case that coincidentally | ||||
|     # matches rank(axis=0) | ||||
|     msg = "The 'axis' keyword in DataFrameGroupBy.rank" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         alt = gb.rank(axis=0) | ||||
|     assert not alt.equals(expected) | ||||
|  | ||||
|  | ||||
| def test_groupby_axis0_cummax_axis1(): | ||||
|     # case where groupby axis is 0 and axis keyword in transform is 1 | ||||
|  | ||||
|     # df has mixed dtype -> multiple blocks | ||||
|     df = DataFrame( | ||||
|         {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, | ||||
|         index=["a", "a", "b", "b"], | ||||
|     ) | ||||
|     msg = "The 'axis' keyword in DataFrame.groupby is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         gb = df.groupby(level=0, axis=0) | ||||
|  | ||||
|     msg = "DataFrameGroupBy.cummax with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         cmax = gb.cummax(axis=1) | ||||
|     expected = df[[0, 1]].astype(np.float64) | ||||
|     expected[2] = expected[1] | ||||
|     tm.assert_frame_equal(cmax, expected) | ||||
|  | ||||
|  | ||||
def test_non_unique_index():
    """Percentage rank when grouping by a repeated tz-aware index and a column."""
    # GH 16577
    stamp = pd.Timestamp("20170101", tz="US/Eastern")
    df = DataFrame(
        {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
        index=[stamp] * 4,
    )

    result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)

    expected = Series([1.0, 1.0, 1.0, np.nan], index=[stamp] * 4, name="value")
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_rank_categorical():
    """Ranking ordered categoricals must match ranking the object-cast frame."""
    letters = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
    numbers = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)
    df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": letters, "col3": numbers})

    res = df.groupby("col1").rank()

    as_object = df.astype(object)
    expected = as_object.groupby("col1").rank()
    tm.assert_frame_equal(res, expected)
|  | ||||
|  | ||||
| @pytest.mark.parametrize("na_option", ["top", "bottom"]) | ||||
| def test_groupby_op_with_nullables(na_option): | ||||
|     # GH 54206 | ||||
|     df = DataFrame({"x": [None]}, dtype="Float64") | ||||
|     result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option) | ||||
|     expected = Series([1.0], dtype="Float64", name=result.name) | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,154 @@ | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
def test_groupby_sample_balanced_groups_shape(n, frac):
    """Sampling two equally sized groups returns two rows from each."""
    population = [1] * 10 + [2] * 10
    df = DataFrame({"a": population, "b": population})
    sampled = [1] * 2 + [2] * 2

    # DataFrameGroupBy: the sampled row labels are random, so reuse them
    frame_result = df.groupby("a").sample(n=n, frac=frac)
    frame_expected = DataFrame({"a": sampled, "b": sampled}, index=frame_result.index)
    tm.assert_frame_equal(frame_result, frame_expected)

    # SeriesGroupBy
    series_result = df.groupby("a")["b"].sample(n=n, frac=frac)
    series_expected = Series(sampled, name="b", index=series_result.index)
    tm.assert_series_equal(series_result, series_expected)
|  | ||||
|  | ||||
def test_groupby_sample_unbalanced_groups_shape():
    """Sampling ``n=5`` from groups of size 10 and 20 returns five rows each."""
    population = [1] * 10 + [2] * 20
    df = DataFrame({"a": population, "b": population})
    sampled = [1] * 5 + [2] * 5

    frame_result = df.groupby("a").sample(n=5)
    frame_expected = DataFrame({"a": sampled, "b": sampled}, index=frame_result.index)
    tm.assert_frame_equal(frame_result, frame_expected)

    series_result = df.groupby("a")["b"].sample(n=5)
    series_expected = Series(sampled, name="b", index=series_result.index)
    tm.assert_series_equal(series_result, series_expected)
|  | ||||
|  | ||||
def test_groupby_sample_index_value_spans_groups():
    """Sampling works when one index label occurs in more than one group."""
    population = [1] * 3 + [2] * 3
    # label 2 appears in both group 1 and group 2
    df = DataFrame({"a": population, "b": population}, index=[1, 2, 2, 2, 2, 2])
    sampled = [1] * 2 + [2] * 2

    frame_result = df.groupby("a").sample(n=2)
    frame_expected = DataFrame({"a": sampled, "b": sampled}, index=frame_result.index)
    tm.assert_frame_equal(frame_result, frame_expected)

    series_result = df.groupby("a")["b"].sample(n=2)
    series_expected = Series(sampled, name="b", index=series_result.index)
    tm.assert_series_equal(series_result, series_expected)
|  | ||||
|  | ||||
def test_groupby_sample_n_and_frac_raises():
    """Passing both ``n`` and ``frac`` to ``sample`` raises ValueError."""
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Please enter a value for `frac` OR `n`, not both"

    # exercise the DataFrameGroupBy and the SeriesGroupBy entry points in turn
    frame_gb = df.groupby("a")
    with pytest.raises(ValueError, match=msg):
        frame_gb.sample(n=1, frac=1.0)

    series_gb = df.groupby("a")["b"]
    with pytest.raises(ValueError, match=msg):
        series_gb.sample(n=1, frac=1.0)
|  | ||||
|  | ||||
def test_groupby_sample_frac_gt_one_without_replacement_raises():
    """``frac`` above 1 with ``replace=False`` raises ValueError."""
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."

    frame_gb = df.groupby("a")
    with pytest.raises(ValueError, match=msg):
        frame_gb.sample(frac=1.5, replace=False)

    series_gb = df.groupby("a")["b"]
    with pytest.raises(ValueError, match=msg):
        series_gb.sample(frac=1.5, replace=False)
|  | ||||
|  | ||||
@pytest.mark.parametrize("n", [-1, 1.5])
def test_groupby_sample_invalid_n_raises(n):
    """A negative or non-integer ``n`` makes ``sample`` raise ValueError."""
    df = DataFrame({"a": [1, 2], "b": [1, 2]})

    # the expected message depends on which validation the value trips
    msg = (
        "A negative number of rows requested. Please provide `n` >= 0."
        if n < 0
        else "Only integers accepted as `n` values"
    )

    frame_gb = df.groupby("a")
    with pytest.raises(ValueError, match=msg):
        frame_gb.sample(n=n)

    series_gb = df.groupby("a")["b"]
    with pytest.raises(ValueError, match=msg):
        series_gb.sample(n=n)
|  | ||||
|  | ||||
def test_groupby_sample_oversample():
    """``frac=2.0`` with replacement doubles the size of every group."""
    population = [1] * 10 + [2] * 10
    df = DataFrame({"a": population, "b": population})
    sampled = [1] * 20 + [2] * 20

    frame_result = df.groupby("a").sample(frac=2.0, replace=True)
    frame_expected = DataFrame({"a": sampled, "b": sampled}, index=frame_result.index)
    tm.assert_frame_equal(frame_result, frame_expected)

    series_result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
    series_expected = Series(sampled, name="b", index=series_result.index)
    tm.assert_series_equal(series_result, series_expected)
|  | ||||
|  | ||||
def test_groupby_sample_without_n_or_frac():
    """With neither ``n`` nor ``frac`` given, one row per group is sampled."""
    population = [1] * 10 + [2] * 10
    df = DataFrame({"a": population, "b": population})
    one_per_group = [1, 2]

    frame_result = df.groupby("a").sample(n=None, frac=None)
    frame_expected = DataFrame(
        {"a": one_per_group, "b": one_per_group}, index=frame_result.index
    )
    tm.assert_frame_equal(frame_result, frame_expected)

    series_result = df.groupby("a")["b"].sample(n=None, frac=None)
    series_expected = Series(one_per_group, name="b", index=series_result.index)
    tm.assert_series_equal(series_result, series_expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "index, expected_index",
    [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
)
def test_groupby_sample_with_weights(index, expected_index):
    """Zero-weight rows are never drawn, so the sampled index is fixed."""
    # GH 39927 - tests for integer index needed
    values = [1] * 2 + [2] * 2
    weights = [1, 0, 1, 0]
    df = DataFrame({"a": values, "b": values}, index=Index(index))

    frame_result = df.groupby("a").sample(n=2, replace=True, weights=weights)
    frame_expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
    tm.assert_frame_equal(frame_result, frame_expected)

    series_result = df.groupby("a")["b"].sample(n=2, replace=True, weights=weights)
    series_expected = Series(values, name="b", index=Index(expected_index))
    tm.assert_series_equal(series_result, series_expected)
|  | ||||
|  | ||||
def test_groupby_sample_with_selections():
    """Sampling a column selection returns only the selected columns (GH 39928)."""
    # GH 39928
    population = [1] * 10 + [2] * 10
    df = DataFrame({"a": population, "b": population, "c": population})

    selected = df.groupby("a")[["b", "c"]]
    result = selected.sample(n=None, frac=None)

    expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_groupby_sample_with_empty_inputs():
    """Sampling a groupby over an empty frame returns the empty frame (GH48459)."""
    # GH48459
    empty = DataFrame({"a": [], "b": []})

    result = empty.groupby("a").sample()

    tm.assert_frame_equal(result, empty)
| @ -0,0 +1,122 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.core.dtypes.common import is_integer_dtype | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     PeriodIndex, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) | ||||
| def test_size(df, by): | ||||
|     grouped = df.groupby(by=by) | ||||
|     result = grouped.size() | ||||
|     for key, group in grouped: | ||||
|         assert result[key] == len(group) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "by", | ||||
|     [ | ||||
|         [0, 0, 0, 0], | ||||
|         [0, 1, 1, 1], | ||||
|         [1, 0, 1, 1], | ||||
|         [0, None, None, None], | ||||
|         pytest.param([None, None, None, None], marks=pytest.mark.xfail), | ||||
|     ], | ||||
| ) | ||||
| def test_size_axis_1(df, axis_1, by, sort, dropna): | ||||
|     # GH#45715 | ||||
|     counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)} | ||||
|     if dropna: | ||||
|         counts = {key: value for key, value in counts.items() if key is not None} | ||||
|     expected = Series(counts, dtype="int64") | ||||
|     if sort: | ||||
|         expected = expected.sort_index() | ||||
|     if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): | ||||
|         expected.index = expected.index.astype(int) | ||||
|  | ||||
|     msg = "DataFrame.groupby with axis=1 is deprecated" | ||||
|     with tm.assert_produces_warning(FutureWarning, match=msg): | ||||
|         grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna) | ||||
|     result = grouped.size() | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) | ||||
| @pytest.mark.parametrize("sort", [True, False]) | ||||
| def test_size_sort(sort, by): | ||||
|     df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC")) | ||||
|     left = df.groupby(by=by, sort=sort).size() | ||||
|     right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0]) | ||||
|     tm.assert_series_equal(left, right, check_names=False) | ||||
|  | ||||
|  | ||||
| def test_size_series_dataframe(): | ||||
|     # https://github.com/pandas-dev/pandas/issues/11699 | ||||
|     df = DataFrame(columns=["A", "B"]) | ||||
|     out = Series(dtype="int64", index=Index([], name="A")) | ||||
|     tm.assert_series_equal(df.groupby("A").size(), out) | ||||
|  | ||||
|  | ||||
| def test_size_groupby_all_null(): | ||||
|     # https://github.com/pandas-dev/pandas/issues/23050 | ||||
|     # Assert no 'Value Error : Length of passed values is 2, index implies 0' | ||||
|     df = DataFrame({"A": [None, None]})  # all-null groups | ||||
|     result = df.groupby("A").size() | ||||
|     expected = Series(dtype="int64", index=Index([], name="A")) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_size_period_index(): | ||||
|     # https://github.com/pandas-dev/pandas/issues/34010 | ||||
|     ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D")) | ||||
|     grp = ser.groupby(level="A") | ||||
|     result = grp.size() | ||||
|     tm.assert_series_equal(result, ser) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("as_index", [True, False]) | ||||
| def test_size_on_categorical(as_index): | ||||
|     df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) | ||||
|     df["A"] = df["A"].astype("category") | ||||
|     result = df.groupby(["A", "B"], as_index=as_index, observed=False).size() | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"] | ||||
|     ) | ||||
|     expected["A"] = expected["A"].astype("category") | ||||
|     if as_index: | ||||
|         expected = expected.set_index(["A", "B"])["size"].rename(None) | ||||
|  | ||||
|     tm.assert_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) | ||||
| def test_size_series_masked_type_returns_Int64(dtype): | ||||
|     # GH 54132 | ||||
|     ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype) | ||||
|     result = ser.groupby(level=0).size() | ||||
|     expected = Series([2, 1], dtype="Int64", index=["a", "b"]) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_size_strings(any_string_dtype, using_infer_string): | ||||
|     # GH#55627 | ||||
|     dtype = any_string_dtype | ||||
|     df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) | ||||
|     result = df.groupby("a")["b"].size() | ||||
|     exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" | ||||
|     exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype | ||||
|     expected = Series( | ||||
|         [2, 1], | ||||
|         index=Index(["a", "b"], name="a", dtype=exp_index_dtype), | ||||
|         name="b", | ||||
|         dtype=exp_dtype, | ||||
|     ) | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,27 @@ | ||||
| import numpy as np | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
|  | ||||
|  | ||||
| def test_groupby_skew_equivalence(): | ||||
|     # Test that that groupby skew method (which uses libgroupby.group_skew) | ||||
|     #  matches the results of operating group-by-group (which uses nanops.nanskew) | ||||
|     nrows = 1000 | ||||
|     ngroups = 3 | ||||
|     ncols = 2 | ||||
|     nan_frac = 0.05 | ||||
|  | ||||
|     arr = np.random.default_rng(2).standard_normal((nrows, ncols)) | ||||
|     arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan | ||||
|  | ||||
|     df = pd.DataFrame(arr) | ||||
|     grps = np.random.default_rng(2).integers(0, ngroups, size=nrows) | ||||
|     gb = df.groupby(grps) | ||||
|  | ||||
|     result = gb.skew() | ||||
|  | ||||
|     grpwise = [grp.skew().to_frame(i).T for i, grp in gb] | ||||
|     expected = pd.concat(grpwise, axis=0) | ||||
|     expected.index = expected.index.astype(result.index.dtype)  # 32bit builds | ||||
|     tm.assert_frame_equal(result, expected) | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user