done

2025-09-07 22:09:54 +02:00
parent e1b817252c
commit 2fc0d000b6
7796 changed files with 2159515 additions and 933 deletions
--- a/lib/python3.11/site-packages/pandas/core/groupby/init.py
+++ b/lib/python3.11/site-packages/pandas/core/groupby/init.py
@ -0,0 +1,15 @@
+from pandas.core.groupby.generic import (
+    DataFrameGroupBy,
+    NamedAgg,
+    SeriesGroupBy,
+)
+from pandas.core.groupby.groupby import GroupBy
+from pandas.core.groupby.grouper import Grouper
+
+# Names exported by ``from ... import *``; each one is imported above.
+__all__ = [
+    "DataFrameGroupBy",
+    "NamedAgg",
+    "SeriesGroupBy",
+    "GroupBy",
+    "Grouper",
+]
--- a/lib/python3.11/site-packages/pandas/core/groupby/base.py
+++ b/lib/python3.11/site-packages/pandas/core/groupby/base.py
@ -0,0 +1,121 @@
+"""
+Provide basic components for groupby.
+"""
+from __future__ import annotations
+
+import dataclasses
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Hashable
+
+
+@dataclasses.dataclass(frozen=True, order=True)
+class OutputKey:
+    """
+    Immutable, orderable pairing of an output label with a position.
+
+    Instances compare field by field in declaration order: ``label``
+    first, then ``position``.
+    """
+
+    # Hashable label attached to the output.
+    label: Hashable
+    # Integer position associated with the label.
+    position: int
+
+
+# "plot" and "hist" are special-cased so that catching exceptions while
+# forwarding methods from NDFrames does not produce duplicate plots.
+plotting_methods = frozenset({"plot", "hist"})
+
+# Transformations that are cythonized (or canned "agg+broadcast") and whose
+# result needs no postprocessing by transform.
+cythonized_kernels = frozenset({"cummax", "cummin", "cumprod", "cumsum", "shift"})
+
+# Aggregation/reduction kernels.
+# Each of these maps a group to a single numeric value.
+reduction_kernels = frozenset(
+    {
+        "all",
+        "any",
+        "corrwith",
+        "count",
+        "first",
+        "idxmax",
+        "idxmin",
+        "last",
+        "max",
+        "mean",
+        "median",
+        "min",
+        "nunique",
+        "prod",
+        # `quantile` counts as a reduction only while its signature
+        # accepts a single quantile value; GH#27526 might change that.
+        "quantile",
+        "sem",
+        "size",
+        "skew",
+        "std",
+        "sum",
+        "var",
+    }
+)
+
+# Transformation kernels.
+# A transformation is a function that, for each group, produces a result
+# with the same shape as that group.
+
+
+transformation_kernels = frozenset(
+    {
+        "bfill",
+        "cumcount",
+        "cummax",
+        "cummin",
+        "cumprod",
+        "cumsum",
+        "diff",
+        "ffill",
+        "fillna",
+        "ngroup",
+        "pct_change",
+        "rank",
+        "shift",
+    }
+)
+
+# Remaining public groupby methods: everything that belongs in neither
+# the reduction list nor the transformation list above.
+groupby_other_methods = frozenset(
+    {
+        "agg",
+        "aggregate",
+        "apply",
+        "boxplot",
+        # corr and cov produce ngroups*ncolumns rows, which makes them
+        # neither a transformation nor a reduction
+        "corr",
+        "cov",
+        "describe",
+        "dtypes",
+        "expanding",
+        "ewm",
+        "filter",
+        "get_group",
+        "groups",
+        "head",
+        "hist",
+        "indices",
+        "ndim",
+        "ngroups",
+        "nth",
+        "ohlc",
+        "pipe",
+        "plot",
+        "resample",
+        "rolling",
+        "tail",
+        "take",
+        "transform",
+        "sample",
+        "value_counts",
+    }
+)
+# Valid values of `name` for `groupby.transform(name)`.
+# NOTE: do NOT edit this directly. New additions should be inserted
+# into the appropriate list above.
+transform_kernel_allowlist = reduction_kernels.union(transformation_kernels)
--- a/lib/python3.11/site-packages/pandas/core/groupby/categorical.py
+++ b/lib/python3.11/site-packages/pandas/core/groupby/categorical.py
@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import numpy as np
+
+from pandas.core.algorithms import unique1d
+from pandas.core.arrays.categorical import (
+    Categorical,
+    CategoricalDtype,
+    recode_for_categories,
+)
+
+
+def recode_for_groupby(
+    c: Categorical, sort: bool, observed: bool
+) -> tuple[Categorical, Categorical | None]:
+    """
+    Code the categories to ensure we can groupby for categoricals.
+
+    If observed=True, we return a new Categorical with the observed
+    categories only.
+
+    If sort=False, return a copy of ``c``, coded with categories as
+    returned by .unique(), followed by any categories not appearing in
+    the data. If sort=True, return ``c`` itself.
+
+    This method is needed solely to ensure the categorical index of the
+    GroupBy result has categories in the order of appearance in the data
+    (GH-8868).
+
+    Parameters
+    ----------
+    c : Categorical
+    sort : bool
+        The value of the sort parameter groupby was called with.
+    observed : bool
+        Account only for the observed values
+
+    Returns
+    -------
+    Categorical
+        If observed=True, a new Categorical restricted to the categories
+        present in the data (sorted by code when sort=True, otherwise in
+        order of appearance). If observed=False and sort=True, ``c``
+        unchanged. If observed=False and sort=False, the categories are
+        set to the order of appearance in codes, followed by any
+        unrepresented categories in the original order.
+    Categorical or None
+        If we are observed, return the original categorical, otherwise None
+    """
+    # we only care about observed values
+    if observed:
+        # In cases with c.ordered, this is equivalent to
+        #  return c.remove_unused_categories(), c
+
+        unique_codes = unique1d(c.codes)
+
+        # -1 is the code used for missing values; it maps to no category
+        take_codes = unique_codes[unique_codes != -1]
+        if sort:
+            take_codes = np.sort(take_codes)
+
+        # we recode according to the uniques
+        categories = c.categories.take(take_codes)
+        codes = recode_for_categories(c.codes, c.categories, categories)
+
+        # return a new categorical that maps our new codes
+        # and categories
+        dtype = CategoricalDtype(categories, ordered=c.ordered)
+        return Categorical._simple_new(codes, dtype=dtype), c
+
+    # Already sorted according to c.categories; all is fine
+    if sort:
+        return c, None
+
+    # From here on sort is False:
+    # sort=False should order groups in as-encountered order (GH-8868)
+
+    # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
+    all_codes = np.arange(c.categories.nunique())
+    # GH 38140: exclude nan from indexer for categories
+    unique_notnan_codes = unique1d(c.codes[c.codes != -1])
+    if len(all_codes) > len(unique_notnan_codes):
+        # GH 13179: All categories need to be present, even if missing from the data
+        missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
+        take_codes = np.concatenate((unique_notnan_codes, missing_codes))
+    else:
+        take_codes = unique_notnan_codes
+
+    return Categorical(c, c.unique().categories.take(take_codes)), None
--- a/lib/python3.11/site-packages/pandas/core/groupby/generic.py
+++ b/lib/python3.11/site-packages/pandas/core/groupby/generic.py
--- a/lib/python3.11/site-packages/pandas/core/groupby/groupby.py
+++ b/lib/python3.11/site-packages/pandas/core/groupby/groupby.py
--- a/lib/python3.11/site-packages/pandas/core/groupby/grouper.py
+++ b/lib/python3.11/site-packages/pandas/core/groupby/grouper.py
--- a/lib/python3.11/site-packages/pandas/core/groupby/indexing.py
+++ b/lib/python3.11/site-packages/pandas/core/groupby/indexing.py
@ -0,0 +1,304 @@
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    cast,
+)
+
+import numpy as np
+
+from pandas.util._decorators import (
+    cache_readonly,
+    doc,
+)
+
+from pandas.core.dtypes.common import (
+    is_integer,
+    is_list_like,
+)
+
+if TYPE_CHECKING:
+    from pandas._typing import PositionalIndexer
+
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+    from pandas.core.groupby import groupby
+
+
+class GroupByIndexingMixin:
+    """
+    Mixin for adding ._positional_selector to GroupBy.
+    """
+
+    @cache_readonly
+    def _positional_selector(self) -> GroupByPositionalSelector:
+        """
+        Return positional selection for each group.
+
+        ``groupby._positional_selector[i:j]`` is similar to
+        ``groupby.apply(lambda x: x.iloc[i:j])``
+        but much faster and preserves the original index and order.
+
+        ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head`
+        and :meth:`~GroupBy.tail`. For example:
+
+        - ``head(5)``
+        - ``_positional_selector[5:-5]``
+        - ``tail(5)``
+
+        together return all the rows.
+
+        Allowed inputs for the index are:
+
+        - An integer valued iterable, e.g. ``range(2, 4)``.
+        - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``.
+
+        The output format is the same as :meth:`~GroupBy.head` and
+        :meth:`~GroupBy.tail`, namely
+        a subset of the ``DataFrame`` or ``Series`` with the index and order preserved.
+
+        Returns
+        -------
+        Series
+            The filtered subset of the original Series.
+        DataFrame
+            The filtered subset of the original DataFrame.
+
+        See Also
+        --------
+        DataFrame.iloc : Purely integer-location based indexing for selection by
+            position.
+        GroupBy.head : Return first n rows of each group.
+        GroupBy.tail : Return last n rows of each group.
+        GroupBy.nth : Take the nth row from each group if n is an int, or a
+            subset of rows, if n is a list of ints.
+
+        Notes
+        -----
+        - The slice step cannot be negative.
+        - If the index specification results in overlaps, the item is not duplicated.
+        - If the index specification changes the order of items, then
+          they are returned in their original order.
+          By contrast, ``DataFrame.iloc`` can change the row order.
+        - ``groupby()`` parameters such as as_index and dropna are ignored.
+
+        The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth`
+        with ``as_index=False`` are:
+
+        - Input to ``_positional_selector`` can include
+          one or more slices whereas ``nth``
+          just handles an integer or a list of integers.
+        - ``_positional_selector`` can accept a slice relative to the
+          last row of each group.
+        - ``_positional_selector`` does not have an equivalent to the
+          ``nth()`` ``dropna`` parameter.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
+        ...                   columns=["A", "B"])
+        >>> df.groupby("A")._positional_selector[1:2]
+           A  B
+        1  a  2
+        4  b  5
+
+        >>> df.groupby("A")._positional_selector[1, -1]
+           A  B
+        1  a  2
+        2  a  3
+        4  b  5
+        """
+        if TYPE_CHECKING:
+            # pylint: disable-next=used-before-assignment
+            groupby_self = cast(groupby.GroupBy, self)
+        else:
+            groupby_self = self
+
+        return GroupByPositionalSelector(groupby_self)
+
+    def _make_mask_from_positional_indexer(
+        self,
+        arg: PositionalIndexer | tuple,
+    ) -> np.ndarray:
+        """
+        Build a boolean row mask from a positional indexer.
+
+        Parameters
+        ----------
+        arg : PositionalIndexer | tuple
+            An integer, a slice, a list-like of integers, or a list-like
+            mixing integers and slices.
+
+        Returns
+        -------
+        np.ndarray
+            Boolean mask aligned with ``_ascending_count``.
+
+        Raises
+        ------
+        TypeError
+            If ``arg`` is not an integer, list-like or slice.
+        ValueError
+            If a list-like holds something other than integers and slices,
+            or a slice has a negative step.
+        """
+        if is_list_like(arg):
+            # Materialise once: a one-shot iterator (e.g. a generator) would
+            # otherwise be exhausted by the integer check below and silently
+            # produce an empty selection.
+            items = list(cast(Iterable, arg))
+            if all(is_integer(i) for i in items):
+                mask = self._make_mask_from_list(cast(Iterable[int], items))
+            else:
+                mask = self._make_mask_from_tuple(tuple(items))
+
+        elif isinstance(arg, slice):
+            mask = self._make_mask_from_slice(arg)
+        elif is_integer(arg):
+            mask = self._make_mask_from_int(cast(int, arg))
+        else:
+            raise TypeError(
+                f"Invalid index {type(arg)}. "
+                "Must be integer, list-like, slice or a tuple of "
+                "integers and slices"
+            )
+
+        # The helpers start from a plain bool and may return it untouched
+        # (e.g. empty list, or slice(None)); expand it to a full-length array.
+        if isinstance(mask, bool):
+            if mask:
+                mask = self._ascending_count >= 0
+            else:
+                mask = self._ascending_count < 0
+
+        return cast(np.ndarray, mask)
+
+    def _make_mask_from_int(self, arg: int) -> np.ndarray:
+        """
+        Mask the row at position ``arg`` of each group.
+
+        Non-negative positions are matched against ``_ascending_count``;
+        negative positions against ``_descending_count`` (``-1`` maps to 0).
+        """
+        if arg >= 0:
+            return self._ascending_count == arg
+        else:
+            return self._descending_count == (-arg - 1)
+
+    def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray:
+        """
+        Mask the rows whose per-group position is any of ``args``.
+
+        Returns the plain bool ``False`` when ``args`` is empty.
+        """
+        # Single pass, so that one-shot iterators are handled correctly.
+        positive: list[int] = []
+        negative: list[int] = []
+        for arg in args:
+            if arg >= 0:
+                positive.append(arg)
+            else:
+                negative.append(-arg - 1)
+
+        mask: bool | np.ndarray = False
+
+        if positive:
+            mask |= np.isin(self._ascending_count, positive)
+
+        if negative:
+            mask |= np.isin(self._descending_count, negative)
+
+        return mask
+
+    def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray:
+        """
+        Mask the union of the integers and slices in ``args``.
+
+        Raises
+        ------
+        ValueError
+            If an element is neither an integer nor a slice.
+        """
+        mask: bool | np.ndarray = False
+
+        for arg in args:
+            if is_integer(arg):
+                mask |= self._make_mask_from_int(cast(int, arg))
+            elif isinstance(arg, slice):
+                mask |= self._make_mask_from_slice(arg)
+            else:
+                raise ValueError(
+                    f"Invalid argument {type(arg)}. Should be int or slice."
+                )
+
+        return mask
+
+    def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray:
+        """
+        Mask the rows of each group selected by the slice ``arg``.
+
+        Returns the plain bool ``True`` when the slice imposes no
+        restriction.
+
+        Raises
+        ------
+        ValueError
+            If the slice step is negative.
+        """
+        start = arg.start
+        stop = arg.stop
+        step = arg.step
+
+        if step is not None and step < 0:
+            raise ValueError(f"Invalid step {step}. Must be non-negative")
+
+        mask: bool | np.ndarray = True
+
+        if step is None:
+            step = 1
+
+        if start is None:
+            if step > 1:
+                mask &= self._ascending_count % step == 0
+
+        elif start >= 0:
+            mask &= self._ascending_count >= start
+
+            if step > 1:
+                mask &= (self._ascending_count - start) % step == 0
+
+        else:
+            # Negative start: keep only the last ``-start`` rows of each group.
+            mask &= self._descending_count < -start
+
+            # Offset of each row from the slice's first row (up to sign).
+            offset_array = self._descending_count + start + 1
+            # ascending + descending + 1 is the group length, so this is
+            # True where the group has fewer than ``-start`` rows; stepping
+            # is then counted from the group's first row instead.
+            limit_array = (
+                self._ascending_count + self._descending_count + (start + 1)
+            ) < 0
+            offset_array = np.where(limit_array, self._ascending_count, offset_array)
+
+            mask &= offset_array % step == 0
+
+        if stop is not None:
+            if stop >= 0:
+                mask &= self._ascending_count < stop
+            else:
+                mask &= self._descending_count >= -stop
+
+        return mask
+
+    @cache_readonly
+    def _ascending_count(self) -> np.ndarray:
+        """Cached result of ``self._cumcount_array()``."""
+        if TYPE_CHECKING:
+            groupby_self = cast(groupby.GroupBy, self)
+        else:
+            groupby_self = self
+
+        return groupby_self._cumcount_array()
+
+    @cache_readonly
+    def _descending_count(self) -> np.ndarray:
+        """Cached result of ``self._cumcount_array(ascending=False)``."""
+        if TYPE_CHECKING:
+            groupby_self = cast(groupby.GroupBy, self)
+        else:
+            groupby_self = self
+
+        return groupby_self._cumcount_array(ascending=False)
+
+
+@doc(GroupByIndexingMixin._positional_selector)
+class GroupByPositionalSelector:
+    # NOTE(review): no class docstring here on purpose -- presumably the
+    # ``doc`` decorator supplies it from ``_positional_selector``; confirm.
+    def __init__(self, groupby_object: groupby.GroupBy) -> None:
+        self.groupby_object = groupby_object
+
+    def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series:
+        """
+        Return the rows at the given per-group positions.
+
+        Implements GroupBy._positional_selector
+
+        Parameters
+        ----------
+        arg : PositionalIndexer | tuple
+            Allowed values are:
+            - int
+            - int valued iterable such as list or range
+            - slice with step either None or positive
+            - tuple of integers and slices
+
+        Returns
+        -------
+        Series
+            The filtered subset of the original groupby Series.
+        DataFrame
+            The filtered subset of the original groupby DataFrame.
+
+        See Also
+        --------
+        DataFrame.iloc : Integer-location based indexing for selection by position.
+        GroupBy.head : Return first n rows of each group.
+        GroupBy.tail : Return last n rows of each group.
+        GroupBy._positional_selector : Return positional selection for each group.
+        GroupBy.nth : Take the nth row from each group if n is an int, or a
+            subset of rows, if n is a list of ints.
+        """
+        grouped = self.groupby_object
+        # Turn the indexer into a boolean mask, then let the groupby object
+        # apply that mask to its selected object.
+        selection = grouped._make_mask_from_positional_indexer(arg)
+        return grouped._mask_selected_obj(selection)
+
+
+class GroupByNthSelector:
+    """
+    Dynamically substituted for GroupBy.nth to enable both call and index
+
+    Both ``selector(n, dropna)`` and ``selector[n]`` are forwarded to the
+    wrapped groupby object's ``_nth``.
+    """
+
+    def __init__(self, groupby_object: groupby.GroupBy) -> None:
+        self.groupby_object = groupby_object
+
+    def __call__(
+        self,
+        n: PositionalIndexer | tuple,
+        dropna: Literal["any", "all", None] = None,
+    ) -> DataFrame | Series:
+        # Call form: pass ``dropna`` through as given.
+        grouped = self.groupby_object
+        return grouped._nth(n, dropna)
+
+    def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series:
+        # Index form: only ``n`` is passed, so ``_nth`` uses its own default
+        # for ``dropna``.
+        grouped = self.groupby_object
+        return grouped._nth(n)
--- a/lib/python3.11/site-packages/pandas/core/groupby/numba_.py
+++ b/lib/python3.11/site-packages/pandas/core/groupby/numba_.py
@ -0,0 +1,181 @@
+"""Common utilities for Numba operations with groupby ops"""
+from __future__ import annotations
+
+import functools
+import inspect
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+)
+
+import numpy as np
+
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.core.util.numba_ import (
+    NumbaUtilError,
+    jit_user_function,
+)
+
+if TYPE_CHECKING:
+    from pandas._typing import Scalar
+
+
+def validate_udf(func: Callable) -> None:
+    """
+    Validate user defined function for ops when using Numba with groupby ops.
+
+    The first signature arguments should include:
+
+    def f(values, index, ...):
+        ...
+
+    Parameters
+    ----------
+    func : function
+        user defined function
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    NotImplementedError
+        If ``func`` is not callable.
+    NumbaUtilError
+        If the leading parameters of ``func`` are not ``values`` and ``index``.
+    """
+    if not callable(func):
+        raise NotImplementedError(
+            "Numba engine can only be used with a single function."
+        )
+    udf_signature = list(inspect.signature(func).parameters.keys())
+    expected_args = ["values", "index"]
+    min_number_args = len(expected_args)
+    # A signature that is too short also fails this comparison, because the
+    # slice then has fewer elements than ``expected_args``.
+    if udf_signature[:min_number_args] != expected_args:
+        # Not every callable has ``__name__`` (e.g. functools.partial); fall
+        # back to its repr so the intended error is still raised.
+        func_name = getattr(func, "__name__", repr(func))
+        raise NumbaUtilError(
+            f"The first {min_number_args} arguments to {func_name} must be "
+            f"{expected_args}"
+        )
+
+
+# functools.cache memoises on the arguments, so repeated calls with the same
+# (func, nopython, nogil, parallel) reuse the already generated function.
+@functools.cache
+def generate_numba_agg_func(
+    func: Callable[..., Scalar],
+    nopython: bool,
+    nogil: bool,
+    parallel: bool,
+) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
+    """
+    Generate a numba jitted agg function specified by values from engine_kwargs.
+
+    1. jit the user's function
+    2. Return a groupby agg function with the jitted function inline
+
+    Configurations specified in engine_kwargs apply to both the user's
+    function _AND_ the groupby evaluation loop.
+
+    Parameters
+    ----------
+    func : function
+        function to be applied to each group and will be JITed
+    nopython : bool
+        nopython to be passed into numba.jit
+    nogil : bool
+        nogil to be passed into numba.jit
+    parallel : bool
+        parallel to be passed into numba.jit
+
+    Returns
+    -------
+    Numba function
+        Called as ``group_agg(values, index, begin, end, num_columns, *args)``;
+        returns an array with one row per group and ``num_columns`` columns.
+    """
+    numba_func = jit_user_function(func)
+    # numba is resolved through import_optional_dependency at call time; the
+    # plain import exists only for static type checkers.
+    if TYPE_CHECKING:
+        import numba
+    else:
+        numba = import_optional_dependency("numba")
+
+    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
+    def group_agg(
+        values: np.ndarray,
+        index: np.ndarray,
+        begin: np.ndarray,
+        end: np.ndarray,
+        num_columns: int,
+        *args: Any,
+    ) -> np.ndarray:
+        # begin[i]:end[i] delimits the rows of group i
+        assert len(begin) == len(end)
+        num_groups = len(begin)
+
+        result = np.empty((num_groups, num_columns))
+        for i in numba.prange(num_groups):
+            group_index = index[begin[i] : end[i]]
+            for j in numba.prange(num_columns):
+                group = values[begin[i] : end[i], j]
+                # one scalar per (group, column) pair
+                result[i, j] = numba_func(group, group_index, *args)
+        return result
+
+    return group_agg
+
+
+# functools.cache memoises on the arguments, so repeated calls with the same
+# (func, nopython, nogil, parallel) reuse the already generated function.
+@functools.cache
+def generate_numba_transform_func(
+    func: Callable[..., np.ndarray],
+    nopython: bool,
+    nogil: bool,
+    parallel: bool,
+) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
+    """
+    Generate a numba jitted transform function specified by values from engine_kwargs.
+
+    1. jit the user's function
+    2. Return a groupby transform function with the jitted function inline
+
+    Configurations specified in engine_kwargs apply to both the user's
+    function _AND_ the groupby evaluation loop.
+
+    Parameters
+    ----------
+    func : function
+        function to be applied to each group and will be JITed
+    nopython : bool
+        nopython to be passed into numba.jit
+    nogil : bool
+        nogil to be passed into numba.jit
+    parallel : bool
+        parallel to be passed into numba.jit
+
+    Returns
+    -------
+    Numba function
+        Called as ``group_transform(values, index, begin, end, num_columns,
+        *args)``; returns an array with ``len(values)`` rows and
+        ``num_columns`` columns.
+    """
+    numba_func = jit_user_function(func)
+    # numba is resolved through import_optional_dependency at call time; the
+    # plain import exists only for static type checkers.
+    if TYPE_CHECKING:
+        import numba
+    else:
+        numba = import_optional_dependency("numba")
+
+    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
+    def group_transform(
+        values: np.ndarray,
+        index: np.ndarray,
+        begin: np.ndarray,
+        end: np.ndarray,
+        num_columns: int,
+        *args: Any,
+    ) -> np.ndarray:
+        # begin[i]:end[i] delimits the rows of group i
+        assert len(begin) == len(end)
+        num_groups = len(begin)
+
+        result = np.empty((len(values), num_columns))
+        for i in numba.prange(num_groups):
+            group_index = index[begin[i] : end[i]]
+            for j in numba.prange(num_columns):
+                group = values[begin[i] : end[i], j]
+                # the result for a group is written back to that group's rows
+                result[begin[i] : end[i], j] = numba_func(group, group_index, *args)
+        return result
+
+    return group_transform
--- a/lib/python3.11/site-packages/pandas/core/groupby/ops.py
+++ b/lib/python3.11/site-packages/pandas/core/groupby/ops.py