lib/python3.11/site-packages/pandas/core/reshape/api.py (new file, 41 lines)
@@ -0,0 +1,41 @@
from pandas.core.reshape.concat import concat
from pandas.core.reshape.encoding import (
    from_dummies,
    get_dummies,
)
from pandas.core.reshape.melt import (
    lreshape,
    melt,
    wide_to_long,
)
from pandas.core.reshape.merge import (
    merge,
    merge_asof,
    merge_ordered,
)
from pandas.core.reshape.pivot import (
    crosstab,
    pivot,
    pivot_table,
)
from pandas.core.reshape.tile import (
    cut,
    qcut,
)

__all__ = [
    "concat",
    "crosstab",
    "cut",
    "from_dummies",
    "get_dummies",
    "lreshape",
    "melt",
    "merge",
    "merge_asof",
    "merge_ordered",
    "pivot",
    "pivot_table",
    "qcut",
    "wide_to_long",
]
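The module above only re-exports reshape helpers; user code normally reaches the same callables through the top-level pandas namespace rather than pandas.core.reshape.api. A minimal sketch, assuming only that pandas is importable as pd:

import pandas as pd

df = pd.DataFrame({"key": ["a", "b"], "val": [1, 2]})
# concat, melt, get_dummies, etc. are the same callables listed in __all__ above
wide = pd.concat([df, df], ignore_index=True)
tidy = pd.melt(wide, id_vars="key", value_vars="val")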
lib/python3.11/site-packages/pandas/core/reshape/concat.py (new file, 894 lines)
@@ -0,0 +1,894 @@
|
||||
"""
|
||||
Concat routines.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import abc
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Callable,
|
||||
Literal,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._config import using_copy_on_write
|
||||
|
||||
from pandas.util._decorators import cache_readonly
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool,
|
||||
is_iterator,
|
||||
)
|
||||
from pandas.core.dtypes.concat import concat_compat
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCDataFrame,
|
||||
ABCSeries,
|
||||
)
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
from pandas.core.arrays.categorical import (
|
||||
factorize_from_iterable,
|
||||
factorize_from_iterables,
|
||||
)
|
||||
import pandas.core.common as com
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
all_indexes_same,
|
||||
default_index,
|
||||
ensure_index,
|
||||
get_objs_combined_axis,
|
||||
get_unanimous_names,
|
||||
)
|
||||
from pandas.core.internals import concatenate_managers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
Mapping,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
Axis,
|
||||
AxisInt,
|
||||
HashableT,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Concatenate DataFrame objects
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[DataFrame] | Mapping[HashableT, DataFrame],
|
||||
*,
|
||||
axis: Literal[0, "index"] = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> DataFrame:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[Series] | Mapping[HashableT, Series],
|
||||
*,
|
||||
axis: Literal[0, "index"] = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> Series:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
*,
|
||||
axis: Literal[0, "index"] = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> DataFrame | Series:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
*,
|
||||
axis: Literal[1, "columns"],
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> DataFrame:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
*,
|
||||
axis: Axis = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> DataFrame | Series:
|
||||
...
|
||||
|
||||
|
||||
def concat(
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
*,
|
||||
axis: Axis = 0,
|
||||
join: str = "outer",
|
||||
ignore_index: bool = False,
|
||||
keys: Iterable[Hashable] | None = None,
|
||||
levels=None,
|
||||
names: list[HashableT] | None = None,
|
||||
verify_integrity: bool = False,
|
||||
sort: bool = False,
|
||||
copy: bool | None = None,
|
||||
) -> DataFrame | Series:
|
||||
"""
|
||||
Concatenate pandas objects along a particular axis.
|
||||
|
||||
Allows optional set logic along the other axes.
|
||||
|
||||
Can also add a layer of hierarchical indexing on the concatenation axis,
|
||||
which may be useful if the labels are the same (or overlapping) on
|
||||
the passed axis number.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
objs : a sequence or mapping of Series or DataFrame objects
|
||||
If a mapping is passed, the sorted keys will be used as the `keys`
|
||||
argument, unless it is passed, in which case the values will be
|
||||
selected (see below). Any None objects will be dropped silently unless
|
||||
they are all None in which case a ValueError will be raised.
|
||||
axis : {0/'index', 1/'columns'}, default 0
|
||||
The axis to concatenate along.
|
||||
join : {'inner', 'outer'}, default 'outer'
|
||||
How to handle indexes on other axis (or axes).
|
||||
ignore_index : bool, default False
|
||||
If True, do not use the index values along the concatenation axis. The
|
||||
resulting axis will be labeled 0, ..., n - 1. This is useful if you are
|
||||
concatenating objects where the concatenation axis does not have
|
||||
meaningful indexing information. Note the index values on the other
|
||||
axes are still respected in the join.
|
||||
keys : sequence, default None
|
||||
If multiple levels passed, should contain tuples. Construct
|
||||
hierarchical index using the passed keys as the outermost level.
|
||||
levels : list of sequences, default None
|
||||
Specific levels (unique values) to use for constructing a
|
||||
MultiIndex. Otherwise they will be inferred from the keys.
|
||||
names : list, default None
|
||||
Names for the levels in the resulting hierarchical index.
|
||||
verify_integrity : bool, default False
|
||||
Check whether the new concatenated axis contains duplicates. This can
|
||||
be very expensive relative to the actual data concatenation.
|
||||
sort : bool, default False
|
||||
Sort non-concatenation axis if it is not already aligned. One exception to
|
||||
this is when the non-concatentation axis is a DatetimeIndex and join='outer'
|
||||
and the axis is not already aligned. In that case, the non-concatenation
|
||||
axis is always sorted lexicographically.
|
||||
copy : bool, default True
|
||||
If False, do not copy data unnecessarily.
|
||||
|
||||
Returns
|
||||
-------
|
||||
object, type of objs
|
||||
When concatenating all ``Series`` along the index (axis=0), a
|
||||
``Series`` is returned. When ``objs`` contains at least one
|
||||
``DataFrame``, a ``DataFrame`` is returned. When concatenating along
|
||||
the columns (axis=1), a ``DataFrame`` is returned.
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.join : Join DataFrames using indexes.
|
||||
DataFrame.merge : Merge DataFrames by indexes or columns.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The keys, levels, and names arguments are all optional.
|
||||
|
||||
A walkthrough of how this method fits in with other tools for combining
|
||||
pandas objects can be found `here
|
||||
<https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
|
||||
|
||||
It is not recommended to build DataFrames by adding single rows in a
|
||||
for loop. Build a list of rows and make a DataFrame in a single concat.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Combine two ``Series``.
|
||||
|
||||
>>> s1 = pd.Series(['a', 'b'])
|
||||
>>> s2 = pd.Series(['c', 'd'])
|
||||
>>> pd.concat([s1, s2])
|
||||
0 a
|
||||
1 b
|
||||
0 c
|
||||
1 d
|
||||
dtype: object
|
||||
|
||||
Clear the existing index and reset it in the result
|
||||
by setting the ``ignore_index`` option to ``True``.
|
||||
|
||||
>>> pd.concat([s1, s2], ignore_index=True)
|
||||
0 a
|
||||
1 b
|
||||
2 c
|
||||
3 d
|
||||
dtype: object
|
||||
|
||||
Add a hierarchical index at the outermost level of
|
||||
the data with the ``keys`` option.
|
||||
|
||||
>>> pd.concat([s1, s2], keys=['s1', 's2'])
|
||||
s1 0 a
|
||||
1 b
|
||||
s2 0 c
|
||||
1 d
|
||||
dtype: object
|
||||
|
||||
Label the index keys you create with the ``names`` option.
|
||||
|
||||
>>> pd.concat([s1, s2], keys=['s1', 's2'],
|
||||
... names=['Series name', 'Row ID'])
|
||||
Series name Row ID
|
||||
s1 0 a
|
||||
1 b
|
||||
s2 0 c
|
||||
1 d
|
||||
dtype: object
|
||||
|
||||
Combine two ``DataFrame`` objects with identical columns.
|
||||
|
||||
>>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
|
||||
... columns=['letter', 'number'])
|
||||
>>> df1
|
||||
letter number
|
||||
0 a 1
|
||||
1 b 2
|
||||
>>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
|
||||
... columns=['letter', 'number'])
|
||||
>>> df2
|
||||
letter number
|
||||
0 c 3
|
||||
1 d 4
|
||||
>>> pd.concat([df1, df2])
|
||||
letter number
|
||||
0 a 1
|
||||
1 b 2
|
||||
0 c 3
|
||||
1 d 4
|
||||
|
||||
Combine ``DataFrame`` objects with overlapping columns
|
||||
and return everything. Columns outside the intersection will
|
||||
be filled with ``NaN`` values.
|
||||
|
||||
>>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
|
||||
... columns=['letter', 'number', 'animal'])
|
||||
>>> df3
|
||||
letter number animal
|
||||
0 c 3 cat
|
||||
1 d 4 dog
|
||||
>>> pd.concat([df1, df3], sort=False)
|
||||
letter number animal
|
||||
0 a 1 NaN
|
||||
1 b 2 NaN
|
||||
0 c 3 cat
|
||||
1 d 4 dog
|
||||
|
||||
Combine ``DataFrame`` objects with overlapping columns
|
||||
and return only those that are shared by passing ``inner`` to
|
||||
the ``join`` keyword argument.
|
||||
|
||||
>>> pd.concat([df1, df3], join="inner")
|
||||
letter number
|
||||
0 a 1
|
||||
1 b 2
|
||||
0 c 3
|
||||
1 d 4
|
||||
|
||||
Combine ``DataFrame`` objects horizontally along the x axis by
|
||||
passing in ``axis=1``.
|
||||
|
||||
>>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
|
||||
... columns=['animal', 'name'])
|
||||
>>> pd.concat([df1, df4], axis=1)
|
||||
letter number animal name
|
||||
0 a 1 bird polly
|
||||
1 b 2 monkey george
|
||||
|
||||
Prevent the result from including duplicate index values with the
|
||||
``verify_integrity`` option.
|
||||
|
||||
>>> df5 = pd.DataFrame([1], index=['a'])
|
||||
>>> df5
|
||||
0
|
||||
a 1
|
||||
>>> df6 = pd.DataFrame([2], index=['a'])
|
||||
>>> df6
|
||||
0
|
||||
a 2
|
||||
>>> pd.concat([df5, df6], verify_integrity=True)
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: Indexes have overlapping values: ['a']
|
||||
|
||||
Append a single row to the end of a ``DataFrame`` object.
|
||||
|
||||
>>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0])
|
||||
>>> df7
|
||||
a b
|
||||
0 1 2
|
||||
>>> new_row = pd.Series({'a': 3, 'b': 4})
|
||||
>>> new_row
|
||||
a 3
|
||||
b 4
|
||||
dtype: int64
|
||||
>>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
|
||||
a b
|
||||
0 1 2
|
||||
1 3 4
|
||||
"""
|
||||
if copy is None:
|
||||
if using_copy_on_write():
|
||||
copy = False
|
||||
else:
|
||||
copy = True
|
||||
elif copy and using_copy_on_write():
|
||||
copy = False
|
||||
|
||||
op = _Concatenator(
|
||||
objs,
|
||||
axis=axis,
|
||||
ignore_index=ignore_index,
|
||||
join=join,
|
||||
keys=keys,
|
||||
levels=levels,
|
||||
names=names,
|
||||
verify_integrity=verify_integrity,
|
||||
copy=copy,
|
||||
sort=sort,
|
||||
)
|
||||
|
||||
return op.get_result()
|
||||
|
||||
|
||||
class _Concatenator:
|
||||
"""
|
||||
Orchestrates a concatenation operation for BlockManagers
|
||||
"""
|
||||
|
||||
sort: bool
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
axis: Axis = 0,
|
||||
join: str = "outer",
|
||||
keys: Iterable[Hashable] | None = None,
|
||||
levels=None,
|
||||
names: list[HashableT] | None = None,
|
||||
ignore_index: bool = False,
|
||||
verify_integrity: bool = False,
|
||||
copy: bool = True,
|
||||
sort: bool = False,
|
||||
) -> None:
|
||||
if isinstance(objs, (ABCSeries, ABCDataFrame, str)):
|
||||
raise TypeError(
|
||||
"first argument must be an iterable of pandas "
|
||||
f'objects, you passed an object of type "{type(objs).__name__}"'
|
||||
)
|
||||
|
||||
if join == "outer":
|
||||
self.intersect = False
|
||||
elif join == "inner":
|
||||
self.intersect = True
|
||||
else: # pragma: no cover
|
||||
raise ValueError(
|
||||
"Only can inner (intersect) or outer (union) join the other axis"
|
||||
)
|
||||
|
||||
if not is_bool(sort):
|
||||
raise ValueError(
|
||||
f"The 'sort' keyword only accepts boolean values; {sort} was passed."
|
||||
)
|
||||
# Incompatible types in assignment (expression has type "Union[bool, bool_]",
|
||||
# variable has type "bool")
|
||||
self.sort = sort # type: ignore[assignment]
|
||||
|
||||
self.ignore_index = ignore_index
|
||||
self.verify_integrity = verify_integrity
|
||||
self.copy = copy
|
||||
|
||||
objs, keys = self._clean_keys_and_objs(objs, keys)
|
||||
|
||||
# figure out what our result ndim is going to be
|
||||
ndims = self._get_ndims(objs)
|
||||
sample, objs = self._get_sample_object(objs, ndims, keys, names, levels)
|
||||
|
||||
# Standardize axis parameter to int
|
||||
if sample.ndim == 1:
|
||||
from pandas import DataFrame
|
||||
|
||||
axis = DataFrame._get_axis_number(axis)
|
||||
self._is_frame = False
|
||||
self._is_series = True
|
||||
else:
|
||||
axis = sample._get_axis_number(axis)
|
||||
self._is_frame = True
|
||||
self._is_series = False
|
||||
|
||||
# Need to flip BlockManager axis in the DataFrame special case
|
||||
axis = sample._get_block_manager_axis(axis)
|
||||
|
||||
# if we have mixed ndims, then convert to highest ndim
|
||||
# creating column numbers as needed
|
||||
if len(ndims) > 1:
|
||||
objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis)
|
||||
|
||||
self.objs = objs
|
||||
|
||||
# note: this is the BlockManager axis (since DataFrame is transposed)
|
||||
self.bm_axis = axis
|
||||
self.axis = 1 - self.bm_axis if self._is_frame else 0
|
||||
self.keys = keys
|
||||
self.names = names or getattr(keys, "names", None)
|
||||
self.levels = levels
|
||||
|
||||
def _get_ndims(self, objs: list[Series | DataFrame]) -> set[int]:
|
||||
# figure out what our result ndim is going to be
|
||||
ndims = set()
|
||||
for obj in objs:
|
||||
if not isinstance(obj, (ABCSeries, ABCDataFrame)):
|
||||
msg = (
|
||||
f"cannot concatenate object of type '{type(obj)}'; "
|
||||
"only Series and DataFrame objs are valid"
|
||||
)
|
||||
raise TypeError(msg)
|
||||
|
||||
ndims.add(obj.ndim)
|
||||
return ndims
|
||||
|
||||
def _clean_keys_and_objs(
|
||||
self,
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
keys,
|
||||
) -> tuple[list[Series | DataFrame], Index | None]:
|
||||
if isinstance(objs, abc.Mapping):
|
||||
if keys is None:
|
||||
keys = list(objs.keys())
|
||||
objs_list = [objs[k] for k in keys]
|
||||
else:
|
||||
objs_list = list(objs)
|
||||
|
||||
if len(objs_list) == 0:
|
||||
raise ValueError("No objects to concatenate")
|
||||
|
||||
if keys is None:
|
||||
objs_list = list(com.not_none(*objs_list))
|
||||
else:
|
||||
# GH#1649
|
||||
clean_keys = []
|
||||
clean_objs = []
|
||||
if is_iterator(keys):
|
||||
keys = list(keys)
|
||||
if len(keys) != len(objs_list):
|
||||
# GH#43485
|
||||
warnings.warn(
|
||||
"The behavior of pd.concat with len(keys) != len(objs) is "
|
||||
"deprecated. In a future version this will raise instead of "
|
||||
"truncating to the smaller of the two sequences",
|
||||
FutureWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
for k, v in zip(keys, objs_list):
|
||||
if v is None:
|
||||
continue
|
||||
clean_keys.append(k)
|
||||
clean_objs.append(v)
|
||||
objs_list = clean_objs
|
||||
|
||||
if isinstance(keys, MultiIndex):
|
||||
# TODO: retain levels?
|
||||
keys = type(keys).from_tuples(clean_keys, names=keys.names)
|
||||
else:
|
||||
name = getattr(keys, "name", None)
|
||||
keys = Index(clean_keys, name=name, dtype=getattr(keys, "dtype", None))
|
||||
|
||||
if len(objs_list) == 0:
|
||||
raise ValueError("All objects passed were None")
|
||||
|
||||
return objs_list, keys
|
||||
|
||||
def _get_sample_object(
|
||||
self,
|
||||
objs: list[Series | DataFrame],
|
||||
ndims: set[int],
|
||||
keys,
|
||||
names,
|
||||
levels,
|
||||
) -> tuple[Series | DataFrame, list[Series | DataFrame]]:
|
||||
# get the sample
|
||||
# want the highest ndim that we have, and must be non-empty
|
||||
# unless all objs are empty
|
||||
sample: Series | DataFrame | None = None
|
||||
if len(ndims) > 1:
|
||||
max_ndim = max(ndims)
|
||||
for obj in objs:
|
||||
if obj.ndim == max_ndim and np.sum(obj.shape):
|
||||
sample = obj
|
||||
break
|
||||
|
||||
else:
|
||||
# filter out the empties if we do not have multi-index possibilities
|
||||
# note: keep empty Series, as they affect the result columns / name
|
||||
non_empties = [obj for obj in objs if sum(obj.shape) > 0 or obj.ndim == 1]
|
||||
|
||||
if len(non_empties) and (
|
||||
keys is None and names is None and levels is None and not self.intersect
|
||||
):
|
||||
objs = non_empties
|
||||
sample = objs[0]
|
||||
|
||||
if sample is None:
|
||||
sample = objs[0]
|
||||
return sample, objs
|
||||
|
||||
def _sanitize_mixed_ndim(
|
||||
self,
|
||||
objs: list[Series | DataFrame],
|
||||
sample: Series | DataFrame,
|
||||
ignore_index: bool,
|
||||
axis: AxisInt,
|
||||
) -> list[Series | DataFrame]:
|
||||
# if we have mixed ndims, then convert to highest ndim
|
||||
# creating column numbers as needed
|
||||
|
||||
new_objs = []
|
||||
|
||||
current_column = 0
|
||||
max_ndim = sample.ndim
|
||||
for obj in objs:
|
||||
ndim = obj.ndim
|
||||
if ndim == max_ndim:
|
||||
pass
|
||||
|
||||
elif ndim != max_ndim - 1:
|
||||
raise ValueError(
|
||||
"cannot concatenate unaligned mixed dimensional NDFrame objects"
|
||||
)
|
||||
|
||||
else:
|
||||
name = getattr(obj, "name", None)
|
||||
if ignore_index or name is None:
|
||||
if axis == 1:
|
||||
# doing a row-wise concatenation so need everything
|
||||
# to line up
|
||||
name = 0
|
||||
else:
|
||||
# doing a column-wise concatenation so need series
|
||||
# to have unique names
|
||||
name = current_column
|
||||
current_column += 1
|
||||
|
||||
obj = sample._constructor({name: obj}, copy=False)
|
||||
|
||||
new_objs.append(obj)
|
||||
|
||||
return new_objs
|
||||
|
||||
def get_result(self):
|
||||
cons: Callable[..., DataFrame | Series]
|
||||
sample: DataFrame | Series
|
||||
|
||||
# series only
|
||||
if self._is_series:
|
||||
sample = cast("Series", self.objs[0])
|
||||
|
||||
# stack blocks
|
||||
if self.bm_axis == 0:
|
||||
name = com.consensus_name_attr(self.objs)
|
||||
cons = sample._constructor
|
||||
|
||||
arrs = [ser._values for ser in self.objs]
|
||||
|
||||
res = concat_compat(arrs, axis=0)
|
||||
|
||||
new_index: Index
|
||||
if self.ignore_index:
|
||||
# We can avoid surprisingly-expensive _get_concat_axis
|
||||
new_index = default_index(len(res))
|
||||
else:
|
||||
new_index = self.new_axes[0]
|
||||
|
||||
mgr = type(sample._mgr).from_array(res, index=new_index)
|
||||
|
||||
result = sample._constructor_from_mgr(mgr, axes=mgr.axes)
|
||||
result._name = name
|
||||
return result.__finalize__(self, method="concat")
|
||||
|
||||
# combine as columns in a frame
|
||||
else:
|
||||
data = dict(zip(range(len(self.objs)), self.objs))
|
||||
|
||||
# GH28330 Preserves subclassed objects through concat
|
||||
cons = sample._constructor_expanddim
|
||||
|
||||
index, columns = self.new_axes
|
||||
df = cons(data, index=index, copy=self.copy)
|
||||
df.columns = columns
|
||||
return df.__finalize__(self, method="concat")
|
||||
|
||||
# combine block managers
|
||||
else:
|
||||
sample = cast("DataFrame", self.objs[0])
|
||||
|
||||
mgrs_indexers = []
|
||||
for obj in self.objs:
|
||||
indexers = {}
|
||||
for ax, new_labels in enumerate(self.new_axes):
|
||||
# ::-1 to convert BlockManager ax to DataFrame ax
|
||||
if ax == self.bm_axis:
|
||||
# Suppress reindexing on concat axis
|
||||
continue
|
||||
|
||||
# 1-ax to convert BlockManager axis to DataFrame axis
|
||||
obj_labels = obj.axes[1 - ax]
|
||||
if not new_labels.equals(obj_labels):
|
||||
indexers[ax] = obj_labels.get_indexer(new_labels)
|
||||
|
||||
mgrs_indexers.append((obj._mgr, indexers))
|
||||
|
||||
new_data = concatenate_managers(
|
||||
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
|
||||
)
|
||||
if not self.copy and not using_copy_on_write():
|
||||
new_data._consolidate_inplace()
|
||||
|
||||
out = sample._constructor_from_mgr(new_data, axes=new_data.axes)
|
||||
return out.__finalize__(self, method="concat")
|
||||
|
||||
def _get_result_dim(self) -> int:
|
||||
if self._is_series and self.bm_axis == 1:
|
||||
return 2
|
||||
else:
|
||||
return self.objs[0].ndim
|
||||
|
||||
@cache_readonly
|
||||
def new_axes(self) -> list[Index]:
|
||||
ndim = self._get_result_dim()
|
||||
return [
|
||||
self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
|
||||
for i in range(ndim)
|
||||
]
|
||||
|
||||
def _get_comb_axis(self, i: AxisInt) -> Index:
|
||||
data_axis = self.objs[0]._get_block_manager_axis(i)
|
||||
return get_objs_combined_axis(
|
||||
self.objs,
|
||||
axis=data_axis,
|
||||
intersect=self.intersect,
|
||||
sort=self.sort,
|
||||
copy=self.copy,
|
||||
)
|
||||
|
||||
@cache_readonly
|
||||
def _get_concat_axis(self) -> Index:
|
||||
"""
|
||||
Return index to be used along concatenation axis.
|
||||
"""
|
||||
if self._is_series:
|
||||
if self.bm_axis == 0:
|
||||
indexes = [x.index for x in self.objs]
|
||||
elif self.ignore_index:
|
||||
idx = default_index(len(self.objs))
|
||||
return idx
|
||||
elif self.keys is None:
|
||||
names: list[Hashable] = [None] * len(self.objs)
|
||||
num = 0
|
||||
has_names = False
|
||||
for i, x in enumerate(self.objs):
|
||||
if x.ndim != 1:
|
||||
raise TypeError(
|
||||
f"Cannot concatenate type 'Series' with "
|
||||
f"object of type '{type(x).__name__}'"
|
||||
)
|
||||
if x.name is not None:
|
||||
names[i] = x.name
|
||||
has_names = True
|
||||
else:
|
||||
names[i] = num
|
||||
num += 1
|
||||
if has_names:
|
||||
return Index(names)
|
||||
else:
|
||||
return default_index(len(self.objs))
|
||||
else:
|
||||
return ensure_index(self.keys).set_names(self.names)
|
||||
else:
|
||||
indexes = [x.axes[self.axis] for x in self.objs]
|
||||
|
||||
if self.ignore_index:
|
||||
idx = default_index(sum(len(i) for i in indexes))
|
||||
return idx
|
||||
|
||||
if self.keys is None:
|
||||
if self.levels is not None:
|
||||
raise ValueError("levels supported only when keys is not None")
|
||||
concat_axis = _concat_indexes(indexes)
|
||||
else:
|
||||
concat_axis = _make_concat_multiindex(
|
||||
indexes, self.keys, self.levels, self.names
|
||||
)
|
||||
|
||||
self._maybe_check_integrity(concat_axis)
|
||||
|
||||
return concat_axis
|
||||
|
||||
def _maybe_check_integrity(self, concat_index: Index):
|
||||
if self.verify_integrity:
|
||||
if not concat_index.is_unique:
|
||||
overlap = concat_index[concat_index.duplicated()].unique()
|
||||
raise ValueError(f"Indexes have overlapping values: {overlap}")
|
||||
|
||||
|
||||
def _concat_indexes(indexes) -> Index:
|
||||
return indexes[0].append(indexes[1:])
|
||||
|
||||
|
||||
def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex:
|
||||
if (levels is None and isinstance(keys[0], tuple)) or (
|
||||
levels is not None and len(levels) > 1
|
||||
):
|
||||
zipped = list(zip(*keys))
|
||||
if names is None:
|
||||
names = [None] * len(zipped)
|
||||
|
||||
if levels is None:
|
||||
_, levels = factorize_from_iterables(zipped)
|
||||
else:
|
||||
levels = [ensure_index(x) for x in levels]
|
||||
else:
|
||||
zipped = [keys]
|
||||
if names is None:
|
||||
names = [None]
|
||||
|
||||
if levels is None:
|
||||
levels = [ensure_index(keys).unique()]
|
||||
else:
|
||||
levels = [ensure_index(x) for x in levels]
|
||||
|
||||
for level in levels:
|
||||
if not level.is_unique:
|
||||
raise ValueError(f"Level values not unique: {level.tolist()}")
|
||||
|
||||
if not all_indexes_same(indexes) or not all(level.is_unique for level in levels):
|
||||
codes_list = []
|
||||
|
||||
# things are potentially different sizes, so compute the exact codes
|
||||
# for each level and pass those to MultiIndex.from_arrays
|
||||
|
||||
for hlevel, level in zip(zipped, levels):
|
||||
to_concat = []
|
||||
if isinstance(hlevel, Index) and hlevel.equals(level):
|
||||
lens = [len(idx) for idx in indexes]
|
||||
codes_list.append(np.repeat(np.arange(len(hlevel)), lens))
|
||||
else:
|
||||
for key, index in zip(hlevel, indexes):
|
||||
# Find matching codes, include matching nan values as equal.
|
||||
mask = (isna(level) & isna(key)) | (level == key)
|
||||
if not mask.any():
|
||||
raise ValueError(f"Key {key} not in level {level}")
|
||||
i = np.nonzero(mask)[0][0]
|
||||
|
||||
to_concat.append(np.repeat(i, len(index)))
|
||||
codes_list.append(np.concatenate(to_concat))
|
||||
|
||||
concat_index = _concat_indexes(indexes)
|
||||
|
||||
# these go at the end
|
||||
if isinstance(concat_index, MultiIndex):
|
||||
levels.extend(concat_index.levels)
|
||||
codes_list.extend(concat_index.codes)
|
||||
else:
|
||||
codes, categories = factorize_from_iterable(concat_index)
|
||||
levels.append(categories)
|
||||
codes_list.append(codes)
|
||||
|
||||
if len(names) == len(levels):
|
||||
names = list(names)
|
||||
else:
|
||||
# make sure that all of the passed indices have the same nlevels
|
||||
if not len({idx.nlevels for idx in indexes}) == 1:
|
||||
raise AssertionError(
|
||||
"Cannot concat indices that do not have the same number of levels"
|
||||
)
|
||||
|
||||
# also copies
|
||||
names = list(names) + list(get_unanimous_names(*indexes))
|
||||
|
||||
return MultiIndex(
|
||||
levels=levels, codes=codes_list, names=names, verify_integrity=False
|
||||
)
|
||||
|
||||
new_index = indexes[0]
|
||||
n = len(new_index)
|
||||
kpieces = len(indexes)
|
||||
|
||||
# also copies
|
||||
new_names = list(names)
|
||||
new_levels = list(levels)
|
||||
|
||||
# construct codes
|
||||
new_codes = []
|
||||
|
||||
# do something a bit more speedy
|
||||
|
||||
for hlevel, level in zip(zipped, levels):
|
||||
hlevel_index = ensure_index(hlevel)
|
||||
mapped = level.get_indexer(hlevel_index)
|
||||
|
||||
mask = mapped == -1
|
||||
if mask.any():
|
||||
raise ValueError(
|
||||
f"Values not found in passed level: {hlevel_index[mask]!s}"
|
||||
)
|
||||
|
||||
new_codes.append(np.repeat(mapped, n))
|
||||
|
||||
if isinstance(new_index, MultiIndex):
|
||||
new_levels.extend(new_index.levels)
|
||||
new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
|
||||
else:
|
||||
new_levels.append(new_index.unique())
|
||||
single_codes = new_index.unique().get_indexer(new_index)
|
||||
new_codes.append(np.tile(single_codes, kpieces))
|
||||
|
||||
if len(new_names) < len(new_levels):
|
||||
new_names.extend(new_index.names)
|
||||
|
||||
return MultiIndex(
|
||||
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
|
||||
)
|
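The keys/names handling implemented by _make_concat_multiindex above is what produces the hierarchical result index. A short usage sketch of that behaviour, using only the documented concat API:

import pandas as pd

s1 = pd.Series([1, 2], index=["x", "y"])
s2 = pd.Series([3, 4], index=["x", "y"])
# keys become the outermost index level; names labels both levels
out = pd.concat([s1, s2], keys=["first", "second"], names=["source", "idx"])
# out.index is a MultiIndex: ("first", "x"), ("first", "y"), ("second", "x"), ("second", "y")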
lib/python3.11/site-packages/pandas/core/reshape/encoding.py (new file, 571 lines)
@@ -0,0 +1,571 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
)
|
||||
import itertools
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import missing as libmissing
|
||||
from pandas._libs.sparse import IntIndex
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_integer_dtype,
|
||||
is_list_like,
|
||||
is_object_dtype,
|
||||
pandas_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
ArrowDtype,
|
||||
CategoricalDtype,
|
||||
)
|
||||
|
||||
from pandas.core.arrays import SparseArray
|
||||
from pandas.core.arrays.categorical import factorize_from_iterable
|
||||
from pandas.core.arrays.string_ import StringDtype
|
||||
from pandas.core.frame import DataFrame
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
default_index,
|
||||
)
|
||||
from pandas.core.series import Series
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import NpDtype
|
||||
|
||||
|
||||
def get_dummies(
|
||||
data,
|
||||
prefix=None,
|
||||
prefix_sep: str | Iterable[str] | dict[str, str] = "_",
|
||||
dummy_na: bool = False,
|
||||
columns=None,
|
||||
sparse: bool = False,
|
||||
drop_first: bool = False,
|
||||
dtype: NpDtype | None = None,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Convert categorical variable into dummy/indicator variables.
|
||||
|
||||
Each variable is converted into as many 0/1 variables as there are different
|
||||
values. Columns in the output are each named after a value; if the input is
|
||||
a DataFrame, the name of the original variable is prepended to the value.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array-like, Series, or DataFrame
|
||||
Data of which to get dummy indicators.
|
||||
prefix : str, list of str, or dict of str, default None
|
||||
String to prepend to the names of the generated dummy columns.
|
||||
Pass a list with length equal to the number of columns
|
||||
when calling get_dummies on a DataFrame. Alternatively, `prefix`
|
||||
can be a dictionary mapping column names to prefixes.
|
||||
prefix_sep : str, default '_'
|
||||
If appending prefix, separator/delimiter to use. Or pass a
|
||||
list or dictionary as with `prefix`.
|
||||
dummy_na : bool, default False
|
||||
Add a column to indicate NaNs, if False NaNs are ignored.
|
||||
columns : list-like, default None
|
||||
Column names in the DataFrame to be encoded.
|
||||
If `columns` is None then all the columns with
|
||||
`object`, `string`, or `category` dtype will be converted.
|
||||
sparse : bool, default False
|
||||
Whether the dummy-encoded columns should be backed by
|
||||
a :class:`SparseArray` (True) or a regular NumPy array (False).
|
||||
drop_first : bool, default False
|
||||
Whether to get k-1 dummies out of k categorical levels by removing the
|
||||
first level.
|
||||
dtype : dtype, default bool
|
||||
Data type for new columns. Only a single dtype is allowed.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Dummy-coded data. If `data` contains other columns than the
|
||||
dummy-coded one(s), these will be prepended, unaltered, to the result.
|
||||
|
||||
See Also
|
||||
--------
|
||||
Series.str.get_dummies : Convert Series of strings to dummy codes.
|
||||
:func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Reference :ref:`the user guide <reshaping.dummies>` for more examples.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = pd.Series(list('abca'))
|
||||
|
||||
>>> pd.get_dummies(s)
|
||||
a b c
|
||||
0 True False False
|
||||
1 False True False
|
||||
2 False False True
|
||||
3 True False False
|
||||
|
||||
>>> s1 = ['a', 'b', np.nan]
|
||||
|
||||
>>> pd.get_dummies(s1)
|
||||
a b
|
||||
0 True False
|
||||
1 False True
|
||||
2 False False
|
||||
|
||||
>>> pd.get_dummies(s1, dummy_na=True)
|
||||
a b NaN
|
||||
0 True False False
|
||||
1 False True False
|
||||
2 False False True
|
||||
|
||||
>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
|
||||
... 'C': [1, 2, 3]})
|
||||
|
||||
>>> pd.get_dummies(df, prefix=['col1', 'col2'])
|
||||
C col1_a col1_b col2_a col2_b col2_c
|
||||
0 1 True False False True False
|
||||
1 2 False True True False False
|
||||
2 3 True False False False True
|
||||
|
||||
>>> pd.get_dummies(pd.Series(list('abcaa')))
|
||||
a b c
|
||||
0 True False False
|
||||
1 False True False
|
||||
2 False False True
|
||||
3 True False False
|
||||
4 True False False
|
||||
|
||||
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
|
||||
b c
|
||||
0 False False
|
||||
1 True False
|
||||
2 False True
|
||||
3 False False
|
||||
4 False False
|
||||
|
||||
>>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
|
||||
a b c
|
||||
0 1.0 0.0 0.0
|
||||
1 0.0 1.0 0.0
|
||||
2 0.0 0.0 1.0
|
||||
"""
|
||||
from pandas.core.reshape.concat import concat
|
||||
|
||||
dtypes_to_encode = ["object", "string", "category"]
|
||||
|
||||
if isinstance(data, DataFrame):
|
||||
# determine columns being encoded
|
||||
if columns is None:
|
||||
data_to_encode = data.select_dtypes(include=dtypes_to_encode)
|
||||
elif not is_list_like(columns):
|
||||
raise TypeError("Input must be a list-like for parameter `columns`")
|
||||
else:
|
||||
data_to_encode = data[columns]
|
||||
|
||||
# validate prefixes and separator to avoid silently dropping cols
|
||||
def check_len(item, name: str):
|
||||
if is_list_like(item):
|
||||
if not len(item) == data_to_encode.shape[1]:
|
||||
len_msg = (
|
||||
f"Length of '{name}' ({len(item)}) did not match the "
|
||||
"length of the columns being encoded "
|
||||
f"({data_to_encode.shape[1]})."
|
||||
)
|
||||
raise ValueError(len_msg)
|
||||
|
||||
check_len(prefix, "prefix")
|
||||
check_len(prefix_sep, "prefix_sep")
|
||||
|
||||
if isinstance(prefix, str):
|
||||
prefix = itertools.cycle([prefix])
|
||||
if isinstance(prefix, dict):
|
||||
prefix = [prefix[col] for col in data_to_encode.columns]
|
||||
|
||||
if prefix is None:
|
||||
prefix = data_to_encode.columns
|
||||
|
||||
# validate separators
|
||||
if isinstance(prefix_sep, str):
|
||||
prefix_sep = itertools.cycle([prefix_sep])
|
||||
elif isinstance(prefix_sep, dict):
|
||||
prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
|
||||
|
||||
with_dummies: list[DataFrame]
|
||||
if data_to_encode.shape == data.shape:
|
||||
# Encoding the entire df, do not prepend any dropped columns
|
||||
with_dummies = []
|
||||
elif columns is not None:
|
||||
# Encoding only cols specified in columns. Get all cols not in
|
||||
# columns to prepend to result.
|
||||
with_dummies = [data.drop(columns, axis=1)]
|
||||
else:
|
||||
# Encoding only object and category dtype columns. Get remaining
|
||||
# columns to prepend to result.
|
||||
with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
|
||||
|
||||
for col, pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
|
||||
# col is (column_name, column), use just column data here
|
||||
dummy = _get_dummies_1d(
|
||||
col[1],
|
||||
prefix=pre,
|
||||
prefix_sep=sep,
|
||||
dummy_na=dummy_na,
|
||||
sparse=sparse,
|
||||
drop_first=drop_first,
|
||||
dtype=dtype,
|
||||
)
|
||||
with_dummies.append(dummy)
|
||||
result = concat(with_dummies, axis=1)
|
||||
else:
|
||||
result = _get_dummies_1d(
|
||||
data,
|
||||
prefix,
|
||||
prefix_sep,
|
||||
dummy_na,
|
||||
sparse=sparse,
|
||||
drop_first=drop_first,
|
||||
dtype=dtype,
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def _get_dummies_1d(
|
||||
data,
|
||||
prefix,
|
||||
prefix_sep: str | Iterable[str] | dict[str, str] = "_",
|
||||
dummy_na: bool = False,
|
||||
sparse: bool = False,
|
||||
drop_first: bool = False,
|
||||
dtype: NpDtype | None = None,
|
||||
) -> DataFrame:
|
||||
from pandas.core.reshape.concat import concat
|
||||
|
||||
# Series avoids inconsistent NaN handling
|
||||
codes, levels = factorize_from_iterable(Series(data, copy=False))
|
||||
|
||||
if dtype is None and hasattr(data, "dtype"):
|
||||
input_dtype = data.dtype
|
||||
if isinstance(input_dtype, CategoricalDtype):
|
||||
input_dtype = input_dtype.categories.dtype
|
||||
|
||||
if isinstance(input_dtype, ArrowDtype):
|
||||
import pyarrow as pa
|
||||
|
||||
dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment]
|
||||
elif (
|
||||
isinstance(input_dtype, StringDtype)
|
||||
and input_dtype.na_value is libmissing.NA
|
||||
):
|
||||
dtype = pandas_dtype("boolean") # type: ignore[assignment]
|
||||
else:
|
||||
dtype = np.dtype(bool)
|
||||
elif dtype is None:
|
||||
dtype = np.dtype(bool)
|
||||
|
||||
_dtype = pandas_dtype(dtype)
|
||||
|
||||
if is_object_dtype(_dtype):
|
||||
raise ValueError("dtype=object is not a valid dtype for get_dummies")
|
||||
|
||||
def get_empty_frame(data) -> DataFrame:
|
||||
index: Index | np.ndarray
|
||||
if isinstance(data, Series):
|
||||
index = data.index
|
||||
else:
|
||||
index = default_index(len(data))
|
||||
return DataFrame(index=index)
|
||||
|
||||
# if all NaN
|
||||
if not dummy_na and len(levels) == 0:
|
||||
return get_empty_frame(data)
|
||||
|
||||
codes = codes.copy()
|
||||
if dummy_na:
|
||||
codes[codes == -1] = len(levels)
|
||||
levels = levels.insert(len(levels), np.nan)
|
||||
|
||||
# if dummy_na, we just fake a nan level. drop_first will drop it again
|
||||
if drop_first and len(levels) == 1:
|
||||
return get_empty_frame(data)
|
||||
|
||||
number_of_cols = len(levels)
|
||||
|
||||
if prefix is None:
|
||||
dummy_cols = levels
|
||||
else:
|
||||
dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])
|
||||
|
||||
index: Index | None
|
||||
if isinstance(data, Series):
|
||||
index = data.index
|
||||
else:
|
||||
index = None
|
||||
|
||||
if sparse:
|
||||
fill_value: bool | float
|
||||
if is_integer_dtype(dtype):
|
||||
fill_value = 0
|
||||
elif dtype == np.dtype(bool):
|
||||
fill_value = False
|
||||
else:
|
||||
fill_value = 0.0
|
||||
|
||||
sparse_series = []
|
||||
N = len(data)
|
||||
sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
|
||||
mask = codes != -1
|
||||
codes = codes[mask]
|
||||
n_idx = np.arange(N)[mask]
|
||||
|
||||
for ndx, code in zip(n_idx, codes):
|
||||
sp_indices[code].append(ndx)
|
||||
|
||||
if drop_first:
|
||||
# remove first categorical level to avoid perfect collinearity
|
||||
# GH12042
|
||||
sp_indices = sp_indices[1:]
|
||||
dummy_cols = dummy_cols[1:]
|
||||
for col, ixs in zip(dummy_cols, sp_indices):
|
||||
sarr = SparseArray(
|
||||
np.ones(len(ixs), dtype=dtype),
|
||||
sparse_index=IntIndex(N, ixs),
|
||||
fill_value=fill_value,
|
||||
dtype=dtype,
|
||||
)
|
||||
sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))
|
||||
|
||||
return concat(sparse_series, axis=1, copy=False)
|
||||
|
||||
else:
|
||||
# ensure ndarray layout is column-major
|
||||
shape = len(codes), number_of_cols
|
||||
dummy_dtype: NpDtype
|
||||
if isinstance(_dtype, np.dtype):
|
||||
dummy_dtype = _dtype
|
||||
else:
|
||||
dummy_dtype = np.bool_
|
||||
dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F")
|
||||
dummy_mat[np.arange(len(codes)), codes] = 1
|
||||
|
||||
if not dummy_na:
|
||||
# reset NaN GH4446
|
||||
dummy_mat[codes == -1] = 0
|
||||
|
||||
if drop_first:
|
||||
# remove first GH12042
|
||||
dummy_mat = dummy_mat[:, 1:]
|
||||
dummy_cols = dummy_cols[1:]
|
||||
return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
|
||||
|
||||
|
||||
def from_dummies(
|
||||
data: DataFrame,
|
||||
sep: None | str = None,
|
||||
default_category: None | Hashable | dict[str, Hashable] = None,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.
|
||||
|
||||
Inverts the operation performed by :func:`~pandas.get_dummies`.
|
||||
|
||||
.. versionadded:: 1.5.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : DataFrame
|
||||
Data which contains dummy-coded variables in the form of integer columns of
|
||||
1's and 0's.
|
||||
sep : str, default None
|
||||
Separator used in the column names of the dummy categories; it is the
|
||||
character indicating the separation of the categorical names from the prefixes.
|
||||
For example, if your column names are 'prefix_A' and 'prefix_B',
|
||||
you can strip the underscore by specifying sep='_'.
|
||||
default_category : None, Hashable or dict of Hashables, default None
|
||||
The default category is the implied category when a value has none of the
|
||||
listed categories specified with a one, i.e. if all dummies in a row are
|
||||
zero. Can be a single value for all variables or a dict directly mapping
|
||||
the default categories to a prefix of a variable.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Categorical data decoded from the dummy input-data.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
* When the input ``DataFrame`` ``data`` contains NA values.
|
||||
* When the input ``DataFrame`` ``data`` contains column names with separators
|
||||
that do not match the separator specified with ``sep``.
|
||||
* When a ``dict`` passed to ``default_category`` does not include an implied
|
||||
category for each prefix.
|
||||
* When a value in ``data`` has more than one category assigned to it.
|
||||
* When ``default_category=None`` and a value in ``data`` has no category
|
||||
assigned to it.
|
||||
TypeError
|
||||
* When the input ``data`` is not of type ``DataFrame``.
|
||||
* When the input ``DataFrame`` ``data`` contains non-dummy data.
|
||||
* When the passed ``sep`` is of a wrong data type.
|
||||
* When the passed ``default_category`` is of a wrong data type.
|
||||
|
||||
See Also
|
||||
--------
|
||||
:func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
|
||||
:class:`~pandas.Categorical` : Represent a categorical variable in classic R / S-plus fashion.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The columns of the passed dummy data should only include 1's and 0's,
|
||||
or boolean values.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
|
||||
... "c": [0, 0, 1, 0]})
|
||||
|
||||
>>> df
|
||||
a b c
|
||||
0 1 0 0
|
||||
1 0 1 0
|
||||
2 0 0 1
|
||||
3 1 0 0
|
||||
|
||||
>>> pd.from_dummies(df)
|
||||
0 a
|
||||
1 b
|
||||
2 c
|
||||
3 a
|
||||
|
||||
>>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
|
||||
... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
|
||||
... "col2_c": [0, 0, 1]})
|
||||
|
||||
>>> df
|
||||
col1_a col1_b col2_a col2_b col2_c
|
||||
0 1 0 0 1 0
|
||||
1 0 1 1 0 0
|
||||
2 1 0 0 0 1
|
||||
|
||||
>>> pd.from_dummies(df, sep="_")
|
||||
col1 col2
|
||||
0 a b
|
||||
1 b a
|
||||
2 a c
|
||||
|
||||
>>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
|
||||
... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
|
||||
... "col2_c": [0, 0, 0]})
|
||||
|
||||
>>> df
|
||||
col1_a col1_b col2_a col2_b col2_c
|
||||
0 1 0 0 1 0
|
||||
1 0 1 1 0 0
|
||||
2 0 0 0 0 0
|
||||
|
||||
>>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
|
||||
col1 col2
|
||||
0 a b
|
||||
1 b a
|
||||
2 d e
|
||||
"""
|
||||
from pandas.core.reshape.concat import concat
|
||||
|
||||
if not isinstance(data, DataFrame):
|
||||
raise TypeError(
|
||||
"Expected 'data' to be a 'DataFrame'; "
|
||||
f"Received 'data' of type: {type(data).__name__}"
|
||||
)
|
||||
|
||||
col_isna_mask = cast(Series, data.isna().any())
|
||||
|
||||
if col_isna_mask.any():
|
||||
raise ValueError(
|
||||
"Dummy DataFrame contains NA value in column: "
|
||||
f"'{col_isna_mask.idxmax()}'"
|
||||
)
|
||||
|
||||
# index data with a list of all columns that are dummies
|
||||
try:
|
||||
data_to_decode = data.astype("boolean", copy=False)
|
||||
except TypeError:
|
||||
raise TypeError("Passed DataFrame contains non-dummy data")
|
||||
|
||||
# collect prefixes and get lists to slice data for each prefix
|
||||
variables_slice = defaultdict(list)
|
||||
if sep is None:
|
||||
variables_slice[""] = list(data.columns)
|
||||
elif isinstance(sep, str):
|
||||
for col in data_to_decode.columns:
|
||||
prefix = col.split(sep)[0]
|
||||
if len(prefix) == len(col):
|
||||
raise ValueError(f"Separator not specified for column: {col}")
|
||||
variables_slice[prefix].append(col)
|
||||
else:
|
||||
raise TypeError(
|
||||
"Expected 'sep' to be of type 'str' or 'None'; "
|
||||
f"Received 'sep' of type: {type(sep).__name__}"
|
||||
)
|
||||
|
||||
if default_category is not None:
|
||||
if isinstance(default_category, dict):
|
||||
if not len(default_category) == len(variables_slice):
|
||||
len_msg = (
|
||||
f"Length of 'default_category' ({len(default_category)}) "
|
||||
f"did not match the length of the columns being encoded "
|
||||
f"({len(variables_slice)})"
|
||||
)
|
||||
raise ValueError(len_msg)
|
||||
elif isinstance(default_category, Hashable):
|
||||
default_category = dict(
|
||||
zip(variables_slice, [default_category] * len(variables_slice))
|
||||
)
|
||||
else:
|
||||
raise TypeError(
|
||||
"Expected 'default_category' to be of type "
|
||||
"'None', 'Hashable', or 'dict'; "
|
||||
"Received 'default_category' of type: "
|
||||
f"{type(default_category).__name__}"
|
||||
)
|
||||
|
||||
cat_data = {}
|
||||
for prefix, prefix_slice in variables_slice.items():
|
||||
if sep is None:
|
||||
cats = prefix_slice.copy()
|
||||
else:
|
||||
cats = [col[len(prefix + sep) :] for col in prefix_slice]
|
||||
assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
|
||||
if any(assigned > 1):
|
||||
raise ValueError(
|
||||
"Dummy DataFrame contains multi-assignment(s); "
|
||||
f"First instance in row: {assigned.idxmax()}"
|
||||
)
|
||||
if any(assigned == 0):
|
||||
if isinstance(default_category, dict):
|
||||
cats.append(default_category[prefix])
|
||||
else:
|
||||
raise ValueError(
|
||||
"Dummy DataFrame contains unassigned value(s); "
|
||||
f"First instance in row: {assigned.idxmin()}"
|
||||
)
|
||||
data_slice = concat(
|
||||
(data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
|
||||
)
|
||||
else:
|
||||
data_slice = data_to_decode.loc[:, prefix_slice]
|
||||
cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype)
|
||||
# get indices of True entries along axis=1
|
||||
true_values = data_slice.idxmax(axis=1)
|
||||
indexer = data_slice.columns.get_indexer_for(true_values)
|
||||
cat_data[prefix] = cats_array.take(indexer).set_axis(data.index)
|
||||
|
||||
result = DataFrame(cat_data)
|
||||
if sep is not None:
|
||||
result.columns = result.columns.astype(data.columns.dtype)
|
||||
return result
|
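get_dummies and from_dummies above are inverses of one another when a prefix separator is used. A minimal round-trip sketch, using only the documented API:

import pandas as pd

df = pd.DataFrame({"colour": ["red", "blue", "red"]})
dummies = pd.get_dummies(df)  # boolean columns "colour_blue", "colour_red"
restored = pd.from_dummies(dummies, sep="_")
# restored["colour"] holds the original values: red, blue, red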
lib/python3.11/site-packages/pandas/core/reshape/melt.py (new file, 512 lines)
@@ -0,0 +1,512 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.util._decorators import Appender
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.core.dtypes.concat import concat_compat
|
||||
from pandas.core.dtypes.missing import notna
|
||||
|
||||
import pandas.core.algorithms as algos
|
||||
from pandas.core.indexes.api import MultiIndex
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.util import tile_compat
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
from pandas.core.tools.numeric import to_numeric
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Hashable
|
||||
|
||||
from pandas._typing import AnyArrayLike
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
def ensure_list_vars(arg_vars, variable: str, columns) -> list:
|
||||
if arg_vars is not None:
|
||||
if not is_list_like(arg_vars):
|
||||
return [arg_vars]
|
||||
elif isinstance(columns, MultiIndex) and not isinstance(arg_vars, list):
|
||||
raise ValueError(
|
||||
f"{variable} must be a list of tuples when columns are a MultiIndex"
|
||||
)
|
||||
else:
|
||||
return list(arg_vars)
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
|
||||
def melt(
|
||||
frame: DataFrame,
|
||||
id_vars=None,
|
||||
value_vars=None,
|
||||
var_name=None,
|
||||
value_name: Hashable = "value",
|
||||
col_level=None,
|
||||
ignore_index: bool = True,
|
||||
) -> DataFrame:
|
||||
if value_name in frame.columns:
|
||||
raise ValueError(
|
||||
f"value_name ({value_name}) cannot match an element in "
|
||||
"the DataFrame columns."
|
||||
)
|
||||
id_vars = ensure_list_vars(id_vars, "id_vars", frame.columns)
|
||||
value_vars_was_not_none = value_vars is not None
|
||||
value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns)
|
||||
|
||||
if id_vars or value_vars:
|
||||
if col_level is not None:
|
||||
level = frame.columns.get_level_values(col_level)
|
||||
else:
|
||||
level = frame.columns
|
||||
labels = id_vars + value_vars
|
||||
idx = level.get_indexer_for(labels)
|
||||
missing = idx == -1
|
||||
if missing.any():
|
||||
missing_labels = [
|
||||
lab for lab, not_found in zip(labels, missing) if not_found
|
||||
]
|
||||
raise KeyError(
|
||||
"The following id_vars or value_vars are not present in "
|
||||
f"the DataFrame: {missing_labels}"
|
||||
)
|
||||
if value_vars_was_not_none:
|
||||
frame = frame.iloc[:, algos.unique(idx)]
|
||||
else:
|
||||
frame = frame.copy()
|
||||
else:
|
||||
frame = frame.copy()
|
||||
|
||||
if col_level is not None: # allow list or other?
|
||||
# frame is a copy
|
||||
frame.columns = frame.columns.get_level_values(col_level)
|
||||
|
||||
if var_name is None:
|
||||
if isinstance(frame.columns, MultiIndex):
|
||||
if len(frame.columns.names) == len(set(frame.columns.names)):
|
||||
var_name = frame.columns.names
|
||||
else:
|
||||
var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
|
||||
else:
|
||||
var_name = [
|
||||
frame.columns.name if frame.columns.name is not None else "variable"
|
||||
]
|
||||
elif is_list_like(var_name):
|
||||
raise ValueError(f"{var_name=} must be a scalar.")
|
||||
else:
|
||||
var_name = [var_name]
|
||||
|
||||
num_rows, K = frame.shape
|
||||
num_cols_adjusted = K - len(id_vars)
|
||||
|
||||
mdata: dict[Hashable, AnyArrayLike] = {}
|
||||
for col in id_vars:
|
||||
id_data = frame.pop(col)
|
||||
if not isinstance(id_data.dtype, np.dtype):
|
||||
# i.e. ExtensionDtype
|
||||
if num_cols_adjusted > 0:
|
||||
mdata[col] = concat([id_data] * num_cols_adjusted, ignore_index=True)
|
||||
else:
|
||||
# We can't concat empty list. (GH 46044)
|
||||
mdata[col] = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
|
||||
else:
|
||||
mdata[col] = np.tile(id_data._values, num_cols_adjusted)
|
||||
|
||||
mcolumns = id_vars + var_name + [value_name]
|
||||
|
||||
if frame.shape[1] > 0 and not any(
|
||||
not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes
|
||||
):
|
||||
mdata[value_name] = concat(
|
||||
[frame.iloc[:, i] for i in range(frame.shape[1])]
|
||||
).values
|
||||
else:
|
||||
mdata[value_name] = frame._values.ravel("F")
|
||||
for i, col in enumerate(var_name):
|
||||
mdata[col] = frame.columns._get_level_values(i).repeat(num_rows)
|
||||
|
||||
result = frame._constructor(mdata, columns=mcolumns)
|
||||
|
||||
if not ignore_index:
|
||||
result.index = tile_compat(frame.index, num_cols_adjusted)
|
||||
|
||||
return result
|
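# melt() takes its docstring from _shared_docs["melt"] via the Appender decorator
# above, so no usage example appears inline here. A minimal sketch of the
# documented behaviour (public pandas API only, assuming pandas imported as pd):
#
#     df = pd.DataFrame({"id": [1, 2], "a": [10, 20], "b": [30, 40]})
#     tidy = pd.melt(df, id_vars="id", value_vars=["a", "b"])
#     # tidy has four rows, one per (id, variable) pair,
#     # with columns "id", "variable", "value"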
||||
|
||||
|
||||
def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame:
|
||||
"""
|
||||
Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.
|
||||
|
||||
Accepts a dictionary, ``groups``, in which each key is a new column name
|
||||
and each value is a list of old column names that will be "melted" under
|
||||
the new column name as part of the reshape.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : DataFrame
|
||||
The wide-format DataFrame.
|
||||
groups : dict
|
||||
{new_name : list_of_columns}.
|
||||
dropna : bool, default True
|
||||
Do not include columns whose entries are all NaN.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Reshaped DataFrame.
|
||||
|
||||
See Also
|
||||
--------
|
||||
melt : Unpivot a DataFrame from wide to long format, optionally leaving
|
||||
identifiers set.
|
||||
pivot : Create a spreadsheet-style pivot table as a DataFrame.
|
||||
DataFrame.pivot : Pivot without aggregation that can handle
|
||||
non-numeric data.
|
||||
DataFrame.pivot_table : Generalization of pivot that can handle
|
||||
duplicate values for one index/column pair.
|
||||
DataFrame.unstack : Pivot based on the index values instead of a
|
||||
column.
|
||||
wide_to_long : Wide panel to long format. Less flexible but more
|
||||
user-friendly than melt.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
|
||||
... 'team': ['Red Sox', 'Yankees'],
|
||||
... 'year1': [2007, 2007], 'year2': [2008, 2008]})
|
||||
>>> data
|
||||
hr1 hr2 team year1 year2
|
||||
0 514 545 Red Sox 2007 2008
|
||||
1 573 526 Yankees 2007 2008
|
||||
|
||||
>>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
|
||||
team year hr
|
||||
0 Red Sox 2007 514
|
||||
1 Yankees 2007 573
|
||||
2 Red Sox 2008 545
|
||||
3 Yankees 2008 526
|
||||
"""
|
||||
mdata = {}
|
||||
pivot_cols = []
|
||||
all_cols: set[Hashable] = set()
|
||||
K = len(next(iter(groups.values())))
|
||||
for target, names in groups.items():
|
||||
if len(names) != K:
|
||||
raise ValueError("All column lists must be same length")
|
||||
to_concat = [data[col]._values for col in names]
|
||||
|
||||
mdata[target] = concat_compat(to_concat)
|
||||
pivot_cols.append(target)
|
||||
all_cols = all_cols.union(names)
|
||||
|
||||
id_cols = list(data.columns.difference(all_cols))
|
||||
for col in id_cols:
|
||||
mdata[col] = np.tile(data[col]._values, K)
|
||||
|
||||
if dropna:
|
||||
mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
|
||||
for c in pivot_cols:
|
||||
mask &= notna(mdata[c])
|
||||
if not mask.all():
|
||||
mdata = {k: v[mask] for k, v in mdata.items()}
|
||||
|
||||
return data._constructor(mdata, columns=id_cols + pivot_cols)
|
||||
|
||||
|
||||
def wide_to_long(
|
||||
df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
|
||||
) -> DataFrame:
|
||||
r"""
|
||||
Unpivot a DataFrame from wide to long format.
|
||||
|
||||
Less flexible but more user-friendly than melt.
|
||||
|
||||
With stubnames ['A', 'B'], this function expects to find one or more
|
||||
group of columns with format
|
||||
A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
|
||||
You specify what you want to call this suffix in the resulting long format
|
||||
with `j` (for example `j='year'`)
|
||||
|
||||
Each row of these wide variables is assumed to be uniquely identified by
|
||||
`i` (can be a single column name or a list of column names)
|
||||
|
||||
All remaining variables in the data frame are left intact.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
The wide-format DataFrame.
|
||||
stubnames : str or list-like
|
||||
The stub name(s). The wide format variables are assumed to
|
||||
start with the stub names.
|
||||
i : str or list-like
|
||||
Column(s) to use as id variable(s).
|
||||
j : str
|
||||
The name of the sub-observation variable. What you wish to name your
|
||||
suffix in the long format.
|
||||
sep : str, default ""
|
||||
A character indicating the separation of the variable names
|
||||
in the wide format, to be stripped from the names in the long format.
|
||||
For example, if your column names are A-suffix1, A-suffix2, you
|
||||
can strip the hyphen by specifying `sep='-'`.
|
||||
suffix : str, default '\\d+'
|
||||
A regular expression capturing the wanted suffixes. '\\d+' captures
|
||||
numeric suffixes. Suffixes with no numbers could be specified with the
|
||||
negated character class '\\D+'. You can also further disambiguate
|
||||
suffixes, for example, if your wide variables are of the form A-one,
|
||||
B-two,.., and you have an unrelated column A-rating, you can ignore the
|
||||
last one by specifying `suffix='(!?one|two)'`. When all suffixes are
|
||||
numeric, they are cast to int64/float64.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
A DataFrame that contains each stub name as a variable, with new index
|
||||
(i, j).
|
||||
|
||||
See Also
|
||||
--------
|
||||
melt : Unpivot a DataFrame from wide to long format, optionally leaving
|
||||
identifiers set.
|
||||
pivot : Create a spreadsheet-style pivot table as a DataFrame.
|
||||
DataFrame.pivot : Pivot without aggregation that can handle
|
||||
non-numeric data.
|
||||
DataFrame.pivot_table : Generalization of pivot that can handle
|
||||
duplicate values for one index/column pair.
|
||||
DataFrame.unstack : Pivot based on the index values instead of a
|
||||
column.
|
||||
|
||||
Notes
|
||||
-----
|
||||
All extra variables are left untouched. This simply uses
|
||||
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
|
||||
in a typical case.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> np.random.seed(123)
|
||||
>>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
|
||||
... "A1980" : {0 : "d", 1 : "e", 2 : "f"},
|
||||
... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
|
||||
... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
|
||||
... "X" : dict(zip(range(3), np.random.randn(3)))
|
||||
... })
|
||||
>>> df["id"] = df.index
|
||||
>>> df
|
||||
A1970 A1980 B1970 B1980 X id
|
||||
0 a d 2.5 3.2 -1.085631 0
|
||||
1 b e 1.2 1.3 0.997345 1
|
||||
2 c f 0.7 0.1 0.282978 2
|
||||
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
X A B
|
||||
id year
|
||||
0 1970 -1.085631 a 2.5
|
||||
1 1970 0.997345 b 1.2
|
||||
2 1970 0.282978 c 0.7
|
||||
0 1980 -1.085631 d 3.2
|
||||
1 1980 0.997345 e 1.3
|
||||
2 1980 0.282978 f 0.1
|
||||
|
||||
With multiple id columns
|
||||
|
||||
>>> df = pd.DataFrame({
|
||||
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||||
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
||||
... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
|
||||
... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
|
||||
... })
|
||||
>>> df
|
||||
famid birth ht1 ht2
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
|
||||
>>> l
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
ht
|
||||
famid birth age
|
||||
1 1 1 2.8
|
||||
2 3.4
|
||||
2 1 2.9
|
||||
2 3.8
|
||||
3 1 2.2
|
||||
2 2.9
|
||||
2 1 1 2.0
|
||||
2 3.2
|
||||
2 1 1.8
|
||||
2 2.8
|
||||
3 1 1.9
|
||||
2 2.4
|
||||
3 1 1 2.2
|
||||
2 3.3
|
||||
2 1 2.3
|
||||
2 3.4
|
||||
3 1 2.1
|
||||
2 2.9
|
||||
|
||||
Going from long back to wide just takes some creative use of `unstack`
|
||||
|
||||
>>> w = l.unstack()
|
||||
>>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
|
||||
>>> w.reset_index()
|
||||
famid birth ht1 ht2
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
|
||||
Less wieldy column names are also handled
|
||||
|
||||
>>> np.random.seed(0)
|
||||
>>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
|
||||
... 'A(weekly)-2011': np.random.rand(3),
|
||||
... 'B(weekly)-2010': np.random.rand(3),
|
||||
... 'B(weekly)-2011': np.random.rand(3),
|
||||
... 'X' : np.random.randint(3, size=3)})
|
||||
>>> df['id'] = df.index
|
||||
>>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
|
||||
A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id
|
||||
0 0.548814 0.544883 0.437587 0.383442 0 0
|
||||
1 0.715189 0.423655 0.891773 0.791725 1 1
|
||||
2 0.602763 0.645894 0.963663 0.528895 1 2
|
||||
|
||||
>>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
|
||||
... j='year', sep='-')
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
X A(weekly) B(weekly)
|
||||
id year
|
||||
0 2010 0 0.548814 0.437587
|
||||
1 2010 1 0.715189 0.891773
|
||||
2 2010 1 0.602763 0.963663
|
||||
0 2011 0 0.544883 0.383442
|
||||
1 2011 1 0.423655 0.791725
|
||||
2 2011 1 0.645894 0.528895
|
||||
|
||||
If we have many columns, we could also use a regex to find our
|
||||
stubnames and pass that list on to wide_to_long
|
||||
|
||||
>>> stubnames = sorted(
|
||||
... set([match[0] for match in df.columns.str.findall(
|
||||
... r'[A-B]\(.*\)').values if match != []])
|
||||
... )
|
||||
>>> list(stubnames)
|
||||
['A(weekly)', 'B(weekly)']
|
||||
|
||||
All of the above examples have integers as suffixes. It is possible to
|
||||
have non-integers as suffixes.
|
||||
|
||||
>>> df = pd.DataFrame({
|
||||
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||||
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
||||
... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
|
||||
... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
|
||||
... })
|
||||
>>> df
|
||||
famid birth ht_one ht_two
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
|
||||
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
|
||||
... sep='_', suffix=r'\w+')
|
||||
>>> l
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
ht
|
||||
famid birth age
|
||||
1 1 one 2.8
|
||||
two 3.4
|
||||
2 one 2.9
|
||||
two 3.8
|
||||
3 one 2.2
|
||||
two 2.9
|
||||
2 1 one 2.0
|
||||
two 3.2
|
||||
2 one 1.8
|
||||
two 2.8
|
||||
3 one 1.9
|
||||
two 2.4
|
||||
3 1 one 2.2
|
||||
two 3.3
|
||||
2 one 2.3
|
||||
two 3.4
|
||||
3 one 2.1
|
||||
two 2.9
|
||||
"""

    def get_var_names(df, stub: str, sep: str, suffix: str):
        regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
        return df.columns[df.columns.str.match(regex)]

    def melt_stub(df, stub: str, i, j, value_vars, sep: str):
        newdf = melt(
            df,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)

        # GH17627 Cast numerics suffixes to int/float
        try:
            newdf[j] = to_numeric(newdf[j])
        except (TypeError, ValueError, OverflowError):
            # TODO: anything else to catch?
            pass

        return newdf.set_index(i + [j])

    if not is_list_like(stubnames):
        stubnames = [stubnames]
    else:
        stubnames = list(stubnames)

    if df.columns.isin(stubnames).any():
        raise ValueError("stubname can't be identical to a column name")

    if not is_list_like(i):
        i = [i]
    else:
        i = list(i)

    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    _melted = []
    value_vars_flattened = []
    for stub in stubnames:
        value_var = get_var_names(df, stub, sep, suffix)
        value_vars_flattened.extend(value_var)
        _melted.append(melt_stub(df, stub, i, j, value_var, sep))

    melted = concat(_melted, axis=1)
    id_vars = df.columns.difference(value_vars_flattened)
    new = df[id_vars]

    if len(i) == 1:
        return new.set_index(i).join(melted)
    else:
        return new.merge(melted.reset_index(), on=i).set_index(i + [j])
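End of the melt.py hunk. For orientation only (not part of the committed file), a minimal hedged sketch of the public entry point defined above, pandas.wide_to_long, with invented column names and data:

import pandas as pd

df = pd.DataFrame(
    {
        "id": [0, 1],
        "score_2020": [10, 20],
        "score_2021": [30, 40],
    }
)

# Columns matching "score" + sep + numeric suffix are stacked; the suffix
# becomes the values of the new index level "year".
long_df = pd.wide_to_long(df, stubnames="score", i="id", j="year", sep="_")
print(long_df)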
2762
lib/python3.11/site-packages/pandas/core/reshape/merge.py
Normal file
File diff suppressed because it is too large
899
lib/python3.11/site-packages/pandas/core/reshape/pivot.py
Normal file
@ -0,0 +1,899 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Sequence,
|
||||
)
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Callable,
|
||||
Literal,
|
||||
cast,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.util._decorators import (
|
||||
Appender,
|
||||
Substitution,
|
||||
)
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
|
||||
from pandas.core.dtypes.common import (
|
||||
is_list_like,
|
||||
is_nested_list_like,
|
||||
is_scalar,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import ExtensionDtype
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCDataFrame,
|
||||
ABCSeries,
|
||||
)
|
||||
|
||||
import pandas.core.common as com
|
||||
from pandas.core.frame import _shared_docs
|
||||
from pandas.core.groupby import Grouper
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
get_objs_combined_axis,
|
||||
)
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.util import cartesian_product
|
||||
from pandas.core.series import Series
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
AggFuncType,
|
||||
AggFuncTypeBase,
|
||||
AggFuncTypeDict,
|
||||
IndexLabel,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
# Note: We need to make sure `frame` is imported before `pivot`, otherwise
|
||||
# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency
|
||||
@Substitution("\ndata : DataFrame")
|
||||
@Appender(_shared_docs["pivot_table"], indents=1)
|
||||
def pivot_table(
|
||||
data: DataFrame,
|
||||
values=None,
|
||||
index=None,
|
||||
columns=None,
|
||||
aggfunc: AggFuncType = "mean",
|
||||
fill_value=None,
|
||||
margins: bool = False,
|
||||
dropna: bool = True,
|
||||
margins_name: Hashable = "All",
|
||||
observed: bool | lib.NoDefault = lib.no_default,
|
||||
sort: bool = True,
|
||||
) -> DataFrame:
|
||||
index = _convert_by(index)
|
||||
columns = _convert_by(columns)
|
||||
|
||||
if isinstance(aggfunc, list):
|
||||
pieces: list[DataFrame] = []
|
||||
keys = []
|
||||
for func in aggfunc:
|
||||
_table = __internal_pivot_table(
|
||||
data,
|
||||
values=values,
|
||||
index=index,
|
||||
columns=columns,
|
||||
fill_value=fill_value,
|
||||
aggfunc=func,
|
||||
margins=margins,
|
||||
dropna=dropna,
|
||||
margins_name=margins_name,
|
||||
observed=observed,
|
||||
sort=sort,
|
||||
)
|
||||
pieces.append(_table)
|
||||
keys.append(getattr(func, "__name__", func))
|
||||
|
||||
table = concat(pieces, keys=keys, axis=1)
|
||||
return table.__finalize__(data, method="pivot_table")
|
||||
|
||||
table = __internal_pivot_table(
|
||||
data,
|
||||
values,
|
||||
index,
|
||||
columns,
|
||||
aggfunc,
|
||||
fill_value,
|
||||
margins,
|
||||
dropna,
|
||||
margins_name,
|
||||
observed,
|
||||
sort,
|
||||
)
|
||||
return table.__finalize__(data, method="pivot_table")
|
||||
|
||||
|
||||
def __internal_pivot_table(
|
||||
data: DataFrame,
|
||||
values,
|
||||
index,
|
||||
columns,
|
||||
aggfunc: AggFuncTypeBase | AggFuncTypeDict,
|
||||
fill_value,
|
||||
margins: bool,
|
||||
dropna: bool,
|
||||
margins_name: Hashable,
|
||||
observed: bool | lib.NoDefault,
|
||||
sort: bool,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``.
|
||||
"""
|
||||
keys = index + columns
|
||||
|
||||
values_passed = values is not None
|
||||
if values_passed:
|
||||
if is_list_like(values):
|
||||
values_multi = True
|
||||
values = list(values)
|
||||
else:
|
||||
values_multi = False
|
||||
values = [values]
|
||||
|
||||
# GH14938 Make sure value labels are in data
|
||||
for i in values:
|
||||
if i not in data:
|
||||
raise KeyError(i)
|
||||
|
||||
to_filter = []
|
||||
for x in keys + values:
|
||||
if isinstance(x, Grouper):
|
||||
x = x.key
|
||||
try:
|
||||
if x in data:
|
||||
to_filter.append(x)
|
||||
except TypeError:
|
||||
pass
|
||||
if len(to_filter) < len(data.columns):
|
||||
data = data[to_filter]
|
||||
|
||||
else:
|
||||
values = data.columns
|
||||
for key in keys:
|
||||
try:
|
||||
values = values.drop(key)
|
||||
except (TypeError, ValueError, KeyError):
|
||||
pass
|
||||
values = list(values)
|
||||
|
||||
observed_bool = False if observed is lib.no_default else observed
|
||||
grouped = data.groupby(keys, observed=observed_bool, sort=sort, dropna=dropna)
|
||||
if observed is lib.no_default and any(
|
||||
ping._passed_categorical for ping in grouped._grouper.groupings
|
||||
):
|
||||
warnings.warn(
|
||||
"The default value of observed=False is deprecated and will change "
|
||||
"to observed=True in a future version of pandas. Specify "
|
||||
"observed=False to silence this warning and retain the current behavior",
|
||||
category=FutureWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
agged = grouped.agg(aggfunc)
|
||||
|
||||
if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
|
||||
agged = agged.dropna(how="all")
|
||||
|
||||
table = agged
|
||||
|
||||
# GH17038, this check should only happen if index is defined (not None)
|
||||
if table.index.nlevels > 1 and index:
|
||||
# Related GH #17123
|
||||
# If index_names are integers, determine whether the integers refer
|
||||
# to the level position or name.
|
||||
index_names = agged.index.names[: len(index)]
|
||||
to_unstack = []
|
||||
for i in range(len(index), len(keys)):
|
||||
name = agged.index.names[i]
|
||||
if name is None or name in index_names:
|
||||
to_unstack.append(i)
|
||||
else:
|
||||
to_unstack.append(name)
|
||||
table = agged.unstack(to_unstack, fill_value=fill_value)
|
||||
|
||||
if not dropna:
|
||||
if isinstance(table.index, MultiIndex):
|
||||
m = MultiIndex.from_arrays(
|
||||
cartesian_product(table.index.levels), names=table.index.names
|
||||
)
|
||||
table = table.reindex(m, axis=0, fill_value=fill_value)
|
||||
|
||||
if isinstance(table.columns, MultiIndex):
|
||||
m = MultiIndex.from_arrays(
|
||||
cartesian_product(table.columns.levels), names=table.columns.names
|
||||
)
|
||||
table = table.reindex(m, axis=1, fill_value=fill_value)
|
||||
|
||||
if sort is True and isinstance(table, ABCDataFrame):
|
||||
table = table.sort_index(axis=1)
|
||||
|
||||
if fill_value is not None:
|
||||
table = table.fillna(fill_value)
|
||||
if aggfunc is len and not observed and lib.is_integer(fill_value):
|
||||
# TODO: can we avoid this? this used to be handled by
|
||||
# downcast="infer" in fillna
|
||||
table = table.astype(np.int64)
|
||||
|
||||
if margins:
|
||||
if dropna:
|
||||
data = data[data.notna().all(axis=1)]
|
||||
table = _add_margins(
|
||||
table,
|
||||
data,
|
||||
values,
|
||||
rows=index,
|
||||
cols=columns,
|
||||
aggfunc=aggfunc,
|
||||
observed=dropna,
|
||||
margins_name=margins_name,
|
||||
fill_value=fill_value,
|
||||
)
|
||||
|
||||
# discard the top level
|
||||
if values_passed and not values_multi and table.columns.nlevels > 1:
|
||||
table.columns = table.columns.droplevel(0)
|
||||
if len(index) == 0 and len(columns) > 0:
|
||||
table = table.T
|
||||
|
||||
# GH 15193 Make sure empty columns are removed if dropna=True
|
||||
if isinstance(table, ABCDataFrame) and dropna:
|
||||
table = table.dropna(how="all", axis=1)
|
||||
|
||||
return table
|
||||
|
||||
|
||||
def _add_margins(
|
||||
table: DataFrame | Series,
|
||||
data: DataFrame,
|
||||
values,
|
||||
rows,
|
||||
cols,
|
||||
aggfunc,
|
||||
observed: bool,
|
||||
margins_name: Hashable = "All",
|
||||
fill_value=None,
|
||||
):
|
||||
if not isinstance(margins_name, str):
|
||||
raise ValueError("margins_name argument must be a string")
|
||||
|
||||
msg = f'Conflicting name "{margins_name}" in margins'
|
||||
for level in table.index.names:
|
||||
if margins_name in table.index.get_level_values(level):
|
||||
raise ValueError(msg)
|
||||
|
||||
grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
|
||||
|
||||
if table.ndim == 2:
|
||||
# i.e. DataFrame
|
||||
for level in table.columns.names[1:]:
|
||||
if margins_name in table.columns.get_level_values(level):
|
||||
raise ValueError(msg)
|
||||
|
||||
key: str | tuple[str, ...]
|
||||
if len(rows) > 1:
|
||||
key = (margins_name,) + ("",) * (len(rows) - 1)
|
||||
else:
|
||||
key = margins_name
|
||||
|
||||
if not values and isinstance(table, ABCSeries):
|
||||
# If there are no values and the table is a series, then there is only
|
||||
# one column in the data. Compute grand margin and return it.
|
||||
return table._append(table._constructor({key: grand_margin[margins_name]}))
|
||||
|
||||
elif values:
|
||||
marginal_result_set = _generate_marginal_results(
|
||||
table, data, values, rows, cols, aggfunc, observed, margins_name
|
||||
)
|
||||
if not isinstance(marginal_result_set, tuple):
|
||||
return marginal_result_set
|
||||
result, margin_keys, row_margin = marginal_result_set
|
||||
else:
|
||||
# no values, and table is a DataFrame
|
||||
assert isinstance(table, ABCDataFrame)
|
||||
marginal_result_set = _generate_marginal_results_without_values(
|
||||
table, data, rows, cols, aggfunc, observed, margins_name
|
||||
)
|
||||
if not isinstance(marginal_result_set, tuple):
|
||||
return marginal_result_set
|
||||
result, margin_keys, row_margin = marginal_result_set
|
||||
|
||||
row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
|
||||
# populate grand margin
|
||||
for k in margin_keys:
|
||||
if isinstance(k, str):
|
||||
row_margin[k] = grand_margin[k]
|
||||
else:
|
||||
row_margin[k] = grand_margin[k[0]]
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
margin_dummy = DataFrame(row_margin, columns=Index([key])).T
|
||||
|
||||
row_names = result.index.names
|
||||
# check the result column and leave floats
|
||||
|
||||
for dtype in set(result.dtypes):
|
||||
if isinstance(dtype, ExtensionDtype):
|
||||
# Can hold NA already
|
||||
continue
|
||||
|
||||
cols = result.select_dtypes([dtype]).columns
|
||||
margin_dummy[cols] = margin_dummy[cols].apply(
|
||||
maybe_downcast_to_dtype, args=(dtype,)
|
||||
)
|
||||
result = result._append(margin_dummy)
|
||||
result.index.names = row_names
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _compute_grand_margin(
|
||||
data: DataFrame, values, aggfunc, margins_name: Hashable = "All"
|
||||
):
|
||||
if values:
|
||||
grand_margin = {}
|
||||
for k, v in data[values].items():
|
||||
try:
|
||||
if isinstance(aggfunc, str):
|
||||
grand_margin[k] = getattr(v, aggfunc)()
|
||||
elif isinstance(aggfunc, dict):
|
||||
if isinstance(aggfunc[k], str):
|
||||
grand_margin[k] = getattr(v, aggfunc[k])()
|
||||
else:
|
||||
grand_margin[k] = aggfunc[k](v)
|
||||
else:
|
||||
grand_margin[k] = aggfunc(v)
|
||||
except TypeError:
|
||||
pass
|
||||
return grand_margin
|
||||
else:
|
||||
return {margins_name: aggfunc(data.index)}
|
||||
|
||||
|
||||
def _generate_marginal_results(
|
||||
table,
|
||||
data: DataFrame,
|
||||
values,
|
||||
rows,
|
||||
cols,
|
||||
aggfunc,
|
||||
observed: bool,
|
||||
margins_name: Hashable = "All",
|
||||
):
|
||||
margin_keys: list | Index
|
||||
if len(cols) > 0:
|
||||
# need to "interleave" the margins
|
||||
table_pieces = []
|
||||
margin_keys = []
|
||||
|
||||
def _all_key(key):
|
||||
return (key, margins_name) + ("",) * (len(cols) - 1)
|
||||
|
||||
if len(rows) > 0:
|
||||
margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
|
||||
cat_axis = 1
|
||||
|
||||
for key, piece in table.T.groupby(level=0, observed=observed):
|
||||
piece = piece.T
|
||||
all_key = _all_key(key)
|
||||
|
||||
# we are going to mutate this, so need to copy!
|
||||
piece = piece.copy()
|
||||
piece[all_key] = margin[key]
|
||||
|
||||
table_pieces.append(piece)
|
||||
margin_keys.append(all_key)
|
||||
else:
|
||||
from pandas import DataFrame
|
||||
|
||||
cat_axis = 0
|
||||
for key, piece in table.groupby(level=0, observed=observed):
|
||||
if len(cols) > 1:
|
||||
all_key = _all_key(key)
|
||||
else:
|
||||
all_key = margins_name
|
||||
table_pieces.append(piece)
|
||||
# GH31016 this is to calculate margin for each group, and assign
|
||||
# corresponded key as index
|
||||
transformed_piece = DataFrame(piece.apply(aggfunc)).T
|
||||
if isinstance(piece.index, MultiIndex):
|
||||
# We are adding an empty level
|
||||
transformed_piece.index = MultiIndex.from_tuples(
|
||||
[all_key], names=piece.index.names + [None]
|
||||
)
|
||||
else:
|
||||
transformed_piece.index = Index([all_key], name=piece.index.name)
|
||||
|
||||
# append piece for margin into table_piece
|
||||
table_pieces.append(transformed_piece)
|
||||
margin_keys.append(all_key)
|
||||
|
||||
if not table_pieces:
|
||||
# GH 49240
|
||||
return table
|
||||
else:
|
||||
result = concat(table_pieces, axis=cat_axis)
|
||||
|
||||
if len(rows) == 0:
|
||||
return result
|
||||
else:
|
||||
result = table
|
||||
margin_keys = table.columns
|
||||
|
||||
if len(cols) > 0:
|
||||
row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc)
|
||||
row_margin = row_margin.stack(future_stack=True)
|
||||
|
||||
# GH#26568. Use names instead of indices in case of numeric names
|
||||
new_order_indices = [len(cols)] + list(range(len(cols)))
|
||||
new_order_names = [row_margin.index.names[i] for i in new_order_indices]
|
||||
row_margin.index = row_margin.index.reorder_levels(new_order_names)
|
||||
else:
|
||||
row_margin = data._constructor_sliced(np.nan, index=result.columns)
|
||||
|
||||
return result, margin_keys, row_margin
|
||||
|
||||
|
||||
def _generate_marginal_results_without_values(
|
||||
table: DataFrame,
|
||||
data: DataFrame,
|
||||
rows,
|
||||
cols,
|
||||
aggfunc,
|
||||
observed: bool,
|
||||
margins_name: Hashable = "All",
|
||||
):
|
||||
margin_keys: list | Index
|
||||
if len(cols) > 0:
|
||||
# need to "interleave" the margins
|
||||
margin_keys = []
|
||||
|
||||
def _all_key():
|
||||
if len(cols) == 1:
|
||||
return margins_name
|
||||
return (margins_name,) + ("",) * (len(cols) - 1)
|
||||
|
||||
if len(rows) > 0:
|
||||
margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc)
|
||||
all_key = _all_key()
|
||||
table[all_key] = margin
|
||||
result = table
|
||||
margin_keys.append(all_key)
|
||||
|
||||
else:
|
||||
margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc)
|
||||
all_key = _all_key()
|
||||
table[all_key] = margin
|
||||
result = table
|
||||
margin_keys.append(all_key)
|
||||
return result
|
||||
else:
|
||||
result = table
|
||||
margin_keys = table.columns
|
||||
|
||||
if len(cols):
|
||||
row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc)
|
||||
else:
|
||||
row_margin = Series(np.nan, index=result.columns)
|
||||
|
||||
return result, margin_keys, row_margin
|
||||
|
||||
|
||||
def _convert_by(by):
|
||||
if by is None:
|
||||
by = []
|
||||
elif (
|
||||
is_scalar(by)
|
||||
or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper))
|
||||
or callable(by)
|
||||
):
|
||||
by = [by]
|
||||
else:
|
||||
by = list(by)
|
||||
return by
|
||||
|
||||
|
||||
@Substitution("\ndata : DataFrame")
|
||||
@Appender(_shared_docs["pivot"], indents=1)
|
||||
def pivot(
|
||||
data: DataFrame,
|
||||
*,
|
||||
columns: IndexLabel,
|
||||
index: IndexLabel | lib.NoDefault = lib.no_default,
|
||||
values: IndexLabel | lib.NoDefault = lib.no_default,
|
||||
) -> DataFrame:
|
||||
columns_listlike = com.convert_to_list_like(columns)
|
||||
|
||||
# If columns is None we will create a MultiIndex level with None as name
|
||||
# which might cause duplicated names because None is the default for
|
||||
# level names
|
||||
data = data.copy(deep=False)
|
||||
data.index = data.index.copy()
|
||||
data.index.names = [
|
||||
name if name is not None else lib.no_default for name in data.index.names
|
||||
]
|
||||
|
||||
indexed: DataFrame | Series
|
||||
if values is lib.no_default:
|
||||
if index is not lib.no_default:
|
||||
cols = com.convert_to_list_like(index)
|
||||
else:
|
||||
cols = []
|
||||
|
||||
append = index is lib.no_default
|
||||
# error: Unsupported operand types for + ("List[Any]" and "ExtensionArray")
|
||||
# error: Unsupported left operand type for + ("ExtensionArray")
|
||||
indexed = data.set_index(
|
||||
cols + columns_listlike, append=append # type: ignore[operator]
|
||||
)
|
||||
else:
|
||||
index_list: list[Index] | list[Series]
|
||||
if index is lib.no_default:
|
||||
if isinstance(data.index, MultiIndex):
|
||||
# GH 23955
|
||||
index_list = [
|
||||
data.index.get_level_values(i) for i in range(data.index.nlevels)
|
||||
]
|
||||
else:
|
||||
index_list = [
|
||||
data._constructor_sliced(data.index, name=data.index.name)
|
||||
]
|
||||
else:
|
||||
index_list = [data[idx] for idx in com.convert_to_list_like(index)]
|
||||
|
||||
data_columns = [data[col] for col in columns_listlike]
|
||||
index_list.extend(data_columns)
|
||||
multiindex = MultiIndex.from_arrays(index_list)
|
||||
|
||||
if is_list_like(values) and not isinstance(values, tuple):
|
||||
# Exclude tuple because it is seen as a single column name
|
||||
values = cast(Sequence[Hashable], values)
|
||||
indexed = data._constructor(
|
||||
data[values]._values, index=multiindex, columns=values
|
||||
)
|
||||
else:
|
||||
indexed = data._constructor_sliced(data[values]._values, index=multiindex)
|
||||
# error: Argument 1 to "unstack" of "DataFrame" has incompatible type "Union
|
||||
# [List[Any], ExtensionArray, ndarray[Any, Any], Index, Series]"; expected
|
||||
# "Hashable"
|
||||
result = indexed.unstack(columns_listlike) # type: ignore[arg-type]
|
||||
result.index.names = [
|
||||
name if name is not lib.no_default else None for name in result.index.names
|
||||
]
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def crosstab(
|
||||
index,
|
||||
columns,
|
||||
values=None,
|
||||
rownames=None,
|
||||
colnames=None,
|
||||
aggfunc=None,
|
||||
margins: bool = False,
|
||||
margins_name: Hashable = "All",
|
||||
dropna: bool = True,
|
||||
normalize: bool | Literal[0, 1, "all", "index", "columns"] = False,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Compute a simple cross tabulation of two (or more) factors.
|
||||
|
||||
By default, computes a frequency table of the factors unless an
|
||||
array of values and an aggregation function are passed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index : array-like, Series, or list of arrays/Series
|
||||
Values to group by in the rows.
|
||||
columns : array-like, Series, or list of arrays/Series
|
||||
Values to group by in the columns.
|
||||
values : array-like, optional
|
||||
Array of values to aggregate according to the factors.
|
||||
Requires `aggfunc` be specified.
|
||||
rownames : sequence, default None
|
||||
If passed, must match number of row arrays passed.
|
||||
colnames : sequence, default None
|
||||
If passed, must match number of column arrays passed.
|
||||
aggfunc : function, optional
|
||||
If specified, requires `values` be specified as well.
|
||||
margins : bool, default False
|
||||
Add row/column margins (subtotals).
|
||||
margins_name : str, default 'All'
|
||||
Name of the row/column that will contain the totals
|
||||
when margins is True.
|
||||
dropna : bool, default True
|
||||
Do not include columns whose entries are all NaN.
|
||||
normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False
|
||||
Normalize by dividing all values by the sum of values.
|
||||
|
||||
- If passed 'all' or `True`, will normalize over all values.
|
||||
- If passed 'index' will normalize over each row.
|
||||
- If passed 'columns' will normalize over each column.
|
||||
- If margins is `True`, will also normalize margin values.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Cross tabulation of the data.
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.pivot : Reshape data based on column values.
|
||||
pivot_table : Create a pivot table as a DataFrame.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Any Series passed will have their name attributes used unless row or column
|
||||
names for the cross-tabulation are specified.
|
||||
|
||||
Any input passed containing Categorical data will have **all** of its
|
||||
categories included in the cross-tabulation, even if the actual data does
|
||||
not contain any instances of a particular category.
|
||||
|
||||
In the event that there aren't overlapping indexes an empty DataFrame will
|
||||
be returned.
|
||||
|
||||
Reference :ref:`the user guide <reshaping.crosstabulations>` for more examples.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
|
||||
... "bar", "bar", "foo", "foo", "foo"], dtype=object)
|
||||
>>> b = np.array(["one", "one", "one", "two", "one", "one",
|
||||
... "one", "two", "two", "two", "one"], dtype=object)
|
||||
>>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
|
||||
... "shiny", "dull", "shiny", "shiny", "shiny"],
|
||||
... dtype=object)
|
||||
>>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
|
||||
b one two
|
||||
c dull shiny dull shiny
|
||||
a
|
||||
bar 1 2 1 0
|
||||
foo 2 2 1 2
|
||||
|
||||
Here 'c' and 'f' are not represented in the data and will not be
|
||||
shown in the output because dropna is True by default. Set
|
||||
dropna=False to preserve categories with no data.
|
||||
|
||||
>>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
|
||||
>>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
|
||||
>>> pd.crosstab(foo, bar)
|
||||
col_0 d e
|
||||
row_0
|
||||
a 1 0
|
||||
b 0 1
|
||||
>>> pd.crosstab(foo, bar, dropna=False)
|
||||
col_0 d e f
|
||||
row_0
|
||||
a 1 0 0
|
||||
b 0 1 0
|
||||
c 0 0 0
|
||||
"""
|
||||
if values is None and aggfunc is not None:
|
||||
raise ValueError("aggfunc cannot be used without values.")
|
||||
|
||||
if values is not None and aggfunc is None:
|
||||
raise ValueError("values cannot be used without an aggfunc.")
|
||||
|
||||
if not is_nested_list_like(index):
|
||||
index = [index]
|
||||
if not is_nested_list_like(columns):
|
||||
columns = [columns]
|
||||
|
||||
common_idx = None
|
||||
pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))]
|
||||
if pass_objs:
|
||||
common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False)
|
||||
|
||||
rownames = _get_names(index, rownames, prefix="row")
|
||||
colnames = _get_names(columns, colnames, prefix="col")
|
||||
|
||||
# duplicate names mapped to unique names for pivot op
|
||||
(
|
||||
rownames_mapper,
|
||||
unique_rownames,
|
||||
colnames_mapper,
|
||||
unique_colnames,
|
||||
) = _build_names_mapper(rownames, colnames)
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
data = {
|
||||
**dict(zip(unique_rownames, index)),
|
||||
**dict(zip(unique_colnames, columns)),
|
||||
}
|
||||
df = DataFrame(data, index=common_idx)
|
||||
|
||||
if values is None:
|
||||
df["__dummy__"] = 0
|
||||
kwargs = {"aggfunc": len, "fill_value": 0}
|
||||
else:
|
||||
df["__dummy__"] = values
|
||||
kwargs = {"aggfunc": aggfunc}
|
||||
|
||||
# error: Argument 7 to "pivot_table" of "DataFrame" has incompatible type
|
||||
# "**Dict[str, object]"; expected "Union[...]"
|
||||
table = df.pivot_table(
|
||||
"__dummy__",
|
||||
index=unique_rownames,
|
||||
columns=unique_colnames,
|
||||
margins=margins,
|
||||
margins_name=margins_name,
|
||||
dropna=dropna,
|
||||
observed=False,
|
||||
**kwargs, # type: ignore[arg-type]
|
||||
)
|
||||
|
||||
# Post-process
|
||||
if normalize is not False:
|
||||
table = _normalize(
|
||||
table, normalize=normalize, margins=margins, margins_name=margins_name
|
||||
)
|
||||
|
||||
table = table.rename_axis(index=rownames_mapper, axis=0)
|
||||
table = table.rename_axis(columns=colnames_mapper, axis=1)
|
||||
|
||||
return table
|
||||
|
||||
|
||||
def _normalize(
|
||||
table: DataFrame, normalize, margins: bool, margins_name: Hashable = "All"
|
||||
) -> DataFrame:
|
||||
if not isinstance(normalize, (bool, str)):
|
||||
axis_subs = {0: "index", 1: "columns"}
|
||||
try:
|
||||
normalize = axis_subs[normalize]
|
||||
except KeyError as err:
|
||||
raise ValueError("Not a valid normalize argument") from err
|
||||
|
||||
if margins is False:
|
||||
# Actual Normalizations
|
||||
normalizers: dict[bool | str, Callable] = {
|
||||
"all": lambda x: x / x.sum(axis=1).sum(axis=0),
|
||||
"columns": lambda x: x / x.sum(),
|
||||
"index": lambda x: x.div(x.sum(axis=1), axis=0),
|
||||
}
|
||||
|
||||
normalizers[True] = normalizers["all"]
|
||||
|
||||
try:
|
||||
f = normalizers[normalize]
|
||||
except KeyError as err:
|
||||
raise ValueError("Not a valid normalize argument") from err
|
||||
|
||||
table = f(table)
|
||||
table = table.fillna(0)
|
||||
|
||||
elif margins is True:
|
||||
# keep index and column of pivoted table
|
||||
table_index = table.index
|
||||
table_columns = table.columns
|
||||
last_ind_or_col = table.iloc[-1, :].name
|
||||
|
||||
# check if margin name is not in (for MI cases) and not equal to last
|
||||
# index/column and save the column and index margin
|
||||
if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col):
|
||||
raise ValueError(f"{margins_name} not in pivoted DataFrame")
|
||||
column_margin = table.iloc[:-1, -1]
|
||||
index_margin = table.iloc[-1, :-1]
|
||||
|
||||
# keep the core table
|
||||
table = table.iloc[:-1, :-1]
|
||||
|
||||
# Normalize core
|
||||
table = _normalize(table, normalize=normalize, margins=False)
|
||||
|
||||
# Fix Margins
|
||||
if normalize == "columns":
|
||||
column_margin = column_margin / column_margin.sum()
|
||||
table = concat([table, column_margin], axis=1)
|
||||
table = table.fillna(0)
|
||||
table.columns = table_columns
|
||||
|
||||
elif normalize == "index":
|
||||
index_margin = index_margin / index_margin.sum()
|
||||
table = table._append(index_margin)
|
||||
table = table.fillna(0)
|
||||
table.index = table_index
|
||||
|
||||
elif normalize == "all" or normalize is True:
|
||||
column_margin = column_margin / column_margin.sum()
|
||||
index_margin = index_margin / index_margin.sum()
|
||||
index_margin.loc[margins_name] = 1
|
||||
table = concat([table, column_margin], axis=1)
|
||||
table = table._append(index_margin)
|
||||
|
||||
table = table.fillna(0)
|
||||
table.index = table_index
|
||||
table.columns = table_columns
|
||||
|
||||
else:
|
||||
raise ValueError("Not a valid normalize argument")
|
||||
|
||||
else:
|
||||
raise ValueError("Not a valid margins argument")
|
||||
|
||||
return table
|
||||
|
||||
|
||||
def _get_names(arrs, names, prefix: str = "row"):
|
||||
if names is None:
|
||||
names = []
|
||||
for i, arr in enumerate(arrs):
|
||||
if isinstance(arr, ABCSeries) and arr.name is not None:
|
||||
names.append(arr.name)
|
||||
else:
|
||||
names.append(f"{prefix}_{i}")
|
||||
else:
|
||||
if len(names) != len(arrs):
|
||||
raise AssertionError("arrays and names must have the same length")
|
||||
if not isinstance(names, list):
|
||||
names = list(names)
|
||||
|
||||
return names
|
||||
|
||||
|
||||
def _build_names_mapper(
|
||||
rownames: list[str], colnames: list[str]
|
||||
) -> tuple[dict[str, str], list[str], dict[str, str], list[str]]:
|
||||
"""
|
||||
Given the names of a DataFrame's rows and columns, returns a set of unique row
|
||||
and column names and mappers that convert to original names.
|
||||
|
||||
A row or column name is replaced if it is duplicate among the rows of the inputs,
|
||||
among the columns of the inputs or between the rows and the columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
rownames: list[str]
|
||||
colnames: list[str]
|
||||
|
||||
Returns
|
||||
-------
|
||||
Tuple(Dict[str, str], List[str], Dict[str, str], List[str])
|
||||
|
||||
rownames_mapper: dict[str, str]
|
||||
a dictionary with new row names as keys and original rownames as values
|
||||
unique_rownames: list[str]
|
||||
a list of rownames with duplicate names replaced by dummy names
|
||||
colnames_mapper: dict[str, str]
|
||||
a dictionary with new column names as keys and original column names as values
|
||||
unique_colnames: list[str]
|
||||
a list of column names with duplicate names replaced by dummy names
|
||||
|
||||
"""
|
||||
|
||||
def get_duplicates(names):
|
||||
seen: set = set()
|
||||
return {name for name in names if name not in seen}
|
||||
|
||||
shared_names = set(rownames).intersection(set(colnames))
|
||||
dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names
|
||||
|
||||
rownames_mapper = {
|
||||
f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names
|
||||
}
|
||||
unique_rownames = [
|
||||
f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames)
|
||||
]
|
||||
|
||||
colnames_mapper = {
|
||||
f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names
|
||||
}
|
||||
unique_colnames = [
|
||||
f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames)
|
||||
]
|
||||
|
||||
return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames
|
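End of the pivot.py hunk. For orientation only (not part of the committed file), a small hedged sketch that exercises the helpers defined above: margins=True runs through _add_margins, and normalize runs through _normalize. Data and column names are invented:

import pandas as pd

df = pd.DataFrame(
    {
        "city": ["NY", "NY", "SF", "SF"],
        "kind": ["a", "b", "a", "b"],
        "value": [1.0, 2.0, 3.0, 4.0],
    }
)

# pivot_table with margins adds an "All" row/column via _add_margins.
print(
    pd.pivot_table(
        df, values="value", index="city", columns="kind", aggfunc="sum", margins=True
    )
)

# crosstab with normalize="index" divides each row by its sum via _normalize.
print(pd.crosstab(df["city"], df["kind"], normalize="index"))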
989
lib/python3.11/site-packages/pandas/core/reshape/reshape.py
Normal file
@ -0,0 +1,989 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import itertools
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
import pandas._libs.reshape as libreshape
|
||||
from pandas.errors import PerformanceWarning
|
||||
from pandas.util._decorators import cache_readonly
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.cast import (
|
||||
find_common_type,
|
||||
maybe_promote,
|
||||
)
|
||||
from pandas.core.dtypes.common import (
|
||||
ensure_platform_int,
|
||||
is_1d_only_ea_dtype,
|
||||
is_integer,
|
||||
needs_i8_conversion,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import ExtensionDtype
|
||||
from pandas.core.dtypes.missing import notna
|
||||
|
||||
import pandas.core.algorithms as algos
|
||||
from pandas.core.algorithms import (
|
||||
factorize,
|
||||
unique,
|
||||
)
|
||||
from pandas.core.arrays.categorical import factorize_from_iterable
|
||||
from pandas.core.construction import ensure_wrapped_if_datetimelike
|
||||
from pandas.core.frame import DataFrame
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
RangeIndex,
|
||||
)
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.series import Series
|
||||
from pandas.core.sorting import (
|
||||
compress_group_index,
|
||||
decons_obs_group_ids,
|
||||
get_compressed_ids,
|
||||
get_group_index,
|
||||
get_group_index_sorter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
Level,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas.core.arrays import ExtensionArray
|
||||
from pandas.core.indexes.frozen import FrozenList
|
||||
|
||||
|
||||
class _Unstacker:
|
||||
"""
|
||||
Helper class to unstack data / pivot with multi-level index
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index : MultiIndex
|
||||
level : int or str, default last level
|
||||
Level to "unstack". Accepts a name for the level.
|
||||
fill_value : scalar, optional
|
||||
Default value to fill in missing values if subgroups do not have the
|
||||
same set of labels. By default, missing values will be replaced with
|
||||
the default fill value for that data type, NaN for float, NaT for
|
||||
datetimelike, etc. For integer types, by default data will converted to
|
||||
float and missing values will be set to NaN.
|
||||
constructor : object
|
||||
Pandas ``DataFrame`` or subclass used to create unstacked
|
||||
response. If None, DataFrame will be used.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
|
||||
... ('two', 'a'), ('two', 'b')])
|
||||
>>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
|
||||
>>> s
|
||||
one a 1
|
||||
b 2
|
||||
two a 3
|
||||
b 4
|
||||
dtype: int64
|
||||
|
||||
>>> s.unstack(level=-1)
|
||||
a b
|
||||
one 1 2
|
||||
two 3 4
|
||||
|
||||
>>> s.unstack(level=0)
|
||||
one two
|
||||
a 1 3
|
||||
b 2 4
|
||||
|
||||
Returns
|
||||
-------
|
||||
unstacked : DataFrame
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, index: MultiIndex, level: Level, constructor, sort: bool = True
|
||||
) -> None:
|
||||
self.constructor = constructor
|
||||
self.sort = sort
|
||||
|
||||
self.index = index.remove_unused_levels()
|
||||
|
||||
self.level = self.index._get_level_number(level)
|
||||
|
||||
# when index includes `nan`, need to lift levels/strides by 1
|
||||
self.lift = 1 if -1 in self.index.codes[self.level] else 0
|
||||
|
||||
# Note: the "pop" below alters these in-place.
|
||||
self.new_index_levels = list(self.index.levels)
|
||||
self.new_index_names = list(self.index.names)
|
||||
|
||||
self.removed_name = self.new_index_names.pop(self.level)
|
||||
self.removed_level = self.new_index_levels.pop(self.level)
|
||||
self.removed_level_full = index.levels[self.level]
|
||||
if not self.sort:
|
||||
unique_codes = unique(self.index.codes[self.level])
|
||||
self.removed_level = self.removed_level.take(unique_codes)
|
||||
self.removed_level_full = self.removed_level_full.take(unique_codes)
|
||||
|
||||
# Bug fix GH 20601
|
||||
# If the data frame is too big, the number of unique index combinations
# will cause int32 overflow on windows environments.
# We want to check and raise a warning before this happens
|
||||
num_rows = np.max([index_level.size for index_level in self.new_index_levels])
|
||||
num_columns = self.removed_level.size
|
||||
|
||||
# GH20601: This forces an overflow if the number of cells is too high.
|
||||
num_cells = num_rows * num_columns
|
||||
|
||||
# GH 26314: Previous ValueError raised was too restrictive for many users.
|
||||
if num_cells > np.iinfo(np.int32).max:
|
||||
warnings.warn(
|
||||
f"The following operation may generate {num_cells} cells "
|
||||
f"in the resulting pandas object.",
|
||||
PerformanceWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
|
||||
self._make_selectors()
|
||||
|
||||
@cache_readonly
|
||||
def _indexer_and_to_sort(
|
||||
self,
|
||||
) -> tuple[
|
||||
npt.NDArray[np.intp],
|
||||
list[np.ndarray], # each has _some_ signed integer dtype
|
||||
]:
|
||||
v = self.level
|
||||
|
||||
codes = list(self.index.codes)
|
||||
levs = list(self.index.levels)
|
||||
to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
|
||||
sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
|
||||
|
||||
comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
|
||||
ngroups = len(obs_ids)
|
||||
|
||||
indexer = get_group_index_sorter(comp_index, ngroups)
|
||||
return indexer, to_sort
|
||||
|
||||
@cache_readonly
|
||||
def sorted_labels(self) -> list[np.ndarray]:
|
||||
indexer, to_sort = self._indexer_and_to_sort
|
||||
if self.sort:
|
||||
return [line.take(indexer) for line in to_sort]
|
||||
return to_sort
|
||||
|
||||
def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
|
||||
if self.sort:
|
||||
indexer, _ = self._indexer_and_to_sort
|
||||
|
||||
sorted_values = algos.take_nd(values, indexer, axis=0)
|
||||
return sorted_values
|
||||
return values
|
||||
|
||||
def _make_selectors(self):
|
||||
new_levels = self.new_index_levels
|
||||
|
||||
# make the mask
|
||||
remaining_labels = self.sorted_labels[:-1]
|
||||
level_sizes = tuple(len(x) for x in new_levels)
|
||||
|
||||
comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
|
||||
ngroups = len(obs_ids)
|
||||
|
||||
comp_index = ensure_platform_int(comp_index)
|
||||
stride = self.index.levshape[self.level] + self.lift
|
||||
self.full_shape = ngroups, stride
|
||||
|
||||
selector = self.sorted_labels[-1] + stride * comp_index + self.lift
|
||||
mask = np.zeros(np.prod(self.full_shape), dtype=bool)
|
||||
mask.put(selector, True)
|
||||
|
||||
if mask.sum() < len(self.index):
|
||||
raise ValueError("Index contains duplicate entries, cannot reshape")
|
||||
|
||||
self.group_index = comp_index
|
||||
self.mask = mask
|
||||
if self.sort:
|
||||
self.compressor = comp_index.searchsorted(np.arange(ngroups))
|
||||
else:
|
||||
self.compressor = np.sort(np.unique(comp_index, return_index=True)[1])
|
||||
|
||||
@cache_readonly
|
||||
def mask_all(self) -> bool:
|
||||
return bool(self.mask.all())
|
||||
|
||||
@cache_readonly
|
||||
def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
|
||||
# We cache this for reuse in ExtensionBlock._unstack
|
||||
dummy_arr = np.arange(len(self.index), dtype=np.intp)
|
||||
new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
|
||||
return new_values, mask.any(0)
|
||||
# TODO: in all tests we have mask.any(0).all(); can we rely on that?
|
||||
|
||||
def get_result(self, values, value_columns, fill_value) -> DataFrame:
|
||||
if values.ndim == 1:
|
||||
values = values[:, np.newaxis]
|
||||
|
||||
if value_columns is None and values.shape[1] != 1: # pragma: no cover
|
||||
raise ValueError("must pass column labels for multi-column data")
|
||||
|
||||
values, _ = self.get_new_values(values, fill_value)
|
||||
columns = self.get_new_columns(value_columns)
|
||||
index = self.new_index
|
||||
|
||||
return self.constructor(
|
||||
values, index=index, columns=columns, dtype=values.dtype
|
||||
)
|
||||
|
||||
def get_new_values(self, values, fill_value=None):
|
||||
if values.ndim == 1:
|
||||
values = values[:, np.newaxis]
|
||||
|
||||
sorted_values = self._make_sorted_values(values)
|
||||
|
||||
# place the values
|
||||
length, width = self.full_shape
|
||||
stride = values.shape[1]
|
||||
result_width = width * stride
|
||||
result_shape = (length, result_width)
|
||||
mask = self.mask
|
||||
mask_all = self.mask_all
|
||||
|
||||
# we can simply reshape if we don't have a mask
|
||||
if mask_all and len(values):
|
||||
# TODO: Under what circumstances can we rely on sorted_values
|
||||
# matching values? When that holds, we can slice instead
|
||||
# of take (in particular for EAs)
|
||||
new_values = (
|
||||
sorted_values.reshape(length, width, stride)
|
||||
.swapaxes(1, 2)
|
||||
.reshape(result_shape)
|
||||
)
|
||||
new_mask = np.ones(result_shape, dtype=bool)
|
||||
return new_values, new_mask
|
||||
|
||||
dtype = values.dtype
|
||||
|
||||
# if our mask is all True, then we can use our existing dtype
|
||||
if mask_all:
|
||||
dtype = values.dtype
|
||||
new_values = np.empty(result_shape, dtype=dtype)
|
||||
else:
|
||||
if isinstance(dtype, ExtensionDtype):
|
||||
# GH#41875
|
||||
# We are assuming that fill_value can be held by this dtype,
|
||||
# unlike the non-EA case that promotes.
|
||||
cls = dtype.construct_array_type()
|
||||
new_values = cls._empty(result_shape, dtype=dtype)
|
||||
new_values[:] = fill_value
|
||||
else:
|
||||
dtype, fill_value = maybe_promote(dtype, fill_value)
|
||||
new_values = np.empty(result_shape, dtype=dtype)
|
||||
new_values.fill(fill_value)
|
||||
|
||||
name = dtype.name
|
||||
new_mask = np.zeros(result_shape, dtype=bool)
|
||||
|
||||
# we need to convert to a basic dtype
|
||||
# and possibly coerce an input to our output dtype
|
||||
# e.g. ints -> floats
|
||||
if needs_i8_conversion(values.dtype):
|
||||
sorted_values = sorted_values.view("i8")
|
||||
new_values = new_values.view("i8")
|
||||
else:
|
||||
sorted_values = sorted_values.astype(name, copy=False)
|
||||
|
||||
# fill in our values & mask
|
||||
libreshape.unstack(
|
||||
sorted_values,
|
||||
mask.view("u1"),
|
||||
stride,
|
||||
length,
|
||||
width,
|
||||
new_values,
|
||||
new_mask.view("u1"),
|
||||
)
|
||||
|
||||
# reconstruct dtype if needed
|
||||
if needs_i8_conversion(values.dtype):
|
||||
# view as datetime64 so we can wrap in DatetimeArray and use
|
||||
# DTA's view method
|
||||
new_values = new_values.view("M8[ns]")
|
||||
new_values = ensure_wrapped_if_datetimelike(new_values)
|
||||
new_values = new_values.view(values.dtype)
|
||||
|
||||
return new_values, new_mask
|
||||
|
||||
def get_new_columns(self, value_columns: Index | None):
|
||||
if value_columns is None:
|
||||
if self.lift == 0:
|
||||
return self.removed_level._rename(name=self.removed_name)
|
||||
|
||||
lev = self.removed_level.insert(0, item=self.removed_level._na_value)
|
||||
return lev.rename(self.removed_name)
|
||||
|
||||
stride = len(self.removed_level) + self.lift
|
||||
width = len(value_columns)
|
||||
propagator = np.repeat(np.arange(width), stride)
|
||||
|
||||
new_levels: FrozenList | list[Index]
|
||||
|
||||
if isinstance(value_columns, MultiIndex):
|
||||
# error: Cannot determine type of "__add__" [has-type]
|
||||
new_levels = value_columns.levels + ( # type: ignore[has-type]
|
||||
self.removed_level_full,
|
||||
)
|
||||
new_names = value_columns.names + (self.removed_name,)
|
||||
|
||||
new_codes = [lab.take(propagator) for lab in value_columns.codes]
|
||||
else:
|
||||
new_levels = [
|
||||
value_columns,
|
||||
self.removed_level_full,
|
||||
]
|
||||
new_names = [value_columns.name, self.removed_name]
|
||||
new_codes = [propagator]
|
||||
|
||||
repeater = self._repeater
|
||||
|
||||
# The entire level is then just a repetition of the single chunk:
|
||||
new_codes.append(np.tile(repeater, width))
|
||||
return MultiIndex(
|
||||
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
|
||||
)
|
||||
|
||||
@cache_readonly
|
||||
def _repeater(self) -> np.ndarray:
|
||||
# The two indices differ only if the unstacked level had unused items:
|
||||
if len(self.removed_level_full) != len(self.removed_level):
|
||||
# In this case, we remap the new codes to the original level:
|
||||
repeater = self.removed_level_full.get_indexer(self.removed_level)
|
||||
if self.lift:
|
||||
repeater = np.insert(repeater, 0, -1)
|
||||
else:
|
||||
# Otherwise, we just use each level item exactly once:
|
||||
stride = len(self.removed_level) + self.lift
|
||||
repeater = np.arange(stride) - self.lift
|
||||
|
||||
return repeater
|
||||
|
||||
@cache_readonly
|
||||
def new_index(self) -> MultiIndex:
|
||||
# Does not depend on values or value_columns
|
||||
result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
|
||||
|
||||
# construct the new index
|
||||
if len(self.new_index_levels) == 1:
|
||||
level, level_codes = self.new_index_levels[0], result_codes[0]
|
||||
if (level_codes == -1).any():
|
||||
level = level.insert(len(level), level._na_value)
|
||||
return level.take(level_codes).rename(self.new_index_names[0])
|
||||
|
||||
return MultiIndex(
|
||||
levels=self.new_index_levels,
|
||||
codes=result_codes,
|
||||
names=self.new_index_names,
|
||||
verify_integrity=False,
|
||||
)
|
||||
|
||||
|
||||
def _unstack_multiple(
|
||||
data: Series | DataFrame, clocs, fill_value=None, sort: bool = True
|
||||
):
|
||||
if len(clocs) == 0:
|
||||
return data
|
||||
|
||||
# NOTE: This doesn't deal with hierarchical columns yet
|
||||
|
||||
index = data.index
|
||||
index = cast(MultiIndex, index) # caller is responsible for checking
|
||||
|
||||
# GH 19966 Make sure if MultiIndexed index has tuple name, they will be
|
||||
# recognised as a whole
|
||||
if clocs in index.names:
|
||||
clocs = [clocs]
|
||||
clocs = [index._get_level_number(i) for i in clocs]
|
||||
|
||||
rlocs = [i for i in range(index.nlevels) if i not in clocs]
|
||||
|
||||
clevels = [index.levels[i] for i in clocs]
|
||||
ccodes = [index.codes[i] for i in clocs]
|
||||
cnames = [index.names[i] for i in clocs]
|
||||
rlevels = [index.levels[i] for i in rlocs]
|
||||
rcodes = [index.codes[i] for i in rlocs]
|
||||
rnames = [index.names[i] for i in rlocs]
|
||||
|
||||
shape = tuple(len(x) for x in clevels)
|
||||
group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
|
||||
|
||||
comp_ids, obs_ids = compress_group_index(group_index, sort=False)
|
||||
recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
|
||||
|
||||
if not rlocs:
|
||||
# Everything is in clocs, so the dummy df has a regular index
|
||||
dummy_index = Index(obs_ids, name="__placeholder__")
|
||||
else:
|
||||
dummy_index = MultiIndex(
|
||||
levels=rlevels + [obs_ids],
|
||||
codes=rcodes + [comp_ids],
|
||||
names=rnames + ["__placeholder__"],
|
||||
verify_integrity=False,
|
||||
)
|
||||
|
||||
if isinstance(data, Series):
|
||||
dummy = data.copy()
|
||||
dummy.index = dummy_index
|
||||
|
||||
unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort)
|
||||
new_levels = clevels
|
||||
new_names = cnames
|
||||
new_codes = recons_codes
|
||||
else:
|
||||
if isinstance(data.columns, MultiIndex):
|
||||
result = data
|
||||
while clocs:
|
||||
val = clocs.pop(0)
|
||||
result = result.unstack(val, fill_value=fill_value, sort=sort)
|
||||
clocs = [v if v < val else v - 1 for v in clocs]
|
||||
|
||||
return result
|
||||
|
||||
# GH#42579 deep=False to avoid consolidating
|
||||
dummy_df = data.copy(deep=False)
|
||||
dummy_df.index = dummy_index
|
||||
|
||||
unstacked = dummy_df.unstack(
|
||||
"__placeholder__", fill_value=fill_value, sort=sort
|
||||
)
|
||||
if isinstance(unstacked, Series):
|
||||
unstcols = unstacked.index
|
||||
else:
|
||||
unstcols = unstacked.columns
|
||||
assert isinstance(unstcols, MultiIndex) # for mypy
|
||||
new_levels = [unstcols.levels[0]] + clevels
|
||||
new_names = [data.columns.name] + cnames
|
||||
|
||||
new_codes = [unstcols.codes[0]]
|
||||
new_codes.extend(rec.take(unstcols.codes[-1]) for rec in recons_codes)
|
||||
|
||||
new_columns = MultiIndex(
|
||||
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
|
||||
)
|
||||
|
||||
if isinstance(unstacked, Series):
|
||||
unstacked.index = new_columns
|
||||
else:
|
||||
unstacked.columns = new_columns
|
||||
|
||||
return unstacked
|
||||
|
||||
|
||||
def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True):
|
||||
if isinstance(level, (tuple, list)):
|
||||
if len(level) != 1:
|
||||
# _unstack_multiple only handles MultiIndexes,
|
||||
# and isn't needed for a single level
|
||||
return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort)
|
||||
else:
|
||||
level = level[0]
|
||||
|
||||
if not is_integer(level) and not level == "__placeholder__":
|
||||
# check if level is valid in case of regular index
|
||||
obj.index._get_level_number(level)
|
||||
|
||||
if isinstance(obj, DataFrame):
|
||||
if isinstance(obj.index, MultiIndex):
|
||||
return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
|
||||
else:
|
||||
return obj.T.stack(future_stack=True)
|
||||
elif not isinstance(obj.index, MultiIndex):
|
||||
# GH 36113
|
||||
# Give nicer error messages when unstack a Series whose
|
||||
# Index is not a MultiIndex.
|
||||
raise ValueError(
|
||||
f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
|
||||
)
|
||||
else:
|
||||
if is_1d_only_ea_dtype(obj.dtype):
|
||||
return _unstack_extension_series(obj, level, fill_value, sort=sort)
|
||||
unstacker = _Unstacker(
|
||||
obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
|
||||
)
|
||||
return unstacker.get_result(
|
||||
obj._values, value_columns=None, fill_value=fill_value
|
||||
)
|
||||
|
||||
|
||||
def _unstack_frame(
|
||||
obj: DataFrame, level, fill_value=None, sort: bool = True
|
||||
) -> DataFrame:
|
||||
assert isinstance(obj.index, MultiIndex) # checked by caller
|
||||
unstacker = _Unstacker(
|
||||
obj.index, level=level, constructor=obj._constructor, sort=sort
|
||||
)
|
||||
|
||||
if not obj._can_fast_transpose:
|
||||
mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
|
||||
return obj._constructor_from_mgr(mgr, axes=mgr.axes)
|
||||
else:
|
||||
return unstacker.get_result(
|
||||
obj._values, value_columns=obj.columns, fill_value=fill_value
|
||||
)


def _unstack_extension_series(
    series: Series, level, fill_value, sort: bool
) -> DataFrame:
    """
    Unstack an ExtensionArray-backed Series.

    The ExtensionDtype is preserved.

    Parameters
    ----------
    series : Series
        A Series with an ExtensionArray for values
    level : Any
        The level name or number.
    fill_value : Any
        The user-level (not physical storage) fill value to use for
        missing values introduced by the reshape. Passed to
        ``series.values.take``.
    sort : bool
        Whether to sort the resulting MultiIndex levels.

    Returns
    -------
    DataFrame
        Each column of the DataFrame will have the same dtype as
        the input Series.
    """
    # Defer to the logic in ExtensionBlock._unstack
    df = series.to_frame()
    result = df.unstack(level=level, fill_value=fill_value, sort=sort)

    # equiv: result.droplevel(level=0, axis=1)
    #  but this avoids an extra copy
    result.columns = result.columns._drop_level_numbers([0])
    return result
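

# Illustrative sketch, not part of the pandas source: the EA path above is what
# lets an unstacked Series keep its extension dtype. Assuming the nullable
# "Int64" dtype, every column of the result stays Int64 instead of falling back
# to float64/object. The helper name below is hypothetical.
def _demo_unstack_preserves_extension_dtype():
    import pandas as pd

    s = pd.Series(
        pd.array([1, 2, 3, 4], dtype="Int64"),
        index=pd.MultiIndex.from_product([["a", "b"], ["x", "y"]]),
    )
    result = s.unstack()
    # Each column keeps the nullable Int64 dtype.
    assert all(str(dtype) == "Int64" for dtype in result.dtypes)
    return result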


def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series or DataFrame
    """

    def stack_factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(
            frame, level_num=level_num, dropna=dropna, sort=sort
        )
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = stack_factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
    else:
        levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(
            levels=levels,
            codes=codes,
            names=[frame.index.name, frame.columns.name],
            verify_integrity=False,
        )

    new_values: ArrayLike
    if not frame.empty and frame._is_homogeneous_type:
        # For homogeneous EAs, frame._values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes._values)
        dtype = dtypes[0]

        if isinstance(dtype, ExtensionDtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.items()]
            )
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame._values.ravel()

    else:
        # non-homogeneous
        new_values = frame._values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)
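

# Illustrative sketch, not part of the pandas source: for flat columns the
# branch above repeats each row label K times and tiles the K column labels N
# times, so the stacked values read row by row. The helper name is hypothetical.
def _demo_stack_flat_columns():
    import pandas as pd

    df = pd.DataFrame({"x": [1, 2], "y": [3, 4]}, index=["r0", "r1"])
    stacked = df.stack()
    # Pairs (r0, x) -> 1, (r0, y) -> 3, (r1, x) -> 2, (r1, y) -> 4.
    assert list(stacked) == [1, 3, 2, 4]
    return stacked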


def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True):
    # If all passed levels match up to column names, no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna, sort=sort)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        # to account for that when level is a sequence of ints
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        # negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        while level:
            lev = level.pop(0)
            result = stack(result, lev, dropna=dropna, sort=sort)
            # Decrement all level numbers greater than current, as these
            # have now shifted down by one
            level = [v if v <= lev else v - 1 for v in level]

    else:
        raise ValueError(
            "level should contain all level names or all level "
            "numbers, not a mixture of the two."
        )

    return result
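

# Illustrative sketch, not part of the pandas source: stacking several column
# levels in one call peels them off one at a time, as the loop above does. A
# hedged example through the public API (hypothetical helper name):
def _demo_stack_two_column_levels():
    import pandas as pd

    cols = pd.MultiIndex.from_product([["a", "b"], ["x", "y"]], names=["l1", "l2"])
    df = pd.DataFrame([[1, 2, 3, 4]], index=["r0"], columns=cols)
    # Both column levels move into the row index, leaving a Series of length 4.
    stacked = df.stack(["l1", "l2"])
    assert stacked.shape == (4,)
    return stacked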


def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
    """Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
    if len(columns.levels) <= 2:
        return columns.levels[0]._rename(name=columns.names[0])

    levs = [
        [lev[c] if c >= 0 else None for c in codes]
        for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
    ]

    # Remove duplicate tuples in the MultiIndex.
    tuples = zip(*levs)
    unique_tuples = (key for key, _ in itertools.groupby(tuples))
    new_levs = zip(*unique_tuples)

    # The dtype of each level must be explicitly set to avoid inferring the wrong type.
    # See GH-36991.
    return MultiIndex.from_arrays(
        [
            # Not all indices can accept None values.
            Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
            for new_lev, lev in zip(new_levs, columns.levels)
        ],
        names=columns.names[:-1],
    )


def _stack_multi_columns(
    frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True
) -> DataFrame:
    def _convert_level_number(level_num: int, columns: Index):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel.

        If `level_num` matches a column name return the name from
        position `level_num`, otherwise return `level_num`.
        """
        if level_num in columns.names:
            return columns.names[level_num]

        return level_num

    this = frame.copy(deep=False)
    mi_cols = this.columns  # cast(MultiIndex, this.columns)
    assert isinstance(mi_cols, MultiIndex)  # caller is responsible

    # this makes life much simpler
    if level_num != mi_cols.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = mi_cols
        for i in range(level_num, mi_cols.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = mi_cols = roll_columns

    if not mi_cols._is_lexsorted() and sort:
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, mi_cols)
        this = this.sort_index(level=level_to_sort, axis=1)
        mi_cols = this.columns

    mi_cols = cast(MultiIndex, mi_cols)
    new_columns = _stack_multi_column_index(mi_cols)

    # time to ravel the values
    new_data = {}
    level_vals = mi_cols.levels[-1]
    level_codes = unique(mi_cols.codes[-1])
    if sort:
        level_codes = np.sort(level_codes)
    level_vals_nan = level_vals.insert(len(level_vals), None)

    level_vals_used = np.take(level_vals_nan, level_codes)
    levsize = len(level_codes)
    drop_cols = []
    for key in new_columns:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            subset = this.iloc[:, loc]
            dtype = find_common_type(subset.dtypes.tolist())
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): won't need special case, can go through .values
                #  paths below (might change to ._values)
                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values.astype(dtype, copy=False) for _, x in subset.items()]
                )
                N, K = subset.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)
            else:
                value_slice = subset.values

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        old_codes, old_levels = factorize_from_iterable(this.index)
        new_levels = [old_levels]
        new_codes = [old_codes.repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    if frame.columns.nlevels > 1:
        desired_columns = frame.columns._drop_level_numbers([level_num]).unique()
        if not result.columns.equals(desired_columns):
            result = result[desired_columns]

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how="all")

    return result


def _reorder_for_extension_array_stack(
    arr: ExtensionArray, n_rows: int, n_columns: int
) -> ExtensionArray:
    """
    Re-orders the values when stacking multiple extension-arrays.

    The indirect stacking method used for EAs requires a followup
    take to get the order correct.

    Parameters
    ----------
    arr : ExtensionArray
    n_rows, n_columns : int
        The number of rows and columns in the original DataFrame.

    Returns
    -------
    taken : ExtensionArray
        The original `arr` with elements re-ordered appropriately

    Examples
    --------
    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    >>> _reorder_for_extension_array_stack(arr, 2, 3)
    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')

    >>> _reorder_for_extension_array_stack(arr, 3, 2)
    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
    """
    # final take to get the order correct.
    # idx is an indexer like
    # [c0r0, c1r0, c2r0, ...,
    #  c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)


def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
    if frame.columns.nunique() != len(frame.columns):
        raise ValueError("Columns with duplicate values are not supported in stack")

    # If we need to drop `level` from columns, it needs to be in descending order
    drop_levnums = sorted(level, reverse=True)
    stack_cols = frame.columns._drop_level_numbers(
        [k for k in range(frame.columns.nlevels) if k not in level][::-1]
    )
    if len(level) > 1:
        # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
        sorter = np.argsort(level)
        ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
    else:
        ordered_stack_cols = stack_cols

    stack_cols_unique = stack_cols.unique()
    ordered_stack_cols_unique = ordered_stack_cols.unique()

    # Grab data for each unique index to be stacked
    buf = []
    for idx in stack_cols_unique:
        if len(frame.columns) == 1:
            data = frame.copy()
        else:
            # Take the data from frame corresponding to this idx value
            if len(level) == 1:
                idx = (idx,)
            gen = iter(idx)
            column_indexer = tuple(
                next(gen) if k in level else slice(None)
                for k in range(frame.columns.nlevels)
            )
            data = frame.loc[:, column_indexer]

        if len(level) < frame.columns.nlevels:
            data.columns = data.columns._drop_level_numbers(drop_levnums)
        elif stack_cols.nlevels == 1:
            if data.ndim == 1:
                data.name = 0
            else:
                data.columns = RangeIndex(len(data.columns))
        buf.append(data)

    result: Series | DataFrame
    if len(buf) > 0 and not frame.empty:
        result = concat(buf)
        ratio = len(result) // len(frame)
    else:
        # input is empty
        if len(level) < frame.columns.nlevels:
            # concat column order may be different from dropping the levels
            new_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        else:
            new_columns = [0]
        result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
        ratio = 0

    if len(level) < frame.columns.nlevels:
        # concat column order may be different from dropping the levels
        desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        if not result.columns.equals(desired_columns):
            result = result[desired_columns]

    # Construct the correct MultiIndex by combining the frame's index and
    # stacked columns.
    index_levels: list | FrozenList
    if isinstance(frame.index, MultiIndex):
        index_levels = frame.index.levels
        index_codes = list(np.tile(frame.index.codes, (1, ratio)))
    else:
        codes, uniques = factorize(frame.index, use_na_sentinel=False)
        index_levels = [uniques]
        index_codes = list(np.tile(codes, (1, ratio)))
    if isinstance(stack_cols, MultiIndex):
        column_levels = ordered_stack_cols.levels
        column_codes = ordered_stack_cols.drop_duplicates().codes
    else:
        column_levels = [ordered_stack_cols.unique()]
        column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
    column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]
    result.index = MultiIndex(
        levels=index_levels + column_levels,
        codes=index_codes + column_codes,
        names=frame.index.names + list(ordered_stack_cols.names),
        verify_integrity=False,
    )

    # sort result, but faster than calling sort_index since we know the order we need
    len_df = len(frame)
    n_uniques = len(ordered_stack_cols_unique)
    indexer = np.arange(n_uniques)
    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
    result = result.take(idxs)

    # Reshape/rename if needed and dropna
    if result.ndim == 2 and frame.columns.nlevels == len(level):
        if len(result.columns) == 0:
            result = Series(index=result.index)
        else:
            result = result.iloc[:, 0]
    if result.ndim == 1:
        result.name = None

    return result
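

# Illustrative sketch, not part of the pandas source: stack_v3 appears to back
# the `future_stack=True` behaviour (pandas 2.1+), which keeps NA rows instead
# of dropping them. Hypothetical helper name.
def _demo_future_stack_keeps_na():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"x": [1.0, np.nan], "y": [np.nan, 4.0]})
    legacy = df.stack()                   # drops the two NaN entries by default
    future = df.stack(future_stack=True)  # keeps all four entries
    assert len(legacy) == 2 and len(future) == 4
    return future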
638
lib/python3.11/site-packages/pandas/core/reshape/tile.py
Normal file
638
lib/python3.11/site-packages/pandas/core/reshape/tile.py
Normal file
@ -0,0 +1,638 @@
"""
Quantilization functions and related stuff
"""
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
)

import numpy as np

from pandas._libs import (
    Timedelta,
    Timestamp,
    lib,
)

from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_bool_dtype,
    is_integer,
    is_list_like,
    is_numeric_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    DatetimeTZDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.dtypes.missing import isna

from pandas import (
    Categorical,
    Index,
    IntervalIndex,
)
import pandas.core.algorithms as algos
from pandas.core.arrays.datetimelike import dtype_to_unit

if TYPE_CHECKING:
    from pandas._typing import (
        DtypeObj,
        IntervalLeftRight,
    )


def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
    ordered: bool = True,
):
    """
    Bin values into discrete intervals.

    Use `cut` when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable. For example, `cut` could convert ages to groups of
    age ranges. Supports binning into an equal number of bins, or a
    pre-specified array of bins.

    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.

        * int : Defines the number of equal-width bins in the range of `x`. The
          range of `x` is extended by .1% on each side to include the minimum
          and maximum values of `x`.
        * sequence of scalars : Defines the bin edges allowing for non-uniform
          width. No extension of the range of `x` is done.
        * IntervalIndex : Defines the exact bins to be used. Note that
          IntervalIndex for `bins` must be non-overlapping.

    right : bool, default True
        Indicates whether `bins` includes the rightmost edge or not. If
        ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
        indicate (1,2], (2,3], (3,4]. This argument is ignored when
        `bins` is an IntervalIndex.
    labels : array or False, default None
        Specifies the labels for the returned bins. Must be the same length as
        the resulting bins. If False, returns only integer indicators of the
        bins. This affects the type of the output container (see below).
        This argument is ignored when `bins` is an IntervalIndex. If True,
        raises an error. When `ordered=False`, labels must be provided.
    retbins : bool, default False
        Whether to return the bins or not. Useful when bins is provided
        as a scalar.
    precision : int, default 3
        The precision at which to store and display the bins labels.
    include_lowest : bool, default False
        Whether the first interval should be left-inclusive or not.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.
    ordered : bool, default True
        Whether the labels are ordered or not. Applies to returned types
        Categorical and Series (with Categorical dtype). If True,
        the resulting categorical will be ordered. If False, the resulting
        categorical will be unordered (labels must be provided).

    Returns
    -------
    out : Categorical, Series, or ndarray
        An array-like object representing the respective bin for each value
        of `x`. The type depends on the value of `labels`.

        * None (default) : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are Interval dtype.

        * sequence of scalars : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are whatever the type in the sequence is.

        * False : returns an ndarray of integers.

    bins : numpy.ndarray or IntervalIndex.
        The computed or specified bins. Only returned when `retbins=True`.
        For scalar or sequence `bins`, this is an ndarray with the computed
        bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
        an IntervalIndex `bins`, this is equal to `bins`.

    See Also
    --------
    qcut : Discretize variable into equal-sized buckets based on rank
        or based on sample quantiles.
    Categorical : Array type for storing data that come from a
        fixed set of values.
    Series : One-dimensional array with axis labels (including time series).
    IntervalIndex : Immutable Index implementing an ordered, sliceable set.

    Notes
    -----
    Any NA values will be NA in the result. Out of bounds values will be NA in
    the resulting Series or Categorical object.

    Reference :ref:`the user guide <reshaping.tile.cut>` for more examples.

    Examples
    --------
    Discretize into three equal-sized bins.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    ... # doctest: +ELLIPSIS
    [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    ... # doctest: +ELLIPSIS
    ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
    array([0.994, 3.   , 5.   , 7.   ]))

    Discovers the same bins, but assign them specific labels. Notice that
    the returned Categorical's categories are `labels` and is ordered.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...        3, labels=["bad", "medium", "good"])
    ['bad', 'good', 'medium', 'medium', 'good', 'bad']
    Categories (3, object): ['bad' < 'medium' < 'good']

    ``ordered=False`` will result in unordered categories when labels are passed.
    This parameter can be used to allow non-unique labels:

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
    ...        labels=["B", "A", "B"], ordered=False)
    ['B', 'B', 'A', 'A', 'B', 'B']
    Categories (2, object): ['A', 'B']

    ``labels=False`` implies you just want the bins back.

    >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3])

    Passing a Series as an input returns a Series with categorical dtype:

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, 3)
    ... # doctest: +ELLIPSIS
    a    (1.992, 4.667]
    b    (1.992, 4.667]
    c    (4.667, 7.333]
    d     (7.333, 10.0]
    e     (7.333, 10.0]
    dtype: category
    Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...

    Passing a Series as an input returns a Series with mapping value.
    It is used to map numerically to intervals based on bins.

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    4.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6,  8, 10]))

    Use `drop` optional when bins is not unique

    >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
    ...        right=False, duplicates='drop')
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    3.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6, 10]))

    Passing an IntervalIndex for `bins` results in those categories exactly.
    Notice that values not covered by the IntervalIndex are set to NaN. 0
    is to the left of the first bin (which is closed on the right), and 1.5
    falls between two bins.

    >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
    [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
    Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
    """
    # NOTE: this binning code is changed a bit from histogram for var(x) == 0

    original = x
    x_idx = _preprocess_for_cut(x)
    x_idx, _ = _coerce_to_type(x_idx)

    if not np.iterable(bins):
        bins = _nbins_to_bins(x_idx, bins, right)

    elif isinstance(bins, IntervalIndex):
        if bins.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    else:
        bins = Index(bins)
        if not bins.is_monotonic_increasing:
            raise ValueError("bins must increase monotonically.")

    fac, bins = _bins_to_cuts(
        x_idx,
        bins,
        right=right,
        labels=labels,
        precision=precision,
        include_lowest=include_lowest,
        duplicates=duplicates,
        ordered=ordered,
    )

    return _postprocess_for_cut(fac, bins, retbins, original)


def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
    """
    Quantile-based discretization function.

    Discretize variable into equal-sized buckets based on rank or based
    on sample quantiles. For example 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each data point.

    Parameters
    ----------
    x : 1d ndarray or Series
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3])
    """
    original = x
    x_idx = _preprocess_for_cut(x)
    x_idx, _ = _coerce_to_type(x_idx)

    quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q

    bins = x_idx.to_series().dropna().quantile(quantiles)

    fac, bins = _bins_to_cuts(
        x_idx,
        Index(bins),
        labels=labels,
        precision=precision,
        include_lowest=True,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, bins, retbins, original)


def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
    """
    If a user passed an integer N for bins, convert this to a sequence of N
    equal(ish)-sized bins.
    """
    if is_scalar(nbins) and nbins < 1:
        raise ValueError("`bins` should be a positive integer.")

    if x_idx.size == 0:
        raise ValueError("Cannot cut empty array")

    rng = (x_idx.min(), x_idx.max())
    mn, mx = rng

    if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
        # GH#24314
        raise ValueError(
            "cannot specify integer `bins` when input data contains infinity"
        )

    if mn == mx:  # adjust end points before binning
        if _is_dt_or_td(x_idx.dtype):
            # using seconds=1 is pretty arbitrary here
            # error: Argument 1 to "dtype_to_unit" has incompatible type
            # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
            unit = dtype_to_unit(x_idx.dtype)  # type: ignore[arg-type]
            td = Timedelta(seconds=1).as_unit(unit)
            # Use DatetimeArray/TimedeltaArray method instead of linspace
            # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
            # has no attribute "_generate_range"
            bins = x_idx._values._generate_range(  # type: ignore[union-attr]
                start=mn - td, end=mx + td, periods=nbins + 1, freq=None, unit=unit
            )
        else:
            mn -= 0.001 * abs(mn) if mn != 0 else 0.001
            mx += 0.001 * abs(mx) if mx != 0 else 0.001

            bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
    else:  # adjust end points after binning
        if _is_dt_or_td(x_idx.dtype):
            # Use DatetimeArray/TimedeltaArray method instead of linspace

            # error: Argument 1 to "dtype_to_unit" has incompatible type
            # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
            unit = dtype_to_unit(x_idx.dtype)  # type: ignore[arg-type]
            # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
            # has no attribute "_generate_range"
            bins = x_idx._values._generate_range(  # type: ignore[union-attr]
                start=mn, end=mx, periods=nbins + 1, freq=None, unit=unit
            )
        else:
            bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
        adj = (mx - mn) * 0.001  # 0.1% of the range
        if right:
            bins[0] -= adj
        else:
            bins[-1] += adj

    return Index(bins)
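

# Illustrative sketch, not part of the pandas source: the numeric branch above
# widens one outer edge by 0.1% of the data range so the extreme value is not
# left outside its half-open bin. Hypothetical helper name.
def _demo_edge_adjustment():
    import numpy as np

    mn, mx, nbins = 1.0, 7.0, 3
    bins = np.linspace(mn, mx, nbins + 1, endpoint=True)  # [1., 3., 5., 7.]
    adj = (mx - mn) * 0.001  # 0.1% of the range
    bins[0] -= adj  # right-closed bins: stretch the left edge
    # The minimum value 1.0 now falls inside (0.994, 3.0].
    assert bins[0] < mn <= bins[1]
    return bins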


def _bins_to_cuts(
    x_idx: Index,
    bins: Index,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
    ordered: bool = True,
):
    if not ordered and labels is None:
        raise ValueError("'labels' must be provided if 'ordered = False'")

    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: raise, drop"
        )

    result: Categorical | np.ndarray

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x_idx)
        cat_dtype = CategoricalDtype(bins, ordered=True)
        result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                f"You can drop duplicate edges by setting the 'duplicates' kwarg"
            )
        bins = unique_bins

    side: Literal["left", "right"] = "left" if right else "right"

    try:
        ids = bins.searchsorted(x_idx, side=side)
    except TypeError as err:
        # e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx
        # is integers
        if x_idx.dtype.kind == "m":
            raise ValueError("bins must be of timedelta64 dtype") from err
        elif x_idx.dtype.kind == bins.dtype.kind == "M":
            raise ValueError(
                "Cannot use timezone-naive bins with timezone-aware values, "
                "or vice-versa"
            ) from err
        elif x_idx.dtype.kind == "M":
            raise ValueError("bins must be of datetime64 dtype") from err
        else:
            raise
    ids = ensure_platform_int(ids)

    if include_lowest:
        ids[x_idx == bins[0]] = 1

    na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument"
            )

        if labels is None:
            labels = _format_labels(
                bins, precision, right=right, include_lowest=include_lowest
            )
        elif ordered and len(set(labels)) != len(labels):
            raise ValueError(
                "labels must be unique if ordered=True; pass ordered=False "
                "for duplicate labels"
            )
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )

        if not isinstance(getattr(labels, "dtype", None), CategoricalDtype):
            labels = Categorical(
                labels,
                categories=labels if len(set(labels)) == len(labels) else None,
                ordered=ordered,
            )
        # TODO: handle mismatch between categorical label order and pandas.cut order.
        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)

    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
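

# Illustrative sketch, not part of the pandas source: the non-IntervalIndex path
# above is essentially a searchsorted over the bin edges; an id of 0 or
# len(bins) means the value fell outside the edges and becomes NA. Hypothetical
# helper name.
def _demo_searchsorted_bin_ids():
    import numpy as np

    edges = np.array([0.0, 2.0, 4.0, 6.0])
    values = np.array([-1.0, 0.5, 2.0, 5.0, 7.0])
    ids = edges.searchsorted(values, side="left")  # right-closed bins
    # -1.0 -> 0 (below all edges) and 7.0 -> 4 == len(edges) (above all edges)
    # would both be masked as NA; 2.0 lands in the first bin (0.0, 2.0].
    assert list(ids) == [0, 1, 1, 3, 4]
    return ids - 1  # zero-based bin numbers, as when labels=False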


def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
    """
    if the passed data is of datetime/timedelta, bool or nullable int type,
    this method converts it to numeric so that cut or qcut method can
    handle it
    """
    dtype: DtypeObj | None = None

    if _is_dt_or_td(x.dtype):
        dtype = x.dtype
    elif is_bool_dtype(x.dtype):
        # GH 20303
        x = x.astype(np.int64)
    # To support cut and qcut for IntegerArray we convert to float dtype.
    # Will properly support in the future.
    # https://github.com/pandas-dev/pandas/pull/31290
    # https://github.com/pandas-dev/pandas/issues/31389
    elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype):
        x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan)
        x = Index(x_arr)

    return Index(x), dtype


def _is_dt_or_td(dtype: DtypeObj) -> bool:
    # Note: the dtype here comes from an Index.dtype, so we know that any
    # dt64/td64 dtype is of a supported unit.
    return isinstance(dtype, DatetimeTZDtype) or lib.is_np_dtype(dtype, "mM")


def _format_labels(
    bins: Index,
    precision: int,
    right: bool = True,
    include_lowest: bool = False,
):
    """based on the dtype, return our labels"""
    closed: IntervalLeftRight = "right" if right else "left"

    formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]

    if _is_dt_or_td(bins.dtype):
        # error: Argument 1 to "dtype_to_unit" has incompatible type
        # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
        unit = dtype_to_unit(bins.dtype)  # type: ignore[arg-type]
        formatter = lambda x: x
        adjust = lambda x: x - Timedelta(1, unit=unit).as_unit(unit)
    else:
        precision = _infer_precision(precision, bins)
        formatter = lambda x: _round_frac(x, precision)
        adjust = lambda x: x - 10 ** (-precision)

    breaks = [formatter(b) for b in bins]
    if right and include_lowest:
        # adjust lhs of first interval by precision to account for being right closed
        breaks[0] = adjust(breaks[0])

    if _is_dt_or_td(bins.dtype):
        # error: "Index" has no attribute "as_unit"
        breaks = type(bins)(breaks).as_unit(unit)  # type: ignore[attr-defined]

    return IntervalIndex.from_breaks(breaks, closed=closed)


def _preprocess_for_cut(x) -> Index:
    """
    handles preprocessing for cut where we convert passed
    input to array, strip the index information and store it
    separately
    """
    # Check that the passed array is a Pandas or Numpy object
    # We don't want to strip away a Pandas data-type here (e.g. datetimetz)
    ndim = getattr(x, "ndim", None)
    if ndim is None:
        x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Input array must be 1 dimensional")

    return Index(x)


def _postprocess_for_cut(fac, bins, retbins: bool, original):
    """
    handles post processing for the cut method where
    we combine the index information if the originally passed
    datatype was a series
    """
    if isinstance(original, ABCSeries):
        fac = original._constructor(fac, index=original.index, name=original.name)

    if not retbins:
        return fac

    if isinstance(bins, Index) and is_numeric_dtype(bins.dtype):
        bins = bins._values

    return fac, bins


def _round_frac(x, precision: int):
    """
    Round the fractional part of the given number
    """
    if not np.isfinite(x) or x == 0:
        return x
    else:
        frac, whole = np.modf(x)
        if whole == 0:
            digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision
        else:
            digits = precision
        return np.around(x, digits)


def _infer_precision(base_precision: int, bins: Index) -> int:
    """
    Infer an appropriate precision for _round_frac
    """
    for precision in range(base_precision, 20):
        levels = np.asarray([_round_frac(b, precision) for b in bins])
        if algos.unique(levels).size == bins.size:
            return precision
    return base_precision  # default
85
lib/python3.11/site-packages/pandas/core/reshape/util.py
Normal file
85
lib/python3.11/site-packages/pandas/core/reshape/util.py
Normal file
@ -0,0 +1,85 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas.core.dtypes.common import is_list_like

if TYPE_CHECKING:
    from pandas._typing import NumpyIndexT


def cartesian_product(X) -> list[np.ndarray]:
    """
    Numpy version of itertools.product.
    Sometimes faster (for large inputs)...

    Parameters
    ----------
    X : list-like of list-likes

    Returns
    -------
    product : list of ndarrays

    Examples
    --------
    >>> cartesian_product([list('ABC'), [1, 2]])
    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2, 1, 2])]

    See Also
    --------
    itertools.product : Cartesian product of input iterables. Equivalent to
        nested for-loops.
    """
    msg = "Input must be a list-like of list-likes"
    if not is_list_like(X):
        raise TypeError(msg)
    for x in X:
        if not is_list_like(x):
            raise TypeError(msg)

    if len(X) == 0:
        return []

    lenX = np.fromiter((len(x) for x in X), dtype=np.intp)
    cumprodX = np.cumprod(lenX)

    if np.any(cumprodX < 0):
        raise ValueError("Product space too large to allocate arrays!")

    a = np.roll(cumprodX, 1)
    a[0] = 1

    if cumprodX[-1] != 0:
        b = cumprodX[-1] / cumprodX
    else:
        # if any factor is empty, the cartesian product is empty
        b = np.zeros_like(cumprodX)

    # error: Argument of type "int_" cannot be assigned to parameter "num" of
    # type "int" in function "tile_compat"
    return [
        tile_compat(
            np.repeat(x, b[i]),
            np.prod(a[i]),
        )
        for i, x in enumerate(X)
    ]


def tile_compat(arr: NumpyIndexT, num: int) -> NumpyIndexT:
    """
    Index compat for np.tile.

    Notes
    -----
    Does not support multi-dimensional `num`.
    """
    if isinstance(arr, np.ndarray):
        return np.tile(arr, num)

    # Otherwise we have an Index
    taker = np.tile(np.arange(len(arr)), num)
    return arr.take(taker)
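

# Illustrative sketch, not part of the pandas source: np.tile works on ndarrays
# but not on Index objects, so tile_compat above falls back to taking a tiled
# positional indexer. Hypothetical helper name.
def _demo_tile_compat_index():
    import numpy as np
    import pandas as pd

    idx = pd.Index(["a", "b"])
    taker = np.tile(np.arange(len(idx)), 3)
    tiled = idx.take(taker)
    assert list(tiled) == ["a", "b", "a", "b", "a", "b"]
    return tiled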