commit 2fc0d000b6 (parent e1b817252c), 2025-09-07 22:09:54 +02:00
7796 changed files with 2159515 additions and 933 deletions

View File: pandas/core/internals/__init__.py

@@ -0,0 +1,85 @@
from pandas.core.internals.api import make_block # 2023-09-18 pyarrow uses this
from pandas.core.internals.array_manager import (
ArrayManager,
SingleArrayManager,
)
from pandas.core.internals.base import (
DataManager,
SingleDataManager,
)
from pandas.core.internals.concat import concatenate_managers
from pandas.core.internals.managers import (
BlockManager,
SingleBlockManager,
)
__all__ = [
"Block", # pylint: disable=undefined-all-variable
"DatetimeTZBlock", # pylint: disable=undefined-all-variable
"ExtensionBlock", # pylint: disable=undefined-all-variable
"make_block",
"DataManager",
"ArrayManager",
"BlockManager",
"SingleDataManager",
"SingleBlockManager",
"SingleArrayManager",
"concatenate_managers",
]
def __getattr__(name: str):
# GH#55139
import warnings
if name == "create_block_manager_from_blocks":
# GH#33892
warnings.warn(
f"{name} is deprecated and will be removed in a future version. "
"Use public APIs instead.",
DeprecationWarning,
# https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758
# on hard-coding stacklevel
stacklevel=2,
)
from pandas.core.internals.managers import create_block_manager_from_blocks
return create_block_manager_from_blocks
if name in [
"NumericBlock",
"ObjectBlock",
"Block",
"ExtensionBlock",
"DatetimeTZBlock",
]:
warnings.warn(
f"{name} is deprecated and will be removed in a future version. "
"Use public APIs instead.",
DeprecationWarning,
# https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758
# on hard-coding stacklevel
stacklevel=2,
)
if name == "NumericBlock":
from pandas.core.internals.blocks import NumericBlock
return NumericBlock
elif name == "DatetimeTZBlock":
from pandas.core.internals.blocks import DatetimeTZBlock
return DatetimeTZBlock
elif name == "ExtensionBlock":
from pandas.core.internals.blocks import ExtensionBlock
return ExtensionBlock
elif name == "Block":
from pandas.core.internals.blocks import Block
return Block
else:
from pandas.core.internals.blocks import ObjectBlock
return ObjectBlock
raise AttributeError(f"module 'pandas.core.internals' has no attribute '{name}'")
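
The shim above uses PEP 562's module-level __getattr__, which runs only when normal attribute lookup fails: deprecated names stay importable and warn on access, while importing the package itself pays no extra cost. A minimal sketch of the same pattern, with illustrative names that are not pandas APIs:

# mylib/compat.py -- sketch of the lazy-deprecation pattern used above
import warnings

def _load_replacement():
    # Stand-in for the lazy import of the real object.
    return object()

def __getattr__(name: str):
    # Invoked only when `name` is not found by normal module lookup.
    if name == "OldThing":
        warnings.warn(
            "OldThing is deprecated and will be removed in a future version.",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller, not this shim
        )
        return _load_replacement()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

Both `import mylib.compat` followed by attribute access and `from mylib.compat import OldThing` route through the hook, which is why the pandas module can list deprecated names in __all__ without defining them at module scope (hence the pylint undefined-all-variable suppressions).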

View File: pandas/core/internals/api.py

@@ -0,0 +1,156 @@
"""
This is a pseudo-public API for downstream libraries. We ask that downstream
authors
1) Try to avoid using internals directly altogether, and failing that,
2) Use only functions exposed here (or in core.internals)
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas._libs.internals import BlockPlacement
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
PeriodDtype,
)
from pandas.core.arrays import DatetimeArray
from pandas.core.construction import extract_array
from pandas.core.internals.blocks import (
check_ndim,
ensure_block_shape,
extract_pandas_array,
get_block_type,
maybe_coerce_values,
)
if TYPE_CHECKING:
from pandas._typing import Dtype
from pandas.core.internals.blocks import Block
def make_block(
values, placement, klass=None, ndim=None, dtype: Dtype | None = None
) -> Block:
"""
This is a pseudo-public analogue to blocks.new_block.
We ask that downstream libraries use this rather than any fully-internal
APIs, including but not limited to:
- core.internals.blocks.make_block
- Block.make_block
- Block.make_block_same_class
- Block.__init__
"""
if dtype is not None:
dtype = pandas_dtype(dtype)
values, dtype = extract_pandas_array(values, dtype, ndim)
from pandas.core.internals.blocks import (
DatetimeTZBlock,
ExtensionBlock,
)
if klass is ExtensionBlock and isinstance(values.dtype, PeriodDtype):
# GH-44681 changed PeriodArray to be stored in the 2D
# NDArrayBackedExtensionBlock instead of ExtensionBlock
# -> still allow ExtensionBlock to be passed in this case for back compat
klass = None
if klass is None:
dtype = dtype or values.dtype
klass = get_block_type(dtype)
elif klass is DatetimeTZBlock and not isinstance(values.dtype, DatetimeTZDtype):
# pyarrow calls get here
values = DatetimeArray._simple_new(
# error: Argument "dtype" to "_simple_new" of "DatetimeArray" has
# incompatible type "Union[ExtensionDtype, dtype[Any], None]";
# expected "Union[dtype[datetime64], DatetimeTZDtype]"
values,
dtype=dtype, # type: ignore[arg-type]
)
if not isinstance(placement, BlockPlacement):
placement = BlockPlacement(placement)
ndim = maybe_infer_ndim(values, placement, ndim)
if isinstance(values.dtype, (PeriodDtype, DatetimeTZDtype)):
# GH#41168 ensure we can pass 1D dt64tz values
# More generally, any EA dtype that isn't is_1d_only_ea_dtype
values = extract_array(values, extract_numpy=True)
values = ensure_block_shape(values, ndim)
check_ndim(values, placement, ndim)
values = maybe_coerce_values(values)
return klass(values, ndim=ndim, placement=placement)
def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int:
"""
If `ndim` is not provided, infer it from placement and values.
"""
if ndim is None:
# GH#38134 Block constructor now assumes ndim is not None
if not isinstance(values.dtype, np.dtype):
if len(placement) != 1:
ndim = 1
else:
ndim = 2
else:
ndim = values.ndim
return ndim
def __getattr__(name: str):
# GH#55139
import warnings
if name in [
"Block",
"ExtensionBlock",
"DatetimeTZBlock",
"create_block_manager_from_blocks",
]:
# GH#33892
warnings.warn(
f"{name} is deprecated and will be removed in a future version. "
"Use public APIs instead.",
DeprecationWarning,
# https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758
# on hard-coding stacklevel
stacklevel=2,
)
if name == "create_block_manager_from_blocks":
from pandas.core.internals.managers import create_block_manager_from_blocks
return create_block_manager_from_blocks
elif name == "Block":
from pandas.core.internals.blocks import Block
return Block
elif name == "DatetimeTZBlock":
from pandas.core.internals.blocks import DatetimeTZBlock
return DatetimeTZBlock
elif name == "ExtensionBlock":
from pandas.core.internals.blocks import ExtensionBlock
return ExtensionBlock
raise AttributeError(
f"module 'pandas.core.internals.api' has no attribute '{name}'"
)
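
For downstream libraries that cannot avoid internals entirely, the make_block above is the intended entry point. A hedged usage sketch (the values and placement are arbitrary examples; the concrete Block subclass returned is an implementation detail):

import numpy as np
from pandas.core.internals.api import make_block

# In block layout, one 2D row of values corresponds to one DataFrame column.
values = np.array([[1.0, 2.0, 3.0]])
blk = make_block(values, placement=[0], ndim=2)
print(type(blk).__name__, blk.shape)  # a numpy-backed Block with shape (1, 3)

Passing ndim=2 is optional here: maybe_infer_ndim would infer it anyway, since the values are a NumPy array and values.ndim == 2.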

File diff suppressed because it is too large.

View File: pandas/core/internals/base.py

@@ -0,0 +1,407 @@
"""
Base class for the internal managers. Both BlockManager and ArrayManager
inherit from this class.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
Literal,
cast,
final,
)
import numpy as np
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)
from pandas._libs import (
algos as libalgos,
lib,
)
from pandas.errors import AbstractMethodError
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.cast import (
find_common_type,
np_can_hold_element,
)
from pandas.core.dtypes.dtypes import (
ExtensionDtype,
SparseDtype,
)
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.indexes.api import (
Index,
default_index,
)
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
AxisInt,
DtypeObj,
Self,
Shape,
)
class _AlreadyWarned:
def __init__(self):
# This class is passed from the manager level down to the block level to
# ensure that we warn only once. The block method can update the
# warned_already option without returning a value, to keep the
# interface consistent. This is only a temporary solution for
# CoW warnings.
self.warned_already = False
class DataManager(PandasObject):
# TODO share more methods/attributes
axes: list[Index]
@property
def items(self) -> Index:
raise AbstractMethodError(self)
@final
def __len__(self) -> int:
return len(self.items)
@property
def ndim(self) -> int:
return len(self.axes)
@property
def shape(self) -> Shape:
return tuple(len(ax) for ax in self.axes)
@final
def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None:
# Caller is responsible for ensuring we have an Index object.
old_len = len(self.axes[axis])
new_len = len(new_labels)
if axis == 1 and len(self.items) == 0:
# If we are setting the index on a DataFrame with no columns,
# it is OK to change the length.
pass
elif new_len != old_len:
raise ValueError(
f"Length mismatch: Expected axis has {old_len} elements, new "
f"values have {new_len} elements"
)
def reindex_indexer(
self,
new_axis,
indexer,
axis: AxisInt,
fill_value=None,
allow_dups: bool = False,
copy: bool = True,
only_slice: bool = False,
) -> Self:
raise AbstractMethodError(self)
@final
def reindex_axis(
self,
new_index: Index,
axis: AxisInt,
fill_value=None,
only_slice: bool = False,
) -> Self:
"""
Conform data manager to new index.
"""
new_index, indexer = self.axes[axis].reindex(new_index)
return self.reindex_indexer(
new_index,
indexer,
axis=axis,
fill_value=fill_value,
copy=False,
only_slice=only_slice,
)
def _equal_values(self, other: Self) -> bool:
"""
To be implemented by the subclasses. Only check the column values
assuming shape and indexes have already been checked.
"""
raise AbstractMethodError(self)
@final
def equals(self, other: object) -> bool:
"""
Implementation for DataFrame.equals
"""
if not isinstance(other, type(self)):
return False
self_axes, other_axes = self.axes, other.axes
if len(self_axes) != len(other_axes):
return False
if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
return False
return self._equal_values(other)
def apply(
self,
f,
align_keys: list[str] | None = None,
**kwargs,
) -> Self:
raise AbstractMethodError(self)
def apply_with_block(
self,
f,
align_keys: list[str] | None = None,
**kwargs,
) -> Self:
raise AbstractMethodError(self)
@final
def isna(self, func) -> Self:
return self.apply("apply", func=func)
@final
def fillna(self, value, limit: int | None, inplace: bool, downcast) -> Self:
if limit is not None:
# Do this validation even if we go through one of the no-op paths
limit = libalgos.validate_limit(None, limit=limit)
return self.apply_with_block(
"fillna",
value=value,
limit=limit,
inplace=inplace,
downcast=downcast,
using_cow=using_copy_on_write(),
already_warned=_AlreadyWarned(),
)
@final
def where(self, other, cond, align: bool) -> Self:
if align:
align_keys = ["other", "cond"]
else:
align_keys = ["cond"]
other = extract_array(other, extract_numpy=True)
return self.apply_with_block(
"where",
align_keys=align_keys,
other=other,
cond=cond,
using_cow=using_copy_on_write(),
)
@final
def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self:
if align:
align_keys = ["new", "mask"]
else:
align_keys = ["mask"]
new = extract_array(new, extract_numpy=True)
already_warned = None
if warn_copy_on_write():
already_warned = _AlreadyWarned()
if not warn:
already_warned.warned_already = True
return self.apply_with_block(
"putmask",
align_keys=align_keys,
mask=mask,
new=new,
using_cow=using_copy_on_write(),
already_warned=already_warned,
)
@final
def round(self, decimals: int, using_cow: bool = False) -> Self:
return self.apply_with_block(
"round",
decimals=decimals,
using_cow=using_cow,
)
@final
def replace(self, to_replace, value, inplace: bool) -> Self:
inplace = validate_bool_kwarg(inplace, "inplace")
# NDFrame.replace ensures that to_replace and value are not list-like here
assert not lib.is_list_like(to_replace)
assert not lib.is_list_like(value)
return self.apply_with_block(
"replace",
to_replace=to_replace,
value=value,
inplace=inplace,
using_cow=using_copy_on_write(),
already_warned=_AlreadyWarned(),
)
@final
def replace_regex(self, **kwargs) -> Self:
return self.apply_with_block(
"_replace_regex",
**kwargs,
using_cow=using_copy_on_write(),
already_warned=_AlreadyWarned(),
)
@final
def replace_list(
self,
src_list: list[Any],
dest_list: list[Any],
inplace: bool = False,
regex: bool = False,
) -> Self:
"""do a list replace"""
inplace = validate_bool_kwarg(inplace, "inplace")
bm = self.apply_with_block(
"replace_list",
src_list=src_list,
dest_list=dest_list,
inplace=inplace,
regex=regex,
using_cow=using_copy_on_write(),
already_warned=_AlreadyWarned(),
)
bm._consolidate_inplace()
return bm
def interpolate(self, inplace: bool, **kwargs) -> Self:
return self.apply_with_block(
"interpolate",
inplace=inplace,
**kwargs,
using_cow=using_copy_on_write(),
already_warned=_AlreadyWarned(),
)
def pad_or_backfill(self, inplace: bool, **kwargs) -> Self:
return self.apply_with_block(
"pad_or_backfill",
inplace=inplace,
**kwargs,
using_cow=using_copy_on_write(),
already_warned=_AlreadyWarned(),
)
def shift(self, periods: int, fill_value) -> Self:
if fill_value is lib.no_default:
fill_value = None
return self.apply_with_block("shift", periods=periods, fill_value=fill_value)
# --------------------------------------------------------------------
# Consolidation: No-ops for all but BlockManager
def is_consolidated(self) -> bool:
return True
def consolidate(self) -> Self:
return self
def _consolidate_inplace(self) -> None:
return
class SingleDataManager(DataManager):
@property
def ndim(self) -> Literal[1]:
return 1
@final
@property
def array(self) -> ArrayLike:
"""
Quick access to the backing array of the Block or SingleArrayManager.
"""
# error: "SingleDataManager" has no attribute "arrays"; maybe "array"
return self.arrays[0] # type: ignore[attr-defined]
def setitem_inplace(self, indexer, value, warn: bool = True) -> None:
"""
Set values with indexer.
For Single[Block/Array]Manager, this backs s[indexer] = value
This is an inplace version of `setitem()`, mutating the manager/values
in place, not returning a new Manager (and Block), and thus never changing
the dtype.
"""
arr = self.array
# EAs will do this validation in their own __setitem__ methods.
if isinstance(arr, np.ndarray):
# Note: checking for ndarray instead of np.dtype means we exclude
# dt64/td64, which do their own validation.
value = np_can_hold_element(arr.dtype, value)
if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1:
# NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
value = value[0, ...]
arr[indexer] = value
def grouped_reduce(self, func):
arr = self.array
res = func(arr)
index = default_index(len(res))
mgr = type(self).from_array(res, index)
return mgr
@classmethod
def from_array(cls, arr: ArrayLike, index: Index):
raise AbstractMethodError(cls)
def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None:
"""
Find the common dtype for `dtypes`.
Parameters
----------
dtypes : list[DtypeObj]
Returns
-------
dtype : np.dtype, ExtensionDtype, or None
None is returned when `dtypes` is empty.
"""
if not len(dtypes):
return None
return find_common_type(dtypes)
def ensure_np_dtype(dtype: DtypeObj) -> np.dtype:
# TODO: https://github.com/pandas-dev/pandas/issues/22791
# Give EAs some input on what happens here. Sparse needs this.
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype
dtype = cast(np.dtype, dtype)
elif isinstance(dtype, ExtensionDtype):
dtype = np.dtype("object")
elif dtype == np.dtype(str):
dtype = np.dtype("object")
return dtype
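
DataManager.equals above is the engine behind DataFrame.equals: axes are compared first, and only then does the subclass-specific _equal_values compare column values, which makes the check dtype-sensitive. A small illustration through the public API (not part of this diff):

import pandas as pd

df1 = pd.DataFrame({"a": [1, 2], "b": [3.0, None]})
df2 = pd.DataFrame({"a": [1, 2], "b": [3.0, None]})
df3 = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, None]})

print(df1.equals(df2))  # True: same axes, same dtypes, NaN in matching spots
print(df1.equals(df3))  # False: "a" is int64 vs float64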

File diff suppressed because it is too large.

View File: pandas/core/internals/concat.py

@@ -0,0 +1,598 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
cast,
)
import warnings
import numpy as np
from pandas._libs import (
NaT,
algos as libalgos,
internals as libinternals,
lib,
)
from pandas._libs.missing import NA
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
)
from pandas.core.dtypes.common import (
is_1d_only_ea_dtype,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
ExtensionDtype,
SparseDtype,
)
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
isna_all,
)
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.blocks import (
ensure_block_shape,
new_block_2d,
)
from pandas.core.internals.managers import (
BlockManager,
make_na_array,
)
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
ArrayLike,
AxisInt,
DtypeObj,
Manager2D,
Shape,
)
from pandas import Index
from pandas.core.internals.blocks import (
Block,
BlockPlacement,
)
def _concatenate_array_managers(
mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt
) -> Manager2D:
"""
Concatenate array managers into one.
Parameters
----------
mgrs : list of ArrayManager
axes : list of Index
concat_axis : int
Returns
-------
ArrayManager
"""
if concat_axis == 1:
return mgrs[0].concat_vertical(mgrs, axes)
else:
# concatting along the columns -> combine reindexed arrays in a single manager
assert concat_axis == 0
return mgrs[0].concat_horizontal(mgrs, axes)
def concatenate_managers(
mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager2D:
"""
Concatenate block managers into one.
Parameters
----------
mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
axes : list of Index
concat_axis : int
copy : bool
Returns
-------
BlockManager
"""
needs_copy = copy and concat_axis == 0
# TODO(ArrayManager) this assumes that all managers are of the same type
if isinstance(mgrs_indexers[0][0], ArrayManager):
mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
# error: Argument 1 to "_concatenate_array_managers" has incompatible
# type "List[BlockManager]"; expected "List[Union[ArrayManager,
# SingleArrayManager, BlockManager, SingleBlockManager]]"
return _concatenate_array_managers(
mgrs, axes, concat_axis # type: ignore[arg-type]
)
# Assertions disabled for performance
# for tup in mgrs_indexers:
# # caller is responsible for ensuring this
# indexers = tup[1]
# assert concat_axis not in indexers
if concat_axis == 0:
mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
return mgrs[0].concat_horizontal(mgrs, axes)
if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0:
first_dtype = mgrs_indexers[0][0].blocks[0].dtype
if first_dtype in [np.float64, np.float32]:
# TODO: support more dtypes here. This will be simpler once
# JoinUnit.is_na behavior is deprecated.
if (
all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
and len(mgrs_indexers) > 1
):
# Fastpath!
# Length restriction is just to avoid having to worry about 'copy'
shape = tuple(len(x) for x in axes)
nb = _concat_homogeneous_fastpath(mgrs_indexers, shape, first_dtype)
return BlockManager((nb,), axes)
mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
if len(mgrs) == 1:
mgr = mgrs[0]
out = mgr.copy(deep=False)
out.axes = axes
return out
concat_plan = _get_combined_plan(mgrs)
blocks = []
values: ArrayLike
for placement, join_units in concat_plan:
unit = join_units[0]
blk = unit.block
if _is_uniform_join_units(join_units):
vals = [ju.block.values for ju in join_units]
if not blk.is_extension:
# _is_uniform_join_units ensures a single dtype, so
# we can use np.concatenate, which is more performant
# than concat_compat
# error: Argument 1 to "concatenate" has incompatible type
# "List[Union[ndarray[Any, Any], ExtensionArray]]";
# expected "Union[_SupportsArray[dtype[Any]],
# _NestedSequence[_SupportsArray[dtype[Any]]]]"
values = np.concatenate(vals, axis=1) # type: ignore[arg-type]
elif is_1d_only_ea_dtype(blk.dtype):
# TODO(EA2D): special-casing not needed with 2D EAs
values = concat_compat(vals, axis=0, ea_compat_axis=True)
values = ensure_block_shape(values, ndim=2)
else:
values = concat_compat(vals, axis=1)
values = ensure_wrapped_if_datetimelike(values)
fastpath = blk.values.dtype == values.dtype
else:
values = _concatenate_join_units(join_units, copy=copy)
fastpath = False
if fastpath:
b = blk.make_block_same_class(values, placement=placement)
else:
b = new_block_2d(values, placement=placement)
blocks.append(b)
return BlockManager(tuple(blocks), axes)
def _maybe_reindex_columns_na_proxy(
axes: list[Index],
mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]],
needs_copy: bool,
) -> list[BlockManager]:
"""
Reindex along columns so that all of the BlockManagers being concatenated
have matching columns.
Columns added in this reindexing have dtype=np.void, indicating they
should be ignored when choosing a column's final dtype.
"""
new_mgrs = []
for mgr, indexers in mgrs_indexers:
# For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
# is a cheap reindexing.
for i, indexer in indexers.items():
mgr = mgr.reindex_indexer(
axes[i],
indexers[i],
axis=i,
copy=False,
only_slice=True, # only relevant for i==0
allow_dups=True,
use_na_proxy=True, # only relevant for i==0
)
if needs_copy and not indexers:
mgr = mgr.copy()
new_mgrs.append(mgr)
return new_mgrs
def _is_homogeneous_mgr(mgr: BlockManager, first_dtype: DtypeObj) -> bool:
"""
Check if this Manager can be treated as a single ndarray.
"""
if mgr.nblocks != 1:
return False
blk = mgr.blocks[0]
if not (blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1):
return False
return blk.dtype == first_dtype
def _concat_homogeneous_fastpath(
mgrs_indexers, shape: Shape, first_dtype: np.dtype
) -> Block:
"""
With single-Block managers with homogeneous dtypes (that can already hold nan),
we avoid [...]
"""
# assumes
# all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
if all(not indexers for _, indexers in mgrs_indexers):
# https://github.com/pandas-dev/pandas/pull/52685#issuecomment-1523287739
arrs = [mgr.blocks[0].values.T for mgr, _ in mgrs_indexers]
arr = np.concatenate(arrs).T
bp = libinternals.BlockPlacement(slice(shape[0]))
nb = new_block_2d(arr, bp)
return nb
arr = np.empty(shape, dtype=first_dtype)
if first_dtype == np.float64:
take_func = libalgos.take_2d_axis0_float64_float64
else:
take_func = libalgos.take_2d_axis0_float32_float32
start = 0
for mgr, indexers in mgrs_indexers:
mgr_len = mgr.shape[1]
end = start + mgr_len
if 0 in indexers:
take_func(
mgr.blocks[0].values,
indexers[0],
arr[:, start:end],
)
else:
# No reindexing necessary, we can copy values directly
arr[:, start:end] = mgr.blocks[0].values
start += mgr_len
bp = libinternals.BlockPlacement(slice(shape[0]))
nb = new_block_2d(arr, bp)
return nb
def _get_combined_plan(
mgrs: list[BlockManager],
) -> list[tuple[BlockPlacement, list[JoinUnit]]]:
plan = []
max_len = mgrs[0].shape[0]
blknos_list = [mgr.blknos for mgr in mgrs]
pairs = libinternals.get_concat_blkno_indexers(blknos_list)
for ind, (blknos, bp) in enumerate(pairs):
# assert bp.is_slice_like
# assert len(bp) > 0
units_for_bp = []
for k, mgr in enumerate(mgrs):
blkno = blknos[k]
nb = _get_block_for_concat_plan(mgr, bp, blkno, max_len=max_len)
unit = JoinUnit(nb)
units_for_bp.append(unit)
plan.append((bp, units_for_bp))
return plan
def _get_block_for_concat_plan(
mgr: BlockManager, bp: BlockPlacement, blkno: int, *, max_len: int
) -> Block:
blk = mgr.blocks[blkno]
# Assertions disabled for performance:
# assert bp.is_slice_like
# assert blkno != -1
# assert (mgr.blknos[bp] == blkno).all()
if len(bp) == len(blk.mgr_locs) and (
blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1
):
nb = blk
else:
ax0_blk_indexer = mgr.blklocs[bp.indexer]
slc = lib.maybe_indices_to_slice(ax0_blk_indexer, max_len)
# TODO: in all extant test cases 2023-04-08 we have a slice here.
# Will this always be the case?
if isinstance(slc, slice):
nb = blk.slice_block_columns(slc)
else:
nb = blk.take_block_columns(slc)
# assert nb.shape == (len(bp), mgr.shape[1])
return nb
class JoinUnit:
def __init__(self, block: Block) -> None:
self.block = block
def __repr__(self) -> str:
return f"{type(self).__name__}({repr(self.block)})"
def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
"""
Check that we are all-NA of a type/dtype that is compatible with this dtype.
Augments `self.is_na` with an additional check of the type of NA values.
"""
if not self.is_na:
return False
blk = self.block
if blk.dtype.kind == "V":
return True
if blk.dtype == object:
values = blk.values
return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
na_value = blk.fill_value
if na_value is NaT and blk.dtype != dtype:
# e.g. we are dt64 and other is td64
# fill_values match but we should not cast blk.values to dtype
# TODO: this will need updating if we ever have non-nano dt64/td64
return False
if na_value is NA and needs_i8_conversion(dtype):
# FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
# e.g. blk.dtype == "Int64" and dtype is td64, we don't want
# to consider these as matching
return False
# TODO: better to use can_hold_element?
return is_valid_na_for_dtype(na_value, dtype)
@cache_readonly
def is_na(self) -> bool:
blk = self.block
if blk.dtype.kind == "V":
return True
if not blk._can_hold_na:
return False
values = blk.values
if values.size == 0:
# GH#39122 this case will return False once deprecation is enforced
return True
if isinstance(values.dtype, SparseDtype):
return False
if values.ndim == 1:
# TODO(EA2D): no need for special case with 2D EAs
val = values[0]
if not is_scalar(val) or not isna(val):
# ideally isna_all would do this short-circuiting
return False
return isna_all(values)
else:
val = values[0][0]
if not is_scalar(val) or not isna(val):
# ideally isna_all would do this short-circuiting
return False
return all(isna_all(row) for row in values)
@cache_readonly
def is_na_after_size_and_isna_all_deprecation(self) -> bool:
"""
Will self.is_na be True after values.size == 0 deprecation and isna_all
deprecation are enforced?
"""
blk = self.block
if blk.dtype.kind == "V":
return True
return False
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
values: ArrayLike
if upcasted_na is None and self.block.dtype.kind != "V":
# No upcasting is necessary
return self.block.values
else:
fill_value = upcasted_na
if self._is_valid_na_for(empty_dtype):
# note: always holds when self.block.dtype.kind == "V"
blk_dtype = self.block.dtype
if blk_dtype == np.dtype("object"):
# we want to avoid filling with np.nan if we are
# using None; we already know that we are all
# nulls
values = cast(np.ndarray, self.block.values)
if values.size and values[0, 0] is None:
fill_value = None
return make_na_array(empty_dtype, self.block.shape, fill_value)
return self.block.values
def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
"""
Concatenate values from several join units along axis=1.
"""
empty_dtype, empty_dtype_future = _get_empty_dtype(join_units)
has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
to_concat = [
ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
for ju in join_units
]
if any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
# TODO(EA2D): special case not needed if all EAs used HybridBlocks
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[int, slice]"
to_concat = [
t
if is_1d_only_ea_dtype(t.dtype)
else t[0, :] # type: ignore[call-overload]
for t in to_concat
]
concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
concat_values = ensure_block_shape(concat_values, 2)
else:
concat_values = concat_compat(to_concat, axis=1)
if empty_dtype != empty_dtype_future:
if empty_dtype == concat_values.dtype:
# GH#39122, GH#40893
warnings.warn(
"The behavior of DataFrame concatenation with empty or all-NA "
"entries is deprecated. In a future version, this will no longer "
"exclude empty or all-NA columns when determining the result dtypes. "
"To retain the old behavior, exclude the relevant entries before "
"the concat operation.",
FutureWarning,
stacklevel=find_stack_level(),
)
return concat_values
def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
"""
Find the NA value to go with this dtype.
"""
if isinstance(dtype, ExtensionDtype):
return dtype.na_value
elif dtype.kind in "mM":
return dtype.type("NaT")
elif dtype.kind in "fc":
return dtype.type("NaN")
elif dtype.kind == "b":
# different from missing.na_value_for_dtype
return None
elif dtype.kind in "iu":
if not has_none_blocks:
# different from missing.na_value_for_dtype
return None
return np.nan
elif dtype.kind == "O":
return np.nan
raise NotImplementedError
def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]:
"""
Return the dtypes to use when concatenating the specified units.
Returns
-------
tuple[DtypeObj, DtypeObj]
The dtype under current behavior, and the dtype that will apply once the
empty/all-NA deprecation (GH#39122, GH#40893) is enforced.
"""
if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]):
empty_dtype = join_units[0].block.dtype
return empty_dtype, empty_dtype
has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
if not len(dtypes):
dtypes = [
unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
]
dtype = find_common_type(dtypes)
if has_none_blocks:
dtype = ensure_dtype_can_hold_na(dtype)
dtype_future = dtype
if len(dtypes) != len(join_units):
dtypes_future = [
unit.block.dtype
for unit in join_units
if not unit.is_na_after_size_and_isna_all_deprecation
]
if not len(dtypes_future):
dtypes_future = [
unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
]
if len(dtypes) != len(dtypes_future):
dtype_future = find_common_type(dtypes_future)
if has_none_blocks:
dtype_future = ensure_dtype_can_hold_na(dtype_future)
return dtype, dtype_future
def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
"""
Check if the join units consist of blocks of uniform type that can
be concatenated using Block.concat_same_type instead of the generic
_concatenate_join_units (which uses `concat_compat`).
"""
first = join_units[0].block
if first.dtype.kind == "V":
return False
return (
# exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
all(type(ju.block) is type(first) for ju in join_units)
and
# e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
all(
ju.block.dtype == first.dtype
# GH#42092 we only want the dtype_equal check for non-numeric blocks
# (for now, may change but that would need a deprecation)
or ju.block.dtype.kind in "iub"
for ju in join_units
)
and
# no blocks that would get missing values (can lead to type upcasts)
# unless we're an extension dtype.
all(not ju.is_na or ju.block.is_extension for ju in join_units)
)
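
The FutureWarning raised in _concatenate_join_units is observable from plain pd.concat. A sketch of one case that trips it, assuming pandas 2.1.x behavior (the release that enforces the deprecation is not pinned down here):

import pandas as pd

dates = pd.DataFrame({"a": pd.to_datetime(["2020-01-01", "2020-01-02"])})
all_na = pd.DataFrame({"a": [None, None]})  # all-NA object column

# Warns: all-NA entries are currently excluded when choosing the result dtype.
out = pd.concat([dates, all_na])
print(out["a"].dtype)  # datetime64[ns] today; object once the change is enforced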

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File: pandas/core/internals/ops.py

@@ -0,0 +1,154 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
NamedTuple,
)
from pandas.core.dtypes.common import is_1d_only_ea_dtype
if TYPE_CHECKING:
from collections.abc import Iterator
from pandas._libs.internals import BlockPlacement
from pandas._typing import ArrayLike
from pandas.core.internals.blocks import Block
from pandas.core.internals.managers import BlockManager
class BlockPairInfo(NamedTuple):
lvals: ArrayLike
rvals: ArrayLike
locs: BlockPlacement
left_ea: bool
right_ea: bool
rblk: Block
def _iter_block_pairs(
left: BlockManager, right: BlockManager
) -> Iterator[BlockPairInfo]:
# At this point we have already checked the parent DataFrames for
# assert rframe._indexed_same(lframe)
for blk in left.blocks:
locs = blk.mgr_locs
blk_vals = blk.values
left_ea = blk_vals.ndim == 1
rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True)
# Assertions are disabled for performance, but should hold:
# if left_ea:
# assert len(locs) == 1, locs
# assert len(rblks) == 1, rblks
# assert rblks[0].shape[0] == 1, rblks[0].shape
for rblk in rblks:
right_ea = rblk.values.ndim == 1
lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk)
yield info
def operate_blockwise(
left: BlockManager, right: BlockManager, array_op
) -> BlockManager:
# At this point we have already checked the parent DataFrames for
# assert rframe._indexed_same(lframe)
res_blks: list[Block] = []
for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right):
res_values = array_op(lvals, rvals)
if (
left_ea
and not right_ea
and hasattr(res_values, "reshape")
and not is_1d_only_ea_dtype(res_values.dtype)
):
res_values = res_values.reshape(1, -1)
nbs = rblk._split_op_result(res_values)
# Assertions are disabled for performance, but should hold:
# if right_ea or left_ea:
# assert len(nbs) == 1
# else:
# assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)
_reset_block_mgr_locs(nbs, locs)
res_blks.extend(nbs)
# Assertions are disabled for performance, but should hold:
# slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array}
# nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks)
# assert nlocs == len(left.items), (nlocs, len(left.items))
# assert len(slocs) == nlocs, (len(slocs), nlocs)
# assert slocs == set(range(nlocs)), slocs
new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False)
return new_mgr
def _reset_block_mgr_locs(nbs: list[Block], locs) -> None:
"""
Reset mgr_locs to correspond to our original DataFrame.
"""
for nb in nbs:
nblocs = locs[nb.mgr_locs.indexer]
nb.mgr_locs = nblocs
# Assertions are disabled for performance, but should hold:
# assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape)
# assert all(x in locs.as_array for x in nb.mgr_locs.as_array)
def _get_same_shape_values(
lblk: Block, rblk: Block, left_ea: bool, right_ea: bool
) -> tuple[ArrayLike, ArrayLike]:
"""
Slice lblk.values to align with rblk. Squeeze if we have EAs.
"""
lvals = lblk.values
rvals = rblk.values
# Require that the indexing into lvals be slice-like
assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs
# TODO(EA2D): with 2D EAs only this first clause would be needed
if not (left_ea or right_ea):
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[Union[ndarray, slice], slice]"
lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload]
assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
elif left_ea and right_ea:
assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
elif right_ea:
# lvals are 2D, rvals are 1D
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[Union[ndarray, slice], slice]"
lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload]
assert lvals.shape[0] == 1, lvals.shape
lvals = lvals[0, :]
else:
# lvals are 1D, rvals are 2D
assert rvals.shape[0] == 1, rvals.shape
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[int, slice]"
rvals = rvals[0, :] # type: ignore[call-overload]
return lvals, rvals
def blockwise_all(left: BlockManager, right: BlockManager, op) -> bool:
"""
Blockwise `all` reduction.
"""
for info in _iter_block_pairs(left, right):
res = op(info.lvals, info.rvals)
if not res:
return False
return True
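
operate_blockwise above is where aligned DataFrame arithmetic bottoms out: each left-hand block is paired with the overlapping right-hand blocks via _slice_take_blocks_ax0, array_op is applied to each pair of value arrays, and the result blocks are stitched back together with corrected mgr_locs. From the public API this is simply (an illustration, with arbitrary data):

import pandas as pd

left = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
right = pd.DataFrame({"a": [10, 20], "b": [0.5, 0.5]})

# The frames are already aligned, so the addition dispatches block-by-block:
# the int64 block and the float64 block are each handled in one vectorized op.
print(left + right)

blockwise_all is the reduction analogue; pandas' BlockManager uses it with an array-equivalence op to implement the value check behind equals.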