commit 2fc0d000b6 (parent e1b817252c)
2025-09-07 22:09:54 +02:00
7796 changed files with 2159515 additions and 933 deletions


@ -0,0 +1,239 @@
from __future__ import annotations
import functools
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
if TYPE_CHECKING:
from pandas._typing import Scalar
import numpy as np
from pandas.compat._optional import import_optional_dependency
@functools.cache
def generate_apply_looper(func, nopython=True, nogil=True, parallel=False):
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
nb_compat_func = numba.extending.register_jitable(func)
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def nb_looper(values, axis):
# Operate on the first row/col in order to get
# the output shape
if axis == 0:
first_elem = values[:, 0]
dim0 = values.shape[1]
else:
first_elem = values[0]
dim0 = values.shape[0]
res0 = nb_compat_func(first_elem)
# Use np.asarray to get shape for
# https://github.com/numba/numba/issues/4202#issuecomment-1185981507
buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape
if axis == 0:
buf_shape = buf_shape[::-1]
buff = np.empty(buf_shape)
if axis == 1:
buff[0] = res0
for i in numba.prange(1, values.shape[0]):
buff[i] = nb_compat_func(values[i])
else:
buff[:, 0] = res0
for j in numba.prange(1, values.shape[1]):
buff[:, j] = nb_compat_func(values[:, j])
return buff
return nb_looper
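For orientation, here is a plain-NumPy sketch of what the generated looper computes, without numba (illustration only; apply_looper_reference and the row-sum usage are made up for the example):

import numpy as np

def apply_looper_reference(func, values, axis):
    # Mirror of nb_looper above: evaluate the kernel on the first row/column
    # to learn the output shape, then fill in the remaining slots.
    if axis == 0:
        first_elem = values[:, 0]
        dim0 = values.shape[1]
    else:
        first_elem = values[0]
        dim0 = values.shape[0]
    res0 = func(first_elem)
    buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape
    if axis == 0:
        buf_shape = buf_shape[::-1]
    buff = np.empty(buf_shape)
    if axis == 1:
        buff[0] = res0
        for i in range(1, values.shape[0]):
            buff[i] = func(values[i])
    else:
        buff[:, 0] = res0
        for j in range(1, values.shape[1]):
            buff[:, j] = func(values[:, j])
    return buff

values = np.arange(12.0).reshape(3, 4)
print(apply_looper_reference(np.sum, values, axis=1))  # one sum per row, shape (3, 1)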
@functools.cache
def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel):
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
if is_grouped_kernel:
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def column_looper(
values: np.ndarray,
labels: np.ndarray,
ngroups: int,
min_periods: int,
*args,
):
result = np.empty((values.shape[0], ngroups), dtype=result_dtype)
na_positions = {}
for i in numba.prange(values.shape[0]):
output, na_pos = func(
values[i], result_dtype, labels, ngroups, min_periods, *args
)
result[i] = output
if len(na_pos) > 0:
na_positions[i] = np.array(na_pos)
return result, na_positions
else:
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def column_looper(
values: np.ndarray,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
*args,
):
result = np.empty((values.shape[0], len(start)), dtype=result_dtype)
na_positions = {}
for i in numba.prange(values.shape[0]):
output, na_pos = func(
values[i], result_dtype, start, end, min_periods, *args
)
result[i] = output
if len(na_pos) > 0:
na_positions[i] = np.array(na_pos)
return result, na_positions
return column_looper
default_dtype_mapping: dict[np.dtype, Any] = {
np.dtype("int8"): np.int64,
np.dtype("int16"): np.int64,
np.dtype("int32"): np.int64,
np.dtype("int64"): np.int64,
np.dtype("uint8"): np.uint64,
np.dtype("uint16"): np.uint64,
np.dtype("uint32"): np.uint64,
np.dtype("uint64"): np.uint64,
np.dtype("float32"): np.float64,
np.dtype("float64"): np.float64,
np.dtype("complex64"): np.complex128,
np.dtype("complex128"): np.complex128,
}
# TODO: Preserve complex dtypes
float_dtype_mapping: dict[np.dtype, Any] = {
np.dtype("int8"): np.float64,
np.dtype("int16"): np.float64,
np.dtype("int32"): np.float64,
np.dtype("int64"): np.float64,
np.dtype("uint8"): np.float64,
np.dtype("uint16"): np.float64,
np.dtype("uint32"): np.float64,
np.dtype("uint64"): np.float64,
np.dtype("float32"): np.float64,
np.dtype("float64"): np.float64,
np.dtype("complex64"): np.float64,
np.dtype("complex128"): np.float64,
}
identity_dtype_mapping: dict[np.dtype, Any] = {
np.dtype("int8"): np.int8,
np.dtype("int16"): np.int16,
np.dtype("int32"): np.int32,
np.dtype("int64"): np.int64,
np.dtype("uint8"): np.uint8,
np.dtype("uint16"): np.uint16,
np.dtype("uint32"): np.uint32,
np.dtype("uint64"): np.uint64,
np.dtype("float32"): np.float32,
np.dtype("float64"): np.float64,
np.dtype("complex64"): np.complex64,
np.dtype("complex128"): np.complex128,
}
def generate_shared_aggregator(
func: Callable[..., Scalar],
dtype_mapping: dict[np.dtype, np.dtype],
is_grouped_kernel: bool,
nopython: bool,
nogil: bool,
parallel: bool,
):
"""
Generate a Numba function that loops over the columns of a 2D object and
applies a 1D numba kernel over each column.
Parameters
----------
func : function
aggregation function to be applied to each column
dtype_mapping: dict or None
If not None, maps a dtype to a result dtype.
Otherwise, will fall back to default mapping.
is_grouped_kernel: bool, default False
Whether func operates using the group labels (True)
or using the start/end arrays (False).
If True, you also need to pass the number of groups to this function.
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
Returns
-------
Numba function
"""
# A wrapper around the looper function,
# to dispatch based on dtype since numba is unable to do that in nopython mode
# It also post-processes the result by inserting NaNs where the number of observations
# is less than min_periods
# Cannot do this in numba nopython mode
# (you'll run into type-unification error when you cast int -> float)
def looper_wrapper(
values,
start=None,
end=None,
labels=None,
ngroups=None,
min_periods: int = 0,
**kwargs,
):
result_dtype = dtype_mapping[values.dtype]
column_looper = make_looper(
func, result_dtype, is_grouped_kernel, nopython, nogil, parallel
)
# Need to unpack kwargs since numba only supports *args
if is_grouped_kernel:
result, na_positions = column_looper(
values, labels, ngroups, min_periods, *kwargs.values()
)
else:
result, na_positions = column_looper(
values, start, end, min_periods, *kwargs.values()
)
if result.dtype.kind == "i":
# Look if na_positions is not empty
# If so, convert the whole block
# This is OK since int dtype cannot hold nan,
# so if min_periods not satisfied for 1 col, it is not satisfied for
# all columns at that index
for na_pos in na_positions.values():
if len(na_pos) > 0:
result = result.astype("float64")
break
# TODO: Optimize this
for i, na_pos in na_positions.items():
if len(na_pos) > 0:
result[i, na_pos] = np.nan
return result
return looper_wrapper
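A hedged usage sketch of the factory above (it assumes numba is installed and that these modules live at their usual pandas paths; sliding_mean is the kernel added later in this commit):

import numpy as np

from pandas.core._numba.executor import (
    float_dtype_mapping,
    generate_shared_aggregator,
)
from pandas.core._numba.kernels import sliding_mean

# Build a looper around the sliding_mean kernel.
rolling_mean = generate_shared_aggregator(
    sliding_mean,
    dtype_mapping=float_dtype_mapping,
    is_grouped_kernel=False,
    nopython=True,
    nogil=True,
    parallel=False,
)

# Two "columns" laid out as rows of a 2D block; the start/end arrays describe
# windows equivalent to rolling(3, min_periods=1) over 5 observations.
values = np.arange(10.0).reshape(2, 5)
start = np.array([0, 0, 0, 1, 2], dtype=np.int64)
end = np.array([1, 2, 3, 4, 5], dtype=np.int64)
print(rolling_mean(values, start=start, end=end, min_periods=1))
# [[0.  0.5 1.  2.  3. ]
#  [5.  5.5 6.  7.  8. ]]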


@ -0,0 +1,585 @@
# Disable type checking for this module since numba's internals
# are not typed, and we use numba's internals via its extension API
# mypy: ignore-errors
"""
Utility classes/functions to let numba recognize
pandas Index/Series/DataFrame
Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py
"""
from __future__ import annotations
from contextlib import contextmanager
import operator
import numba
from numba import types
from numba.core import cgutils
from numba.core.datamodel import models
from numba.core.extending import (
NativeValue,
box,
lower_builtin,
make_attribute_wrapper,
overload,
overload_attribute,
overload_method,
register_model,
type_callable,
typeof_impl,
unbox,
)
from numba.core.imputils import impl_ret_borrowed
import numpy as np
from pandas._libs import lib
from pandas.core.indexes.base import Index
from pandas.core.indexing import _iLocIndexer
from pandas.core.internals import SingleBlockManager
from pandas.core.series import Series
# Helper function to hack around the fact that Index casts numpy string dtype to object
#
# The idea is to set an attribute on an Index called _numba_data
# that is the original data, or the object data cast to numpy string dtype,
# with a context manager that unsets it afterwards
@contextmanager
def set_numba_data(index: Index):
numba_data = index._data
if numba_data.dtype in (object, "string"):
numba_data = np.asarray(numba_data)
if not lib.is_string_array(numba_data):
raise ValueError(
"The numba engine only supports using string or numeric column names"
)
numba_data = numba_data.astype("U")
try:
index._numba_data = numba_data
yield index
finally:
del index._numba_data
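A minimal sketch of how this context manager is meant to be used (illustration only; pandas does this internally before handing index data to a numba-jitted user function):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])
with set_numba_data(df.index) as index:
    # Inside the block the Index carries _numba_data, a fixed-width numpy "U"
    # string array that numba can type; the attribute is deleted again on exit.
    print(index._numba_data.dtype)  # <U1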
# TODO: Range index support
# (this currently lowers OK, but does not round-trip)
class IndexType(types.Type):
"""
The type class for Index objects.
"""
def __init__(self, dtype, layout, pyclass: any) -> None:
self.pyclass = pyclass
name = f"index({dtype}, {layout})"
self.dtype = dtype
self.layout = layout
super().__init__(name)
@property
def key(self):
return self.pyclass, self.dtype, self.layout
@property
def as_array(self):
return types.Array(self.dtype, 1, self.layout)
def copy(self, dtype=None, ndim: int = 1, layout=None):
assert ndim == 1
if dtype is None:
dtype = self.dtype
layout = layout or self.layout
return type(self)(dtype, layout, self.pyclass)
class SeriesType(types.Type):
"""
The type class for Series objects.
"""
def __init__(self, dtype, index, namety) -> None:
assert isinstance(index, IndexType)
self.dtype = dtype
self.index = index
self.values = types.Array(self.dtype, 1, "C")
self.namety = namety
name = f"series({dtype}, {index}, {namety})"
super().__init__(name)
@property
def key(self):
return self.dtype, self.index, self.namety
@property
def as_array(self):
return self.values
def copy(self, dtype=None, ndim: int = 1, layout: str = "C"):
assert ndim == 1
assert layout == "C"
if dtype is None:
dtype = self.dtype
return type(self)(dtype, self.index, self.namety)
@typeof_impl.register(Index)
def typeof_index(val, c):
"""
This assumes that only strings are present in an object dtype
index (you should check this before it gets lowered down to numba).
"""
# arrty = typeof_impl(val._data, c)
arrty = typeof_impl(val._numba_data, c)
assert arrty.ndim == 1
return IndexType(arrty.dtype, arrty.layout, type(val))
@typeof_impl.register(Series)
def typeof_series(val, c):
index = typeof_impl(val.index, c)
arrty = typeof_impl(val.values, c)
namety = typeof_impl(val.name, c)
assert arrty.ndim == 1
assert arrty.layout == "C"
return SeriesType(arrty.dtype, index, namety)
@type_callable(Series)
def type_series_constructor(context):
def typer(data, index, name=None):
if isinstance(index, IndexType) and isinstance(data, types.Array):
assert data.ndim == 1
if name is None:
name = types.intp
return SeriesType(data.dtype, index, name)
return typer
@type_callable(Index)
def type_index_constructor(context):
def typer(data, hashmap=None):
if isinstance(data, types.Array):
assert data.layout == "C"
assert data.ndim == 1
assert hashmap is None or isinstance(hashmap, types.DictType)
return IndexType(data.dtype, layout=data.layout, pyclass=Index)
return typer
# Backend extensions for Index and Series and Frame
@register_model(IndexType)
class IndexModel(models.StructModel):
def __init__(self, dmm, fe_type) -> None:
# We don't want the numpy string scalar type in our hashmap
members = [
("data", fe_type.as_array),
# This is an attempt to emulate our hashtable code with a numba
# typed dict
# It maps from values in the index to their integer positions in the array
("hashmap", types.DictType(fe_type.dtype, types.intp)),
# Pointer to the Index object this was created from, or that it
# boxes to
# https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1
("parent", types.pyobject),
]
models.StructModel.__init__(self, dmm, fe_type, members)
@register_model(SeriesType)
class SeriesModel(models.StructModel):
def __init__(self, dmm, fe_type) -> None:
members = [
("index", fe_type.index),
("values", fe_type.as_array),
("name", fe_type.namety),
]
models.StructModel.__init__(self, dmm, fe_type, members)
make_attribute_wrapper(IndexType, "data", "_data")
make_attribute_wrapper(IndexType, "hashmap", "hashmap")
make_attribute_wrapper(SeriesType, "index", "index")
make_attribute_wrapper(SeriesType, "values", "values")
make_attribute_wrapper(SeriesType, "name", "name")
@lower_builtin(Series, types.Array, IndexType)
def pdseries_constructor(context, builder, sig, args):
data, index = args
series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
series.index = index
series.values = data
series.name = context.get_constant(types.intp, 0)
return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())
@lower_builtin(Series, types.Array, IndexType, types.intp)
@lower_builtin(Series, types.Array, IndexType, types.float64)
@lower_builtin(Series, types.Array, IndexType, types.unicode_type)
def pdseries_constructor_with_name(context, builder, sig, args):
data, index, name = args
series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
series.index = index
series.values = data
series.name = name
return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())
@lower_builtin(Index, types.Array, types.DictType, types.pyobject)
def index_constructor_2arg(context, builder, sig, args):
(data, hashmap, parent) = args
index = cgutils.create_struct_proxy(sig.return_type)(context, builder)
index.data = data
index.hashmap = hashmap
index.parent = parent
return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())
@lower_builtin(Index, types.Array, types.DictType)
def index_constructor_2arg_parent(context, builder, sig, args):
# Basically the same as index_constructor_1arg, but also lets you specify
# the hashmap
(data, hashmap) = args
index = cgutils.create_struct_proxy(sig.return_type)(context, builder)
index.data = data
index.hashmap = hashmap
return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())
@lower_builtin(Index, types.Array)
def index_constructor_1arg(context, builder, sig, args):
from numba.typed import Dict
key_type = sig.return_type.dtype
value_type = types.intp
def index_impl(data):
return Index(data, Dict.empty(key_type, value_type))
return context.compile_internal(builder, index_impl, sig, args)
# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type
# (regular string)
def maybe_cast_str(x):
# Dummy function that numba can overload
pass
@overload(maybe_cast_str)
def maybe_cast_str_impl(x):
"""Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string).
Is a no-op for other types."""
if isinstance(x, types.UnicodeCharSeq):
return lambda x: str(x)
else:
return lambda x: x
@unbox(IndexType)
def unbox_index(typ, obj, c):
"""
Convert an Index object to a native structure.
Note: Object dtype is not allowed here
"""
data_obj = c.pyapi.object_getattr_string(obj, "_numba_data")
index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
# If we see an object array, assume it's been validated as only containing strings
# We still need to do the conversion though
index.data = c.unbox(typ.as_array, data_obj).value
typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict))
# Create an empty typed dict in numba for the hashmap for indexing
# equiv of numba.typed.Dict.empty(typ.dtype, types.intp)
arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype))
intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp))
hashmap_obj = c.pyapi.call_method(
typed_dict_obj, "empty", (arr_type_obj, intp_type_obj)
)
index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value
# Set the parent for speedy boxing.
index.parent = obj
# Decrefs
c.pyapi.decref(data_obj)
c.pyapi.decref(arr_type_obj)
c.pyapi.decref(intp_type_obj)
c.pyapi.decref(typed_dict_obj)
return NativeValue(index._getvalue())
@unbox(SeriesType)
def unbox_series(typ, obj, c):
"""
Convert a Series object to a native structure.
"""
index_obj = c.pyapi.object_getattr_string(obj, "index")
values_obj = c.pyapi.object_getattr_string(obj, "values")
name_obj = c.pyapi.object_getattr_string(obj, "name")
series = cgutils.create_struct_proxy(typ)(c.context, c.builder)
series.index = c.unbox(typ.index, index_obj).value
series.values = c.unbox(typ.values, values_obj).value
series.name = c.unbox(typ.namety, name_obj).value
# Decrefs
c.pyapi.decref(index_obj)
c.pyapi.decref(values_obj)
c.pyapi.decref(name_obj)
return NativeValue(series._getvalue())
@box(IndexType)
def box_index(typ, val, c):
"""
Convert a native index structure to an Index object.
If our native index is of a numpy string dtype, we'll cast it to
object.
"""
# First build a Numpy array object, then wrap it in a Index
index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
res = cgutils.alloca_once_value(c.builder, index.parent)
# Does parent exist?
# (it means already boxed once, or Index same as original df.index or df.columns)
# xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17
with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (
has_parent,
otherwise,
):
with has_parent:
c.pyapi.incref(index.parent)
with otherwise:
# TODO: preserve the original class for the index
# Also need to preserve the name of the Index
# class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass))
class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index))
array_obj = c.box(typ.as_array, index.data)
if isinstance(typ.dtype, types.UnicodeCharSeq):
# We converted to numpy string dtype, convert back
# to object since _simple_new won't do that for us
object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object"))
array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,))
c.pyapi.decref(object_str_obj)
# this is basically Index._simple_new(array_obj, name_obj) in python
index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,))
index.parent = index_obj
c.builder.store(index_obj, res)
# Decrefs
c.pyapi.decref(class_obj)
c.pyapi.decref(array_obj)
return c.builder.load(res)
@box(SeriesType)
def box_series(typ, val, c):
"""
Convert a native series structure to a Series object.
"""
series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
series_const_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series._from_mgr))
mgr_const_obj = c.pyapi.unserialize(
c.pyapi.serialize_object(SingleBlockManager.from_array)
)
index_obj = c.box(typ.index, series.index)
array_obj = c.box(typ.as_array, series.values)
name_obj = c.box(typ.namety, series.name)
# This is basically equivalent to
# pd.Series(data=array_obj, index=index_obj)
# To improve perf, we will construct the Series from a manager
# object to avoid checks.
# We'll also set the name attribute manually to avoid validation
mgr_obj = c.pyapi.call_function_objargs(
mgr_const_obj,
(
array_obj,
index_obj,
),
)
mgr_axes_obj = c.pyapi.object_getattr_string(mgr_obj, "axes")
# Series._constructor_from_mgr(mgr, axes)
series_obj = c.pyapi.call_function_objargs(
series_const_obj, (mgr_obj, mgr_axes_obj)
)
c.pyapi.object_setattr_string(series_obj, "_name", name_obj)
# Decrefs
c.pyapi.decref(series_const_obj)
c.pyapi.decref(mgr_axes_obj)
c.pyapi.decref(mgr_obj)
c.pyapi.decref(mgr_const_obj)
c.pyapi.decref(index_obj)
c.pyapi.decref(array_obj)
c.pyapi.decref(name_obj)
return series_obj
# Add common series reductions (e.g. mean, sum),
# and also add common binops (e.g. add, sub, mul, div)
def generate_series_reduction(ser_reduction, ser_method):
@overload_method(SeriesType, ser_reduction)
def series_reduction(series):
def series_reduction_impl(series):
return ser_method(series.values)
return series_reduction_impl
return series_reduction
def generate_series_binop(binop):
@overload(binop)
def series_binop(series1, value):
if isinstance(series1, SeriesType):
if isinstance(value, SeriesType):
def series_binop_impl(series1, series2):
# TODO: Check index matching?
return Series(
binop(series1.values, series2.values),
series1.index,
series1.name,
)
return series_binop_impl
else:
def series_binop_impl(series1, value):
return Series(
binop(series1.values, value), series1.index, series1.name
)
return series_binop_impl
return series_binop
series_reductions = [
("sum", np.sum),
("mean", np.mean),
# Disabled due to discrepancies between numba std. dev
# and pandas std. dev (no way to specify dof)
# ("std", np.std),
# ("var", np.var),
("min", np.min),
("max", np.max),
]
for reduction, reduction_method in series_reductions:
generate_series_reduction(reduction, reduction_method)
series_binops = [operator.add, operator.sub, operator.mul, operator.truediv]
for ser_binop in series_binops:
generate_series_binop(ser_binop)
# get_loc on Index
@overload_method(IndexType, "get_loc")
def index_get_loc(index, item):
def index_get_loc_impl(index, item):
# Initialize the hash table if not initialized
if len(index.hashmap) == 0:
for i, val in enumerate(index._data):
index.hashmap[val] = i
return index.hashmap[item]
return index_get_loc_impl
# Indexing for Series/Index
@overload(operator.getitem)
def series_indexing(series, item):
if isinstance(series, SeriesType):
def series_getitem(series, item):
loc = series.index.get_loc(item)
return series.iloc[loc]
return series_getitem
@overload(operator.getitem)
def index_indexing(index, idx):
if isinstance(index, IndexType):
def index_getitem(index, idx):
return index._data[idx]
return index_getitem
class IlocType(types.Type):
def __init__(self, obj_type) -> None:
self.obj_type = obj_type
name = f"iLocIndexer({obj_type})"
super().__init__(name=name)
@property
def key(self):
return self.obj_type
@typeof_impl.register(_iLocIndexer)
def typeof_iloc(val, c):
objtype = typeof_impl(val.obj, c)
return IlocType(objtype)
@type_callable(_iLocIndexer)
def type_iloc_constructor(context):
def typer(obj):
if isinstance(obj, SeriesType):
return IlocType(obj)
return typer
@lower_builtin(_iLocIndexer, SeriesType)
def iloc_constructor(context, builder, sig, args):
(obj,) = args
iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder)
iloc_indexer.obj = obj
return impl_ret_borrowed(
context, builder, sig.return_type, iloc_indexer._getvalue()
)
@register_model(IlocType)
class ILocModel(models.StructModel):
def __init__(self, dmm, fe_type) -> None:
members = [("obj", fe_type.obj_type)]
models.StructModel.__init__(self, dmm, fe_type, members)
make_attribute_wrapper(IlocType, "obj", "obj")
@overload_attribute(SeriesType, "iloc")
def series_iloc(series):
def get(series):
return _iLocIndexer(series)
return get
@overload(operator.getitem)
def iloc_getitem(iloc_indexer, i):
if isinstance(iloc_indexer, IlocType):
def getitem_impl(iloc_indexer, i):
return iloc_indexer.obj.values[i]
return getitem_impl
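Taken together, these typing/boxing hooks are what allow a jitted user function to receive real Series objects. A hedged end-to-end sketch (it assumes numba is installed and a pandas version whose DataFrame.apply accepts engine="numba"):

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

# Each row arrives in the jitted function as a Series; label lookup goes
# through the get_loc/getitem overloads defined above.
out = df.apply(lambda row: row["a"] + row["b"], axis=1, engine="numba")
print(out)
# 0    4.0
# 1    6.0
# dtype: float64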


@ -0,0 +1,27 @@
from pandas.core._numba.kernels.mean_ import (
grouped_mean,
sliding_mean,
)
from pandas.core._numba.kernels.min_max_ import (
grouped_min_max,
sliding_min_max,
)
from pandas.core._numba.kernels.sum_ import (
grouped_sum,
sliding_sum,
)
from pandas.core._numba.kernels.var_ import (
grouped_var,
sliding_var,
)
__all__ = [
"sliding_mean",
"grouped_mean",
"sliding_sum",
"grouped_sum",
"sliding_var",
"grouped_var",
"sliding_min_max",
"grouped_min_max",
]


@ -0,0 +1,196 @@
"""
Numba 1D mean kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
import numpy as np
from pandas.core._numba.kernels.shared import is_monotonic_increasing
from pandas.core._numba.kernels.sum_ import grouped_kahan_sum
if TYPE_CHECKING:
from pandas._typing import npt
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_mean(
val: float,
nobs: int,
sum_x: float,
neg_ct: int,
compensation: float,
num_consecutive_same_value: int,
prev_value: float,
) -> tuple[int, float, int, float, int, float]:
if not np.isnan(val):
nobs += 1
y = val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
if val < 0:
neg_ct += 1
if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val
return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value
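The y/t/compensation dance above is Kahan (compensated) summation; a quick float64 demonstration of why it is worth the extra bookkeeping (pure-Python illustration only):

import numpy as np

values = np.concatenate(([1e16], np.ones(10_000)))  # one huge value, many small ones

naive = 0.0
total, comp = 0.0, 0.0
for val in values:
    naive += val
    # Same update as in add_mean/add_sum: carry the rounding error forward.
    y = val - comp
    t = total + y
    comp = t - total - y
    total = t

print(naive - 1e16)  # 0.0 -- every small addition was rounded away
print(total - 1e16)  # close to 10000.0 -- the compensation recovers them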
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_mean(
val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
) -> tuple[int, float, int, float]:
if not np.isnan(val):
nobs -= 1
y = -val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
if val < 0:
neg_ct -= 1
return nobs, sum_x, neg_ct, compensation
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_mean(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
N = len(start)
nobs = 0
sum_x = 0.0
neg_ct = 0
compensation_add = 0.0
compensation_remove = 0.0
is_monotonic_increasing_bounds = is_monotonic_increasing(
start
) and is_monotonic_increasing(end)
output = np.empty(N, dtype=result_dtype)
for i in range(N):
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
(
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_mean(
val,
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
else:
for j in range(start[i - 1], s):
val = values[j]
nobs, sum_x, neg_ct, compensation_remove = remove_mean(
val, nobs, sum_x, neg_ct, compensation_remove
)
for j in range(end[i - 1], e):
val = values[j]
(
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_mean(
val,
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
if nobs >= min_periods and nobs > 0:
result = sum_x / nobs
if num_consecutive_same_value >= nobs:
result = prev_value
elif neg_ct == 0 and result < 0:
result = 0
elif neg_ct == nobs and result > 0:
result = 0
else:
result = np.nan
output[i] = result
if not is_monotonic_increasing_bounds:
nobs = 0
sum_x = 0.0
neg_ct = 0
compensation_remove = 0.0
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_mean(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
)
# Post-processing, replace sums that don't satisfy min_periods
for lab in range(ngroups):
nobs = nobs_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
sum_x = output[lab]
if nobs >= min_periods:
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = np.nan
result /= nobs
output[lab] = result
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos


@ -0,0 +1,125 @@
"""
Numba 1D min/max kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_min_max(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
is_max: bool,
) -> tuple[np.ndarray, list[int]]:
N = len(start)
nobs = 0
output = np.empty(N, dtype=result_dtype)
na_pos = []
# Use deque once numba supports it
# https://github.com/numba/numba/issues/7417
Q: list = []
W: list = []
for i in range(N):
curr_win_size = end[i] - start[i]
if i == 0:
st = start[i]
else:
st = end[i - 1]
for k in range(st, end[i]):
ai = values[k]
if not np.isnan(ai):
nobs += 1
elif is_max:
ai = -np.inf
else:
ai = np.inf
# Discard previous entries if we find new min or max
if is_max:
while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
Q.pop()
else:
while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
Q.pop()
Q.append(k)
W.append(k)
# Discard entries outside and left of current window
while Q and Q[0] <= start[i] - 1:
Q.pop(0)
while W and W[0] <= start[i] - 1:
if not np.isnan(values[W[0]]):
nobs -= 1
W.pop(0)
# Save output based on index in input value array
if Q and curr_win_size > 0 and nobs >= min_periods:
output[i] = values[Q[0]]
else:
if values.dtype.kind != "i":
output[i] = np.nan
else:
na_pos.append(i)
return output, na_pos
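The Q/W lists above implement the classic monotonic-queue trick (plain lists are used because numba does not support a typed deque yet, per the linked issue). A plain-Python sketch of the core idea, with a made-up helper name, for illustration only:

from collections import deque

def sliding_max_reference(values, window):
    # Monotonic deque of indices whose values are decreasing; the front is the max.
    out = []
    q = deque()
    for i, val in enumerate(values):
        while q and values[q[-1]] <= val:
            q.pop()              # val dominates everything smaller behind it
        q.append(i)
        if q[0] <= i - window:
            q.popleft()          # drop indices that fell out of the window
        if i >= window - 1:
            out.append(values[q[0]])
    return out

print(sliding_max_reference([3, 1, 4, 1, 5, 9, 2, 6], window=3))
# [4, 4, 5, 9, 9, 9]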
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_min_max(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
is_max: bool,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs = np.zeros(ngroups, dtype=np.int64)
na_pos = []
output = np.empty(ngroups, dtype=result_dtype)
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
continue
if values.dtype.kind == "i" or not np.isnan(val):
nobs[lab] += 1
else:
# NaN value cannot be a min/max value
continue
if nobs[lab] == 1:
# First element in group, set output equal to this
output[lab] = val
continue
if is_max:
if val > output[lab]:
output[lab] = val
else:
if val < output[lab]:
output[lab] = val
# Set labels that don't satisfy min_periods as np.nan
for lab, count in enumerate(nobs):
if count < min_periods:
na_pos.append(lab)
return output, na_pos


@ -0,0 +1,29 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
if TYPE_CHECKING:
import numpy as np
@numba.jit(
# error: Any? not callable
numba.boolean(numba.int64[:]), # type: ignore[misc]
nopython=True,
nogil=True,
parallel=False,
)
def is_monotonic_increasing(bounds: np.ndarray) -> bool:
"""Check if int64 values are monotonically increasing."""
n = len(bounds)
if n < 2:
return True
prev = bounds[0]
for i in range(1, n):
cur = bounds[i]
if cur < prev:
return False
prev = cur
return True
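Because of the eager signature, the compiled function only accepts 1-D int64 arrays; a quick check (assumes numba is installed):

import numpy as np

print(is_monotonic_increasing(np.array([0, 2, 2, 5], dtype=np.int64)))  # True (ties allowed)
print(is_monotonic_increasing(np.array([3, 1], dtype=np.int64)))        # False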


@ -0,0 +1,244 @@
"""
Numba 1D sum kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numba
from numba.extending import register_jitable
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
from pandas.core._numba.kernels.shared import is_monotonic_increasing
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_sum(
val: Any,
nobs: int,
sum_x: Any,
compensation: Any,
num_consecutive_same_value: int,
prev_value: Any,
) -> tuple[int, Any, Any, int, Any]:
if not np.isnan(val):
nobs += 1
y = val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val
return nobs, sum_x, compensation, num_consecutive_same_value, prev_value
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_sum(
val: Any, nobs: int, sum_x: Any, compensation: Any
) -> tuple[int, Any, Any]:
if not np.isnan(val):
nobs -= 1
y = -val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
return nobs, sum_x, compensation
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_sum(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
dtype = values.dtype
na_val: object = np.nan
if dtype.kind == "i":
na_val = 0
N = len(start)
nobs = 0
sum_x = 0
compensation_add = 0
compensation_remove = 0
na_pos = []
is_monotonic_increasing_bounds = is_monotonic_increasing(
start
) and is_monotonic_increasing(end)
output = np.empty(N, dtype=result_dtype)
for i in range(N):
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
else:
for j in range(start[i - 1], s):
val = values[j]
nobs, sum_x, compensation_remove = remove_sum(
val, nobs, sum_x, compensation_remove
)
for j in range(end[i - 1], e):
val = values[j]
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
if nobs == 0 == min_periods:
result: object = 0
elif nobs >= min_periods:
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = na_val
if dtype.kind == "i":
na_pos.append(i)
output[i] = result
if not is_monotonic_increasing_bounds:
nobs = 0
sum_x = 0
compensation_remove = 0
return output, na_pos
# Mypy/pyright don't like the fact that the decorator is untyped
@register_jitable # type: ignore[misc]
def grouped_kahan_sum(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
) -> tuple[
np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray
]:
N = len(labels)
nobs_arr = np.zeros(ngroups, dtype=np.int64)
comp_arr = np.zeros(ngroups, dtype=values.dtype)
consecutive_counts = np.zeros(ngroups, dtype=np.int64)
prev_vals = np.zeros(ngroups, dtype=values.dtype)
output = np.zeros(ngroups, dtype=result_dtype)
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
continue
sum_x = output[lab]
nobs = nobs_arr[lab]
compensation_add = comp_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
output[lab] = sum_x
consecutive_counts[lab] = num_consecutive_same_value
prev_vals[lab] = prev_value
comp_arr[lab] = compensation_add
nobs_arr[lab] = nobs
return output, nobs_arr, comp_arr, consecutive_counts, prev_vals
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_sum(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
na_pos = []
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
)
# Post-processing, replace sums that don't satisfy min_periods
for lab in range(ngroups):
nobs = nobs_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
sum_x = output[lab]
if nobs >= min_periods:
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = sum_x # Don't change val, will be replaced by nan later
na_pos.append(lab)
output[lab] = result
return output, na_pos
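For intuition, a plain-Python reference of the per-label bookkeeping used above (no numba, no Kahan compensation; illustration only):

import numpy as np

def grouped_sum_reference(values, labels, ngroups, min_periods):
    # Per-group running totals and observation counts, keyed by label.
    sums = np.zeros(ngroups)
    nobs = np.zeros(ngroups, dtype=np.int64)
    for val, lab in zip(values, labels):
        if lab < 0 or np.isnan(val):  # label -1 means "not in any group"
            continue
        sums[lab] += val
        nobs[lab] += 1
    na_pos = [lab for lab in range(ngroups) if nobs[lab] < min_periods]
    return sums, na_pos

vals = np.array([1.0, 2.0, np.nan, 4.0])
labels = np.array([0, 0, 1, 1])
print(grouped_sum_reference(vals, labels, ngroups=2, min_periods=2))
# (array([3., 4.]), [1])  -- group 1 has only one valid observation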


@ -0,0 +1,245 @@
"""
Numba 1D var kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
from pandas.core._numba.kernels.shared import is_monotonic_increasing
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_var(
val: float,
nobs: int,
mean_x: float,
ssqdm_x: float,
compensation: float,
num_consecutive_same_value: int,
prev_value: float,
) -> tuple[int, float, float, float, int, float]:
if not np.isnan(val):
if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val
nobs += 1
prev_mean = mean_x - compensation
y = val - compensation
t = y - mean_x
compensation = t + mean_x - y
delta = t
if nobs:
mean_x += delta / nobs
else:
mean_x = 0
ssqdm_x += (val - prev_mean) * (val - mean_x)
return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_var(
val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
) -> tuple[int, float, float, float]:
if not np.isnan(val):
nobs -= 1
if nobs:
prev_mean = mean_x - compensation
y = val - compensation
t = y - mean_x
compensation = t + mean_x - y
delta = t
mean_x -= delta / nobs
ssqdm_x -= (val - prev_mean) * (val - mean_x)
else:
mean_x = 0
ssqdm_x = 0
return nobs, mean_x, ssqdm_x, compensation
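add_var/remove_var are a Welford-style online update with a Kahan correction on the running mean; a compact, uncompensated reference implementation for comparison (illustration only):

import numpy as np

def online_var_reference(values, ddof=1):
    # Plain Welford update: single pass, no compensation term.
    nobs, mean, ssqdm = 0, 0.0, 0.0
    for val in values:
        if np.isnan(val):
            continue
        nobs += 1
        delta = val - mean
        mean += delta / nobs
        ssqdm += delta * (val - mean)  # uses old and new mean, like add_var
    return ssqdm / (nobs - ddof) if nobs > ddof else np.nan

data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
print(online_var_reference(data))  # 4.571428...
print(np.var(data, ddof=1))        # matches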
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_var(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
N = len(start)
nobs = 0
mean_x = 0.0
ssqdm_x = 0.0
compensation_add = 0.0
compensation_remove = 0.0
min_periods = max(min_periods, 1)
is_monotonic_increasing_bounds = is_monotonic_increasing(
start
) and is_monotonic_increasing(end)
output = np.empty(N, dtype=result_dtype)
for i in range(N):
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
else:
for j in range(start[i - 1], s):
val = values[j]
nobs, mean_x, ssqdm_x, compensation_remove = remove_var(
val, nobs, mean_x, ssqdm_x, compensation_remove
)
for j in range(end[i - 1], e):
val = values[j]
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
if nobs >= min_periods and nobs > ddof:
if nobs == 1 or num_consecutive_same_value >= nobs:
result = 0.0
else:
result = ssqdm_x / (nobs - ddof)
else:
result = np.nan
output[i] = result
if not is_monotonic_increasing_bounds:
nobs = 0
mean_x = 0.0
ssqdm_x = 0.0
compensation_remove = 0.0
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_var(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs_arr = np.zeros(ngroups, dtype=np.int64)
comp_arr = np.zeros(ngroups, dtype=values.dtype)
consecutive_counts = np.zeros(ngroups, dtype=np.int64)
prev_vals = np.zeros(ngroups, dtype=values.dtype)
output = np.zeros(ngroups, dtype=result_dtype)
means = np.zeros(ngroups, dtype=result_dtype)
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
continue
mean_x = means[lab]
ssqdm_x = output[lab]
nobs = nobs_arr[lab]
compensation_add = comp_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
output[lab] = ssqdm_x
means[lab] = mean_x
consecutive_counts[lab] = num_consecutive_same_value
prev_vals[lab] = prev_value
comp_arr[lab] = compensation_add
nobs_arr[lab] = nobs
# Post-processing, replace vars that don't satisfy min_periods
for lab in range(ngroups):
nobs = nobs_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
ssqdm_x = output[lab]
if nobs >= min_periods and nobs > ddof:
if nobs == 1 or num_consecutive_same_value >= nobs:
result = 0.0
else:
result = ssqdm_x / (nobs - ddof)
else:
result = np.nan
output[lab] = result
# Second pass to get the std.dev
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos


@ -0,0 +1,340 @@
"""
accessor.py contains base classes for implementing accessor properties
that can be mixed into or pinned onto other pandas classes.
"""
from __future__ import annotations
from typing import (
Callable,
final,
)
import warnings
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
class DirNamesMixin:
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset()
@final
def _dir_deletions(self) -> set[str]:
"""
Delete unwanted __dir__ for this object.
"""
return self._accessors | self._hidden_attrs
def _dir_additions(self) -> set[str]:
"""
Add additional __dir__ for this object.
"""
return {accessor for accessor in self._accessors if hasattr(self, accessor)}
def __dir__(self) -> list[str]:
"""
Provide method name lookup and completion.
Notes
-----
Only provide 'public' methods.
"""
rv = set(super().__dir__())
rv = (rv - self._dir_deletions()) | self._dir_additions()
return sorted(rv)
class PandasDelegate:
"""
Abstract base class for delegating methods/properties.
"""
def _delegate_property_get(self, name: str, *args, **kwargs):
raise TypeError(f"You cannot access the property {name}")
def _delegate_property_set(self, name: str, value, *args, **kwargs):
raise TypeError(f"The property {name} cannot be set")
def _delegate_method(self, name: str, *args, **kwargs):
raise TypeError(f"You cannot call method {name}")
@classmethod
def _add_delegate_accessors(
cls,
delegate,
accessors: list[str],
typ: str,
overwrite: bool = False,
accessor_mapping: Callable[[str], str] = lambda x: x,
raise_on_missing: bool = True,
) -> None:
"""
Add accessors to cls from the delegate class.
Parameters
----------
cls
Class to add the methods/properties to.
delegate
Class to get methods/properties and doc-strings.
accessors : list of str
List of accessors to add.
typ : {'property', 'method'}
overwrite : bool, default False
Overwrite the method/property in the target class if it exists.
accessor_mapping: Callable, default lambda x: x
Callable to map the delegate's function to the cls' function.
raise_on_missing: bool, default True
Raise if an accessor does not exist on delegate.
False skips the missing accessor.
"""
def _create_delegator_property(name: str):
def _getter(self):
return self._delegate_property_get(name)
def _setter(self, new_values):
return self._delegate_property_set(name, new_values)
_getter.__name__ = name
_setter.__name__ = name
return property(
fget=_getter,
fset=_setter,
doc=getattr(delegate, accessor_mapping(name)).__doc__,
)
def _create_delegator_method(name: str):
def f(self, *args, **kwargs):
return self._delegate_method(name, *args, **kwargs)
f.__name__ = name
f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__
return f
for name in accessors:
if (
not raise_on_missing
and getattr(delegate, accessor_mapping(name), None) is None
):
continue
if typ == "property":
f = _create_delegator_property(name)
else:
f = _create_delegator_method(name)
# don't overwrite existing methods/properties
if overwrite or not hasattr(cls, name):
setattr(cls, name, f)
def delegate_names(
delegate,
accessors: list[str],
typ: str,
overwrite: bool = False,
accessor_mapping: Callable[[str], str] = lambda x: x,
raise_on_missing: bool = True,
):
"""
Add delegated names to a class using a class decorator. This provides
an alternative usage to directly calling `_add_delegate_accessors`
below a class definition.
Parameters
----------
delegate : object
The class to get methods/properties & doc-strings.
accessors : Sequence[str]
List of accessors to add.
typ : {'property', 'method'}
overwrite : bool, default False
Overwrite the method/property in the target class if it exists.
accessor_mapping: Callable, default lambda x: x
Callable to map the delegate's function to the cls' function.
raise_on_missing: bool, default True
Raise if an accessor does not exist on delegate.
False skips the missing accessor.
Returns
-------
callable
A class decorator.
Examples
--------
@delegate_names(Categorical, ["categories", "ordered"], "property")
class CategoricalAccessor(PandasDelegate):
[...]
"""
def add_delegate_accessors(cls):
cls._add_delegate_accessors(
delegate,
accessors,
typ,
overwrite=overwrite,
accessor_mapping=accessor_mapping,
raise_on_missing=raise_on_missing,
)
return cls
return add_delegate_accessors
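A self-contained sketch of the delegation pattern these helpers enable (the Engine/EngineAccessor classes are hypothetical, not pandas code; it assumes PandasDelegate and delegate_names are importable from this module at the usual pandas path):

from pandas.core.accessor import PandasDelegate, delegate_names

class Engine:
    """Object that actually owns the implementation."""

    def __init__(self, power: int) -> None:
        self._power = power

    @property
    def power(self) -> int:
        """Engine power."""
        return self._power

    def start(self) -> str:
        """Start the engine."""
        return f"vroom x{self._power}"

@delegate_names(Engine, ["power"], typ="property")
@delegate_names(Engine, ["start"], typ="method")
class EngineAccessor(PandasDelegate):
    def __init__(self, engine: Engine) -> None:
        self._engine = engine

    def _delegate_property_get(self, name, *args, **kwargs):
        return getattr(self._engine, name)

    def _delegate_method(self, name, *args, **kwargs):
        return getattr(self._engine, name)(*args, **kwargs)

acc = EngineAccessor(Engine(3))
print(acc.power)    # 3           -- routed through _delegate_property_get
print(acc.start())  # "vroom x3"  -- routed through _delegate_method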
# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE
# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py
# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors
# 2. We use a UserWarning instead of a custom Warning
class CachedAccessor:
"""
Custom property-like object.
A descriptor for caching accessors.
Parameters
----------
name : str
Namespace that will be accessed under, e.g. ``df.foo``.
accessor : cls
Class with the extension methods.
Notes
-----
The accessor class's ``__init__`` method should expect one of
``Series``, ``DataFrame`` or ``Index`` as its
single argument ``data``.
"""
def __init__(self, name: str, accessor) -> None:
self._name = name
self._accessor = accessor
def __get__(self, obj, cls):
if obj is None:
# we're accessing the attribute of the class, i.e., Dataset.geo
return self._accessor
accessor_obj = self._accessor(obj)
# Replace the property with the accessor object. Inspired by:
# https://www.pydanny.com/cached-property.html
# We need to use object.__setattr__ because we overwrite __setattr__ on
# NDFrame
object.__setattr__(obj, self._name, accessor_obj)
return accessor_obj
@doc(klass="", others="")
def _register_accessor(name: str, cls):
"""
Register a custom accessor on {klass} objects.
Parameters
----------
name : str
Name under which the accessor should be registered. A warning is issued
if this name conflicts with a preexisting attribute.
Returns
-------
callable
A class decorator.
See Also
--------
register_dataframe_accessor : Register a custom accessor on DataFrame objects.
register_series_accessor : Register a custom accessor on Series objects.
register_index_accessor : Register a custom accessor on Index objects.
Notes
-----
When accessed, your accessor will be initialized with the pandas object
the user is interacting with. So the signature must be
.. code-block:: python
def __init__(self, pandas_object): # noqa: E999
...
For consistency with pandas methods, you should raise an ``AttributeError``
if the data passed to your accessor has an incorrect dtype.
>>> pd.Series(['a', 'b']).dt
Traceback (most recent call last):
...
AttributeError: Can only use .dt accessor with datetimelike values
Examples
--------
In your library code::
import pandas as pd
@pd.api.extensions.register_dataframe_accessor("geo")
class GeoAccessor:
def __init__(self, pandas_obj):
self._obj = pandas_obj
@property
def center(self):
# return the geographic center point of this DataFrame
lat = self._obj.latitude
lon = self._obj.longitude
return (float(lon.mean()), float(lat.mean()))
def plot(self):
# plot this array's data on a map, e.g., using Cartopy
pass
Back in an interactive IPython session:
.. code-block:: ipython
In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
...: "latitude": np.linspace(0, 20)}})
In [2]: ds.geo.center
Out[2]: (5.0, 10.0)
In [3]: ds.geo.plot() # plots data on a map
"""
def decorator(accessor):
if hasattr(cls, name):
warnings.warn(
f"registration of accessor {repr(accessor)} under name "
f"{repr(name)} for type {repr(cls)} is overriding a preexisting "
f"attribute with the same name.",
UserWarning,
stacklevel=find_stack_level(),
)
setattr(cls, name, CachedAccessor(name, accessor))
cls._accessors.add(name)
return accessor
return decorator
@doc(_register_accessor, klass="DataFrame")
def register_dataframe_accessor(name: str):
from pandas import DataFrame
return _register_accessor(name, DataFrame)
@doc(_register_accessor, klass="Series")
def register_series_accessor(name: str):
from pandas import Series
return _register_accessor(name, Series)
@doc(_register_accessor, klass="Index")
def register_index_accessor(name: str):
from pandas import Index
return _register_accessor(name, Index)

File diff suppressed because it is too large.


@ -0,0 +1,140 @@
from pandas._libs import (
NaT,
Period,
Timedelta,
Timestamp,
)
from pandas._libs.missing import NA
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
DatetimeTZDtype,
IntervalDtype,
PeriodDtype,
)
from pandas.core.dtypes.missing import (
isna,
isnull,
notna,
notnull,
)
from pandas.core.algorithms import (
factorize,
unique,
value_counts,
)
from pandas.core.arrays import Categorical
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
from pandas.core.arrays.string_ import StringDtype
from pandas.core.construction import array
from pandas.core.flags import Flags
from pandas.core.groupby import (
Grouper,
NamedAgg,
)
from pandas.core.indexes.api import (
CategoricalIndex,
DatetimeIndex,
Index,
IntervalIndex,
MultiIndex,
PeriodIndex,
RangeIndex,
TimedeltaIndex,
)
from pandas.core.indexes.datetimes import (
bdate_range,
date_range,
)
from pandas.core.indexes.interval import (
Interval,
interval_range,
)
from pandas.core.indexes.period import period_range
from pandas.core.indexes.timedeltas import timedelta_range
from pandas.core.indexing import IndexSlice
from pandas.core.series import Series
from pandas.core.tools.datetimes import to_datetime
from pandas.core.tools.numeric import to_numeric
from pandas.core.tools.timedeltas import to_timedelta
from pandas.io.formats.format import set_eng_float_format
from pandas.tseries.offsets import DateOffset
# DataFrame needs to be imported after NamedAgg to avoid a circular import
from pandas.core.frame import DataFrame # isort:skip
__all__ = [
"array",
"ArrowDtype",
"bdate_range",
"BooleanDtype",
"Categorical",
"CategoricalDtype",
"CategoricalIndex",
"DataFrame",
"DateOffset",
"date_range",
"DatetimeIndex",
"DatetimeTZDtype",
"factorize",
"Flags",
"Float32Dtype",
"Float64Dtype",
"Grouper",
"Index",
"IndexSlice",
"Int16Dtype",
"Int32Dtype",
"Int64Dtype",
"Int8Dtype",
"Interval",
"IntervalDtype",
"IntervalIndex",
"interval_range",
"isna",
"isnull",
"MultiIndex",
"NA",
"NamedAgg",
"NaT",
"notna",
"notnull",
"Period",
"PeriodDtype",
"PeriodIndex",
"period_range",
"RangeIndex",
"Series",
"set_eng_float_format",
"StringDtype",
"Timedelta",
"TimedeltaIndex",
"timedelta_range",
"Timestamp",
"to_datetime",
"to_numeric",
"to_timedelta",
"UInt16Dtype",
"UInt32Dtype",
"UInt64Dtype",
"UInt8Dtype",
"unique",
"value_counts",
]

File diff suppressed because it is too large.


@ -0,0 +1,9 @@
"""
core.array_algos is for algorithms that operate on ndarray and ExtensionArray.
These should:
- Assume that any Index, Series, or DataFrame objects have already been unwrapped.
- Assume that any list arguments have already been cast to ndarray/EA.
- Not depend on Index, Series, or DataFrame, nor import any of these.
- May dispatch to ExtensionArray methods, but should not import from core.arrays.
"""


@ -0,0 +1,67 @@
"""
datetimelke_accumulations.py is for accumulations of datetimelike extension arrays
"""
from __future__ import annotations
from typing import Callable
import numpy as np
from pandas._libs import iNaT
from pandas.core.dtypes.missing import isna
def _cum_func(
func: Callable,
values: np.ndarray,
*,
skipna: bool = True,
):
"""
Accumulations for 1D datetimelike arrays.
Parameters
----------
func : np.cumsum, np.maximum.accumulate, np.minimum.accumulate
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation). The values are modified in place.
skipna : bool, default True
Whether to skip NA.
"""
try:
fill_value = {
np.maximum.accumulate: np.iinfo(np.int64).min,
np.cumsum: 0,
np.minimum.accumulate: np.iinfo(np.int64).max,
}[func]
except KeyError:
raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray")
mask = isna(values)
y = values.view("i8")
y[mask] = fill_value
if not skipna:
mask = np.maximum.accumulate(mask)
result = func(y)
result[mask] = iNaT
if values.dtype.kind in "mM":
return result.view(values.dtype.base)
return result
def cumsum(values: np.ndarray, *, skipna: bool = True) -> np.ndarray:
return _cum_func(np.cumsum, values, skipna=skipna)
def cummin(values: np.ndarray, *, skipna: bool = True):
return _cum_func(np.minimum.accumulate, values, skipna=skipna)
def cummax(values: np.ndarray, *, skipna: bool = True):
return _cum_func(np.maximum.accumulate, values, skipna=skipna)
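A hedged usage sketch (it assumes the module lands at its usual pandas path; these helpers operate on the i8 view of datetime64/timedelta64 data, with NaT as the missing value, and modify the input in place, hence the copy):

import numpy as np

from pandas.core.array_algos.datetimelike_accumulations import cummax

arr = np.array(["2024-01-01", "NaT", "2024-01-03"], dtype="datetime64[ns]")
print(cummax(arr.copy(), skipna=True))
# ['2024-01-01' 'NaT' '2024-01-03']  -- NaT stays missing, the running max carries on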


@ -0,0 +1,90 @@
"""
masked_accumulations.py is for accumulation algorithms using a mask-based approach
for missing values.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Callable,
)
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
def _cum_func(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
):
"""
Accumulations for 1D masked array.
We will modify values in place to replace NAs with the appropriate fill value.
Parameters
----------
func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
mask : np.ndarray
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
"""
dtype_info: np.iinfo | np.finfo
if values.dtype.kind == "f":
dtype_info = np.finfo(values.dtype.type)
elif values.dtype.kind in "iu":
dtype_info = np.iinfo(values.dtype.type)
elif values.dtype.kind == "b":
# Max value of bool is 1, but since we are setting into a boolean
# array, 255 is fine as well. Min value has to be 0 when setting
# into the boolean array.
dtype_info = np.iinfo(np.uint8)
else:
raise NotImplementedError(
f"No masked accumulation defined for dtype {values.dtype.type}"
)
try:
fill_value = {
np.cumprod: 1,
np.maximum.accumulate: dtype_info.min,
np.cumsum: 0,
np.minimum.accumulate: dtype_info.max,
}[func]
except KeyError:
raise NotImplementedError(
f"No accumulation for {func} implemented on BaseMaskedArray"
)
values[mask] = fill_value
if not skipna:
mask = np.maximum.accumulate(mask)
values = func(values)
return values, mask
def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.cumsum, values, mask, skipna=skipna)
def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.cumprod, values, mask, skipna=skipna)
def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna)
def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna)
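A hedged usage sketch (it assumes the usual pandas path; True in the mask marks a missing value, and values is modified in place, hence the copies):

import numpy as np

from pandas.core.array_algos.masked_accumulations import cumsum

values = np.array([1, 2, 3, 4], dtype="int64")
mask = np.array([False, True, False, False])
result, result_mask = cumsum(values.copy(), mask.copy(), skipna=True)
print(result)       # [1 1 4 8]  -- the masked slot was filled with 0 before summing
print(result_mask)  # [False  True False False]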


@ -0,0 +1,201 @@
"""
masked_reductions.py is for reduction algorithms using a mask-based approach
for missing values.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Callable,
)
import warnings
import numpy as np
from pandas._libs import missing as libmissing
from pandas.core.nanops import check_below_min_count
if TYPE_CHECKING:
from pandas._typing import (
AxisInt,
npt,
)
def _reductions(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
min_count: int = 0,
axis: AxisInt | None = None,
**kwargs,
):
"""
Sum, mean or product for 1D masked array.
Parameters
----------
func : np.sum or np.prod
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
mask : np.ndarray[bool]
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
min_count : int, default 0
The required number of valid values to perform the operation. If fewer than
``min_count`` non-NA values are present the result will be NA.
axis : int, optional, default None
"""
if not skipna:
if mask.any() or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
return func(values, axis=axis, **kwargs)
else:
if check_below_min_count(values.shape, mask, min_count) and (
axis is None or values.ndim == 1
):
return libmissing.NA
if values.dtype == np.dtype(object):
# object dtype does not support `where` without passing an initial
values = values[~mask]
return func(values, axis=axis, **kwargs)
return func(values, where=~mask, axis=axis, **kwargs)
def sum(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
min_count: int = 0,
axis: AxisInt | None = None,
):
return _reductions(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)
def prod(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
min_count: int = 0,
axis: AxisInt | None = None,
):
return _reductions(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)
def _minmax(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
"""
Reduction for 1D masked array.
Parameters
----------
func : np.min or np.max
values : np.ndarray
        Numpy array with the values (can be of any dtype that supports the
        operation).
mask : np.ndarray[bool]
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
axis : int, optional, default None
"""
if not skipna:
if mask.any() or not values.size:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
else:
return func(values, axis=axis)
else:
subset = values[~mask]
if subset.size:
return func(subset, axis=axis)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
def min(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)
def max(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)
def mean(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
if not values.size or mask.all():
return libmissing.NA
return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis)
def var(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
ddof: int = 1,
):
if not values.size or mask.all():
return libmissing.NA
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
return _reductions(
np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
)
def std(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
ddof: int = 1,
):
if not values.size or mask.all():
return libmissing.NA
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
return _reductions(
np.std, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
)
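# --- Standalone usage sketch (not part of this module) ----------------------
# Illustrates the skipna strategy used by _reductions above: instead of
# copying the data, the inverted mask is passed through numpy's ``where``
# keyword so missing positions are simply excluded from the reduction.
if __name__ == "__main__":
    import numpy as np

    vals = np.array([1.0, 2.0, 3.0, 4.0])
    na_mask = np.array([False, True, False, False])  # True marks a missing value

    total = np.sum(vals, where=~na_mask)   # 8.0 -- the masked 2.0 is skipped
    assert total == 8.0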

View File

@ -0,0 +1,149 @@
"""
EA-compatible analogue to np.putmask
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._libs import lib
from pandas.core.dtypes.cast import infer_dtype_from
from pandas.core.dtypes.common import is_list_like
from pandas.core.arrays import ExtensionArray
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
npt,
)
from pandas import MultiIndex
def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None:
"""
ExtensionArray-compatible implementation of np.putmask. The main
difference is we do not handle repeating or truncating like numpy.
Parameters
----------
values: np.ndarray or ExtensionArray
mask : np.ndarray[bool]
We assume extract_bool_array has already been called.
value : Any
"""
if (
not isinstance(values, np.ndarray)
or (values.dtype == object and not lib.is_scalar(value))
# GH#43424: np.putmask raises TypeError if we cannot cast between types with
# rule = "safe", a stricter guarantee we may not have here
or (
isinstance(value, np.ndarray) and not np.can_cast(value.dtype, values.dtype)
)
):
# GH#19266 using np.putmask gives unexpected results with listlike value
# along with object dtype
if is_list_like(value) and len(value) == len(values):
values[mask] = value[mask]
else:
values[mask] = value
else:
# GH#37833 np.putmask is more performant than __setitem__
np.putmask(values, mask, value)
def putmask_without_repeat(
values: np.ndarray, mask: npt.NDArray[np.bool_], new: Any
) -> None:
"""
np.putmask will truncate or repeat if `new` is a listlike with
len(new) != len(values). We require an exact match.
Parameters
----------
values : np.ndarray
mask : np.ndarray[bool]
new : Any
"""
if getattr(new, "ndim", 0) >= 1:
new = new.astype(values.dtype, copy=False)
# TODO: this prob needs some better checking for 2D cases
nlocs = mask.sum()
if nlocs > 0 and is_list_like(new) and getattr(new, "ndim", 1) == 1:
shape = np.shape(new)
        # use np.shape for compat in case setitem_datetimelike_compat
        # changed the arraylike to a list, e.g. test_where_dt64_2d
if nlocs == shape[-1]:
# GH#30567
# If length of ``new`` is less than the length of ``values``,
# `np.putmask` would first repeat the ``new`` array and then
# assign the masked values hence produces incorrect result.
            # `np.place` on the other hand uses the ``new`` values as they are,
            # placing them in the masked locations of ``values``
np.place(values, mask, new)
# i.e. values[mask] = new
elif mask.shape[-1] == shape[-1] or shape[-1] == 1:
np.putmask(values, mask, new)
else:
raise ValueError("cannot assign mismatch length to masked array")
else:
np.putmask(values, mask, new)
def validate_putmask(
values: ArrayLike | MultiIndex, mask: np.ndarray
) -> tuple[npt.NDArray[np.bool_], bool]:
"""
Validate mask and check if this putmask operation is a no-op.
"""
mask = extract_bool_array(mask)
if mask.shape != values.shape:
raise ValueError("putmask: mask and data must be the same size")
noop = not mask.any()
return mask, noop
def extract_bool_array(mask: ArrayLike) -> npt.NDArray[np.bool_]:
"""
If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
"""
if isinstance(mask, ExtensionArray):
# We could have BooleanArray, Sparse[bool], ...
# Except for BooleanArray, this is equivalent to just
# np.asarray(mask, dtype=bool)
mask = mask.to_numpy(dtype=bool, na_value=False)
mask = np.asarray(mask, dtype=bool)
return mask
def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other):
"""
Parameters
----------
values : np.ndarray
num_set : int
For putmask, this is mask.sum()
other : Any
"""
if values.dtype == object:
dtype, _ = infer_dtype_from(other)
if lib.is_np_dtype(dtype, "mM"):
# https://github.com/numpy/numpy/issues/12550
# timedelta64 will incorrectly cast to int
if not is_list_like(other):
other = [other] * num_set
else:
other = list(other)
return other
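# --- Standalone usage sketch (not part of this module) ----------------------
# Illustrates the pitfall that putmask_without_repeat guards against (GH#30567):
# np.putmask cycles a too-short replacement array, whereas np.place consumes
# the replacement values positionally.
if __name__ == "__main__":
    import numpy as np

    base = np.arange(5)
    m = base > 1                      # [False, False, True, True, True]

    a = base.copy()
    np.putmask(a, m, [-33, -44])      # cycles by flat index -> [0, 1, -33, -44, -33]

    b = base.copy()
    np.place(b, m, [-33, -44, -55])   # exact-length values used in order

    assert a.tolist() == [0, 1, -33, -44, -33]
    assert b.tolist() == [0, 1, -33, -44, -55]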

View File

@ -0,0 +1,226 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas.core.dtypes.missing import (
isna,
na_value_for_dtype,
)
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
Scalar,
npt,
)
def quantile_compat(
values: ArrayLike, qs: npt.NDArray[np.float64], interpolation: str
) -> ArrayLike:
"""
Compute the quantiles of the given values for each quantile in `qs`.
Parameters
----------
values : np.ndarray or ExtensionArray
qs : np.ndarray[float64]
interpolation : str
Returns
-------
np.ndarray or ExtensionArray
"""
if isinstance(values, np.ndarray):
fill_value = na_value_for_dtype(values.dtype, compat=False)
mask = isna(values)
return quantile_with_mask(values, mask, fill_value, qs, interpolation)
else:
return values._quantile(qs, interpolation)
def quantile_with_mask(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
fill_value,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> np.ndarray:
"""
Compute the quantiles of the given values for each quantile in `qs`.
Parameters
----------
values : np.ndarray
For ExtensionArray, this is _values_for_factorize()[0]
mask : np.ndarray[bool]
mask = isna(values)
For ExtensionArray, this is computed before calling _value_for_factorize
fill_value : Scalar
        The value used to fill NA entries.
For ExtensionArray, this is _values_for_factorize()[1]
qs : np.ndarray[float64]
interpolation : str
Type of interpolation
Returns
-------
np.ndarray
Notes
-----
Assumes values is already 2D. For ExtensionArray this means np.atleast_2d
has been called on _values_for_factorize()[0]
Quantile is computed along axis=1.
"""
assert values.shape == mask.shape
if values.ndim == 1:
# unsqueeze, operate, re-squeeze
values = np.atleast_2d(values)
mask = np.atleast_2d(mask)
res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
return res_values[0]
assert values.ndim == 2
is_empty = values.shape[1] == 0
if is_empty:
# create the array of na_values
# 2d len(values) * len(qs)
flat = np.array([fill_value] * len(qs))
result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
else:
result = _nanpercentile(
values,
qs * 100.0,
na_value=fill_value,
mask=mask,
interpolation=interpolation,
)
result = np.asarray(result)
result = result.T
return result
def _nanpercentile_1d(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
qs: npt.NDArray[np.float64],
na_value: Scalar,
interpolation: str,
) -> Scalar | np.ndarray:
"""
Wrapper for np.percentile that skips missing values, specialized to
1-dimensional case.
Parameters
----------
values : array over which to find quantiles
mask : ndarray[bool]
locations in values that should be considered missing
qs : np.ndarray[float64] of quantile indices to find
na_value : scalar
value to return for empty or all-null values
interpolation : str
Returns
-------
quantiles : scalar or array
"""
# mask is Union[ExtensionArray, ndarray]
values = values[~mask]
if len(values) == 0:
# Can't pass dtype=values.dtype here bc we might have na_value=np.nan
# with values.dtype=int64 see test_quantile_empty
# equiv: 'np.array([na_value] * len(qs))' but much faster
return np.full(len(qs), na_value)
return np.percentile(
values,
qs,
# error: No overload variant of "percentile" matches argument
# types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]"
# , "Dict[str, str]" [call-overload]
method=interpolation, # type: ignore[call-overload]
)
def _nanpercentile(
values: np.ndarray,
qs: npt.NDArray[np.float64],
*,
na_value,
mask: npt.NDArray[np.bool_],
interpolation: str,
):
"""
Wrapper for np.percentile that skips missing values.
Parameters
----------
values : np.ndarray[ndim=2] over which to find quantiles
qs : np.ndarray[float64] of quantile indices to find
na_value : scalar
value to return for empty or all-null values
mask : np.ndarray[bool]
locations in values that should be considered missing
interpolation : str
Returns
-------
quantiles : scalar or array
"""
if values.dtype.kind in "mM":
# need to cast to integer to avoid rounding errors in numpy
result = _nanpercentile(
values.view("i8"),
qs=qs,
na_value=na_value.view("i8"),
mask=mask,
interpolation=interpolation,
)
# Note: we have to do `astype` and not view because in general we
# have float result at this point, not i8
return result.astype(values.dtype)
if mask.any():
# Caller is responsible for ensuring mask shape match
assert mask.shape == values.shape
result = [
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
for (val, m) in zip(list(values), list(mask))
]
if values.dtype.kind == "f":
# preserve itemsize
result = np.asarray(result, dtype=values.dtype).T
else:
result = np.asarray(result).T
if (
result.dtype != values.dtype
and not mask.all()
and (result == result.astype(values.dtype, copy=False)).all()
):
# mask.all() will never get cast back to int
            # e.g. values is integer dtype and result is floating dtype,
# only cast back to integer dtype if result values are all-integer.
result = result.astype(values.dtype, copy=False)
return result
else:
return np.percentile(
values,
qs,
axis=1,
# error: No overload variant of "percentile" matches argument types
# "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]",
# "int", "Dict[str, str]" [call-overload]
method=interpolation, # type: ignore[call-overload]
)
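# --- Standalone usage sketch (not part of this module) ----------------------
# Illustrates the row-wise masked quantile approach used by _nanpercentile
# above: drop the masked entries of each row before calling np.percentile,
# and fall back to a fill value when nothing valid is left. Assumes
# numpy >= 1.22 for the ``method`` keyword.
if __name__ == "__main__":
    import numpy as np

    vals = np.array([[1.0, 2.0, 3.0], [np.nan, np.nan, np.nan]])
    na_mask = np.isnan(vals)
    qs = np.array([0.25, 0.5, 0.75])

    rows = []
    for row, row_mask in zip(vals, na_mask):
        valid = row[~row_mask]
        if valid.size:
            rows.append(np.percentile(valid, qs * 100.0, method="linear"))
        else:
            rows.append(np.full(len(qs), np.nan))  # all-NA row -> all fill values
    res = np.asarray(rows)            # shape (2, 3): quantiles per row
    assert np.allclose(res[0], [1.5, 2.0, 2.5])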

View File

@ -0,0 +1,155 @@
"""
Methods used by Block.replace and related methods.
"""
from __future__ import annotations
import operator
import re
from re import Pattern
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas.core.dtypes.common import (
is_bool,
is_re,
is_re_compilable,
)
from pandas.core.dtypes.missing import isna
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
Scalar,
npt,
)
def should_use_regex(regex: bool, to_replace: Any) -> bool:
"""
Decide whether to treat `to_replace` as a regular expression.
"""
if is_re(to_replace):
regex = True
regex = regex and is_re_compilable(to_replace)
# Don't use regex if the pattern is empty.
regex = regex and re.compile(to_replace).pattern != ""
return regex
def compare_or_regex_search(
a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_]
) -> ArrayLike:
"""
Compare two array-like inputs of the same shape or two scalar values
Calls operator.eq or re.search, depending on regex argument. If regex is
True, perform an element-wise regex matching.
Parameters
----------
a : array-like
b : scalar or regex pattern
regex : bool
mask : np.ndarray[bool]
Returns
-------
mask : array-like of bool
"""
if isna(b):
return ~mask
def _check_comparison_types(
result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern
):
"""
Raises an error if the two arrays (a,b) cannot be compared.
Otherwise, returns the comparison result as expected.
"""
if is_bool(result) and isinstance(a, np.ndarray):
type_names = [type(a).__name__, type(b).__name__]
type_names[0] = f"ndarray(dtype={a.dtype})"
raise TypeError(
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
)
if not regex or not should_use_regex(regex, b):
# TODO: should use missing.mask_missing?
op = lambda x: operator.eq(x, b)
else:
op = np.vectorize(
lambda x: bool(re.search(b, x))
if isinstance(x, str) and isinstance(b, (str, Pattern))
else False
)
# GH#32621 use mask to avoid comparing to NAs
if isinstance(a, np.ndarray) and mask is not None:
a = a[mask]
result = op(a)
if isinstance(result, np.ndarray):
# The shape of the mask can differ to that of the result
# since we may compare only a subset of a's or b's elements
tmp = np.zeros(mask.shape, dtype=np.bool_)
np.place(tmp, mask, result)
result = tmp
else:
result = op(a)
_check_comparison_types(result, a, b)
return result
def replace_regex(
values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None
) -> None:
"""
Parameters
----------
values : ArrayLike
Object dtype.
rx : re.Pattern
value : Any
mask : np.ndarray[bool], optional
Notes
-----
Alters values in-place.
"""
# deal with replacing values with objects (strings) that match but
# whose replacement is not a string (numeric, nan, object)
if isna(value) or not isinstance(value, str):
def re_replacer(s):
if is_re(rx) and isinstance(s, str):
return value if rx.search(s) is not None else s
else:
return s
else:
        # value is guaranteed to be a string here; s can be either a string
        # or null, and if it's null it gets returned unchanged
def re_replacer(s):
if is_re(rx) and isinstance(s, str):
return rx.sub(value, s)
else:
return s
f = np.vectorize(re_replacer, otypes=[np.object_])
if mask is None:
values[:] = f(values)
else:
if values.ndim != mask.ndim:
mask = np.broadcast_to(mask, values.shape)
values[mask] = f(values[mask])
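# --- Standalone usage sketch (not part of this module) ----------------------
# Illustrates the vectorized regex-replacement pattern used by replace_regex
# above: wrap a scalar replacer in np.vectorize (object otype) and assign the
# result back through an optional boolean mask.
if __name__ == "__main__":
    import re

    import numpy as np

    data = np.array(["foo-1", "bar-2", None], dtype=object)
    rx = re.compile(r"-\d+")

    def drop_suffix(s):
        # only strings are rewritten; non-strings (e.g. None) pass through
        return rx.sub("", s) if isinstance(s, str) else s

    repl = np.vectorize(drop_suffix, otypes=[np.object_])
    m = np.array([True, False, True])
    data[m] = repl(data[m])
    assert data.tolist() == ["foo", "bar-2", None]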

View File

@ -0,0 +1,594 @@
from __future__ import annotations
import functools
from typing import (
TYPE_CHECKING,
cast,
overload,
)
import numpy as np
from pandas._libs import (
algos as libalgos,
lib,
)
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
)
from pandas.core.dtypes.missing import na_value_for_dtype
from pandas.core.construction import ensure_wrapped_if_datetimelike
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
AxisInt,
npt,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.base import ExtensionArray
@overload
def take_nd(
arr: np.ndarray,
indexer,
axis: AxisInt = ...,
fill_value=...,
allow_fill: bool = ...,
) -> np.ndarray:
...
@overload
def take_nd(
arr: ExtensionArray,
indexer,
axis: AxisInt = ...,
fill_value=...,
allow_fill: bool = ...,
) -> ArrayLike:
...
def take_nd(
arr: ArrayLike,
indexer,
axis: AxisInt = 0,
fill_value=lib.no_default,
allow_fill: bool = True,
) -> ArrayLike:
"""
Specialized Cython take which sets NaN values in one pass
This dispatches to ``take`` defined on ExtensionArrays.
Note: this function assumes that the indexer is a valid(ated) indexer with
no out of bound indices.
Parameters
----------
arr : np.ndarray or ExtensionArray
Input array.
indexer : ndarray
1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value
axis : int, default 0
Axis to take from
fill_value : any, default np.nan
Fill value to replace -1 values with
allow_fill : bool, default True
If False, indexer is assumed to contain no -1 values so no filling
will be done. This short-circuits computation of a mask. Result is
undefined if allow_fill == False and -1 is present in indexer.
Returns
-------
subarray : np.ndarray or ExtensionArray
May be the same type as the input, or cast to an ndarray.
"""
if fill_value is lib.no_default:
fill_value = na_value_for_dtype(arr.dtype, compat=False)
elif lib.is_np_dtype(arr.dtype, "mM"):
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
if arr.dtype != dtype:
# EA.take is strict about returning a new object of the same type
# so for that case cast upfront
arr = arr.astype(dtype)
if not isinstance(arr, np.ndarray):
        # i.e. ExtensionArray; this branch also covers DatetimeArray
        # and TimedeltaArray, which are handled separately below
if not is_1d_only_ea_dtype(arr.dtype):
# i.e. DatetimeArray, TimedeltaArray
arr = cast("NDArrayBackedExtensionArray", arr)
return arr.take(
indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
)
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
arr = np.asarray(arr)
return _take_nd_ndarray(arr, indexer, axis, fill_value, allow_fill)
def _take_nd_ndarray(
arr: np.ndarray,
indexer: npt.NDArray[np.intp] | None,
axis: AxisInt,
fill_value,
allow_fill: bool,
) -> np.ndarray:
if indexer is None:
indexer = np.arange(arr.shape[axis], dtype=np.intp)
dtype, fill_value = arr.dtype, arr.dtype.type()
else:
indexer = ensure_platform_int(indexer)
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
arr, indexer, fill_value, allow_fill
)
flip_order = False
if arr.ndim == 2 and arr.flags.f_contiguous:
flip_order = True
if flip_order:
arr = arr.T
axis = arr.ndim - axis - 1
# at this point, it's guaranteed that dtype can hold both the arr values
# and the fill_value
out_shape_ = list(arr.shape)
out_shape_[axis] = len(indexer)
out_shape = tuple(out_shape_)
if arr.flags.f_contiguous and axis == arr.ndim - 1:
# minor tweak that can make an order-of-magnitude difference
# for dataframes initialized directly from 2-d ndarrays
# (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its
# f-contiguous transpose)
out = np.empty(out_shape, dtype=dtype, order="F")
else:
out = np.empty(out_shape, dtype=dtype)
func = _get_take_nd_function(
arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
)
func(arr, indexer, out, fill_value)
if flip_order:
out = out.T
return out
def take_1d(
arr: ArrayLike,
indexer: npt.NDArray[np.intp],
fill_value=None,
allow_fill: bool = True,
mask: npt.NDArray[np.bool_] | None = None,
) -> ArrayLike:
"""
Specialized version for 1D arrays. Differences compared to `take_nd`:
- Assumes input array has already been converted to numpy array / EA
- Assumes indexer is already guaranteed to be intp dtype ndarray
- Only works for 1D arrays
To ensure the lowest possible overhead.
Note: similarly to `take_nd`, this function assumes that the indexer is
a valid(ated) indexer with no out of bound indices.
Parameters
----------
arr : np.ndarray or ExtensionArray
Input array.
indexer : ndarray
1-D array of indices to take (validated indices, intp dtype).
fill_value : any, default np.nan
Fill value to replace -1 values with
allow_fill : bool, default True
If False, indexer is assumed to contain no -1 values so no filling
will be done. This short-circuits computation of a mask. Result is
undefined if allow_fill == False and -1 is present in indexer.
mask : np.ndarray, optional, default None
If `allow_fill` is True, and the mask (where indexer == -1) is already
known, it can be passed to avoid recomputation.
"""
if not isinstance(arr, np.ndarray):
# ExtensionArray -> dispatch to their method
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
if not allow_fill:
return arr.take(indexer)
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
arr, indexer, fill_value, True, mask
)
# at this point, it's guaranteed that dtype can hold both the arr values
# and the fill_value
out = np.empty(indexer.shape, dtype=dtype)
func = _get_take_nd_function(
arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info
)
func(arr, indexer, out, fill_value)
return out
def take_2d_multi(
arr: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
fill_value=np.nan,
) -> np.ndarray:
"""
Specialized Cython take which sets NaN values in one pass.
"""
# This is only called from one place in DataFrame._reindex_multi,
# so we know indexer is well-behaved.
assert indexer is not None
assert indexer[0] is not None
assert indexer[1] is not None
row_idx, col_idx = indexer
row_idx = ensure_platform_int(row_idx)
col_idx = ensure_platform_int(col_idx)
indexer = row_idx, col_idx
mask_info = None
# check for promotion based on types only (do this first because
# it's faster than computing a mask)
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
if dtype != arr.dtype:
# check if promotion is actually required based on indexer
row_mask = row_idx == -1
col_mask = col_idx == -1
row_needs = row_mask.any()
col_needs = col_mask.any()
mask_info = (row_mask, col_mask), (row_needs, col_needs)
if not (row_needs or col_needs):
# if not, then depromote, set fill_value to dummy
# (it won't be used but we don't want the cython code
# to crash when trying to cast it to dtype)
dtype, fill_value = arr.dtype, arr.dtype.type()
# at this point, it's guaranteed that dtype can hold both the arr values
# and the fill_value
out_shape = len(row_idx), len(col_idx)
out = np.empty(out_shape, dtype=dtype)
func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
if func is None and arr.dtype != out.dtype:
func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
if func is not None:
func = _convert_wrapper(func, out.dtype)
if func is not None:
func(arr, indexer, out=out, fill_value=fill_value)
else:
# test_reindex_multi
_take_2d_multi_object(
arr, indexer, out, fill_value=fill_value, mask_info=mask_info
)
return out
@functools.lru_cache
def _get_take_nd_function_cached(
ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: AxisInt
):
"""
Part of _get_take_nd_function below that doesn't need `mask_info` and thus
can be cached (mask_info potentially contains a numpy ndarray which is not
hashable and thus cannot be used as argument for cached function).
"""
tup = (arr_dtype.name, out_dtype.name)
if ndim == 1:
func = _take_1d_dict.get(tup, None)
elif ndim == 2:
if axis == 0:
func = _take_2d_axis0_dict.get(tup, None)
else:
func = _take_2d_axis1_dict.get(tup, None)
if func is not None:
return func
# We get here with string, uint, float16, and complex dtypes that could
# potentially be handled in algos_take_helper.
# Also a couple with (M8[ns], object) and (m8[ns], object)
tup = (out_dtype.name, out_dtype.name)
if ndim == 1:
func = _take_1d_dict.get(tup, None)
elif ndim == 2:
if axis == 0:
func = _take_2d_axis0_dict.get(tup, None)
else:
func = _take_2d_axis1_dict.get(tup, None)
if func is not None:
func = _convert_wrapper(func, out_dtype)
return func
return None
def _get_take_nd_function(
ndim: int,
arr_dtype: np.dtype,
out_dtype: np.dtype,
axis: AxisInt = 0,
mask_info=None,
):
"""
Get the appropriate "take" implementation for the given dimension, axis
and dtypes.
"""
func = None
if ndim <= 2:
# for this part we don't need `mask_info` -> use the cached algo lookup
func = _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis)
if func is None:
def func(arr, indexer, out, fill_value=np.nan) -> None:
indexer = ensure_platform_int(indexer)
_take_nd_object(
arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
)
return func
def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
def wrapper(
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
) -> None:
if arr_dtype is not None:
arr = arr.view(arr_dtype)
if out_dtype is not None:
out = out.view(out_dtype)
if fill_wrap is not None:
# FIXME: if we get here with dt64/td64 we need to be sure we have
# matching resos
if fill_value.dtype.kind == "m":
fill_value = fill_value.astype("m8[ns]")
else:
fill_value = fill_value.astype("M8[ns]")
fill_value = fill_wrap(fill_value)
f(arr, indexer, out, fill_value=fill_value)
return wrapper
def _convert_wrapper(f, conv_dtype):
def wrapper(
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
) -> None:
if conv_dtype == object:
# GH#39755 avoid casting dt64/td64 to integers
arr = ensure_wrapped_if_datetimelike(arr)
arr = arr.astype(conv_dtype)
f(arr, indexer, out, fill_value=fill_value)
return wrapper
_take_1d_dict = {
("int8", "int8"): libalgos.take_1d_int8_int8,
("int8", "int32"): libalgos.take_1d_int8_int32,
("int8", "int64"): libalgos.take_1d_int8_int64,
("int8", "float64"): libalgos.take_1d_int8_float64,
("int16", "int16"): libalgos.take_1d_int16_int16,
("int16", "int32"): libalgos.take_1d_int16_int32,
("int16", "int64"): libalgos.take_1d_int16_int64,
("int16", "float64"): libalgos.take_1d_int16_float64,
("int32", "int32"): libalgos.take_1d_int32_int32,
("int32", "int64"): libalgos.take_1d_int32_int64,
("int32", "float64"): libalgos.take_1d_int32_float64,
("int64", "int64"): libalgos.take_1d_int64_int64,
("int64", "float64"): libalgos.take_1d_int64_float64,
("float32", "float32"): libalgos.take_1d_float32_float32,
("float32", "float64"): libalgos.take_1d_float32_float64,
("float64", "float64"): libalgos.take_1d_float64_float64,
("object", "object"): libalgos.take_1d_object_object,
("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8),
("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
),
}
_take_2d_axis0_dict = {
("int8", "int8"): libalgos.take_2d_axis0_int8_int8,
("int8", "int32"): libalgos.take_2d_axis0_int8_int32,
("int8", "int64"): libalgos.take_2d_axis0_int8_int64,
("int8", "float64"): libalgos.take_2d_axis0_int8_float64,
("int16", "int16"): libalgos.take_2d_axis0_int16_int16,
("int16", "int32"): libalgos.take_2d_axis0_int16_int32,
("int16", "int64"): libalgos.take_2d_axis0_int16_int64,
("int16", "float64"): libalgos.take_2d_axis0_int16_float64,
("int32", "int32"): libalgos.take_2d_axis0_int32_int32,
("int32", "int64"): libalgos.take_2d_axis0_int32_int64,
("int32", "float64"): libalgos.take_2d_axis0_int32_float64,
("int64", "int64"): libalgos.take_2d_axis0_int64_int64,
("int64", "float64"): libalgos.take_2d_axis0_int64_float64,
("float32", "float32"): libalgos.take_2d_axis0_float32_float32,
("float32", "float64"): libalgos.take_2d_axis0_float32_float64,
("float64", "float64"): libalgos.take_2d_axis0_float64_float64,
("object", "object"): libalgos.take_2d_axis0_object_object,
("bool", "bool"): _view_wrapper(
libalgos.take_2d_axis0_bool_bool, np.uint8, np.uint8
),
("bool", "object"): _view_wrapper(
libalgos.take_2d_axis0_bool_object, np.uint8, None
),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
}
_take_2d_axis1_dict = {
("int8", "int8"): libalgos.take_2d_axis1_int8_int8,
("int8", "int32"): libalgos.take_2d_axis1_int8_int32,
("int8", "int64"): libalgos.take_2d_axis1_int8_int64,
("int8", "float64"): libalgos.take_2d_axis1_int8_float64,
("int16", "int16"): libalgos.take_2d_axis1_int16_int16,
("int16", "int32"): libalgos.take_2d_axis1_int16_int32,
("int16", "int64"): libalgos.take_2d_axis1_int16_int64,
("int16", "float64"): libalgos.take_2d_axis1_int16_float64,
("int32", "int32"): libalgos.take_2d_axis1_int32_int32,
("int32", "int64"): libalgos.take_2d_axis1_int32_int64,
("int32", "float64"): libalgos.take_2d_axis1_int32_float64,
("int64", "int64"): libalgos.take_2d_axis1_int64_int64,
("int64", "float64"): libalgos.take_2d_axis1_int64_float64,
("float32", "float32"): libalgos.take_2d_axis1_float32_float32,
("float32", "float64"): libalgos.take_2d_axis1_float32_float64,
("float64", "float64"): libalgos.take_2d_axis1_float64_float64,
("object", "object"): libalgos.take_2d_axis1_object_object,
("bool", "bool"): _view_wrapper(
libalgos.take_2d_axis1_bool_bool, np.uint8, np.uint8
),
("bool", "object"): _view_wrapper(
libalgos.take_2d_axis1_bool_object, np.uint8, None
),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
}
_take_2d_multi_dict = {
("int8", "int8"): libalgos.take_2d_multi_int8_int8,
("int8", "int32"): libalgos.take_2d_multi_int8_int32,
("int8", "int64"): libalgos.take_2d_multi_int8_int64,
("int8", "float64"): libalgos.take_2d_multi_int8_float64,
("int16", "int16"): libalgos.take_2d_multi_int16_int16,
("int16", "int32"): libalgos.take_2d_multi_int16_int32,
("int16", "int64"): libalgos.take_2d_multi_int16_int64,
("int16", "float64"): libalgos.take_2d_multi_int16_float64,
("int32", "int32"): libalgos.take_2d_multi_int32_int32,
("int32", "int64"): libalgos.take_2d_multi_int32_int64,
("int32", "float64"): libalgos.take_2d_multi_int32_float64,
("int64", "int64"): libalgos.take_2d_multi_int64_int64,
("int64", "float64"): libalgos.take_2d_multi_int64_float64,
("float32", "float32"): libalgos.take_2d_multi_float32_float32,
("float32", "float64"): libalgos.take_2d_multi_float32_float64,
("float64", "float64"): libalgos.take_2d_multi_float64_float64,
("object", "object"): libalgos.take_2d_multi_object_object,
("bool", "bool"): _view_wrapper(
libalgos.take_2d_multi_bool_bool, np.uint8, np.uint8
),
("bool", "object"): _view_wrapper(
libalgos.take_2d_multi_bool_object, np.uint8, None
),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
}
def _take_nd_object(
arr: np.ndarray,
indexer: npt.NDArray[np.intp],
out: np.ndarray,
axis: AxisInt,
fill_value,
mask_info,
) -> None:
if mask_info is not None:
mask, needs_masking = mask_info
else:
mask = indexer == -1
needs_masking = mask.any()
if arr.dtype != out.dtype:
arr = arr.astype(out.dtype)
if arr.shape[axis] > 0:
arr.take(indexer, axis=axis, out=out)
if needs_masking:
outindexer = [slice(None)] * arr.ndim
outindexer[axis] = mask
out[tuple(outindexer)] = fill_value
def _take_2d_multi_object(
arr: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value,
mask_info,
) -> None:
# this is not ideal, performance-wise, but it's better than raising
# an exception (best to optimize in Cython to avoid getting here)
row_idx, col_idx = indexer # both np.intp
if mask_info is not None:
(row_mask, col_mask), (row_needs, col_needs) = mask_info
else:
row_mask = row_idx == -1
col_mask = col_idx == -1
row_needs = row_mask.any()
col_needs = col_mask.any()
if fill_value is not None:
if row_needs:
out[row_mask, :] = fill_value
if col_needs:
out[:, col_mask] = fill_value
for i, u_ in enumerate(row_idx):
if u_ != -1:
for j, v in enumerate(col_idx):
if v != -1:
out[i, j] = arr[u_, v]
def _take_preprocess_indexer_and_fill_value(
arr: np.ndarray,
indexer: npt.NDArray[np.intp],
fill_value,
allow_fill: bool,
mask: npt.NDArray[np.bool_] | None = None,
):
mask_info: tuple[np.ndarray | None, bool] | None = None
if not allow_fill:
dtype, fill_value = arr.dtype, arr.dtype.type()
mask_info = None, False
else:
# check for promotion based on types only (do this first because
# it's faster than computing a mask)
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
if dtype != arr.dtype:
# check if promotion is actually required based on indexer
if mask is not None:
needs_masking = True
else:
mask = indexer == -1
needs_masking = bool(mask.any())
mask_info = mask, needs_masking
if not needs_masking:
# if not, then depromote, set fill_value to dummy
# (it won't be used but we don't want the cython code
# to crash when trying to cast it to dtype)
dtype, fill_value = arr.dtype, arr.dtype.type()
return dtype, fill_value, mask_info
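# --- Standalone usage sketch (not part of this module) ----------------------
# Illustrates the take-with-fill idea implemented above: -1 entries in the
# indexer mark positions that should receive the fill value, which may force
# a dtype promotion (here int64 -> float64 so np.nan can be stored).
if __name__ == "__main__":
    import numpy as np

    arr = np.array([10, 20, 30], dtype=np.int64)
    indexer = np.array([2, -1, 0], dtype=np.intp)

    m = indexer == -1
    out = arr.take(indexer).astype(np.float64)  # -1 wraps around; fixed next line
    out[m] = np.nan
    assert out[0] == 30.0 and np.isnan(out[1]) and out[2] == 10.0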

View File

@ -0,0 +1,50 @@
"""
transforms.py is for shape-preserving functions.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from pandas._typing import (
AxisInt,
Scalar,
)
def shift(
values: np.ndarray, periods: int, axis: AxisInt, fill_value: Scalar
) -> np.ndarray:
new_values = values
if periods == 0 or values.size == 0:
return new_values.copy()
# make sure array sent to np.roll is c_contiguous
f_ordered = values.flags.f_contiguous
if f_ordered:
new_values = new_values.T
axis = new_values.ndim - axis - 1
if new_values.size:
new_values = np.roll(
new_values,
np.intp(periods),
axis=axis,
)
axis_indexer = [slice(None)] * values.ndim
if periods > 0:
axis_indexer[axis] = slice(None, periods)
else:
axis_indexer[axis] = slice(periods, None)
new_values[tuple(axis_indexer)] = fill_value
# restore original order
if f_ordered:
new_values = new_values.T
return new_values
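# --- Standalone usage sketch (not part of this module) ----------------------
# Illustrates the shift-by-roll technique used above: np.roll moves every
# element by ``periods`` positions, then the slots that wrapped around are
# overwritten with the fill value.
if __name__ == "__main__":
    import numpy as np

    vals = np.array([1.0, 2.0, 3.0, 4.0])
    periods, fill = 2, np.nan

    shifted = np.roll(vals, periods)
    shifted[:periods] = fill          # the wrapped-around leading slots
    # shifted is now [nan, nan, 1., 2.]
    assert np.isnan(shifted[:2]).all() and shifted[2:].tolist() == [1.0, 2.0]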

View File

@ -0,0 +1,530 @@
"""
Methods that can be shared by many array-like classes or subclasses:
Series
Index
ExtensionArray
"""
from __future__ import annotations
import operator
from typing import Any
import numpy as np
from pandas._libs import lib
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
from pandas.core.dtypes.generic import ABCNDFrame
from pandas.core import roperator
from pandas.core.construction import extract_array
from pandas.core.ops.common import unpack_zerodim_and_defer
REDUCTION_ALIASES = {
"maximum": "max",
"minimum": "min",
"add": "sum",
"multiply": "prod",
}
class OpsMixin:
# -------------------------------------------------------------
# Comparisons
def _cmp_method(self, other, op):
return NotImplemented
@unpack_zerodim_and_defer("__eq__")
def __eq__(self, other):
return self._cmp_method(other, operator.eq)
@unpack_zerodim_and_defer("__ne__")
def __ne__(self, other):
return self._cmp_method(other, operator.ne)
@unpack_zerodim_and_defer("__lt__")
def __lt__(self, other):
return self._cmp_method(other, operator.lt)
@unpack_zerodim_and_defer("__le__")
def __le__(self, other):
return self._cmp_method(other, operator.le)
@unpack_zerodim_and_defer("__gt__")
def __gt__(self, other):
return self._cmp_method(other, operator.gt)
@unpack_zerodim_and_defer("__ge__")
def __ge__(self, other):
return self._cmp_method(other, operator.ge)
# -------------------------------------------------------------
# Logical Methods
def _logical_method(self, other, op):
return NotImplemented
@unpack_zerodim_and_defer("__and__")
def __and__(self, other):
return self._logical_method(other, operator.and_)
@unpack_zerodim_and_defer("__rand__")
def __rand__(self, other):
return self._logical_method(other, roperator.rand_)
@unpack_zerodim_and_defer("__or__")
def __or__(self, other):
return self._logical_method(other, operator.or_)
@unpack_zerodim_and_defer("__ror__")
def __ror__(self, other):
return self._logical_method(other, roperator.ror_)
@unpack_zerodim_and_defer("__xor__")
def __xor__(self, other):
return self._logical_method(other, operator.xor)
@unpack_zerodim_and_defer("__rxor__")
def __rxor__(self, other):
return self._logical_method(other, roperator.rxor)
# -------------------------------------------------------------
# Arithmetic Methods
def _arith_method(self, other, op):
return NotImplemented
@unpack_zerodim_and_defer("__add__")
def __add__(self, other):
"""
Get Addition of DataFrame and other, column-wise.
Equivalent to ``DataFrame.add(other)``.
Parameters
----------
other : scalar, sequence, Series, dict or DataFrame
Object to be added to the DataFrame.
Returns
-------
DataFrame
The result of adding ``other`` to DataFrame.
See Also
--------
DataFrame.add : Add a DataFrame and another object, with option for index-
or column-oriented addition.
Examples
--------
>>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]},
... index=['elk', 'moose'])
>>> df
height weight
elk 1.5 500
moose 2.6 800
Adding a scalar affects all rows and columns.
>>> df[['height', 'weight']] + 1.5
height weight
elk 3.0 501.5
moose 4.1 801.5
Each element of a list is added to a column of the DataFrame, in order.
>>> df[['height', 'weight']] + [0.5, 1.5]
height weight
elk 2.0 501.5
moose 3.1 801.5
Keys of a dictionary are aligned to the DataFrame, based on column names;
each value in the dictionary is added to the corresponding column.
>>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5}
height weight
elk 2.0 501.5
moose 3.1 801.5
When `other` is a :class:`Series`, the index of `other` is aligned with the
columns of the DataFrame.
>>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height'])
>>> df[['height', 'weight']] + s1
height weight
elk 3.0 500.5
moose 4.1 800.5
Even when the index of `other` is the same as the index of the DataFrame,
the :class:`Series` will not be reoriented. If index-wise alignment is desired,
:meth:`DataFrame.add` should be used with `axis='index'`.
>>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose'])
>>> df[['height', 'weight']] + s2
elk height moose weight
elk NaN NaN NaN NaN
moose NaN NaN NaN NaN
>>> df[['height', 'weight']].add(s2, axis='index')
height weight
elk 2.0 500.5
moose 4.1 801.5
When `other` is a :class:`DataFrame`, both columns names and the
index are aligned.
>>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]},
... index=['elk', 'moose', 'deer'])
>>> df[['height', 'weight']] + other
height weight
deer NaN NaN
elk 1.7 NaN
moose 3.0 NaN
"""
return self._arith_method(other, operator.add)
@unpack_zerodim_and_defer("__radd__")
def __radd__(self, other):
return self._arith_method(other, roperator.radd)
@unpack_zerodim_and_defer("__sub__")
def __sub__(self, other):
return self._arith_method(other, operator.sub)
@unpack_zerodim_and_defer("__rsub__")
def __rsub__(self, other):
return self._arith_method(other, roperator.rsub)
@unpack_zerodim_and_defer("__mul__")
def __mul__(self, other):
return self._arith_method(other, operator.mul)
@unpack_zerodim_and_defer("__rmul__")
def __rmul__(self, other):
return self._arith_method(other, roperator.rmul)
@unpack_zerodim_and_defer("__truediv__")
def __truediv__(self, other):
return self._arith_method(other, operator.truediv)
@unpack_zerodim_and_defer("__rtruediv__")
def __rtruediv__(self, other):
return self._arith_method(other, roperator.rtruediv)
@unpack_zerodim_and_defer("__floordiv__")
def __floordiv__(self, other):
return self._arith_method(other, operator.floordiv)
@unpack_zerodim_and_defer("__rfloordiv")
def __rfloordiv__(self, other):
return self._arith_method(other, roperator.rfloordiv)
@unpack_zerodim_and_defer("__mod__")
def __mod__(self, other):
return self._arith_method(other, operator.mod)
@unpack_zerodim_and_defer("__rmod__")
def __rmod__(self, other):
return self._arith_method(other, roperator.rmod)
@unpack_zerodim_and_defer("__divmod__")
def __divmod__(self, other):
return self._arith_method(other, divmod)
@unpack_zerodim_and_defer("__rdivmod__")
def __rdivmod__(self, other):
return self._arith_method(other, roperator.rdivmod)
@unpack_zerodim_and_defer("__pow__")
def __pow__(self, other):
return self._arith_method(other, operator.pow)
@unpack_zerodim_and_defer("__rpow__")
def __rpow__(self, other):
return self._arith_method(other, roperator.rpow)
# -----------------------------------------------------------------------------
# Helpers to implement __array_ufunc__
def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any):
"""
Compatibility with numpy ufuncs.
See also
--------
numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__
"""
from pandas.core.frame import (
DataFrame,
Series,
)
from pandas.core.generic import NDFrame
from pandas.core.internals import (
ArrayManager,
BlockManager,
)
cls = type(self)
kwargs = _standardize_out_kwarg(**kwargs)
# for binary ops, use our custom dunder methods
result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs)
if result is not NotImplemented:
return result
# Determine if we should defer.
no_defer = (
np.ndarray.__array_ufunc__,
cls.__array_ufunc__,
)
for item in inputs:
higher_priority = (
hasattr(item, "__array_priority__")
and item.__array_priority__ > self.__array_priority__
)
has_array_ufunc = (
hasattr(item, "__array_ufunc__")
and type(item).__array_ufunc__ not in no_defer
and not isinstance(item, self._HANDLED_TYPES)
)
if higher_priority or has_array_ufunc:
return NotImplemented
# align all the inputs.
types = tuple(type(x) for x in inputs)
alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]
if len(alignable) > 1:
# This triggers alignment.
# At the moment, there aren't any ufuncs with more than two inputs
# so this ends up just being x1.index | x2.index, but we write
# it to handle *args.
set_types = set(types)
if len(set_types) > 1 and {DataFrame, Series}.issubset(set_types):
# We currently don't handle ufunc(DataFrame, Series)
# well. Previously this raised an internal ValueError. We might
# support it someday, so raise a NotImplementedError.
raise NotImplementedError(
f"Cannot apply ufunc {ufunc} to mixed DataFrame and Series inputs."
)
axes = self.axes
for obj in alignable[1:]:
# this relies on the fact that we aren't handling mixed
# series / frame ufuncs.
for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
axes[i] = ax1.union(ax2)
reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
inputs = tuple(
x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
for x, t in zip(inputs, types)
)
else:
reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
if self.ndim == 1:
names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
name = names[0] if len(set(names)) == 1 else None
reconstruct_kwargs = {"name": name}
else:
reconstruct_kwargs = {}
def reconstruct(result):
if ufunc.nout > 1:
# np.modf, np.frexp, np.divmod
return tuple(_reconstruct(x) for x in result)
return _reconstruct(result)
def _reconstruct(result):
if lib.is_scalar(result):
return result
if result.ndim != self.ndim:
if method == "outer":
raise NotImplementedError
return result
if isinstance(result, (BlockManager, ArrayManager)):
# we went through BlockManager.apply e.g. np.sqrt
result = self._constructor_from_mgr(result, axes=result.axes)
else:
# we converted an array, lost our axes
result = self._constructor(
result, **reconstruct_axes, **reconstruct_kwargs, copy=False
)
# TODO: When we support multiple values in __finalize__, this
# should pass alignable to `__finalize__` instead of self.
# Then `np.add(a, b)` would consider attrs from both a and b
# when a and b are NDFrames.
if len(alignable) == 1:
result = result.__finalize__(self)
return result
if "out" in kwargs:
# e.g. test_multiindex_get_loc
result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs)
return reconstruct(result)
if method == "reduce":
# e.g. test.series.test_ufunc.test_reduce
result = dispatch_reduction_ufunc(self, ufunc, method, *inputs, **kwargs)
if result is not NotImplemented:
return result
# We still get here with kwargs `axis` for e.g. np.maximum.accumulate
# and `dtype` and `keepdims` for np.ptp
if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1):
# Just give up on preserving types in the complex case.
            # In theory we could preserve them in these cases:
# * nout>1 is doable if BlockManager.apply took nout and
# returned a Tuple[BlockManager].
# * len(inputs) > 1 is doable when we know that we have
# aligned blocks / dtypes.
# e.g. my_ufunc, modf, logaddexp, heaviside, subtract, add
inputs = tuple(np.asarray(x) for x in inputs)
# Note: we can't use default_array_ufunc here bc reindexing means
# that `self` may not be among `inputs`
result = getattr(ufunc, method)(*inputs, **kwargs)
elif self.ndim == 1:
# ufunc(series, ...)
inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
result = getattr(ufunc, method)(*inputs, **kwargs)
else:
# ufunc(dataframe)
if method == "__call__" and not kwargs:
# for np.<ufunc>(..) calls
# kwargs cannot necessarily be handled block-by-block, so only
# take this path if there are no kwargs
mgr = inputs[0]._mgr
result = mgr.apply(getattr(ufunc, method))
else:
# otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..))
# Those can have an axis keyword and thus can't be called block-by-block
result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)
# e.g. np.negative (only one reached), with "where" and "out" in kwargs
result = reconstruct(result)
return result
def _standardize_out_kwarg(**kwargs) -> dict:
"""
If kwargs contain "out1" and "out2", replace that with a tuple "out"
np.divmod, np.modf, np.frexp can have either `out=(out1, out2)` or
    `out1=out1, out2=out2`
"""
if "out" not in kwargs and "out1" in kwargs and "out2" in kwargs:
out1 = kwargs.pop("out1")
out2 = kwargs.pop("out2")
out = (out1, out2)
kwargs["out"] = out
return kwargs
def dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
If we have an `out` keyword, then call the ufunc without `out` and then
set the result into the given `out`.
"""
# Note: we assume _standardize_out_kwarg has already been called.
out = kwargs.pop("out")
where = kwargs.pop("where", None)
result = getattr(ufunc, method)(*inputs, **kwargs)
if result is NotImplemented:
return NotImplemented
if isinstance(result, tuple):
# i.e. np.divmod, np.modf, np.frexp
if not isinstance(out, tuple) or len(out) != len(result):
raise NotImplementedError
for arr, res in zip(out, result):
_assign_where(arr, res, where)
return out
if isinstance(out, tuple):
if len(out) == 1:
out = out[0]
else:
raise NotImplementedError
_assign_where(out, result, where)
return out
def _assign_where(out, result, where) -> None:
"""
Set a ufunc result into 'out', masking with a 'where' argument if necessary.
"""
if where is None:
# no 'where' arg passed to ufunc
out[:] = result
else:
np.putmask(out, where, result)
def default_array_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
Fallback to the behavior we would get if we did not define __array_ufunc__.
Notes
-----
We are assuming that `self` is among `inputs`.
"""
if not any(x is self for x in inputs):
raise NotImplementedError
new_inputs = [x if x is not self else np.asarray(x) for x in inputs]
return getattr(ufunc, method)(*new_inputs, **kwargs)
def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
Dispatch ufunc reductions to self's reduction methods.
"""
assert method == "reduce"
if len(inputs) != 1 or inputs[0] is not self:
return NotImplemented
if ufunc.__name__ not in REDUCTION_ALIASES:
return NotImplemented
method_name = REDUCTION_ALIASES[ufunc.__name__]
# NB: we are assuming that min/max represent minimum/maximum methods,
# which would not be accurate for e.g. Timestamp.min
if not hasattr(self, method_name):
return NotImplemented
if self.ndim > 1:
if isinstance(self, ABCNDFrame):
# TODO: test cases where this doesn't hold, i.e. 2D DTA/TDA
kwargs["numeric_only"] = False
if "axis" not in kwargs:
# For DataFrame reductions we don't want the default axis=0
# Note: np.min is not a ufunc, but uses array_function_dispatch,
# so calls DataFrame.min (without ever getting here) with the np.min
# default of axis=None, which DataFrame.min catches and changes to axis=0.
# np.minimum.reduce(df) gets here bc axis is not in kwargs,
            # so we set axis=0 to match the behavior of np.minimum.reduce(df.values)
kwargs["axis"] = 0
# By default, numpy's reductions do not skip NaNs, so we have to
# pass skipna=False
return getattr(self, method_name)(skipna=False, **kwargs)
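# --- Standalone usage sketch (not part of this module) ----------------------
# Illustrates the generic __array_ufunc__ dispatch pattern that array_ufunc
# implements above, reduced to its core: intercept the ufunc call, unwrap the
# inputs, run the ufunc, and re-wrap the result in the original container type.
if __name__ == "__main__":
    import numpy as np

    class Wrapped:
        def __init__(self, data):
            self._data = np.asarray(data)

        def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
            # unwrap any Wrapped inputs, run the ufunc, wrap ndarray results again
            unwrapped = [x._data if isinstance(x, Wrapped) else x for x in inputs]
            result = getattr(ufunc, method)(*unwrapped, **kwargs)
            return Wrapped(result) if isinstance(result, np.ndarray) else result

    res = np.add(Wrapped([1, 2]), 3)      # dispatches to Wrapped.__array_ufunc__
    assert isinstance(res, Wrapped) and res._data.tolist() == [4, 5]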

View File

@ -0,0 +1,43 @@
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.base import (
ExtensionArray,
ExtensionOpsMixin,
ExtensionScalarOpsMixin,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.datetimes import DatetimeArray
from pandas.core.arrays.floating import FloatingArray
from pandas.core.arrays.integer import IntegerArray
from pandas.core.arrays.interval import IntervalArray
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.arrays.period import (
PeriodArray,
period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray
__all__ = [
"ArrowExtensionArray",
"ExtensionArray",
"ExtensionOpsMixin",
"ExtensionScalarOpsMixin",
"ArrowStringArray",
"BaseMaskedArray",
"BooleanArray",
"Categorical",
"DatetimeArray",
"FloatingArray",
"IntegerArray",
"IntervalArray",
"NumpyExtensionArray",
"PeriodArray",
"period_array",
"SparseArray",
"StringArray",
"TimedeltaArray",
]
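# --- Standalone usage sketch (not part of this package __init__) ------------
# The array classes re-exported here are usually constructed through the
# public ``pd.array`` constructor rather than imported from pandas.core.arrays
# directly; the dtype string selects the concrete ExtensionArray subclass.
if __name__ == "__main__":
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")   # -> IntegerArray with pd.NA
    assert type(arr).__name__ == "IntegerArray"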

View File

@ -0,0 +1,362 @@
from __future__ import annotations
from functools import partial
import re
from typing import (
TYPE_CHECKING,
Any,
Literal,
)
import numpy as np
from pandas._libs import lib
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
pa_version_under13p0,
pa_version_under17p0,
)
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
if TYPE_CHECKING:
from collections.abc import Callable
from pandas._typing import (
Scalar,
Self,
)
class ArrowStringArrayMixin:
_pa_array: pa.ChunkedArray
def __init__(self, *args, **kwargs) -> None:
raise NotImplementedError
def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
# Convert a bool-dtype result to the appropriate result type
raise NotImplementedError
def _convert_int_result(self, result):
# Convert an integer-dtype result to the appropriate result type
raise NotImplementedError
def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
raise NotImplementedError
def _str_len(self):
result = pc.utf8_length(self._pa_array)
return self._convert_int_result(result)
def _str_lower(self) -> Self:
return type(self)(pc.utf8_lower(self._pa_array))
def _str_upper(self) -> Self:
return type(self)(pc.utf8_upper(self._pa_array))
def _str_strip(self, to_strip=None) -> Self:
if to_strip is None:
result = pc.utf8_trim_whitespace(self._pa_array)
else:
result = pc.utf8_trim(self._pa_array, characters=to_strip)
return type(self)(result)
def _str_lstrip(self, to_strip=None) -> Self:
if to_strip is None:
result = pc.utf8_ltrim_whitespace(self._pa_array)
else:
result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
return type(self)(result)
def _str_rstrip(self, to_strip=None) -> Self:
if to_strip is None:
result = pc.utf8_rtrim_whitespace(self._pa_array)
else:
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
return type(self)(result)
def _str_pad(
self,
width: int,
side: Literal["left", "right", "both"] = "left",
fillchar: str = " ",
):
if side == "left":
pa_pad = pc.utf8_lpad
elif side == "right":
pa_pad = pc.utf8_rpad
elif side == "both":
if pa_version_under17p0:
# GH#59624 fall back to object dtype
from pandas import array as pd_array
obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined]
obj = pd_array(obj_arr, dtype=object)
result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined]
return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined]
else:
# GH#54792
# https://github.com/apache/arrow/issues/15053#issuecomment-2317032347
lean_left = (width % 2) == 0
pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left)
else:
raise ValueError(
f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
)
return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
def _str_get(self, i: int):
lengths = pc.utf8_length(self._pa_array)
if i >= 0:
out_of_bounds = pc.greater_equal(i, lengths)
start = i
stop = i + 1
step = 1
else:
out_of_bounds = pc.greater(-i, lengths)
start = i
stop = i - 1
step = -1
not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
selected = pc.utf8_slice_codeunits(
self._pa_array, start=start, stop=stop, step=step
)
null_value = pa.scalar(None, type=self._pa_array.type)
result = pc.if_else(not_out_of_bounds, selected, null_value)
return type(self)(result)
def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
):
if pa_version_under11p0:
# GH#59724
result = self._apply_elementwise(lambda val: val[start:stop:step])
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
if start is None:
if step is not None and step < 0:
# GH#59710
start = -1
else:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)
def _str_slice_replace(
self, start: int | None = None, stop: int | None = None, repl: str | None = None
):
if repl is None:
repl = ""
if start is None:
start = 0
if stop is None:
stop = np.iinfo(np.int64).max
return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
def _str_replace(
self,
pat: str | re.Pattern,
repl: str | Callable,
n: int = -1,
case: bool = True,
flags: int = 0,
regex: bool = True,
) -> Self:
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
raise NotImplementedError(
"replace is not supported with a re.Pattern, callable repl, "
"case=False, or flags!=0"
)
func = pc.replace_substring_regex if regex else pc.replace_substring
# https://github.com/apache/arrow/issues/39149
# GH 56404, unexpected behavior with negative max_replacements with pyarrow.
pa_max_replacements = None if n < 0 else n
result = func(
self._pa_array,
pattern=pat,
replacement=repl,
max_replacements=pa_max_replacements,
)
return type(self)(result)
def _str_capitalize(self) -> Self:
return type(self)(pc.utf8_capitalize(self._pa_array))
def _str_title(self):
return type(self)(pc.utf8_title(self._pa_array))
def _str_swapcase(self):
return type(self)(pc.utf8_swapcase(self._pa_array))
def _str_removeprefix(self, prefix: str):
if not pa_version_under13p0:
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
result = pc.if_else(starts_with, removed, self._pa_array)
return type(self)(result)
predicate = lambda val: val.removeprefix(prefix)
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))
def _str_removesuffix(self, suffix: str):
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
result = pc.if_else(ends_with, removed, self._pa_array)
return type(self)(result)
def _str_startswith(
self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
):
if isinstance(pat, str):
result = pc.starts_with(self._pa_array, pattern=pat)
else:
if len(pat) == 0:
# For empty tuple we return null for missing values and False
# for valid values.
result = pc.if_else(pc.is_null(self._pa_array), None, False)
else:
result = pc.starts_with(self._pa_array, pattern=pat[0])
for p in pat[1:]:
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
return self._convert_bool_result(result, na=na, method_name="startswith")
def _str_endswith(
self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
):
if isinstance(pat, str):
result = pc.ends_with(self._pa_array, pattern=pat)
else:
if len(pat) == 0:
# For empty tuple we return null for missing values and False
# for valid values.
result = pc.if_else(pc.is_null(self._pa_array), None, False)
else:
result = pc.ends_with(self._pa_array, pattern=pat[0])
for p in pat[1:]:
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
return self._convert_bool_result(result, na=na, method_name="endswith")
def _str_isalnum(self):
result = pc.utf8_is_alnum(self._pa_array)
return self._convert_bool_result(result)
def _str_isalpha(self):
result = pc.utf8_is_alpha(self._pa_array)
return self._convert_bool_result(result)
def _str_isdecimal(self):
result = pc.utf8_is_decimal(self._pa_array)
return self._convert_bool_result(result)
def _str_isdigit(self):
result = pc.utf8_is_digit(self._pa_array)
return self._convert_bool_result(result)
def _str_islower(self):
result = pc.utf8_is_lower(self._pa_array)
return self._convert_bool_result(result)
def _str_isnumeric(self):
result = pc.utf8_is_numeric(self._pa_array)
return self._convert_bool_result(result)
def _str_isspace(self):
result = pc.utf8_is_space(self._pa_array)
return self._convert_bool_result(result)
def _str_istitle(self):
result = pc.utf8_is_title(self._pa_array)
return self._convert_bool_result(result)
def _str_isupper(self):
result = pc.utf8_is_upper(self._pa_array)
return self._convert_bool_result(result)
def _str_contains(
self,
pat,
case: bool = True,
flags: int = 0,
na: Scalar | lib.NoDefault = lib.no_default,
regex: bool = True,
):
if flags:
raise NotImplementedError(f"contains not implemented with {flags=}")
if regex:
pa_contains = pc.match_substring_regex
else:
pa_contains = pc.match_substring
result = pa_contains(self._pa_array, pat, ignore_case=not case)
return self._convert_bool_result(result, na=na, method_name="contains")
def _str_match(
self,
pat: str | re.Pattern,
case: bool = True,
flags: int = 0,
na: Scalar | lib.NoDefault = lib.no_default,
):
if isinstance(pat, re.Pattern):
# GH#61952
pat = pat.pattern
if isinstance(pat, str) and not pat.startswith("^"):
pat = f"^{pat}"
return self._str_contains(pat, case, flags, na, regex=True)
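# Full matches are emulated by anchoring the pattern at both ends: the "$"
# appended below plus the "^" added by _str_match turn e.g. "ab" into
# "^ab$" before delegating to pc.match_substring_regex.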
def _str_fullmatch(
self,
pat: str | re.Pattern,
case: bool = True,
flags: int = 0,
na: Scalar | lib.NoDefault = lib.no_default,
):
if isinstance(pat, re.Pattern):
# GH#61952
pat = pat.pattern
if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
pat = f"{pat}$"
return self._str_match(pat, case, flags, na)
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
if (
pa_version_under13p0
and not (start != 0 and end is not None)
and not (start == 0 and end is None)
):
# GH#59562
res_list = self._apply_elementwise(lambda val: val.find(sub, start, end))
return self._convert_int_result(pa.chunked_array(res_list))
if (start == 0 or start is None) and end is None:
result = pc.find_substring(self._pa_array, sub)
else:
if sub == "":
# GH#56792
res_list = self._apply_elementwise(
lambda val: val.find(sub, start, end)
)
return self._convert_int_result(pa.chunked_array(res_list))
if start is None:
start_offset = 0
start = 0
elif start < 0:
start_offset = pc.add(start, pc.utf8_length(self._pa_array))
start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset)
else:
start_offset = start
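# Search inside the sliced window, then shift hits by start_offset so the
# returned position refers to the original string; misses stay at -1.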
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
result = pc.find_substring(slices, sub)
found = pc.not_equal(result, pa.scalar(-1, type=result.type))
offset_result = pc.add(result, start_offset)
result = pc.if_else(found, offset_result, -1)
return self._convert_int_result(result)

View File

@ -0,0 +1,544 @@
from __future__ import annotations
from functools import wraps
from typing import (
TYPE_CHECKING,
Any,
Literal,
cast,
overload,
)
import numpy as np
from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.tslibs import is_supported_dtype
from pandas._typing import (
ArrayLike,
AxisInt,
Dtype,
F,
FillnaOptions,
PositionalIndexer2D,
PositionalIndexerTuple,
ScalarIndexer,
Self,
SequenceIndexer,
Shape,
TakeIndexer,
npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
validate_insert_loc,
)
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
PeriodDtype,
)
from pandas.core.dtypes.missing import array_equivalent
from pandas.core import missing
from pandas.core.algorithms import (
take,
unique,
value_counts_internal as value_counts,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays.base import ExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.sorting import nargminmax
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
)
from pandas import Series
def ravel_compat(meth: F) -> F:
"""
Decorator to ravel a 2D array before passing it to a cython operation,
then reshape the result to our own shape.
"""
@wraps(meth)
def method(self, *args, **kwargs):
if self.ndim == 1:
return meth(self, *args, **kwargs)
flags = self._ndarray.flags
flat = self.ravel("K")
result = meth(flat, *args, **kwargs)
order = "F" if flags.f_contiguous else "C"
return result.reshape(self.shape, order=order)
return cast(F, method)
class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
"""
ExtensionArray that is backed by a single NumPy ndarray.
"""
_ndarray: np.ndarray
# scalar used to denote NA value inside our self._ndarray, e.g. -1
# for Categorical, iNaT for Period. Outside of object dtype,
# self.isna() should be exactly locations in self._ndarray with
# _internal_fill_value.
_internal_fill_value: Any
def _box_func(self, x):
"""
Wrap numpy type in our dtype.type if necessary.
"""
return x
def _validate_scalar(self, value):
# used by NDArrayBackedExtensionIndex.insert
raise AbstractMethodError(self)
# ------------------------------------------------------------------------
def view(self, dtype: Dtype | None = None) -> ArrayLike:
# We handle datetime64, datetime64tz, timedelta64, and period
# dtypes here. Everything else we pass through to the underlying
# ndarray.
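# A rough sketch of the intent, assuming an i8-backed datetime array:
# viewing as "int64" falls through to the raw ndarray view (the nanosecond
# integers), while viewing as a DatetimeTZDtype re-wraps the same buffer
# as a tz-aware array without copying.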
if dtype is None or dtype is self.dtype:
return self._from_backing_data(self._ndarray)
if isinstance(dtype, type):
# we sometimes pass non-dtype objects, e.g. np.ndarray;
# pass those through to the underlying ndarray
return self._ndarray.view(dtype)
dtype = pandas_dtype(dtype)
arr = self._ndarray
if isinstance(dtype, PeriodDtype):
cls = dtype.construct_array_type()
return cls(arr.view("i8"), dtype=dtype)
elif isinstance(dtype, DatetimeTZDtype):
dt_cls = dtype.construct_array_type()
dt64_values = arr.view(f"M8[{dtype.unit}]")
return dt_cls._simple_new(dt64_values, dtype=dtype)
elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
from pandas.core.arrays import DatetimeArray
dt64_values = arr.view(dtype)
return DatetimeArray._simple_new(dt64_values, dtype=dtype)
elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
from pandas.core.arrays import TimedeltaArray
td64_values = arr.view(dtype)
return TimedeltaArray._simple_new(td64_values, dtype=dtype)
# error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
# type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
# type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
# Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
return arr.view(dtype=dtype) # type: ignore[arg-type]
def take(
self,
indices: TakeIndexer,
*,
allow_fill: bool = False,
fill_value: Any = None,
axis: AxisInt = 0,
) -> Self:
if allow_fill:
fill_value = self._validate_scalar(fill_value)
new_data = take(
self._ndarray,
indices,
allow_fill=allow_fill,
fill_value=fill_value,
axis=axis,
)
return self._from_backing_data(new_data)
# ------------------------------------------------------------------------
def equals(self, other) -> bool:
if type(self) is not type(other):
return False
if self.dtype != other.dtype:
return False
return bool(array_equivalent(self._ndarray, other._ndarray, dtype_equal=True))
@classmethod
def _from_factorized(cls, values, original):
assert values.dtype == original._ndarray.dtype
return original._from_backing_data(values)
def _values_for_argsort(self) -> np.ndarray:
return self._ndarray
def _values_for_factorize(self):
return self._ndarray, self._internal_fill_value
def _hash_pandas_object(
self, *, encoding: str, hash_key: str, categorize: bool
) -> npt.NDArray[np.uint64]:
from pandas.core.util.hashing import hash_array
values = self._ndarray
return hash_array(
values, encoding=encoding, hash_key=hash_key, categorize=categorize
)
# Signature of "argmin" incompatible with supertype "ExtensionArray"
def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self._hasna:
raise NotImplementedError
return nargminmax(self, "argmin", axis=axis)
# Signature of "argmax" incompatible with supertype "ExtensionArray"
def argmax(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self._hasna:
raise NotImplementedError
return nargminmax(self, "argmax", axis=axis)
def unique(self) -> Self:
new_data = unique(self._ndarray)
return self._from_backing_data(new_data)
@classmethod
@doc(ExtensionArray._concat_same_type)
def _concat_same_type(
cls,
to_concat: Sequence[Self],
axis: AxisInt = 0,
) -> Self:
if not lib.dtypes_all_equal([x.dtype for x in to_concat]):
dtypes = {str(x.dtype) for x in to_concat}
raise ValueError("to_concat must have the same dtype", dtypes)
return super()._concat_same_type(to_concat, axis=axis)
@doc(ExtensionArray.searchsorted)
def searchsorted(
self,
value: NumpyValueArrayLike | ExtensionArray,
side: Literal["left", "right"] = "left",
sorter: NumpySorter | None = None,
) -> npt.NDArray[np.intp] | np.intp:
npvalue = self._validate_setitem_value(value)
return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)
@doc(ExtensionArray.shift)
def shift(self, periods: int = 1, fill_value=None):
# NB: shift is always along axis=0
axis = 0
fill_value = self._validate_scalar(fill_value)
new_values = shift(self._ndarray, periods, axis, fill_value)
return self._from_backing_data(new_values)
def __setitem__(self, key, value) -> None:
key = check_array_indexer(self, key)
value = self._validate_setitem_value(value)
self._ndarray[key] = value
def _validate_setitem_value(self, value):
return value
@overload
def __getitem__(self, key: ScalarIndexer) -> Any:
...
@overload
def __getitem__(
self,
key: SequenceIndexer | PositionalIndexerTuple,
) -> Self:
...
def __getitem__(
self,
key: PositionalIndexer2D,
) -> Self | Any:
if lib.is_integer(key):
# fast-path
result = self._ndarray[key]
if self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# error: Incompatible types in assignment (expression has type "ExtensionArray",
# variable has type "Union[int, slice, ndarray]")
key = extract_array(key, extract_numpy=True) # type: ignore[assignment]
key = check_array_indexer(self, key)
result = self._ndarray[key]
if lib.is_scalar(result):
return self._box_func(result)
result = self._from_backing_data(result)
return result
def _fill_mask_inplace(
self, method: str, limit: int | None, mask: npt.NDArray[np.bool_]
) -> None:
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
func(self._ndarray.T, limit=limit, mask=mask.T)
def _pad_or_backfill(
self,
*,
method: FillnaOptions,
limit: int | None = None,
limit_area: Literal["inside", "outside"] | None = None,
copy: bool = True,
) -> Self:
mask = self.isna()
if mask.any():
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T)
npvalues = npvalues.T
if copy:
new_values = self._from_backing_data(npvalues)
else:
new_values = self
else:
if copy:
new_values = self.copy()
else:
new_values = self
return new_values
@doc(ExtensionArray.fillna)
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
mask = self.isna()
# error: Argument 2 to "check_value_size" has incompatible type
# "ExtensionArray"; expected "ndarray"
value = missing.check_value_size(
value, mask, len(self) # type: ignore[arg-type]
)
if mask.any():
if method is not None:
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, mask=mask.T)
npvalues = npvalues.T
# TODO: NumpyExtensionArray previously did not copy; need tests for this

new_values = self._from_backing_data(npvalues)
else:
# fill with value
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
# We validate the fill_value even if there is nothing to fill
if value is not None:
self._validate_setitem_value(value)
if not copy:
new_values = self[:]
else:
new_values = self.copy()
return new_values
# ------------------------------------------------------------------------
# Reductions
def _wrap_reduction_result(self, axis: AxisInt | None, result):
if axis is None or self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# ------------------------------------------------------------------------
# __array_function__ methods
def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
"""
Analogue to np.putmask(self, mask, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
np.putmask(self._ndarray, mask, value)
def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self:
"""
Analogue to np.where(mask, self, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
res_values = np.where(mask, self._ndarray, value)
if res_values.dtype != self._ndarray.dtype:
raise AssertionError(
# GH#56410
"Something has gone wrong, please report a bug at "
"github.com/pandas-dev/pandas/"
)
return self._from_backing_data(res_values)
# ------------------------------------------------------------------------
# Index compat methods
def insert(self, loc: int, item) -> Self:
"""
Make new ExtensionArray inserting new item at location. Follows
Python list.append semantics for negative values.
Parameters
----------
loc : int
item : object
Returns
-------
type(self)
"""
loc = validate_insert_loc(loc, len(self))
code = self._validate_scalar(item)
new_vals = np.concatenate(
(
self._ndarray[:loc],
np.asarray([code], dtype=self._ndarray.dtype),
self._ndarray[loc:],
)
)
return self._from_backing_data(new_vals)
# ------------------------------------------------------------------------
# Additional array methods
# These are not part of the EA API, but we implement them because
# pandas assumes they're there.
def value_counts(self, dropna: bool = True) -> Series:
"""
Return a Series containing counts of unique values.
Parameters
----------
dropna : bool, default True
Don't include counts of NA values.
Returns
-------
Series
"""
if self.ndim != 1:
raise NotImplementedError
from pandas import (
Index,
Series,
)
if dropna:
# error: Unsupported operand type for ~ ("ExtensionArray")
values = self[~self.isna()]._ndarray # type: ignore[operator]
else:
values = self._ndarray
result = value_counts(values, sort=False, dropna=dropna)
index_arr = self._from_backing_data(np.asarray(result.index._data))
index = Index(index_arr, name=result.index.name)
return Series(result._values, index=index, name=result.name, copy=False)
def _quantile(
self,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> Self:
# TODO: disable for Categorical if not ordered?
mask = np.asarray(self.isna())
arr = self._ndarray
fill_value = self._internal_fill_value
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
if res_values.dtype == self._ndarray.dtype:
return self._from_backing_data(res_values)
else:
# e.g. in test_quantile_empty we start from an empty integer dtype and
# res_values comes back with a floating dtype
# TODO: technically __init__ isn't defined here.
# Should we raise NotImplementedError and handle this on NumpyEA?
return type(self)(res_values) # type: ignore[call-arg]
# ------------------------------------------------------------------------
# numpy-like methods
@classmethod
def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
"""
Analogous to np.empty(shape, dtype=dtype)
Parameters
----------
shape : tuple[int]
dtype : ExtensionDtype
"""
# The base implementation uses a naive approach to find the dtype
# for the backing ndarray
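# A minimal sketch, assuming a hypothetical concrete subclass ``MyEA``
# with dtype ``my_dtype``: ``MyEA._empty((4,), dtype=my_dtype)`` allocates
# an uninitialized backing ndarray of the matching numpy dtype and wraps
# it, without populating any values.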
arr = cls._from_sequence([], dtype=dtype)
backing = np.empty(shape, dtype=arr._ndarray.dtype)
return arr._from_backing_data(backing)

View File

@ -0,0 +1,207 @@
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas._libs.lib import i8max
from pandas._libs.tslibs import (
BaseOffset,
OutOfBoundsDatetime,
Timedelta,
Timestamp,
iNaT,
)
if TYPE_CHECKING:
from pandas._typing import npt
def generate_regular_range(
start: Timestamp | Timedelta | None,
end: Timestamp | Timedelta | None,
periods: int | None,
freq: BaseOffset,
unit: str = "ns",
) -> npt.NDArray[np.intp]:
"""
Generate a range of dates or timestamps with the spans between dates
described by the given `freq` DateOffset.
Parameters
----------
start : Timedelta, Timestamp or None
First point of produced date range.
end : Timedelta, Timestamp or None
Last point of produced date range.
periods : int or None
Number of periods in produced date range.
freq : Tick
Describes space between dates in produced date range.
unit : str, default "ns"
The resolution the output is meant to represent.
Returns
-------
ndarray[np.int64]
Representing the given resolution.
"""
istart = start._value if start is not None else None
iend = end._value if end is not None else None
freq.nanos # raises if non-fixed frequency
td = Timedelta(freq)
b: int
e: int
try:
td = td.as_unit(unit, round_ok=False)
except ValueError as err:
raise ValueError(
f"freq={freq} is incompatible with unit={unit}. "
"Use a lower freq or a higher unit instead."
) from err
stride = int(td._value)
if periods is None and istart is not None and iend is not None:
b = istart
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = b + (iend - b) // stride * stride + stride // 2 + 1
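# e.g. istart=0, iend=9, stride=3 gives e = 0 + 9 + 1 + 1 = 11, and
# np.arange(0, 11, 3) -> [0, 3, 6, 9] still includes the endpoint.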
elif istart is not None and periods is not None:
b = istart
e = _generate_range_overflow_safe(b, periods, stride, side="start")
elif iend is not None and periods is not None:
e = iend + stride
b = _generate_range_overflow_safe(e, periods, stride, side="end")
else:
raise ValueError(
"at least 'start' or 'end' should be specified if a 'period' is given."
)
with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
# and incorrectly return an empty array if not caught.
try:
values = np.arange(b, e, stride, dtype=np.int64)
except FloatingPointError:
xdr = [b]
while xdr[-1] != e:
xdr.append(xdr[-1] + stride)
values = np.array(xdr[:-1], dtype=np.int64)
return values
def _generate_range_overflow_safe(
endpoint: int, periods: int, stride: int, side: str = "start"
) -> int:
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
as OutOfBoundsDatetime.
Parameters
----------
endpoint : int
nanosecond timestamp of the known endpoint of the desired range
periods : int
number of periods in the desired range
stride : int
nanoseconds between periods in the desired range
side : {'start', 'end'}
which end of the range `endpoint` refers to
Returns
-------
other_end : int
Raises
------
OutOfBoundsDatetime
"""
# GH#14187 raise instead of incorrectly wrapping around
assert side in ["start", "end"]
i64max = np.uint64(i8max)
msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
with np.errstate(over="raise"):
# if periods * strides cannot be multiplied within the *uint64* bounds,
# we cannot salvage the operation by recursing, so raise
try:
addend = np.uint64(periods) * np.uint64(np.abs(stride))
except FloatingPointError as err:
raise OutOfBoundsDatetime(msg) from err
if np.abs(addend) <= i64max:
# relatively easy case without casting concerns
return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
elif (endpoint > 0 and side == "start" and stride > 0) or (
endpoint < 0 < stride and side == "end"
):
# no chance of not-overflowing
raise OutOfBoundsDatetime(msg)
elif side == "end" and endpoint - stride <= i64max < endpoint:
# in _generate_regular_range we added `stride` thereby overflowing
# the bounds. Adjust to fix this.
return _generate_range_overflow_safe(
endpoint - stride, periods - 1, stride, side
)
# split into smaller pieces
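# Halving `periods` recursively shrinks the product periods * stride until
# it fits in int64; the midpoint reached with the first half then serves
# as the endpoint for the remaining periods.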
mid_periods = periods // 2
remaining = periods - mid_periods
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side))
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
def _generate_range_overflow_safe_signed(
endpoint: int, periods: int, stride: int, side: str
) -> int:
"""
A special case for _generate_range_overflow_safe where `periods * stride`
can be calculated without overflowing int64 bounds.
"""
assert side in ["start", "end"]
if side == "end":
stride *= -1
with np.errstate(over="raise"):
addend = np.int64(periods) * np.int64(stride)
try:
# easy case with no overflows
result = np.int64(endpoint) + addend
if result == iNaT:
# Putting this into a DatetimeArray/TimedeltaArray
# would incorrectly be interpreted as NaT
raise OverflowError
return int(result)
except (FloatingPointError, OverflowError):
# with endpoint negative and addend positive we risk
# FloatingPointError; with the signs reversed we risk OverflowError
pass
# if stride and endpoint had opposite signs, then endpoint + addend
# should never overflow. so they must have the same signs
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
if stride > 0:
# watch out for very special case in which we just slightly
# exceed implementation bounds, but when passing the result to
# np.arange will get a result slightly within the bounds
uresult = np.uint64(endpoint) + np.uint64(addend)
i64max = np.uint64(i8max)
assert uresult > i64max
if uresult <= i64max + np.uint64(stride):
return int(uresult)
raise OutOfBoundsDatetime(
f"Cannot generate range with {side}={endpoint} and periods={periods}"
)

View File

@ -0,0 +1,63 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._libs import lib
from pandas.errors import LossySetitemError
from pandas.core.dtypes.cast import np_can_hold_element
from pandas.core.dtypes.common import is_numeric_dtype
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
npt,
)
def to_numpy_dtype_inference(
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
) -> tuple[npt.DTypeLike, Any]:
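# Infer a numpy dtype and fill value for converting a masked/arrow-backed
# array to numpy: numeric dtypes with missing values are widened (bool ->
# object, int/uint -> float64) so the fill value fits, an explicit dtype is
# passed through unchanged, and na_value defaults to the array's own
# na_value unless the target dtype implies np.nan or NaT.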
if dtype is None and is_numeric_dtype(arr.dtype):
dtype_given = False
if hasna:
if arr.dtype.kind == "b":
dtype = np.dtype(np.object_)
else:
if arr.dtype.kind in "iu":
dtype = np.dtype(np.float64)
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
na_value = np.nan
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
elif dtype is not None:
dtype = np.dtype(dtype)
dtype_given = True
else:
dtype_given = True
if na_value is lib.no_default:
if dtype is None or not hasna:
na_value = arr.dtype.na_value
elif dtype.kind == "f": # type: ignore[union-attr]
na_value = np.nan
elif dtype.kind == "M": # type: ignore[union-attr]
na_value = np.datetime64("nat")
elif dtype.kind == "m": # type: ignore[union-attr]
na_value = np.timedelta64("nat")
else:
na_value = arr.dtype.na_value
if not dtype_given and hasna:
try:
np_can_hold_element(dtype, na_value) # type: ignore[arg-type]
except LossySetitemError:
dtype = np.dtype(np.object_)
return dtype, na_value

View File

@ -0,0 +1,7 @@
from pandas.core.arrays.arrow.accessors import (
ListAccessor,
StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray
__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]

View File

@ -0,0 +1,50 @@
from __future__ import annotations
import numpy as np
import pyarrow
def pyarrow_array_to_numpy_and_mask(
arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
"""
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.
At the moment pyarrow.BooleanArray is not supported.
Parameters
----------
arr : pyarrow.Array
dtype : numpy.dtype
Returns
-------
(data, mask)
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
dtype = np.dtype(dtype)
if pyarrow.types.is_null(arr.type):
# No initialization of data is needed since everything is null
data = np.empty(len(arr), dtype=dtype)
mask = np.zeros(len(arr), dtype=bool)
return data, mask
buflist = arr.buffers()
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
# See also https://github.com/pandas-dev/pandas/issues/40896
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = buflist[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
)
mask = np.asarray(mask)
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask

View File

@ -0,0 +1,473 @@
"""Accessors for arrow-backed data."""
from __future__ import annotations
from abc import (
ABCMeta,
abstractmethod,
)
from typing import (
TYPE_CHECKING,
cast,
)
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
)
from pandas.core.dtypes.common import is_list_like
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
from pandas.core.dtypes.dtypes import ArrowDtype
if TYPE_CHECKING:
from collections.abc import Iterator
from pandas import (
DataFrame,
Series,
)
class ArrowAccessor(metaclass=ABCMeta):
@abstractmethod
def __init__(self, data, validation_msg: str) -> None:
self._data = data
self._validation_msg = validation_msg
self._validate(data)
@abstractmethod
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
pass
def _validate(self, data):
dtype = data.dtype
if pa_version_under10p1 or not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
# Raise AttributeError so that inspect can handle invalid Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
@property
def _pa_array(self):
return self._data.array._pa_array
class ListAccessor(ArrowAccessor):
"""
Accessor object for list data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow list data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg="Can only use the '.list' accessor with "
"'list[pyarrow]' dtype, not {dtype}.",
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return (
pa.types.is_list(pyarrow_dtype)
or pa.types.is_fixed_size_list(pyarrow_dtype)
or pa.types.is_large_list(pyarrow_dtype)
)
def len(self) -> Series:
"""
Return the length of each list in the Series.
Returns
-------
pandas.Series
The length of each list.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.len()
0 3
1 1
dtype: int32[pyarrow]
"""
from pandas import Series
value_lengths = pc.list_value_length(self._pa_array)
return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
def __getitem__(self, key: int | slice) -> Series:
"""
Index or slice lists in the Series.
Parameters
----------
key : int | slice
Index or slice of indices to access from each list.
Returns
-------
pandas.Series
The list at requested index.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list[0]
0 1
1 3
dtype: int64[pyarrow]
"""
from pandas import Series
if isinstance(key, int):
# TODO: Support negative key but pyarrow does not allow
# element index to be an array.
# if key < 0:
# key = pc.add(key, pc.list_value_length(self._pa_array))
element = pc.list_element(self._pa_array, key)
return Series(element, dtype=ArrowDtype(element.type))
elif isinstance(key, slice):
if pa_version_under11p0:
raise NotImplementedError(
f"List slice not supported by pyarrow {pa.__version__}."
)
# TODO: Support negative start/stop/step, ideally this would be added
# upstream in pyarrow.
start, stop, step = key.start, key.stop, key.step
if start is None:
# TODO: When adding negative step support
# this should be set to the last element of the array
# when step is negative.
start = 0
if step is None:
step = 1
sliced = pc.list_slice(self._pa_array, start, stop, step)
return Series(sliced, dtype=ArrowDtype(sliced.type))
else:
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
def __iter__(self) -> Iterator:
raise TypeError(f"'{type(self).__name__}' object is not iterable")
def flatten(self) -> Series:
"""
Flatten list values.
Returns
-------
pandas.Series
The data from all lists in the series flattened.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.flatten()
0 1
1 2
2 3
3 3
dtype: int64[pyarrow]
"""
from pandas import Series
flattened = pc.list_flatten(self._pa_array)
return Series(flattened, dtype=ArrowDtype(flattened.type))
class StructAccessor(ArrowAccessor):
"""
Accessor object for structured data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow struct data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg=(
"Can only use the '.struct' accessor with 'struct[pyarrow]' "
"dtype, not {dtype}."
),
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return pa.types.is_struct(pyarrow_dtype)
@property
def dtypes(self) -> Series:
"""
Return the dtype object of each child field of the struct.
Returns
-------
pandas.Series
The data type of each child field.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.dtypes
version int64[pyarrow]
project string[pyarrow]
dtype: object
"""
from pandas import (
Index,
Series,
)
pa_type = self._data.dtype.pyarrow_dtype
types = [ArrowDtype(struct.type) for struct in pa_type]
names = [struct.name for struct in pa_type]
return Series(types, index=Index(names))
def field(
self,
name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
) -> Series:
"""
Extract a child field of a struct as a Series.
Parameters
----------
name_or_index : str | bytes | int | expression | list
Name or index of the child field to extract.
For list-like inputs, this will index into a nested
struct.
Returns
-------
pandas.Series
The data corresponding to the selected child field.
See Also
--------
Series.struct.explode : Return all child fields as a DataFrame.
Notes
-----
The name of the resulting Series will be set using the following
rules:
- For string, bytes, or integer `name_or_index` (or a list of these, for
a nested selection), the Series name is set to the selected
field's name.
- For a :class:`pyarrow.compute.Expression`, this is set to
the string form of the expression.
- For list-like `name_or_index`, the name will be set to the
name of the final field selected.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
Extract by field name.
>>> s.struct.field("project")
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
Extract by field index.
>>> s.struct.field(0)
0 1
1 2
2 1
Name: version, dtype: int64[pyarrow]
Or an expression
>>> import pyarrow.compute as pc
>>> s.struct.field(pc.field("project"))
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
For nested struct types, you can pass a list of values to index
multiple levels:
>>> version_type = pa.struct([
... ("major", pa.int64()),
... ("minor", pa.int64()),
... ])
>>> s = pd.Series(
... [
... {"version": {"major": 1, "minor": 5}, "project": "pandas"},
... {"version": {"major": 2, "minor": 1}, "project": "pandas"},
... {"version": {"major": 1, "minor": 26}, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", version_type), ("project", pa.string())]
... ))
... )
>>> s.struct.field(["version", "minor"])
0 5
1 1
2 26
Name: minor, dtype: int64[pyarrow]
>>> s.struct.field([0, 0])
0 1
1 2
2 1
Name: major, dtype: int64[pyarrow]
"""
from pandas import Series
def get_name(
level_name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
data: pa.ChunkedArray,
):
if isinstance(level_name_or_index, int):
name = data.type.field(level_name_or_index).name
elif isinstance(level_name_or_index, (str, bytes)):
name = level_name_or_index
elif isinstance(level_name_or_index, pc.Expression):
name = str(level_name_or_index)
elif is_list_like(level_name_or_index):
# For nested input like [2, 1, 2]
# iteratively get the struct and field name. The last
# one is used for the name of the index.
level_name_or_index = list(reversed(level_name_or_index))
selected = data
while level_name_or_index:
# we need the cast, otherwise mypy complains about
# getting ints, bytes, or str here, which isn't possible.
level_name_or_index = cast(list, level_name_or_index)
name_or_index = level_name_or_index.pop()
name = get_name(name_or_index, selected)
selected = selected.type.field(selected.type.get_field_index(name))
name = selected.name
else:
raise ValueError(
"name_or_index must be an int, str, bytes, "
"pyarrow.compute.Expression, or list of those"
)
return name
pa_arr = self._data.array._pa_array
name = get_name(name_or_index, pa_arr)
field_arr = pc.struct_field(pa_arr, name_or_index)
return Series(
field_arr,
dtype=ArrowDtype(field_arr.type),
index=self._data.index,
name=name,
)
def explode(self) -> DataFrame:
"""
Extract all child fields of a struct as a DataFrame.
Returns
-------
pandas.DataFrame
The data corresponding to all child fields.
See Also
--------
Series.struct.field : Return a single child field as a Series.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.explode()
version project
0 1 pandas
1 2 pandas
2 1 numpy
"""
from pandas import concat
pa_type = self._pa_array.type
return concat(
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
)

File diff suppressed because it is too large

View File

@ -0,0 +1,174 @@
from __future__ import annotations
import json
from typing import TYPE_CHECKING
import pyarrow
from pandas.compat import pa_version_under14p1
from pandas.core.dtypes.dtypes import (
IntervalDtype,
PeriodDtype,
)
from pandas.core.arrays.interval import VALID_CLOSED
if TYPE_CHECKING:
from pandas._typing import IntervalClosedType
class ArrowPeriodType(pyarrow.ExtensionType):
def __init__(self, freq) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
self._freq = freq
pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")
@property
def freq(self):
return self._freq
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"freq": self.freq}
return json.dumps(metadata).encode()
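    # The freq round-trips through JSON, e.g. ArrowPeriodType("D") serializes
    # to b'{"freq": "D"}' and __arrow_ext_deserialize__ rebuilds an equal type.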
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
metadata = json.loads(serialized.decode())
return ArrowPeriodType(metadata["freq"])
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return type(self) == type(other) and self.freq == other.freq
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), self.freq))
def to_pandas_dtype(self) -> PeriodDtype:
return PeriodDtype(freq=self.freq)
# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)
class ArrowIntervalType(pyarrow.ExtensionType):
def __init__(self, subtype, closed: IntervalClosedType) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
assert closed in VALID_CLOSED
self._closed: IntervalClosedType = closed
if not isinstance(subtype, pyarrow.DataType):
subtype = pyarrow.type_for_alias(str(subtype))
self._subtype = subtype
storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
@property
def subtype(self):
return self._subtype
@property
def closed(self) -> IntervalClosedType:
return self._closed
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"subtype": str(self.subtype), "closed": self.closed}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
metadata = json.loads(serialized.decode())
subtype = pyarrow.type_for_alias(metadata["subtype"])
closed = metadata["closed"]
return ArrowIntervalType(subtype, closed)
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return (
type(self) == type(other)
and self.subtype == other.subtype
and self.closed == other.closed
)
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), str(self.subtype), self.closed))
def to_pandas_dtype(self) -> IntervalDtype:
return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1 and calling `pa.PyExtensionType.set_auto_load(True)`
- installing pyarrow-hotfix (`pip install pyarrow-hotfix`) and disabling it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
"""
def patch_pyarrow():
# starting from pyarrow 14.0.1, it has its own mechanism
if not pa_version_under14p1:
return
# if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
if getattr(pyarrow, "_hotfix_installed", False):
return
class ForbiddenExtensionType(pyarrow.ExtensionType):
def __arrow_ext_serialize__(self):
return b""
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
import io
import pickletools
out = io.StringIO()
pickletools.dis(serialized, out)
raise RuntimeError(
_ERROR_MSG.format(
storage_type=storage_type,
serialized=serialized,
pickle_disassembly=out.getvalue(),
)
)
pyarrow.unregister_extension_type("arrow.py_extension_type")
pyarrow.register_extension_type(
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
)
pyarrow._hotfix_installed = True
patch_pyarrow()

File diff suppressed because it is too large

View File

@ -0,0 +1,407 @@
from __future__ import annotations
import numbers
from typing import (
TYPE_CHECKING,
ClassVar,
cast,
)
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna
from pandas.core import ops
from pandas.core.array_algos import masked_accumulations
from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
)
if TYPE_CHECKING:
import pyarrow
from pandas._typing import (
Dtype,
DtypeObj,
Self,
npt,
type_t,
)
@register_extension_dtype
class BooleanDtype(BaseMaskedDtype):
"""
Extension dtype for boolean data.
.. warning::
BooleanDtype is considered experimental. The implementation and
parts of the API may change without warning.
Attributes
----------
None
Methods
-------
None
Examples
--------
>>> pd.BooleanDtype()
BooleanDtype
"""
name: ClassVar[str] = "boolean"
# https://github.com/python/mypy/issues/4125
# error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
@property
def type(self) -> type: # type: ignore[override]
return np.bool_
@property
def kind(self) -> str:
return "b"
@property
def numpy_dtype(self) -> np.dtype:
return np.dtype("bool")
@classmethod
def construct_array_type(cls) -> type_t[BooleanArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return BooleanArray
def __repr__(self) -> str:
return "BooleanDtype"
@property
def _is_boolean(self) -> bool:
return True
@property
def _is_numeric(self) -> bool:
return True
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BooleanArray:
"""
Construct BooleanArray from pyarrow Array/ChunkedArray.
"""
import pyarrow
if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type):
raise TypeError(f"Expected array of boolean type, got {array.type} instead")
if isinstance(array, pyarrow.Array):
chunks = [array]
length = len(array)
else:
# pyarrow.ChunkedArray
chunks = array.chunks
length = array.length()
if pyarrow.types.is_null(array.type):
mask = np.ones(length, dtype=bool)
# No need to init data, since all null
data = np.empty(length, dtype=bool)
return BooleanArray(data, mask)
results = []
for arr in chunks:
buflist = arr.buffers()
data = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[1]], offset=arr.offset
).to_numpy(zero_copy_only=False)
if arr.null_count != 0:
mask = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[0]], offset=arr.offset
).to_numpy(zero_copy_only=False)
mask = ~mask
else:
mask = np.zeros(len(arr), dtype=bool)
bool_arr = BooleanArray(data, mask)
results.append(bool_arr)
if not results:
return BooleanArray(
np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
)
else:
return BooleanArray._concat_same_type(results)
def coerce_to_array(
values, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""
Coerce the input values array to numpy arrays with a mask.
Parameters
----------
values : 1D list-like
mask : bool 1D array, optional
copy : bool, default False
if True, copy the input
Returns
-------
tuple of (values, mask)
"""
if isinstance(values, BooleanArray):
if mask is not None:
raise ValueError("cannot pass mask for BooleanArray input")
values, mask = values._data, values._mask
if copy:
values = values.copy()
mask = mask.copy()
return values, mask
mask_values = None
if isinstance(values, np.ndarray) and values.dtype == np.bool_:
if copy:
values = values.copy()
elif isinstance(values, np.ndarray) and values.dtype.kind in "iufcb":
mask_values = isna(values)
values_bool = np.zeros(len(values), dtype=bool)
values_bool[~mask_values] = values[~mask_values].astype(bool)
if not np.all(
values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
):
raise TypeError("Need to pass bool-like values")
values = values_bool
else:
values_object = np.asarray(values, dtype=object)
inferred_dtype = lib.infer_dtype(values_object, skipna=True)
integer_like = ("floating", "integer", "mixed-integer-float")
if inferred_dtype not in ("boolean", "empty") + integer_like:
raise TypeError("Need to pass bool-like values")
# mypy does not narrow the type of mask_values to npt.NDArray[np.bool_]
# within this branch, it assumes it can also be None
mask_values = cast("npt.NDArray[np.bool_]", isna(values_object))
values = np.zeros(len(values), dtype=bool)
values[~mask_values] = values_object[~mask_values].astype(bool)
# if the values were integer-like, validate it were actually 0/1's
if (inferred_dtype in integer_like) and not (
np.all(
values[~mask_values].astype(float)
== values_object[~mask_values].astype(float)
)
):
raise TypeError("Need to pass bool-like values")
if mask is None and mask_values is None:
mask = np.zeros(values.shape, dtype=bool)
elif mask is None:
mask = mask_values
else:
if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
if mask_values is not None:
mask = mask | mask_values
else:
if copy:
mask = mask.copy()
else:
mask = np.array(mask, dtype=bool)
if mask_values is not None:
mask = mask | mask_values
if values.shape != mask.shape:
raise ValueError("values.shape and mask.shape must match")
return values, mask
class BooleanArray(BaseMaskedArray):
"""
Array of boolean (True/False) data with missing values.
This is a pandas Extension array for boolean data, under the hood
represented by 2 numpy arrays: a boolean array with the data and
a boolean array with the mask (True indicating missing).
BooleanArray implements Kleene logic (sometimes called three-value
logic) for logical operations. See :ref:`boolean.kleene` for more.
To construct a BooleanArray from generic array-like input, use
:func:`pandas.array` specifying ``dtype="boolean"`` (see examples
below).
.. warning::
BooleanArray is considered experimental. The implementation and
parts of the API may change without warning.
Parameters
----------
values : numpy.ndarray
A 1-d boolean-dtype array with the data.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values (True
indicates missing).
copy : bool, default False
Whether to copy the `values` and `mask` arrays.
Attributes
----------
None
Methods
-------
None
Returns
-------
BooleanArray
Examples
--------
Create a BooleanArray with :func:`pandas.array`:
>>> pd.array([True, False, None], dtype="boolean")
<BooleanArray>
[True, False, <NA>]
Length: 3, dtype: boolean
"""
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False
# Fill values used for any/all
# Incompatible types in assignment (expression has type "bool", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = True # type: ignore[assignment]
_falsey_value = False # type: ignore[assignment]
_TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
_FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}
@classmethod
def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self:
result = super()._simple_new(values, mask)
result._dtype = BooleanDtype()
return result
def __init__(
self, values: np.ndarray, mask: np.ndarray, copy: bool = False
) -> None:
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
raise TypeError(
"values should be boolean numpy array. Use "
"the 'pd.array' function instead"
)
self._dtype = BooleanDtype()
super().__init__(values, mask, copy=copy)
@property
def dtype(self) -> BooleanDtype:
return self._dtype
@classmethod
def _from_sequence_of_strings(
cls,
strings: list[str],
*,
dtype: Dtype | None = None,
copy: bool = False,
true_values: list[str] | None = None,
false_values: list[str] | None = None,
) -> BooleanArray:
true_values_union = cls._TRUE_VALUES.union(true_values or [])
false_values_union = cls._FALSE_VALUES.union(false_values or [])
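        # Strings are matched against the union of the default and user-supplied
        # token sets, e.g. "1" and "true" map to True, "0.0" maps to False, and
        # anything unrecognised raises ValueError.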
def map_string(s) -> bool:
if s in true_values_union:
return True
elif s in false_values_union:
return False
else:
raise ValueError(f"{s} cannot be cast to bool")
scalars = np.array(strings, dtype=object)
mask = isna(scalars)
scalars[~mask] = list(map(map_string, scalars[~mask]))
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
@classmethod
def _coerce_to_array(
cls, value, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
if dtype:
assert dtype == "boolean"
return coerce_to_array(value, copy=copy)
def _logical_method(self, other, op):
assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
other_is_scalar = lib.is_scalar(other)
mask = None
if isinstance(other, BooleanArray):
other, mask = other._data, other._mask
elif is_list_like(other):
other = np.asarray(other, dtype="bool")
if other.ndim > 1:
raise NotImplementedError("can only perform ops with 1-d structures")
other, mask = coerce_to_array(other, copy=False)
elif isinstance(other, np.bool_):
other = other.item()
if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
raise TypeError(
"'other' should be pandas.NA or a bool. "
f"Got {type(other).__name__} instead."
)
if not other_is_scalar and len(self) != len(other):
raise ValueError("Lengths must match")
if op.__name__ in {"or_", "ror_"}:
result, mask = ops.kleene_or(self._data, other, self._mask, mask)
elif op.__name__ in {"and_", "rand_"}:
result, mask = ops.kleene_and(self._data, other, self._mask, mask)
else:
# i.e. xor, rxor
result, mask = ops.kleene_xor(self._data, other, self._mask, mask)
# i.e. BooleanArray
return self._maybe_mask_result(result, mask)
def _accumulate(
self, name: str, *, skipna: bool = True, **kwargs
) -> BaseMaskedArray:
data = self._data
mask = self._mask
if name in ("cummin", "cummax"):
op = getattr(masked_accumulations, name)
data, mask = op(data, mask, skipna=skipna, **kwargs)
return self._simple_new(data, mask)
else:
from pandas.core.arrays import IntegerArray
return IntegerArray(data.astype(int), mask)._accumulate(
name, skipna=skipna, **kwargs
)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,173 @@
from __future__ import annotations
from typing import ClassVar
import numpy as np
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_float_dtype
from pandas.core.arrays.numeric import (
NumericArray,
NumericDtype,
)
class FloatingDtype(NumericDtype):
"""
An ExtensionDtype to hold a single size of floating dtype.
These specific implementations are subclasses of the non-public
FloatingDtype. For example, we have Float32Dtype to represent float32.
The attributes name & type are set when these subclasses are created.
"""
_default_np_dtype = np.dtype(np.float64)
_checker = is_float_dtype
@classmethod
def construct_array_type(cls) -> type[FloatingArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return FloatingArray
@classmethod
def _get_dtype_mapping(cls) -> dict[np.dtype, FloatingDtype]:
return NUMPY_FLOAT_TO_DTYPE
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.
"safe" in this context means the casting is lossless.
"""
# This is really only here for compatibility with IntegerDtype
return values.astype(dtype, copy=copy)
class FloatingArray(NumericArray):
"""
Array of floating (optional missing) values.
.. warning::
FloatingArray is currently experimental, and its API or internal
implementation may change without warning. Especially the behaviour
regarding NaN (distinct from NA missing values) is subject to change.
We represent a FloatingArray with 2 numpy arrays:
- data: contains a numpy float array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
To construct a FloatingArray from generic array-like input, use
:func:`pandas.array` with one of the float dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d float-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Attributes
----------
None
Methods
-------
None
Returns
-------
FloatingArray
Examples
--------
Create a FloatingArray with :func:`pandas.array`:
>>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
<FloatingArray>
[0.1, <NA>, 0.3]
Length: 3, dtype: Float32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([0.1, None, 0.3], dtype="Float32")
<FloatingArray>
[0.1, <NA>, 0.3]
Length: 3, dtype: Float32
"""
_dtype_cls = FloatingDtype
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = np.nan
# Fill values used for any/all
# Incompatible types in assignment (expression has type "float", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = 1.0 # type: ignore[assignment]
_falsey_value = 0.0 # type: ignore[assignment]
_dtype_docstring = """
An ExtensionDtype for {dtype} data.
This dtype uses ``pd.NA`` as missing value indicator.
Attributes
----------
None
Methods
-------
None
Examples
--------
For Float32Dtype:
>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float32Dtype())
>>> ser.dtype
Float32Dtype()
For Float64Dtype:
>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float64Dtype())
>>> ser.dtype
Float64Dtype()
"""
# create the Dtype
@register_extension_dtype
class Float32Dtype(FloatingDtype):
type = np.float32
name: ClassVar[str] = "Float32"
__doc__ = _dtype_docstring.format(dtype="float32")
@register_extension_dtype
class Float64Dtype(FloatingDtype):
type = np.float64
name: ClassVar[str] = "Float64"
__doc__ = _dtype_docstring.format(dtype="float64")
NUMPY_FLOAT_TO_DTYPE: dict[np.dtype, FloatingDtype] = {
np.dtype(np.float32): Float32Dtype(),
np.dtype(np.float64): Float64Dtype(),
}

View File

@ -0,0 +1,272 @@
from __future__ import annotations
from typing import ClassVar
import numpy as np
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_integer_dtype
from pandas.core.arrays.numeric import (
NumericArray,
NumericDtype,
)
class IntegerDtype(NumericDtype):
"""
An ExtensionDtype to hold a single size & kind of integer dtype.
These specific implementations are subclasses of the non-public
IntegerDtype. For example, we have Int8Dtype to represent signed int 8s.
The attributes name & type are set when these subclasses are created.
"""
_default_np_dtype = np.dtype(np.int64)
_checker = is_integer_dtype
@classmethod
def construct_array_type(cls) -> type[IntegerArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return IntegerArray
@classmethod
def _get_dtype_mapping(cls) -> dict[np.dtype, IntegerDtype]:
return NUMPY_INT_TO_DTYPE
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.
"safe" in this context means the casting is lossless. e.g. if 'values'
has a floating dtype, each value must be an integer.
"""
try:
return values.astype(dtype, casting="safe", copy=copy)
except TypeError as err:
casted = values.astype(dtype, copy=copy)
if (casted == values).all():
return casted
raise TypeError(
f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
) from err
class IntegerArray(NumericArray):
"""
Array of integer (optional missing) values.
Uses :attr:`pandas.NA` as the missing value.
.. warning::
IntegerArray is currently experimental, and its API or internal
implementation may change without warning.
We represent an IntegerArray with 2 numpy arrays:
- data: contains a numpy integer array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
To construct an IntegerArray from generic array-like input, use
:func:`pandas.array` with one of the integer dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d integer-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Attributes
----------
None
Methods
-------
None
Returns
-------
IntegerArray
Examples
--------
Create an IntegerArray with :func:`pandas.array`.
>>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
>>> int_array
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: Int32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([1, None, 3], dtype='Int32')
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: Int32
>>> pd.array([1, None, 3], dtype='UInt16')
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: UInt16
"""
_dtype_cls = IntegerDtype
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 1
# Fill values used for any/all
# Incompatible types in assignment (expression has type "int", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = 1 # type: ignore[assignment]
_falsey_value = 0 # type: ignore[assignment]
_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.
Uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`.
Attributes
----------
None
Methods
-------
None
Examples
--------
For Int8Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int8Dtype())
>>> ser.dtype
Int8Dtype()
For Int16Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int16Dtype())
>>> ser.dtype
Int16Dtype()
For Int32Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int32Dtype())
>>> ser.dtype
Int32Dtype()
For Int64Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int64Dtype())
>>> ser.dtype
Int64Dtype()
For UInt8Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt8Dtype())
>>> ser.dtype
UInt8Dtype()
For UInt16Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt16Dtype())
>>> ser.dtype
UInt16Dtype()
For UInt32Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt32Dtype())
>>> ser.dtype
UInt32Dtype()
For UInt64Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt64Dtype())
>>> ser.dtype
UInt64Dtype()
"""
# create the Dtype
@register_extension_dtype
class Int8Dtype(IntegerDtype):
type = np.int8
name: ClassVar[str] = "Int8"
__doc__ = _dtype_docstring.format(dtype="int8")
@register_extension_dtype
class Int16Dtype(IntegerDtype):
type = np.int16
name: ClassVar[str] = "Int16"
__doc__ = _dtype_docstring.format(dtype="int16")
@register_extension_dtype
class Int32Dtype(IntegerDtype):
type = np.int32
name: ClassVar[str] = "Int32"
__doc__ = _dtype_docstring.format(dtype="int32")
@register_extension_dtype
class Int64Dtype(IntegerDtype):
type = np.int64
name: ClassVar[str] = "Int64"
__doc__ = _dtype_docstring.format(dtype="int64")
@register_extension_dtype
class UInt8Dtype(IntegerDtype):
type = np.uint8
name: ClassVar[str] = "UInt8"
__doc__ = _dtype_docstring.format(dtype="uint8")
@register_extension_dtype
class UInt16Dtype(IntegerDtype):
type = np.uint16
name: ClassVar[str] = "UInt16"
__doc__ = _dtype_docstring.format(dtype="uint16")
@register_extension_dtype
class UInt32Dtype(IntegerDtype):
type = np.uint32
name: ClassVar[str] = "UInt32"
__doc__ = _dtype_docstring.format(dtype="uint32")
@register_extension_dtype
class UInt64Dtype(IntegerDtype):
type = np.uint64
name: ClassVar[str] = "UInt64"
__doc__ = _dtype_docstring.format(dtype="uint64")
NUMPY_INT_TO_DTYPE: dict[np.dtype, IntegerDtype] = {
np.dtype(np.int8): Int8Dtype(),
np.dtype(np.int16): Int16Dtype(),
np.dtype(np.int32): Int32Dtype(),
np.dtype(np.int64): Int64Dtype(),
np.dtype(np.uint8): UInt8Dtype(),
np.dtype(np.uint16): UInt16Dtype(),
np.dtype(np.uint32): UInt32Dtype(),
np.dtype(np.uint64): UInt64Dtype(),
}
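As a hedged illustration of the lossless-cast rule enforced by IntegerDtype._safe_cast above (not part of the diff): whole-number floats are accepted when building a nullable integer array, while fractional values raise.

import pandas as pd

ok = pd.array([1.0, 2.0, None], dtype="Int64")  # becomes [1, 2, <NA>]; the cast is lossless
# pd.array([1.5, 2.0], dtype="Int64")           # raises TypeError: cannot safely cast non-equivalent ...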

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,286 @@
from __future__ import annotations
import numbers
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import (
is_integer_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
)
if TYPE_CHECKING:
from collections.abc import Mapping
import pyarrow
from pandas._typing import (
Dtype,
DtypeObj,
Self,
npt,
)
class NumericDtype(BaseMaskedDtype):
_default_np_dtype: np.dtype
_checker: Callable[[Any], bool] # is_foo_dtype
def __repr__(self) -> str:
return f"{self.name}Dtype()"
@cache_readonly
def is_signed_integer(self) -> bool:
return self.kind == "i"
@cache_readonly
def is_unsigned_integer(self) -> bool:
return self.kind == "u"
@property
def _is_numeric(self) -> bool:
return True
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BaseMaskedArray:
"""
Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
"""
import pyarrow
from pandas.core.arrays.arrow._arrow_utils import (
pyarrow_array_to_numpy_and_mask,
)
array_class = self.construct_array_type()
pyarrow_type = pyarrow.from_numpy_dtype(self.type)
if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null(
array.type
):
# test_from_arrow_type_error raises for string, but allows
# through itemsize conversion GH#31896
rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
if rt_dtype.kind not in "iuf":
# Could allow "c" or potentially disallow float<->int conversion,
# but at the moment we specifically test that uint<->int works
raise TypeError(
f"Expected array of {self} type, got {array.type} instead"
)
array = array.cast(pyarrow_type)
if isinstance(array, pyarrow.ChunkedArray):
# TODO this "if" can be removed when requiring pyarrow >= 10.0, which fixed
# combine_chunks for empty arrays https://github.com/apache/arrow/pull/13757
if array.num_chunks == 0:
array = pyarrow.array([], type=array.type)
else:
array = array.combine_chunks()
data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
return array_class(data.copy(), ~mask, copy=False)
@classmethod
def _get_dtype_mapping(cls) -> Mapping[np.dtype, NumericDtype]:
raise AbstractMethodError(cls)
@classmethod
def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
"""
Convert a string representation or a numpy dtype to NumericDtype.
"""
if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
# Avoid DeprecationWarning from NumPy about np.dtype("Int64")
# https://github.com/numpy/numpy/pull/7476
dtype = dtype.lower()
if not isinstance(dtype, NumericDtype):
mapping = cls._get_dtype_mapping()
try:
dtype = mapping[np.dtype(dtype)]
except KeyError as err:
raise ValueError(f"invalid dtype specified {dtype}") from err
return dtype
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.
"safe" in this context means the casting is lossless.
"""
raise AbstractMethodError(cls)
def _coerce_to_data_and_mask(
values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype
):
checker = dtype_cls._checker
mask = None
inferred_type = None
if dtype is None and hasattr(values, "dtype"):
if checker(values.dtype):
dtype = values.dtype
if dtype is not None:
dtype = dtype_cls._standardize_dtype(dtype)
cls = dtype_cls.construct_array_type()
if isinstance(values, cls):
values, mask = values._data, values._mask
if dtype is not None:
values = values.astype(dtype.numpy_dtype, copy=False)
if copy:
values = values.copy()
mask = mask.copy()
return values, mask, dtype, inferred_type
original = values
if not copy:
values = np.asarray(values)
else:
values = np.array(values, copy=copy)
inferred_type = None
if values.dtype == object or is_string_dtype(values.dtype):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "boolean" and dtype is None:
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")
elif values.dtype.kind == "b" and checker(dtype):
if not copy:
values = np.asarray(values, dtype=default_dtype)
else:
values = np.array(values, dtype=default_dtype, copy=copy)
elif values.dtype.kind not in "iuf":
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")
if values.ndim != 1:
raise TypeError("values must be a 1D list-like")
if mask is None:
if values.dtype.kind in "iu":
# fastpath
mask = np.zeros(len(values), dtype=np.bool_)
else:
mask = libmissing.is_numeric_na(values)
else:
assert len(mask) == len(values)
if mask.ndim != 1:
raise TypeError("mask must be a 1D list-like")
# infer dtype if needed
if dtype is None:
dtype = default_dtype
else:
dtype = dtype.numpy_dtype
if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
if mask.all():
values = np.ones(values.shape, dtype=dtype)
else:
idx = np.nanargmax(values)
if int(values[idx]) != original[idx]:
# We have ints that lost precision during the cast.
inferred_type = lib.infer_dtype(original, skipna=True)
if (
inferred_type not in ["floating", "mixed-integer-float"]
and not mask.any()
):
values = np.asarray(original, dtype=dtype)
else:
values = np.asarray(original, dtype="object")
# we copy as need to coerce here
if mask.any():
values = values.copy()
values[mask] = cls._internal_fill_value
if inferred_type in ("string", "unicode"):
# casts from str are always safe since they raise
# a ValueError if the str cannot be parsed into a float
values = values.astype(dtype, copy=copy)
else:
values = dtype_cls._safe_cast(values, dtype, copy=False)
return values, mask, dtype, inferred_type
class NumericArray(BaseMaskedArray):
"""
Base class for IntegerArray and FloatingArray.
"""
_dtype_cls: type[NumericDtype]
def __init__(
self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
) -> None:
checker = self._dtype_cls._checker
if not (isinstance(values, np.ndarray) and checker(values.dtype)):
descr = (
"floating"
if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap]
else "integer"
)
raise TypeError(
f"values should be {descr} numpy array. Use "
"the 'pd.array' function instead"
)
if values.dtype == np.float16:
# If we don't raise here, then accessing self.dtype would raise
raise TypeError("FloatingArray does not support np.float16 dtype.")
super().__init__(values, mask, copy=copy)
@cache_readonly
def dtype(self) -> NumericDtype:
mapping = self._dtype_cls._get_dtype_mapping()
return mapping[self._data.dtype]
@classmethod
def _coerce_to_array(
cls, value, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
dtype_cls = cls._dtype_cls
default_dtype = dtype_cls._default_np_dtype
values, mask, _, _ = _coerce_to_data_and_mask(
value, dtype, copy, dtype_cls, default_dtype
)
return values, mask
@classmethod
def _from_sequence_of_strings(
cls, strings, *, dtype: Dtype | None = None, copy: bool = False
) -> Self:
from pandas.core.tools.numeric import to_numeric
scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
_HANDLED_TYPES = (np.ndarray, numbers.Number)
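A small sketch (not part of the diff) of what NumericArray.__init__ above expects when constructed directly: a NumPy array of the matching kind plus a boolean mask; anything else should go through pd.array instead.

import numpy as np
import pandas as pd

data = np.array([1, 2, 3], dtype="int64")
mask = np.array([False, True, False])      # True marks a missing value
arr = pd.arrays.IntegerArray(data, mask)   # <IntegerArray> [1, <NA>, 3]
assert arr.dtype == pd.Int64Dtype()        # dtype() above resolves it via _get_dtype_mapping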

View File

@ -0,0 +1,574 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
Literal,
)
import numpy as np
from pandas._libs import lib
from pandas._libs.tslibs import is_supported_dtype
from pandas.compat.numpy import function as nv
from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.missing import isna
from pandas.core import (
arraylike,
missing,
nanops,
ops,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.strings.object_array import ObjectStringArrayMixin
if TYPE_CHECKING:
from collections.abc import Callable
from pandas._typing import (
AxisInt,
Dtype,
FillnaOptions,
InterpolateOptions,
NpDtype,
Scalar,
Self,
npt,
)
from pandas import Index
# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
class NumpyExtensionArray( # type: ignore[misc]
OpsMixin,
NDArrayBackedExtensionArray,
ObjectStringArrayMixin,
):
"""
A pandas ExtensionArray for NumPy data.
This is mostly for internal compatibility, and is not especially
useful on its own.
Parameters
----------
values : ndarray
The NumPy ndarray to wrap. Must be 1-dimensional.
copy : bool, default False
Whether to copy `values`.
Attributes
----------
None
Methods
-------
None
Examples
--------
>>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3]))
<NumpyExtensionArray>
[0, 1, 2, 3]
Length: 4, dtype: int64
"""
# If you're wondering why pd.Series(cls) doesn't put the array in an
# ExtensionBlock, search for `ABCNumpyExtensionArray`. We check for
# that _typ to ensure that users don't unnecessarily use EAs inside
# pandas internals, which turns off things like block consolidation.
_typ = "npy_extension"
__array_priority__ = 1000
_ndarray: np.ndarray
_dtype: NumpyEADtype
_internal_fill_value = np.nan
# ------------------------------------------------------------------------
# Constructors
def __init__(
self, values: np.ndarray | NumpyExtensionArray, copy: bool = False
) -> None:
if isinstance(values, type(self)):
values = values._ndarray
if not isinstance(values, np.ndarray):
raise ValueError(
f"'values' must be a NumPy array, not {type(values).__name__}"
)
if values.ndim == 0:
# Technically we support 2, but do not advertise that fact.
raise ValueError("NumpyExtensionArray must be 1-dimensional.")
if copy:
values = values.copy()
dtype = NumpyEADtype(values.dtype)
super().__init__(values, dtype)
@classmethod
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> NumpyExtensionArray:
if isinstance(dtype, NumpyEADtype):
dtype = dtype._dtype
# error: Argument "dtype" to "asarray" has incompatible type
# "Union[ExtensionDtype, str, dtype[Any], dtype[floating[_64Bit]], Type[object],
# None]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
# Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
# _DTypeDict, Tuple[Any, Any]]]"
result = np.asarray(scalars, dtype=dtype) # type: ignore[arg-type]
if (
result.ndim > 1
and not hasattr(scalars, "dtype")
and (dtype is None or dtype == object)
):
# e.g. list-of-tuples
result = construct_1d_object_array_from_listlike(scalars)
if copy and result is scalars:
result = result.copy()
return cls(result)
# ------------------------------------------------------------------------
# Data
@property
def dtype(self) -> NumpyEADtype:
return self._dtype
# ------------------------------------------------------------------------
# NumPy Array Interface
def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
if copy is not None:
# Note: branch avoids `copy=None` for NumPy 1.x support
return np.array(self._ndarray, dtype=dtype, copy=copy)
return np.asarray(self._ndarray, dtype=dtype)
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# Lightly modified version of
# https://numpy.org/doc/stable/reference/generated/numpy.lib.mixins.NDArrayOperatorsMixin.html
# The primary modification is not boxing scalar return values
# in NumpyExtensionArray, since pandas' ExtensionArrays are 1-d.
out = kwargs.get("out", ())
result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
return result
if "out" in kwargs:
# e.g. test_ufunc_unary
return arraylike.dispatch_ufunc_with_out(
self, ufunc, method, *inputs, **kwargs
)
if method == "reduce":
result = arraylike.dispatch_reduction_ufunc(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
# e.g. tests.series.test_ufunc.TestNumpyReductions
return result
# Defer to the implementation of the ufunc on unwrapped values.
inputs = tuple(
x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in inputs
)
if out:
kwargs["out"] = tuple(
x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in out
)
result = getattr(ufunc, method)(*inputs, **kwargs)
if ufunc.nout > 1:
# multiple return values; re-box array-like results
return tuple(type(self)(x) for x in result)
elif method == "at":
# no return value
return None
elif method == "reduce":
if isinstance(result, np.ndarray):
# e.g. test_np_reduce_2d
return type(self)(result)
# e.g. test_np_max_nested_tuples
return result
else:
# one return value; re-box array-like results
return type(self)(result)
# ------------------------------------------------------------------------
# Pandas ExtensionArray Interface
def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)
if dtype == self.dtype:
if copy:
return self.copy()
return self
result = astype_array(self._ndarray, dtype=dtype, copy=copy)
return result
def isna(self) -> np.ndarray:
return isna(self._ndarray)
def _validate_scalar(self, fill_value):
if fill_value is None:
# Primarily for subclasses
fill_value = self.dtype.na_value
return fill_value
def _values_for_factorize(self) -> tuple[np.ndarray, float | None]:
if self.dtype.kind in "iub":
fv = None
else:
fv = np.nan
return self._ndarray, fv
# Base EA class (and all other EA classes) don't have limit_area keyword
# This can be removed here as well when the interpolate ffill/bfill method
# deprecation is enforced
def _pad_or_backfill(
self,
*,
method: FillnaOptions,
limit: int | None = None,
limit_area: Literal["inside", "outside"] | None = None,
copy: bool = True,
) -> Self:
"""
ffill or bfill along axis=0.
"""
if copy:
out_data = self._ndarray.copy()
else:
out_data = self._ndarray
meth = missing.clean_fill_method(method)
missing.pad_or_backfill_inplace(
out_data.T,
method=meth,
axis=0,
limit=limit,
limit_area=limit_area,
)
if not copy:
return self
return type(self)._simple_new(out_data, dtype=self.dtype)
def interpolate(
self,
*,
method: InterpolateOptions,
axis: int,
index: Index,
limit,
limit_direction,
limit_area,
copy: bool,
**kwargs,
) -> Self:
"""
See NDFrame.interpolate.__doc__.
"""
# NB: we return type(self) even if copy=False
if not self.dtype._is_numeric:
raise TypeError(f"Cannot interpolate with {self.dtype} dtype")
if not copy:
out_data = self._ndarray
else:
out_data = self._ndarray.copy()
# TODO: assert we have floating dtype?
missing.interpolate_2d_inplace(
out_data,
method=method,
axis=axis,
index=index,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
**kwargs,
)
if not copy:
return self
return type(self)._simple_new(out_data, dtype=self.dtype)
# ------------------------------------------------------------------------
# Reductions
def any(
self,
*,
axis: AxisInt | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_any((), {"out": out, "keepdims": keepdims})
result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def all(
self,
*,
axis: AxisInt | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_all((), {"out": out, "keepdims": keepdims})
result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def min(
self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
) -> Scalar:
nv.validate_min((), kwargs)
result = nanops.nanmin(
values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
)
return self._wrap_reduction_result(axis, result)
def max(
self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
) -> Scalar:
nv.validate_max((), kwargs)
result = nanops.nanmax(
values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
)
return self._wrap_reduction_result(axis, result)
def sum(
self,
*,
axis: AxisInt | None = None,
skipna: bool = True,
min_count: int = 0,
**kwargs,
) -> Scalar:
nv.validate_sum((), kwargs)
result = nanops.nansum(
self._ndarray, axis=axis, skipna=skipna, min_count=min_count
)
return self._wrap_reduction_result(axis, result)
def prod(
self,
*,
axis: AxisInt | None = None,
skipna: bool = True,
min_count: int = 0,
**kwargs,
) -> Scalar:
nv.validate_prod((), kwargs)
result = nanops.nanprod(
self._ndarray, axis=axis, skipna=skipna, min_count=min_count
)
return self._wrap_reduction_result(axis, result)
def mean(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims})
result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def median(
self,
*,
axis: AxisInt | None = None,
out=None,
overwrite_input: bool = False,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_median(
(), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}
)
result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def std(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
ddof: int = 1,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
)
result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def var(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
ddof: int = 1,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var"
)
result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def sem(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
ddof: int = 1,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem"
)
result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def kurt(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt"
)
result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def skew(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew"
)
result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
# ------------------------------------------------------------------------
# Additional Methods
def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
copy: bool = False,
na_value: object = lib.no_default,
) -> np.ndarray:
mask = self.isna()
if na_value is not lib.no_default and mask.any():
result = self._ndarray.copy()
result[mask] = na_value
else:
result = self._ndarray
result = np.asarray(result, dtype=dtype)
if copy and result is self._ndarray:
result = result.copy()
return result
# ------------------------------------------------------------------------
# Ops
def __invert__(self) -> NumpyExtensionArray:
return type(self)(~self._ndarray)
def __neg__(self) -> NumpyExtensionArray:
return type(self)(-self._ndarray)
def __pos__(self) -> NumpyExtensionArray:
return type(self)(+self._ndarray)
def __abs__(self) -> NumpyExtensionArray:
return type(self)(abs(self._ndarray))
def _cmp_method(self, other, op):
if isinstance(other, NumpyExtensionArray):
other = other._ndarray
other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
pd_op = ops.get_array_op(op)
other = ensure_wrapped_if_datetimelike(other)
result = pd_op(self._ndarray, other)
if op is divmod or op is ops.rdivmod:
a, b = result
if isinstance(a, np.ndarray):
# for e.g. op vs TimedeltaArray, we may already
# have an ExtensionArray, in which case we do not wrap
return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b)
return a, b
if isinstance(result, np.ndarray):
# for e.g. multiplication vs TimedeltaArray, we may already
# have an ExtensionArray, in which case we do not wrap
return self._wrap_ndarray_result(result)
return result
_arith_method = _cmp_method
def _wrap_ndarray_result(self, result: np.ndarray):
# If we have timedelta64[ns] result, return a TimedeltaArray instead
# of a NumpyExtensionArray
if result.dtype.kind == "m" and is_supported_dtype(result.dtype):
from pandas.core.arrays import TimedeltaArray
return TimedeltaArray._simple_new(result, dtype=result.dtype)
return type(self)(result)
def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
# NEP 51: https://github.com/numpy/numpy/pull/22449
if self.dtype.kind in "SU":
return "'{}'".format
elif self.dtype == "object":
return repr
else:
return str
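An illustrative sketch (not part of the diff) of the wrapping behaviour described above: NumpyExtensionArray re-boxes ufunc results via __array_ufunc__ and routes reductions through nanops.

import numpy as np
import pandas as pd

arr = pd.arrays.NumpyExtensionArray(np.array([1.0, 2.0, np.nan]))
out = np.exp(arr)                  # re-boxed by __array_ufunc__
assert isinstance(out, pd.arrays.NumpyExtensionArray)
assert arr.sum() == 3.0            # skipna=True by default, so the NaN is ignored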

File diff suppressed because it is too large

View File

@ -0,0 +1,19 @@
from pandas.core.arrays.sparse.accessor import (
SparseAccessor,
SparseFrameAccessor,
)
from pandas.core.arrays.sparse.array import (
BlockIndex,
IntIndex,
SparseArray,
make_sparse_index,
)
__all__ = [
"BlockIndex",
"IntIndex",
"make_sparse_index",
"SparseAccessor",
"SparseArray",
"SparseFrameAccessor",
]
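For orientation (not part of the diff), the re-exported SparseArray stores only the points that differ from the fill value, which is what the accessor module that follows reports through npoints and density.

import pandas as pd

sp = pd.arrays.SparseArray([0, 0, 1, 2])   # integer data: fill_value is inferred as 0
assert sp.npoints == 2                     # only the non-fill values are stored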

View File

@ -0,0 +1,414 @@
"""Sparse accessor"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.dtypes import SparseDtype
from pandas.core.accessor import (
PandasDelegate,
delegate_names,
)
from pandas.core.arrays.sparse.array import SparseArray
if TYPE_CHECKING:
from pandas import (
DataFrame,
Series,
)
class BaseAccessor:
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
def __init__(self, data=None) -> None:
self._parent = data
self._validate(data)
def _validate(self, data):
raise NotImplementedError
@delegate_names(
SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
"""
Accessor for sparse data in a Series, including conversions to and from scipy sparse matrices.
Examples
--------
>>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
>>> ser.sparse.density
0.6
>>> ser.sparse.sp_values
array([2, 2, 2])
"""
def _validate(self, data):
if not isinstance(data.dtype, SparseDtype):
raise AttributeError(self._validation_msg)
def _delegate_property_get(self, name: str, *args, **kwargs):
return getattr(self._parent.array, name)
def _delegate_method(self, name: str, *args, **kwargs):
if name == "from_coo":
return self.from_coo(*args, **kwargs)
elif name == "to_coo":
return self.to_coo(*args, **kwargs)
else:
raise ValueError
@classmethod
def from_coo(cls, A, dense_index: bool = False) -> Series:
"""
Create a Series with sparse values from a scipy.sparse.coo_matrix.
Parameters
----------
A : scipy.sparse.coo_matrix
dense_index : bool, default False
If False (default), the index consists of only the
coords of the non-null entries of the original coo_matrix.
If True, the index consists of the full sorted
(row, col) coordinates of the coo_matrix.
Returns
-------
s : Series
A Series with sparse values.
Examples
--------
>>> from scipy import sparse
>>> A = sparse.coo_matrix(
... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
... )
>>> A
<COOrdinate sparse matrix of dtype 'float64'
with 3 stored elements and shape (3, 4)>
>>> A.todense()
matrix([[0., 0., 1., 2.],
[3., 0., 0., 0.],
[0., 0., 0., 0.]])
>>> ss = pd.Series.sparse.from_coo(A)
>>> ss
0 2 1.0
3 2.0
1 0 3.0
dtype: Sparse[float64, nan]
"""
from pandas import Series
from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series
result = coo_to_sparse_series(A, dense_index=dense_index)
result = Series(result.array, index=result.index, copy=False)
return result
def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
"""
Create a scipy.sparse.coo_matrix from a Series with MultiIndex.
Use row_levels and column_levels to determine the row and column
coordinates respectively. row_levels and column_levels are the names
(labels) or numbers of the levels. {row_levels, column_levels} must be
a partition of the MultiIndex level names (or numbers).
Parameters
----------
row_levels : tuple/list
column_levels : tuple/list
sort_labels : bool, default False
Sort the row and column labels before forming the sparse matrix.
When `row_levels` and/or `column_levels` refer to a single level,
set to `True` for faster execution.
Returns
-------
y : scipy.sparse.coo_matrix
rows : list (row labels)
columns : list (column labels)
Examples
--------
>>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
>>> s.index = pd.MultiIndex.from_tuples(
... [
... (1, 2, "a", 0),
... (1, 2, "a", 1),
... (1, 1, "b", 0),
... (1, 1, "b", 1),
... (2, 1, "b", 0),
... (2, 1, "b", 1)
... ],
... names=["A", "B", "C", "D"],
... )
>>> s
A B C D
1 2 a 0 3.0
1 NaN
1 b 0 1.0
1 3.0
2 1 b 0 NaN
1 NaN
dtype: float64
>>> ss = s.astype("Sparse")
>>> ss
A B C D
1 2 a 0 3.0
1 NaN
1 b 0 1.0
1 3.0
2 1 b 0 NaN
1 NaN
dtype: Sparse[float64, nan]
>>> A, rows, columns = ss.sparse.to_coo(
... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
... )
>>> A
<COOrdinate sparse matrix of dtype 'float64'
with 3 stored elements and shape (3, 4)>
>>> A.todense()
matrix([[0., 0., 1., 3.],
[3., 0., 0., 0.],
[0., 0., 0., 0.]])
>>> rows
[(1, 1), (1, 2), (2, 1)]
>>> columns
[('a', 0), ('a', 1), ('b', 0), ('b', 1)]
"""
from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo
A, rows, columns = sparse_series_to_coo(
self._parent, row_levels, column_levels, sort_labels=sort_labels
)
return A, rows, columns
def to_dense(self) -> Series:
"""
Convert a Series from sparse values to dense.
Returns
-------
Series:
A Series with the same values, stored as a dense array.
Examples
--------
>>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
>>> series
0 0
1 1
2 0
dtype: Sparse[int64, 0]
>>> series.sparse.to_dense()
0 0
1 1
2 0
dtype: int64
"""
from pandas import Series
return Series(
self._parent.array.to_dense(),
index=self._parent.index,
name=self._parent.name,
copy=False,
)
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
"""
DataFrame accessor for sparse data.
Examples
--------
>>> df = pd.DataFrame({"a": [1, 2, 0, 0],
... "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
>>> df.sparse.density
0.5
"""
def _validate(self, data):
dtypes = data.dtypes
if not all(isinstance(t, SparseDtype) for t in dtypes):
raise AttributeError(self._validation_msg)
@classmethod
def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
"""
Create a new DataFrame from a scipy sparse matrix.
Parameters
----------
data : scipy.sparse.spmatrix
Must be convertible to csc format.
index, columns : Index, optional
Row and column labels to use for the resulting DataFrame.
Defaults to a RangeIndex.
Returns
-------
DataFrame
Each column of the DataFrame is stored as a
:class:`arrays.SparseArray`.
Examples
--------
>>> import scipy.sparse
>>> mat = scipy.sparse.eye(3, dtype=float)
>>> pd.DataFrame.sparse.from_spmatrix(mat)
0 1 2
0 1.0 0 0
1 0 1.0 0
2 0 0 1.0
"""
from pandas._libs.sparse import IntIndex
from pandas import DataFrame
data = data.tocsc()
index, columns = cls._prep_index(data, index, columns)
n_rows, n_columns = data.shape
# We need to make sure indices are sorted, as we create
# IntIndex with no input validation (i.e. check_integrity=False).
# Indices may already be sorted in scipy in which case this adds
# a small overhead.
data.sort_indices()
indices = data.indices
indptr = data.indptr
array_data = data.data
dtype = SparseDtype(array_data.dtype, 0)
arrays = []
for i in range(n_columns):
sl = slice(indptr[i], indptr[i + 1])
idx = IntIndex(n_rows, indices[sl], check_integrity=False)
arr = SparseArray._simple_new(array_data[sl], idx, dtype)
arrays.append(arr)
return DataFrame._from_arrays(
arrays, columns=columns, index=index, verify_integrity=False
)
def to_dense(self) -> DataFrame:
"""
Convert a DataFrame with sparse values to dense.
Returns
-------
DataFrame
A DataFrame with the same values stored as dense arrays.
Examples
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
>>> df.sparse.to_dense()
A
0 0
1 1
2 0
"""
from pandas import DataFrame
data = {k: v.array.to_dense() for k, v in self._parent.items()}
return DataFrame(data, index=self._parent.index, columns=self._parent.columns)
def to_coo(self):
"""
Return the contents of the frame as a sparse SciPy COO matrix.
Returns
-------
scipy.sparse.spmatrix
If the caller is heterogeneous and contains booleans or objects,
the result will be of dtype=object. See Notes.
Notes
-----
The dtype will be the lowest-common-denominator type (implicit
upcasting); that is to say if the dtypes (even of numeric types)
are mixed, the one that accommodates all will be chosen.
For example, if the dtypes are float16 and float32, dtype will be upcast to
float32. By numpy.find_common_type convention, mixing int64 and
uint64 will result in a float64 dtype.
Examples
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
>>> df.sparse.to_coo()
<COOrdinate sparse matrix of dtype 'int64'
with 2 stored elements and shape (4, 1)>
"""
import_optional_dependency("scipy")
from scipy.sparse import coo_matrix
dtype = find_common_type(self._parent.dtypes.to_list())
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype
cols, rows, data = [], [], []
for col, (_, ser) in enumerate(self._parent.items()):
sp_arr = ser.array
if sp_arr.fill_value != 0:
raise ValueError("fill value must be 0 when converting to COO matrix")
row = sp_arr.sp_index.indices
cols.append(np.repeat(col, len(row)))
rows.append(row)
data.append(sp_arr.sp_values.astype(dtype, copy=False))
cols = np.concatenate(cols)
rows = np.concatenate(rows)
data = np.concatenate(data)
return coo_matrix((data, (rows, cols)), shape=self._parent.shape)
@property
def density(self) -> float:
"""
Ratio of non-sparse points to total (dense) data points.
Examples
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
>>> df.sparse.density
0.5
"""
tmp = np.mean([column.array.density for _, column in self._parent.items()])
return tmp
@staticmethod
def _prep_index(data, index, columns):
from pandas.core.indexes.api import (
default_index,
ensure_index,
)
N, K = data.shape
if index is None:
index = default_index(N)
else:
index = ensure_index(index)
if columns is None:
columns = default_index(K)
else:
columns = ensure_index(columns)
if len(columns) != K:
raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
if len(index) != N:
raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
return index, columns
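A hedged end-to-end sketch of the DataFrame accessor round trip implemented above (not part of the diff; assumes scipy is installed):

import numpy as np
import scipy.sparse
import pandas as pd

mat = scipy.sparse.random(5, 3, density=0.4, format="csc", random_state=0)
df = pd.DataFrame.sparse.from_spmatrix(mat)          # each column becomes a SparseArray
assert all(isinstance(t, pd.SparseDtype) for t in df.dtypes)
coo = df.sparse.to_coo()                             # back to a scipy COO matrix (fill value must be 0)
assert np.allclose(coo.toarray(), mat.toarray())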

File diff suppressed because it is too large

View File

@ -0,0 +1,207 @@
"""
Interaction with scipy.sparse matrices.
Currently only includes to_coo helpers.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from pandas._libs import lib
from pandas.core.dtypes.missing import notna
from pandas.core.algorithms import factorize
from pandas.core.indexes.api import MultiIndex
from pandas.core.series import Series
if TYPE_CHECKING:
from collections.abc import Iterable
import numpy as np
import scipy.sparse
from pandas._typing import (
IndexLabel,
npt,
)
def _check_is_partition(parts: Iterable, whole: Iterable):
whole = set(whole)
parts = [set(x) for x in parts]
if set.intersection(*parts) != set():
raise ValueError("Is not a partition because intersection is not null.")
if set.union(*parts) != whole:
raise ValueError("Is not a partition because union is not the whole.")
def _levels_to_axis(
ss,
levels: tuple[int] | list[int],
valid_ilocs: npt.NDArray[np.intp],
sort_labels: bool = False,
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
"""
For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
where `ax_coords` are the coordinates along one of the two axes of the
destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
which correspond to these coordinates.
Parameters
----------
ss : Series
levels : tuple/list
valid_ilocs : numpy.ndarray
Array of integer positions of valid values for the sparse matrix in ss.
sort_labels : bool, default False
Sort the axis labels before forming the sparse matrix. When `levels`
refers to a single level, set to True for faster execution.
Returns
-------
ax_coords : numpy.ndarray (axis coordinates)
ax_labels : list (axis labels)
"""
# Since the labels are sorted in `Index.levels`, when we wish to sort and
# there is only one level of the MultiIndex for this axis, the desired
# output can be obtained in the following simpler, more efficient way.
if sort_labels and len(levels) == 1:
ax_coords = ss.index.codes[levels[0]][valid_ilocs]
ax_labels = ss.index.levels[levels[0]]
else:
levels_values = lib.fast_zip(
[ss.index.get_level_values(lvl).to_numpy() for lvl in levels]
)
codes, ax_labels = factorize(levels_values, sort=sort_labels)
ax_coords = codes[valid_ilocs]
ax_labels = ax_labels.tolist()
return ax_coords, ax_labels
def _to_ijv(
ss,
row_levels: tuple[int] | list[int] = (0,),
column_levels: tuple[int] | list[int] = (1,),
sort_labels: bool = False,
) -> tuple[
np.ndarray,
npt.NDArray[np.intp],
npt.NDArray[np.intp],
list[IndexLabel],
list[IndexLabel],
]:
"""
For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
constructor, and ilabels and jlabels are the row and column labels
respectively.
Parameters
----------
ss : Series
row_levels : tuple/list
column_levels : tuple/list
sort_labels : bool, default False
Sort the row and column labels before forming the sparse matrix.
When `row_levels` and/or `column_levels` refer to a single level,
set to `True` for faster execution.
Returns
-------
values : numpy.ndarray
Valid values to populate a sparse matrix, extracted from
ss.
i_coords : numpy.ndarray (row coordinates of the values)
j_coords : numpy.ndarray (column coordinates of the values)
i_labels : list (row labels)
j_labels : list (column labels)
"""
# index and column levels must be a partition of the index
_check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
# From the sparse Series, get the integer indices and data for valid sparse
# entries.
sp_vals = ss.array.sp_values
na_mask = notna(sp_vals)
values = sp_vals[na_mask]
valid_ilocs = ss.array.sp_index.indices[na_mask]
i_coords, i_labels = _levels_to_axis(
ss, row_levels, valid_ilocs, sort_labels=sort_labels
)
j_coords, j_labels = _levels_to_axis(
ss, column_levels, valid_ilocs, sort_labels=sort_labels
)
return values, i_coords, j_coords, i_labels, j_labels
def sparse_series_to_coo(
ss: Series,
row_levels: Iterable[int] = (0,),
column_levels: Iterable[int] = (1,),
sort_labels: bool = False,
) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
"""
Convert a sparse Series to a scipy.sparse.coo_matrix using index
levels row_levels, column_levels as the row and column
labels respectively. Returns the sparse_matrix, row and column labels.
"""
import scipy.sparse
if ss.index.nlevels < 2:
raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
if not ss.index.is_unique:
raise ValueError(
"Duplicate index entries are not allowed in to_coo transformation."
)
# to keep things simple, only rely on integer indexing (not labels)
row_levels = [ss.index._get_level_number(x) for x in row_levels]
column_levels = [ss.index._get_level_number(x) for x in column_levels]
v, i, j, rows, columns = _to_ijv(
ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
)
sparse_matrix = scipy.sparse.coo_matrix(
(v, (i, j)), shape=(len(rows), len(columns))
)
return sparse_matrix, rows, columns
def coo_to_sparse_series(
A: scipy.sparse.coo_matrix, dense_index: bool = False
) -> Series:
"""
Convert a scipy.sparse.coo_matrix to a Series with type sparse.
Parameters
----------
A : scipy.sparse.coo_matrix
dense_index : bool, default False
Returns
-------
Series
Raises
------
TypeError if A is not a coo_matrix
"""
from pandas import SparseDtype
try:
ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)), copy=False)
except AttributeError as err:
raise TypeError(
f"Expected coo_matrix. Got {type(A).__name__} instead."
) from err
ser = ser.sort_index()
ser = ser.astype(SparseDtype(ser.dtype))
if dense_index:
ind = MultiIndex.from_product([A.row, A.col])
ser = ser.reindex(ind)
return ser
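A small sketch of the Series-level path these helpers back (not part of the diff; assumes scipy is installed): a sparse Series with a two-level MultiIndex maps one level to rows and the other to columns.

import numpy as np
import pandas as pd

s = pd.Series(
    [3.0, np.nan, 1.0, 3.0],
    index=pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "a"), (1, "b")]),
).astype("Sparse[float64]")
A, rows, cols = s.sparse.to_coo(row_levels=[0], column_levels=[1], sort_labels=True)
# A is a 2x2 scipy.sparse.coo_matrix with 3 stored values; rows/cols hold the level labels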

File diff suppressed because it is too large

View File

@ -0,0 +1,495 @@
from __future__ import annotations
import operator
import re
from typing import (
TYPE_CHECKING,
Callable,
Union,
)
import warnings
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.compat import (
pa_version_under10p1,
pa_version_under13p0,
pa_version_under16p0,
)
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import (
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import Float64Dtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.numeric import NumericDtype
from pandas.core.arrays.string_ import (
BaseStringArray,
StringDtype,
)
from pandas.core.strings.object_array import ObjectStringArrayMixin
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
ArrayLike,
Dtype,
Self,
npt,
)
from pandas import Series
ArrowStringScalarOrNAT = Union[str, libmissing.NAType]
def _chk_pyarrow_available() -> None:
if pa_version_under10p1:
msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
raise ImportError(msg)
def _is_string_view(typ):
return not pa_version_under16p0 and pa.types.is_string_view(typ)
# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
# ObjectStringArrayMixin because we want to have the object-dtype based methods as
# fallback for the ones that pyarrow doesn't yet support
class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringArray):
"""
Extension array for string data in a ``pyarrow.ChunkedArray``.
.. warning::
ArrowStringArray is considered experimental. The implementation and
parts of the API may change without warning.
Parameters
----------
values : pyarrow.Array or pyarrow.ChunkedArray
The array of data.
Attributes
----------
None
Methods
-------
None
See Also
--------
:func:`pandas.array`
The recommended function for creating an ArrowStringArray.
Series.str
The string methods are available on Series backed by
an ArrowStringArray.
Notes
-----
ArrowStringArray returns a BooleanArray for comparison methods.
Examples
--------
>>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
<ArrowStringArray>
['This is', 'some text', <NA>, 'data.']
Length: 4, dtype: string
"""
# error: Incompatible types in assignment (expression has type "StringDtype",
# base class "ArrowExtensionArray" defined the type as "ArrowDtype")
_dtype: StringDtype # type: ignore[assignment]
_storage = "pyarrow"
_na_value: libmissing.NAType | float = libmissing.NA
def __init__(self, values) -> None:
_chk_pyarrow_available()
if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
pa.types.is_string(values.type)
or _is_string_view(values.type)
or (
pa.types.is_dictionary(values.type)
and (
pa.types.is_string(values.type.value_type)
or pa.types.is_large_string(values.type.value_type)
or _is_string_view(values.type.value_type)
)
)
):
values = pc.cast(values, pa.large_string())
super().__init__(values)
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
if not pa.types.is_large_string(self._pa_array.type):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of "
"large_string type"
)
@classmethod
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
pa_scalar = super()._box_pa_scalar(value, pa_type)
if pa.types.is_string(pa_scalar.type) and pa_type is None:
pa_scalar = pc.cast(pa_scalar, pa.large_string())
return pa_scalar
@classmethod
def _box_pa_array(
cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
pa_array = super()._box_pa_array(value, pa_type)
if pa.types.is_string(pa_array.type) and pa_type is None:
pa_array = pc.cast(pa_array, pa.large_string())
return pa_array
def __len__(self) -> int:
"""
Length of this array.
Returns
-------
length : int
"""
return len(self._pa_array)
@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
from pandas.core.arrays.masked import BaseMaskedArray
_chk_pyarrow_available()
if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype in ensure_string_array and
# numerical issues with Float32Dtype
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
return cls(pa.array(result, mask=na_values, type=pa.large_string()))
elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
return cls(pc.cast(scalars, pa.large_string()))
# convert non-na-likes to str
result = lib.ensure_string_array(scalars, copy=copy)
return cls(pa.array(result, type=pa.large_string(), from_pandas=True))
@classmethod
def _from_sequence_of_strings(
cls, strings, dtype: Dtype | None = None, copy: bool = False
):
return cls._from_sequence(strings, dtype=dtype, copy=copy)
@property
def dtype(self) -> StringDtype: # type: ignore[override]
"""
An instance of 'string[pyarrow]'.
"""
return self._dtype
def insert(self, loc: int, item) -> ArrowStringArray:
if self.dtype.na_value is np.nan and item is np.nan:
item = libmissing.NA
if not isinstance(item, str) and item is not libmissing.NA:
raise TypeError(
f"Invalid value '{item}' for dtype 'str'. Value should be a "
f"string or missing value, got '{type(item).__name__}' instead."
)
return super().insert(loc, item)
def _convert_bool_result(self, values, na=lib.no_default, method_name=None):
if na is not lib.no_default and not isna(na) and not isinstance(na, bool):
# GH#59561
warnings.warn(
f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)
na = bool(na)
if self.dtype.na_value is np.nan:
if na is lib.no_default or isna(na):
# NaN propagates as False
values = values.fill_null(False)
else:
values = values.fill_null(na)
return values.to_numpy()
else:
if na is not lib.no_default and not isna(
na
): # pyright: ignore [reportGeneralTypeIssues]
values = values.fill_null(na)
return BooleanDtype().__from_arrow__(values)
def _maybe_convert_setitem_value(self, value):
"""Maybe convert value to be pyarrow compatible."""
if is_scalar(value):
if isna(value):
value = None
elif not isinstance(value, str):
raise TypeError(
f"Invalid value '{value}' for dtype 'str'. Value should be a "
f"string or missing value, got '{type(value).__name__}' instead."
)
else:
value = np.array(value, dtype=object, copy=True)
value[isna(value)] = None
for v in value:
if not (v is None or isinstance(v, str)):
raise TypeError(
"Invalid value for dtype 'str'. Value should be a "
"string or missing value (or array of those)."
)
return super()._maybe_convert_setitem_value(value)
def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
value_set = [
pa_scalar.as_py()
for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
if pa_scalar.type in (pa.string(), pa.null(), pa.large_string())
]
# short-circuit to return all False array.
if not len(value_set):
return np.zeros(len(self), dtype=bool)
result = pc.is_in(
self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type)
)
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)
if dtype == self.dtype:
if copy:
return self.copy()
return self
elif isinstance(dtype, NumericDtype):
data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
return dtype.__from_arrow__(data)
elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
return self.to_numpy(dtype=dtype, na_value=np.nan)
return super().astype(dtype, copy=copy)
@property
def _data(self):
# dask accesses ._data directly
warnings.warn(
f"{type(self).__name__}._data is a deprecated and will be removed "
"in a future version, use ._pa_array instead",
FutureWarning,
stacklevel=find_stack_level(),
)
return self._pa_array
# ------------------------------------------------------------------------
# String methods interface
_str_isalnum = ArrowStringArrayMixin._str_isalnum
_str_isalpha = ArrowStringArrayMixin._str_isalpha
_str_isdecimal = ArrowStringArrayMixin._str_isdecimal
_str_isdigit = ArrowStringArrayMixin._str_isdigit
_str_islower = ArrowStringArrayMixin._str_islower
_str_isnumeric = ArrowStringArrayMixin._str_isnumeric
_str_isspace = ArrowStringArrayMixin._str_isspace
_str_istitle = ArrowStringArrayMixin._str_istitle
_str_isupper = ArrowStringArrayMixin._str_isupper
_str_map = BaseStringArray._str_map
_str_startswith = ArrowStringArrayMixin._str_startswith
_str_endswith = ArrowStringArrayMixin._str_endswith
_str_pad = ArrowStringArrayMixin._str_pad
_str_match = ArrowStringArrayMixin._str_match
_str_fullmatch = ArrowStringArrayMixin._str_fullmatch
_str_lower = ArrowStringArrayMixin._str_lower
_str_upper = ArrowStringArrayMixin._str_upper
_str_strip = ArrowStringArrayMixin._str_strip
_str_lstrip = ArrowStringArrayMixin._str_lstrip
_str_rstrip = ArrowStringArrayMixin._str_rstrip
_str_removesuffix = ArrowStringArrayMixin._str_removesuffix
_str_get = ArrowStringArrayMixin._str_get
_str_capitalize = ArrowStringArrayMixin._str_capitalize
_str_title = ArrowStringArrayMixin._str_title
_str_swapcase = ArrowStringArrayMixin._str_swapcase
_str_slice_replace = ArrowStringArrayMixin._str_slice_replace
_str_len = ArrowStringArrayMixin._str_len
_str_slice = ArrowStringArrayMixin._str_slice
def _str_contains(
self,
pat,
case: bool = True,
flags: int = 0,
na=lib.no_default,
regex: bool = True,
):
if flags:
return super()._str_contains(pat, case, flags, na, regex)
if isinstance(pat, re.Pattern):
pat = pat.pattern
return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)
def _str_replace(
self,
pat: str | re.Pattern,
repl: str | Callable,
n: int = -1,
case: bool = True,
flags: int = 0,
regex: bool = True,
):
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
return super()._str_replace(pat, repl, n, case, flags, regex)
return ArrowStringArrayMixin._str_replace(
self, pat, repl, n, case, flags, regex
)
def _str_repeat(self, repeats: int | Sequence[int]):
if not isinstance(repeats, int):
return super()._str_repeat(repeats)
else:
return ArrowExtensionArray._str_repeat(self, repeats=repeats)
def _str_removeprefix(self, prefix: str):
if not pa_version_under13p0:
return ArrowStringArrayMixin._str_removeprefix(self, prefix)
return super()._str_removeprefix(prefix)
def _str_count(self, pat: str, flags: int = 0):
if flags:
return super()._str_count(pat, flags)
result = pc.count_substring_regex(self._pa_array, pat)
return self._convert_int_result(result)
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
if (
pa_version_under13p0
and not (start != 0 and end is not None)
and not (start == 0 and end is None)
):
# GH#59562
return super()._str_find(sub, start, end)
return ArrowStringArrayMixin._str_find(self, sub, start, end)
def _str_get_dummies(self, sep: str = "|"):
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)
if len(labels) == 0:
return np.empty(shape=(0, 0), dtype=np.int64), labels
dummies = np.vstack(dummies_pa.to_numpy())
return dummies.astype(np.int64, copy=False), labels
def _convert_int_result(self, result):
if self.dtype.na_value is np.nan:
if isinstance(result, pa.Array):
result = result.to_numpy(zero_copy_only=False)
else:
result = result.to_numpy()
if result.dtype == np.int32:
result = result.astype(np.int64)
return result
return Int64Dtype().__from_arrow__(result)
def _convert_rank_result(self, result):
if self.dtype.na_value is np.nan:
if isinstance(result, pa.Array):
result = result.to_numpy(zero_copy_only=False)
else:
result = result.to_numpy()
return result.astype("float64", copy=False)
return Float64Dtype().__from_arrow__(result)
def _reduce(
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
if self.dtype.na_value is np.nan and name in ["any", "all"]:
if not skipna:
nas = pc.is_null(self._pa_array)
arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
else:
arr = pc.not_equal(self._pa_array, "")
result = ArrowExtensionArray(arr)._reduce(
name, skipna=skipna, keepdims=keepdims, **kwargs
)
if keepdims:
# ArrowExtensionArray will return a length-1 bool[pyarrow] array
return result.astype(np.bool_)
return result
if name in ("min", "max", "sum", "argmin", "argmax"):
result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
else:
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
if name in ("argmin", "argmax") and isinstance(result, pa.Array):
return self._convert_int_result(result)
elif isinstance(result, pa.Array):
return type(self)(result)
else:
return result
def value_counts(self, dropna: bool = True) -> Series:
result = super().value_counts(dropna=dropna)
if self.dtype.na_value is np.nan:
res_values = result._values.to_numpy()
return result._constructor(
res_values, index=result.index, name=result.name, copy=False
)
return result
def _cmp_method(self, other, op):
if (
isinstance(other, (BaseStringArray, ArrowExtensionArray))
and self.dtype.na_value is not libmissing.NA
and other.dtype.na_value is libmissing.NA
):
# NA takes priority over NaN semantics
return NotImplemented
result = super()._cmp_method(other, op)
if self.dtype.na_value is np.nan:
if op == operator.ne:
return result.to_numpy(np.bool_, na_value=True)
else:
return result.to_numpy(np.bool_, na_value=False)
return result
def __pos__(self) -> Self:
raise TypeError(f"bad operand type for unary +: '{self.dtype}'")
class ArrowStringArrayNumpySemantics(ArrowStringArray):
_na_value = np.nan
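A brief usage sketch of the pyarrow-backed string path above (not part of the diff; assumes pyarrow >= 10.0.1 is installed):

import pandas as pd

ser = pd.Series(["a", None, "c"], dtype="string[pyarrow]")
assert ser.dtype.storage == "pyarrow"
out = ser.str.startswith("a")   # dispatched to pyarrow compute; missing values stay pd.NA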

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,657 @@
"""
Misc tools for implementing data structures
Note: pandas.core.common is *not* part of the public API.
"""
from __future__ import annotations
import builtins
from collections import (
abc,
defaultdict,
)
from collections.abc import (
Collection,
Generator,
Hashable,
Iterable,
Sequence,
)
import contextlib
from functools import partial
import inspect
from typing import (
TYPE_CHECKING,
Any,
Callable,
cast,
overload,
)
import warnings
import numpy as np
from pandas._libs import lib
from pandas.compat.numpy import np_version_gte1p24
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import (
is_bool_dtype,
is_integer,
)
from pandas.core.dtypes.generic import (
ABCExtensionArray,
ABCIndex,
ABCMultiIndex,
ABCSeries,
)
from pandas.core.dtypes.inference import iterable_not_string
if TYPE_CHECKING:
from pandas._typing import (
AnyArrayLike,
ArrayLike,
NpDtype,
RandomState,
T,
)
from pandas import Index
def flatten(line):
"""
Flatten an arbitrarily nested sequence.
Parameters
----------
line : sequence
The non string sequence to flatten
Notes
-----
This doesn't treat strings as sequences.
Returns
-------
flattened : generator
"""
for element in line:
if iterable_not_string(element):
yield from flatten(element)
else:
yield element
def consensus_name_attr(objs):
name = objs[0].name
for obj in objs[1:]:
try:
if obj.name != name:
name = None
except ValueError:
name = None
return name
def is_bool_indexer(key: Any) -> bool:
"""
Check whether `key` is a valid boolean indexer.
Parameters
----------
key : Any
Only list-likes may be considered boolean indexers.
All other types are not considered a boolean indexer.
For array-like input, boolean ndarrays or ExtensionArrays
with ``_is_boolean`` set are considered boolean indexers.
Returns
-------
bool
Whether `key` is a valid boolean indexer.
Raises
------
ValueError
When the array is an object-dtype ndarray or ExtensionArray
and contains missing values.
See Also
--------
check_array_indexer : Check that `key` is a valid array to index,
and convert to an ndarray.
"""
if isinstance(
key, (ABCSeries, np.ndarray, ABCIndex, ABCExtensionArray)
) and not isinstance(key, ABCMultiIndex):
if key.dtype == np.object_:
key_array = np.asarray(key)
if not lib.is_bool_array(key_array):
na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
if lib.is_bool_array(key_array, skipna=True):
# Don't raise on e.g. ["A", "B", np.nan], see
# test_loc_getitem_list_of_labels_categoricalindex_with_na
raise ValueError(na_msg)
return False
return True
elif is_bool_dtype(key.dtype):
return True
elif isinstance(key, list):
# check if np.array(key).dtype would be bool
if len(key) > 0:
if type(key) is not list: # noqa: E721
# GH#42461 cython will raise TypeError if we pass a subclass
key = list(key)
return lib.is_bool_list(key)
return False
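# Hedged examples of what counts as a boolean indexer (illustrative only):
#     >>> is_bool_indexer([True, False])
#     True
#     >>> is_bool_indexer(np.array([1, 0]))
#     False
#     >>> is_bool_indexer(np.array([True, np.nan], dtype=object))
#     Traceback (most recent call last):
#     ...
#     ValueError: Cannot mask with non-boolean array containing NA / NaN values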
def cast_scalar_indexer(val):
"""
Disallow indexing with a float key, even if that key is a round number.
Parameters
----------
val : scalar
Returns
-------
outval : scalar
"""
# assumes lib.is_scalar(val)
if lib.is_float(val) and val.is_integer():
raise IndexError(
# GH#34193
"Indexing with a float is no longer supported. Manually convert "
"to an integer key instead."
)
return val
def not_none(*args):
"""
Returns a generator consisting of the arguments that are not None.
"""
return (arg for arg in args if arg is not None)
def any_none(*args) -> bool:
"""
Returns a boolean indicating if any argument is None.
"""
return any(arg is None for arg in args)
def all_none(*args) -> bool:
"""
Returns a boolean indicating if all arguments are None.
"""
return all(arg is None for arg in args)
def any_not_none(*args) -> bool:
"""
Returns a boolean indicating if any argument is not None.
"""
return any(arg is not None for arg in args)
def all_not_none(*args) -> bool:
"""
Returns a boolean indicating if all arguments are not None.
"""
return all(arg is not None for arg in args)
def count_not_none(*args) -> int:
"""
Returns the count of arguments that are not None.
"""
return sum(x is not None for x in args)
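# Hedged examples of the small None-checking helpers above (illustrative only):
#     >>> any_none(1, None), all_none(None, None), all_not_none(1, 2)
#     (True, True, True)
#     >>> count_not_none(1, None, "a")
#     2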
@overload
def asarray_tuplesafe(
values: ArrayLike | list | tuple | zip, dtype: NpDtype | None = ...
) -> np.ndarray:
# ExtensionArray can only be returned when values is an Index, all other iterables
# will return np.ndarray. Unfortunately "all other" cannot be encoded in a type
# signature, so instead we special-case some common types.
...
@overload
def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = ...) -> ArrayLike:
...
def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLike:
if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")):
values = list(values)
elif isinstance(values, ABCIndex):
return values._values
elif isinstance(values, ABCSeries):
return values._values
if isinstance(values, list) and dtype in [np.object_, object]:
return construct_1d_object_array_from_listlike(values)
try:
with warnings.catch_warnings():
# Can remove warning filter once NumPy 1.24 is min version
if not np_version_gte1p24:
warnings.simplefilter("ignore", np.VisibleDeprecationWarning)
result = np.asarray(values, dtype=dtype)
except ValueError:
# Using try/except since it's more performant than checking is_list_like
# over each element
# error: Argument 1 to "construct_1d_object_array_from_listlike"
# has incompatible type "Iterable[Any]"; expected "Sized"
return construct_1d_object_array_from_listlike(values) # type: ignore[arg-type]
if issubclass(result.dtype.type, str):
result = np.asarray(values, dtype=object)
if result.ndim == 2:
# Avoid building an array of arrays:
values = [tuple(x) for x in values]
result = construct_1d_object_array_from_listlike(values)
return result
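# A hedged illustration: tuples are preserved as scalar elements of a 1-D
# object array rather than being broadcast into a 2-D array.
#     >>> asarray_tuplesafe([(1, 2), (3, 4)]).shape
#     (2,)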
def index_labels_to_array(
labels: np.ndarray | Iterable, dtype: NpDtype | None = None
) -> np.ndarray:
"""
Transform label or iterable of labels to array, for use in Index.
Parameters
----------
labels : label or iterable of labels
The label or labels to convert.
dtype : dtype
If specified, use as dtype of the resulting array, otherwise infer.
Returns
-------
array
"""
if isinstance(labels, (str, tuple)):
labels = [labels]
if not isinstance(labels, (list, np.ndarray)):
try:
labels = list(labels)
except TypeError: # non-iterable
labels = [labels]
labels = asarray_tuplesafe(labels, dtype=dtype)
return labels
def maybe_make_list(obj):
if obj is not None and not isinstance(obj, (tuple, list)):
return [obj]
return obj
def maybe_iterable_to_list(obj: Iterable[T] | T) -> Collection[T] | T:
"""
If obj is Iterable but not list-like, consume into list.
"""
if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized):
return list(obj)
obj = cast(Collection, obj)
return obj
def is_null_slice(obj) -> bool:
"""
We have a null slice.
"""
return (
isinstance(obj, slice)
and obj.start is None
and obj.stop is None
and obj.step is None
)
def is_empty_slice(obj) -> bool:
"""
We have an empty slice, e.g. no values are selected.
"""
return (
isinstance(obj, slice)
and obj.start is not None
and obj.stop is not None
and obj.start == obj.stop
)
def is_true_slices(line) -> list[bool]:
"""
Find non-trivial slices in "line": return a list of booleans with same length.
"""
return [isinstance(k, slice) and not is_null_slice(k) for k in line]
# TODO: used only once in indexing; belongs elsewhere?
def is_full_slice(obj, line: int) -> bool:
"""
We have a full length slice.
"""
return (
isinstance(obj, slice)
and obj.start == 0
and obj.stop == line
and obj.step is None
)
def get_callable_name(obj):
# typical case has name
if hasattr(obj, "__name__"):
return getattr(obj, "__name__")
# some objects don't; could recurse
if isinstance(obj, partial):
return get_callable_name(obj.func)
# fall back to class name
if callable(obj):
return type(obj).__name__
# everything failed (probably because the argument
# wasn't actually callable); we return None
# instead of the empty string in this case to allow
# distinguishing between no name and a name of ''
return None
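# Hedged examples (illustrative only): partials are unwrapped, other callables
# fall back to their __name__ or class name.
#     >>> get_callable_name(partial(max, default=0))
#     'max'
#     >>> get_callable_name(sorted)
#     'sorted'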
def apply_if_callable(maybe_callable, obj, **kwargs):
"""
Evaluate possibly callable input using obj and kwargs if it is callable,
otherwise return as it is.
Parameters
----------
maybe_callable : possibly a callable
obj : NDFrame
**kwargs
"""
if callable(maybe_callable):
return maybe_callable(obj, **kwargs)
return maybe_callable
def standardize_mapping(into):
"""
Helper function to standardize a supplied mapping.
Parameters
----------
into : instance or subclass of collections.abc.Mapping
Must be a class, an initialized collections.defaultdict,
or an instance of a collections.abc.Mapping subclass.
Returns
-------
mapping : a collections.abc.Mapping subclass or other constructor
a callable object that can accept an iterator to create
the desired Mapping.
See Also
--------
DataFrame.to_dict
Series.to_dict
"""
if not inspect.isclass(into):
if isinstance(into, defaultdict):
return partial(defaultdict, into.default_factory)
into = type(into)
if not issubclass(into, abc.Mapping):
raise TypeError(f"unsupported type: {into}")
if into == defaultdict:
raise TypeError("to_dict() only accepts initialized defaultdicts")
return into
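# A hedged sketch of how the ``into`` argument is normalized (as used by
# DataFrame.to_dict / Series.to_dict):
#     >>> from collections import OrderedDict, defaultdict
#     >>> standardize_mapping(OrderedDict)
#     <class 'collections.OrderedDict'>
#     >>> standardize_mapping(defaultdict(list))   # -> partial(defaultdict, list)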
@overload
def random_state(state: np.random.Generator) -> np.random.Generator:
...
@overload
def random_state(
state: int | np.ndarray | np.random.BitGenerator | np.random.RandomState | None,
) -> np.random.RandomState:
...
def random_state(state: RandomState | None = None):
"""
Helper function for processing random_state arguments.
Parameters
----------
state : int, array-like, BitGenerator, Generator, np.random.RandomState, None.
If receives an int, array-like, or BitGenerator, passes to
np.random.RandomState() as seed.
If receives an np.random.RandomState or Generator, just returns it unchanged.
If receives `None`, returns np.random.
If receives anything else, raises an informative ValueError.
Default None.
Returns
-------
np.random.RandomState or np.random.Generator. If state is None, returns np.random
"""
if is_integer(state) or isinstance(state, (np.ndarray, np.random.BitGenerator)):
return np.random.RandomState(state)
elif isinstance(state, np.random.RandomState):
return state
elif isinstance(state, np.random.Generator):
return state
elif state is None:
return np.random
else:
raise ValueError(
"random_state must be an integer, array-like, a BitGenerator, Generator, "
"a numpy RandomState, or None"
)
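# Hedged usage sketch: ints/arrays/BitGenerators seed a new RandomState,
# existing RandomState/Generator objects pass through, and None means np.random.
#     >>> random_state(42)                       # RandomState seeded with 42
#     >>> random_state(np.random.default_rng())  # returned unchanged
#     >>> random_state(None) is np.random
#     True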
def pipe(
obj, func: Callable[..., T] | tuple[Callable[..., T], str], *args, **kwargs
) -> T:
"""
Apply a function ``func`` to object ``obj`` either by passing obj as the
first argument to the function or, in the case that the func is a tuple,
interpret the first element of the tuple as a function and pass the obj to
that function as a keyword argument whose key is the value of the second
element of the tuple.
Parameters
----------
func : callable or tuple of (callable, str)
Function to apply to this object or, alternatively, a
``(callable, data_keyword)`` tuple where ``data_keyword`` is a
string indicating the keyword of ``callable`` that expects the
object.
*args : iterable, optional
Positional arguments passed into ``func``.
**kwargs : dict, optional
A dictionary of keyword arguments passed into ``func``.
Returns
-------
object : the return type of ``func``.
"""
if isinstance(func, tuple):
func, target = func
if target in kwargs:
msg = f"{target} is both the pipe target and a keyword argument"
raise ValueError(msg)
kwargs[target] = obj
return func(*args, **kwargs)
else:
return func(obj, *args, **kwargs)
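# Hedged examples of both calling conventions supported by pipe():
#     >>> pipe(4, lambda x, y: x + y, y=1)
#     5
#     >>> pipe(4, (lambda y, data: data * y, "data"), 10)
#     40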
def get_rename_function(mapper):
"""
Returns a function that will map names/labels, depending on whether
mapper is a dict, Series, or just a function.
"""
def f(x):
if x in mapper:
return mapper[x]
else:
return x
return f if isinstance(mapper, (abc.Mapping, ABCSeries)) else mapper
def convert_to_list_like(
values: Hashable | Iterable | AnyArrayLike,
) -> list | AnyArrayLike:
"""
Convert list-like or scalar input to list-like. List, numpy and pandas array-like
inputs are returned unmodified whereas others are converted to list.
"""
if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)):
return values
elif isinstance(values, abc.Iterable) and not isinstance(values, str):
return list(values)
return [values]
@contextlib.contextmanager
def temp_setattr(
obj, attr: str, value, condition: bool = True
) -> Generator[None, None, None]:
"""
Temporarily set attribute on an object.
Parameters
----------
obj : object
Object whose attribute will be modified.
attr : str
Attribute to modify.
value : Any
Value to temporarily set attribute to.
condition : bool, default True
Whether to set the attribute. Provided in order to not have to
conditionally use this context manager.
Yields
------
object : obj with modified attribute.
"""
if condition:
old_value = getattr(obj, attr)
setattr(obj, attr, value)
try:
yield obj
finally:
if condition:
setattr(obj, attr, old_value)
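# A hedged usage sketch: the attribute is restored even if the body raises,
# and nothing is touched when ``condition`` is False.
#     >>> class Box:
#     ...     flag = False
#     >>> box = Box()
#     >>> with temp_setattr(box, "flag", True):
#     ...     assert box.flag
#     >>> box.flag
#     False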
def require_length_match(data, index: Index) -> None:
"""
Check the length of data matches the length of the index.
"""
if len(data) != len(index):
raise ValueError(
"Length of values "
f"({len(data)}) "
"does not match length of index "
f"({len(index)})"
)
# the ufuncs np.maximum.reduce and np.minimum.reduce default to axis=0,
# whereas np.min and np.max (which directly call obj.min and obj.max)
# default to axis=None.
_builtin_table = {
builtins.sum: np.sum,
builtins.max: np.maximum.reduce,
builtins.min: np.minimum.reduce,
}
# GH#53425: Only for deprecation
_builtin_table_alias = {
builtins.sum: "np.sum",
builtins.max: "np.maximum.reduce",
builtins.min: "np.minimum.reduce",
}
_cython_table = {
builtins.sum: "sum",
builtins.max: "max",
builtins.min: "min",
np.all: "all",
np.any: "any",
np.sum: "sum",
np.nansum: "sum",
np.mean: "mean",
np.nanmean: "mean",
np.prod: "prod",
np.nanprod: "prod",
np.std: "std",
np.nanstd: "std",
np.var: "var",
np.nanvar: "var",
np.median: "median",
np.nanmedian: "median",
np.max: "max",
np.nanmax: "max",
np.min: "min",
np.nanmin: "min",
np.cumprod: "cumprod",
np.nancumprod: "cumprod",
np.cumsum: "cumsum",
np.nancumsum: "cumsum",
}
def get_cython_func(arg: Callable) -> str | None:
"""
if we define an internal function for this argument, return it
"""
return _cython_table.get(arg)
def is_builtin_func(arg):
"""
if we define a builtin function for this argument, return it,
otherwise return the arg
"""
return _builtin_table.get(arg, arg)
def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]:
"""
If a name is missing, replace it with level_n, where n is its position in the list
.. versionadded:: 1.4.0
Parameters
----------
names : list-like
list of column names or None values.
Returns
-------
list
list of column names with the None values replaced.
"""
return [f"level_{i}" if name is None else name for i, name in enumerate(names)]

View File

@ -0,0 +1,213 @@
"""
Core eval alignment algorithms.
"""
from __future__ import annotations
from functools import (
partial,
wraps,
)
from typing import (
TYPE_CHECKING,
Callable,
)
import warnings
import numpy as np
from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.computation.common import result_type_many
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import F
from pandas.core.generic import NDFrame
from pandas.core.indexes.api import Index
def _align_core_single_unary_op(
term,
) -> tuple[partial | type[NDFrame], dict[str, Index] | None]:
typ: partial | type[NDFrame]
axes: dict[str, Index] | None = None
if isinstance(term.value, np.ndarray):
typ = partial(np.asanyarray, dtype=term.value.dtype)
else:
typ = type(term.value)
if hasattr(term.value, "axes"):
axes = _zip_axes_from_type(typ, term.value.axes)
return typ, axes
def _zip_axes_from_type(
typ: type[NDFrame], new_axes: Sequence[Index]
) -> dict[str, Index]:
return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)}
def _any_pandas_objects(terms) -> bool:
"""
Check a sequence of terms for instances of PandasObject.
"""
return any(isinstance(term.value, PandasObject) for term in terms)
def _filter_special_cases(f) -> Callable[[F], F]:
@wraps(f)
def wrapper(terms):
# single unary operand
if len(terms) == 1:
return _align_core_single_unary_op(terms[0])
term_values = (term.value for term in terms)
# we don't have any pandas objects
if not _any_pandas_objects(terms):
return result_type_many(*term_values), None
return f(terms)
return wrapper
@_filter_special_cases
def _align_core(terms):
term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
term_dims = [terms[i].value.ndim for i in term_index]
from pandas import Series
ndims = Series(dict(zip(term_index, term_dims)))
# initial axes are the axes of the term with the most dimensions
biggest = terms[ndims.idxmax()].value
typ = biggest._constructor
axes = biggest.axes
naxes = len(axes)
gt_than_one_axis = naxes > 1
for value in (terms[i].value for i in term_index):
is_series = isinstance(value, ABCSeries)
is_series_and_gt_one_axis = is_series and gt_than_one_axis
for axis, items in enumerate(value.axes):
if is_series_and_gt_one_axis:
ax, itm = naxes - 1, value.index
else:
ax, itm = axis, items
if not axes[ax].is_(itm):
axes[ax] = axes[ax].union(itm)
for i, ndim in ndims.items():
for axis, items in zip(range(ndim), axes):
ti = terms[i].value
if hasattr(ti, "reindex"):
transpose = isinstance(ti, ABCSeries) and naxes > 1
reindexer = axes[naxes - 1] if transpose else items
term_axis_size = len(ti.axes[axis])
reindexer_size = len(reindexer)
ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
if ordm >= 1 and reindexer_size >= 10000:
w = (
f"Alignment difference on axis {axis} is larger "
f"than an order of magnitude on term {repr(terms[i].name)}, "
f"by more than {ordm:.4g}; performance may suffer."
)
warnings.warn(
w, category=PerformanceWarning, stacklevel=find_stack_level()
)
obj = ti.reindex(reindexer, axis=axis, copy=False)
terms[i].update(obj)
terms[i].update(terms[i].value.values)
return typ, _zip_axes_from_type(typ, axes)
def align_terms(terms):
"""
Align a set of terms.
"""
try:
# flatten the parse tree (a nested list, really)
terms = list(com.flatten(terms))
except TypeError:
# can't iterate so it must just be a constant or single variable
if isinstance(terms.value, (ABCSeries, ABCDataFrame)):
typ = type(terms.value)
return typ, _zip_axes_from_type(typ, terms.value.axes)
return np.result_type(terms.type), None
# if all resolved variables are numeric scalars
if all(term.is_scalar for term in terms):
return result_type_many(*(term.value for term in terms)).type, None
# perform the main alignment
typ, axes = _align_core(terms)
return typ, axes
def reconstruct_object(typ, obj, axes, dtype):
"""
Reconstruct an object given its type, raw value, and possibly empty
(None) axes.
Parameters
----------
typ : object
A type
obj : object
The value to use in the type constructor
axes : dict
The axes to use to construct the resulting pandas object
Returns
-------
ret : typ
An object of type ``typ`` with the value `obj` and possible axes
`axes`.
"""
try:
typ = typ.type
except AttributeError:
pass
res_t = np.result_type(obj.dtype, dtype)
if not isinstance(typ, partial) and issubclass(typ, PandasObject):
return typ(obj, dtype=res_t, **axes)
# special case for pathological things like ~True/~False
if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
ret_value = res_t.type(obj)
else:
ret_value = typ(obj).astype(res_t)
# The condition is to distinguish 0-dim array (returned in case of
# scalar) and 1 element array
# e.g. np.array(0) and np.array([0])
if (
len(obj.shape) == 1
and len(obj) == 1
and not isinstance(ret_value, np.ndarray)
):
ret_value = np.array([ret_value]).astype(res_t)
return ret_value

View File

@ -0,0 +1,2 @@
__all__ = ["eval"]
from pandas.core.computation.eval import eval

View File

@ -0,0 +1,8 @@
from __future__ import annotations
from pandas.compat._optional import import_optional_dependency
ne = import_optional_dependency("numexpr", errors="warn")
NUMEXPR_INSTALLED = ne is not None
__all__ = ["NUMEXPR_INSTALLED"]

View File

@ -0,0 +1,48 @@
from __future__ import annotations
from functools import reduce
import numpy as np
from pandas._config import get_option
def ensure_decoded(s) -> str:
"""
If we have bytes, decode them to unicode.
"""
if isinstance(s, (np.bytes_, bytes)):
s = s.decode(get_option("display.encoding"))
return s
def result_type_many(*arrays_and_dtypes):
"""
Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)
argument limit.
"""
try:
return np.result_type(*arrays_and_dtypes)
except ValueError:
# we have > NPY_MAXARGS terms in our expression
return reduce(np.result_type, arrays_and_dtypes)
except TypeError:
from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import is_extension_array_dtype
arr_and_dtypes = list(arrays_and_dtypes)
ea_dtypes, non_ea_dtypes = [], []
for arr_or_dtype in arr_and_dtypes:
if is_extension_array_dtype(arr_or_dtype):
ea_dtypes.append(arr_or_dtype)
else:
non_ea_dtypes.append(arr_or_dtype)
if non_ea_dtypes:
try:
np_dtype = np.result_type(*non_ea_dtypes)
except ValueError:
np_dtype = reduce(np.result_type, arrays_and_dtypes)
return find_common_type(ea_dtypes + [np_dtype])
return find_common_type(ea_dtypes)
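# A hedged illustration of why the wrapper exists: np.result_type accepts at
# most NPY_MAXARGS (32) operands, so larger argument lists are reduced pairwise.
#     >>> dtypes = [np.dtype("int8")] * 40 + [np.dtype("float32")]
#     >>> result_type_many(*dtypes)
#     dtype('float32')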

View File

@ -0,0 +1,143 @@
"""
Engine classes for :func:`~pandas.eval`
"""
from __future__ import annotations
import abc
from typing import TYPE_CHECKING
from pandas.errors import NumExprClobberingError
from pandas.core.computation.align import (
align_terms,
reconstruct_object,
)
from pandas.core.computation.ops import (
MATHOPS,
REDUCTIONS,
)
from pandas.io.formats import printing
if TYPE_CHECKING:
from pandas.core.computation.expr import Expr
_ne_builtins = frozenset(MATHOPS + REDUCTIONS)
def _check_ne_builtin_clash(expr: Expr) -> None:
"""
Attempt to prevent foot-shooting in a helpful way.
Parameters
----------
expr : Expr
The expression whose term names are checked against numexpr built-ins.
"""
names = expr.names
overlap = names & _ne_builtins
if overlap:
s = ", ".join([repr(x) for x in overlap])
raise NumExprClobberingError(
f'Variables in expression "{expr}" overlap with builtins: ({s})'
)
class AbstractEngine(metaclass=abc.ABCMeta):
"""Object serving as a base class for all engines."""
has_neg_frac = False
def __init__(self, expr) -> None:
self.expr = expr
self.aligned_axes = None
self.result_type = None
def convert(self) -> str:
"""
Convert an expression for evaluation.
Defaults to return the expression as a string.
"""
return printing.pprint_thing(self.expr)
def evaluate(self) -> object:
"""
Run the engine on the expression.
This method performs alignment which is necessary no matter what engine
is being used, thus its implementation is in the base class.
Returns
-------
object
The result of the passed expression.
"""
if not self._is_aligned:
self.result_type, self.aligned_axes = align_terms(self.expr.terms)
# make sure no names in resolvers and locals/globals clash
res = self._evaluate()
return reconstruct_object(
self.result_type, res, self.aligned_axes, self.expr.terms.return_type
)
@property
def _is_aligned(self) -> bool:
return self.aligned_axes is not None and self.result_type is not None
@abc.abstractmethod
def _evaluate(self):
"""
Return an evaluated expression.
Parameters
----------
env : Scope
The local and global environment in which to evaluate an
expression.
Notes
-----
Must be implemented by subclasses.
"""
class NumExprEngine(AbstractEngine):
"""NumExpr engine class"""
has_neg_frac = True
def _evaluate(self):
import numexpr as ne
# convert the expression to a valid numexpr expression
s = self.convert()
env = self.expr.env
scope = env.full_scope
_check_ne_builtin_clash(self.expr)
return ne.evaluate(s, local_dict=scope)
class PythonEngine(AbstractEngine):
"""
Evaluate an expression in Python space.
Mostly for testing purposes.
"""
has_neg_frac = False
def evaluate(self):
return self.expr()
def _evaluate(self) -> None:
pass
ENGINES: dict[str, type[AbstractEngine]] = {
"numexpr": NumExprEngine,
"python": PythonEngine,
}

View File

@ -0,0 +1,421 @@
"""
Top level ``eval`` module.
"""
from __future__ import annotations
import tokenize
from typing import TYPE_CHECKING
import warnings
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.common import (
is_extension_array_dtype,
is_string_dtype,
)
from pandas.core.computation.engines import ENGINES
from pandas.core.computation.expr import (
PARSERS,
Expr,
)
from pandas.core.computation.parsing import tokenize_string
from pandas.core.computation.scope import ensure_scope
from pandas.core.generic import NDFrame
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from pandas.core.computation.ops import BinOp
def _check_engine(engine: str | None) -> str:
"""
Make sure a valid engine is passed.
Parameters
----------
engine : str
String to validate.
Raises
------
KeyError
* If an invalid engine is passed.
ImportError
* If numexpr was requested but doesn't exist.
Returns
-------
str
Engine name.
"""
from pandas.core.computation.check import NUMEXPR_INSTALLED
from pandas.core.computation.expressions import USE_NUMEXPR
if engine is None:
engine = "numexpr" if USE_NUMEXPR else "python"
if engine not in ENGINES:
valid_engines = list(ENGINES.keys())
raise KeyError(
f"Invalid engine '{engine}' passed, valid engines are {valid_engines}"
)
# TODO: validate this in a more general way (thinking of future engines
# that won't necessarily be import-able)
# Could potentially be done on engine instantiation
if engine == "numexpr" and not NUMEXPR_INSTALLED:
raise ImportError(
"'numexpr' is not installed or an unsupported version. Cannot use "
"engine='numexpr' for query/eval if 'numexpr' is not installed"
)
return engine
def _check_parser(parser: str):
"""
Make sure a valid parser is passed.
Parameters
----------
parser : str
Raises
------
KeyError
* If an invalid parser is passed
"""
if parser not in PARSERS:
raise KeyError(
f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}"
)
def _check_resolvers(resolvers):
if resolvers is not None:
for resolver in resolvers:
if not hasattr(resolver, "__getitem__"):
name = type(resolver).__name__
raise TypeError(
f"Resolver of type '{name}' does not "
"implement the __getitem__ method"
)
def _check_expression(expr):
"""
Make sure an expression is not an empty string
Parameters
----------
expr : object
An object that can be converted to a string
Raises
------
ValueError
* If expr is an empty string
"""
if not expr:
raise ValueError("expr cannot be an empty string")
def _convert_expression(expr) -> str:
"""
Convert an object to an expression.
This function converts an object to an expression (a unicode string) and
checks to make sure it isn't empty after conversion. This is used to
convert operators to their string representation for recursive calls to
:func:`~pandas.eval`.
Parameters
----------
expr : object
The object to be converted to a string.
Returns
-------
str
The string representation of an object.
Raises
------
ValueError
* If the expression is empty.
"""
s = pprint_thing(expr)
_check_expression(s)
return s
def _check_for_locals(expr: str, stack_level: int, parser: str):
at_top_of_stack = stack_level == 0
not_pandas_parser = parser != "pandas"
if not_pandas_parser:
msg = "The '@' prefix is only supported by the pandas parser"
elif at_top_of_stack:
msg = (
"The '@' prefix is not allowed in top-level eval calls.\n"
"please refer to your variables by name without the '@' prefix."
)
if at_top_of_stack or not_pandas_parser:
for toknum, tokval in tokenize_string(expr):
if toknum == tokenize.OP and tokval == "@":
raise SyntaxError(msg)
def eval(
expr: str | BinOp, # we leave BinOp out of the docstr bc it isn't for users
parser: str = "pandas",
engine: str | None = None,
local_dict=None,
global_dict=None,
resolvers=(),
level: int = 0,
target=None,
inplace: bool = False,
):
"""
Evaluate a Python expression as a string using various backends.
The following arithmetic operations are supported: ``+``, ``-``, ``*``,
``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
:keyword:`or`, and :keyword:`not` with the same semantics as the
corresponding bitwise operators. :class:`~pandas.Series` and
:class:`~pandas.DataFrame` objects are supported and behave as they would
with plain ol' Python evaluation.
Parameters
----------
expr : str
The expression to evaluate. This string cannot contain any Python
`statements
<https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
only Python `expressions
<https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
parser : {'pandas', 'python'}, default 'pandas'
The parser to use to construct the syntax tree from the expression. The
default of ``'pandas'`` parses code slightly different than standard
Python. Alternatively, you can parse an expression using the
``'python'`` parser to retain strict Python semantics. See the
:ref:`enhancing performance <enhancingperf.eval>` documentation for
more details.
engine : {'python', 'numexpr'}, default 'numexpr'
The engine used to evaluate the expression. Supported engines are
- None : tries to use ``numexpr``, falls back to ``python``
- ``'numexpr'`` : This default engine evaluates pandas objects using
numexpr for large speed ups in complex expressions with large frames.
- ``'python'`` : Performs operations as if you had ``eval``'d in top
level python. This engine is generally not that useful.
More backends may be available in the future.
local_dict : dict or None, optional
A dictionary of local variables, taken from locals() by default.
global_dict : dict or None, optional
A dictionary of global variables, taken from globals() by default.
resolvers : list of dict-like or None, optional
A list of objects implementing the ``__getitem__`` special method that
you can use to inject an additional collection of namespaces to use for
variable lookup. For example, this is used in the
:meth:`~DataFrame.query` method to inject the
``DataFrame.index`` and ``DataFrame.columns``
variables that refer to their respective :class:`~pandas.DataFrame`
instance attributes.
level : int, optional
The number of prior stack frames to traverse and add to the current
scope. Most users will **not** need to change this parameter.
target : object, optional, default None
This is the target object for assignment. It is used when there is
variable assignment in the expression. If so, then `target` must
support item assignment with string keys, and if a copy is being
returned, it must also support `.copy()`.
inplace : bool, default False
If `target` is provided, and the expression mutates `target`, whether
to modify `target` inplace. Otherwise, return a copy of `target` with
the mutation.
Returns
-------
ndarray, numeric scalar, DataFrame, Series, or None
The completion value of evaluating the given code or None if ``inplace=True``.
Raises
------
ValueError
There are many instances where such an error can be raised:
- `target=None`, but the expression is multiline.
- The expression is multiline, but not all of them have item assignment.
An example of such an arrangement is this:
a = b + 1
a + 2
Here, there are expressions on different lines, making it multiline,
but the last line has no variable assigned to the output of `a + 2`.
- `inplace=True`, but the expression is missing item assignment.
- Item assignment is provided, but the `target` does not support
string item assignment.
- Item assignment is provided and `inplace=False`, but the `target`
does not support the `.copy()` method
See Also
--------
DataFrame.query : Evaluates a boolean expression to query the columns
of a frame.
DataFrame.eval : Evaluate a string describing operations on
DataFrame columns.
Notes
-----
The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
recursively cast to ``float64``.
See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
more details.
Examples
--------
>>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]})
>>> df
animal age
0 dog 10
1 pig 20
We can add a new column using ``pd.eval``:
>>> pd.eval("double_age = df.age * 2", target=df)
animal age double_age
0 dog 10 20
1 pig 20 40
"""
inplace = validate_bool_kwarg(inplace, "inplace")
exprs: list[str | BinOp]
if isinstance(expr, str):
_check_expression(expr)
exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
else:
# ops.BinOp; for internal compat, not intended to be passed by users
exprs = [expr]
multi_line = len(exprs) > 1
if multi_line and target is None:
raise ValueError(
"multi-line expressions are only valid in the "
"context of data, use DataFrame.eval"
)
engine = _check_engine(engine)
_check_parser(parser)
_check_resolvers(resolvers)
ret = None
first_expr = True
target_modified = False
for expr in exprs:
expr = _convert_expression(expr)
_check_for_locals(expr, level, parser)
# get our (possibly passed-in) scope
env = ensure_scope(
level + 1,
global_dict=global_dict,
local_dict=local_dict,
resolvers=resolvers,
target=target,
)
parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
if engine == "numexpr" and (
(
is_extension_array_dtype(parsed_expr.terms.return_type)
and not is_string_dtype(parsed_expr.terms.return_type)
)
or getattr(parsed_expr.terms, "operand_types", None) is not None
and any(
(is_extension_array_dtype(elem) and not is_string_dtype(elem))
for elem in parsed_expr.terms.operand_types
)
):
warnings.warn(
"Engine has switched to 'python' because numexpr does not support "
"extension array dtypes. Please set your engine to python manually.",
RuntimeWarning,
stacklevel=find_stack_level(),
)
engine = "python"
# construct the engine and evaluate the parsed expression
eng = ENGINES[engine]
eng_inst = eng(parsed_expr)
ret = eng_inst.evaluate()
if parsed_expr.assigner is None:
if multi_line:
raise ValueError(
"Multi-line expressions are only valid "
"if all expressions contain an assignment"
)
if inplace:
raise ValueError("Cannot operate inplace if there is no assignment")
# assign if needed
assigner = parsed_expr.assigner
if env.target is not None and assigner is not None:
target_modified = True
# if returning a copy, copy only on the first assignment
if not inplace and first_expr:
try:
target = env.target
if isinstance(target, NDFrame):
target = target.copy(deep=None)
else:
target = target.copy()
except AttributeError as err:
raise ValueError("Cannot return a copy of the target") from err
else:
target = env.target
# TypeError is most commonly raised (e.g. int, list), but you
# get IndexError if you try to do this assignment on np.ndarray.
# we will ignore numpy warnings here; e.g. if trying
# to use a non-numeric indexer
try:
if inplace and isinstance(target, NDFrame):
target.loc[:, assigner] = ret
else:
target[assigner] = ret # pyright: ignore[reportGeneralTypeIssues]
except (TypeError, IndexError) as err:
raise ValueError("Cannot assign expression output to target") from err
if not resolvers:
resolvers = ({assigner: ret},)
else:
# existing resolver needs to be updated to handle
# the case of mutating an existing column in the copy
for resolver in resolvers:
if assigner in resolver:
resolver[assigner] = ret
break
else:
resolvers += ({assigner: ret},)
ret = None
first_expr = False
# We want to exclude `inplace=None` as being False.
if inplace is False:
return target if target_modified else ret
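# A hedged end-to-end sketch of the assignment handling above (column names are
# illustrative only):
#     >>> import pandas as pd
#     >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
#     >>> pd.eval("c = df.a + df.b", target=df)                # returns a modified copy
#     >>> pd.eval("c = df.a + df.b", target=df, inplace=True)  # mutates df, returns None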

View File

@ -0,0 +1,840 @@
"""
:func:`~pandas.eval` parsers.
"""
from __future__ import annotations
import ast
from functools import (
partial,
reduce,
)
from keyword import iskeyword
import tokenize
from typing import (
Callable,
ClassVar,
TypeVar,
)
import numpy as np
from pandas.errors import UndefinedVariableError
from pandas.core.dtypes.common import is_string_dtype
import pandas.core.common as com
from pandas.core.computation.ops import (
ARITH_OPS_SYMS,
BOOL_OPS_SYMS,
CMP_OPS_SYMS,
LOCAL_TAG,
MATHOPS,
REDUCTIONS,
UNARY_OPS_SYMS,
BinOp,
Constant,
FuncNode,
Op,
Term,
UnaryOp,
is_term,
)
from pandas.core.computation.parsing import (
clean_backtick_quoted_toks,
tokenize_string,
)
from pandas.core.computation.scope import Scope
from pandas.io.formats import printing
def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]:
"""
Rewrite the assignment operator for PyTables expressions that use ``=``
as a substitute for ``==``.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tuple of int, str
Either the input token or the replacement values
"""
toknum, tokval = tok
return toknum, "==" if tokval == "=" else tokval
def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]:
"""
Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
precedence is changed to boolean precedence.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tuple of int, str
Either the input token or the replacement values
"""
toknum, tokval = tok
if toknum == tokenize.OP:
if tokval == "&":
return tokenize.NAME, "and"
elif tokval == "|":
return tokenize.NAME, "or"
return toknum, tokval
return toknum, tokval
def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]:
"""
Replace local variables with a syntactically valid name.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tuple of int, str
Either the input token or the replacement values
Notes
-----
This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
"""
toknum, tokval = tok
if toknum == tokenize.OP and tokval == "@":
return tokenize.OP, LOCAL_TAG
return toknum, tokval
def _compose2(f, g):
"""
Compose 2 callables.
"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
def _compose(*funcs):
"""
Compose 2 or more callables.
"""
assert len(funcs) > 1, "At least 2 callables must be passed to compose"
return reduce(_compose2, funcs)
def _preparse(
source: str,
f=_compose(
_replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
),
) -> str:
"""
Compose a collection of tokenization functions.
Parameters
----------
source : str
A Python source code string
f : callable
This takes a tuple of (toknum, tokval) as its argument and returns a
tuple with the same structure but possibly different elements. Defaults
to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
``_replace_locals``.
Returns
-------
str
Valid Python source code
Notes
-----
The `f` parameter can be any callable that takes *and* returns input of the
form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
the ``tokenize`` module and ``tokval`` is a string.
"""
assert callable(f), "f must be callable"
return tokenize.untokenize(f(x) for x in tokenize_string(source))
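# A hedged illustration of the default preparsing pipeline: bitwise operators
# become boolean keywords and '@'-prefixed locals get the LOCAL_TAG prefix
# before ast.parse sees the source (whitespace from untokenize may vary).
#     >>> _preparse("a & b | ~c")   # roughly: "a and b or ~c"
#     >>> _preparse("@x + 1")       # roughly: "__pd_eval_local_x + 1"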
def _is_type(t):
"""
Factory for a type checking function of type ``t`` or tuple of types.
"""
return lambda x: isinstance(x.value, t)
_is_list = _is_type(list)
_is_str = _is_type(str)
# partition all AST nodes
_all_nodes = frozenset(
node
for node in (getattr(ast, name) for name in dir(ast))
if isinstance(node, type) and issubclass(node, ast.AST)
)
def _filter_nodes(superclass, all_nodes=_all_nodes):
"""
Filter out AST nodes that are subclasses of ``superclass``.
"""
node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass))
return frozenset(node_names)
_all_node_names = frozenset(x.__name__ for x in _all_nodes)
_mod_nodes = _filter_nodes(ast.mod)
_stmt_nodes = _filter_nodes(ast.stmt)
_expr_nodes = _filter_nodes(ast.expr)
_expr_context_nodes = _filter_nodes(ast.expr_context)
_boolop_nodes = _filter_nodes(ast.boolop)
_operator_nodes = _filter_nodes(ast.operator)
_unary_op_nodes = _filter_nodes(ast.unaryop)
_cmp_op_nodes = _filter_nodes(ast.cmpop)
_comprehension_nodes = _filter_nodes(ast.comprehension)
_handler_nodes = _filter_nodes(ast.excepthandler)
_arguments_nodes = _filter_nodes(ast.arguments)
_keyword_nodes = _filter_nodes(ast.keyword)
_alias_nodes = _filter_nodes(ast.alias)
# nodes that we don't support directly but are needed for parsing
_hacked_nodes = frozenset(["Assign", "Module", "Expr"])
_unsupported_expr_nodes = frozenset(
[
"Yield",
"GeneratorExp",
"IfExp",
"DictComp",
"SetComp",
"Repr",
"Lambda",
"Set",
"AST",
"Is",
"IsNot",
]
)
# these nodes are low priority or won't ever be supported (e.g., AST)
_unsupported_nodes = (
_stmt_nodes
| _mod_nodes
| _handler_nodes
| _arguments_nodes
| _keyword_nodes
| _alias_nodes
| _expr_context_nodes
| _unsupported_expr_nodes
) - _hacked_nodes
# we're rewriting assignment in some cases to be an equality comparison
# and we don't want `stmt` and friends in there, so get only the classes whose
# names are capitalized
_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
intersection = _unsupported_nodes & _base_supported_nodes
_msg = f"cannot both support and not support {intersection}"
assert not intersection, _msg
def _node_not_implemented(node_name: str) -> Callable[..., None]:
"""
Return a function that raises a NotImplementedError with a passed node name.
"""
def f(self, *args, **kwargs):
raise NotImplementedError(f"'{node_name}' nodes are not implemented")
return f
# should be bound by BaseExprVisitor but that creates a circular dependency:
# _T is used in disallow, but disallow is used to define BaseExprVisitor
# https://github.com/microsoft/pyright/issues/2315
_T = TypeVar("_T")
def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]:
"""
Decorator to disallow certain nodes from parsing. Raises a
NotImplementedError instead.
Returns
-------
callable
"""
def disallowed(cls: type[_T]) -> type[_T]:
# error: "Type[_T]" has no attribute "unsupported_nodes"
cls.unsupported_nodes = () # type: ignore[attr-defined]
for node in nodes:
new_method = _node_not_implemented(node)
name = f"visit_{node}"
# error: "Type[_T]" has no attribute "unsupported_nodes"
cls.unsupported_nodes += (name,) # type: ignore[attr-defined]
setattr(cls, name, new_method)
return cls
return disallowed
def _op_maker(op_class, op_symbol):
"""
Return a function to create an op class with its symbol already passed.
Returns
-------
callable
"""
def f(self, node, *args, **kwargs):
"""
Return a partial function with an Op subclass with an operator already passed.
Returns
-------
callable
"""
return partial(op_class, op_symbol, *args, **kwargs)
return f
_op_classes = {"binary": BinOp, "unary": UnaryOp}
def add_ops(op_classes):
"""
Decorator to add default implementation of ops.
"""
def f(cls):
for op_attr_name, op_class in op_classes.items():
ops = getattr(cls, f"{op_attr_name}_ops")
ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map")
for op in ops:
op_node = ops_map[op]
if op_node is not None:
made_op = _op_maker(op_class, op)
setattr(cls, f"visit_{op_node}", made_op)
return cls
return f
@disallow(_unsupported_nodes)
@add_ops(_op_classes)
class BaseExprVisitor(ast.NodeVisitor):
"""
Custom ast walker. Parsers of other engines should subclass this class
if necessary.
Parameters
----------
env : Scope
engine : str
parser : str
preparser : callable
"""
const_type: ClassVar[type[Term]] = Constant
term_type: ClassVar[type[Term]] = Term
binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS
binary_op_nodes = (
"Gt",
"Lt",
"GtE",
"LtE",
"Eq",
"NotEq",
"In",
"NotIn",
"BitAnd",
"BitOr",
"And",
"Or",
"Add",
"Sub",
"Mult",
"Div",
"Pow",
"FloorDiv",
"Mod",
)
binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))
unary_ops = UNARY_OPS_SYMS
unary_op_nodes = "UAdd", "USub", "Invert", "Not"
unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
rewrite_map = {
ast.Eq: ast.In,
ast.NotEq: ast.NotIn,
ast.In: ast.In,
ast.NotIn: ast.NotIn,
}
unsupported_nodes: tuple[str, ...]
def __init__(self, env, engine, parser, preparser=_preparse) -> None:
self.env = env
self.engine = engine
self.parser = parser
self.preparser = preparser
self.assigner = None
def visit(self, node, **kwargs):
if isinstance(node, str):
clean = self.preparser(node)
try:
node = ast.fix_missing_locations(ast.parse(clean))
except SyntaxError as e:
if any(iskeyword(x) for x in clean.split()):
e.msg = "Python keyword not valid identifier in numexpr query"
raise e
method = f"visit_{type(node).__name__}"
visitor = getattr(self, method)
return visitor(node, **kwargs)
def visit_Module(self, node, **kwargs):
if len(node.body) != 1:
raise SyntaxError("only a single expression is allowed")
expr = node.body[0]
return self.visit(expr, **kwargs)
def visit_Expr(self, node, **kwargs):
return self.visit(node.value, **kwargs)
def _rewrite_membership_op(self, node, left, right):
# the kind of the operator (is actually an instance)
op_instance = node.op
op_type = type(op_instance)
# must be two terms and the comparison operator must be ==/!=/in/not in
if is_term(left) and is_term(right) and op_type in self.rewrite_map:
left_list, right_list = map(_is_list, (left, right))
left_str, right_str = map(_is_str, (left, right))
# if there are any strings or lists in the expression
if left_list or right_list or left_str or right_str:
op_instance = self.rewrite_map[op_type]()
# pop the string variable out of locals and replace it with a list
# of one string, kind of a hack
if right_str:
name = self.env.add_tmp([right.value])
right = self.term_type(name, self.env)
if left_str:
name = self.env.add_tmp([left.value])
left = self.term_type(name, self.env)
op = self.visit(op_instance)
return op, op_instance, left, right
def _maybe_transform_eq_ne(self, node, left=None, right=None):
if left is None:
left = self.visit(node.left, side="left")
if right is None:
right = self.visit(node.right, side="right")
op, op_class, left, right = self._rewrite_membership_op(node, left, right)
return op, op_class, left, right
def _maybe_downcast_constants(self, left, right):
f32 = np.dtype(np.float32)
if (
left.is_scalar
and hasattr(left, "value")
and not right.is_scalar
and right.return_type == f32
):
# right is a float32 array, left is a scalar
name = self.env.add_tmp(np.float32(left.value))
left = self.term_type(name, self.env)
if (
right.is_scalar
and hasattr(right, "value")
and not left.is_scalar
and left.return_type == f32
):
# left is a float32 array, right is a scalar
name = self.env.add_tmp(np.float32(right.value))
right = self.term_type(name, self.env)
return left, right
def _maybe_eval(self, binop, eval_in_python):
# eval `in` and `not in` (for now) in "partial" python space
# things that can be evaluated in "eval" space will be turned into
# temporary variables. for example,
# [1,2] in a + 2 * b
# in that case a + 2 * b will be evaluated using numexpr, and the "in"
# call will be evaluated using isin (in python space)
return binop.evaluate(
self.env, self.engine, self.parser, self.term_type, eval_in_python
)
def _maybe_evaluate_binop(
self,
op,
op_class,
lhs,
rhs,
eval_in_python=("in", "not in"),
maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
):
res = op(lhs, rhs)
if res.has_invalid_return_type:
raise TypeError(
f"unsupported operand type(s) for {res.op}: "
f"'{lhs.type}' and '{rhs.type}'"
)
if self.engine != "pytables" and (
res.op in CMP_OPS_SYMS
and getattr(lhs, "is_datetime", False)
or getattr(rhs, "is_datetime", False)
):
# all date ops must be done in python bc numexpr doesn't work
# well with NaT
return self._maybe_eval(res, self.binary_ops)
if res.op in eval_in_python:
# "in"/"not in" ops are always evaluated in python
return self._maybe_eval(res, eval_in_python)
elif self.engine != "pytables":
if (
getattr(lhs, "return_type", None) == object
or is_string_dtype(getattr(lhs, "return_type", None))
or getattr(rhs, "return_type", None) == object
or is_string_dtype(getattr(rhs, "return_type", None))
):
# evaluate "==" and "!=" in python if either of our operands
# has an object or string return type
return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
return res
def visit_BinOp(self, node, **kwargs):
op, op_class, left, right = self._maybe_transform_eq_ne(node)
left, right = self._maybe_downcast_constants(left, right)
return self._maybe_evaluate_binop(op, op_class, left, right)
def visit_UnaryOp(self, node, **kwargs):
op = self.visit(node.op)
operand = self.visit(node.operand)
return op(operand)
def visit_Name(self, node, **kwargs) -> Term:
return self.term_type(node.id, self.env, **kwargs)
# TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min
def visit_NameConstant(self, node, **kwargs) -> Term:
return self.const_type(node.value, self.env)
# TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min
def visit_Num(self, node, **kwargs) -> Term:
return self.const_type(node.value, self.env)
def visit_Constant(self, node, **kwargs) -> Term:
return self.const_type(node.value, self.env)
# TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min
def visit_Str(self, node, **kwargs) -> Term:
name = self.env.add_tmp(node.s)
return self.term_type(name, self.env)
def visit_List(self, node, **kwargs) -> Term:
name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
return self.term_type(name, self.env)
visit_Tuple = visit_List
def visit_Index(self, node, **kwargs):
"""df.index[4]"""
return self.visit(node.value)
def visit_Subscript(self, node, **kwargs) -> Term:
from pandas import eval as pd_eval
value = self.visit(node.value)
slobj = self.visit(node.slice)
result = pd_eval(
slobj, local_dict=self.env, engine=self.engine, parser=self.parser
)
try:
# a Term instance
v = value.value[result]
except AttributeError:
# an Op instance
lhs = pd_eval(
value, local_dict=self.env, engine=self.engine, parser=self.parser
)
v = lhs[result]
name = self.env.add_tmp(v)
return self.term_type(name, env=self.env)
def visit_Slice(self, node, **kwargs) -> slice:
"""df.index[slice(4,6)]"""
lower = node.lower
if lower is not None:
lower = self.visit(lower).value
upper = node.upper
if upper is not None:
upper = self.visit(upper).value
step = node.step
if step is not None:
step = self.visit(step).value
return slice(lower, upper, step)
def visit_Assign(self, node, **kwargs):
"""
support a single assignment node, like
c = a + b
set the assigner at the top level, must be a Name node which
might or might not exist in the resolvers
"""
if len(node.targets) != 1:
raise SyntaxError("can only assign a single expression")
if not isinstance(node.targets[0], ast.Name):
raise SyntaxError("left hand side of an assignment must be a single name")
if self.env.target is None:
raise ValueError("cannot assign without a target object")
try:
assigner = self.visit(node.targets[0], **kwargs)
except UndefinedVariableError:
assigner = node.targets[0].id
self.assigner = getattr(assigner, "name", assigner)
if self.assigner is None:
raise SyntaxError(
"left hand side of an assignment must be a single resolvable name"
)
return self.visit(node.value, **kwargs)
def visit_Attribute(self, node, **kwargs):
attr = node.attr
value = node.value
ctx = node.ctx
if isinstance(ctx, ast.Load):
# resolve the value
resolved = self.visit(value).value
try:
v = getattr(resolved, attr)
name = self.env.add_tmp(v)
return self.term_type(name, self.env)
except AttributeError:
# something like datetime.datetime where scope is overridden
if isinstance(value, ast.Name) and value.id == attr:
return resolved
raise
raise ValueError(f"Invalid Attribute context {type(ctx).__name__}")
def visit_Call(self, node, side=None, **kwargs):
if isinstance(node.func, ast.Attribute) and node.func.attr != "__call__":
res = self.visit_Attribute(node.func)
elif not isinstance(node.func, ast.Name):
raise TypeError("Only named functions are supported")
else:
try:
res = self.visit(node.func)
except UndefinedVariableError:
# Check if this is a supported function name
try:
res = FuncNode(node.func.id)
except ValueError:
# Raise original error
raise
if res is None:
# error: "expr" has no attribute "id"
raise ValueError(
f"Invalid function call {node.func.id}" # type: ignore[attr-defined]
)
if hasattr(res, "value"):
res = res.value
if isinstance(res, FuncNode):
new_args = [self.visit(arg) for arg in node.args]
if node.keywords:
raise TypeError(
f'Function "{res.name}" does not support keyword arguments'
)
return res(*new_args)
else:
new_args = [self.visit(arg)(self.env) for arg in node.args]
for key in node.keywords:
if not isinstance(key, ast.keyword):
# error: "expr" has no attribute "id"
raise ValueError(
"keyword error in function call "
f"'{node.func.id}'" # type: ignore[attr-defined]
)
if key.arg:
kwargs[key.arg] = self.visit(key.value)(self.env)
name = self.env.add_tmp(res(*new_args, **kwargs))
return self.term_type(name=name, env=self.env)
def translate_In(self, op):
return op
def visit_Compare(self, node, **kwargs):
ops = node.ops
comps = node.comparators
# base case: we have something like a CMP b
if len(comps) == 1:
op = self.translate_In(ops[0])
binop = ast.BinOp(op=op, left=node.left, right=comps[0])
return self.visit(binop)
# recursive case: we have a chained comparison, a CMP b CMP c, etc.
left = node.left
values = []
for op, comp in zip(ops, comps):
new_node = self.visit(
ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])
)
left = comp
values.append(new_node)
return self.visit(ast.BoolOp(op=ast.And(), values=values))
def _try_visit_binop(self, bop):
if isinstance(bop, (Op, Term)):
return bop
return self.visit(bop)
def visit_BoolOp(self, node, **kwargs):
def visitor(x, y):
lhs = self._try_visit_binop(x)
rhs = self._try_visit_binop(y)
op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs)
return self._maybe_evaluate_binop(op, node.op, lhs, rhs)
operands = node.values
return reduce(visitor, operands)
_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"])
_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS)
@disallow(
(_unsupported_nodes | _python_not_supported)
- (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"]))
)
class PandasExprVisitor(BaseExprVisitor):
def __init__(
self,
env,
engine,
parser,
preparser=partial(
_preparse,
f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
),
) -> None:
super().__init__(env, engine, parser, preparser)
@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"]))
class PythonExprVisitor(BaseExprVisitor):
def __init__(
self, env, engine, parser, preparser=lambda source, f=None: source
) -> None:
super().__init__(env, engine, parser, preparser=preparser)
class Expr:
"""
Object encapsulating an expression.
Parameters
----------
expr : str
engine : str, optional, default 'numexpr'
parser : str, optional, default 'pandas'
env : Scope, optional, default None
level : int, optional, default 2
"""
env: Scope
engine: str
parser: str
def __init__(
self,
expr,
engine: str = "numexpr",
parser: str = "pandas",
env: Scope | None = None,
level: int = 0,
) -> None:
self.expr = expr
self.env = env or Scope(level=level + 1)
self.engine = engine
self.parser = parser
self._visitor = PARSERS[parser](self.env, self.engine, self.parser)
self.terms = self.parse()
@property
def assigner(self):
return getattr(self._visitor, "assigner", None)
def __call__(self):
return self.terms(self.env)
def __repr__(self) -> str:
return printing.pprint_thing(self.terms)
def __len__(self) -> int:
return len(self.expr)
def parse(self):
"""
Parse an expression.
"""
return self._visitor.visit(self.expr)
@property
def names(self):
"""
Get the names in an expression.
"""
if is_term(self.terms):
return frozenset([self.terms.name])
return frozenset(term.name for term in com.flatten(self.terms))
PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor}

View File

@ -0,0 +1,286 @@
"""
Expressions
-----------
Offer fast expression evaluation through numexpr
"""
from __future__ import annotations
import operator
from typing import TYPE_CHECKING
import warnings
import numpy as np
from pandas._config import get_option
from pandas.util._exceptions import find_stack_level
from pandas.core import roperator
from pandas.core.computation.check import NUMEXPR_INSTALLED
if NUMEXPR_INSTALLED:
import numexpr as ne
if TYPE_CHECKING:
from pandas._typing import FuncType
_TEST_MODE: bool | None = None
_TEST_RESULT: list[bool] = []
USE_NUMEXPR = NUMEXPR_INSTALLED
_evaluate: FuncType | None = None
_where: FuncType | None = None
# the set of dtypes that we will allow pass to numexpr
_ALLOWED_DTYPES = {
"evaluate": {"int64", "int32", "float64", "float32", "bool"},
"where": {"int64", "float64", "bool"},
}
# the minimum prod shape that we will use numexpr
_MIN_ELEMENTS = 1_000_000
def set_use_numexpr(v: bool = True) -> None:
# set/unset to use numexpr
global USE_NUMEXPR
if NUMEXPR_INSTALLED:
USE_NUMEXPR = v
# choose what we are going to do
global _evaluate, _where
_evaluate = _evaluate_numexpr if USE_NUMEXPR else _evaluate_standard
_where = _where_numexpr if USE_NUMEXPR else _where_standard
def set_numexpr_threads(n=None) -> None:
# if we are using numexpr, set the threads to n
# otherwise reset
if NUMEXPR_INSTALLED and USE_NUMEXPR:
if n is None:
n = ne.detect_number_of_cores()
ne.set_num_threads(n)
def _evaluate_standard(op, op_str, a, b):
"""
Standard evaluation.
"""
if _TEST_MODE:
_store_test_result(False)
return op(a, b)
def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
"""return a boolean if we WILL be using numexpr"""
if op_str is not None:
# required min elements (otherwise we are adding overhead)
if a.size > _MIN_ELEMENTS:
# check for dtype compatibility
dtypes: set[str] = set()
for o in [a, b]:
# ndarray and Series Case
if hasattr(o, "dtype"):
dtypes |= {o.dtype.name}
# allowed are a superset
if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes:
return True
return False
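# A hedged illustration of the size/dtype gate above (thresholds come from
# _MIN_ELEMENTS and _ALLOWED_DTYPES):
#     >>> small = np.ones(10, dtype="float64")
#     >>> big = np.ones(2_000_000, dtype="float64")
#     >>> _can_use_numexpr(operator.add, "+", small, small, "evaluate")
#     False
#     >>> _can_use_numexpr(operator.add, "+", big, big, "evaluate")
#     True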
def _evaluate_numexpr(op, op_str, a, b):
result = None
if _can_use_numexpr(op, op_str, a, b, "evaluate"):
is_reversed = op.__name__.strip("_").startswith("r")
if is_reversed:
# we were originally called by a reversed op method
a, b = b, a
a_value = a
b_value = b
try:
result = ne.evaluate(
f"a_value {op_str} b_value",
local_dict={"a_value": a_value, "b_value": b_value},
casting="safe",
)
except TypeError:
# numexpr raises eg for array ** array with integers
# (https://github.com/pydata/numexpr/issues/379)
pass
except NotImplementedError:
if _bool_arith_fallback(op_str, a, b):
pass
else:
raise
if is_reversed:
# reverse order to original for fallback
a, b = b, a
if _TEST_MODE:
_store_test_result(result is not None)
if result is None:
result = _evaluate_standard(op, op_str, a, b)
return result
_op_str_mapping = {
operator.add: "+",
roperator.radd: "+",
operator.mul: "*",
roperator.rmul: "*",
operator.sub: "-",
roperator.rsub: "-",
operator.truediv: "/",
roperator.rtruediv: "/",
# floordiv not supported by numexpr 2.x
operator.floordiv: None,
roperator.rfloordiv: None,
# we require Python semantics for mod of negative for backwards compatibility
# see https://github.com/pydata/numexpr/issues/365
# so sticking with unaccelerated for now GH#36552
operator.mod: None,
roperator.rmod: None,
operator.pow: "**",
roperator.rpow: "**",
operator.eq: "==",
operator.ne: "!=",
operator.le: "<=",
operator.lt: "<",
operator.ge: ">=",
operator.gt: ">",
operator.and_: "&",
roperator.rand_: "&",
operator.or_: "|",
roperator.ror_: "|",
operator.xor: "^",
roperator.rxor: "^",
divmod: None,
roperator.rdivmod: None,
}
def _where_standard(cond, a, b):
# Caller is responsible for extracting ndarray if necessary
return np.where(cond, a, b)
def _where_numexpr(cond, a, b):
# Caller is responsible for extracting ndarray if necessary
result = None
if _can_use_numexpr(None, "where", a, b, "where"):
result = ne.evaluate(
"where(cond_value, a_value, b_value)",
local_dict={"cond_value": cond, "a_value": a, "b_value": b},
casting="safe",
)
if result is None:
result = _where_standard(cond, a, b)
return result
# turn myself on
set_use_numexpr(get_option("compute.use_numexpr"))
def _has_bool_dtype(x):
try:
return x.dtype == bool
except AttributeError:
return isinstance(x, (bool, np.bool_))
_BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"}
def _bool_arith_fallback(op_str, a, b) -> bool:
"""
Check if we should fall back to the Python `_evaluate_standard` in case
of an unsupported operation by numexpr, which is the case for some
boolean ops.
"""
if _has_bool_dtype(a) and _has_bool_dtype(b):
if op_str in _BOOL_OP_UNSUPPORTED:
warnings.warn(
f"evaluating in Python space because the {repr(op_str)} "
"operator is not supported by numexpr for the bool dtype, "
f"use {repr(_BOOL_OP_UNSUPPORTED[op_str])} instead.",
stacklevel=find_stack_level(),
)
return True
return False
def evaluate(op, a, b, use_numexpr: bool = True):
"""
Evaluate and return the expression of the op on a and b.
Parameters
----------
op : the operator callable to apply
a : left operand
b : right operand
use_numexpr : bool, default True
Whether to try to use numexpr.
"""
op_str = _op_str_mapping[op]
if op_str is not None:
if use_numexpr:
# error: "None" not callable
return _evaluate(op, op_str, a, b) # type: ignore[misc]
return _evaluate_standard(op, op_str, a, b)
def where(cond, a, b, use_numexpr: bool = True):
"""
Evaluate the where condition cond on a and b.
Parameters
----------
cond : np.ndarray[bool]
a : return if cond is True
b : return if cond is False
use_numexpr : bool, default True
Whether to try to use numexpr.
"""
assert _where is not None
return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b)
def set_test_mode(v: bool = True) -> None:
"""
Keeps track of whether numexpr was used.
Stores an additional ``True`` for every successful use of evaluate with
numexpr since the last ``get_test_result``.
"""
global _TEST_MODE, _TEST_RESULT
_TEST_MODE = v
_TEST_RESULT = []
def _store_test_result(used_numexpr: bool) -> None:
if used_numexpr:
_TEST_RESULT.append(used_numexpr)
def get_test_result() -> list[bool]:
"""
Get test result and reset test_results.
"""
global _TEST_RESULT
res = _TEST_RESULT
_TEST_RESULT = []
return res
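# Illustrative usage sketch (added; not part of the original module). It assumes
# only what is defined above: evaluate() transparently falls back to the plain
# numpy implementation when numexpr is unavailable or the operands are too small.
if __name__ == "__main__":
    _a = np.random.rand(2_000_000)
    _b = np.random.rand(2_000_000)

    set_test_mode(True)
    _big = evaluate(operator.add, _a, _b)  # large float64 arrays -> numexpr eligible
    _small = evaluate(operator.add, _a[:10], _b[:10])  # below _MIN_ELEMENTS -> numpy
    print(get_test_result())  # [True] if numexpr was actually used for the big case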

View File

@ -0,0 +1,572 @@
"""
Operator classes for eval.
"""
from __future__ import annotations
from datetime import datetime
from functools import partial
import operator
from typing import (
TYPE_CHECKING,
Callable,
Literal,
)
import numpy as np
from pandas._libs.tslibs import Timestamp
from pandas.core.dtypes.common import (
is_list_like,
is_scalar,
)
import pandas.core.common as com
from pandas.core.computation.common import (
ensure_decoded,
result_type_many,
)
from pandas.core.computation.scope import DEFAULT_GLOBALS
from pandas.io.formats.printing import (
pprint_thing,
pprint_thing_encoded,
)
if TYPE_CHECKING:
from collections.abc import (
Iterable,
Iterator,
)
REDUCTIONS = ("sum", "prod", "min", "max")
_unary_math_ops = (
"sin",
"cos",
"exp",
"log",
"expm1",
"log1p",
"sqrt",
"sinh",
"cosh",
"tanh",
"arcsin",
"arccos",
"arctan",
"arccosh",
"arcsinh",
"arctanh",
"abs",
"log10",
"floor",
"ceil",
)
_binary_math_ops = ("arctan2",)
MATHOPS = _unary_math_ops + _binary_math_ops
LOCAL_TAG = "__pd_eval_local_"
class Term:
def __new__(cls, name, env, side=None, encoding=None):
klass = Constant if not isinstance(name, str) else cls
# error: Argument 2 for "super" not an instance of argument 1
supr_new = super(Term, klass).__new__ # type: ignore[misc]
return supr_new(klass)
is_local: bool
def __init__(self, name, env, side=None, encoding=None) -> None:
# name is a str for Term, but may be something else for subclasses
self._name = name
self.env = env
self.side = side
tname = str(name)
self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS
self._value = self._resolve_name()
self.encoding = encoding
@property
def local_name(self) -> str:
return self.name.replace(LOCAL_TAG, "")
def __repr__(self) -> str:
return pprint_thing(self.name)
def __call__(self, *args, **kwargs):
return self.value
def evaluate(self, *args, **kwargs) -> Term:
return self
def _resolve_name(self):
local_name = str(self.local_name)
is_local = self.is_local
if local_name in self.env.scope and isinstance(
self.env.scope[local_name], type
):
is_local = False
res = self.env.resolve(local_name, is_local=is_local)
self.update(res)
if hasattr(res, "ndim") and res.ndim > 2:
raise NotImplementedError(
"N-dimensional objects, where N > 2, are not supported with eval"
)
return res
def update(self, value) -> None:
"""
search order for local (i.e., @variable) variables:
scope, key_variable
[('locals', 'local_name'),
('globals', 'local_name'),
('locals', 'key'),
('globals', 'key')]
"""
key = self.name
# if it's a variable name (otherwise a constant)
if isinstance(key, str):
self.env.swapkey(self.local_name, key, new_value=value)
self.value = value
@property
def is_scalar(self) -> bool:
return is_scalar(self._value)
@property
def type(self):
try:
# potentially very slow for large, mixed dtype frames
return self._value.values.dtype
except AttributeError:
try:
# ndarray
return self._value.dtype
except AttributeError:
# scalar
return type(self._value)
return_type = type
@property
def raw(self) -> str:
return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})"
@property
def is_datetime(self) -> bool:
try:
t = self.type.type
except AttributeError:
t = self.type
return issubclass(t, (datetime, np.datetime64))
@property
def value(self):
return self._value
@value.setter
def value(self, new_value) -> None:
self._value = new_value
@property
def name(self):
return self._name
@property
def ndim(self) -> int:
return self._value.ndim
class Constant(Term):
def _resolve_name(self):
return self._name
@property
def name(self):
return self.value
def __repr__(self) -> str:
# in python 2 str() of float
# can truncate shorter than repr()
return repr(self.name)
_bool_op_map = {"not": "~", "and": "&", "or": "|"}
class Op:
"""
Hold an operator of arbitrary arity.
"""
op: str
def __init__(self, op: str, operands: Iterable[Term | Op], encoding=None) -> None:
self.op = _bool_op_map.get(op, op)
self.operands = operands
self.encoding = encoding
def __iter__(self) -> Iterator:
return iter(self.operands)
def __repr__(self) -> str:
"""
Print a generic n-ary operator and its operands using infix notation.
"""
# recurse over the operands
parened = (f"({pprint_thing(opr)})" for opr in self.operands)
return pprint_thing(f" {self.op} ".join(parened))
@property
def return_type(self):
# clobber types to bool if the op is a boolean operator
if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS):
return np.bool_
return result_type_many(*(term.type for term in com.flatten(self)))
@property
def has_invalid_return_type(self) -> bool:
types = self.operand_types
obj_dtype_set = frozenset([np.dtype("object")])
return self.return_type == object and types - obj_dtype_set
@property
def operand_types(self):
return frozenset(term.type for term in com.flatten(self))
@property
def is_scalar(self) -> bool:
return all(operand.is_scalar for operand in self.operands)
@property
def is_datetime(self) -> bool:
try:
t = self.return_type.type
except AttributeError:
t = self.return_type
return issubclass(t, (datetime, np.datetime64))
def _in(x, y):
"""
Compute the vectorized membership of ``x in y`` if possible, otherwise
use Python.
"""
try:
return x.isin(y)
except AttributeError:
if is_list_like(x):
try:
return y.isin(x)
except AttributeError:
pass
return x in y
def _not_in(x, y):
"""
Compute the vectorized membership of ``x not in y`` if possible,
otherwise use Python.
"""
try:
return ~x.isin(y)
except AttributeError:
if is_list_like(x):
try:
return ~y.isin(x)
except AttributeError:
pass
return x not in y
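# Illustrative note (added): the vectorized path is taken when either operand
# exposes .isin (e.g. a pandas Series, assuming pandas is imported as pd);
# otherwise plain Python membership is used, e.g.
#
#   _in(pd.Series([1, 2, 3]), [2, 3])   # -> Series([False, True, True])
#   _in(2, [1, 2, 3])                   # -> True (falls back to `2 in [1, 2, 3]`)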
CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in")
_cmp_ops_funcs = (
operator.gt,
operator.lt,
operator.ge,
operator.le,
operator.eq,
operator.ne,
_in,
_not_in,
)
_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs))
BOOL_OPS_SYMS = ("&", "|", "and", "or")
_bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_)
_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs))
ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%")
_arith_ops_funcs = (
operator.add,
operator.sub,
operator.mul,
operator.truediv,
operator.pow,
operator.floordiv,
operator.mod,
)
_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs))
SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%")
_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod)
_special_case_arith_ops_dict = dict(
zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs)
)
_binary_ops_dict = {}
for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict):
_binary_ops_dict.update(d)
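# Illustrative note (added): _binary_ops_dict is what the parser uses to map
# operator tokens onto callables, e.g.
#
#   _binary_ops_dict[">"](3, 2)   # -> True  (operator.gt)
#   _binary_ops_dict["+"](3, 2)   # -> 5     (operator.add)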
def is_term(obj) -> bool:
return isinstance(obj, Term)
class BinOp(Op):
"""
Hold a binary operator and its operands.
Parameters
----------
op : str
lhs : Term or Op
rhs : Term or Op
"""
def __init__(self, op: str, lhs, rhs) -> None:
super().__init__(op, (lhs, rhs))
self.lhs = lhs
self.rhs = rhs
self._disallow_scalar_only_bool_ops()
self.convert_values()
try:
self.func = _binary_ops_dict[op]
except KeyError as err:
# has to be made a list for python3
keys = list(_binary_ops_dict.keys())
raise ValueError(
f"Invalid binary operator {repr(op)}, valid operators are {keys}"
) from err
def __call__(self, env):
"""
Recursively evaluate an expression in Python space.
Parameters
----------
env : Scope
Returns
-------
object
The result of an evaluated expression.
"""
# recurse over the left/right nodes
left = self.lhs(env)
right = self.rhs(env)
return self.func(left, right)
def evaluate(self, env, engine: str, parser, term_type, eval_in_python):
"""
Evaluate a binary operation *before* being passed to the engine.
Parameters
----------
env : Scope
engine : str
parser : str
term_type : type
eval_in_python : list
Returns
-------
term_type
The "pre-evaluated" expression as an instance of ``term_type``
"""
if engine == "python":
res = self(env)
else:
# recurse over the left/right nodes
left = self.lhs.evaluate(
env,
engine=engine,
parser=parser,
term_type=term_type,
eval_in_python=eval_in_python,
)
right = self.rhs.evaluate(
env,
engine=engine,
parser=parser,
term_type=term_type,
eval_in_python=eval_in_python,
)
# base cases
if self.op in eval_in_python:
res = self.func(left.value, right.value)
else:
from pandas.core.computation.eval import eval
res = eval(self, local_dict=env, engine=engine, parser=parser)
name = env.add_tmp(res)
return term_type(name, env=env)
def convert_values(self) -> None:
"""
Convert datetimes to a comparable value in an expression.
"""
def stringify(value):
encoder: Callable
if self.encoding is not None:
encoder = partial(pprint_thing_encoded, encoding=self.encoding)
else:
encoder = pprint_thing
return encoder(value)
lhs, rhs = self.lhs, self.rhs
if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar:
v = rhs.value
if isinstance(v, (int, float)):
v = stringify(v)
v = Timestamp(ensure_decoded(v))
if v.tz is not None:
v = v.tz_convert("UTC")
self.rhs.update(v)
if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar:
v = lhs.value
if isinstance(v, (int, float)):
v = stringify(v)
v = Timestamp(ensure_decoded(v))
if v.tz is not None:
v = v.tz_convert("UTC")
self.lhs.update(v)
def _disallow_scalar_only_bool_ops(self):
rhs = self.rhs
lhs = self.lhs
# GH#24883 unwrap dtype if necessary to ensure we have a type object
rhs_rt = rhs.return_type
rhs_rt = getattr(rhs_rt, "type", rhs_rt)
lhs_rt = lhs.return_type
lhs_rt = getattr(lhs_rt, "type", lhs_rt)
if (
(lhs.is_scalar or rhs.is_scalar)
and self.op in _bool_ops_dict
and (
not (
issubclass(rhs_rt, (bool, np.bool_))
and issubclass(lhs_rt, (bool, np.bool_))
)
)
):
raise NotImplementedError("cannot evaluate scalar only bool ops")
def isnumeric(dtype) -> bool:
return issubclass(np.dtype(dtype).type, np.number)
UNARY_OPS_SYMS = ("+", "-", "~", "not")
_unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert)
_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs))
class UnaryOp(Op):
"""
Hold a unary operator and its operands.
Parameters
----------
op : str
The token used to represent the operator.
operand : Term or Op
The Term or Op operand to the operator.
Raises
------
ValueError
* If no function associated with the passed operator token is found.
"""
def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None:
super().__init__(op, (operand,))
self.operand = operand
try:
self.func = _unary_ops_dict[op]
except KeyError as err:
raise ValueError(
f"Invalid unary operator {repr(op)}, "
f"valid operators are {UNARY_OPS_SYMS}"
) from err
def __call__(self, env) -> MathCall:
operand = self.operand(env)
# error: Cannot call function of unknown type
return self.func(operand) # type: ignore[operator]
def __repr__(self) -> str:
return pprint_thing(f"{self.op}({self.operand})")
@property
def return_type(self) -> np.dtype:
operand = self.operand
if operand.return_type == np.dtype("bool"):
return np.dtype("bool")
if isinstance(operand, Op) and (
operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict
):
return np.dtype("bool")
return np.dtype("int")
class MathCall(Op):
def __init__(self, func, args) -> None:
super().__init__(func.name, args)
self.func = func
def __call__(self, env):
# error: "Op" not callable
operands = [op(env) for op in self.operands] # type: ignore[operator]
return self.func.func(*operands)
def __repr__(self) -> str:
operands = map(str, self.operands)
return pprint_thing(f"{self.op}({','.join(operands)})")
class FuncNode:
def __init__(self, name: str) -> None:
if name not in MATHOPS:
raise ValueError(f'"{name}" is not a supported function')
self.name = name
self.func = getattr(np, name)
def __call__(self, *args) -> MathCall:
return MathCall(self, args)
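# Illustrative sketch (added; not part of the original module): FuncNode wraps a
# supported numpy function by name, while unsupported names raise immediately.
if __name__ == "__main__":
    print(FuncNode("sin").func is np.sin)  # -> True
    try:
        FuncNode("bogus")
    except ValueError as err:
        print(err)  # -> "bogus" is not a supported function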

View File

@ -0,0 +1,198 @@
"""
:func:`~pandas.eval` source string parsing functions
"""
from __future__ import annotations
from io import StringIO
from keyword import iskeyword
import token
import tokenize
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
)
# A token value Python's tokenizer probably will never use.
BACKTICK_QUOTED_STRING = 100
def create_valid_python_identifier(name: str) -> str:
"""
Create valid Python identifiers from any string.
Check if name contains any special characters. If it contains any
special characters, the special characters will be replaced by
a special string and a prefix is added.
Raises
------
SyntaxError
If the returned name is not a valid Python identifier, raise an exception.
This can happen if there is a hashtag in the name, as the tokenizer will
then terminate and not find the backtick.
It can also happen for characters that fall outside the range (U+0001..U+007F).
"""
if name.isidentifier() and not iskeyword(name):
return name
# Create a dict with the special characters and their replacement string.
# EXACT_TOKEN_TYPES contains these special characters
# token.tok_name contains a readable description of the replacement string.
special_characters_replacements = {
char: f"_{token.tok_name[tokval]}_"
for char, tokval in (tokenize.EXACT_TOKEN_TYPES.items())
}
special_characters_replacements.update(
{
" ": "_",
"?": "_QUESTIONMARK_",
"!": "_EXCLAMATIONMARK_",
"$": "_DOLLARSIGN_",
"": "_EUROSIGN_",
"°": "_DEGREESIGN_",
# Including quotes works, but there are exceptions.
"'": "_SINGLEQUOTE_",
'"': "_DOUBLEQUOTE_",
# Currently not possible. Terminates parser and won't find backtick.
# "#": "_HASH_",
}
)
name = "".join([special_characters_replacements.get(char, char) for char in name])
name = f"BACKTICK_QUOTED_STRING_{name}"
if not name.isidentifier():
raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")
return name
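# Illustrative note (added): spaces and punctuation are replaced and a prefix is
# added so that the result survives Python tokenization, e.g.
#
#   create_valid_python_identifier("my col!")
#   # -> 'BACKTICK_QUOTED_STRING_my_col_EXCLAMATIONMARK_'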
def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
"""
Clean up a column name if surrounded by backticks.
Backtick quoted strings are indicated by a certain tokval value. If a string
is a backtick quoted token it will be processed by
:func:`create_valid_python_identifier` so that the parser can find this
string when the query is executed.
In this case the tok will get the NAME tokval.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tok : Tuple[int, str]
Either the input token or the replacement values
"""
toknum, tokval = tok
if toknum == BACKTICK_QUOTED_STRING:
return tokenize.NAME, create_valid_python_identifier(tokval)
return toknum, tokval
def clean_column_name(name: Hashable) -> Hashable:
"""
Function to emulate the cleaning of a backtick quoted name.
The purpose of this function is to see what happens to the name of an
identifier if it goes through the process of being parsed as Python code
inside a backtick quoted string and then being cleaned
(stripped of any special characters).
Parameters
----------
name : hashable
Name to be cleaned.
Returns
-------
name : hashable
Returns the name after tokenizing and cleaning.
Notes
-----
For some cases, a name cannot be converted to a valid Python identifier.
In that case :func:`tokenize_string` raises a SyntaxError and we just return
the name unmodified.
If such a name was used in the query string (which makes the query call
impossible), an error is raised by :func:`tokenize_backtick_quoted_string`
instead; that error is not caught and propagates to the user level.
"""
try:
tokenized = tokenize_string(f"`{name}`")
tokval = next(tokenized)[1]
return create_valid_python_identifier(tokval)
except SyntaxError:
return name
def tokenize_backtick_quoted_string(
token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
"""
Creates a token from a backtick quoted string.
Moves the token_generator forwards till right after the next backtick.
Parameters
----------
token_generator : Iterator[tokenize.TokenInfo]
The generator that yields the tokens of the source string (Tuple[int, str]).
The generator is at the first token after the backtick (`)
source : str
The Python source code string.
string_start : int
This is the start of backtick quoted string inside the source string.
Returns
-------
tok: Tuple[int, str]
The token that represents the backtick quoted string.
The integer is equal to BACKTICK_QUOTED_STRING (100).
"""
for _, tokval, start, _, _ in token_generator:
if tokval == "`":
string_end = start[1]
break
return BACKTICK_QUOTED_STRING, source[string_start:string_end]
def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
"""
Tokenize a Python source code string.
Parameters
----------
source : str
The Python source code string.
Returns
-------
tok_generator : Iterator[Tuple[int, str]]
An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
"""
line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)
# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted string
for toknum, tokval, start, _, _ in token_generator:
if tokval == "`":
try:
yield tokenize_backtick_quoted_string(
token_generator, source, string_start=start[1] + 1
)
except Exception as err:
raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
else:
yield toknum, tokval
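# Illustrative usage sketch (added; not part of the original module): a
# backtick-quoted column name is collapsed into a single token so that
# DataFrame.query/eval can later resolve it as a cleaned identifier.
if __name__ == "__main__":
    toks = list(tokenize_string("`my col` > 5"))
    print(toks[0])  # -> (100, 'my col'), i.e. (BACKTICK_QUOTED_STRING, 'my col')
    print(clean_column_name("my col"))  # -> 'BACKTICK_QUOTED_STRING_my_col'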

View File

@ -0,0 +1,666 @@
""" manage PyTables query interface via Expressions """
from __future__ import annotations
import ast
from decimal import (
Decimal,
InvalidOperation,
)
from functools import partial
from typing import (
TYPE_CHECKING,
Any,
ClassVar,
)
import numpy as np
from pandas._libs.tslibs import (
Timedelta,
Timestamp,
)
from pandas.errors import UndefinedVariableError
from pandas.core.dtypes.common import is_list_like
import pandas.core.common as com
from pandas.core.computation import (
expr,
ops,
scope as _scope,
)
from pandas.core.computation.common import ensure_decoded
from pandas.core.computation.expr import BaseExprVisitor
from pandas.core.computation.ops import is_term
from pandas.core.construction import extract_array
from pandas.core.indexes.base import Index
from pandas.io.formats.printing import (
pprint_thing,
pprint_thing_encoded,
)
if TYPE_CHECKING:
from pandas._typing import (
Self,
npt,
)
class PyTablesScope(_scope.Scope):
__slots__ = ("queryables",)
queryables: dict[str, Any]
def __init__(
self,
level: int,
global_dict=None,
local_dict=None,
queryables: dict[str, Any] | None = None,
) -> None:
super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict)
self.queryables = queryables or {}
class Term(ops.Term):
env: PyTablesScope
def __new__(cls, name, env, side=None, encoding=None):
if isinstance(name, str):
klass = cls
else:
klass = Constant
return object.__new__(klass)
def __init__(self, name, env: PyTablesScope, side=None, encoding=None) -> None:
super().__init__(name, env, side=side, encoding=encoding)
def _resolve_name(self):
# must be a queryable
if self.side == "left":
# Note: The behavior of __new__ ensures that self.name is a str here
if self.name not in self.env.queryables:
raise NameError(f"name {repr(self.name)} is not defined")
return self.name
# resolve the rhs (and allow it to be None)
try:
return self.env.resolve(self.name, is_local=False)
except UndefinedVariableError:
return self.name
# read-only property overwriting read/write property
@property # type: ignore[misc]
def value(self):
return self._value
class Constant(Term):
def __init__(self, name, env: PyTablesScope, side=None, encoding=None) -> None:
assert isinstance(env, PyTablesScope), type(env)
super().__init__(name, env, side=side, encoding=encoding)
def _resolve_name(self):
return self._name
class BinOp(ops.BinOp):
_max_selectors = 31
op: str
queryables: dict[str, Any]
condition: str | None
def __init__(self, op: str, lhs, rhs, queryables: dict[str, Any], encoding) -> None:
super().__init__(op, lhs, rhs)
self.queryables = queryables
self.encoding = encoding
self.condition = None
def _disallow_scalar_only_bool_ops(self) -> None:
pass
def prune(self, klass):
def pr(left, right):
"""create and return a new specialized BinOp from myself"""
if left is None:
return right
elif right is None:
return left
k = klass
if isinstance(left, ConditionBinOp):
if isinstance(right, ConditionBinOp):
k = JointConditionBinOp
elif isinstance(left, k):
return left
elif isinstance(right, k):
return right
elif isinstance(left, FilterBinOp):
if isinstance(right, FilterBinOp):
k = JointFilterBinOp
elif isinstance(left, k):
return left
elif isinstance(right, k):
return right
return k(
self.op, left, right, queryables=self.queryables, encoding=self.encoding
).evaluate()
left, right = self.lhs, self.rhs
if is_term(left) and is_term(right):
res = pr(left.value, right.value)
elif not is_term(left) and is_term(right):
res = pr(left.prune(klass), right.value)
elif is_term(left) and not is_term(right):
res = pr(left.value, right.prune(klass))
elif not (is_term(left) or is_term(right)):
res = pr(left.prune(klass), right.prune(klass))
return res
def conform(self, rhs):
"""inplace conform rhs"""
if not is_list_like(rhs):
rhs = [rhs]
if isinstance(rhs, np.ndarray):
rhs = rhs.ravel()
return rhs
@property
def is_valid(self) -> bool:
"""return True if this is a valid field"""
return self.lhs in self.queryables
@property
def is_in_table(self) -> bool:
"""
return True if this is a valid column name for generation (e.g. an
actual column in the table)
"""
return self.queryables.get(self.lhs) is not None
@property
def kind(self):
"""the kind of my field"""
return getattr(self.queryables.get(self.lhs), "kind", None)
@property
def meta(self):
"""the meta of my field"""
return getattr(self.queryables.get(self.lhs), "meta", None)
@property
def metadata(self):
"""the metadata of my field"""
return getattr(self.queryables.get(self.lhs), "metadata", None)
def generate(self, v) -> str:
"""create and return the op string for this TermValue"""
val = v.tostring(self.encoding)
return f"({self.lhs} {self.op} {val})"
def convert_value(self, v) -> TermValue:
"""
convert the expression that is in the term to something that is
accepted by pytables
"""
def stringify(value):
if self.encoding is not None:
return pprint_thing_encoded(value, encoding=self.encoding)
return pprint_thing(value)
kind = ensure_decoded(self.kind)
meta = ensure_decoded(self.meta)
if kind == "datetime" or (kind and kind.startswith("datetime64")):
if isinstance(v, (int, float)):
v = stringify(v)
v = ensure_decoded(v)
v = Timestamp(v).as_unit("ns")
if v.tz is not None:
v = v.tz_convert("UTC")
return TermValue(v, v._value, kind)
elif kind in ("timedelta64", "timedelta"):
if isinstance(v, str):
v = Timedelta(v)
else:
v = Timedelta(v, unit="s")
v = v.as_unit("ns")._value
return TermValue(int(v), v, kind)
elif meta == "category":
metadata = extract_array(self.metadata, extract_numpy=True)
result: npt.NDArray[np.intp] | np.intp | int
if v not in metadata:
result = -1
else:
result = metadata.searchsorted(v, side="left")
return TermValue(result, result, "integer")
elif kind == "integer":
try:
v_dec = Decimal(v)
except InvalidOperation:
# GH 54186
# convert v to float to raise float's ValueError
float(v)
else:
v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))
return TermValue(v, v, kind)
elif kind == "float":
v = float(v)
return TermValue(v, v, kind)
elif kind == "bool":
if isinstance(v, str):
v = v.strip().lower() not in [
"false",
"f",
"no",
"n",
"none",
"0",
"[]",
"{}",
"",
]
else:
v = bool(v)
return TermValue(v, v, kind)
elif isinstance(v, str):
# string quoting
return TermValue(v, stringify(v), "string")
else:
raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column")
def convert_values(self) -> None:
pass
class FilterBinOp(BinOp):
filter: tuple[Any, Any, Index] | None = None
def __repr__(self) -> str:
if self.filter is None:
return "Filter: Not Initialized"
return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]")
def invert(self) -> Self:
"""invert the filter"""
if self.filter is not None:
self.filter = (
self.filter[0],
self.generate_filter_op(invert=True),
self.filter[2],
)
return self
def format(self):
"""return the actual filter format"""
return [self.filter]
# error: Signature of "evaluate" incompatible with supertype "BinOp"
def evaluate(self) -> Self | None: # type: ignore[override]
if not self.is_valid:
raise ValueError(f"query term is not valid [{self}]")
rhs = self.conform(self.rhs)
values = list(rhs)
if self.is_in_table:
# if too many values to create the expression, use a filter instead
if self.op in ["==", "!="] and len(values) > self._max_selectors:
filter_op = self.generate_filter_op()
self.filter = (self.lhs, filter_op, Index(values))
return self
return None
# equality conditions
if self.op in ["==", "!="]:
filter_op = self.generate_filter_op()
self.filter = (self.lhs, filter_op, Index(values))
else:
raise TypeError(
f"passing a filterable condition to a non-table indexer [{self}]"
)
return self
def generate_filter_op(self, invert: bool = False):
if (self.op == "!=" and not invert) or (self.op == "==" and invert):
return lambda axis, vals: ~axis.isin(vals)
else:
return lambda axis, vals: axis.isin(vals)
class JointFilterBinOp(FilterBinOp):
def format(self):
raise NotImplementedError("unable to collapse Joint Filters")
# error: Signature of "evaluate" incompatible with supertype "BinOp"
def evaluate(self) -> Self: # type: ignore[override]
return self
class ConditionBinOp(BinOp):
def __repr__(self) -> str:
return pprint_thing(f"[Condition : [{self.condition}]]")
def invert(self):
"""invert the condition"""
# if self.condition is not None:
# self.condition = "~(%s)" % self.condition
# return self
raise NotImplementedError(
"cannot use an invert condition when passing to numexpr"
)
def format(self):
"""return the actual ne format"""
return self.condition
# error: Signature of "evaluate" incompatible with supertype "BinOp"
def evaluate(self) -> Self | None: # type: ignore[override]
if not self.is_valid:
raise ValueError(f"query term is not valid [{self}]")
# convert values if we are in the table
if not self.is_in_table:
return None
rhs = self.conform(self.rhs)
values = [self.convert_value(v) for v in rhs]
# equality conditions
if self.op in ["==", "!="]:
# too many values to create the expression?
if len(values) <= self._max_selectors:
vs = [self.generate(v) for v in values]
self.condition = f"({' | '.join(vs)})"
# use a filter after reading
else:
return None
else:
self.condition = self.generate(values[0])
return self
class JointConditionBinOp(ConditionBinOp):
# error: Signature of "evaluate" incompatible with supertype "BinOp"
def evaluate(self) -> Self: # type: ignore[override]
self.condition = f"({self.lhs.condition} {self.op} {self.rhs.condition})"
return self
class UnaryOp(ops.UnaryOp):
def prune(self, klass):
if self.op != "~":
raise NotImplementedError("UnaryOp only support invert type ops")
operand = self.operand
operand = operand.prune(klass)
if operand is not None and (
issubclass(klass, ConditionBinOp)
and operand.condition is not None
or not issubclass(klass, ConditionBinOp)
and issubclass(klass, FilterBinOp)
and operand.filter is not None
):
return operand.invert()
return None
class PyTablesExprVisitor(BaseExprVisitor):
const_type: ClassVar[type[ops.Term]] = Constant
term_type: ClassVar[type[Term]] = Term
def __init__(self, env, engine, parser, **kwargs) -> None:
super().__init__(env, engine, parser)
for bin_op in self.binary_ops:
bin_node = self.binary_op_nodes_map[bin_op]
setattr(
self,
f"visit_{bin_node}",
lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs),
)
def visit_UnaryOp(self, node, **kwargs) -> ops.Term | UnaryOp | None:
if isinstance(node.op, (ast.Not, ast.Invert)):
return UnaryOp("~", self.visit(node.operand))
elif isinstance(node.op, ast.USub):
return self.const_type(-self.visit(node.operand).value, self.env)
elif isinstance(node.op, ast.UAdd):
raise NotImplementedError("Unary addition not supported")
# TODO: return None might never be reached
return None
def visit_Index(self, node, **kwargs):
return self.visit(node.value).value
def visit_Assign(self, node, **kwargs):
cmpr = ast.Compare(
ops=[ast.Eq()], left=node.targets[0], comparators=[node.value]
)
return self.visit(cmpr)
def visit_Subscript(self, node, **kwargs) -> ops.Term:
# only allow simple subscripts
value = self.visit(node.value)
slobj = self.visit(node.slice)
try:
value = value.value
except AttributeError:
pass
if isinstance(slobj, Term):
# In py39 np.ndarray lookups with Term containing int raise
slobj = slobj.value
try:
return self.const_type(value[slobj], self.env)
except TypeError as err:
raise ValueError(
f"cannot subscript {repr(value)} with {repr(slobj)}"
) from err
def visit_Attribute(self, node, **kwargs):
attr = node.attr
value = node.value
ctx = type(node.ctx)
if ctx == ast.Load:
# resolve the value
resolved = self.visit(value)
# try to get the value to see if we are another expression
try:
resolved = resolved.value
except AttributeError:
pass
try:
return self.term_type(getattr(resolved, attr), self.env)
except AttributeError:
# something like datetime.datetime where scope is overridden
if isinstance(value, ast.Name) and value.id == attr:
return resolved
raise ValueError(f"Invalid Attribute context {ctx.__name__}")
def translate_In(self, op):
return ast.Eq() if isinstance(op, ast.In) else op
def _rewrite_membership_op(self, node, left, right):
return self.visit(node.op), node.op, left, right
def _validate_where(w):
"""
Validate that the where statement is of the right type.
The type may either be String, Expr, or list-like of Exprs.
Parameters
----------
w : String term expression, Expr, or list-like of Exprs.
Returns
-------
where : The original where clause if the check was successful.
Raises
------
TypeError : An invalid data type was passed in for w (e.g. dict).
"""
if not (isinstance(w, (PyTablesExpr, str)) or is_list_like(w)):
raise TypeError(
"where must be passed as a string, PyTablesExpr, "
"or list-like of PyTablesExpr"
)
return w
class PyTablesExpr(expr.Expr):
"""
Hold a pytables-like expression, comprised of possibly multiple 'terms'.
Parameters
----------
where : string term expression, PyTablesExpr, or list-like of PyTablesExprs
queryables : a "kinds" map (dict of column name -> kind), or None if column
is non-indexable
encoding : an encoding that will encode the query terms
Returns
-------
a PyTablesExpr object
Examples
--------
'index>=date'
"columns=['A', 'D']"
'columns=A'
'columns==A'
"~(columns=['A','B'])"
'index>df.index[3] & string="bar"'
'(index>df.index[3] & index<=df.index[6]) | string="bar"'
"ts>=Timestamp('2012-02-01')"
"major_axis>=20130101"
"""
_visitor: PyTablesExprVisitor | None
env: PyTablesScope
expr: str
def __init__(
self,
where,
queryables: dict[str, Any] | None = None,
encoding=None,
scope_level: int = 0,
) -> None:
where = _validate_where(where)
self.encoding = encoding
self.condition = None
self.filter = None
self.terms = None
self._visitor = None
# capture the environment if needed
local_dict: _scope.DeepChainMap[Any, Any] | None = None
if isinstance(where, PyTablesExpr):
local_dict = where.env.scope
_where = where.expr
elif is_list_like(where):
where = list(where)
for idx, w in enumerate(where):
if isinstance(w, PyTablesExpr):
local_dict = w.env.scope
else:
where[idx] = _validate_where(w)
_where = " & ".join([f"({w})" for w in com.flatten(where)])
else:
# _validate_where ensures we otherwise have a string
_where = where
self.expr = _where
self.env = PyTablesScope(scope_level + 1, local_dict=local_dict)
if queryables is not None and isinstance(self.expr, str):
self.env.queryables.update(queryables)
self._visitor = PyTablesExprVisitor(
self.env,
queryables=queryables,
parser="pytables",
engine="pytables",
encoding=encoding,
)
self.terms = self.parse()
def __repr__(self) -> str:
if self.terms is not None:
return pprint_thing(self.terms)
return pprint_thing(self.expr)
def evaluate(self):
"""create and return the numexpr condition and filter"""
try:
self.condition = self.terms.prune(ConditionBinOp)
except AttributeError as err:
raise ValueError(
f"cannot process expression [{self.expr}], [{self}] "
"is not a valid condition"
) from err
try:
self.filter = self.terms.prune(FilterBinOp)
except AttributeError as err:
raise ValueError(
f"cannot process expression [{self.expr}], [{self}] "
"is not a valid filter"
) from err
return self.condition, self.filter
class TermValue:
"""hold a term value the we use to construct a condition/filter"""
def __init__(self, value, converted, kind: str) -> None:
assert isinstance(kind, str), kind
self.value = value
self.converted = converted
self.kind = kind
def tostring(self, encoding) -> str:
"""quote the string if not encoded else encode and return"""
if self.kind == "string":
if encoding is not None:
return str(self.converted)
return f'"{self.converted}"'
elif self.kind == "float":
# python 2 str(float) is not always
# round-trippable so use repr()
return repr(self.converted)
return str(self.converted)
def maybe_expression(s) -> bool:
"""loose checking if s is a pytables-acceptable expression"""
if not isinstance(s, str):
return False
operations = PyTablesExprVisitor.binary_ops + PyTablesExprVisitor.unary_ops + ("=",)
# make sure we have an op at least
return any(op in s for op in operations)
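# Illustrative sketch (added; not part of the original module): maybe_expression
# is only a loose substring check for operator symbols; PyTablesExpr does the
# real parsing against a queryables mapping.
if __name__ == "__main__":
    print(maybe_expression("index >= '2020-01-01'"))  # -> True ('>=' is present)
    print(maybe_expression("foo"))                    # -> False (no operator symbol)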

View File

@ -0,0 +1,355 @@
"""
Module for scope operations
"""
from __future__ import annotations
from collections import ChainMap
import datetime
import inspect
from io import StringIO
import itertools
import pprint
import struct
import sys
from typing import TypeVar
import numpy as np
from pandas._libs.tslibs import Timestamp
from pandas.errors import UndefinedVariableError
_KT = TypeVar("_KT")
_VT = TypeVar("_VT")
# https://docs.python.org/3/library/collections.html#chainmap-examples-and-recipes
class DeepChainMap(ChainMap[_KT, _VT]):
"""
Variant of ChainMap that allows direct updates to inner scopes.
Only works when all passed mapping are mutable.
"""
def __setitem__(self, key: _KT, value: _VT) -> None:
for mapping in self.maps:
if key in mapping:
mapping[key] = value
return
self.maps[0][key] = value
def __delitem__(self, key: _KT) -> None:
"""
Raises
------
KeyError
If `key` doesn't exist.
"""
for mapping in self.maps:
if key in mapping:
del mapping[key]
return
raise KeyError(key)
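# Illustrative note (added): unlike a plain ChainMap, assignment updates the
# first mapping that already contains the key instead of always writing to
# maps[0], e.g.
#
#   inner = {"a": 1}
#   dcm = DeepChainMap({}, inner)
#   dcm["a"] = 2    # mutates `inner` rather than the outer (empty) dict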
def ensure_scope(
level: int, global_dict=None, local_dict=None, resolvers=(), target=None
) -> Scope:
"""Ensure that we are grabbing the correct scope."""
return Scope(
level + 1,
global_dict=global_dict,
local_dict=local_dict,
resolvers=resolvers,
target=target,
)
def _replacer(x) -> str:
"""
Replace a number with its hexadecimal representation. Used to tag
temporary variables with their calling scope's id.
"""
# get the hex repr of the binary char and remove 0x and pad by pad_size
# zeros
try:
hexin = ord(x)
except TypeError:
# bytes literals masquerade as ints when iterating in py3
hexin = x
return hex(hexin)
def _raw_hex_id(obj) -> str:
"""Return the padded hexadecimal id of ``obj``."""
# interpret as a pointer since that's really what id returns
packed = struct.pack("@P", id(obj))
return "".join([_replacer(x) for x in packed])
DEFAULT_GLOBALS = {
"Timestamp": Timestamp,
"datetime": datetime.datetime,
"True": True,
"False": False,
"list": list,
"tuple": tuple,
"inf": np.inf,
"Inf": np.inf,
}
def _get_pretty_string(obj) -> str:
"""
Return a prettier version of obj.
Parameters
----------
obj : object
Object to pretty print
Returns
-------
str
Pretty print object repr
"""
sio = StringIO()
pprint.pprint(obj, stream=sio)
return sio.getvalue()
class Scope:
"""
Object to hold scope, with a few bells to deal with some custom syntax
and contexts added by pandas.
Parameters
----------
level : int
global_dict : dict or None, optional, default None
local_dict : dict or Scope or None, optional, default None
resolvers : list-like or None, optional, default None
target : object
Attributes
----------
level : int
scope : DeepChainMap
target : object
temps : dict
"""
__slots__ = ["level", "scope", "target", "resolvers", "temps"]
level: int
scope: DeepChainMap
resolvers: DeepChainMap
temps: dict
def __init__(
self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None
) -> None:
self.level = level + 1
# shallow copy because we don't want to keep filling this up with what
# was there before if there are multiple calls to Scope/_ensure_scope
self.scope = DeepChainMap(DEFAULT_GLOBALS.copy())
self.target = target
if isinstance(local_dict, Scope):
self.scope.update(local_dict.scope)
if local_dict.target is not None:
self.target = local_dict.target
self._update(local_dict.level)
frame = sys._getframe(self.level)
try:
# shallow copy here because we don't want to replace what's in
# scope when we align terms (alignment accesses the underlying
# numpy array of pandas objects)
scope_global = self.scope.new_child(
(global_dict if global_dict is not None else frame.f_globals).copy()
)
self.scope = DeepChainMap(scope_global)
if not isinstance(local_dict, Scope):
scope_local = self.scope.new_child(
(local_dict if local_dict is not None else frame.f_locals).copy()
)
self.scope = DeepChainMap(scope_local)
finally:
del frame
# assumes that resolvers are going from outermost scope to inner
if isinstance(local_dict, Scope):
resolvers += tuple(local_dict.resolvers.maps)
self.resolvers = DeepChainMap(*resolvers)
self.temps = {}
def __repr__(self) -> str:
scope_keys = _get_pretty_string(list(self.scope.keys()))
res_keys = _get_pretty_string(list(self.resolvers.keys()))
return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})"
@property
def has_resolvers(self) -> bool:
"""
Return whether we have any extra scope.
For example, DataFrames pass their columns as resolvers during calls to
``DataFrame.eval()`` and ``DataFrame.query()``.
Returns
-------
hr : bool
"""
return bool(len(self.resolvers))
def resolve(self, key: str, is_local: bool):
"""
Resolve a variable name in a possibly local context.
Parameters
----------
key : str
A variable name
is_local : bool
Flag indicating whether the variable is local or not (prefixed with
the '@' symbol)
Returns
-------
value : object
The value of a particular variable
"""
try:
# only look for locals in outer scope
if is_local:
return self.scope[key]
# not a local variable so check in resolvers if we have them
if self.has_resolvers:
return self.resolvers[key]
# if we're here that means that we have no locals and we also have
# no resolvers
assert not is_local and not self.has_resolvers
return self.scope[key]
except KeyError:
try:
# last ditch effort we look in temporaries
# these are created when parsing indexing expressions
# e.g., df[df > 0]
return self.temps[key]
except KeyError as err:
raise UndefinedVariableError(key, is_local) from err
def swapkey(self, old_key: str, new_key: str, new_value=None) -> None:
"""
Replace a variable name, with a potentially new value.
Parameters
----------
old_key : str
Current variable name to replace
new_key : str
New variable name to replace `old_key` with
new_value : object
Value to be replaced along with the possible renaming
"""
if self.has_resolvers:
maps = self.resolvers.maps + self.scope.maps
else:
maps = self.scope.maps
maps.append(self.temps)
for mapping in maps:
if old_key in mapping:
mapping[new_key] = new_value
return
def _get_vars(self, stack, scopes: list[str]) -> None:
"""
Get specifically scoped variables from a list of stack frames.
Parameters
----------
stack : list
A list of stack frames as returned by ``inspect.stack()``
scopes : sequence of strings
A sequence containing valid stack frame attribute names that
evaluate to a dictionary. For example, ('locals', 'globals')
"""
variables = itertools.product(scopes, stack)
for scope, (frame, _, _, _, _, _) in variables:
try:
d = getattr(frame, f"f_{scope}")
self.scope = DeepChainMap(self.scope.new_child(d))
finally:
# won't remove it, but DECREF it
# in Py3 this probably isn't necessary since frame won't be
# scope after the loop
del frame
def _update(self, level: int) -> None:
"""
Update the current scope by going back `level` levels.
Parameters
----------
level : int
"""
sl = level + 1
# add sl frames to the scope starting with the
# most distant and overwriting with more current
# makes sure that we can capture variable scope
stack = inspect.stack()
try:
self._get_vars(stack[:sl], scopes=["locals"])
finally:
del stack[:], stack
def add_tmp(self, value) -> str:
"""
Add a temporary variable to the scope.
Parameters
----------
value : object
An arbitrary object to be assigned to a temporary variable.
Returns
-------
str
The name of the temporary variable created.
"""
name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}"
# add to innermost scope
assert name not in self.temps
self.temps[name] = value
assert name in self.temps
# only increment if the variable gets put in the scope
return name
@property
def ntemps(self) -> int:
"""The number of temporary variables in this scope"""
return len(self.temps)
@property
def full_scope(self) -> DeepChainMap:
"""
Return the full scope for use with passing to engines transparently
as a mapping.
Returns
-------
vars : DeepChainMap
All variables in this scope.
"""
maps = [self.temps] + self.resolvers.maps + self.scope.maps
return DeepChainMap(*maps)
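# Illustrative usage sketch (added; not part of the original module): Scope
# captures the calling frame's variables, and add_tmp registers temporaries
# that resolve() falls back to.
if __name__ == "__main__":
    x = 10
    scope = Scope(level=0)
    print(scope.resolve("x", is_local=False))  # -> 10, found in the captured frame
    tmp_name = scope.add_tmp([1, 2, 3])
    print(scope.resolve(tmp_name, is_local=False))  # -> [1, 2, 3]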

View File

@ -0,0 +1,941 @@
"""
This module is imported from the pandas package __init__.py file
in order to ensure that the core.config options registered here will
be available as soon as the user loads the package. If register_option
is invoked inside specific modules, they will not be registered until that
module is imported, which may or may not be a problem.
If you need to make sure options are available even before a certain
module is imported, register them here rather than in the module.
"""
from __future__ import annotations
import os
from typing import (
Any,
Callable,
)
import pandas._config.config as cf
from pandas._config.config import (
is_bool,
is_callable,
is_instance_factory,
is_int,
is_nonnegative_int,
is_one_of_factory,
is_str,
is_text,
)
# compute
use_bottleneck_doc = """
: bool
Use the bottleneck library to accelerate if it is installed,
the default is True
Valid values: False,True
"""
def use_bottleneck_cb(key) -> None:
from pandas.core import nanops
nanops.set_use_bottleneck(cf.get_option(key))
use_numexpr_doc = """
: bool
Use the numexpr library to accelerate computation if it is installed,
the default is True
Valid values: False,True
"""
def use_numexpr_cb(key) -> None:
from pandas.core.computation import expressions
expressions.set_use_numexpr(cf.get_option(key))
use_numba_doc = """
: bool
Use the numba engine option for select operations if it is installed,
the default is False
Valid values: False,True
"""
def use_numba_cb(key) -> None:
from pandas.core.util import numba_
numba_.set_use_numba(cf.get_option(key))
with cf.config_prefix("compute"):
cf.register_option(
"use_bottleneck",
True,
use_bottleneck_doc,
validator=is_bool,
cb=use_bottleneck_cb,
)
cf.register_option(
"use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb
)
cf.register_option(
"use_numba", False, use_numba_doc, validator=is_bool, cb=use_numba_cb
)
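# Illustrative note (added): the options registered above are driven through the
# public options API, e.g.
#
#   import pandas as pd
#   pd.set_option("compute.use_numexpr", False)  # routes through use_numexpr_cb
#   pd.get_option("compute.use_bottleneck")      # -> True by default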
#
# options from the "display" namespace
pc_precision_doc = """
: int
Floating point output precision in terms of number of places after the
decimal, for regular formatting as well as scientific notation. Similar
to ``precision`` in :meth:`numpy.set_printoptions`.
"""
pc_colspace_doc = """
: int
Default space for DataFrame columns.
"""
pc_max_rows_doc = """
: int
If max_rows is exceeded, switch to truncate view. Depending on
`large_repr`, objects are either centrally truncated or printed as
a summary view. 'None' value means unlimited.
In case python/IPython is running in a terminal and `large_repr`
equals 'truncate' this can be set to 0 and pandas will auto-detect
the height of the terminal and print a truncated object which fits
the screen height. The IPython notebook, IPython qtconsole, or
IDLE do not run in a terminal and hence it is not possible to do
correct auto-detection.
"""
pc_min_rows_doc = """
: int
The numbers of rows to show in a truncated view (when `max_rows` is
exceeded). Ignored when `max_rows` is set to None or 0. When set to
None, follows the value of `max_rows`.
"""
pc_max_cols_doc = """
: int
If max_cols is exceeded, switch to truncate view. Depending on
`large_repr`, objects are either centrally truncated or printed as
a summary view. 'None' value means unlimited.
In case python/IPython is running in a terminal and `large_repr`
equals 'truncate' this can be set to 0 or None and pandas will auto-detect
the width of the terminal and print a truncated object which fits
the screen width. The IPython notebook, IPython qtconsole, or IDLE
do not run in a terminal and hence it is not possible to do
correct auto-detection, in which case the default is 20.
"""
pc_max_categories_doc = """
: int
This sets the maximum number of categories pandas should output when
printing out a `Categorical` or a Series of dtype "category".
"""
pc_max_info_cols_doc = """
: int
max_info_columns is used in DataFrame.info method to decide if
per column information will be printed.
"""
pc_nb_repr_h_doc = """
: boolean
When True, IPython notebook will use html representation for
pandas objects (if it is available).
"""
pc_pprint_nest_depth = """
: int
Controls the number of nested levels to process when pretty-printing
"""
pc_multi_sparse_doc = """
: boolean
"sparsify" MultiIndex display (don't display repeated
elements in outer levels within groups)
"""
float_format_doc = """
: callable
The callable should accept a floating point number and return
a string with the desired format of the number. This is used
in some places like SeriesFormatter.
See formats.format.EngFormatter for an example.
"""
max_colwidth_doc = """
: int or None
The maximum width in characters of a column in the repr of
a pandas data structure. When the column overflows, a "..."
placeholder is embedded in the output. A 'None' value means unlimited.
"""
colheader_justify_doc = """
: 'left'/'right'
Controls the justification of column headers. Used by DataFrameFormatter.
"""
pc_expand_repr_doc = """
: boolean
Whether to print out the full DataFrame repr for wide DataFrames across
multiple lines, `max_columns` is still respected, but the output will
wrap-around across multiple "pages" if its width exceeds `display.width`.
"""
pc_show_dimensions_doc = """
: boolean or 'truncate'
Whether to print out dimensions at the end of DataFrame repr.
If 'truncate' is specified, only print out the dimensions if the
frame is truncated (e.g. not display all rows and/or columns)
"""
pc_east_asian_width_doc = """
: boolean
Whether to use the Unicode East Asian Width to calculate the display text
width.
Enabling this may affect the performance (default: False)
"""
pc_ambiguous_as_wide_doc = """
: boolean
Whether to handle Unicode characters belonging to Ambiguous as Wide (width=2)
(default: False)
"""
pc_table_schema_doc = """
: boolean
Whether to publish a Table Schema representation for frontends
that support it.
(default: False)
"""
pc_html_border_doc = """
: int
A ``border=value`` attribute is inserted in the ``<table>`` tag
for the DataFrame HTML repr.
"""
pc_html_use_mathjax_doc = """\
: boolean
When True, Jupyter notebook will process table contents using MathJax,
rendering mathematical expressions enclosed by the dollar symbol.
(default: True)
"""
pc_max_dir_items = """\
: int
The number of items that will be added to `dir(...)`. 'None' value means
unlimited. Because dir is cached, changing this option will not immediately
affect already existing dataframes until a column is deleted or added.
This is for instance used to suggest columns from a dataframe to tab
completion.
"""
pc_width_doc = """
: int
Width of the display in characters. In case python/IPython is running in
a terminal this can be set to None and pandas will correctly auto-detect
the width.
Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a
terminal and hence it is not possible to correctly detect the width.
"""
pc_chop_threshold_doc = """
: float or None
if set to a float value, all float values smaller than the given threshold
will be displayed as exactly 0 by repr and friends.
"""
pc_max_seq_items = """
: int or None
When pretty-printing a long sequence, no more than `max_seq_items`
will be printed. If items are omitted, they will be denoted by the
addition of "..." to the resulting string.
If set to None, the number of items to be printed is unlimited.
"""
pc_max_info_rows_doc = """
: int
df.info() will usually show null-counts for each column.
For large frames this can be quite slow. max_info_rows and max_info_cols
limit this null check only to frames with smaller dimensions than
specified.
"""
pc_large_repr_doc = """
: 'truncate'/'info'
For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can
show a truncated table, or switch to the view from
df.info() (the behaviour in earlier versions of pandas).
"""
pc_memory_usage_doc = """
: bool, string or None
This specifies if the memory usage of a DataFrame should be displayed when
df.info() is called. Valid values True,False,'deep'
"""
def table_schema_cb(key) -> None:
from pandas.io.formats.printing import enable_data_resource_formatter
enable_data_resource_formatter(cf.get_option(key))
def is_terminal() -> bool:
"""
Detect if Python is running in a terminal.
Returns True if Python is running in a terminal or False if not.
"""
try:
# error: Name 'get_ipython' is not defined
ip = get_ipython() # type: ignore[name-defined]
except NameError: # assume standard Python interpreter in a terminal
return True
else:
if hasattr(ip, "kernel"): # IPython as a Jupyter kernel
return False
else: # IPython in a terminal
return True
with cf.config_prefix("display"):
cf.register_option("precision", 6, pc_precision_doc, validator=is_nonnegative_int)
cf.register_option(
"float_format",
None,
float_format_doc,
validator=is_one_of_factory([None, is_callable]),
)
cf.register_option(
"max_info_rows",
1690785,
pc_max_info_rows_doc,
validator=is_int,
)
cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int)
cf.register_option(
"min_rows",
10,
pc_min_rows_doc,
validator=is_instance_factory([type(None), int]),
)
cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int)
cf.register_option(
"max_colwidth",
50,
max_colwidth_doc,
validator=is_nonnegative_int,
)
if is_terminal():
max_cols = 0 # automatically determine optimal number of columns
else:
max_cols = 20 # cannot determine optimal number of columns
cf.register_option(
"max_columns", max_cols, pc_max_cols_doc, validator=is_nonnegative_int
)
cf.register_option(
"large_repr",
"truncate",
pc_large_repr_doc,
validator=is_one_of_factory(["truncate", "info"]),
)
cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int)
cf.register_option(
"colheader_justify", "right", colheader_justify_doc, validator=is_text
)
cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool)
cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int)
cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool)
cf.register_option("expand_frame_repr", True, pc_expand_repr_doc)
cf.register_option(
"show_dimensions",
"truncate",
pc_show_dimensions_doc,
validator=is_one_of_factory([True, False, "truncate"]),
)
cf.register_option("chop_threshold", None, pc_chop_threshold_doc)
cf.register_option("max_seq_items", 100, pc_max_seq_items)
cf.register_option(
"width", 80, pc_width_doc, validator=is_instance_factory([type(None), int])
)
cf.register_option(
"memory_usage",
True,
pc_memory_usage_doc,
validator=is_one_of_factory([None, True, False, "deep"]),
)
cf.register_option(
"unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool
)
cf.register_option(
"unicode.ambiguous_as_wide", False, pc_east_asian_width_doc, validator=is_bool
)
cf.register_option(
"html.table_schema",
False,
pc_table_schema_doc,
validator=is_bool,
cb=table_schema_cb,
)
cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int)
cf.register_option(
"html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool
)
cf.register_option(
"max_dir_items", 100, pc_max_dir_items, validator=is_nonnegative_int
)
tc_sim_interactive_doc = """
: boolean
Whether to simulate interactive mode for purposes of testing
"""
with cf.config_prefix("mode"):
cf.register_option("sim_interactive", False, tc_sim_interactive_doc)
use_inf_as_na_doc = """
: boolean
True means treat None, NaN, INF, -INF as NA (old way),
False means None and NaN are null, but INF, -INF are not NA
(new way).
This option is deprecated in pandas 2.1.0 and will be removed in 3.0.
"""
# We don't want to start importing everything at the global context level
# or we'll hit circular deps.
def use_inf_as_na_cb(key) -> None:
# TODO(3.0): enforcing this deprecation will close GH#52501
from pandas.core.dtypes.missing import _use_inf_as_na
_use_inf_as_na(key)
with cf.config_prefix("mode"):
cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb)
cf.deprecate_option(
# GH#51684
"mode.use_inf_as_na",
"use_inf_as_na option is deprecated and will be removed in a future "
"version. Convert inf values to NaN before operating instead.",
)
data_manager_doc = """
: string
Internal data manager type; can be "block" or "array". Defaults to "block",
unless overridden by the 'PANDAS_DATA_MANAGER' environment variable (needs
to be set before pandas is imported).
"""
with cf.config_prefix("mode"):
cf.register_option(
"data_manager",
# Get the default from an environment variable, if set, otherwise defaults
# to "block". This environment variable can be set for testing.
os.environ.get("PANDAS_DATA_MANAGER", "block"),
data_manager_doc,
validator=is_one_of_factory(["block", "array"]),
)
cf.deprecate_option(
# GH#55043
"mode.data_manager",
"data_manager option is deprecated and will be removed in a future "
"version. Only the BlockManager will be available.",
)
# TODO better name?
copy_on_write_doc = """
: bool
Use new copy-view behaviour using Copy-on-Write. Defaults to False,
unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable
(if set to "1" for True, needs to be set before pandas is imported).
"""
with cf.config_prefix("mode"):
cf.register_option(
"copy_on_write",
# Get the default from an environment variable, if set, otherwise defaults
# to False. This environment variable can be set for testing.
"warn"
if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn"
else os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
copy_on_write_doc,
validator=is_one_of_factory([True, False, "warn"]),
)
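# Illustrative note (not part of the original module): given the expression
# above, the resolved default for ``mode.copy_on_write`` follows directly from
# the ``PANDAS_COPY_ON_WRITE`` environment variable, read before import:
#
#   >>> import os
#   >>> env = os.environ.get("PANDAS_COPY_ON_WRITE", "0")
#   >>> "warn" if env == "warn" else env == "1"
#   False
#
# i.e. unset -> False, "1" -> True, "warn" -> the literal string "warn".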
# user warnings
chained_assignment = """
: string
Raise an exception, warn, or take no action when trying to use chained
assignment. The default is warn.
"""
with cf.config_prefix("mode"):
cf.register_option(
"chained_assignment",
"warn",
chained_assignment,
validator=is_one_of_factory([None, "warn", "raise"]),
)
string_storage_doc = """
: string
The default storage for StringDtype.
"""
def is_valid_string_storage(value: Any) -> None:
legal_values = ["auto", "python", "pyarrow"]
if value not in legal_values:
msg = "Value must be one of python|pyarrow"
if value == "pyarrow_numpy":
# TODO: we can remove extra message after 3.0
msg += (
". 'pyarrow_numpy' was specified, but this option should be "
"enabled using pandas.options.future.infer_string instead"
)
raise ValueError(msg)
with cf.config_prefix("mode"):
cf.register_option(
"string_storage",
"auto",
string_storage_doc,
# validator=is_one_of_factory(["python", "pyarrow"]),
validator=is_valid_string_storage,
)
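# Illustrative note (not part of the original module): the custom validator
# above accepts only the three legal values; anything else, including the
# retired "pyarrow_numpy" alias, is rejected:
#
#   >>> is_valid_string_storage("pyarrow")        # accepted, returns None
#   >>> is_valid_string_storage("pyarrow_numpy")  # doctest: +ELLIPSIS
#   Traceback (most recent call last):
#   ...
#   ValueError: Value must be one of auto|python|pyarrow. 'pyarrow_numpy' ...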
# Set up the io.excel specific reader configuration.
reader_engine_doc = """
: string
The default Excel reader engine for '{ext}' files. Available options:
auto, {others}.
"""
_xls_options = ["xlrd", "calamine"]
_xlsm_options = ["xlrd", "openpyxl", "calamine"]
_xlsx_options = ["xlrd", "openpyxl", "calamine"]
_ods_options = ["odf", "calamine"]
_xlsb_options = ["pyxlsb", "calamine"]
with cf.config_prefix("io.excel.xls"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
validator=is_one_of_factory(_xls_options + ["auto"]),
)
with cf.config_prefix("io.excel.xlsm"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
validator=is_one_of_factory(_xlsm_options + ["auto"]),
)
with cf.config_prefix("io.excel.xlsx"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
validator=is_one_of_factory(_xlsx_options + ["auto"]),
)
with cf.config_prefix("io.excel.ods"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
validator=is_one_of_factory(_ods_options + ["auto"]),
)
with cf.config_prefix("io.excel.xlsb"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)),
validator=is_one_of_factory(_xlsb_options + ["auto"]),
)
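# Illustrative note (not part of the original module): these options are read
# back with pandas.get_option, and the per-extension validators reject engines
# that are not registered for that format:
#
#   >>> import pandas as pd
#   >>> pd.get_option("io.excel.xlsx.reader")
#   'auto'
#   >>> pd.set_option("io.excel.xlsx.reader", "odf")  # not an xlsx engine
#   Traceback (most recent call last):
#   ...
#   ValueError: ...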
# Set up the io.excel specific writer configuration.
writer_engine_doc = """
: string
The default Excel writer engine for '{ext}' files. Available options:
auto, {others}.
"""
_xlsm_options = ["openpyxl"]
_xlsx_options = ["openpyxl", "xlsxwriter"]
_ods_options = ["odf"]
with cf.config_prefix("io.excel.xlsm"):
cf.register_option(
"writer",
"auto",
writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
validator=str,
)
with cf.config_prefix("io.excel.xlsx"):
cf.register_option(
"writer",
"auto",
writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
validator=str,
)
with cf.config_prefix("io.excel.ods"):
cf.register_option(
"writer",
"auto",
writer_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
validator=str,
)
# Set up the io.parquet specific configuration.
parquet_engine_doc = """
: string
The default parquet reader/writer engine. Available options:
'auto', 'pyarrow', 'fastparquet', the default is 'auto'
"""
with cf.config_prefix("io.parquet"):
cf.register_option(
"engine",
"auto",
parquet_engine_doc,
validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]),
)
# Set up the io.sql specific configuration.
sql_engine_doc = """
: string
The default sql reader/writer engine. Available options:
'auto', 'sqlalchemy', the default is 'auto'
"""
with cf.config_prefix("io.sql"):
cf.register_option(
"engine",
"auto",
sql_engine_doc,
validator=is_one_of_factory(["auto", "sqlalchemy"]),
)
# --------
# Plotting
# ---------
plotting_backend_doc = """
: str
The plotting backend to use. The default value is "matplotlib", the
backend provided with pandas. Other backends can be specified by
providing the name of the module that implements the backend.
"""
def register_plotting_backend_cb(key) -> None:
if key == "matplotlib":
# We defer matplotlib validation, since it's the default
return
from pandas.plotting._core import _get_plot_backend
_get_plot_backend(key)
with cf.config_prefix("plotting"):
cf.register_option(
"backend",
defval="matplotlib",
doc=plotting_backend_doc,
validator=register_plotting_backend_cb,
)
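# Illustrative note (not part of the original module): the validator defers
# "matplotlib", so only third-party backends are imported eagerly when the
# option is set; unknown backends raise at set time:
#
#   >>> import pandas as pd
#   >>> pd.get_option("plotting.backend")
#   'matplotlib'
#   >>> pd.set_option("plotting.backend", "not_a_backend")
#   Traceback (most recent call last):
#   ...
#   ValueError: ...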
register_converter_doc = """
: bool or 'auto'.
Whether to register converters with matplotlib's units registry for
dates, times, datetimes, and Periods. Toggling to False will remove
the converters, restoring any converters that pandas overwrote.
"""
def register_converter_cb(key) -> None:
from pandas.plotting import (
deregister_matplotlib_converters,
register_matplotlib_converters,
)
if cf.get_option(key):
register_matplotlib_converters()
else:
deregister_matplotlib_converters()
with cf.config_prefix("plotting.matplotlib"):
cf.register_option(
"register_converters",
"auto",
register_converter_doc,
validator=is_one_of_factory(["auto", True, False]),
cb=register_converter_cb,
)
# ------
# Styler
# ------
styler_sparse_index_doc = """
: bool
Whether to sparsify the display of a hierarchical index. Setting to False will
display each explicit level element in a hierarchical key for each row.
"""
styler_sparse_columns_doc = """
: bool
Whether to sparsify the display of hierarchical columns. Setting to False will
display each explicit level element in a hierarchical key for each column.
"""
styler_render_repr = """
: str
Determine which output to use in Jupyter Notebook in {"html", "latex"}.
"""
styler_max_elements = """
: int
The maximum number of data-cell (<td>) elements that will be rendered before
trimming will occur over columns, rows or both if needed.
"""
styler_max_rows = """
: int, optional
The maximum number of rows that will be rendered. May still be reduced to
satisfy ``max_elements``, which takes precedence.
"""
styler_max_columns = """
: int, optional
The maximum number of columns that will be rendered. May still be reduced to
satisfy ``max_elements``, which takes precedence.
"""
styler_precision = """
: int
The precision for floats and complex numbers.
"""
styler_decimal = """
: str
The character representation for the decimal separator for floats and complex.
"""
styler_thousands = """
: str, optional
The character representation for thousands separator for floats, int and complex.
"""
styler_na_rep = """
: str, optional
The string representation for values identified as missing.
"""
styler_escape = """
: str, optional
Whether to escape certain characters according to the given context; html or latex.
"""
styler_formatter = """
: str, callable, dict, optional
A formatter object to be used as default within ``Styler.format``.
"""
styler_multirow_align = """
: {"c", "t", "b"}
The specifier for vertical alignment of sparsified LaTeX multirows.
"""
styler_multicol_align = r"""
: {"r", "c", "l", "naive-l", "naive-r"}
The specifier for horizontal alignment of sparsified LaTeX multicolumns. Pipe
decorators can also be added to non-naive values to draw vertical
rules, e.g. "\|r" will draw a rule on the left side of right aligned merged cells.
"""
styler_hrules = """
: bool
Whether to add horizontal rules on top and bottom and below the headers.
"""
styler_environment = """
: str
The environment to replace ``\\begin{table}``. If "longtable" is used, the
rendering uses a specific longtable environment format.
"""
styler_encoding = """
: str
The encoding used for output HTML and LaTeX files.
"""
styler_mathjax = """
: bool
If False will render special CSS classes to table attributes that indicate Mathjax
will not be used in Jupyter Notebook.
"""
with cf.config_prefix("styler"):
cf.register_option("sparse.index", True, styler_sparse_index_doc, validator=is_bool)
cf.register_option(
"sparse.columns", True, styler_sparse_columns_doc, validator=is_bool
)
cf.register_option(
"render.repr",
"html",
styler_render_repr,
validator=is_one_of_factory(["html", "latex"]),
)
cf.register_option(
"render.max_elements",
2**18,
styler_max_elements,
validator=is_nonnegative_int,
)
cf.register_option(
"render.max_rows",
None,
styler_max_rows,
validator=is_nonnegative_int,
)
cf.register_option(
"render.max_columns",
None,
styler_max_columns,
validator=is_nonnegative_int,
)
cf.register_option("render.encoding", "utf-8", styler_encoding, validator=is_str)
cf.register_option("format.decimal", ".", styler_decimal, validator=is_str)
cf.register_option(
"format.precision", 6, styler_precision, validator=is_nonnegative_int
)
cf.register_option(
"format.thousands",
None,
styler_thousands,
validator=is_instance_factory([type(None), str]),
)
cf.register_option(
"format.na_rep",
None,
styler_na_rep,
validator=is_instance_factory([type(None), str]),
)
cf.register_option(
"format.escape",
None,
styler_escape,
validator=is_one_of_factory([None, "html", "latex", "latex-math"]),
)
cf.register_option(
"format.formatter",
None,
styler_formatter,
validator=is_instance_factory([type(None), dict, Callable, str]),
)
cf.register_option("html.mathjax", True, styler_mathjax, validator=is_bool)
cf.register_option(
"latex.multirow_align",
"c",
styler_multirow_align,
validator=is_one_of_factory(["c", "t", "b", "naive"]),
)
val_mca = ["r", "|r|", "|r", "r|", "c", "|c|", "|c", "c|", "l", "|l|", "|l", "l|"]
val_mca += ["naive-l", "naive-r"]
cf.register_option(
"latex.multicol_align",
"r",
styler_multicol_align,
validator=is_one_of_factory(val_mca),
)
cf.register_option("latex.hrules", False, styler_hrules, validator=is_bool)
cf.register_option(
"latex.environment",
None,
styler_environment,
validator=is_instance_factory([type(None), str]),
)
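# Illustrative note (not part of the original module): styler options are
# ordinary pandas options, so they can be scoped with option_context (this
# sketch assumes jinja2 is installed, which Styler requires):
#
#   >>> import pandas as pd
#   >>> with pd.option_context("styler.format.precision", 2):
#   ...     html = pd.DataFrame({"x": [1.2345]}).style.to_html()
#   >>> "1.23" in html
#   True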
with cf.config_prefix("future"):
cf.register_option(
"infer_string",
True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False,
"Whether to infer sequence of str objects as pyarrow string "
"dtype, which will be the default in pandas 3.0 "
"(at which point this option will be deprecated).",
validator=is_one_of_factory([True, False]),
)
cf.register_option(
"no_silent_downcasting",
False,
"Whether to opt-in to the future behavior which will *not* silently "
"downcast results from Series and DataFrame `where`, `mask`, and `clip` "
"methods. "
"Silent downcasting will be removed in pandas 3.0 "
"(at which point this option will be deprecated).",
validator=is_one_of_factory([True, False]),
)

View File

@ -0,0 +1,821 @@
"""
Constructor functions intended to be shared by pd.array, Series.__init__,
and Index.__new__.
These should not depend on core.internals.
"""
from __future__ import annotations
from collections.abc import Sequence
from typing import (
TYPE_CHECKING,
Optional,
Union,
cast,
overload,
)
import warnings
import numpy as np
from numpy import ma
from pandas._config import using_string_dtype
from pandas._libs import lib
from pandas._libs.tslibs import (
Period,
get_supported_dtype,
is_supported_dtype,
)
from pandas._typing import (
AnyArrayLike,
ArrayLike,
Dtype,
DtypeObj,
T,
)
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
construct_1d_object_array_from_listlike,
maybe_cast_to_datetime,
maybe_cast_to_integer_array,
maybe_convert_platform,
maybe_infer_to_datetimelike,
maybe_promote,
)
from pandas.core.dtypes.common import (
is_list_like,
is_object_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCExtensionArray,
ABCIndex,
ABCSeries,
)
from pandas.core.dtypes.missing import isna
import pandas.core.common as com
if TYPE_CHECKING:
from pandas import (
Index,
Series,
)
from pandas.core.arrays.base import ExtensionArray
def array(
data: Sequence[object] | AnyArrayLike,
dtype: Dtype | None = None,
copy: bool = True,
) -> ExtensionArray:
"""
Create an array.
Parameters
----------
data : Sequence of objects
The scalars inside `data` should be instances of the
scalar type for `dtype`. It's expected that `data`
represents a 1-dimensional array of data.
When `data` is an Index or Series, the underlying array
will be extracted from `data`.
dtype : str, np.dtype, or ExtensionDtype, optional
The dtype to use for the array. This may be a NumPy
dtype or an extension type registered with pandas using
:meth:`pandas.api.extensions.register_extension_dtype`.
If not specified, there are two possibilities:
1. When `data` is a :class:`Series`, :class:`Index`, or
:class:`ExtensionArray`, the `dtype` will be taken
from the data.
2. Otherwise, pandas will attempt to infer the `dtype`
from the data.
Note that when `data` is a NumPy array, ``data.dtype`` is
*not* used for inferring the array type. This is because
NumPy cannot represent all the types of data that can be
held in extension arrays.
Currently, pandas will infer an extension dtype for sequences of
============================== =======================================
Scalar Type Array Type
============================== =======================================
:class:`pandas.Interval` :class:`pandas.arrays.IntervalArray`
:class:`pandas.Period` :class:`pandas.arrays.PeriodArray`
:class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray`
:class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray`
:class:`int` :class:`pandas.arrays.IntegerArray`
:class:`float` :class:`pandas.arrays.FloatingArray`
:class:`str` :class:`pandas.arrays.StringArray` or
:class:`pandas.arrays.ArrowStringArray`
:class:`bool` :class:`pandas.arrays.BooleanArray`
============================== =======================================
The ExtensionArray created when the scalar type is :class:`str` is determined by
``pd.options.mode.string_storage`` if the dtype is not explicitly given.
For all other cases, NumPy's usual inference rules will be used.
copy : bool, default True
Whether to copy the data, even if not necessary. Depending
on the type of `data`, creating the new array may require
copying data, even if ``copy=False``.
Returns
-------
ExtensionArray
The newly created array.
Raises
------
ValueError
When `data` is not 1-dimensional.
See Also
--------
numpy.array : Construct a NumPy array.
Series : Construct a pandas Series.
Index : Construct a pandas Index.
arrays.NumpyExtensionArray : ExtensionArray wrapping a NumPy array.
Series.array : Extract the array stored within a Series.
Notes
-----
Omitting the `dtype` argument means pandas will attempt to infer the
best array type from the values in the data. As new array types are
added by pandas and 3rd party libraries, the "best" array type may
change. We recommend specifying `dtype` to ensure that
1. the correct array type for the data is returned
2. the returned array type doesn't change as new extension types
are added by pandas and third-party libraries
Additionally, if the underlying memory representation of the returned
array matters, we recommend specifying the `dtype` as a concrete object
rather than a string alias or allowing it to be inferred. For example,
a future version of pandas or a 3rd-party library may include a
dedicated ExtensionArray for string data. In this event, the following
would no longer return a :class:`arrays.NumpyExtensionArray` backed by a
NumPy array.
>>> pd.array(['a', 'b'], dtype=str)
<NumpyExtensionArray>
['a', 'b']
Length: 2, dtype: str32
This would instead return the new ExtensionArray dedicated for string
data. If you really need the new array to be backed by a NumPy array,
specify that in the dtype.
>>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
<NumpyExtensionArray>
['a', 'b']
Length: 2, dtype: str32
Finally, Pandas has arrays that mostly overlap with NumPy
* :class:`arrays.DatetimeArray`
* :class:`arrays.TimedeltaArray`
When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
rather than a ``NumpyExtensionArray``. This is for symmetry with the case of
timezone-aware data, which NumPy does not natively support.
>>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
<DatetimeArray>
['2015-01-01 00:00:00', '2016-01-01 00:00:00']
Length: 2, dtype: datetime64[ns]
>>> pd.array(["1h", "2h"], dtype='timedelta64[ns]')
<TimedeltaArray>
['0 days 01:00:00', '0 days 02:00:00']
Length: 2, dtype: timedelta64[ns]
Examples
--------
If a dtype is not specified, pandas will infer the best dtype from the values.
See the description of `dtype` for the types pandas infers for.
>>> pd.array([1, 2])
<IntegerArray>
[1, 2]
Length: 2, dtype: Int64
>>> pd.array([1, 2, np.nan])
<IntegerArray>
[1, 2, <NA>]
Length: 3, dtype: Int64
>>> pd.array([1.1, 2.2])
<FloatingArray>
[1.1, 2.2]
Length: 2, dtype: Float64
>>> pd.array(["a", None, "c"])
<StringArray>
['a', <NA>, 'c']
Length: 3, dtype: string
>>> with pd.option_context("string_storage", "pyarrow"):
... arr = pd.array(["a", None, "c"])
...
>>> arr
<ArrowStringArray>
['a', <NA>, 'c']
Length: 3, dtype: string
>>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
<PeriodArray>
['2000-01-01', '2000-01-01']
Length: 2, dtype: period[D]
You can use the string alias for `dtype`
>>> pd.array(['a', 'b', 'a'], dtype='category')
['a', 'b', 'a']
Categories (2, object): ['a', 'b']
Or specify the actual dtype
>>> pd.array(['a', 'b', 'a'],
... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
['a', 'b', 'a']
Categories (3, object): ['a' < 'b' < 'c']
If pandas does not infer a dedicated extension type a
:class:`arrays.NumpyExtensionArray` is returned.
>>> pd.array([1 + 1j, 3 + 2j])
<NumpyExtensionArray>
[(1+1j), (3+2j)]
Length: 2, dtype: complex128
As mentioned in the "Notes" section, new extension types may be added
in the future (by pandas or 3rd party libraries), causing the return
value to no longer be a :class:`arrays.NumpyExtensionArray`. Specify the
`dtype` as a NumPy dtype if you need to ensure there's no future change in
behavior.
>>> pd.array([1, 2], dtype=np.dtype("int32"))
<NumpyExtensionArray>
[1, 2]
Length: 2, dtype: int32
`data` must be 1-dimensional. A ValueError is raised when the input
has the wrong dimensionality.
>>> pd.array(1)
Traceback (most recent call last):
...
ValueError: Cannot pass scalar '1' to 'pandas.array'.
"""
from pandas.core.arrays import (
BooleanArray,
DatetimeArray,
ExtensionArray,
FloatingArray,
IntegerArray,
IntervalArray,
NumpyExtensionArray,
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.string_ import StringDtype
if lib.is_scalar(data):
msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
raise ValueError(msg)
elif isinstance(data, ABCDataFrame):
raise TypeError("Cannot pass DataFrame to 'pandas.array'")
if dtype is None and isinstance(data, (ABCSeries, ABCIndex, ExtensionArray)):
# Note: we exclude np.ndarray here, will do type inference on it
dtype = data.dtype
data = extract_array(data, extract_numpy=True)
# this returns None for not-found dtypes.
if dtype is not None:
dtype = pandas_dtype(dtype)
if isinstance(data, ExtensionArray) and (dtype is None or data.dtype == dtype):
# e.g. TimedeltaArray[s], avoid casting to NumpyExtensionArray
if copy:
return data.copy()
return data
if isinstance(dtype, ExtensionDtype):
cls = dtype.construct_array_type()
return cls._from_sequence(data, dtype=dtype, copy=copy)
if dtype is None:
inferred_dtype = lib.infer_dtype(data, skipna=True)
if inferred_dtype == "period":
period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data)
return PeriodArray._from_sequence(period_data, copy=copy)
elif inferred_dtype == "interval":
return IntervalArray(data, copy=copy)
elif inferred_dtype.startswith("datetime"):
# datetime, datetime64
try:
return DatetimeArray._from_sequence(data, copy=copy)
except ValueError:
# Mixture of timezones, fall back to NumpyExtensionArray
pass
elif inferred_dtype.startswith("timedelta"):
# timedelta, timedelta64
return TimedeltaArray._from_sequence(data, copy=copy)
elif inferred_dtype == "string":
# StringArray/ArrowStringArray depending on pd.options.mode.string_storage
dtype = StringDtype()
cls = dtype.construct_array_type()
return cls._from_sequence(data, dtype=dtype, copy=copy)
elif inferred_dtype == "integer":
return IntegerArray._from_sequence(data, copy=copy)
elif inferred_dtype == "empty" and not hasattr(data, "dtype") and not len(data):
return FloatingArray._from_sequence(data, copy=copy)
elif (
inferred_dtype in ("floating", "mixed-integer-float")
and getattr(data, "dtype", None) != np.float16
):
# GH#44715 Exclude np.float16 bc FloatingArray does not support it;
# we will fall back to NumpyExtensionArray.
return FloatingArray._from_sequence(data, copy=copy)
elif inferred_dtype == "boolean":
return BooleanArray._from_sequence(data, dtype="boolean", copy=copy)
# Pandas overrides NumPy for
# 1. datetime64[ns,us,ms,s]
# 2. timedelta64[ns,us,ms,s]
# so that a DatetimeArray is returned.
if lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
if lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)
elif lib.is_np_dtype(dtype, "mM"):
warnings.warn(
r"datetime64 and timedelta64 dtype resolutions other than "
r"'s', 'ms', 'us', and 'ns' are deprecated. "
r"In future releases passing unsupported resolutions will "
r"raise an exception.",
FutureWarning,
stacklevel=find_stack_level(),
)
return NumpyExtensionArray._from_sequence(data, dtype=dtype, copy=copy)
_typs = frozenset(
{
"index",
"rangeindex",
"multiindex",
"datetimeindex",
"timedeltaindex",
"periodindex",
"categoricalindex",
"intervalindex",
"series",
}
)
@overload
def extract_array(
obj: Series | Index, extract_numpy: bool = ..., extract_range: bool = ...
) -> ArrayLike:
...
@overload
def extract_array(
obj: T, extract_numpy: bool = ..., extract_range: bool = ...
) -> T | ArrayLike:
...
def extract_array(
obj: T, extract_numpy: bool = False, extract_range: bool = False
) -> T | ArrayLike:
"""
Extract the ndarray or ExtensionArray from a Series or Index.
For all other types, `obj` is just returned as is.
Parameters
----------
obj : object
For Series / Index, the underlying ExtensionArray is unboxed.
extract_numpy : bool, default False
Whether to extract the ndarray from a NumpyExtensionArray.
extract_range : bool, default False
If we have a RangeIndex, return range._values if True
(which is a materialized integer ndarray), otherwise return unchanged.
Returns
-------
arr : object
Examples
--------
>>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
['a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']
Other objects like lists, arrays, and DataFrames are just passed through.
>>> extract_array([1, 2, 3])
[1, 2, 3]
For an ndarray-backed Series / Index the ndarray is returned.
>>> extract_array(pd.Series([1, 2, 3]))
array([1, 2, 3])
To extract all the way down to the ndarray, pass ``extract_numpy=True``.
>>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
array([1, 2, 3])
"""
typ = getattr(obj, "_typ", None)
if typ in _typs:
# i.e. isinstance(obj, (ABCIndex, ABCSeries))
if typ == "rangeindex":
if extract_range:
# error: "T" has no attribute "_values"
return obj._values # type: ignore[attr-defined]
return obj
# error: "T" has no attribute "_values"
return obj._values # type: ignore[attr-defined]
elif extract_numpy and typ == "npy_extension":
# i.e. isinstance(obj, ABCNumpyExtensionArray)
# error: "T" has no attribute "to_numpy"
return obj.to_numpy() # type: ignore[attr-defined]
return obj
def ensure_wrapped_if_datetimelike(arr):
"""
Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.
"""
if isinstance(arr, np.ndarray):
if arr.dtype.kind == "M":
from pandas.core.arrays import DatetimeArray
dtype = get_supported_dtype(arr.dtype)
return DatetimeArray._from_sequence(arr, dtype=dtype)
elif arr.dtype.kind == "m":
from pandas.core.arrays import TimedeltaArray
dtype = get_supported_dtype(arr.dtype)
return TimedeltaArray._from_sequence(arr, dtype=dtype)
return arr
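# Illustrative note (not part of the original function): plain datetime64 and
# timedelta64 ndarrays come back wrapped, everything else passes through:
#
#   >>> import numpy as np
#   >>> ensure_wrapped_if_datetimelike(np.array(["2020-01-01"], dtype="M8[ns]"))
#   <DatetimeArray>
#   ['2020-01-01 00:00:00']
#   Length: 1, dtype: datetime64[ns]
#   >>> ensure_wrapped_if_datetimelike(np.array([1, 2]))
#   array([1, 2])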
def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray:
"""
Convert numpy MaskedArray to ensure mask is softened.
"""
mask = ma.getmaskarray(data)
if mask.any():
dtype, fill_value = maybe_promote(data.dtype, np.nan)
dtype = cast(np.dtype, dtype)
data = ma.asarray(data.astype(dtype, copy=True))
data.soften_mask() # set hardmask False if it was True
data[mask] = fill_value
else:
data = data.copy()
return data
def sanitize_array(
data,
index: Index | None,
dtype: DtypeObj | None = None,
copy: bool = False,
*,
allow_2d: bool = False,
) -> ArrayLike:
"""
Sanitize input data to an ndarray or ExtensionArray, copy if specified,
coerce to the dtype if specified.
Parameters
----------
data : Any
index : Index or None, default None
dtype : np.dtype, ExtensionDtype, or None, default None
copy : bool, default False
allow_2d : bool, default False
If False, raise if we have a 2D Arraylike.
Returns
-------
np.ndarray or ExtensionArray
"""
original_dtype = dtype
if isinstance(data, ma.MaskedArray):
data = sanitize_masked_array(data)
if isinstance(dtype, NumpyEADtype):
# Avoid ending up with a NumpyExtensionArray
dtype = dtype.numpy_dtype
object_index = False
if isinstance(data, ABCIndex) and data.dtype == object and dtype is None:
object_index = True
# extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray
data = extract_array(data, extract_numpy=True, extract_range=True)
if isinstance(data, np.ndarray) and data.ndim == 0:
if dtype is None:
dtype = data.dtype
data = lib.item_from_zerodim(data)
elif isinstance(data, range):
# GH#16804
data = range_to_ndarray(data)
copy = False
if not is_list_like(data):
if index is None:
raise ValueError("index must be specified when data is not list-like")
if isinstance(data, str) and using_string_dtype() and original_dtype is None:
from pandas.core.arrays.string_ import StringDtype
dtype = StringDtype(na_value=np.nan)
data = construct_1d_arraylike_from_scalar(data, len(index), dtype)
return data
elif isinstance(data, ABCExtensionArray):
# it is already ensured above this is not a NumpyExtensionArray
# Until GH#49309 is fixed this check needs to come before the
# ExtensionDtype check
if dtype is not None:
subarr = data.astype(dtype, copy=copy)
elif copy:
subarr = data.copy()
else:
subarr = data
elif isinstance(dtype, ExtensionDtype):
# create an extension array from its dtype
_sanitize_non_ordered(data)
cls = dtype.construct_array_type()
if not hasattr(data, "__array__"):
data = list(data)
subarr = cls._from_sequence(data, dtype=dtype, copy=copy)
# GH#846
elif isinstance(data, np.ndarray):
if isinstance(data, np.matrix):
data = data.A
if dtype is None:
subarr = data
if data.dtype == object:
subarr = maybe_infer_to_datetimelike(data)
if object_index and using_string_dtype() and is_string_dtype(subarr):
# Avoid inference when string option is set
subarr = data
elif data.dtype.kind == "U" and using_string_dtype():
from pandas.core.arrays.string_ import StringDtype
dtype = StringDtype(na_value=np.nan)
subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
if (
subarr is data
or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr]
) and copy:
subarr = subarr.copy()
else:
# we will try to copy by-definition here
subarr = _try_cast(data, dtype, copy)
elif hasattr(data, "__array__"):
# e.g. dask array GH#38645
if not copy:
data = np.asarray(data)
else:
data = np.array(data, copy=copy)
return sanitize_array(
data,
index=index,
dtype=dtype,
copy=False,
allow_2d=allow_2d,
)
else:
_sanitize_non_ordered(data)
# materialize e.g. generators, convert e.g. tuples, abc.ValueView
data = list(data)
if len(data) == 0 and dtype is None:
# We default to float64, matching numpy
subarr = np.array([], dtype=np.float64)
elif dtype is not None:
subarr = _try_cast(data, dtype, copy)
else:
subarr = maybe_convert_platform(data)
if subarr.dtype == object:
subarr = cast(np.ndarray, subarr)
subarr = maybe_infer_to_datetimelike(subarr)
subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
if isinstance(subarr, np.ndarray):
# at this point we should have dtype be None or subarr.dtype == dtype
dtype = cast(np.dtype, dtype)
subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)
return subarr
def range_to_ndarray(rng: range) -> np.ndarray:
"""
Cast a range object to ndarray.
"""
# GH#30171 perf avoid realizing range as a list in np.array
try:
arr = np.arange(rng.start, rng.stop, rng.step, dtype="int64")
except OverflowError:
# GH#30173 handling for ranges that overflow int64
if (rng.start >= 0 and rng.step > 0) or (rng.step < 0 <= rng.stop):
try:
arr = np.arange(rng.start, rng.stop, rng.step, dtype="uint64")
except OverflowError:
arr = construct_1d_object_array_from_listlike(list(rng))
else:
arr = construct_1d_object_array_from_listlike(list(rng))
return arr
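# Illustrative note (not part of the original function): small ranges map to
# int64 without materializing a list, while ranges outside the int64/uint64
# bounds fall back to an object-dtype array:
#
#   >>> range_to_ndarray(range(3))
#   array([0, 1, 2])
#   >>> range_to_ndarray(range(2**70, 2**70 + 2)).dtype
#   dtype('O')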
def _sanitize_non_ordered(data) -> None:
"""
Raise only for unordered sets, e.g., not for dict_keys
"""
if isinstance(data, (set, frozenset)):
raise TypeError(f"'{type(data).__name__}' type is unordered")
def _sanitize_ndim(
result: ArrayLike,
data,
dtype: DtypeObj | None,
index: Index | None,
*,
allow_2d: bool = False,
) -> ArrayLike:
"""
Ensure we have a 1-dimensional result array.
"""
if getattr(result, "ndim", 0) == 0:
raise ValueError("result should be arraylike with ndim > 0")
if result.ndim == 1:
# the result that we want
result = _maybe_repeat(result, index)
elif result.ndim > 1:
if isinstance(data, np.ndarray):
if allow_2d:
return result
raise ValueError(
f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead"
)
if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
# i.e. NumpyEADtype("O")
result = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
cls = dtype.construct_array_type()
result = cls._from_sequence(result, dtype=dtype)
else:
# error: Argument "dtype" to "asarray_tuplesafe" has incompatible type
# "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str,
# dtype[Any], None]"
result = com.asarray_tuplesafe(data, dtype=dtype) # type: ignore[arg-type]
return result
def _sanitize_str_dtypes(
result: np.ndarray, data, dtype: np.dtype | None, copy: bool
) -> np.ndarray:
"""
Ensure we have a dtype that is supported by pandas.
"""
# This is to prevent mixed-type Series getting all casted to
# NumPy string type, e.g. NaN --> '-1#IND'.
if issubclass(result.dtype.type, str):
# GH#16605
# If not empty convert the data to dtype
# GH#19853: If data is a scalar, result has already the result
if not lib.is_scalar(data):
if not np.all(isna(data)):
data = np.asarray(data, dtype=dtype)
if not copy:
result = np.asarray(data, dtype=object)
else:
result = np.array(data, dtype=object, copy=copy)
return result
def _maybe_repeat(arr: ArrayLike, index: Index | None) -> ArrayLike:
"""
If we have a length-1 array and an index describing how long we expect
the result to be, repeat the array.
"""
if index is not None:
if 1 == len(arr) != len(index):
arr = arr.repeat(len(index))
return arr
def _try_cast(
arr: list | np.ndarray,
dtype: np.dtype,
copy: bool,
) -> ArrayLike:
"""
Convert input to numpy ndarray and optionally cast to a given dtype.
Parameters
----------
arr : ndarray or list
Excludes: ExtensionArray, Series, Index.
dtype : np.dtype
copy : bool
If False, don't copy the data if not needed.
Returns
-------
np.ndarray or ExtensionArray
"""
is_ndarray = isinstance(arr, np.ndarray)
if dtype == object:
if not is_ndarray:
subarr = construct_1d_object_array_from_listlike(arr)
return subarr
return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)
elif dtype.kind == "U":
# TODO: test cases with arr.dtype.kind in "mM"
if is_ndarray:
arr = cast(np.ndarray, arr)
shape = arr.shape
if arr.ndim > 1:
arr = arr.ravel()
else:
shape = (len(arr),)
return lib.ensure_string_array(arr, convert_na_value=False, copy=copy).reshape(
shape
)
elif dtype.kind in "mM":
return maybe_cast_to_datetime(arr, dtype)
# GH#15832: Check if we are requesting a numeric dtype and
# that we can convert the data to the requested dtype.
elif dtype.kind in "iu":
# this will raise if we have e.g. floats
subarr = maybe_cast_to_integer_array(arr, dtype)
elif not copy:
subarr = np.asarray(arr, dtype=dtype)
else:
subarr = np.array(arr, dtype=dtype, copy=copy)
return subarr

View File

@ -0,0 +1,85 @@
from pandas.core.dtypes.common import (
is_any_real_numeric_dtype,
is_array_like,
is_bool,
is_bool_dtype,
is_categorical_dtype,
is_complex,
is_complex_dtype,
is_datetime64_any_dtype,
is_datetime64_dtype,
is_datetime64_ns_dtype,
is_datetime64tz_dtype,
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
is_file_like,
is_float,
is_float_dtype,
is_hashable,
is_int64_dtype,
is_integer,
is_integer_dtype,
is_interval,
is_interval_dtype,
is_iterator,
is_list_like,
is_named_tuple,
is_number,
is_numeric_dtype,
is_object_dtype,
is_period_dtype,
is_re,
is_re_compilable,
is_scalar,
is_signed_integer_dtype,
is_sparse,
is_string_dtype,
is_timedelta64_dtype,
is_timedelta64_ns_dtype,
is_unsigned_integer_dtype,
pandas_dtype,
)
__all__ = [
"is_any_real_numeric_dtype",
"is_array_like",
"is_bool",
"is_bool_dtype",
"is_categorical_dtype",
"is_complex",
"is_complex_dtype",
"is_datetime64_any_dtype",
"is_datetime64_dtype",
"is_datetime64_ns_dtype",
"is_datetime64tz_dtype",
"is_dict_like",
"is_dtype_equal",
"is_extension_array_dtype",
"is_file_like",
"is_float",
"is_float_dtype",
"is_hashable",
"is_int64_dtype",
"is_integer",
"is_integer_dtype",
"is_interval",
"is_interval_dtype",
"is_iterator",
"is_list_like",
"is_named_tuple",
"is_number",
"is_numeric_dtype",
"is_object_dtype",
"is_period_dtype",
"is_re",
"is_re_compilable",
"is_scalar",
"is_signed_integer_dtype",
"is_sparse",
"is_string_dtype",
"is_timedelta64_dtype",
"is_timedelta64_ns_dtype",
"is_unsigned_integer_dtype",
"pandas_dtype",
]

View File

@ -0,0 +1,301 @@
"""
Functions for implementing 'astype' methods according to pandas conventions,
particularly ones that differ from numpy.
"""
from __future__ import annotations
import inspect
from typing import (
TYPE_CHECKING,
overload,
)
import warnings
import numpy as np
from pandas._libs import lib
from pandas._libs.tslibs.timedeltas import array_to_timedelta64
from pandas.errors import IntCastingNaNError
from pandas.core.dtypes.common import (
is_object_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
ExtensionDtype,
NumpyEADtype,
)
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
DtypeObj,
IgnoreRaise,
)
from pandas.core.arrays import ExtensionArray
_dtype_obj = np.dtype(object)
@overload
def _astype_nansafe(
arr: np.ndarray, dtype: np.dtype, copy: bool = ..., skipna: bool = ...
) -> np.ndarray:
...
@overload
def _astype_nansafe(
arr: np.ndarray, dtype: ExtensionDtype, copy: bool = ..., skipna: bool = ...
) -> ExtensionArray:
...
def _astype_nansafe(
arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False
) -> ArrayLike:
"""
Cast the elements of an array to a given dtype in a nan-safe manner.
Parameters
----------
arr : ndarray
dtype : np.dtype or ExtensionDtype
copy : bool, default True
If False, a view will be attempted but may fail if,
e.g., the item sizes don't align.
skipna : bool, default False
Whether or not we should skip NaN when casting as a string-type.
Raises
------
ValueError
The dtype was a datetime64/timedelta64 dtype, but it had no unit.
"""
# dispatch on extension dtype if needed
if isinstance(dtype, ExtensionDtype):
return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
elif not isinstance(dtype, np.dtype): # pragma: no cover
raise ValueError("dtype must be np.dtype or ExtensionDtype")
if arr.dtype.kind in "mM":
from pandas.core.construction import ensure_wrapped_if_datetimelike
arr = ensure_wrapped_if_datetimelike(arr)
res = arr.astype(dtype, copy=copy)
return np.asarray(res)
if issubclass(dtype.type, str):
shape = arr.shape
if arr.ndim > 1:
arr = arr.ravel()
return lib.ensure_string_array(
arr, skipna=skipna, convert_na_value=False
).reshape(shape)
elif np.issubdtype(arr.dtype, np.floating) and dtype.kind in "iu":
return _astype_float_to_int_nansafe(arr, dtype, copy)
elif arr.dtype == object:
# if we have a datetime/timedelta array of objects
# then coerce to datetime64[ns] and use DatetimeArray.astype
if lib.is_np_dtype(dtype, "M"):
from pandas.core.arrays import DatetimeArray
dta = DatetimeArray._from_sequence(arr, dtype=dtype)
return dta._ndarray
elif lib.is_np_dtype(dtype, "m"):
from pandas.core.construction import ensure_wrapped_if_datetimelike
# bc we know arr.dtype == object, this is equivalent to
# `np.asarray(to_timedelta(arr))`, but using a lower-level API that
# does not require a circular import.
tdvals = array_to_timedelta64(arr).view("m8[ns]")
tda = ensure_wrapped_if_datetimelike(tdvals)
return tda.astype(dtype, copy=False)._ndarray
if dtype.name in ("datetime64", "timedelta64"):
msg = (
f"The '{dtype.name}' dtype has no unit. Please pass in "
f"'{dtype.name}[ns]' instead."
)
raise ValueError(msg)
if copy or arr.dtype == object or dtype == object:
# Explicit copy, or required since NumPy can't view from / to object.
return arr.astype(dtype, copy=True)
return arr.astype(dtype, copy=copy)
def _astype_float_to_int_nansafe(
values: np.ndarray, dtype: np.dtype, copy: bool
) -> np.ndarray:
"""
astype with a check preventing converting NaN to a meaningless integer value.
"""
if not np.isfinite(values).all():
raise IntCastingNaNError(
"Cannot convert non-finite values (NA or inf) to integer"
)
if dtype.kind == "u":
# GH#45151
if not (values >= 0).all():
raise ValueError(f"Cannot losslessly cast from {values.dtype} to {dtype}")
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=RuntimeWarning)
return values.astype(dtype, copy=copy)
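# Illustrative note (not part of the original function): the two guard clauses
# above surface as errors before any lossy cast can happen:
#
#   >>> import numpy as np
#   >>> _astype_float_to_int_nansafe(np.array([1.0, np.nan]), np.dtype("int64"), True)
#   Traceback (most recent call last):
#   ...
#   IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
#   >>> _astype_float_to_int_nansafe(np.array([-1.0]), np.dtype("uint64"), True)
#   Traceback (most recent call last):
#   ...
#   ValueError: Cannot losslessly cast from float64 to uint64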
def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike:
"""
Cast array (ndarray or ExtensionArray) to the new dtype.
Parameters
----------
values : ndarray or ExtensionArray
dtype : dtype object
copy : bool, default False
copy if indicated
Returns
-------
ndarray or ExtensionArray
"""
if values.dtype == dtype:
if copy:
return values.copy()
return values
if not isinstance(values, np.ndarray):
# i.e. ExtensionArray
values = values.astype(dtype, copy=copy)
else:
values = _astype_nansafe(values, dtype, copy=copy)
# in pandas we don't store numpy str dtypes, so convert to object
if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
values = np.array(values, dtype=object)
return values
def astype_array_safe(
values: ArrayLike, dtype, copy: bool = False, errors: IgnoreRaise = "raise"
) -> ArrayLike:
"""
Cast array (ndarray or ExtensionArray) to the new dtype.
This basically is the implementation for DataFrame/Series.astype and
includes all custom logic for pandas (NaN-safety, converting str to object,
etc.)
Parameters
----------
values : ndarray or ExtensionArray
dtype : str, dtype convertible
copy : bool, default False
copy if indicated
errors : str, {'raise', 'ignore'}, default 'raise'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
Returns
-------
ndarray or ExtensionArray
"""
errors_legal_values = ("raise", "ignore")
if errors not in errors_legal_values:
invalid_arg = (
"Expected value of kwarg 'errors' to be one of "
f"{list(errors_legal_values)}. Supplied value is '{errors}'"
)
raise ValueError(invalid_arg)
if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype):
msg = (
f"Expected an instance of {dtype.__name__}, "
"but got the class instead. Try instantiating 'dtype'."
)
raise TypeError(msg)
dtype = pandas_dtype(dtype)
if isinstance(dtype, NumpyEADtype):
# Ensure we don't end up with a NumpyExtensionArray
dtype = dtype.numpy_dtype
try:
new_values = astype_array(values, dtype, copy=copy)
except (ValueError, TypeError):
# e.g. _astype_nansafe can fail on object-dtype of strings
# trying to convert to float
if errors == "ignore":
new_values = values
else:
raise
return new_values
def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
"""Checks if astype avoided copying the data.
Parameters
----------
dtype : Original dtype
new_dtype : target dtype
Returns
-------
True if new data is a view or not guaranteed to be a copy, False otherwise
"""
if isinstance(dtype, np.dtype) and not isinstance(new_dtype, np.dtype):
new_dtype, dtype = dtype, new_dtype
if dtype == new_dtype:
return True
elif isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype):
# Only equal numpy dtypes avoid a copy
return False
elif is_string_dtype(dtype) and is_string_dtype(new_dtype):
# Potentially! a view when converting from object to string
return True
elif is_object_dtype(dtype) and new_dtype.kind == "O":
# When the underlying array has dtype object, we don't have to make a copy
return True
elif dtype.kind in "mM" and new_dtype.kind in "mM":
dtype = getattr(dtype, "numpy_dtype", dtype)
new_dtype = getattr(new_dtype, "numpy_dtype", new_dtype)
return getattr(dtype, "unit", None) == getattr(new_dtype, "unit", None)
numpy_dtype = getattr(dtype, "numpy_dtype", None)
new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None)
if numpy_dtype is None and isinstance(dtype, np.dtype):
numpy_dtype = dtype
if new_numpy_dtype is None and isinstance(new_dtype, np.dtype):
new_numpy_dtype = new_dtype
if numpy_dtype is not None and new_numpy_dtype is not None:
# if both have NumPy dtype or one of them is a numpy dtype
# they are only a view when the numpy dtypes are equal, e.g.
# int64 -> Int64 or int64[pyarrow]
# int64 -> Int32 copies
return numpy_dtype == new_numpy_dtype
# Assume this is a view since we don't know for sure if a copy was made
return True
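# Illustrative note (not part of the original function): equal dtypes report a
# possible view, while differing numpy dtypes (including differing datetime
# units) report a copy:
#
#   >>> import numpy as np
#   >>> astype_is_view(np.dtype("int64"), np.dtype("int64"))
#   True
#   >>> astype_is_view(np.dtype("int64"), np.dtype("int32"))
#   False
#   >>> astype_is_view(np.dtype("M8[ns]"), np.dtype("M8[s]"))
#   False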

View File

@ -0,0 +1,583 @@
"""
Extend pandas with custom array types.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
TypeVar,
cast,
overload,
)
import numpy as np
from pandas._libs import missing as libmissing
from pandas._libs.hashtable import object_hash
from pandas._libs.properties import cache_readonly
from pandas.errors import AbstractMethodError
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCIndex,
ABCSeries,
)
if TYPE_CHECKING:
from pandas._typing import (
DtypeObj,
Self,
Shape,
npt,
type_t,
)
from pandas import Index
from pandas.core.arrays import ExtensionArray
# To parameterize on same ExtensionDtype
ExtensionDtypeT = TypeVar("ExtensionDtypeT", bound="ExtensionDtype")
class ExtensionDtype:
"""
A custom data type, to be paired with an ExtensionArray.
See Also
--------
extensions.register_extension_dtype: Register an ExtensionType
with pandas as class decorator.
extensions.ExtensionArray: Abstract base class for custom 1-D array types.
Notes
-----
The interface includes the following abstract methods that must
be implemented by subclasses:
* type
* name
* construct_array_type
The following attributes and methods influence the behavior of the dtype in
pandas operations
* _is_numeric
* _is_boolean
* _get_common_dtype
The `na_value` class attribute can be used to set the default NA value
for this type. :attr:`numpy.nan` is used by default.
ExtensionDtypes are required to be hashable. The base class provides
a default implementation, which relies on the ``_metadata`` class
attribute. ``_metadata`` should be a tuple containing the strings
that define your data type. For example, with ``PeriodDtype`` that's
the ``freq`` attribute.
**If you have a parametrized dtype you should set the ``_metadata``
class property**.
Ideally, the attributes in ``_metadata`` will match the
parameters to your ``ExtensionDtype.__init__`` (if any). If any of
the attributes in ``_metadata`` don't implement the standard
``__eq__`` or ``__hash__``, the default implementations here will not
work.
Examples
--------
For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method
can be implemented: this method receives a pyarrow Array or ChunkedArray
as only argument and is expected to return the appropriate pandas
ExtensionArray for this dtype and the passed values:
>>> import pyarrow
>>> from pandas.api.extensions import ExtensionArray
>>> class ExtensionDtype:
... def __from_arrow__(
... self,
... array: pyarrow.Array | pyarrow.ChunkedArray
... ) -> ExtensionArray:
... ...
This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
``pandas.errors.AbstractMethodError`` and no ``register`` method is
provided for registering virtual subclasses.
"""
_metadata: tuple[str, ...] = ()
def __str__(self) -> str:
return self.name
def __eq__(self, other: object) -> bool:
"""
Check whether 'other' is equal to self.
By default, 'other' is considered equal if either
* it's a string matching 'self.name'.
* it's an instance of this type and all of the attributes
in ``self._metadata`` are equal between `self` and `other`.
Parameters
----------
other : Any
Returns
-------
bool
"""
if isinstance(other, str):
try:
other = self.construct_from_string(other)
except TypeError:
return False
if isinstance(other, type(self)):
return all(
getattr(self, attr) == getattr(other, attr) for attr in self._metadata
)
return False
def __hash__(self) -> int:
# for python>=3.10, different nan objects have different hashes
# we need to avoid that and thus use hash function with old behavior
return object_hash(tuple(getattr(self, attr) for attr in self._metadata))
def __ne__(self, other: object) -> bool:
return not self.__eq__(other)
@property
def na_value(self) -> object:
"""
Default NA value to use for this type.
This is used in e.g. ExtensionArray.take. This should be the
user-facing "boxed" version of the NA value, not the physical NA value
for storage. e.g. for JSONArray, this is an empty dictionary.
"""
return np.nan
@property
def type(self) -> type_t[Any]:
"""
The scalar type for the array, e.g. ``int``
It's expected ``ExtensionArray[item]`` returns an instance
of ``ExtensionDtype.type`` for scalar ``item``, assuming
that value is valid (not NA). NA values do not need to be
instances of `type`.
"""
raise AbstractMethodError(self)
@property
def kind(self) -> str:
"""
A character code (one of 'biufcmMOSUV'), default 'O'
This should match the NumPy dtype used when the array is
converted to an ndarray, which is probably 'O' for object if
the extension type cannot be represented as a built-in NumPy
type.
See Also
--------
numpy.dtype.kind
"""
return "O"
@property
def name(self) -> str:
"""
A string identifying the data type.
Will be used for display in, e.g. ``Series.dtype``
"""
raise AbstractMethodError(self)
@property
def names(self) -> list[str] | None:
"""
Ordered list of field names, or None if there are no fields.
This is for compatibility with NumPy arrays, and may be removed in the
future.
"""
return None
@classmethod
def construct_array_type(cls) -> type_t[ExtensionArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
raise AbstractMethodError(cls)
def empty(self, shape: Shape) -> ExtensionArray:
"""
Construct an ExtensionArray of this dtype with the given shape.
Analogous to numpy.empty.
Parameters
----------
shape : int or tuple[int]
Returns
-------
ExtensionArray
"""
cls = self.construct_array_type()
return cls._empty(shape, dtype=self)
@classmethod
def construct_from_string(cls, string: str) -> Self:
r"""
Construct this type from a string.
This is useful mainly for data types that accept parameters.
For example, a period dtype accepts a frequency parameter that
can be set as ``period[h]`` (where h means hourly frequency).
By default, in the abstract class, just the name of the type is
expected. But subclasses can overwrite this method to accept
parameters.
Parameters
----------
string : str
The name of the type, for example ``category``.
Returns
-------
ExtensionDtype
Instance of the dtype.
Raises
------
TypeError
If a class cannot be constructed from this 'string'.
Examples
--------
For extension dtypes with arguments the following may be an
adequate implementation.
>>> import re
>>> @classmethod
... def construct_from_string(cls, string):
... pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
... match = pattern.match(string)
... if match:
... return cls(**match.groupdict())
... else:
... raise TypeError(
... f"Cannot construct a '{cls.__name__}' from '{string}'"
... )
"""
if not isinstance(string, str):
raise TypeError(
f"'construct_from_string' expects a string, got {type(string)}"
)
# error: Non-overlapping equality check (left operand type: "str", right
# operand type: "Callable[[ExtensionDtype], str]") [comparison-overlap]
assert isinstance(cls.name, str), (cls, type(cls.name))
if string != cls.name:
raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
return cls()
@classmethod
def is_dtype(cls, dtype: object) -> bool:
"""
Check if we match 'dtype'.
Parameters
----------
dtype : object
The object to check.
Returns
-------
bool
Notes
-----
The default implementation is True if
1. ``cls.construct_from_string(dtype)`` is an instance
of ``cls``.
2. ``dtype`` is an object and is an instance of ``cls``
3. ``dtype`` has a ``dtype`` attribute, and any of the above
conditions is true for ``dtype.dtype``.
"""
dtype = getattr(dtype, "dtype", dtype)
if isinstance(dtype, (ABCSeries, ABCIndex, ABCDataFrame, np.dtype)):
# https://github.com/pandas-dev/pandas/issues/22960
# avoid passing data to `construct_from_string`. This could
# cause a FutureWarning from numpy about failing elementwise
# comparison from, e.g., comparing DataFrame == 'category'.
return False
elif dtype is None:
return False
elif isinstance(dtype, cls):
return True
if isinstance(dtype, str):
try:
return cls.construct_from_string(dtype) is not None
except TypeError:
return False
return False
@property
def _is_numeric(self) -> bool:
"""
Whether columns with this dtype should be considered numeric.
By default ExtensionDtypes are assumed to be non-numeric.
They'll be excluded from operations that exclude non-numeric
columns, like (groupby) reductions, plotting, etc.
"""
return False
@property
def _is_boolean(self) -> bool:
"""
Whether this dtype should be considered boolean.
By default, ExtensionDtypes are assumed to be non-boolean.
Setting this to True will affect the behavior of several places,
e.g.
* is_bool
* boolean indexing
Returns
-------
bool
"""
return False
def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
"""
Return the common dtype, if one exists.
Used in `find_common_type` implementation. This is for example used
to determine the resulting dtype in a concat operation.
If no common dtype exists, return None (which gives the other dtypes
the chance to determine a common dtype). If all dtypes in the list
return None, then the common dtype will be "object" dtype (this means
it is never needed to return "object" dtype from this method itself).
Parameters
----------
dtypes : list of dtypes
The dtypes for which to determine a common dtype. This is a list
of np.dtype or ExtensionDtype instances.
Returns
-------
Common dtype (np.dtype or ExtensionDtype) or None
"""
if len(set(dtypes)) == 1:
# only itself
return self
else:
return None
@property
def _can_hold_na(self) -> bool:
"""
Can arrays of this dtype hold NA values?
"""
return True
@property
def _is_immutable(self) -> bool:
"""
Can arrays with this dtype be modified with __setitem__? If not, return
True.
Immutable arrays are expected to raise TypeError on __setitem__ calls.
"""
return False
@cache_readonly
def index_class(self) -> type_t[Index]:
"""
The Index subclass to return from Index.__new__ when this dtype is
encountered.
"""
from pandas import Index
return Index
@property
def _supports_2d(self) -> bool:
"""
Do ExtensionArrays with this dtype support 2D arrays?
Historically ExtensionArrays were limited to 1D. By returning True here,
authors can indicate that their arrays support 2D instances. This can
improve performance in some cases, particularly operations with `axis=1`.
Arrays that support 2D values should:
- implement Array.reshape
- subclass the Dim2CompatTests in tests.extension.base
- _concat_same_type should support `axis` keyword
- _reduce and reductions should support `axis` keyword
"""
return False
@property
def _can_fast_transpose(self) -> bool:
"""
Is transposing an array with this dtype zero-copy?
Only relevant for cases where _supports_2d is True.
"""
return False
class StorageExtensionDtype(ExtensionDtype):
"""ExtensionDtype that may be backed by more than one implementation."""
name: str
_metadata = ("storage",)
def __init__(self, storage: str | None = None) -> None:
self.storage = storage
def __repr__(self) -> str:
return f"{self.name}[{self.storage}]"
def __str__(self) -> str:
return self.name
def __eq__(self, other: object) -> bool:
if isinstance(other, str) and other == self.name:
return True
return super().__eq__(other)
def __hash__(self) -> int:
# custom __eq__ so have to override __hash__
return super().__hash__()
@property
def na_value(self) -> libmissing.NAType:
return libmissing.NA
def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
"""
Register an ExtensionType with pandas as class decorator.
This enables operations like ``.astype(name)`` for the name
of the ExtensionDtype.
Returns
-------
callable
A class decorator.
Examples
--------
>>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype
>>> @register_extension_dtype
... class MyExtensionDtype(ExtensionDtype):
... name = "myextension"
"""
_registry.register(cls)
return cls
class Registry:
"""
Registry for dtype inference.
The registry allows one to map a string repr of an extension
dtype to an extension dtype. The string alias can be used in several
places, including
* Series and Index constructors
* :meth:`pandas.array`
* :meth:`pandas.Series.astype`
Multiple extension types can be registered.
These are tried in order.
"""
def __init__(self) -> None:
self.dtypes: list[type_t[ExtensionDtype]] = []
def register(self, dtype: type_t[ExtensionDtype]) -> None:
"""
Parameters
----------
dtype : ExtensionDtype class
"""
if not issubclass(dtype, ExtensionDtype):
raise ValueError("can only register pandas extension dtypes")
self.dtypes.append(dtype)
@overload
def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
...
@overload
def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT:
...
@overload
def find(self, dtype: str) -> ExtensionDtype | None:
...
@overload
def find(
self, dtype: npt.DTypeLike
) -> type_t[ExtensionDtype] | ExtensionDtype | None:
...
def find(
self, dtype: type_t[ExtensionDtype] | ExtensionDtype | npt.DTypeLike
) -> type_t[ExtensionDtype] | ExtensionDtype | None:
"""
Parameters
----------
dtype : ExtensionDtype class or instance or str or numpy dtype or python type
Returns
-------
return the first matching dtype, otherwise return None
"""
if not isinstance(dtype, str):
dtype_type: type_t
if not isinstance(dtype, type):
dtype_type = type(dtype)
else:
dtype_type = dtype
if issubclass(dtype_type, ExtensionDtype):
# cast needed here as mypy doesn't know we have figured
# out it is an ExtensionDtype or type_t[ExtensionDtype]
return cast("ExtensionDtype | type_t[ExtensionDtype]", dtype)
return None
for dtype_type in self.dtypes:
try:
return dtype_type.construct_from_string(dtype)
except TypeError:
pass
return None
_registry = Registry()
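# Illustrative note (not part of the original module): once pandas' own dtypes
# have registered themselves (which happens during a normal ``import pandas``),
# string aliases resolve through this registry:
#
#   >>> import pandas as pd
#   >>> from pandas.core.dtypes.base import _registry
#   >>> type(_registry.find("category")).__name__
#   'CategoricalDtype'
#   >>> _registry.find("no-such-alias") is None
#   True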

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,348 @@
"""
Utility functions related to concat.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
cast,
)
import warnings
import numpy as np
from pandas._libs import lib
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import (
common_dtype_categorical_compat,
find_common_type,
np_find_common_type,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import (
ABCCategoricalIndex,
ABCSeries,
)
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
ArrayLike,
AxisInt,
DtypeObj,
)
from pandas.core.arrays import (
Categorical,
ExtensionArray,
)
def _is_nonempty(x, axis) -> bool:
# filter empty arrays
# 1-d dtypes always are included here
if x.ndim <= axis:
return True
return x.shape[axis] > 0
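# Illustrative note (not part of the original function): emptiness is judged
# along the concatenation axis only:
#
#   >>> import numpy as np
#   >>> _is_nonempty(np.array([]), 0)
#   False
#   >>> _is_nonempty(np.empty((0, 3)), 1)
#   True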
def concat_compat(
to_concat: Sequence[ArrayLike], axis: AxisInt = 0, ea_compat_axis: bool = False
) -> ArrayLike:
"""
Provide concatenation of an array of arrays, each of which has a single
'normalized' dtype (in that, for example, if it's object, then it is a
non-datetimelike), and provide a combined dtype for the resulting array that
preserves the overall dtype if possible.
Parameters
----------
to_concat : sequence of arrays
axis : axis to provide concatenation
ea_compat_axis : bool, default False
For ExtensionArray compat, behave as if axis == 1 when determining
whether to drop empty arrays.
Returns
-------
a single array, preserving the combined dtypes
"""
if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]):
# fastpath!
obj = to_concat[0]
if isinstance(obj, np.ndarray):
to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
return np.concatenate(to_concat_arrs, axis=axis)
to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
if ea_compat_axis:
# We have 1D objects, that don't support axis keyword
return obj._concat_same_type(to_concat_eas)
elif axis == 0:
return obj._concat_same_type(to_concat_eas)
else:
# e.g. DatetimeArray
# NB: We are assuming here that ensure_wrapped_if_arraylike has
# been called where relevant.
return obj._concat_same_type(
# error: Unexpected keyword argument "axis" for "_concat_same_type"
# of "ExtensionArray"
to_concat_eas,
axis=axis, # type: ignore[call-arg]
)
# If all arrays are empty, there's nothing to convert, just short-cut to
# the concatenation, #3121.
#
# Creating an empty array directly is tempting, but the winnings would be
# marginal given that it would still require shape & dtype calculation and
# np.concatenate which has them both implemented is compiled.
orig = to_concat
non_empties = [x for x in to_concat if _is_nonempty(x, axis)]
if non_empties and axis == 0 and not ea_compat_axis:
# ea_compat_axis see GH#39574
to_concat = non_empties
any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties)
if len(to_concat) < len(orig):
_, _, alt_dtype = _get_result_dtype(orig, non_empties)
if alt_dtype != target_dtype:
# GH#39122
warnings.warn(
"The behavior of array concatenation with empty entries is "
"deprecated. In a future version, this will no longer exclude "
"empty items when determining the result dtype. "
"To retain the old behavior, exclude the empty entries before "
"the concat operation.",
FutureWarning,
stacklevel=find_stack_level(),
)
if target_dtype is not None:
to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat]
if not isinstance(to_concat[0], np.ndarray):
# i.e. isinstance(to_concat[0], ExtensionArray)
to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
cls = type(to_concat[0])
# GH#53640: eg. for datetime array, axis=1 but 0 is default
# However, class method `_concat_same_type()` for some classes
# may not support the `axis` keyword
if ea_compat_axis or axis == 0:
return cls._concat_same_type(to_concat_eas)
else:
return cls._concat_same_type(
to_concat_eas,
axis=axis, # type: ignore[call-arg]
)
else:
to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
result = np.concatenate(to_concat_arrs, axis=axis)
if not any_ea and "b" in kinds and result.dtype.kind in "iuf":
# GH#39817 cast to object instead of casting bools to numeric
result = result.astype(object, copy=False)
return result
def _get_result_dtype(
to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike]
) -> tuple[bool, set[str], DtypeObj | None]:
target_dtype = None
dtypes = {obj.dtype for obj in to_concat}
kinds = {obj.dtype.kind for obj in to_concat}
any_ea = any(not isinstance(x, np.ndarray) for x in to_concat)
if any_ea:
# i.e. any ExtensionArrays
# we ignore axis here, as internally concatting with EAs is always
# for axis=0
if len(dtypes) != 1:
target_dtype = find_common_type([x.dtype for x in to_concat])
target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
elif not len(non_empties):
# we have all empties, but may need to coerce the result dtype to
# object if we have non-numeric type operands (numpy would otherwise
# cast this to float)
if len(kinds) != 1:
if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}):
# let numpy coerce
pass
else:
# coerce to object
target_dtype = np.dtype(object)
kinds = {"o"}
else:
# error: Argument 1 to "np_find_common_type" has incompatible type
# "*Set[Union[ExtensionDtype, Any]]"; expected "dtype[Any]"
target_dtype = np_find_common_type(*dtypes) # type: ignore[arg-type]
return any_ea, kinds, target_dtype
def union_categoricals(
to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
"""
Combine list-like of Categorical-like, unioning categories.
All categories must have the same dtype.
Parameters
----------
to_union : list-like
Categorical, CategoricalIndex, or Series with dtype='category'.
sort_categories : bool, default False
If true, resulting categories will be lexsorted, otherwise
they will be ordered as they appear in the data.
ignore_order : bool, default False
If true, the ordered attribute of the Categoricals will be ignored.
Results in an unordered categorical.
Returns
-------
Categorical
Raises
------
TypeError
- all inputs do not have the same dtype
- all inputs do not have the same ordered property
- all inputs are ordered and their categories are not identical
- sort_categories=True and Categoricals are ordered
ValueError
Empty list of categoricals passed
Notes
-----
To learn more about categories, see `link
<https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__
Examples
--------
If you want to combine categoricals that do not necessarily have
the same categories, `union_categoricals` will combine a list-like
of categoricals. The new categories will be the union of the
categories being combined.
>>> a = pd.Categorical(["b", "c"])
>>> b = pd.Categorical(["a", "b"])
>>> pd.api.types.union_categoricals([a, b])
['b', 'c', 'a', 'b']
Categories (3, object): ['b', 'c', 'a']
By default, the resulting categories will be ordered as they appear
in the `categories` of the data. If you want the categories to be
lexsorted, use `sort_categories=True` argument.
>>> pd.api.types.union_categoricals([a, b], sort_categories=True)
['b', 'c', 'a', 'b']
Categories (3, object): ['a', 'b', 'c']
`union_categoricals` also works with the case of combining two
categoricals of the same categories and order information (e.g. what
you could also `append` for).
>>> a = pd.Categorical(["a", "b"], ordered=True)
>>> b = pd.Categorical(["a", "b", "a"], ordered=True)
>>> pd.api.types.union_categoricals([a, b])
['a', 'b', 'a', 'b', 'a']
Categories (2, object): ['a' < 'b']
Raises `TypeError` because the categories are ordered and not identical.
>>> a = pd.Categorical(["a", "b"], ordered=True)
>>> b = pd.Categorical(["a", "b", "c"], ordered=True)
>>> pd.api.types.union_categoricals([a, b])
Traceback (most recent call last):
...
TypeError: to union ordered Categoricals, all categories must be the same
Ordered categoricals with different categories or orderings can be
combined by using the `ignore_order=True` argument.
>>> a = pd.Categorical(["a", "b", "c"], ordered=True)
>>> b = pd.Categorical(["c", "b", "a"], ordered=True)
>>> pd.api.types.union_categoricals([a, b], ignore_order=True)
['a', 'b', 'c', 'c', 'b', 'a']
Categories (3, object): ['a', 'b', 'c']
`union_categoricals` also works with a `CategoricalIndex`, or `Series`
containing categorical data, but note that the resulting array will
always be a plain `Categorical`
>>> a = pd.Series(["b", "c"], dtype='category')
>>> b = pd.Series(["a", "b"], dtype='category')
>>> pd.api.types.union_categoricals([a, b])
['b', 'c', 'a', 'b']
Categories (3, object): ['b', 'c', 'a']
"""
from pandas import Categorical
from pandas.core.arrays.categorical import recode_for_categories
if len(to_union) == 0:
raise ValueError("No Categoricals to union")
def _maybe_unwrap(x):
if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
return x._values
elif isinstance(x, Categorical):
return x
else:
raise TypeError("all components to combine must be Categorical")
to_union = [_maybe_unwrap(x) for x in to_union]
first = to_union[0]
if not lib.dtypes_all_equal([obj.categories.dtype for obj in to_union]):
raise TypeError("dtype of categories must be the same")
ordered = False
if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
# identical categories - fastpath
categories = first.categories
ordered = first.ordered
all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
new_codes = np.concatenate(all_codes)
if sort_categories and not ignore_order and ordered:
raise TypeError("Cannot use sort_categories=True with ordered Categoricals")
if sort_categories and not categories.is_monotonic_increasing:
categories = categories.sort_values()
indexer = categories.get_indexer(first.categories)
from pandas.core.algorithms import take_nd
new_codes = take_nd(indexer, new_codes, fill_value=-1)
elif ignore_order or all(not c.ordered for c in to_union):
# different categories - union and recode
cats = first.categories.append([c.categories for c in to_union[1:]])
categories = cats.unique()
if sort_categories:
categories = categories.sort_values()
new_codes = [
recode_for_categories(c.codes, c.categories, categories) for c in to_union
]
new_codes = np.concatenate(new_codes)
else:
# ordered - to show a proper error message
if all(c.ordered for c in to_union):
msg = "to union ordered Categoricals, all categories must be the same"
raise TypeError(msg)
raise TypeError("Categorical.ordered must be the same")
if ignore_order:
ordered = False
dtype = CategoricalDtype(categories=categories, ordered=ordered)
return Categorical._simple_new(new_codes, dtype=dtype)
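A short sketch of how the private `concat_compat` helper above behaves; this is internal API, shown only to illustrate the dtype handling.

import numpy as np

from pandas.core.dtypes.concat import concat_compat

# Equal dtypes take the fastpath and simply np.concatenate.
same = concat_compat([np.array([1, 2], dtype="int64"), np.array([3], dtype="int64")])
assert same.dtype == np.dtype("int64")

# Mixing bool with integer kinds falls back to object instead of a numeric
# common dtype (GH#39817, see the `"b" in kinds` check above).
mixed = concat_compat([np.array([True, False]), np.array([1, 2], dtype="int64")])
assert mixed.dtype == np.dtype("object")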

File diff suppressed because it is too large

View File

@ -0,0 +1,147 @@
""" define generic base classes for pandas objects """
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Type,
cast,
)
if TYPE_CHECKING:
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Index,
IntervalIndex,
MultiIndex,
PeriodIndex,
RangeIndex,
Series,
TimedeltaIndex,
)
from pandas.core.arrays import (
DatetimeArray,
ExtensionArray,
NumpyExtensionArray,
PeriodArray,
TimedeltaArray,
)
from pandas.core.generic import NDFrame
# define abstract base classes to enable isinstance type checking on our
# objects
def create_pandas_abc_type(name, attr, comp):
def _check(inst) -> bool:
return getattr(inst, attr, "_typ") in comp
# https://github.com/python/mypy/issues/1006
# error: 'classmethod' used with a non-method
@classmethod # type: ignore[misc]
def _instancecheck(cls, inst) -> bool:
return _check(inst) and not isinstance(inst, type)
@classmethod # type: ignore[misc]
def _subclasscheck(cls, inst) -> bool:
# Raise instead of returning False
# This is consistent with default __subclasscheck__ behavior
if not isinstance(inst, type):
raise TypeError("issubclass() arg 1 must be a class")
return _check(inst)
dct = {"__instancecheck__": _instancecheck, "__subclasscheck__": _subclasscheck}
meta = type("ABCBase", (type,), dct)
return meta(name, (), dct)
ABCRangeIndex = cast(
"Type[RangeIndex]",
create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)),
)
ABCMultiIndex = cast(
"Type[MultiIndex]",
create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)),
)
ABCDatetimeIndex = cast(
"Type[DatetimeIndex]",
create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)),
)
ABCTimedeltaIndex = cast(
"Type[TimedeltaIndex]",
create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)),
)
ABCPeriodIndex = cast(
"Type[PeriodIndex]",
create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)),
)
ABCCategoricalIndex = cast(
"Type[CategoricalIndex]",
create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)),
)
ABCIntervalIndex = cast(
"Type[IntervalIndex]",
create_pandas_abc_type("ABCIntervalIndex", "_typ", ("intervalindex",)),
)
ABCIndex = cast(
"Type[Index]",
create_pandas_abc_type(
"ABCIndex",
"_typ",
{
"index",
"rangeindex",
"multiindex",
"datetimeindex",
"timedeltaindex",
"periodindex",
"categoricalindex",
"intervalindex",
},
),
)
ABCNDFrame = cast(
"Type[NDFrame]",
create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")),
)
ABCSeries = cast(
"Type[Series]",
create_pandas_abc_type("ABCSeries", "_typ", ("series",)),
)
ABCDataFrame = cast(
"Type[DataFrame]", create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",))
)
ABCCategorical = cast(
"Type[Categorical]",
create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")),
)
ABCDatetimeArray = cast(
"Type[DatetimeArray]",
create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")),
)
ABCTimedeltaArray = cast(
"Type[TimedeltaArray]",
create_pandas_abc_type("ABCTimedeltaArray", "_typ", ("timedeltaarray")),
)
ABCPeriodArray = cast(
"Type[PeriodArray]",
create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",)),
)
ABCExtensionArray = cast(
"Type[ExtensionArray]",
create_pandas_abc_type(
"ABCExtensionArray",
"_typ",
# Note: IntervalArray and SparseArray are included bc they have _typ="extension"
{"extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"},
),
)
ABCNumpyExtensionArray = cast(
"Type[NumpyExtensionArray]",
create_pandas_abc_type("ABCNumpyExtensionArray", "_typ", ("npy_extension",)),
)
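A brief sketch of how these ABCs are used: isinstance/issubclass dispatch on the `_typ` attribute, which lets internal code type-check without importing the concrete classes.

import pandas as pd

from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCSeries

s = pd.Series([1, 2, 3])

assert isinstance(s, ABCSeries)       # Series._typ == "series"
assert isinstance(s.index, ABCIndex)  # RangeIndex._typ == "rangeindex" is in the ABCIndex set
assert not isinstance(s, ABCDataFrame)

# __subclasscheck__ works the same way, but raises for non-class arguments.
assert issubclass(pd.DataFrame, ABCDataFrame)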

View File

@ -0,0 +1,437 @@
""" basic inference routines """
from __future__ import annotations
from collections import abc
from numbers import Number
import re
from re import Pattern
from typing import TYPE_CHECKING
import numpy as np
from pandas._libs import lib
if TYPE_CHECKING:
from collections.abc import Hashable
from pandas._typing import TypeGuard
is_bool = lib.is_bool
is_integer = lib.is_integer
is_float = lib.is_float
is_complex = lib.is_complex
is_scalar = lib.is_scalar
is_decimal = lib.is_decimal
is_interval = lib.is_interval
is_list_like = lib.is_list_like
is_iterator = lib.is_iterator
def is_number(obj) -> TypeGuard[Number | np.number]:
"""
Check if the object is a number.
Returns True when the object is a number, and False if is not.
Parameters
----------
obj : any type
The object to check if is a number.
Returns
-------
bool
Whether `obj` is a number or not.
See Also
--------
api.types.is_integer: Checks a subgroup of numbers.
Examples
--------
>>> from pandas.api.types import is_number
>>> is_number(1)
True
>>> is_number(7.15)
True
Booleans are valid because they are a subclass of int.
>>> is_number(False)
True
>>> is_number("foo")
False
>>> is_number("5")
False
"""
return isinstance(obj, (Number, np.number))
def iterable_not_string(obj) -> bool:
"""
Check if the object is an iterable but not a string.
Parameters
----------
obj : The object to check.
Returns
-------
is_iter_not_string : bool
Whether `obj` is a non-string iterable.
Examples
--------
>>> iterable_not_string([1, 2, 3])
True
>>> iterable_not_string("foo")
False
>>> iterable_not_string(1)
False
"""
return isinstance(obj, abc.Iterable) and not isinstance(obj, str)
def is_file_like(obj) -> bool:
"""
Check if the object is a file-like object.
For objects to be considered file-like, they must
be an iterator AND have a `read` and/or `write`
method as an attribute.
Note: file-like objects must be iterable, but
iterable objects need not be file-like.
Parameters
----------
obj : The object to check
Returns
-------
bool
Whether `obj` has file-like properties.
Examples
--------
>>> import io
>>> from pandas.api.types import is_file_like
>>> buffer = io.StringIO("data")
>>> is_file_like(buffer)
True
>>> is_file_like([1, 2, 3])
False
"""
if not (hasattr(obj, "read") or hasattr(obj, "write")):
return False
return bool(hasattr(obj, "__iter__"))
def is_re(obj) -> TypeGuard[Pattern]:
"""
Check if the object is a regex pattern instance.
Parameters
----------
obj : The object to check
Returns
-------
bool
Whether `obj` is a regex pattern.
Examples
--------
>>> from pandas.api.types import is_re
>>> import re
>>> is_re(re.compile(".*"))
True
>>> is_re("foo")
False
"""
return isinstance(obj, Pattern)
def is_re_compilable(obj) -> bool:
"""
Check if the object can be compiled into a regex pattern instance.
Parameters
----------
obj : The object to check
Returns
-------
bool
Whether `obj` can be compiled as a regex pattern.
Examples
--------
>>> from pandas.api.types import is_re_compilable
>>> is_re_compilable(".*")
True
>>> is_re_compilable(1)
False
"""
try:
re.compile(obj)
except TypeError:
return False
else:
return True
def is_array_like(obj) -> bool:
"""
Check if the object is array-like.
For an object to be considered array-like, it must be list-like and
have a `dtype` attribute.
Parameters
----------
obj : The object to check
Returns
-------
is_array_like : bool
Whether `obj` has array-like properties.
Examples
--------
>>> is_array_like(np.array([1, 2, 3]))
True
>>> is_array_like(pd.Series(["a", "b"]))
True
>>> is_array_like(pd.Index(["2016-01-01"]))
True
>>> is_array_like([1, 2, 3])
False
>>> is_array_like(("a", "b"))
False
"""
return is_list_like(obj) and hasattr(obj, "dtype")
def is_nested_list_like(obj) -> bool:
"""
Check if the object is list-like, and that all of its elements
are also list-like.
Parameters
----------
obj : The object to check
Returns
-------
is_list_like : bool
Whether `obj` has list-like properties.
Examples
--------
>>> is_nested_list_like([[1, 2, 3]])
True
>>> is_nested_list_like([{1, 2, 3}, {1, 2, 3}])
True
>>> is_nested_list_like(["foo"])
False
>>> is_nested_list_like([])
False
>>> is_nested_list_like([[1, 2, 3], 1])
False
Notes
-----
This won't reliably detect whether a consumable iterator (e.g.
a generator) is a nested-list-like without consuming the iterator.
To avoid consuming it, we always return False if the outer container
doesn't define `__len__`.
See Also
--------
is_list_like
"""
return (
is_list_like(obj)
and hasattr(obj, "__len__")
and len(obj) > 0
and all(is_list_like(item) for item in obj)
)
def is_dict_like(obj) -> bool:
"""
Check if the object is dict-like.
Parameters
----------
obj : The object to check
Returns
-------
bool
Whether `obj` has dict-like properties.
Examples
--------
>>> from pandas.api.types import is_dict_like
>>> is_dict_like({1: 2})
True
>>> is_dict_like([1, 2, 3])
False
>>> is_dict_like(dict)
False
>>> is_dict_like(dict())
True
"""
dict_like_attrs = ("__getitem__", "keys", "__contains__")
return (
all(hasattr(obj, attr) for attr in dict_like_attrs)
# [GH 25196] exclude classes
and not isinstance(obj, type)
)
def is_named_tuple(obj) -> bool:
"""
Check if the object is a named tuple.
Parameters
----------
obj : The object to check
Returns
-------
bool
Whether `obj` is a named tuple.
Examples
--------
>>> from collections import namedtuple
>>> from pandas.api.types import is_named_tuple
>>> Point = namedtuple("Point", ["x", "y"])
>>> p = Point(1, 2)
>>>
>>> is_named_tuple(p)
True
>>> is_named_tuple((1, 2))
False
"""
return isinstance(obj, abc.Sequence) and hasattr(obj, "_fields")
def is_hashable(obj) -> TypeGuard[Hashable]:
"""
Return True if hash(obj) will succeed, False otherwise.
Some types will pass a test against collections.abc.Hashable but fail when
they are actually hashed with hash().
Distinguish between these and other types by trying the call to hash() and
seeing if they raise TypeError.
Returns
-------
bool
Examples
--------
>>> import collections
>>> from pandas.api.types import is_hashable
>>> a = ([],)
>>> isinstance(a, collections.abc.Hashable)
True
>>> is_hashable(a)
False
"""
# Unfortunately, we can't use isinstance(obj, collections.abc.Hashable),
# which can be faster than calling hash. That is because numpy scalars
# fail this test.
# Reconsider this decision once this numpy bug is fixed:
# https://github.com/numpy/numpy/issues/5562
try:
hash(obj)
except TypeError:
return False
else:
return True
def is_sequence(obj) -> bool:
"""
Check if the object is a sequence of objects.
String types are not included as sequences here.
Parameters
----------
obj : The object to check
Returns
-------
is_sequence : bool
Whether `obj` is a sequence of objects.
Examples
--------
>>> l = [1, 2, 3]
>>>
>>> is_sequence(l)
True
>>> is_sequence(iter(l))
False
"""
try:
iter(obj) # Can iterate over it.
len(obj) # Has a length associated with it.
return not isinstance(obj, (str, bytes))
except (TypeError, AttributeError):
return False
def is_dataclass(item) -> bool:
"""
Checks if the object is a data-class instance
Parameters
----------
item : object
Returns
-------
is_dataclass : bool
True if the item is an instance of a data-class,
will return False if you pass the data class itself
Examples
--------
>>> from dataclasses import dataclass
>>> @dataclass
... class Point:
... x: int
... y: int
>>> is_dataclass(Point)
False
>>> is_dataclass(Point(0,2))
True
"""
try:
import dataclasses
return dataclasses.is_dataclass(item) and not isinstance(item, type)
except ImportError:
return False
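A small sketch contrasting a few of these predicates; `is_sequence` and `is_nested_list_like` are private helpers, shown only to illustrate the string and iterator edge cases noted above.

from pandas.api.types import is_list_like

from pandas.core.dtypes.inference import is_nested_list_like, is_sequence

# Strings are iterable but deliberately excluded from list-like/sequence checks.
assert not is_list_like("abc")
assert not is_sequence("abc")

# Consumable iterators lack __len__, so these checks return False rather than
# consuming the iterator.
gen = (row for row in [[1, 2], [3, 4]])
assert not is_sequence(gen)
assert not is_nested_list_like(gen)
assert is_nested_list_like([[1, 2], [3, 4]])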

View File

@ -0,0 +1,810 @@
"""
missing types & inference
"""
from __future__ import annotations
from decimal import Decimal
from functools import partial
from typing import (
TYPE_CHECKING,
overload,
)
import warnings
import numpy as np
from pandas._config import get_option
from pandas._libs import lib
import pandas._libs.missing as libmissing
from pandas._libs.tslibs import (
NaT,
iNaT,
)
from pandas.core.dtypes.common import (
DT64NS_DTYPE,
TD64NS_DTYPE,
ensure_object,
is_scalar,
is_string_or_object_np_dtype,
)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
ExtensionDtype,
IntervalDtype,
PeriodDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCExtensionArray,
ABCIndex,
ABCMultiIndex,
ABCSeries,
)
from pandas.core.dtypes.inference import is_list_like
if TYPE_CHECKING:
from re import Pattern
from pandas._typing import (
ArrayLike,
DtypeObj,
NDFrame,
NDFrameT,
Scalar,
npt,
)
from pandas import Series
from pandas.core.indexes.base import Index
isposinf_scalar = libmissing.isposinf_scalar
isneginf_scalar = libmissing.isneginf_scalar
nan_checker = np.isnan
INF_AS_NA = False
_dtype_object = np.dtype("object")
_dtype_str = np.dtype(str)
@overload
def isna(obj: Scalar | Pattern) -> bool:
...
@overload
def isna(
obj: ArrayLike | Index | list,
) -> npt.NDArray[np.bool_]:
...
@overload
def isna(obj: NDFrameT) -> NDFrameT:
...
# handle unions
@overload
def isna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]:
...
@overload
def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
...
def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
"""
Detect missing values for an array-like object.
This function takes a scalar or array-like object and indicates
whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN``
in object arrays, ``NaT`` in datetimelike).
Parameters
----------
obj : scalar or array-like
Object to check for null or missing values.
Returns
-------
bool or array-like of bool
For scalar input, returns a scalar boolean.
For array input, returns an array of boolean indicating whether each
corresponding element is missing.
See Also
--------
notna : Boolean inverse of pandas.isna.
Series.isna : Detect missing values in a Series.
DataFrame.isna : Detect missing values in a DataFrame.
Index.isna : Detect missing values in an Index.
Examples
--------
Scalar arguments (including strings) result in a scalar boolean.
>>> pd.isna('dog')
False
>>> pd.isna(pd.NA)
True
>>> pd.isna(np.nan)
True
ndarrays result in an ndarray of booleans.
>>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
>>> array
array([[ 1., nan, 3.],
[ 4., 5., nan]])
>>> pd.isna(array)
array([[False, True, False],
[False, False, True]])
For indexes, an ndarray of booleans is returned.
>>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
... "2017-07-08"])
>>> index
DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
dtype='datetime64[ns]', freq=None)
>>> pd.isna(index)
array([False, False, True, False])
For Series and DataFrame, the same type is returned, containing booleans.
>>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
>>> df
0 1 2
0 ant bee cat
1 dog None fly
>>> pd.isna(df)
0 1 2
0 False False False
1 False True False
>>> pd.isna(df[1])
0 False
1 True
Name: 1, dtype: bool
"""
return _isna(obj)
isnull = isna
def _isna(obj, inf_as_na: bool = False):
"""
Detect missing values, treating None, NaN or NA as null. Infinite
values will also be treated as null if inf_as_na is True.
Parameters
----------
obj: ndarray or object value
Input array or scalar value.
inf_as_na: bool
Whether to treat infinity as null.
Returns
-------
boolean ndarray or boolean
"""
if is_scalar(obj):
return libmissing.checknull(obj, inf_as_na=inf_as_na)
elif isinstance(obj, ABCMultiIndex):
raise NotImplementedError("isna is not defined for MultiIndex")
elif isinstance(obj, type):
return False
elif isinstance(obj, (np.ndarray, ABCExtensionArray)):
return _isna_array(obj, inf_as_na=inf_as_na)
elif isinstance(obj, ABCIndex):
# Try to use cached isna, which also short-circuits for integer dtypes
# and avoids materializing RangeIndex._values
if not obj._can_hold_na:
return obj.isna()
return _isna_array(obj._values, inf_as_na=inf_as_na)
elif isinstance(obj, ABCSeries):
result = _isna_array(obj._values, inf_as_na=inf_as_na)
# box
result = obj._constructor(result, index=obj.index, name=obj.name, copy=False)
return result
elif isinstance(obj, ABCDataFrame):
return obj.isna()
elif isinstance(obj, list):
return _isna_array(np.asarray(obj, dtype=object), inf_as_na=inf_as_na)
elif hasattr(obj, "__array__"):
return _isna_array(np.asarray(obj), inf_as_na=inf_as_na)
else:
return False
def _use_inf_as_na(key) -> None:
"""
Option change callback for na/inf behaviour.
Choose which replacement for numpy.isnan / -numpy.isfinite is used.
Parameters
----------
flag: bool
True means treat None, NaN, INF, -INF as null (old way),
False means None and NaN are null, but INF, -INF are not null
(new way).
Notes
-----
This approach to setting global module values is discussed and
approved here:
* https://stackoverflow.com/questions/4859217/
programmatically-creating-variables-in-python/4859312#4859312
"""
inf_as_na = get_option(key)
globals()["_isna"] = partial(_isna, inf_as_na=inf_as_na)
if inf_as_na:
globals()["nan_checker"] = lambda x: ~np.isfinite(x)
globals()["INF_AS_NA"] = True
else:
globals()["nan_checker"] = np.isnan
globals()["INF_AS_NA"] = False
def _isna_array(values: ArrayLike, inf_as_na: bool = False):
"""
Return an array indicating which values of the input array are NaN / NA.
Parameters
----------
values: ndarray or ExtensionArray
The input array whose elements are to be checked.
inf_as_na: bool
Whether or not to treat infinite values as NA.
Returns
-------
array-like
Array of boolean values denoting the NA status of each element.
"""
dtype = values.dtype
if not isinstance(values, np.ndarray):
# i.e. ExtensionArray
if inf_as_na and isinstance(dtype, CategoricalDtype):
result = libmissing.isnaobj(values.to_numpy(), inf_as_na=inf_as_na)
else:
# error: Incompatible types in assignment (expression has type
# "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has
# type "ndarray[Any, dtype[bool_]]")
result = values.isna() # type: ignore[assignment]
elif isinstance(values, np.rec.recarray):
# GH 48526
result = _isna_recarray_dtype(values, inf_as_na=inf_as_na)
elif is_string_or_object_np_dtype(values.dtype):
result = _isna_string_dtype(values, inf_as_na=inf_as_na)
elif dtype.kind in "mM":
# this is the NaT pattern
result = values.view("i8") == iNaT
else:
if inf_as_na:
result = ~np.isfinite(values)
else:
result = np.isnan(values)
return result
def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]:
# Working around NumPy ticket 1542
dtype = values.dtype
if dtype.kind in ("S", "U"):
result = np.zeros(values.shape, dtype=bool)
else:
if values.ndim in {1, 2}:
result = libmissing.isnaobj(values, inf_as_na=inf_as_na)
else:
# 0-D, reached via e.g. mask_missing
result = libmissing.isnaobj(values.ravel(), inf_as_na=inf_as_na)
result = result.reshape(values.shape)
return result
def _has_record_inf_value(record_as_array: np.ndarray) -> np.bool_:
is_inf_in_record = np.zeros(len(record_as_array), dtype=bool)
for i, value in enumerate(record_as_array):
is_element_inf = False
try:
is_element_inf = np.isinf(value)
except TypeError:
is_element_inf = False
is_inf_in_record[i] = is_element_inf
return np.any(is_inf_in_record)
def _isna_recarray_dtype(
values: np.rec.recarray, inf_as_na: bool
) -> npt.NDArray[np.bool_]:
result = np.zeros(values.shape, dtype=bool)
for i, record in enumerate(values):
record_as_array = np.array(record.tolist())
does_record_contain_nan = isna_all(record_as_array)
does_record_contain_inf = False
if inf_as_na:
does_record_contain_inf = bool(_has_record_inf_value(record_as_array))
result[i] = np.any(
np.logical_or(does_record_contain_nan, does_record_contain_inf)
)
return result
@overload
def notna(obj: Scalar) -> bool:
...
@overload
def notna(
obj: ArrayLike | Index | list,
) -> npt.NDArray[np.bool_]:
...
@overload
def notna(obj: NDFrameT) -> NDFrameT:
...
# handle unions
@overload
def notna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]:
...
@overload
def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
...
def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
"""
Detect non-missing values for an array-like object.
This function takes a scalar or array-like object and indicates
whether values are valid (not missing, which is ``NaN`` in numeric
arrays, ``None`` or ``NaN`` in object arrays, ``NaT`` in datetimelike).
Parameters
----------
obj : array-like or object value
Object to check for *not* null or *non*-missing values.
Returns
-------
bool or array-like of bool
For scalar input, returns a scalar boolean.
For array input, returns an array of boolean indicating whether each
corresponding element is valid.
See Also
--------
isna : Boolean inverse of pandas.notna.
Series.notna : Detect valid values in a Series.
DataFrame.notna : Detect valid values in a DataFrame.
Index.notna : Detect valid values in an Index.
Examples
--------
Scalar arguments (including strings) result in a scalar boolean.
>>> pd.notna('dog')
True
>>> pd.notna(pd.NA)
False
>>> pd.notna(np.nan)
False
ndarrays result in an ndarray of booleans.
>>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
>>> array
array([[ 1., nan, 3.],
[ 4., 5., nan]])
>>> pd.notna(array)
array([[ True, False, True],
[ True, True, False]])
For indexes, an ndarray of booleans is returned.
>>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
... "2017-07-08"])
>>> index
DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
dtype='datetime64[ns]', freq=None)
>>> pd.notna(index)
array([ True, True, False, True])
For Series and DataFrame, the same type is returned, containing booleans.
>>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
>>> df
0 1 2
0 ant bee cat
1 dog None fly
>>> pd.notna(df)
0 1 2
0 True True True
1 True False True
>>> pd.notna(df[1])
0 True
1 False
Name: 1, dtype: bool
"""
res = isna(obj)
if isinstance(res, bool):
return not res
return ~res
notnull = notna
def array_equivalent(
left,
right,
strict_nan: bool = False,
dtype_equal: bool = False,
) -> bool:
"""
True if two arrays, left and right, have equal non-NaN elements, and NaNs
in corresponding locations. False otherwise. It is assumed that left and
right are NumPy arrays of the same dtype. The behavior of this function
(particularly with respect to NaNs) is not defined if the dtypes are
different.
Parameters
----------
left, right : ndarrays
strict_nan : bool, default False
If True, consider NaN and None to be different.
dtype_equal : bool, default False
Whether `left` and `right` are known to have the same dtype
according to `is_dtype_equal`. Some methods like `BlockManager.equals`
require that the dtypes match. Setting this to ``True`` can improve
performance, but will give different results for arrays that are
equal but different dtypes.
Returns
-------
b : bool
Returns True if the arrays are equivalent.
Examples
--------
>>> array_equivalent(
... np.array([1, 2, np.nan]),
... np.array([1, 2, np.nan]))
True
>>> array_equivalent(
... np.array([1, np.nan, 2]),
... np.array([1, 2, np.nan]))
False
"""
left, right = np.asarray(left), np.asarray(right)
# shape compat
if left.shape != right.shape:
return False
if dtype_equal:
# fastpath when we require that the dtypes match (Block.equals)
if left.dtype.kind in "fc":
return _array_equivalent_float(left, right)
elif left.dtype.kind in "mM":
return _array_equivalent_datetimelike(left, right)
elif is_string_or_object_np_dtype(left.dtype):
# TODO: fastpath for pandas' StringDtype
return _array_equivalent_object(left, right, strict_nan)
else:
return np.array_equal(left, right)
# Slow path when we allow comparing different dtypes.
# Object arrays can contain None, NaN and NaT.
# string dtypes must come to this path for NumPy 1.7.1 compat
if left.dtype.kind in "OSU" or right.dtype.kind in "OSU":
# Note: `in "OSU"` is non-trivially faster than `in ["O", "S", "U"]`
# or `in ("O", "S", "U")`
return _array_equivalent_object(left, right, strict_nan)
# NaNs can occur in float and complex arrays.
if left.dtype.kind in "fc":
if not (left.size and right.size):
return True
return ((left == right) | (isna(left) & isna(right))).all()
elif left.dtype.kind in "mM" or right.dtype.kind in "mM":
# datetime64, timedelta64, Period
if left.dtype != right.dtype:
return False
left = left.view("i8")
right = right.view("i8")
# if we have structured dtypes, compare first
if (
left.dtype.type is np.void or right.dtype.type is np.void
) and left.dtype != right.dtype:
return False
return np.array_equal(left, right)
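One behavior worth spelling out: with the default `strict_nan=False`, `None` and `NaN` compare as equivalent missing values in object arrays, whereas `strict_nan=True` tells them apart. A minimal sketch:

import numpy as np

from pandas.core.dtypes.missing import array_equivalent

left = np.array([1.0, np.nan], dtype=object)
right = np.array([1.0, None], dtype=object)

assert array_equivalent(left, right)                       # NaN and None both count as missing
assert not array_equivalent(left, right, strict_nan=True)  # distinguished when strict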
def _array_equivalent_float(left: np.ndarray, right: np.ndarray) -> bool:
return bool(((left == right) | (np.isnan(left) & np.isnan(right))).all())
def _array_equivalent_datetimelike(left: np.ndarray, right: np.ndarray):
return np.array_equal(left.view("i8"), right.view("i8"))
def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool):
left = ensure_object(left)
right = ensure_object(right)
mask: npt.NDArray[np.bool_] | None = None
if strict_nan:
mask = isna(left) & isna(right)
if not mask.any():
mask = None
try:
if mask is None:
return lib.array_equivalent_object(left, right)
if not lib.array_equivalent_object(left[~mask], right[~mask]):
return False
left_remaining = left[mask]
right_remaining = right[mask]
except ValueError:
# can raise a ValueError if left and right cannot be
# compared (e.g. nested arrays)
left_remaining = left
right_remaining = right
for left_value, right_value in zip(left_remaining, right_remaining):
if left_value is NaT and right_value is not NaT:
return False
elif left_value is libmissing.NA and right_value is not libmissing.NA:
return False
elif isinstance(left_value, float) and np.isnan(left_value):
if not isinstance(right_value, float) or not np.isnan(right_value):
return False
else:
with warnings.catch_warnings():
# suppress numpy's "elementwise comparison failed"
warnings.simplefilter("ignore", DeprecationWarning)
try:
if np.any(np.asarray(left_value != right_value)):
return False
except TypeError as err:
if "boolean value of NA is ambiguous" in str(err):
return False
raise
except ValueError:
# numpy can raise a ValueError if left and right cannot be
# compared (e.g. nested arrays)
return False
return True
def array_equals(left: ArrayLike, right: ArrayLike) -> bool:
"""
ExtensionArray-compatible implementation of array_equivalent.
"""
if left.dtype != right.dtype:
return False
elif isinstance(left, ABCExtensionArray):
return left.equals(right)
else:
return array_equivalent(left, right, dtype_equal=True)
def infer_fill_value(val):
"""
infer the fill value for the nan/NaT from the provided
scalar/ndarray/list-like if we are a NaT, return the correct dtyped
element to provide proper block construction
"""
if not is_list_like(val):
val = [val]
val = np.asarray(val)
if val.dtype.kind in "mM":
return np.array("NaT", dtype=val.dtype)
elif val.dtype == object:
dtype = lib.infer_dtype(ensure_object(val), skipna=False)
if dtype in ["datetime", "datetime64"]:
return np.array("NaT", dtype=DT64NS_DTYPE)
elif dtype in ["timedelta", "timedelta64"]:
return np.array("NaT", dtype=TD64NS_DTYPE)
return np.array(np.nan, dtype=object)
elif val.dtype.kind == "U":
return np.array(np.nan, dtype=val.dtype)
return np.nan
def construct_1d_array_from_inferred_fill_value(
value: object, length: int
) -> ArrayLike:
# Find our empty_value dtype by constructing an array
# from our value and doing a .take on it
from pandas.core.algorithms import take_nd
from pandas.core.construction import sanitize_array
from pandas.core.indexes.base import Index
arr = sanitize_array(value, Index(range(1)), copy=False)
taker = -1 * np.ones(length, dtype=np.intp)
return take_nd(arr, taker)
def maybe_fill(arr: np.ndarray) -> np.ndarray:
"""
Fill numpy.ndarray with NaN, unless we have an integer or boolean dtype.
"""
if arr.dtype.kind not in "iub":
arr.fill(np.nan)
return arr
def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
"""
Return a dtype compat na value
Parameters
----------
dtype : string / dtype
compat : bool, default True
Returns
-------
np.dtype or a pandas dtype
Examples
--------
>>> na_value_for_dtype(np.dtype('int64'))
0
>>> na_value_for_dtype(np.dtype('int64'), compat=False)
nan
>>> na_value_for_dtype(np.dtype('float64'))
nan
>>> na_value_for_dtype(np.dtype('bool'))
False
>>> na_value_for_dtype(np.dtype('datetime64[ns]'))
numpy.datetime64('NaT')
"""
if isinstance(dtype, ExtensionDtype):
return dtype.na_value
elif dtype.kind in "mM":
unit = np.datetime_data(dtype)[0]
return dtype.type("NaT", unit)
elif dtype.kind == "f":
return np.nan
elif dtype.kind in "iu":
if compat:
return 0
return np.nan
elif dtype.kind == "b":
if compat:
return False
return np.nan
return np.nan
def remove_na_arraylike(arr: Series | Index | np.ndarray):
"""
Return array-like containing only true/non-NaN values, possibly empty.
"""
if isinstance(arr.dtype, ExtensionDtype):
return arr[notna(arr)]
else:
return arr[notna(np.asarray(arr))]
def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
"""
isna check that excludes incompatible dtypes
Parameters
----------
obj : object
dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype
Returns
-------
bool
"""
if not lib.is_scalar(obj) or not isna(obj):
return False
elif dtype.kind == "M":
if isinstance(dtype, np.dtype):
# i.e. not tzaware
return not isinstance(obj, (np.timedelta64, Decimal))
# we have to rule out tznaive dt64("NaT")
return not isinstance(obj, (np.timedelta64, np.datetime64, Decimal))
elif dtype.kind == "m":
return not isinstance(obj, (np.datetime64, Decimal))
elif dtype.kind in "iufc":
# Numeric
return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64))
elif dtype.kind == "b":
# We allow pd.NA, None, np.nan in BooleanArray (same as IntervalDtype)
return lib.is_float(obj) or obj is None or obj is libmissing.NA
elif dtype == _dtype_str:
# numpy string dtypes to avoid float np.nan
return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal, float))
elif dtype == _dtype_object:
# This is needed for Categorical, but is kind of weird
return True
elif isinstance(dtype, PeriodDtype):
return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
elif isinstance(dtype, IntervalDtype):
return lib.is_float(obj) or obj is None or obj is libmissing.NA
elif isinstance(dtype, CategoricalDtype):
return is_valid_na_for_dtype(obj, dtype.categories.dtype)
# fallback, default to allowing NaN, None, NA, NaT
return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
def isna_all(arr: ArrayLike) -> bool:
"""
Optimized equivalent to isna(arr).all()
"""
total_len = len(arr)
# Usually it's enough to check but a small fraction of values to see if
# a block is NOT null, chunks should help in such cases.
# parameters 1000 and 40 were chosen arbitrarily
chunk_len = max(total_len // 40, 1000)
dtype = arr.dtype
if lib.is_np_dtype(dtype, "f"):
checker = nan_checker
elif (lib.is_np_dtype(dtype, "mM")) or isinstance(
dtype, (DatetimeTZDtype, PeriodDtype)
):
# error: Incompatible types in assignment (expression has type
# "Callable[[Any], Any]", variable has type "ufunc")
checker = lambda x: np.asarray(x.view("i8")) == iNaT # type: ignore[assignment]
else:
# error: Incompatible types in assignment (expression has type "Callable[[Any],
# Any]", variable has type "ufunc")
checker = lambda x: _isna_array( # type: ignore[assignment]
x, inf_as_na=INF_AS_NA
)
return all(
checker(arr[i : i + chunk_len]).all() for i in range(0, total_len, chunk_len)
)
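A short sketch of two helpers in this file that carry no doctests; both are private, and the dtypes below are only illustrative.

import numpy as np
import pandas as pd

from pandas.core.dtypes.missing import is_valid_na_for_dtype, isna_all

# NaT is a valid missing marker for datetime64 but not for float dtypes.
assert is_valid_na_for_dtype(pd.NaT, np.dtype("datetime64[ns]"))
assert not is_valid_na_for_dtype(pd.NaT, np.dtype("float64"))

# isna_all scans in chunks and is equivalent to isna(arr).all().
assert isna_all(np.array([np.nan] * 5))
assert not isna_all(np.arange(5.0))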

View File

@ -0,0 +1,117 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import weakref
if TYPE_CHECKING:
from pandas.core.generic import NDFrame
class Flags:
"""
Flags that apply to pandas objects.
Parameters
----------
obj : Series or DataFrame
The object these flags are associated with.
allows_duplicate_labels : bool, default True
Whether to allow duplicate labels in this object. By default,
duplicate labels are permitted. Setting this to ``False`` will
cause an :class:`errors.DuplicateLabelError` to be raised when
`index` (or columns for DataFrame) is not unique, or any
subsequent operation introduces duplicates.
See :ref:`duplicates.disallow` for more.
.. warning::
This is an experimental feature. Currently, many methods fail to
propagate the ``allows_duplicate_labels`` value. In future versions
it is expected that every method taking or returning one or more
DataFrame or Series objects will propagate ``allows_duplicate_labels``.
Examples
--------
Attributes can be set in two ways:
>>> df = pd.DataFrame()
>>> df.flags
<Flags(allows_duplicate_labels=True)>
>>> df.flags.allows_duplicate_labels = False
>>> df.flags
<Flags(allows_duplicate_labels=False)>
>>> df.flags['allows_duplicate_labels'] = True
>>> df.flags
<Flags(allows_duplicate_labels=True)>
"""
_keys: set[str] = {"allows_duplicate_labels"}
def __init__(self, obj: NDFrame, *, allows_duplicate_labels: bool) -> None:
self._allows_duplicate_labels = allows_duplicate_labels
self._obj = weakref.ref(obj)
@property
def allows_duplicate_labels(self) -> bool:
"""
Whether this object allows duplicate labels.
Setting ``allows_duplicate_labels=False`` ensures that the
index (and columns of a DataFrame) are unique. Most methods
that accept and return a Series or DataFrame will propagate
the value of ``allows_duplicate_labels``.
See :ref:`duplicates` for more.
See Also
--------
DataFrame.attrs : Set global metadata on this object.
DataFrame.set_flags : Set global flags on this object.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a'])
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Traceback (most recent call last):
...
pandas.errors.DuplicateLabelError: Index has duplicates.
positions
label
a [0, 1]
"""
return self._allows_duplicate_labels
@allows_duplicate_labels.setter
def allows_duplicate_labels(self, value: bool) -> None:
value = bool(value)
obj = self._obj()
if obj is None:
raise ValueError("This flag's object has been deleted.")
if not value:
for ax in obj.axes:
ax._maybe_check_unique()
self._allows_duplicate_labels = value
def __getitem__(self, key: str):
if key not in self._keys:
raise KeyError(key)
return getattr(self, key)
def __setitem__(self, key: str, value) -> None:
if key not in self._keys:
raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}")
setattr(self, key, value)
def __repr__(self) -> str:
return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>"
def __eq__(self, other) -> bool:
if isinstance(other, type(self)):
return self.allows_duplicate_labels == other.allows_duplicate_labels
return False
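A brief sketch of the dict-style access shown above; note the asymmetry that ``__getitem__`` raises ``KeyError`` for unknown keys while ``__setitem__`` raises ``ValueError``.

import pandas as pd

df = pd.DataFrame({"A": [1, 2]})

assert df.flags["allows_duplicate_labels"]
df.flags["allows_duplicate_labels"] = False
assert df.flags == pd.DataFrame().set_flags(allows_duplicate_labels=False).flags

try:
    df.flags["not_a_flag"] = True
except ValueError:
    pass  # only keys listed in Flags._keys are accepted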

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,15 @@
from pandas.core.groupby.generic import (
DataFrameGroupBy,
NamedAgg,
SeriesGroupBy,
)
from pandas.core.groupby.groupby import GroupBy
from pandas.core.groupby.grouper import Grouper
__all__ = [
"DataFrameGroupBy",
"NamedAgg",
"SeriesGroupBy",
"GroupBy",
"Grouper",
]

View File

@ -0,0 +1,121 @@
"""
Provide basic components for groupby.
"""
from __future__ import annotations
import dataclasses
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Hashable
@dataclasses.dataclass(order=True, frozen=True)
class OutputKey:
label: Hashable
position: int
# special case to prevent duplicate plots when catching exceptions when
# forwarding methods from NDFrames
plotting_methods = frozenset(["plot", "hist"])
# cythonized transformations or canned "agg+broadcast", which do not
# require postprocessing of the result by transform.
cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"])
# List of aggregation/reduction functions.
# These map each group to a single numeric value
reduction_kernels = frozenset(
[
"all",
"any",
"corrwith",
"count",
"first",
"idxmax",
"idxmin",
"last",
"max",
"mean",
"median",
"min",
"nunique",
"prod",
# as long as `quantile`'s signature accepts only
# a single quantile value, it's a reduction.
# GH#27526 might change that.
"quantile",
"sem",
"size",
"skew",
"std",
"sum",
"var",
]
)
# List of transformation functions.
# a transformation is a function that, for each group,
# produces a result that has the same shape as the group.
transformation_kernels = frozenset(
[
"bfill",
"cumcount",
"cummax",
"cummin",
"cumprod",
"cumsum",
"diff",
"ffill",
"fillna",
"ngroup",
"pct_change",
"rank",
"shift",
]
)
# these are all the public methods on Grouper which don't belong
# in either of the above lists
groupby_other_methods = frozenset(
[
"agg",
"aggregate",
"apply",
"boxplot",
# corr and cov return ngroups*ncolumns rows, so they
# are neither a transformation nor a reduction
"corr",
"cov",
"describe",
"dtypes",
"expanding",
"ewm",
"filter",
"get_group",
"groups",
"head",
"hist",
"indices",
"ndim",
"ngroups",
"nth",
"ohlc",
"pipe",
"plot",
"resample",
"rolling",
"tail",
"take",
"transform",
"sample",
"value_counts",
]
)
# Valid values of `name` for `groupby.transform(name)`
# NOTE: do NOT edit this directly. New additions should be inserted
# into the appropriate list above.
transform_kernel_allowlist = reduction_kernels | transformation_kernels
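A quick sanity sketch of the relationships between these sets; the names are private module constants.

from pandas.core.groupby.base import (
    reduction_kernels,
    transform_kernel_allowlist,
    transformation_kernels,
)

# Reductions and transformations are disjoint, and the transform() allowlist
# is exactly their union.
assert not (reduction_kernels & transformation_kernels)
assert "sum" in reduction_kernels and "cumsum" in transformation_kernels
assert transform_kernel_allowlist == reduction_kernels | transformation_kernels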

View File

@ -0,0 +1,87 @@
from __future__ import annotations
import numpy as np
from pandas.core.algorithms import unique1d
from pandas.core.arrays.categorical import (
Categorical,
CategoricalDtype,
recode_for_categories,
)
def recode_for_groupby(
c: Categorical, sort: bool, observed: bool
) -> tuple[Categorical, Categorical | None]:
"""
Code the categories to ensure we can groupby for categoricals.
If observed=True, we return a new Categorical with the observed
categories only.
If sort=False, return a copy of self, coded with categories as
returned by .unique(), followed by any categories not appearing in
the data. If sort=True, return self.
This method is needed solely to ensure the categorical index of the
GroupBy result has categories in the order of appearance in the data
(GH-8868).
Parameters
----------
c : Categorical
sort : bool
The value of the sort parameter groupby was called with.
observed : bool
Account only for the observed values
Returns
-------
Categorical
If sort=False, the new categories are set to the order of
appearance in codes (unless ordered=True, in which case the
original order is preserved), followed by any unrepresented
categories in the original order.
Categorical or None
If we are observed, return the original categorical, otherwise None
"""
# we only care about observed values
if observed:
# In cases with c.ordered, this is equivalent to
# return c.remove_unused_categories(), c
unique_codes = unique1d(c.codes)
take_codes = unique_codes[unique_codes != -1]
if sort:
take_codes = np.sort(take_codes)
# we recode according to the uniques
categories = c.categories.take(take_codes)
codes = recode_for_categories(c.codes, c.categories, categories)
# return a new categorical that maps our new codes
# and categories
dtype = CategoricalDtype(categories, ordered=c.ordered)
return Categorical._simple_new(codes, dtype=dtype), c
# Already sorted according to c.categories; all is fine
if sort:
return c, None
# sort=False should order groups in as-encountered order (GH-8868)
# xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
all_codes = np.arange(c.categories.nunique())
# GH 38140: exclude nan from indexer for categories
unique_notnan_codes = unique1d(c.codes[c.codes != -1])
if sort:
unique_notnan_codes = np.sort(unique_notnan_codes)
if len(all_codes) > len(unique_notnan_codes):
# GH 13179: All categories need to be present, even if missing from the data
missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
take_codes = np.concatenate((unique_notnan_codes, missing_codes))
else:
take_codes = unique_notnan_codes
return Categorical(c, c.unique().categories.take(take_codes)), None
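A sketch of the two paths above, assuming a pandas version where ``Categorical.unique`` preserves the original categories; ``recode_for_groupby`` is internal API.

import pandas as pd

from pandas.core.groupby.categorical import recode_for_groupby

cat = pd.Categorical(["b", "a", "b"], categories=["a", "b", "c"])

# observed=True keeps only the observed categories and returns the original.
coded, original = recode_for_groupby(cat, sort=True, observed=True)
assert list(coded.categories) == ["a", "b"]
assert original is cat

# observed=False with sort=False orders categories by appearance ("b", "a"),
# then appends the unobserved "c" (GH-8868 / GH 13179).
coded, original = recode_for_groupby(cat, sort=False, observed=False)
assert list(coded.categories) == ["b", "a", "c"]
assert original is None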

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,304 @@
from __future__ import annotations
from collections.abc import Iterable
from typing import (
TYPE_CHECKING,
Literal,
cast,
)
import numpy as np
from pandas.util._decorators import (
cache_readonly,
doc,
)
from pandas.core.dtypes.common import (
is_integer,
is_list_like,
)
if TYPE_CHECKING:
from pandas._typing import PositionalIndexer
from pandas import (
DataFrame,
Series,
)
from pandas.core.groupby import groupby
class GroupByIndexingMixin:
"""
Mixin for adding ._positional_selector to GroupBy.
"""
@cache_readonly
def _positional_selector(self) -> GroupByPositionalSelector:
"""
Return positional selection for each group.
``groupby._positional_selector[i:j]`` is similar to
``groupby.apply(lambda x: x.iloc[i:j])``
but much faster and preserves the original index and order.
``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head`
and :meth:`~GroupBy.tail`. For example:
- ``head(5)``
- ``_positional_selector[5:-5]``
- ``tail(5)``
together return all the rows.
Allowed inputs for the index are:
- An integer valued iterable, e.g. ``range(2, 4)``.
- A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``.
The output format is the same as :meth:`~GroupBy.head` and
:meth:`~GroupBy.tail`, namely
a subset of the ``DataFrame`` or ``Series`` with the index and order preserved.
Returns
-------
Series
The filtered subset of the original Series.
DataFrame
The filtered subset of the original DataFrame.
See Also
--------
DataFrame.iloc : Purely integer-location based indexing for selection by
position.
GroupBy.head : Return first n rows of each group.
GroupBy.tail : Return last n rows of each group.
GroupBy.nth : Take the nth row from each group if n is an int, or a
subset of rows, if n is a list of ints.
Notes
-----
- The slice step cannot be negative.
- If the index specification results in overlaps, the item is not duplicated.
- If the index specification changes the order of items, then
they are returned in their original order.
By contrast, ``DataFrame.iloc`` can change the row order.
- ``groupby()`` parameters such as as_index and dropna are ignored.
The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth`
with ``as_index=False`` are:
- Input to ``_positional_selector`` can include
one or more slices whereas ``nth``
just handles an integer or a list of integers.
- ``_positional_selector`` can accept a slice relative to the
last row of each group.
- ``_positional_selector`` does not have an equivalent to the
``nth()`` ``dropna`` parameter.
Examples
--------
>>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
... columns=["A", "B"])
>>> df.groupby("A")._positional_selector[1:2]
A B
1 a 2
4 b 5
>>> df.groupby("A")._positional_selector[1, -1]
A B
1 a 2
2 a 3
4 b 5
"""
if TYPE_CHECKING:
# pylint: disable-next=used-before-assignment
groupby_self = cast(groupby.GroupBy, self)
else:
groupby_self = self
return GroupByPositionalSelector(groupby_self)
def _make_mask_from_positional_indexer(
self,
arg: PositionalIndexer | tuple,
) -> np.ndarray:
if is_list_like(arg):
if all(is_integer(i) for i in cast(Iterable, arg)):
mask = self._make_mask_from_list(cast(Iterable[int], arg))
else:
mask = self._make_mask_from_tuple(cast(tuple, arg))
elif isinstance(arg, slice):
mask = self._make_mask_from_slice(arg)
elif is_integer(arg):
mask = self._make_mask_from_int(cast(int, arg))
else:
raise TypeError(
f"Invalid index {type(arg)}. "
"Must be integer, list-like, slice or a tuple of "
"integers and slices"
)
if isinstance(mask, bool):
if mask:
mask = self._ascending_count >= 0
else:
mask = self._ascending_count < 0
return cast(np.ndarray, mask)
def _make_mask_from_int(self, arg: int) -> np.ndarray:
if arg >= 0:
return self._ascending_count == arg
else:
return self._descending_count == (-arg - 1)
def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray:
positive = [arg for arg in args if arg >= 0]
negative = [-arg - 1 for arg in args if arg < 0]
mask: bool | np.ndarray = False
if positive:
mask |= np.isin(self._ascending_count, positive)
if negative:
mask |= np.isin(self._descending_count, negative)
return mask
def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray:
mask: bool | np.ndarray = False
for arg in args:
if is_integer(arg):
mask |= self._make_mask_from_int(cast(int, arg))
elif isinstance(arg, slice):
mask |= self._make_mask_from_slice(arg)
else:
raise ValueError(
f"Invalid argument {type(arg)}. Should be int or slice."
)
return mask
def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray:
start = arg.start
stop = arg.stop
step = arg.step
if step is not None and step < 0:
raise ValueError(f"Invalid step {step}. Must be non-negative")
mask: bool | np.ndarray = True
if step is None:
step = 1
if start is None:
if step > 1:
mask &= self._ascending_count % step == 0
elif start >= 0:
mask &= self._ascending_count >= start
if step > 1:
mask &= (self._ascending_count - start) % step == 0
else:
mask &= self._descending_count < -start
offset_array = self._descending_count + start + 1
limit_array = (
self._ascending_count + self._descending_count + (start + 1)
) < 0
offset_array = np.where(limit_array, self._ascending_count, offset_array)
mask &= offset_array % step == 0
if stop is not None:
if stop >= 0:
mask &= self._ascending_count < stop
else:
mask &= self._descending_count >= -stop
return mask
@cache_readonly
def _ascending_count(self) -> np.ndarray:
if TYPE_CHECKING:
groupby_self = cast(groupby.GroupBy, self)
else:
groupby_self = self
return groupby_self._cumcount_array()
@cache_readonly
def _descending_count(self) -> np.ndarray:
if TYPE_CHECKING:
groupby_self = cast(groupby.GroupBy, self)
else:
groupby_self = self
return groupby_self._cumcount_array(ascending=False)
@doc(GroupByIndexingMixin._positional_selector)
class GroupByPositionalSelector:
def __init__(self, groupby_object: groupby.GroupBy) -> None:
self.groupby_object = groupby_object
def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series:
"""
Select by positional index per group.
Implements GroupBy._positional_selector
Parameters
----------
arg : PositionalIndexer | tuple
Allowed values are:
- int
- int valued iterable such as list or range
- slice with step either None or positive
- tuple of integers and slices
Returns
-------
Series
The filtered subset of the original groupby Series.
DataFrame
The filtered subset of the original groupby DataFrame.
See Also
--------
DataFrame.iloc : Integer-location based indexing for selection by position.
GroupBy.head : Return first n rows of each group.
GroupBy.tail : Return last n rows of each group.
GroupBy._positional_selector : Return positional selection for each group.
GroupBy.nth : Take the nth row from each group if n is an int, or a
subset of rows, if n is a list of ints.
"""
mask = self.groupby_object._make_mask_from_positional_indexer(arg)
return self.groupby_object._mask_selected_obj(mask)
class GroupByNthSelector:
"""
Dynamically substituted for GroupBy.nth to enable both call and index
"""
def __init__(self, groupby_object: groupby.GroupBy) -> None:
self.groupby_object = groupby_object
def __call__(
self,
n: PositionalIndexer | tuple,
dropna: Literal["any", "all", None] = None,
) -> DataFrame | Series:
return self.groupby_object._nth(n, dropna)
def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series:
return self.groupby_object._nth(n)
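A short sketch of the two selectors defined above; ``_positional_selector`` is private, while ``nth`` is exposed publicly through ``GroupByNthSelector``.

import pandas as pd

df = pd.DataFrame({"A": ["a", "a", "a", "b", "b"], "B": [1, 2, 3, 4, 5]})
gb = df.groupby("A")

# GroupByNthSelector allows nth to be used both as a call and as an indexer.
assert gb.nth(1).equals(gb.nth[1])

# Slices can be taken relative to the end of each group.
assert list(gb._positional_selector[-1:]["B"]) == [3, 5]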

View File

@ -0,0 +1,181 @@
"""Common utilities for Numba operations with groupby ops"""
from __future__ import annotations
import functools
import inspect
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.core.util.numba_ import (
NumbaUtilError,
jit_user_function,
)
if TYPE_CHECKING:
from pandas._typing import Scalar
def validate_udf(func: Callable) -> None:
"""
Validate user defined function for ops when using Numba with groupby ops.
The first two arguments of the signature must be ``values`` and ``index``:
def f(values, index, ...):
...
Parameters
----------
func : function
user defined function
Returns
-------
None
Raises
------
NumbaUtilError
"""
if not callable(func):
raise NotImplementedError(
"Numba engine can only be used with a single function."
)
udf_signature = list(inspect.signature(func).parameters.keys())
expected_args = ["values", "index"]
min_number_args = len(expected_args)
if (
len(udf_signature) < min_number_args
or udf_signature[:min_number_args] != expected_args
):
raise NumbaUtilError(
f"The first {min_number_args} arguments to {func.__name__} must be "
f"{expected_args}"
)
@functools.cache
def generate_numba_agg_func(
func: Callable[..., Scalar],
nopython: bool,
nogil: bool,
parallel: bool,
) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
"""
Generate a numba jitted agg function specified by values from engine_kwargs.
1. jit the user's function
2. Return a groupby agg function with the jitted function inline
Configurations specified in engine_kwargs apply to both the user's
function _AND_ the groupby evaluation loop.
Parameters
----------
func : function
function to be applied to each group and will be JITed
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
Returns
-------
Numba function
"""
numba_func = jit_user_function(func)
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def group_agg(
values: np.ndarray,
index: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
num_columns: int,
*args: Any,
) -> np.ndarray:
assert len(begin) == len(end)
num_groups = len(begin)
result = np.empty((num_groups, num_columns))
for i in numba.prange(num_groups):
group_index = index[begin[i] : end[i]]
for j in numba.prange(num_columns):
group = values[begin[i] : end[i], j]
result[i, j] = numba_func(group, group_index, *args)
return result
return group_agg
@functools.cache
def generate_numba_transform_func(
func: Callable[..., np.ndarray],
nopython: bool,
nogil: bool,
parallel: bool,
) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
"""
Generate a numba jitted transform function specified by values from engine_kwargs.
1. jit the user's function
2. Return a groupby transform function with the jitted function inline
Configurations specified in engine_kwargs apply to both the user's
function _AND_ the groupby evaluation loop.
Parameters
----------
func : function
function to be applied to each window and will be JITed
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
Returns
-------
Numba function
"""
numba_func = jit_user_function(func)
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def group_transform(
values: np.ndarray,
index: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
num_columns: int,
*args: Any,
) -> np.ndarray:
assert len(begin) == len(end)
num_groups = len(begin)
result = np.empty((len(values), num_columns))
for i in numba.prange(num_groups):
group_index = index[begin[i] : end[i]]
for j in numba.prange(num_columns):
group = values[begin[i] : end[i], j]
result[begin[i] : end[i], j] = numba_func(group, group_index, *args)
return result
return group_transform
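# Usage sketch (not part of the module above): the signature rule enforced by
# validate_udf, and how such a UDF is typically handed to the numba engine.
# Assumes numba is installed; illustrative only.
import numpy as np
import pandas as pd

def group_mean(values, index):  # first two parameters must be named values, index
    return np.mean(values)

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"], "val": [1.0, 2.0, 4.0, 8.0, 16.0]})
out = df.groupby("key")["val"].agg(group_mean, engine="numba")
print(out)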

File diff suppressed because it is too large

View File

@ -0,0 +1,31 @@
from pandas.core.indexers.utils import (
check_array_indexer,
check_key_length,
check_setitem_lengths,
disallow_ndim_indexing,
is_empty_indexer,
is_list_like_indexer,
is_scalar_indexer,
is_valid_positional_slice,
length_of_indexer,
maybe_convert_indices,
unpack_1tuple,
unpack_tuple_and_ellipses,
validate_indices,
)
__all__ = [
"is_valid_positional_slice",
"is_list_like_indexer",
"is_scalar_indexer",
"is_empty_indexer",
"check_setitem_lengths",
"validate_indices",
"maybe_convert_indices",
"length_of_indexer",
"disallow_ndim_indexing",
"unpack_1tuple",
"check_key_length",
"check_array_indexer",
"unpack_tuple_and_ellipses",
]

View File

@ -0,0 +1,453 @@
"""Indexer objects for computing start/end window bounds for rolling operations"""
from __future__ import annotations
from datetime import timedelta
import numpy as np
from pandas._libs.tslibs import BaseOffset
from pandas._libs.window.indexers import calculate_variable_window_bounds
from pandas.util._decorators import Appender
from pandas.core.dtypes.common import ensure_platform_int
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.tseries.offsets import Nano
get_window_bounds_doc = """
Computes the bounds of a window.
Parameters
----------
num_values : int, default 0
number of values that will be aggregated over
window_size : int, default 0
the number of rows in a window
min_periods : int, default None
min_periods passed from the top level rolling API
center : bool, default None
center passed from the top level rolling API
closed : str, default None
closed passed from the top level rolling API
step : int, default None
step passed from the top level rolling API
.. versionadded:: 1.5
win_type : str, default None
win_type passed from the top level rolling API
Returns
-------
A tuple of ndarray[int64]s, indicating the boundaries of each
window
"""
class BaseIndexer:
"""
Base class for window bounds calculations.
Examples
--------
>>> from pandas.api.indexers import BaseIndexer
>>> class CustomIndexer(BaseIndexer):
... def get_window_bounds(self, num_values, min_periods, center, closed, step):
... start = np.empty(num_values, dtype=np.int64)
... end = np.empty(num_values, dtype=np.int64)
... for i in range(num_values):
... start[i] = i
... end[i] = i + self.window_size
... return start, end
>>> df = pd.DataFrame({"values": range(5)})
>>> indexer = CustomIndexer(window_size=2)
>>> df.rolling(indexer).sum()
values
0 1.0
1 3.0
2 5.0
3 7.0
4 4.0
"""
def __init__(
self, index_array: np.ndarray | None = None, window_size: int = 0, **kwargs
) -> None:
self.index_array = index_array
self.window_size = window_size
# Set user defined kwargs as attributes that can be used in get_window_bounds
for key, value in kwargs.items():
setattr(self, key, value)
@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
raise NotImplementedError
class FixedWindowIndexer(BaseIndexer):
"""Creates window boundaries that are of fixed length."""
@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
if center or self.window_size == 0:
offset = (self.window_size - 1) // 2
else:
offset = 0
end = np.arange(1 + offset, num_values + 1 + offset, step, dtype="int64")
start = end - self.window_size
if closed in ["left", "both"]:
start -= 1
if closed in ["left", "neither"]:
end -= 1
end = np.clip(end, 0, num_values)
start = np.clip(start, 0, num_values)
return start, end
class VariableWindowIndexer(BaseIndexer):
"""Creates window boundaries that are of variable length, namely for time series."""
@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
# error: Argument 4 to "calculate_variable_window_bounds" has incompatible
# type "Optional[bool]"; expected "bool"
# error: Argument 6 to "calculate_variable_window_bounds" has incompatible
# type "Optional[ndarray]"; expected "ndarray"
return calculate_variable_window_bounds(
num_values,
self.window_size,
min_periods,
center, # type: ignore[arg-type]
closed,
self.index_array, # type: ignore[arg-type]
)
class VariableOffsetWindowIndexer(BaseIndexer):
"""
Calculate window boundaries based on a non-fixed offset such as a BusinessDay.
Examples
--------
>>> from pandas.api.indexers import VariableOffsetWindowIndexer
>>> df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10))
>>> offset = pd.offsets.BDay(1)
>>> indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset)
>>> df
0
2020-01-01 0
2020-01-02 1
2020-01-03 2
2020-01-04 3
2020-01-05 4
2020-01-06 5
2020-01-07 6
2020-01-08 7
2020-01-09 8
2020-01-10 9
>>> df.rolling(indexer).sum()
0
2020-01-01 0.0
2020-01-02 1.0
2020-01-03 2.0
2020-01-04 3.0
2020-01-05 7.0
2020-01-06 12.0
2020-01-07 6.0
2020-01-08 7.0
2020-01-09 8.0
2020-01-10 9.0
"""
def __init__(
self,
index_array: np.ndarray | None = None,
window_size: int = 0,
index: DatetimeIndex | None = None,
offset: BaseOffset | None = None,
**kwargs,
) -> None:
super().__init__(index_array, window_size, **kwargs)
if not isinstance(index, DatetimeIndex):
raise ValueError("index must be a DatetimeIndex.")
self.index = index
if not isinstance(offset, BaseOffset):
raise ValueError("offset must be a DateOffset-like object.")
self.offset = offset
@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
if step is not None:
raise NotImplementedError("step not implemented for variable offset window")
if num_values <= 0:
return np.empty(0, dtype="int64"), np.empty(0, dtype="int64")
# if the window is variable, default is 'right', otherwise default is 'both'
if closed is None:
closed = "right" if self.index is not None else "both"
right_closed = closed in ["right", "both"]
left_closed = closed in ["left", "both"]
if self.index[num_values - 1] < self.index[0]:
index_growth_sign = -1
else:
index_growth_sign = 1
offset_diff = index_growth_sign * self.offset
start = np.empty(num_values, dtype="int64")
start.fill(-1)
end = np.empty(num_values, dtype="int64")
end.fill(-1)
start[0] = 0
# right endpoint is closed
if right_closed:
end[0] = 1
# right endpoint is open
else:
end[0] = 0
zero = timedelta(0)
# start is start of slice interval (including)
# end is end of slice interval (not including)
for i in range(1, num_values):
end_bound = self.index[i]
start_bound = end_bound - offset_diff
# left endpoint is closed
if left_closed:
start_bound -= Nano(1)
# advance the start bound until we are
# within the constraint
start[i] = i
for j in range(start[i - 1], i):
start_diff = (self.index[j] - start_bound) * index_growth_sign
if start_diff > zero:
start[i] = j
break
# end bound is previous end
# or current index
end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign
if end_diff == zero and not right_closed:
end[i] = end[i - 1] + 1
elif end_diff <= zero:
end[i] = i + 1
else:
end[i] = end[i - 1]
# right endpoint is open
if not right_closed:
end[i] -= 1
return start, end
class ExpandingIndexer(BaseIndexer):
"""Calculate expanding window bounds, mimicking df.expanding()"""
@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
return (
np.zeros(num_values, dtype=np.int64),
np.arange(1, num_values + 1, dtype=np.int64),
)
class FixedForwardWindowIndexer(BaseIndexer):
"""
Creates window boundaries for fixed-length windows that include the current row.
Examples
--------
>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
>>> df
B
0 0.0
1 1.0
2 2.0
3 NaN
4 4.0
>>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2)
>>> df.rolling(window=indexer, min_periods=1).sum()
B
0 1.0
1 3.0
2 2.0
3 4.0
4 4.0
"""
@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
if center:
raise ValueError("Forward-looking windows can't have center=True")
if closed is not None:
raise ValueError(
"Forward-looking windows don't support setting the closed argument"
)
if step is None:
step = 1
start = np.arange(0, num_values, step, dtype="int64")
end = start + self.window_size
if self.window_size:
end = np.clip(end, 0, num_values)
return start, end
class GroupbyIndexer(BaseIndexer):
"""Calculate bounds to compute groupby rolling, mimicking df.groupby().rolling()"""
def __init__(
self,
index_array: np.ndarray | None = None,
window_size: int | BaseIndexer = 0,
groupby_indices: dict | None = None,
window_indexer: type[BaseIndexer] = BaseIndexer,
indexer_kwargs: dict | None = None,
**kwargs,
) -> None:
"""
Parameters
----------
index_array : np.ndarray or None
np.ndarray of the index of the original object that we are performing
a chained groupby operation over. This index has been pre-sorted relative to
the groups
window_size : int or BaseIndexer
window size during the windowing operation
groupby_indices : dict or None
dict of {group label: [positional index of rows belonging to the group]}
window_indexer : BaseIndexer
BaseIndexer class determining the start and end bounds of each group
indexer_kwargs : dict or None
Custom kwargs to be passed to window_indexer
**kwargs :
keyword arguments that will be available when get_window_bounds is called
"""
self.groupby_indices = groupby_indices or {}
self.window_indexer = window_indexer
self.indexer_kwargs = indexer_kwargs.copy() if indexer_kwargs else {}
super().__init__(
index_array=index_array,
window_size=self.indexer_kwargs.pop("window_size", window_size),
**kwargs,
)
@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
# 1) For each group, get the indices that belong to the group
# 2) Use the indices to calculate the start & end bounds of the window
# 3) Append the window bounds in group order
start_arrays = []
end_arrays = []
window_indices_start = 0
for key, indices in self.groupby_indices.items():
index_array: np.ndarray | None
if self.index_array is not None:
index_array = self.index_array.take(ensure_platform_int(indices))
else:
index_array = self.index_array
indexer = self.window_indexer(
index_array=index_array,
window_size=self.window_size,
**self.indexer_kwargs,
)
start, end = indexer.get_window_bounds(
len(indices), min_periods, center, closed, step
)
start = start.astype(np.int64)
end = end.astype(np.int64)
assert len(start) == len(
end
), "these should be equal in length from get_window_bounds"
# Cannot use groupby_indices as they might not be monotonic with the object
# we're rolling over
window_indices = np.arange(
window_indices_start, window_indices_start + len(indices)
)
window_indices_start += len(indices)
# Extend as we'll be slicing window like [start, end)
window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype(
np.int64, copy=False
)
start_arrays.append(window_indices.take(ensure_platform_int(start)))
end_arrays.append(window_indices.take(ensure_platform_int(end)))
if len(start_arrays) == 0:
return np.array([], dtype=np.int64), np.array([], dtype=np.int64)
start = np.concatenate(start_arrays)
end = np.concatenate(end_arrays)
return start, end
class ExponentialMovingWindowIndexer(BaseIndexer):
"""Calculate ewm window bounds (the entire window)"""
@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64)
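# Usage sketch (not part of the module above): inspecting the bounds produced by
# FixedForwardWindowIndexer, the public entry point for the forward-looking
# windows defined in this file; illustrative only.
import numpy as np
import pandas as pd
from pandas.api.indexers import FixedForwardWindowIndexer

indexer = FixedForwardWindowIndexer(window_size=3)
start, end = indexer.get_window_bounds(num_values=5, min_periods=None,
                                        center=None, closed=None, step=None)
print(start)  # [0 1 2 3 4]
print(end)    # [3 4 5 5 5] -- end bounds are clipped at num_values

# The same indexer drives a forward-looking rolling sum.
s = pd.Series(np.arange(5.0))
print(s.rolling(indexer, min_periods=1).sum())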

View File

@ -0,0 +1,553 @@
"""
Low-dependency indexing utilities.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._libs import lib
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_integer,
is_integer_dtype,
is_list_like,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCIndex,
ABCSeries,
)
if TYPE_CHECKING:
from pandas._typing import AnyArrayLike
from pandas.core.frame import DataFrame
from pandas.core.indexes.base import Index
# -----------------------------------------------------------
# Indexer Identification
def is_valid_positional_slice(slc: slice) -> bool:
"""
Check if a slice object can be interpreted as a positional indexer.
Parameters
----------
slc : slice
Returns
-------
bool
Notes
-----
A valid positional slice may also be interpreted as a label-based slice
depending on the index being sliced.
"""
return (
lib.is_int_or_none(slc.start)
and lib.is_int_or_none(slc.stop)
and lib.is_int_or_none(slc.step)
)
def is_list_like_indexer(key) -> bool:
"""
Check if we have a list-like indexer that is *not* a NamedTuple.
Parameters
----------
key : object
Returns
-------
bool
"""
# allow a list_like, but exclude NamedTuples which can be indexers
return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple)
def is_scalar_indexer(indexer, ndim: int) -> bool:
"""
Return True if we are all scalar indexers.
Parameters
----------
indexer : object
ndim : int
Number of dimensions in the object being indexed.
Returns
-------
bool
"""
if ndim == 1 and is_integer(indexer):
# GH37748: allow indexer to be an integer for Series
return True
if isinstance(indexer, tuple) and len(indexer) == ndim:
return all(is_integer(x) for x in indexer)
return False
def is_empty_indexer(indexer) -> bool:
"""
Check if we have an empty indexer.
Parameters
----------
indexer : object
Returns
-------
bool
"""
if is_list_like(indexer) and not len(indexer):
return True
if not isinstance(indexer, tuple):
indexer = (indexer,)
return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer)
# -----------------------------------------------------------
# Indexer Validation
def check_setitem_lengths(indexer, value, values) -> bool:
"""
Validate that value and indexer are the same length.
A special case is allowed when the indexer is a boolean array
and the number of true values equals the length of ``value``. In
this case, no exception is raised.
Parameters
----------
indexer : sequence
Key for the setitem.
value : array-like
Value for the setitem.
values : array-like
Values being set into.
Returns
-------
bool
Whether this is an empty listlike setting which is a no-op.
Raises
------
ValueError
When the indexer is an ndarray or list and the lengths don't match.
"""
no_op = False
if isinstance(indexer, (np.ndarray, list)):
# We can ignore other listlikes because they are either
# a) not necessarily 1-D indexers, e.g. tuple
# b) boolean indexers e.g. BoolArray
if is_list_like(value):
if len(indexer) != len(value) and values.ndim == 1:
# boolean with truth values == len of the value is ok too
if isinstance(indexer, list):
indexer = np.array(indexer)
if not (
isinstance(indexer, np.ndarray)
and indexer.dtype == np.bool_
and indexer.sum() == len(value)
):
raise ValueError(
"cannot set using a list-like indexer "
"with a different length than the value"
)
if not len(indexer):
no_op = True
elif isinstance(indexer, slice):
if is_list_like(value):
if len(value) != length_of_indexer(indexer, values) and values.ndim == 1:
# In case of two dimensional value is used row-wise and broadcasted
raise ValueError(
"cannot set using a slice indexer with a "
"different length than the value"
)
if not len(value):
no_op = True
return no_op
def validate_indices(indices: np.ndarray, n: int) -> None:
"""
Perform bounds-checking for an indexer.
-1 is allowed for indicating missing values.
Parameters
----------
indices : ndarray
n : int
Length of the array being indexed.
Raises
------
ValueError
Examples
--------
>>> validate_indices(np.array([1, 2]), 3) # OK
>>> validate_indices(np.array([1, -2]), 3)
Traceback (most recent call last):
...
ValueError: 'indices' contains values less than allowed (-2 < -1)
>>> validate_indices(np.array([1, 2, 3]), 3)
Traceback (most recent call last):
...
IndexError: indices are out-of-bounds
>>> validate_indices(np.array([-1, -1]), 0) # OK
>>> validate_indices(np.array([0, 1]), 0)
Traceback (most recent call last):
...
IndexError: indices are out-of-bounds
"""
if len(indices):
min_idx = indices.min()
if min_idx < -1:
msg = f"'indices' contains values less than allowed ({min_idx} < -1)"
raise ValueError(msg)
max_idx = indices.max()
if max_idx >= n:
raise IndexError("indices are out-of-bounds")
# -----------------------------------------------------------
# Indexer Conversion
def maybe_convert_indices(indices, n: int, verify: bool = True) -> np.ndarray:
"""
Attempt to convert indices into valid, positive indices.
If we have negative indices, translate to positive here.
If we have indices that are out-of-bounds, raise an IndexError.
Parameters
----------
indices : array-like
Array of indices that we are to convert.
n : int
Number of elements in the array that we are indexing.
verify : bool, default True
Check that all entries are between 0 and n - 1, inclusive.
Returns
-------
array-like
An array-like of positive indices that correspond to the ones
that were passed in initially to this function.
Raises
------
IndexError
One of the converted indices either exceeded the number of
elements (specified by `n`) or was still negative.
"""
if isinstance(indices, list):
indices = np.array(indices)
if len(indices) == 0:
# If `indices` is empty, np.array will return a float,
# and will cause indexing errors.
return np.empty(0, dtype=np.intp)
mask = indices < 0
if mask.any():
indices = indices.copy()
indices[mask] += n
if verify:
mask = (indices >= n) | (indices < 0)
if mask.any():
raise IndexError("indices are out-of-bounds")
return indices
# -----------------------------------------------------------
# Unsorted
def length_of_indexer(indexer, target=None) -> int:
"""
Return the expected length of target[indexer]
Returns
-------
int
"""
if target is not None and isinstance(indexer, slice):
target_len = len(target)
start = indexer.start
stop = indexer.stop
step = indexer.step
if start is None:
start = 0
elif start < 0:
start += target_len
if stop is None or stop > target_len:
stop = target_len
elif stop < 0:
stop += target_len
if step is None:
step = 1
elif step < 0:
start, stop = stop + 1, start + 1
step = -step
return (stop - start + step - 1) // step
elif isinstance(indexer, (ABCSeries, ABCIndex, np.ndarray, list)):
if isinstance(indexer, list):
indexer = np.array(indexer)
if indexer.dtype == bool:
# GH#25774
return indexer.sum()
return len(indexer)
elif isinstance(indexer, range):
return (indexer.stop - indexer.start) // indexer.step
elif not is_list_like_indexer(indexer):
return 1
raise AssertionError("cannot find the length of the indexer")
def disallow_ndim_indexing(result) -> None:
"""
Helper function to disallow multi-dimensional indexing on 1D Series/Index.
GH#27125 indexer like idx[:, None] expands dim, but we cannot do that
and keep an index, so we used to return ndarray, which was deprecated
in GH#30588.
"""
if np.ndim(result) > 1:
raise ValueError(
"Multi-dimensional indexing (e.g. `obj[:, None]`) is no longer "
"supported. Convert to a numpy array before indexing instead."
)
def unpack_1tuple(tup):
"""
If we have a length-1 tuple/list that contains a slice, unpack to just
the slice.
Notes
-----
The list case is deprecated.
"""
if len(tup) == 1 and isinstance(tup[0], slice):
# if we don't have a MultiIndex, we may still be able to handle
# a 1-tuple. see test_1tuple_without_multiindex
if isinstance(tup, list):
# GH#31299
raise ValueError(
"Indexing with a single-item list containing a "
"slice is not allowed. Pass a tuple instead.",
)
return tup[0]
return tup
def check_key_length(columns: Index, key, value: DataFrame) -> None:
"""
Checks if a key used as indexer has the same length as the columns it is
associated with.
Parameters
----------
columns : Index
    The columns of the DataFrame to index.
key : list-like
    The keys to index with.
value : DataFrame
    The value to set for the keys.
Raises
------
ValueError: If the length of key is not equal to the number of columns in value
or if the number of columns referenced by key is not equal to the number
of columns.
"""
if columns.is_unique:
if len(value.columns) != len(key):
raise ValueError("Columns must be same length as key")
else:
# Missing keys in columns are represented as -1
if len(columns.get_indexer_non_unique(key)[0]) != len(value.columns):
raise ValueError("Columns must be same length as key")
def unpack_tuple_and_ellipses(item: tuple):
"""
Possibly unpack arr[..., n] to arr[n]
"""
if len(item) > 1:
# Note: we are assuming this indexing is being done on a 1D arraylike
if item[0] is Ellipsis:
item = item[1:]
elif item[-1] is Ellipsis:
item = item[:-1]
if len(item) > 1:
raise IndexError("too many indices for array.")
item = item[0]
return item
# -----------------------------------------------------------
# Public indexer validation
def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
"""
Check if `indexer` is a valid array indexer for `array`.
For a boolean mask, `array` and `indexer` are checked to have the same
length. The dtype is validated, and if it is an integer or boolean
ExtensionArray, it is checked if there are missing values present, and
it is converted to the appropriate numpy array. Other dtypes will raise
an error.
Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed
through as is.
Parameters
----------
array : array-like
The array that is being indexed (only used for the length).
indexer : array-like or list-like
The array-like that's used to index. List-like input that is not yet
a numpy array or an ExtensionArray is converted to one. Other input
types are passed through as is.
Returns
-------
numpy.ndarray
The validated indexer as a numpy array that can be used to index.
Raises
------
IndexError
When the lengths don't match.
ValueError
When `indexer` cannot be converted to a numpy ndarray to index
(e.g. presence of missing values).
See Also
--------
api.types.is_bool_dtype : Check if `key` is of boolean dtype.
Examples
--------
When checking a boolean mask, a boolean ndarray is returned when the
arguments are all valid.
>>> mask = pd.array([True, False])
>>> arr = pd.array([1, 2])
>>> pd.api.indexers.check_array_indexer(arr, mask)
array([ True, False])
An IndexError is raised when the lengths don't match.
>>> mask = pd.array([True, False, True])
>>> pd.api.indexers.check_array_indexer(arr, mask)
Traceback (most recent call last):
...
IndexError: Boolean index has wrong length: 3 instead of 2.
NA values in a boolean array are treated as False.
>>> mask = pd.array([True, pd.NA])
>>> pd.api.indexers.check_array_indexer(arr, mask)
array([ True, False])
A numpy boolean mask will get passed through (if the length is correct):
>>> mask = np.array([True, False])
>>> pd.api.indexers.check_array_indexer(arr, mask)
array([ True, False])
Similarly for integer indexers, an integer ndarray is returned when it is
a valid indexer, otherwise an error is (for integer indexers, a matching
length is not required):
>>> indexer = pd.array([0, 2], dtype="Int64")
>>> arr = pd.array([1, 2, 3])
>>> pd.api.indexers.check_array_indexer(arr, indexer)
array([0, 2])
>>> indexer = pd.array([0, pd.NA], dtype="Int64")
>>> pd.api.indexers.check_array_indexer(arr, indexer)
Traceback (most recent call last):
...
ValueError: Cannot index with an integer indexer containing NA values
For non-integer/boolean dtypes, an appropriate error is raised:
>>> indexer = np.array([0., 2.], dtype="float64")
>>> pd.api.indexers.check_array_indexer(arr, indexer)
Traceback (most recent call last):
...
IndexError: arrays used as indices must be of integer or boolean type
"""
from pandas.core.construction import array as pd_array
# whatever is not an array-like is returned as-is (possible valid array
# indexers that are not array-like: integer, slice, Ellipsis, None)
# In this context, tuples are not considered as array-like, as they have
# a specific meaning in indexing (multi-dimensional indexing)
if is_list_like(indexer):
if isinstance(indexer, tuple):
return indexer
else:
return indexer
# convert list-likes to array
if not is_array_like(indexer):
indexer = pd_array(indexer)
if len(indexer) == 0:
# empty list is converted to float array by pd.array
indexer = np.array([], dtype=np.intp)
dtype = indexer.dtype
if is_bool_dtype(dtype):
if isinstance(dtype, ExtensionDtype):
indexer = indexer.to_numpy(dtype=bool, na_value=False)
else:
indexer = np.asarray(indexer, dtype=bool)
# GH26658
if len(indexer) != len(array):
raise IndexError(
f"Boolean index has wrong length: "
f"{len(indexer)} instead of {len(array)}"
)
elif is_integer_dtype(dtype):
try:
indexer = np.asarray(indexer, dtype=np.intp)
except ValueError as err:
raise ValueError(
"Cannot index with an integer indexer containing NA values"
) from err
else:
raise IndexError("arrays used as indices must be of integer or boolean type")
return indexer
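# Usage sketch (not part of the module above): behaviour of two helpers defined
# in this module, maybe_convert_indices and length_of_indexer (both private to
# pandas); illustrative only.
import numpy as np
from pandas.core.indexers.utils import length_of_indexer, maybe_convert_indices

# Negative positions are translated relative to n=5 and bounds-checked.
print(maybe_convert_indices(np.array([0, -1, -2]), n=5))  # [0 4 3]

# Expected length of target[indexer] for a slice over a 10-element target.
print(length_of_indexer(slice(2, None, 3), target=np.arange(10)))  # 3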

View File

@ -0,0 +1,643 @@
"""
datetimelike delegation
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
cast,
)
import warnings
import numpy as np
from pandas._libs import lib
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import (
is_integer_dtype,
is_list_like,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
DatetimeTZDtype,
PeriodDtype,
)
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.accessor import (
PandasDelegate,
delegate_names,
)
from pandas.core.arrays import (
DatetimeArray,
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray
from pandas.core.base import (
NoNewAttributesMixin,
PandasObject,
)
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex
if TYPE_CHECKING:
from pandas import (
DataFrame,
Series,
)
class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin):
_hidden_attrs = PandasObject._hidden_attrs | {
"orig",
"name",
}
def __init__(self, data: Series, orig) -> None:
if not isinstance(data, ABCSeries):
raise TypeError(
f"cannot convert an object of type {type(data)} to a datetimelike index"
)
self._parent = data
self.orig = orig
self.name = getattr(data, "name", None)
self._freeze()
def _get_values(self):
data = self._parent
if lib.is_np_dtype(data.dtype, "M"):
return DatetimeIndex(data, copy=False, name=self.name)
elif isinstance(data.dtype, DatetimeTZDtype):
return DatetimeIndex(data, copy=False, name=self.name)
elif lib.is_np_dtype(data.dtype, "m"):
return TimedeltaIndex(data, copy=False, name=self.name)
elif isinstance(data.dtype, PeriodDtype):
return PeriodArray(data, copy=False)
raise TypeError(
f"cannot convert an object of type {type(data)} to a datetimelike index"
)
def _delegate_property_get(self, name: str):
from pandas import Series
values = self._get_values()
result = getattr(values, name)
# maybe need to upcast (ints)
if isinstance(result, np.ndarray):
if is_integer_dtype(result):
result = result.astype("int64")
elif not is_list_like(result):
return result
result = np.asarray(result)
if self.orig is not None:
index = self.orig.index
else:
index = self._parent.index
# return the result as a Series
result = Series(result, index=index, name=self.name).__finalize__(self._parent)
# setting this object will show a SettingWithCopyWarning/Error
result._is_copy = (
"modifications to a property of a datetimelike "
"object are not supported and are discarded. "
"Change values on the original."
)
return result
def _delegate_property_set(self, name: str, value, *args, **kwargs):
raise ValueError(
"modifications to a property of a datetimelike object are not supported. "
"Change values on the original."
)
def _delegate_method(self, name: str, *args, **kwargs):
from pandas import Series
values = self._get_values()
method = getattr(values, name)
result = method(*args, **kwargs)
if not is_list_like(result):
return result
result = Series(result, index=self._parent.index, name=self.name).__finalize__(
self._parent
)
# setting this object will show a SettingWithCopyWarning/Error
result._is_copy = (
"modifications to a method of a datetimelike "
"object are not supported and are discarded. "
"Change values on the original."
)
return result
@delegate_names(
delegate=ArrowExtensionArray,
accessors=TimedeltaArray._datetimelike_ops,
typ="property",
accessor_mapping=lambda x: f"_dt_{x}",
raise_on_missing=False,
)
@delegate_names(
delegate=ArrowExtensionArray,
accessors=TimedeltaArray._datetimelike_methods,
typ="method",
accessor_mapping=lambda x: f"_dt_{x}",
raise_on_missing=False,
)
@delegate_names(
delegate=ArrowExtensionArray,
accessors=DatetimeArray._datetimelike_ops,
typ="property",
accessor_mapping=lambda x: f"_dt_{x}",
raise_on_missing=False,
)
@delegate_names(
delegate=ArrowExtensionArray,
accessors=DatetimeArray._datetimelike_methods,
typ="method",
accessor_mapping=lambda x: f"_dt_{x}",
raise_on_missing=False,
)
class ArrowTemporalProperties(PandasDelegate, PandasObject, NoNewAttributesMixin):
def __init__(self, data: Series, orig) -> None:
if not isinstance(data, ABCSeries):
raise TypeError(
f"cannot convert an object of type {type(data)} to a datetimelike index"
)
self._parent = data
self._orig = orig
self._freeze()
def _delegate_property_get(self, name: str):
if not hasattr(self._parent.array, f"_dt_{name}"):
raise NotImplementedError(
f"dt.{name} is not supported for {self._parent.dtype}"
)
result = getattr(self._parent.array, f"_dt_{name}")
if not is_list_like(result):
return result
if self._orig is not None:
index = self._orig.index
else:
index = self._parent.index
# return the result as a Series, which is by definition a copy
result = type(self._parent)(
result, index=index, name=self._parent.name
).__finalize__(self._parent)
return result
def _delegate_method(self, name: str, *args, **kwargs):
if not hasattr(self._parent.array, f"_dt_{name}"):
raise NotImplementedError(
f"dt.{name} is not supported for {self._parent.dtype}"
)
result = getattr(self._parent.array, f"_dt_{name}")(*args, **kwargs)
if self._orig is not None:
index = self._orig.index
else:
index = self._parent.index
# return the result as a Series, which is by definition a copy
result = type(self._parent)(
result, index=index, name=self._parent.name
).__finalize__(self._parent)
return result
def to_pytimedelta(self):
return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta()
def to_pydatetime(self):
# GH#20306
warnings.warn(
f"The behavior of {type(self).__name__}.to_pydatetime is deprecated, "
"in a future version this will return a Series containing python "
"datetime objects instead of an ndarray. To retain the old behavior, "
"call `np.array` on the result",
FutureWarning,
stacklevel=find_stack_level(),
)
return cast(ArrowExtensionArray, self._parent.array)._dt_to_pydatetime()
def isocalendar(self) -> DataFrame:
from pandas import DataFrame
result = (
cast(ArrowExtensionArray, self._parent.array)
._dt_isocalendar()
._pa_array.combine_chunks()
)
iso_calendar_df = DataFrame(
{
col: type(self._parent.array)(result.field(i)) # type: ignore[call-arg]
for i, col in enumerate(["year", "week", "day"])
}
)
return iso_calendar_df
@property
def components(self) -> DataFrame:
from pandas import DataFrame
components_df = DataFrame(
{
col: getattr(self._parent.array, f"_dt_{col}")
for col in [
"days",
"hours",
"minutes",
"seconds",
"milliseconds",
"microseconds",
"nanoseconds",
]
}
)
return components_df
@delegate_names(
delegate=DatetimeArray,
accessors=DatetimeArray._datetimelike_ops + ["unit"],
typ="property",
)
@delegate_names(
delegate=DatetimeArray,
accessors=DatetimeArray._datetimelike_methods + ["as_unit"],
typ="method",
)
class DatetimeProperties(Properties):
"""
Accessor object for datetimelike properties of the Series values.
Examples
--------
>>> seconds_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="s"))
>>> seconds_series
0 2000-01-01 00:00:00
1 2000-01-01 00:00:01
2 2000-01-01 00:00:02
dtype: datetime64[ns]
>>> seconds_series.dt.second
0 0
1 1
2 2
dtype: int32
>>> hours_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h"))
>>> hours_series
0 2000-01-01 00:00:00
1 2000-01-01 01:00:00
2 2000-01-01 02:00:00
dtype: datetime64[ns]
>>> hours_series.dt.hour
0 0
1 1
2 2
dtype: int32
>>> quarters_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="QE"))
>>> quarters_series
0 2000-03-31
1 2000-06-30
2 2000-09-30
dtype: datetime64[ns]
>>> quarters_series.dt.quarter
0 1
1 2
2 3
dtype: int32
Returns a Series indexed like the original Series.
Raises TypeError if the Series does not contain datetimelike values.
"""
def to_pydatetime(self) -> np.ndarray:
"""
Return the data as an array of :class:`datetime.datetime` objects.
.. deprecated:: 2.1.0
The current behavior of dt.to_pydatetime is deprecated.
In a future version this will return a Series containing python
datetime objects instead of a ndarray.
Timezone information is retained if present.
.. warning::
Python's datetime uses microsecond resolution, which is lower than
pandas (nanosecond). The values are truncated.
Returns
-------
numpy.ndarray
Object dtype array containing native Python datetime objects.
See Also
--------
datetime.datetime : Standard library value for a datetime.
Examples
--------
>>> s = pd.Series(pd.date_range('20180310', periods=2))
>>> s
0 2018-03-10
1 2018-03-11
dtype: datetime64[ns]
>>> s.dt.to_pydatetime()
array([datetime.datetime(2018, 3, 10, 0, 0),
datetime.datetime(2018, 3, 11, 0, 0)], dtype=object)
pandas' nanosecond precision is truncated to microseconds.
>>> s = pd.Series(pd.date_range('20180310', periods=2, freq='ns'))
>>> s
0 2018-03-10 00:00:00.000000000
1 2018-03-10 00:00:00.000000001
dtype: datetime64[ns]
>>> s.dt.to_pydatetime()
array([datetime.datetime(2018, 3, 10, 0, 0),
datetime.datetime(2018, 3, 10, 0, 0)], dtype=object)
"""
# GH#20306
warnings.warn(
f"The behavior of {type(self).__name__}.to_pydatetime is deprecated, "
"in a future version this will return a Series containing python "
"datetime objects instead of an ndarray. To retain the old behavior, "
"call `np.array` on the result",
FutureWarning,
stacklevel=find_stack_level(),
)
return self._get_values().to_pydatetime()
@property
def freq(self):
return self._get_values().inferred_freq
def isocalendar(self) -> DataFrame:
"""
Calculate year, week, and day according to the ISO 8601 standard.
Returns
-------
DataFrame
With columns year, week and day.
See Also
--------
Timestamp.isocalendar : Function return a 3-tuple containing ISO year,
week number, and weekday for the given Timestamp object.
datetime.date.isocalendar : Return a named tuple object with
three components: year, week and weekday.
Examples
--------
>>> ser = pd.to_datetime(pd.Series(["2010-01-01", pd.NaT]))
>>> ser.dt.isocalendar()
year week day
0 2009 53 5
1 <NA> <NA> <NA>
>>> ser.dt.isocalendar().week
0 53
1 <NA>
Name: week, dtype: UInt32
"""
return self._get_values().isocalendar().set_index(self._parent.index)
@delegate_names(
delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property"
)
@delegate_names(
delegate=TimedeltaArray,
accessors=TimedeltaArray._datetimelike_methods,
typ="method",
)
class TimedeltaProperties(Properties):
"""
Accessor object for datetimelike properties of the Series values.
Returns a Series indexed like the original Series.
Raises TypeError if the Series does not contain datetimelike values.
Examples
--------
>>> seconds_series = pd.Series(
... pd.timedelta_range(start="1 second", periods=3, freq="s")
... )
>>> seconds_series
0 0 days 00:00:01
1 0 days 00:00:02
2 0 days 00:00:03
dtype: timedelta64[ns]
>>> seconds_series.dt.seconds
0 1
1 2
2 3
dtype: int32
"""
def to_pytimedelta(self) -> np.ndarray:
"""
Return an array of native :class:`datetime.timedelta` objects.
Python's standard `datetime` library uses a different representation
for timedeltas. This method converts a Series of pandas Timedeltas
to `datetime.timedelta` objects with the same length as the original
Series.
Returns
-------
numpy.ndarray
1D array containing data with `datetime.timedelta` type.
See Also
--------
datetime.timedelta : A duration expressing the difference
between two date, time, or datetime instances.
Examples
--------
>>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d"))
>>> s
0 0 days
1 1 days
2 2 days
3 3 days
4 4 days
dtype: timedelta64[ns]
>>> s.dt.to_pytimedelta()
array([datetime.timedelta(0), datetime.timedelta(days=1),
datetime.timedelta(days=2), datetime.timedelta(days=3),
datetime.timedelta(days=4)], dtype=object)
"""
return self._get_values().to_pytimedelta()
@property
def components(self):
"""
Return a Dataframe of the components of the Timedeltas.
Returns
-------
DataFrame
Examples
--------
>>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s'))
>>> s
0 0 days 00:00:00
1 0 days 00:00:01
2 0 days 00:00:02
3 0 days 00:00:03
4 0 days 00:00:04
dtype: timedelta64[ns]
>>> s.dt.components
days hours minutes seconds milliseconds microseconds nanoseconds
0 0 0 0 0 0 0 0
1 0 0 0 1 0 0 0
2 0 0 0 2 0 0 0
3 0 0 0 3 0 0 0
4 0 0 0 4 0 0 0
"""
return (
self._get_values()
.components.set_index(self._parent.index)
.__finalize__(self._parent)
)
@property
def freq(self):
return self._get_values().inferred_freq
@delegate_names(
delegate=PeriodArray, accessors=PeriodArray._datetimelike_ops, typ="property"
)
@delegate_names(
delegate=PeriodArray, accessors=PeriodArray._datetimelike_methods, typ="method"
)
class PeriodProperties(Properties):
"""
Accessor object for datetimelike properties of the Series values.
Returns a Series indexed like the original Series.
Raises TypeError if the Series does not contain datetimelike values.
Examples
--------
>>> seconds_series = pd.Series(
... pd.period_range(
... start="2000-01-01 00:00:00", end="2000-01-01 00:00:03", freq="s"
... )
... )
>>> seconds_series
0 2000-01-01 00:00:00
1 2000-01-01 00:00:01
2 2000-01-01 00:00:02
3 2000-01-01 00:00:03
dtype: period[s]
>>> seconds_series.dt.second
0 0
1 1
2 2
3 3
dtype: int64
>>> hours_series = pd.Series(
... pd.period_range(start="2000-01-01 00:00", end="2000-01-01 03:00", freq="h")
... )
>>> hours_series
0 2000-01-01 00:00
1 2000-01-01 01:00
2 2000-01-01 02:00
3 2000-01-01 03:00
dtype: period[h]
>>> hours_series.dt.hour
0 0
1 1
2 2
3 3
dtype: int64
>>> quarters_series = pd.Series(
... pd.period_range(start="2000-01-01", end="2000-12-31", freq="Q-DEC")
... )
>>> quarters_series
0 2000Q1
1 2000Q2
2 2000Q3
3 2000Q4
dtype: period[Q-DEC]
>>> quarters_series.dt.quarter
0 1
1 2
2 3
3 4
dtype: int64
"""
class CombinedDatetimelikeProperties(
DatetimeProperties, TimedeltaProperties, PeriodProperties
):
def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor]
# CombinedDatetimelikeProperties isn't really instantiated. Instead
# we need to choose which parent (datetime or timedelta) is
# appropriate. Since we're checking the dtypes anyway, we'll just
# do all the validation here.
if not isinstance(data, ABCSeries):
raise TypeError(
f"cannot convert an object of type {type(data)} to a datetimelike index"
)
orig = data if isinstance(data.dtype, CategoricalDtype) else None
if orig is not None:
data = data._constructor(
orig.array,
name=orig.name,
copy=False,
dtype=orig._values.categories.dtype,
index=orig.index,
)
if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm":
return ArrowTemporalProperties(data, orig)
if lib.is_np_dtype(data.dtype, "M"):
return DatetimeProperties(data, orig)
elif isinstance(data.dtype, DatetimeTZDtype):
return DatetimeProperties(data, orig)
elif lib.is_np_dtype(data.dtype, "m"):
return TimedeltaProperties(data, orig)
elif isinstance(data.dtype, PeriodDtype):
return PeriodProperties(data, orig)
raise AttributeError("Can only use .dt accessor with datetimelike values")
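# Usage sketch (not part of the module above): CombinedDatetimelikeProperties is
# what Series.dt resolves to, and it dispatches to the concrete accessor for the
# Series dtype; illustrative only.
import pandas as pd

dt_ser = pd.Series(pd.date_range("2024-01-01", periods=3, freq="D"))
td_ser = pd.Series(pd.to_timedelta([1, 2, 3], unit="h"))

print(type(dt_ser.dt).__name__)  # DatetimeProperties
print(type(td_ser.dt).__name__)  # TimedeltaProperties
print(dt_ser.dt.day_name().tolist())
print(td_ser.dt.components[["hours", "minutes"]])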

View File

@ -0,0 +1,388 @@
from __future__ import annotations
import textwrap
from typing import (
TYPE_CHECKING,
cast,
)
import numpy as np
from pandas._libs import (
NaT,
lib,
)
from pandas.errors import InvalidIndexError
from pandas.core.dtypes.cast import find_common_type
from pandas.core.algorithms import safe_sort
from pandas.core.indexes.base import (
Index,
_new_Index,
ensure_index,
ensure_index_from_sequences,
get_unanimous_names,
)
from pandas.core.indexes.category import CategoricalIndex
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.interval import IntervalIndex
from pandas.core.indexes.multi import MultiIndex
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexes.range import RangeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex
if TYPE_CHECKING:
from pandas._typing import Axis
_sort_msg = textwrap.dedent(
"""\
Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
"""
)
__all__ = [
"Index",
"MultiIndex",
"CategoricalIndex",
"IntervalIndex",
"RangeIndex",
"InvalidIndexError",
"TimedeltaIndex",
"PeriodIndex",
"DatetimeIndex",
"_new_Index",
"NaT",
"ensure_index",
"ensure_index_from_sequences",
"get_objs_combined_axis",
"union_indexes",
"get_unanimous_names",
"all_indexes_same",
"default_index",
"safe_sort_index",
]
def get_objs_combined_axis(
objs,
intersect: bool = False,
axis: Axis = 0,
sort: bool = True,
copy: bool = False,
) -> Index:
"""
Extract combined index: return intersection or union (depending on the
value of "intersect") of indexes on given axis, or None if all objects
lack indexes (e.g. they are numpy arrays).
Parameters
----------
objs : list
Series or DataFrame objects, may be mix of the two.
intersect : bool, default False
If True, calculate the intersection between indexes. Otherwise,
calculate the union.
axis : {0 or 'index', 1 or 'columns'}, default 0
The axis to extract indexes from.
sort : bool, default True
Whether the result index should come out sorted or not.
copy : bool, default False
If True, return a copy of the combined index.
Returns
-------
Index
"""
obs_idxes = [obj._get_axis(axis) for obj in objs]
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy)
def _get_distinct_objs(objs: list[Index]) -> list[Index]:
"""
Return a list with distinct elements of "objs" (different ids).
Preserves order.
"""
ids: set[int] = set()
res = []
for obj in objs:
if id(obj) not in ids:
ids.add(id(obj))
res.append(obj)
return res
def _get_combined_index(
indexes: list[Index],
intersect: bool = False,
sort: bool = False,
copy: bool = False,
) -> Index:
"""
Return the union or intersection of indexes.
Parameters
----------
indexes : list of Index or list objects
When intersect=True, do not accept list of lists.
intersect : bool, default False
If True, calculate the intersection between indexes. Otherwise,
calculate the union.
sort : bool, default False
Whether the result index should come out sorted or not.
copy : bool, default False
If True, return a copy of the combined index.
Returns
-------
Index
"""
# TODO: handle index names!
indexes = _get_distinct_objs(indexes)
if len(indexes) == 0:
index = Index([])
elif len(indexes) == 1:
index = indexes[0]
elif intersect:
index = indexes[0]
for other in indexes[1:]:
index = index.intersection(other)
else:
index = union_indexes(indexes, sort=False)
index = ensure_index(index)
if sort:
index = safe_sort_index(index)
# GH 29879
if copy:
index = index.copy()
return index
def safe_sort_index(index: Index) -> Index:
"""
Returns the sorted index
We keep the dtypes and the name attributes.
Parameters
----------
index : an Index
Returns
-------
Index
"""
if index.is_monotonic_increasing:
return index
try:
array_sorted = safe_sort(index)
except TypeError:
pass
else:
if isinstance(array_sorted, Index):
return array_sorted
array_sorted = cast(np.ndarray, array_sorted)
if isinstance(index, MultiIndex):
index = MultiIndex.from_tuples(array_sorted, names=index.names)
else:
index = Index(array_sorted, name=index.name, dtype=index.dtype)
return index
def union_indexes(indexes, sort: bool | None = True) -> Index:
"""
Return the union of indexes.
The behavior of sort and names is not consistent.
Parameters
----------
indexes : list of Index or list objects
sort : bool, default True
Whether the result index should come out sorted or not.
Returns
-------
Index
"""
if len(indexes) == 0:
raise AssertionError("Must have at least 1 Index to union")
if len(indexes) == 1:
result = indexes[0]
if isinstance(result, list):
if not sort:
result = Index(result)
else:
result = Index(sorted(result))
return result
indexes, kind = _sanitize_and_check(indexes)
def _unique_indices(inds, dtype) -> Index:
"""
Concatenate indices and remove duplicates.
Parameters
----------
inds : list of Index or list objects
dtype : dtype to set for the resulting Index
Returns
-------
Index
"""
if all(isinstance(ind, Index) for ind in inds):
inds = [ind.astype(dtype, copy=False) for ind in inds]
result = inds[0].unique()
other = inds[1].append(inds[2:])
diff = other[result.get_indexer_for(other) == -1]
if len(diff):
result = result.append(diff.unique())
if sort:
result = result.sort_values()
return result
def conv(i):
if isinstance(i, Index):
i = i.tolist()
return i
return Index(
lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort),
dtype=dtype,
)
def _find_common_index_dtype(inds):
"""
Finds a common type for the indexes to pass through to resulting index.
Parameters
----------
inds: list of Index or list objects
Returns
-------
The common type or None if no indexes were given
"""
dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)]
if dtypes:
dtype = find_common_type(dtypes)
else:
dtype = None
return dtype
if kind == "special":
result = indexes[0]
dtis = [x for x in indexes if isinstance(x, DatetimeIndex)]
dti_tzs = [x for x in dtis if x.tz is not None]
if len(dti_tzs) not in [0, len(dtis)]:
# TODO: this behavior is not tested (so may not be desired),
# but is kept in order to keep behavior the same when
# deprecating union_many
# test_frame_from_dict_with_mixed_indexes
raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
if len(dtis) == len(indexes):
sort = True
result = indexes[0]
elif len(dtis) > 1:
# If we have mixed timezones, our casting behavior may depend on
# the order of indexes, which we don't want.
sort = False
# TODO: what about Categorical[dt64]?
# test_frame_from_dict_with_mixed_indexes
indexes = [x.astype(object, copy=False) for x in indexes]
result = indexes[0]
for other in indexes[1:]:
result = result.union(other, sort=None if sort else False)
return result
elif kind == "array":
dtype = _find_common_index_dtype(indexes)
index = indexes[0]
if not all(index.equals(other) for other in indexes[1:]):
index = _unique_indices(indexes, dtype)
name = get_unanimous_names(*indexes)[0]
if name != index.name:
index = index.rename(name)
return index
else: # kind='list'
dtype = _find_common_index_dtype(indexes)
return _unique_indices(indexes, dtype)
def _sanitize_and_check(indexes):
"""
Verify the type of indexes and convert lists to Index.
Cases:
- [list, list, ...]: Return ([list, list, ...], 'list')
- [list, Index, ...]: Return _sanitize_and_check([Index, Index, ...])
Lists are sorted and converted to Index.
- [Index, Index, ...]: Return ([Index, Index, ...], TYPE)
TYPE = 'special' if at least one special type, 'array' otherwise.
Parameters
----------
indexes : list of Index or list objects
Returns
-------
sanitized_indexes : list of Index or list objects
type : {'list', 'array', 'special'}
"""
kinds = list({type(index) for index in indexes})
if list in kinds:
if len(kinds) > 1:
indexes = [
Index(list(x)) if not isinstance(x, Index) else x for x in indexes
]
kinds.remove(list)
else:
return indexes, "list"
if len(kinds) > 1 or Index not in kinds:
return indexes, "special"
else:
return indexes, "array"
def all_indexes_same(indexes) -> bool:
"""
Determine if all indexes contain the same elements.
Parameters
----------
indexes : iterable of Index objects
Returns
-------
bool
True if all indexes contain the same elements, False otherwise.
"""
itr = iter(indexes)
first = next(itr)
return all(first.equals(index) for index in itr)
def default_index(n: int) -> RangeIndex:
rng = range(n)
return RangeIndex._simple_new(rng, name=None)
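# Usage sketch (not part of the module above): union_indexes and all_indexes_same
# as used when combining axes (both private to pandas); illustrative only.
import pandas as pd
from pandas.core.indexes.api import all_indexes_same, union_indexes

a = pd.Index([3, 1, 2])
b = pd.Index([2, 4])

print(union_indexes([a, b], sort=True).tolist())  # [1, 2, 3, 4]
print(all_indexes_same([a, a.copy()]))            # True
print(all_indexes_same([a, b]))                   # False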

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff