lib/python3.11/site-packages/pandas/core/_numba/executor.py (new file, 239 lines)
@@ -0,0 +1,239 @@
from __future__ import annotations

import functools
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

if TYPE_CHECKING:
    from pandas._typing import Scalar

import numpy as np

from pandas.compat._optional import import_optional_dependency


@functools.cache
def generate_apply_looper(func, nopython=True, nogil=True, parallel=False):
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")
    nb_compat_func = numba.extending.register_jitable(func)

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def nb_looper(values, axis):
        # Operate on the first row/col in order to get
        # the output shape
        if axis == 0:
            first_elem = values[:, 0]
            dim0 = values.shape[1]
        else:
            first_elem = values[0]
            dim0 = values.shape[0]
        res0 = nb_compat_func(first_elem)
        # Use np.asarray to get shape for
        # https://github.com/numba/numba/issues/4202#issuecomment-1185981507
        buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape
        if axis == 0:
            buf_shape = buf_shape[::-1]
        buff = np.empty(buf_shape)

        if axis == 1:
            buff[0] = res0
            for i in numba.prange(1, values.shape[0]):
                buff[i] = nb_compat_func(values[i])
        else:
            buff[:, 0] = res0
            for j in numba.prange(1, values.shape[1]):
                buff[:, j] = nb_compat_func(values[:, j])
        return buff

    return nb_looper
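

# A minimal usage sketch for the looper above (illustrative only): the kernel
# must be numba-compilable, and axis selects whether it is applied per
# column (0) or per row (1).
def _demo_apply_looper():
    looper = generate_apply_looper(lambda arr: arr.sum())
    values = np.arange(12.0).reshape(3, 4)
    per_column = looper(values, 0)  # shape (1, 4): one sum per column
    per_row = looper(values, 1)  # shape (3, 1): one sum per row
    return per_column, per_row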


@functools.cache
def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel):
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    if is_grouped_kernel:

        @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
        def column_looper(
            values: np.ndarray,
            labels: np.ndarray,
            ngroups: int,
            min_periods: int,
            *args,
        ):
            result = np.empty((values.shape[0], ngroups), dtype=result_dtype)
            na_positions = {}
            for i in numba.prange(values.shape[0]):
                output, na_pos = func(
                    values[i], result_dtype, labels, ngroups, min_periods, *args
                )
                result[i] = output
                if len(na_pos) > 0:
                    na_positions[i] = np.array(na_pos)
            return result, na_positions

    else:

        @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
        def column_looper(
            values: np.ndarray,
            start: np.ndarray,
            end: np.ndarray,
            min_periods: int,
            *args,
        ):
            result = np.empty((values.shape[0], len(start)), dtype=result_dtype)
            na_positions = {}
            for i in numba.prange(values.shape[0]):
                output, na_pos = func(
                    values[i], result_dtype, start, end, min_periods, *args
                )
                result[i] = output
                if len(na_pos) > 0:
                    na_positions[i] = np.array(na_pos)
            return result, na_positions

    return column_looper


default_dtype_mapping: dict[np.dtype, Any] = {
    np.dtype("int8"): np.int64,
    np.dtype("int16"): np.int64,
    np.dtype("int32"): np.int64,
    np.dtype("int64"): np.int64,
    np.dtype("uint8"): np.uint64,
    np.dtype("uint16"): np.uint64,
    np.dtype("uint32"): np.uint64,
    np.dtype("uint64"): np.uint64,
    np.dtype("float32"): np.float64,
    np.dtype("float64"): np.float64,
    np.dtype("complex64"): np.complex128,
    np.dtype("complex128"): np.complex128,
}


# TODO: Preserve complex dtypes

float_dtype_mapping: dict[np.dtype, Any] = {
    np.dtype("int8"): np.float64,
    np.dtype("int16"): np.float64,
    np.dtype("int32"): np.float64,
    np.dtype("int64"): np.float64,
    np.dtype("uint8"): np.float64,
    np.dtype("uint16"): np.float64,
    np.dtype("uint32"): np.float64,
    np.dtype("uint64"): np.float64,
    np.dtype("float32"): np.float64,
    np.dtype("float64"): np.float64,
    np.dtype("complex64"): np.float64,
    np.dtype("complex128"): np.float64,
}

identity_dtype_mapping: dict[np.dtype, Any] = {
    np.dtype("int8"): np.int8,
    np.dtype("int16"): np.int16,
    np.dtype("int32"): np.int32,
    np.dtype("int64"): np.int64,
    np.dtype("uint8"): np.uint8,
    np.dtype("uint16"): np.uint16,
    np.dtype("uint32"): np.uint32,
    np.dtype("uint64"): np.uint64,
    np.dtype("float32"): np.float32,
    np.dtype("float64"): np.float64,
    np.dtype("complex64"): np.complex64,
    np.dtype("complex128"): np.complex128,
}

def generate_shared_aggregator(
    func: Callable[..., Scalar],
    dtype_mapping: dict[np.dtype, np.dtype],
    is_grouped_kernel: bool,
    nopython: bool,
    nogil: bool,
    parallel: bool,
):
    """
    Generate a Numba function that loops over the columns of a 2D object
    and applies a 1D numba kernel over each column.

    Parameters
    ----------
    func : function
        aggregation function to be applied to each column
    dtype_mapping: dict or None
        If not None, maps a dtype to a result dtype.
        Otherwise, will fall back to default mapping.
    is_grouped_kernel: bool, default False
        Whether func operates using the group labels (True)
        or using starts/ends arrays

        If True, you also need to pass the number of groups to this function
    nopython : bool
        nopython to be passed into numba.jit
    nogil : bool
        nogil to be passed into numba.jit
    parallel : bool
        parallel to be passed into numba.jit

    Returns
    -------
    Numba function
    """

    # A wrapper around the looper function,
    # to dispatch based on dtype since numba is unable to do that in nopython mode

    # It also post-processes the values by inserting nans where number of observations
    # is less than min_periods
    # Cannot do this in numba nopython mode
    # (you'll run into a type-unification error when you cast int -> float)
    def looper_wrapper(
        values,
        start=None,
        end=None,
        labels=None,
        ngroups=None,
        min_periods: int = 0,
        **kwargs,
    ):
        result_dtype = dtype_mapping[values.dtype]
        column_looper = make_looper(
            func, result_dtype, is_grouped_kernel, nopython, nogil, parallel
        )
        # Need to unpack kwargs since numba only supports *args
        if is_grouped_kernel:
            result, na_positions = column_looper(
                values, labels, ngroups, min_periods, *kwargs.values()
            )
        else:
            result, na_positions = column_looper(
                values, start, end, min_periods, *kwargs.values()
            )
        if result.dtype.kind == "i":
            # Check if na_positions is not empty
            # If so, convert the whole block
            # This is OK since int dtype cannot hold nan,
            # so if min_periods is not satisfied for 1 col, it is not satisfied for
            # all columns at that index
            for na_pos in na_positions.values():
                if len(na_pos) > 0:
                    result = result.astype("float64")
                    break
        # TODO: Optimize this
        for i, na_pos in na_positions.items():
            if len(na_pos) > 0:
                result[i, na_pos] = np.nan
        return result

    return looper_wrapper
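

# Wiring sketch (illustrative only): how the wrapper pairs with one of the
# window kernels from pandas.core._numba.kernels. start/end hold the window
# bounds per output slot; values is 2D with one row per column being reduced.
def _demo_shared_aggregator():
    from pandas.core._numba.kernels import sliding_mean

    agg = generate_shared_aggregator(
        sliding_mean,
        float_dtype_mapping,
        is_grouped_kernel=False,
        nopython=True,
        nogil=True,
        parallel=False,
    )
    values = np.array([[1.0, 2.0, 3.0, 4.0]])  # one column, shape (1, N)
    start = np.array([0, 0, 1, 2], dtype=np.int64)
    end = np.array([1, 2, 3, 4], dtype=np.int64)  # trailing window of size <= 2
    return agg(values, start=start, end=end, min_periods=1)  # [[1.0, 1.5, 2.5, 3.5]]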
lib/python3.11/site-packages/pandas/core/_numba/extensions.py (new file, 585 lines)
@@ -0,0 +1,585 @@
# Disable type checking for this module since numba's internals
# are not typed, and we use numba's internals via its extension API
# mypy: ignore-errors
"""
Utility classes/functions to let numba recognize
pandas Index/Series/DataFrame

Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py
"""

from __future__ import annotations

from contextlib import contextmanager
import operator

import numba
from numba import types
from numba.core import cgutils
from numba.core.datamodel import models
from numba.core.extending import (
    NativeValue,
    box,
    lower_builtin,
    make_attribute_wrapper,
    overload,
    overload_attribute,
    overload_method,
    register_model,
    type_callable,
    typeof_impl,
    unbox,
)
from numba.core.imputils import impl_ret_borrowed
import numpy as np

from pandas._libs import lib

from pandas.core.indexes.base import Index
from pandas.core.indexing import _iLocIndexer
from pandas.core.internals import SingleBlockManager
from pandas.core.series import Series


# Helper function to hack around the fact that Index casts numpy string dtype to object
#
# The idea is to set an attribute on an Index called _numba_data
# that is the original data, or the object data cast to numpy string dtype,
# with a context manager that unsets it afterwards
@contextmanager
def set_numba_data(index: Index):
    numba_data = index._data
    if numba_data.dtype in (object, "string"):
        numba_data = np.asarray(numba_data)
        if not lib.is_string_array(numba_data):
            raise ValueError(
                "The numba engine only supports using string or numeric column names"
            )
        numba_data = numba_data.astype("U")
    try:
        index._numba_data = numba_data
        yield index
    finally:
        del index._numba_data
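

# Usage sketch (illustrative only): callers wrap numba entry points in this
# context manager so the unboxing code below can read index._numba_data.
def _demo_set_numba_data():
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    with set_numba_data(df.columns) as cols:
        # Object-dtype string labels were validated and cast to numpy "U" dtype
        assert cols._numba_data.dtype.kind == "U"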


# TODO: Range index support
# (this currently lowers OK, but does not round-trip)
class IndexType(types.Type):
    """
    The type class for Index objects.
    """

    def __init__(self, dtype, layout, pyclass: any) -> None:
        self.pyclass = pyclass
        name = f"index({dtype}, {layout})"
        self.dtype = dtype
        self.layout = layout
        super().__init__(name)

    @property
    def key(self):
        return self.pyclass, self.dtype, self.layout

    @property
    def as_array(self):
        return types.Array(self.dtype, 1, self.layout)

    def copy(self, dtype=None, ndim: int = 1, layout=None):
        assert ndim == 1
        if dtype is None:
            dtype = self.dtype
        layout = layout or self.layout
        return type(self)(dtype, layout, self.pyclass)


class SeriesType(types.Type):
    """
    The type class for Series objects.
    """

    def __init__(self, dtype, index, namety) -> None:
        assert isinstance(index, IndexType)
        self.dtype = dtype
        self.index = index
        self.values = types.Array(self.dtype, 1, "C")
        self.namety = namety
        name = f"series({dtype}, {index}, {namety})"
        super().__init__(name)

    @property
    def key(self):
        return self.dtype, self.index, self.namety

    @property
    def as_array(self):
        return self.values

    def copy(self, dtype=None, ndim: int = 1, layout: str = "C"):
        assert ndim == 1
        assert layout == "C"
        if dtype is None:
            dtype = self.dtype
        return type(self)(dtype, self.index, self.namety)


@typeof_impl.register(Index)
def typeof_index(val, c):
    """
    This will assume that only strings are in an object dtype
    index.
    (you should check this before this gets lowered down to numba)
    """
    # arrty = typeof_impl(val._data, c)
    arrty = typeof_impl(val._numba_data, c)
    assert arrty.ndim == 1
    return IndexType(arrty.dtype, arrty.layout, type(val))


@typeof_impl.register(Series)
def typeof_series(val, c):
    index = typeof_impl(val.index, c)
    arrty = typeof_impl(val.values, c)
    namety = typeof_impl(val.name, c)
    assert arrty.ndim == 1
    assert arrty.layout == "C"
    return SeriesType(arrty.dtype, index, namety)


@type_callable(Series)
def type_series_constructor(context):
    def typer(data, index, name=None):
        if isinstance(index, IndexType) and isinstance(data, types.Array):
            assert data.ndim == 1
            if name is None:
                name = types.intp
            return SeriesType(data.dtype, index, name)

    return typer


@type_callable(Index)
def type_index_constructor(context):
    def typer(data, hashmap=None):
        if isinstance(data, types.Array):
            assert data.layout == "C"
            assert data.ndim == 1
            assert hashmap is None or isinstance(hashmap, types.DictType)
            return IndexType(data.dtype, layout=data.layout, pyclass=Index)

    return typer


# Backend extensions for Index and Series and Frame
@register_model(IndexType)
class IndexModel(models.StructModel):
    def __init__(self, dmm, fe_type) -> None:
        # We don't want the numpy string scalar type in our hashmap
        members = [
            ("data", fe_type.as_array),
            # This is an attempt to emulate our hashtable code with a numba
            # typed dict
            # It maps from values in the index to their integer positions in the array
            ("hashmap", types.DictType(fe_type.dtype, types.intp)),
            # Pointer to the Index object this was created from, or that it
            # boxes to
            # https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1
            ("parent", types.pyobject),
        ]
        models.StructModel.__init__(self, dmm, fe_type, members)


@register_model(SeriesType)
class SeriesModel(models.StructModel):
    def __init__(self, dmm, fe_type) -> None:
        members = [
            ("index", fe_type.index),
            ("values", fe_type.as_array),
            ("name", fe_type.namety),
        ]
        models.StructModel.__init__(self, dmm, fe_type, members)


make_attribute_wrapper(IndexType, "data", "_data")
make_attribute_wrapper(IndexType, "hashmap", "hashmap")

make_attribute_wrapper(SeriesType, "index", "index")
make_attribute_wrapper(SeriesType, "values", "values")
make_attribute_wrapper(SeriesType, "name", "name")


@lower_builtin(Series, types.Array, IndexType)
def pdseries_constructor(context, builder, sig, args):
    data, index = args
    series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
    series.index = index
    series.values = data
    series.name = context.get_constant(types.intp, 0)
    return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())


@lower_builtin(Series, types.Array, IndexType, types.intp)
@lower_builtin(Series, types.Array, IndexType, types.float64)
@lower_builtin(Series, types.Array, IndexType, types.unicode_type)
def pdseries_constructor_with_name(context, builder, sig, args):
    data, index, name = args
    series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
    series.index = index
    series.values = data
    series.name = name
    return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())


@lower_builtin(Index, types.Array, types.DictType, types.pyobject)
def index_constructor_2arg(context, builder, sig, args):
    (data, hashmap, parent) = args
    index = cgutils.create_struct_proxy(sig.return_type)(context, builder)

    index.data = data
    index.hashmap = hashmap
    index.parent = parent
    return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())


@lower_builtin(Index, types.Array, types.DictType)
def index_constructor_2arg_parent(context, builder, sig, args):
    # Basically same as index_constructor_1arg, but also lets you specify the
    # parent object
    (data, hashmap) = args
    index = cgutils.create_struct_proxy(sig.return_type)(context, builder)

    index.data = data
    index.hashmap = hashmap
    return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())


@lower_builtin(Index, types.Array)
def index_constructor_1arg(context, builder, sig, args):
    from numba.typed import Dict

    key_type = sig.return_type.dtype
    value_type = types.intp

    def index_impl(data):
        return Index(data, Dict.empty(key_type, value_type))

    return context.compile_internal(builder, index_impl, sig, args)


# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type
# (regular string)
def maybe_cast_str(x):
    # Dummy function that numba can overload
    pass


@overload(maybe_cast_str)
def maybe_cast_str_impl(x):
    """Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string).
    Is a no-op for other types."""
    if isinstance(x, types.UnicodeCharSeq):
        return lambda x: str(x)
    else:
        return lambda x: x


@unbox(IndexType)
def unbox_index(typ, obj, c):
    """
    Convert an Index object to a native structure.

    Note: Object dtype is not allowed here
    """
    data_obj = c.pyapi.object_getattr_string(obj, "_numba_data")
    index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
    # If we see an object array, assume it's been validated as only containing strings
    # We still need to do the conversion though
    index.data = c.unbox(typ.as_array, data_obj).value
    typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict))
    # Create an empty typed dict in numba for the hashmap for indexing
    # equiv of numba.typed.Dict.empty(typ.dtype, types.intp)
    arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype))
    intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp))
    hashmap_obj = c.pyapi.call_method(
        typed_dict_obj, "empty", (arr_type_obj, intp_type_obj)
    )
    index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value
    # Set the parent for speedy boxing.
    index.parent = obj

    # Decrefs
    c.pyapi.decref(data_obj)
    c.pyapi.decref(arr_type_obj)
    c.pyapi.decref(intp_type_obj)
    c.pyapi.decref(typed_dict_obj)

    return NativeValue(index._getvalue())


@unbox(SeriesType)
def unbox_series(typ, obj, c):
    """
    Convert a Series object to a native structure.
    """
    index_obj = c.pyapi.object_getattr_string(obj, "index")
    values_obj = c.pyapi.object_getattr_string(obj, "values")
    name_obj = c.pyapi.object_getattr_string(obj, "name")

    series = cgutils.create_struct_proxy(typ)(c.context, c.builder)
    series.index = c.unbox(typ.index, index_obj).value
    series.values = c.unbox(typ.values, values_obj).value
    series.name = c.unbox(typ.namety, name_obj).value

    # Decrefs
    c.pyapi.decref(index_obj)
    c.pyapi.decref(values_obj)
    c.pyapi.decref(name_obj)

    return NativeValue(series._getvalue())


@box(IndexType)
def box_index(typ, val, c):
    """
    Convert a native index structure to an Index object.

    If our native index is of a numpy string dtype, we'll cast it to
    object.
    """
    # First build a Numpy array object, then wrap it in an Index
    index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)

    res = cgutils.alloca_once_value(c.builder, index.parent)

    # Does parent exist?
    # (it means already boxed once, or Index same as original df.index or df.columns)
    # xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17
    with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (
        has_parent,
        otherwise,
    ):
        with has_parent:
            c.pyapi.incref(index.parent)
        with otherwise:
            # TODO: preserve the original class for the index
            # Also need to preserve the name of the Index
            # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass))
            class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index))
            array_obj = c.box(typ.as_array, index.data)
            if isinstance(typ.dtype, types.UnicodeCharSeq):
                # We converted to numpy string dtype, convert back
                # to object since _simple_new won't do that for us
                object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object"))
                array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,))
                c.pyapi.decref(object_str_obj)
            # this is basically Index._simple_new(array_obj, name_obj) in python
            index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,))
            index.parent = index_obj
            c.builder.store(index_obj, res)

            # Decrefs
            c.pyapi.decref(class_obj)
            c.pyapi.decref(array_obj)
    return c.builder.load(res)


@box(SeriesType)
def box_series(typ, val, c):
    """
    Convert a native series structure to a Series object.
    """
    series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
    series_const_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series._from_mgr))
    mgr_const_obj = c.pyapi.unserialize(
        c.pyapi.serialize_object(SingleBlockManager.from_array)
    )
    index_obj = c.box(typ.index, series.index)
    array_obj = c.box(typ.as_array, series.values)
    name_obj = c.box(typ.namety, series.name)
    # This is basically the equivalent of
    # pd.Series(data=array_obj, index=index_obj)
    # To improve perf, we will construct the Series from a manager
    # object to avoid checks.
    # We'll also set the name attribute manually to avoid validation
    mgr_obj = c.pyapi.call_function_objargs(
        mgr_const_obj,
        (
            array_obj,
            index_obj,
        ),
    )
    mgr_axes_obj = c.pyapi.object_getattr_string(mgr_obj, "axes")
    # Series._constructor_from_mgr(mgr, axes)
    series_obj = c.pyapi.call_function_objargs(
        series_const_obj, (mgr_obj, mgr_axes_obj)
    )
    c.pyapi.object_setattr_string(series_obj, "_name", name_obj)

    # Decrefs
    c.pyapi.decref(series_const_obj)
    c.pyapi.decref(mgr_axes_obj)
    c.pyapi.decref(mgr_obj)
    c.pyapi.decref(mgr_const_obj)
    c.pyapi.decref(index_obj)
    c.pyapi.decref(array_obj)
    c.pyapi.decref(name_obj)

    return series_obj


# Add common series reductions (e.g. mean, sum),
# and also add common binops (e.g. add, sub, mul, div)
def generate_series_reduction(ser_reduction, ser_method):
    @overload_method(SeriesType, ser_reduction)
    def series_reduction(series):
        def series_reduction_impl(series):
            return ser_method(series.values)

        return series_reduction_impl

    return series_reduction


def generate_series_binop(binop):
    @overload(binop)
    def series_binop(series1, value):
        if isinstance(series1, SeriesType):
            if isinstance(value, SeriesType):

                def series_binop_impl(series1, series2):
                    # TODO: Check index matching?
                    return Series(
                        binop(series1.values, series2.values),
                        series1.index,
                        series1.name,
                    )

                return series_binop_impl
            else:

                def series_binop_impl(series1, value):
                    return Series(
                        binop(series1.values, value), series1.index, series1.name
                    )

                return series_binop_impl

    return series_binop


series_reductions = [
    ("sum", np.sum),
    ("mean", np.mean),
    # Disabled due to discrepancies between numba std. dev
    # and pandas std. dev (no way to specify dof)
    # ("std", np.std),
    # ("var", np.var),
    ("min", np.min),
    ("max", np.max),
]
for reduction, reduction_method in series_reductions:
    generate_series_reduction(reduction, reduction_method)

series_binops = [operator.add, operator.sub, operator.mul, operator.truediv]

for ser_binop in series_binops:
    generate_series_binop(ser_binop)


# get_loc on Index
@overload_method(IndexType, "get_loc")
def index_get_loc(index, item):
    def index_get_loc_impl(index, item):
        # Initialize the hash table if not initialized
        if len(index.hashmap) == 0:
            for i, val in enumerate(index._data):
                index.hashmap[val] = i
        return index.hashmap[item]

    return index_get_loc_impl


# Indexing for Series/Index
@overload(operator.getitem)
def series_indexing(series, item):
    if isinstance(series, SeriesType):

        def series_getitem(series, item):
            loc = series.index.get_loc(item)
            return series.iloc[loc]

        return series_getitem


@overload(operator.getitem)
def index_indexing(index, idx):
    if isinstance(index, IndexType):

        def index_getitem(index, idx):
            return index._data[idx]

        return index_getitem


class IlocType(types.Type):
    def __init__(self, obj_type) -> None:
        self.obj_type = obj_type
        name = f"iLocIndexer({obj_type})"
        super().__init__(name=name)

    @property
    def key(self):
        return self.obj_type


@typeof_impl.register(_iLocIndexer)
def typeof_iloc(val, c):
    objtype = typeof_impl(val.obj, c)
    return IlocType(objtype)


@type_callable(_iLocIndexer)
def type_iloc_constructor(context):
    def typer(obj):
        if isinstance(obj, SeriesType):
            return IlocType(obj)

    return typer


@lower_builtin(_iLocIndexer, SeriesType)
def iloc_constructor(context, builder, sig, args):
    (obj,) = args
    iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder)
    iloc_indexer.obj = obj
    return impl_ret_borrowed(
        context, builder, sig.return_type, iloc_indexer._getvalue()
    )


@register_model(IlocType)
class ILocModel(models.StructModel):
    def __init__(self, dmm, fe_type) -> None:
        members = [("obj", fe_type.obj_type)]
        models.StructModel.__init__(self, dmm, fe_type, members)


make_attribute_wrapper(IlocType, "obj", "obj")


@overload_attribute(SeriesType, "iloc")
def series_iloc(series):
    def get(series):
        return _iLocIndexer(series)

    return get


@overload(operator.getitem)
def iloc_getitem(iloc_indexer, i):
    if isinstance(iloc_indexer, IlocType):

        def getitem_impl(iloc_indexer, i):
            return iloc_indexer.obj.values[i]

        return getitem_impl
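

# End-to-end sketch (illustrative only; assumes numba is installed): with the
# typing, models, and overloads above registered, a nopython function can take
# a Series, index it by label, and call the registered reductions.
def _demo_series_in_nopython():
    import pandas as pd

    @numba.njit
    def jitted_kernel(ser):
        return ser[20] + ser.mean()

    ser = pd.Series([1.0, 2.0, 3.0], index=Index([10, 20, 30]), name=0)
    with set_numba_data(ser.index):
        return jitted_kernel(ser)  # 2.0 + 2.0 == 4.0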
lib/python3.11/site-packages/pandas/core/_numba/kernels/__init__.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from pandas.core._numba.kernels.mean_ import (
    grouped_mean,
    sliding_mean,
)
from pandas.core._numba.kernels.min_max_ import (
    grouped_min_max,
    sliding_min_max,
)
from pandas.core._numba.kernels.sum_ import (
    grouped_sum,
    sliding_sum,
)
from pandas.core._numba.kernels.var_ import (
    grouped_var,
    sliding_var,
)

__all__ = [
    "sliding_mean",
    "grouped_mean",
    "sliding_sum",
    "grouped_sum",
    "sliding_var",
    "grouped_var",
    "sliding_min_max",
    "grouped_min_max",
]
lib/python3.11/site-packages/pandas/core/_numba/kernels/mean_.py (new file, 196 lines)
@@ -0,0 +1,196 @@
"""
Numba 1D mean kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding

Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numba
import numpy as np

from pandas.core._numba.kernels.shared import is_monotonic_increasing
from pandas.core._numba.kernels.sum_ import grouped_kahan_sum

if TYPE_CHECKING:
    from pandas._typing import npt


@numba.jit(nopython=True, nogil=True, parallel=False)
def add_mean(
    val: float,
    nobs: int,
    sum_x: float,
    neg_ct: int,
    compensation: float,
    num_consecutive_same_value: int,
    prev_value: float,
) -> tuple[int, float, int, float, int, float]:
    if not np.isnan(val):
        nobs += 1
        y = val - compensation
        t = sum_x + y
        compensation = t - sum_x - y
        sum_x = t
        if val < 0:
            neg_ct += 1

        if val == prev_value:
            num_consecutive_same_value += 1
        else:
            num_consecutive_same_value = 1
        prev_value = val

    return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value


@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_mean(
    val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
) -> tuple[int, float, int, float]:
    if not np.isnan(val):
        nobs -= 1
        y = -val - compensation
        t = sum_x + y
        compensation = t - sum_x - y
        sum_x = t
        if val < 0:
            neg_ct -= 1
    return nobs, sum_x, neg_ct, compensation


@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_mean(
    values: np.ndarray,
    result_dtype: np.dtype,
    start: np.ndarray,
    end: np.ndarray,
    min_periods: int,
) -> tuple[np.ndarray, list[int]]:
    N = len(start)
    nobs = 0
    sum_x = 0.0
    neg_ct = 0
    compensation_add = 0.0
    compensation_remove = 0.0

    is_monotonic_increasing_bounds = is_monotonic_increasing(
        start
    ) and is_monotonic_increasing(end)

    output = np.empty(N, dtype=result_dtype)

    for i in range(N):
        s = start[i]
        e = end[i]
        if i == 0 or not is_monotonic_increasing_bounds:
            prev_value = values[s]
            num_consecutive_same_value = 0

            for j in range(s, e):
                val = values[j]
                (
                    nobs,
                    sum_x,
                    neg_ct,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_mean(
                    val,
                    nobs,
                    sum_x,
                    neg_ct,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,  # pyright: ignore[reportGeneralTypeIssues]
                )
        else:
            for j in range(start[i - 1], s):
                val = values[j]
                nobs, sum_x, neg_ct, compensation_remove = remove_mean(
                    val, nobs, sum_x, neg_ct, compensation_remove
                )

            for j in range(end[i - 1], e):
                val = values[j]
                (
                    nobs,
                    sum_x,
                    neg_ct,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_mean(
                    val,
                    nobs,
                    sum_x,
                    neg_ct,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,  # pyright: ignore[reportGeneralTypeIssues]
                )

        if nobs >= min_periods and nobs > 0:
            result = sum_x / nobs
            if num_consecutive_same_value >= nobs:
                result = prev_value
            elif neg_ct == 0 and result < 0:
                result = 0
            elif neg_ct == nobs and result > 0:
                result = 0
        else:
            result = np.nan

        output[i] = result

        if not is_monotonic_increasing_bounds:
            nobs = 0
            sum_x = 0.0
            neg_ct = 0
            compensation_remove = 0.0

    # na_position is an empty list since float64 can already hold nans
    # Do list comprehension, since numba cannot figure out that na_pos is
    # an empty list of ints on its own
    na_pos = [0 for i in range(0)]
    return output, na_pos
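

# Direct-call sketch (illustrative only): the same bounds layout the executor
# passes in; NaNs are skipped by add_mean/remove_mean rather than counted.
def _demo_sliding_mean():
    values = np.array([1.0, 2.0, np.nan, 4.0])
    start = np.array([0, 0, 1, 2], dtype=np.int64)
    end = np.array([1, 2, 3, 4], dtype=np.int64)
    out, na_pos = sliding_mean(values, np.float64, start, end, 1)
    # out == [1.0, 1.5, 2.0, 4.0]; na_pos stays empty for float inputs
    return out, na_pos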


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_mean(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
    min_periods: int,
) -> tuple[np.ndarray, list[int]]:
    output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
        values, result_dtype, labels, ngroups
    )

    # Post-processing, replace sums that don't satisfy min_periods
    for lab in range(ngroups):
        nobs = nobs_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        prev_value = prev_vals[lab]
        sum_x = output[lab]
        if nobs >= min_periods:
            if num_consecutive_same_value >= nobs:
                result = prev_value * nobs
            else:
                result = sum_x
        else:
            result = np.nan
        result /= nobs
        output[lab] = result

    # na_position is an empty list since float64 can already hold nans
    # Do list comprehension, since numba cannot figure out that na_pos is
    # an empty list of ints on its own
    na_pos = [0 for i in range(0)]
    return output, na_pos
lib/python3.11/site-packages/pandas/core/_numba/kernels/min_max_.py (new file, 125 lines)
@@ -0,0 +1,125 @@
"""
Numba 1D min/max kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding

Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numba
import numpy as np

if TYPE_CHECKING:
    from pandas._typing import npt


@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_min_max(
    values: np.ndarray,
    result_dtype: np.dtype,
    start: np.ndarray,
    end: np.ndarray,
    min_periods: int,
    is_max: bool,
) -> tuple[np.ndarray, list[int]]:
    N = len(start)
    nobs = 0
    output = np.empty(N, dtype=result_dtype)
    na_pos = []
    # Use deque once numba supports it
    # https://github.com/numba/numba/issues/7417
    Q: list = []
    W: list = []
    for i in range(N):
        curr_win_size = end[i] - start[i]
        if i == 0:
            st = start[i]
        else:
            st = end[i - 1]

        for k in range(st, end[i]):
            ai = values[k]
            if not np.isnan(ai):
                nobs += 1
            elif is_max:
                ai = -np.inf
            else:
                ai = np.inf
            # Discard previous entries if we find a new min or max
            # (x != x is the nopython-friendly NaN check)
            if is_max:
                while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
                    Q.pop()
            else:
                while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
                    Q.pop()
            Q.append(k)
            W.append(k)

        # Discard entries outside and left of current window
        while Q and Q[0] <= start[i] - 1:
            Q.pop(0)
        while W and W[0] <= start[i] - 1:
            if not np.isnan(values[W[0]]):
                nobs -= 1
            W.pop(0)

        # Save output based on index in input value array
        if Q and curr_win_size > 0 and nobs >= min_periods:
            output[i] = values[Q[0]]
        else:
            if values.dtype.kind != "i":
                output[i] = np.nan
            else:
                na_pos.append(i)

    return output, na_pos
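

# Direct-call sketch (illustrative only): Q keeps candidate extrema in
# monotonic order, so each output slot is answered by values[Q[0]].
def _demo_sliding_min_max():
    values = np.array([3.0, 1.0, 2.0, 5.0])
    start = np.array([0, 0, 1, 2], dtype=np.int64)
    end = np.array([1, 2, 3, 4], dtype=np.int64)
    out, _ = sliding_min_max(values, np.float64, start, end, 1, True)
    # out == [3.0, 3.0, 2.0, 5.0]: running max over a trailing window of 2
    return out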


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_min_max(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
    min_periods: int,
    is_max: bool,
) -> tuple[np.ndarray, list[int]]:
    N = len(labels)
    nobs = np.zeros(ngroups, dtype=np.int64)
    na_pos = []
    output = np.empty(ngroups, dtype=result_dtype)

    for i in range(N):
        lab = labels[i]
        val = values[i]
        if lab < 0:
            continue

        if values.dtype.kind == "i" or not np.isnan(val):
            nobs[lab] += 1
        else:
            # NaN value cannot be a min/max value
            continue

        if nobs[lab] == 1:
            # First element in group, set output equal to this
            output[lab] = val
            continue

        if is_max:
            if val > output[lab]:
                output[lab] = val
        else:
            if val < output[lab]:
                output[lab] = val

    # Set labels that don't satisfy min_periods as np.nan
    for lab, count in enumerate(nobs):
        if count < min_periods:
            na_pos.append(lab)

    return output, na_pos
lib/python3.11/site-packages/pandas/core/_numba/kernels/shared.py (new file, 29 lines)
@@ -0,0 +1,29 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import numba

if TYPE_CHECKING:
    import numpy as np


@numba.jit(
    # error: Any? not callable
    numba.boolean(numba.int64[:]),  # type: ignore[misc]
    nopython=True,
    nogil=True,
    parallel=False,
)
def is_monotonic_increasing(bounds: np.ndarray) -> bool:
    """Check if int64 values are monotonically increasing."""
    n = len(bounds)
    if n < 2:
        return True
    prev = bounds[0]
    for i in range(1, n):
        cur = bounds[i]
        if cur < prev:
            return False
        prev = cur
    return True
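

# The explicit numba.boolean(numba.int64[:]) signature above makes numba
# compile eagerly at import time instead of lazily on first call. The same
# pattern in a minimal sketch (illustrative only):
#
#     import numba as nb
#     import numpy
#
#     @nb.jit(nb.int64(nb.int64[:]), nopython=True)
#     def eager_sum(arr):
#         total = 0
#         for x in arr:
#             total += x
#         return total
#
#     # eager_sum is fully compiled here; the first call triggers no
#     # further compilation
#     assert eager_sum(numpy.arange(4, dtype=numpy.int64)) == 6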
lib/python3.11/site-packages/pandas/core/_numba/kernels/sum_.py (new file, 244 lines)
@@ -0,0 +1,244 @@
"""
Numba 1D sum kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding

Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
)

import numba
from numba.extending import register_jitable
import numpy as np

if TYPE_CHECKING:
    from pandas._typing import npt

from pandas.core._numba.kernels.shared import is_monotonic_increasing


@numba.jit(nopython=True, nogil=True, parallel=False)
def add_sum(
    val: Any,
    nobs: int,
    sum_x: Any,
    compensation: Any,
    num_consecutive_same_value: int,
    prev_value: Any,
) -> tuple[int, Any, Any, int, Any]:
    if not np.isnan(val):
        nobs += 1
        y = val - compensation
        t = sum_x + y
        compensation = t - sum_x - y
        sum_x = t

        if val == prev_value:
            num_consecutive_same_value += 1
        else:
            num_consecutive_same_value = 1
        prev_value = val

    return nobs, sum_x, compensation, num_consecutive_same_value, prev_value


@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_sum(
    val: Any, nobs: int, sum_x: Any, compensation: Any
) -> tuple[int, Any, Any]:
    if not np.isnan(val):
        nobs -= 1
        y = -val - compensation
        t = sum_x + y
        compensation = t - sum_x - y
        sum_x = t
    return nobs, sum_x, compensation


@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_sum(
    values: np.ndarray,
    result_dtype: np.dtype,
    start: np.ndarray,
    end: np.ndarray,
    min_periods: int,
) -> tuple[np.ndarray, list[int]]:
    dtype = values.dtype

    na_val: object = np.nan
    if dtype.kind == "i":
        na_val = 0

    N = len(start)
    nobs = 0
    sum_x = 0
    compensation_add = 0
    compensation_remove = 0
    na_pos = []

    is_monotonic_increasing_bounds = is_monotonic_increasing(
        start
    ) and is_monotonic_increasing(end)

    output = np.empty(N, dtype=result_dtype)

    for i in range(N):
        s = start[i]
        e = end[i]
        if i == 0 or not is_monotonic_increasing_bounds:
            prev_value = values[s]
            num_consecutive_same_value = 0

            for j in range(s, e):
                val = values[j]
                (
                    nobs,
                    sum_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_sum(
                    val,
                    nobs,
                    sum_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                )
        else:
            for j in range(start[i - 1], s):
                val = values[j]
                nobs, sum_x, compensation_remove = remove_sum(
                    val, nobs, sum_x, compensation_remove
                )

            for j in range(end[i - 1], e):
                val = values[j]
                (
                    nobs,
                    sum_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_sum(
                    val,
                    nobs,
                    sum_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                )

        if nobs == 0 == min_periods:
            result: object = 0
        elif nobs >= min_periods:
            if num_consecutive_same_value >= nobs:
                result = prev_value * nobs
            else:
                result = sum_x
        else:
            result = na_val
            if dtype.kind == "i":
                na_pos.append(i)

        output[i] = result

        if not is_monotonic_increasing_bounds:
            nobs = 0
            sum_x = 0
            compensation_remove = 0

    return output, na_pos
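

# Why add_sum/remove_sum carry a compensation term: a plain running sum drops
# low-order bits when small values meet a large total. Kahan summation keeps
# the lost bits in a correction term. A pure-Python sketch (illustrative only):
def _kahan_sum(xs):
    total = 0.0
    comp = 0.0  # running compensation for lost low-order bits
    for x in xs:
        y = x - comp
        t = total + y
        comp = (t - total) - y
        total = t
    return total


# With xs = [1.0] + [1e-16] * 1000, sum(xs) == 1.0 exactly (every tiny addend
# is rounded away), while _kahan_sum(xs) recovers ~1.0 + 1e-13, matching
# math.fsum(xs).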


# Mypy/pyright don't like the fact that the decorator is untyped
@register_jitable  # type: ignore[misc]
def grouped_kahan_sum(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
) -> tuple[
    np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray
]:
    N = len(labels)

    nobs_arr = np.zeros(ngroups, dtype=np.int64)
    comp_arr = np.zeros(ngroups, dtype=values.dtype)
    consecutive_counts = np.zeros(ngroups, dtype=np.int64)
    prev_vals = np.zeros(ngroups, dtype=values.dtype)
    output = np.zeros(ngroups, dtype=result_dtype)

    for i in range(N):
        lab = labels[i]
        val = values[i]

        if lab < 0:
            continue

        sum_x = output[lab]
        nobs = nobs_arr[lab]
        compensation_add = comp_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        prev_value = prev_vals[lab]

        (
            nobs,
            sum_x,
            compensation_add,
            num_consecutive_same_value,
            prev_value,
        ) = add_sum(
            val,
            nobs,
            sum_x,
            compensation_add,
            num_consecutive_same_value,
            prev_value,
        )

        output[lab] = sum_x
        consecutive_counts[lab] = num_consecutive_same_value
        prev_vals[lab] = prev_value
        comp_arr[lab] = compensation_add
        nobs_arr[lab] = nobs
    return output, nobs_arr, comp_arr, consecutive_counts, prev_vals


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_sum(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
    min_periods: int,
) -> tuple[np.ndarray, list[int]]:
    na_pos = []

    output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
        values, result_dtype, labels, ngroups
    )

    # Post-processing, replace sums that don't satisfy min_periods
    for lab in range(ngroups):
        nobs = nobs_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        prev_value = prev_vals[lab]
        sum_x = output[lab]
        if nobs >= min_periods:
            if num_consecutive_same_value >= nobs:
                result = prev_value * nobs
            else:
                result = sum_x
        else:
            result = sum_x  # Don't change val, will be replaced by nan later
            na_pos.append(lab)
        output[lab] = result

    return output, na_pos
lib/python3.11/site-packages/pandas/core/_numba/kernels/var_.py (new file, 245 lines)
@@ -0,0 +1,245 @@
"""
Numba 1D var kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding

Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numba
import numpy as np

if TYPE_CHECKING:
    from pandas._typing import npt

from pandas.core._numba.kernels.shared import is_monotonic_increasing


@numba.jit(nopython=True, nogil=True, parallel=False)
def add_var(
    val: float,
    nobs: int,
    mean_x: float,
    ssqdm_x: float,
    compensation: float,
    num_consecutive_same_value: int,
    prev_value: float,
) -> tuple[int, float, float, float, int, float]:
    if not np.isnan(val):
        if val == prev_value:
            num_consecutive_same_value += 1
        else:
            num_consecutive_same_value = 1
        prev_value = val

        nobs += 1
        prev_mean = mean_x - compensation
        y = val - compensation
        t = y - mean_x
        compensation = t + mean_x - y
        delta = t
        if nobs:
            mean_x += delta / nobs
        else:
            mean_x = 0
        ssqdm_x += (val - prev_mean) * (val - mean_x)
    return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value


@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_var(
    val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
) -> tuple[int, float, float, float]:
    if not np.isnan(val):
        nobs -= 1
        if nobs:
            prev_mean = mean_x - compensation
            y = val - compensation
            t = y - mean_x
            compensation = t + mean_x - y
            delta = t
            mean_x -= delta / nobs
            ssqdm_x -= (val - prev_mean) * (val - mean_x)
        else:
            mean_x = 0
            ssqdm_x = 0
    return nobs, mean_x, ssqdm_x, compensation


@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_var(
    values: np.ndarray,
    result_dtype: np.dtype,
    start: np.ndarray,
    end: np.ndarray,
    min_periods: int,
    ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
    N = len(start)
    nobs = 0
    mean_x = 0.0
    ssqdm_x = 0.0
    compensation_add = 0.0
    compensation_remove = 0.0

    min_periods = max(min_periods, 1)
    is_monotonic_increasing_bounds = is_monotonic_increasing(
        start
    ) and is_monotonic_increasing(end)

    output = np.empty(N, dtype=result_dtype)

    for i in range(N):
        s = start[i]
        e = end[i]
        if i == 0 or not is_monotonic_increasing_bounds:
            prev_value = values[s]
            num_consecutive_same_value = 0

            for j in range(s, e):
                val = values[j]
                (
                    nobs,
                    mean_x,
                    ssqdm_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_var(
                    val,
                    nobs,
                    mean_x,
                    ssqdm_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                )
        else:
            for j in range(start[i - 1], s):
                val = values[j]
                nobs, mean_x, ssqdm_x, compensation_remove = remove_var(
                    val, nobs, mean_x, ssqdm_x, compensation_remove
                )

            for j in range(end[i - 1], e):
                val = values[j]
                (
                    nobs,
                    mean_x,
                    ssqdm_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_var(
                    val,
                    nobs,
                    mean_x,
                    ssqdm_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                )

        if nobs >= min_periods and nobs > ddof:
            if nobs == 1 or num_consecutive_same_value >= nobs:
                result = 0.0
            else:
                result = ssqdm_x / (nobs - ddof)
        else:
            result = np.nan

        output[i] = result

        if not is_monotonic_increasing_bounds:
            nobs = 0
            mean_x = 0.0
            ssqdm_x = 0.0
            compensation_remove = 0.0

    # na_position is an empty list since float64 can already hold nans
    # Do list comprehension, since numba cannot figure out that na_pos is
    # an empty list of ints on its own
    na_pos = [0 for i in range(0)]
    return output, na_pos


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_var(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
    min_periods: int,
    ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
    N = len(labels)

    nobs_arr = np.zeros(ngroups, dtype=np.int64)
    comp_arr = np.zeros(ngroups, dtype=values.dtype)
    consecutive_counts = np.zeros(ngroups, dtype=np.int64)
    prev_vals = np.zeros(ngroups, dtype=values.dtype)
    output = np.zeros(ngroups, dtype=result_dtype)
    means = np.zeros(ngroups, dtype=result_dtype)

    for i in range(N):
        lab = labels[i]
        val = values[i]

        if lab < 0:
            continue

        mean_x = means[lab]
        ssqdm_x = output[lab]
        nobs = nobs_arr[lab]
        compensation_add = comp_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        prev_value = prev_vals[lab]

        (
            nobs,
            mean_x,
            ssqdm_x,
            compensation_add,
            num_consecutive_same_value,
            prev_value,
        ) = add_var(
            val,
            nobs,
            mean_x,
            ssqdm_x,
            compensation_add,
            num_consecutive_same_value,
            prev_value,
        )

        output[lab] = ssqdm_x
        means[lab] = mean_x
        consecutive_counts[lab] = num_consecutive_same_value
        prev_vals[lab] = prev_value
        comp_arr[lab] = compensation_add
        nobs_arr[lab] = nobs

    # Post-processing, replace vars that don't satisfy min_periods
    for lab in range(ngroups):
        nobs = nobs_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        ssqdm_x = output[lab]
        if nobs >= min_periods and nobs > ddof:
            if nobs == 1 or num_consecutive_same_value >= nobs:
                result = 0.0
            else:
                result = ssqdm_x / (nobs - ddof)
        else:
            result = np.nan
        output[lab] = result

    # na_position is an empty list since float64 can already hold nans
    # Do list comprehension, since numba cannot figure out that na_pos is
    # an empty list of ints on its own
    na_pos = [0 for i in range(0)]
    return output, na_pos
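

# Cross-check sketch (illustrative only): for a single group, the streaming
# Welford/Kahan update above should agree with numpy's two-pass variance.
def _demo_grouped_var():
    values = np.array([1.0, 2.0, 4.0, 8.0])
    labels = np.zeros(len(values), dtype=np.intp)
    out, _ = grouped_var(values, np.float64, labels, 1, 1)
    assert np.isclose(out[0], np.var(values, ddof=1))
    return out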
lib/python3.11/site-packages/pandas/core/accessor.py (new file, 340 lines)
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
|
||||
accessor.py contains base classes for implementing accessor properties
|
||||
that can be mixed into or pinned onto other pandas classes.
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
Callable,
|
||||
final,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from pandas.util._decorators import doc
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
|
||||
class DirNamesMixin:
|
||||
_accessors: set[str] = set()
|
||||
_hidden_attrs: frozenset[str] = frozenset()
|
||||
|
||||
@final
|
||||
def _dir_deletions(self) -> set[str]:
|
||||
"""
|
||||
Delete unwanted __dir__ for this object.
|
||||
"""
|
||||
return self._accessors | self._hidden_attrs
|
||||
|
||||
def _dir_additions(self) -> set[str]:
|
||||
"""
|
||||
Add additional __dir__ for this object.
|
||||
"""
|
||||
return {accessor for accessor in self._accessors if hasattr(self, accessor)}
|
||||
|
||||
def __dir__(self) -> list[str]:
|
||||
"""
|
||||
Provide method name lookup and completion.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Only provide 'public' methods.
|
||||
"""
|
||||
rv = set(super().__dir__())
|
||||
rv = (rv - self._dir_deletions()) | self._dir_additions()
|
||||
return sorted(rv)
|
||||
|
||||
|
||||
class PandasDelegate:
|
||||
"""
|
||||
Abstract base class for delegating methods/properties.
|
||||
"""
|
||||
|
||||
def _delegate_property_get(self, name: str, *args, **kwargs):
|
||||
raise TypeError(f"You cannot access the property {name}")
|
||||
|
||||
def _delegate_property_set(self, name: str, value, *args, **kwargs):
|
||||
raise TypeError(f"The property {name} cannot be set")
|
||||
|
||||
def _delegate_method(self, name: str, *args, **kwargs):
|
||||
raise TypeError(f"You cannot call method {name}")
|
||||
|
||||
@classmethod
|
||||
def _add_delegate_accessors(
|
||||
cls,
|
||||
delegate,
|
||||
accessors: list[str],
|
||||
typ: str,
|
||||
overwrite: bool = False,
|
||||
accessor_mapping: Callable[[str], str] = lambda x: x,
|
||||
raise_on_missing: bool = True,
|
||||
) -> None:
|
||||
"""
|
||||
Add accessors to cls from the delegate class.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cls
|
||||
Class to add the methods/properties to.
|
||||
delegate
|
||||
Class to get methods/properties and doc-strings.
|
||||
accessors : list of str
|
||||
List of accessors to add.
|
||||
typ : {'property', 'method'}
|
||||
overwrite : bool, default False
|
||||
Overwrite the method/property in the target class if it exists.
|
||||
accessor_mapping: Callable, default lambda x: x
|
||||
Callable to map the delegate's function to the cls' function.
|
||||
raise_on_missing: bool, default True
|
||||
Raise if an accessor does not exist on delegate.
|
||||
False skips the missing accessor.
|
||||
"""
|
||||
|
||||
def _create_delegator_property(name: str):
|
||||
def _getter(self):
|
||||
return self._delegate_property_get(name)
|
||||
|
||||
def _setter(self, new_values):
|
||||
return self._delegate_property_set(name, new_values)
|
||||
|
||||
_getter.__name__ = name
|
||||
_setter.__name__ = name
|
||||
|
||||
return property(
|
||||
fget=_getter,
|
||||
fset=_setter,
|
||||
doc=getattr(delegate, accessor_mapping(name)).__doc__,
|
||||
)
|
||||
|
||||
def _create_delegator_method(name: str):
|
||||
def f(self, *args, **kwargs):
|
||||
return self._delegate_method(name, *args, **kwargs)
|
||||
|
||||
f.__name__ = name
|
||||
f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__
|
||||
|
||||
return f
|
||||
|
||||
for name in accessors:
|
||||
if (
|
||||
not raise_on_missing
|
||||
and getattr(delegate, accessor_mapping(name), None) is None
|
||||
):
|
||||
continue
|
||||
|
||||
if typ == "property":
|
||||
f = _create_delegator_property(name)
|
||||
else:
|
||||
f = _create_delegator_method(name)
|
||||
|
||||
# don't overwrite existing methods/properties
|
||||
if overwrite or not hasattr(cls, name):
|
||||
setattr(cls, name, f)
|
||||
|
||||
|
||||
def delegate_names(
    delegate,
    accessors: list[str],
    typ: str,
    overwrite: bool = False,
    accessor_mapping: Callable[[str], str] = lambda x: x,
    raise_on_missing: bool = True,
):
    """
    Add delegated names to a class using a class decorator. This provides
    an alternative usage to directly calling `_add_delegate_accessors`
    below a class definition.

    Parameters
    ----------
    delegate : object
        The class to get methods/properties & doc-strings.
    accessors : Sequence[str]
        List of accessors to add.
    typ : {'property', 'method'}
    overwrite : bool, default False
        Overwrite the method/property in the target class if it exists.
    accessor_mapping: Callable, default lambda x: x
        Callable to map the delegate's function to the cls' function.
    raise_on_missing: bool, default True
        Raise if an accessor does not exist on delegate.
        False skips the missing accessor.

    Returns
    -------
    callable
        A class decorator.

    Examples
    --------
    @delegate_names(Categorical, ["categories", "ordered"], "property")
    class CategoricalAccessor(PandasDelegate):
        [...]
    """

    def add_delegate_accessors(cls):
        cls._add_delegate_accessors(
            delegate,
            accessors,
            typ,
            overwrite=overwrite,
            accessor_mapping=accessor_mapping,
            raise_on_missing=raise_on_missing,
        )
        return cls

    return add_delegate_accessors


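A minimal usage sketch of the decorator above, with an illustrative (non-pandas) delegate class; the generated accessor method forwards through PandasDelegate._delegate_method:

class _Engine:
    def describe(self):
        """Return a short description."""
        return "engine"


@delegate_names(_Engine, ["describe"], typ="method")
class EngineAccessor(PandasDelegate):
    def __init__(self, obj) -> None:
        self._obj = obj

    def _delegate_method(self, name, *args, **kwargs):
        # forward the delegated call to the wrapped object
        return getattr(self._obj, name)(*args, **kwargs)


EngineAccessor(_Engine()).describe()  # -> "engine"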
# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE
# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py
# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors
# 2. We use a UserWarning instead of a custom Warning


class CachedAccessor:
    """
    Custom property-like object.

    A descriptor for caching accessors.

    Parameters
    ----------
    name : str
        Namespace that will be accessed under, e.g. ``df.foo``.
    accessor : cls
        Class with the extension methods.

    Notes
    -----
    For accessor, the class's __init__ method assumes one of
    ``Series``, ``DataFrame`` or ``Index`` as the
    single argument ``data``.
    """

    def __init__(self, name: str, accessor) -> None:
        self._name = name
        self._accessor = accessor

    def __get__(self, obj, cls):
        if obj is None:
            # we're accessing the attribute of the class, i.e., Dataset.geo
            return self._accessor
        accessor_obj = self._accessor(obj)
        # Replace the property with the accessor object. Inspired by:
        # https://www.pydanny.com/cached-property.html
        # We need to use object.__setattr__ because we overwrite __setattr__ on
        # NDFrame
        object.__setattr__(obj, self._name, accessor_obj)
        return accessor_obj


@doc(klass="", others="")
|
||||
def _register_accessor(name: str, cls):
|
||||
"""
|
||||
Register a custom accessor on {klass} objects.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name : str
|
||||
Name under which the accessor should be registered. A warning is issued
|
||||
if this name conflicts with a preexisting attribute.
|
||||
|
||||
Returns
|
||||
-------
|
||||
callable
|
||||
A class decorator.
|
||||
|
||||
See Also
|
||||
--------
|
||||
register_dataframe_accessor : Register a custom accessor on DataFrame objects.
|
||||
register_series_accessor : Register a custom accessor on Series objects.
|
||||
register_index_accessor : Register a custom accessor on Index objects.
|
||||
|
||||
Notes
|
||||
-----
|
||||
When accessed, your accessor will be initialized with the pandas object
|
||||
the user is interacting with. So the signature must be
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def __init__(self, pandas_object): # noqa: E999
|
||||
...
|
||||
|
||||
For consistency with pandas methods, you should raise an ``AttributeError``
|
||||
if the data passed to your accessor has an incorrect dtype.
|
||||
|
||||
>>> pd.Series(['a', 'b']).dt
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
AttributeError: Can only use .dt accessor with datetimelike values
|
||||
|
||||
Examples
|
||||
--------
|
||||
In your library code::
|
||||
|
||||
import pandas as pd
|
||||
|
||||
@pd.api.extensions.register_dataframe_accessor("geo")
|
||||
class GeoAccessor:
|
||||
def __init__(self, pandas_obj):
|
||||
self._obj = pandas_obj
|
||||
|
||||
@property
|
||||
def center(self):
|
||||
# return the geographic center point of this DataFrame
|
||||
lat = self._obj.latitude
|
||||
lon = self._obj.longitude
|
||||
return (float(lon.mean()), float(lat.mean()))
|
||||
|
||||
def plot(self):
|
||||
# plot this array's data on a map, e.g., using Cartopy
|
||||
pass
|
||||
|
||||
Back in an interactive IPython session:
|
||||
|
||||
.. code-block:: ipython
|
||||
|
||||
In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
|
||||
...: "latitude": np.linspace(0, 20)}})
|
||||
In [2]: ds.geo.center
|
||||
Out[2]: (5.0, 10.0)
|
||||
In [3]: ds.geo.plot() # plots data on a map
|
||||
"""
|
||||
|
||||
def decorator(accessor):
|
||||
if hasattr(cls, name):
|
||||
warnings.warn(
|
||||
f"registration of accessor {repr(accessor)} under name "
|
||||
f"{repr(name)} for type {repr(cls)} is overriding a preexisting "
|
||||
f"attribute with the same name.",
|
||||
UserWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
setattr(cls, name, CachedAccessor(name, accessor))
|
||||
cls._accessors.add(name)
|
||||
return accessor
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
@doc(_register_accessor, klass="DataFrame")
|
||||
def register_dataframe_accessor(name: str):
|
||||
from pandas import DataFrame
|
||||
|
||||
return _register_accessor(name, DataFrame)
|
||||
|
||||
|
||||
@doc(_register_accessor, klass="Series")
|
||||
def register_series_accessor(name: str):
|
||||
from pandas import Series
|
||||
|
||||
return _register_accessor(name, Series)
|
||||
|
||||
|
||||
@doc(_register_accessor, klass="Index")
|
||||
def register_index_accessor(name: str):
|
||||
from pandas import Index
|
||||
|
||||
return _register_accessor(name, Index)
|
||||
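A minimal registration sketch using the public entry point above; the "upper" namespace is illustrative, not a pandas-provided accessor:

import pandas as pd


@pd.api.extensions.register_series_accessor("upper")
class UpperAccessor:
    def __init__(self, pandas_obj) -> None:
        self._obj = pandas_obj

    def first(self):
        # uppercase the first element of the wrapped Series
        return str(self._obj.iloc[0]).upper()


pd.Series(["a", "b"]).upper.first()  # -> "A"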
1747
lib/python3.11/site-packages/pandas/core/algorithms.py
Normal file
File diff suppressed because it is too large
140
lib/python3.11/site-packages/pandas/core/api.py
Normal file
@ -0,0 +1,140 @@
|
||||
from pandas._libs import (
|
||||
NaT,
|
||||
Period,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
)
|
||||
from pandas._libs.missing import NA
|
||||
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
ArrowDtype,
|
||||
CategoricalDtype,
|
||||
DatetimeTZDtype,
|
||||
IntervalDtype,
|
||||
PeriodDtype,
|
||||
)
|
||||
from pandas.core.dtypes.missing import (
|
||||
isna,
|
||||
isnull,
|
||||
notna,
|
||||
notnull,
|
||||
)
|
||||
|
||||
from pandas.core.algorithms import (
|
||||
factorize,
|
||||
unique,
|
||||
value_counts,
|
||||
)
|
||||
from pandas.core.arrays import Categorical
|
||||
from pandas.core.arrays.boolean import BooleanDtype
|
||||
from pandas.core.arrays.floating import (
|
||||
Float32Dtype,
|
||||
Float64Dtype,
|
||||
)
|
||||
from pandas.core.arrays.integer import (
|
||||
Int8Dtype,
|
||||
Int16Dtype,
|
||||
Int32Dtype,
|
||||
Int64Dtype,
|
||||
UInt8Dtype,
|
||||
UInt16Dtype,
|
||||
UInt32Dtype,
|
||||
UInt64Dtype,
|
||||
)
|
||||
from pandas.core.arrays.string_ import StringDtype
|
||||
from pandas.core.construction import array
|
||||
from pandas.core.flags import Flags
|
||||
from pandas.core.groupby import (
|
||||
Grouper,
|
||||
NamedAgg,
|
||||
)
|
||||
from pandas.core.indexes.api import (
|
||||
CategoricalIndex,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
IntervalIndex,
|
||||
MultiIndex,
|
||||
PeriodIndex,
|
||||
RangeIndex,
|
||||
TimedeltaIndex,
|
||||
)
|
||||
from pandas.core.indexes.datetimes import (
|
||||
bdate_range,
|
||||
date_range,
|
||||
)
|
||||
from pandas.core.indexes.interval import (
|
||||
Interval,
|
||||
interval_range,
|
||||
)
|
||||
from pandas.core.indexes.period import period_range
|
||||
from pandas.core.indexes.timedeltas import timedelta_range
|
||||
from pandas.core.indexing import IndexSlice
|
||||
from pandas.core.series import Series
|
||||
from pandas.core.tools.datetimes import to_datetime
|
||||
from pandas.core.tools.numeric import to_numeric
|
||||
from pandas.core.tools.timedeltas import to_timedelta
|
||||
|
||||
from pandas.io.formats.format import set_eng_float_format
|
||||
from pandas.tseries.offsets import DateOffset
|
||||
|
||||
# DataFrame needs to be imported after NamedAgg to avoid a circular import
|
||||
from pandas.core.frame import DataFrame # isort:skip
|
||||
|
||||
__all__ = [
|
||||
"array",
|
||||
"ArrowDtype",
|
||||
"bdate_range",
|
||||
"BooleanDtype",
|
||||
"Categorical",
|
||||
"CategoricalDtype",
|
||||
"CategoricalIndex",
|
||||
"DataFrame",
|
||||
"DateOffset",
|
||||
"date_range",
|
||||
"DatetimeIndex",
|
||||
"DatetimeTZDtype",
|
||||
"factorize",
|
||||
"Flags",
|
||||
"Float32Dtype",
|
||||
"Float64Dtype",
|
||||
"Grouper",
|
||||
"Index",
|
||||
"IndexSlice",
|
||||
"Int16Dtype",
|
||||
"Int32Dtype",
|
||||
"Int64Dtype",
|
||||
"Int8Dtype",
|
||||
"Interval",
|
||||
"IntervalDtype",
|
||||
"IntervalIndex",
|
||||
"interval_range",
|
||||
"isna",
|
||||
"isnull",
|
||||
"MultiIndex",
|
||||
"NA",
|
||||
"NamedAgg",
|
||||
"NaT",
|
||||
"notna",
|
||||
"notnull",
|
||||
"Period",
|
||||
"PeriodDtype",
|
||||
"PeriodIndex",
|
||||
"period_range",
|
||||
"RangeIndex",
|
||||
"Series",
|
||||
"set_eng_float_format",
|
||||
"StringDtype",
|
||||
"Timedelta",
|
||||
"TimedeltaIndex",
|
||||
"timedelta_range",
|
||||
"Timestamp",
|
||||
"to_datetime",
|
||||
"to_numeric",
|
||||
"to_timedelta",
|
||||
"UInt16Dtype",
|
||||
"UInt32Dtype",
|
||||
"UInt64Dtype",
|
||||
"UInt8Dtype",
|
||||
"unique",
|
||||
"value_counts",
|
||||
]
|
||||
2057
lib/python3.11/site-packages/pandas/core/apply.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,9 @@
|
||||
"""
|
||||
core.array_algos is for algorithms that operate on ndarray and ExtensionArray.
|
||||
These should:
|
||||
|
||||
- Assume that any Index, Series, or DataFrame objects have already been unwrapped.
|
||||
- Assume that any list arguments have already been cast to ndarray/EA.
|
||||
- Not depend on Index, Series, or DataFrame, nor import any of these.
|
||||
- Dispatch to ExtensionArray methods if needed, but not import from core.arrays.
|
||||
"""
|
||||
@ -0,0 +1,67 @@
|
||||
"""
|
||||
datetimelike_accumulations.py is for accumulations of datetimelike extension arrays
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Callable
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import iNaT
|
||||
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
|
||||
def _cum_func(
|
||||
func: Callable,
|
||||
values: np.ndarray,
|
||||
*,
|
||||
skipna: bool = True,
|
||||
):
|
||||
"""
|
||||
Accumulations for 1D datetimelike arrays.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : np.cumsum, np.maximum.accumulate, np.minimum.accumulate
|
||||
values : np.ndarray
|
||||
        Numpy array with the values (can be of any dtype that supports the
        operation). Values are modified in place.
|
||||
skipna : bool, default True
|
||||
Whether to skip NA.
|
||||
"""
|
||||
try:
|
||||
fill_value = {
|
||||
np.maximum.accumulate: np.iinfo(np.int64).min,
|
||||
np.cumsum: 0,
|
||||
np.minimum.accumulate: np.iinfo(np.int64).max,
|
||||
}[func]
|
||||
except KeyError:
|
||||
raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray")
|
||||
|
||||
mask = isna(values)
|
||||
y = values.view("i8")
|
||||
y[mask] = fill_value
|
||||
|
||||
if not skipna:
|
||||
mask = np.maximum.accumulate(mask)
|
||||
|
||||
result = func(y)
|
||||
result[mask] = iNaT
|
||||
|
||||
if values.dtype.kind in "mM":
|
||||
return result.view(values.dtype.base)
|
||||
return result
|
||||
|
||||
|
||||
def cumsum(values: np.ndarray, *, skipna: bool = True) -> np.ndarray:
|
||||
return _cum_func(np.cumsum, values, skipna=skipna)
|
||||
|
||||
|
||||
def cummin(values: np.ndarray, *, skipna: bool = True):
|
||||
return _cum_func(np.minimum.accumulate, values, skipna=skipna)
|
||||
|
||||
|
||||
def cummax(values: np.ndarray, *, skipna: bool = True):
|
||||
return _cum_func(np.maximum.accumulate, values, skipna=skipna)
|
||||
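A minimal sketch of the datetimelike accumulations above (internal API, assuming the module is importable as pandas.core.array_algos.datetimelike_accumulations; it may change between versions):

import numpy as np

from pandas.core.array_algos import datetimelike_accumulations as dtl_acc

values = np.array(["2021-01-02", "NaT", "2021-01-01"], dtype="datetime64[ns]")
# pass a copy: the input array is modified in place
out = dtl_acc.cummax(values.copy(), skipna=True)
# out -> ['2021-01-02', 'NaT', '2021-01-02']; the NaT slot stays missing
# and later values keep accumulating past it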
@ -0,0 +1,90 @@
|
||||
"""
|
||||
masked_accumulations.py is for accumulation algorithms using a mask-based approach
|
||||
for missing values.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Callable,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import npt
|
||||
|
||||
|
||||
def _cum_func(
|
||||
func: Callable,
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
):
|
||||
"""
|
||||
Accumulations for 1D masked array.
|
||||
|
||||
We will modify values in place to replace NAs with the appropriate fill value.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
|
||||
values : np.ndarray
|
||||
        Numpy array with the values (can be of any dtype that supports the
|
||||
operation).
|
||||
mask : np.ndarray
|
||||
Boolean numpy array (True values indicate missing values).
|
||||
skipna : bool, default True
|
||||
Whether to skip NA.
|
||||
"""
|
||||
dtype_info: np.iinfo | np.finfo
|
||||
if values.dtype.kind == "f":
|
||||
dtype_info = np.finfo(values.dtype.type)
|
||||
elif values.dtype.kind in "iu":
|
||||
dtype_info = np.iinfo(values.dtype.type)
|
||||
elif values.dtype.kind == "b":
|
||||
# Max value of bool is 1, but since we are setting into a boolean
|
||||
# array, 255 is fine as well. Min value has to be 0 when setting
|
||||
# into the boolean array.
|
||||
dtype_info = np.iinfo(np.uint8)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"No masked accumulation defined for dtype {values.dtype.type}"
|
||||
)
|
||||
try:
|
||||
fill_value = {
|
||||
np.cumprod: 1,
|
||||
np.maximum.accumulate: dtype_info.min,
|
||||
np.cumsum: 0,
|
||||
np.minimum.accumulate: dtype_info.max,
|
||||
}[func]
|
||||
except KeyError:
|
||||
raise NotImplementedError(
|
||||
f"No accumulation for {func} implemented on BaseMaskedArray"
|
||||
)
|
||||
|
||||
values[mask] = fill_value
|
||||
|
||||
if not skipna:
|
||||
mask = np.maximum.accumulate(mask)
|
||||
|
||||
values = func(values)
|
||||
return values, mask
|
||||
|
||||
|
||||
def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
|
||||
return _cum_func(np.cumsum, values, mask, skipna=skipna)
|
||||
|
||||
|
||||
def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
|
||||
return _cum_func(np.cumprod, values, mask, skipna=skipna)
|
||||
|
||||
|
||||
def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
|
||||
return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna)
|
||||
|
||||
|
||||
def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
|
||||
return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna)
|
||||
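A minimal sketch of the masked accumulations above, as used by pandas' nullable (masked) arrays; this is internal API and assumed importable as pandas.core.array_algos.masked_accumulations:

import numpy as np

from pandas.core.array_algos import masked_accumulations as mask_acc

vals = np.array([1, 2, 3], dtype="int64")
mask = np.array([False, True, False])  # position 1 is missing
out, out_mask = mask_acc.cumsum(vals.copy(), mask.copy(), skipna=True)
# out -> [1, 1, 4]: the missing slot is filled with 0 for the running sum,
# while out_mask still marks position 1 as missing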
@ -0,0 +1,201 @@
|
||||
"""
|
||||
masked_reductions.py is for reduction algorithms using a mask-based approach
|
||||
for missing values.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Callable,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import missing as libmissing
|
||||
|
||||
from pandas.core.nanops import check_below_min_count
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
AxisInt,
|
||||
npt,
|
||||
)
|
||||
|
||||
|
||||
def _reductions(
|
||||
func: Callable,
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
min_count: int = 0,
|
||||
axis: AxisInt | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Sum, mean or product for 1D masked array.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : np.sum or np.prod
|
||||
values : np.ndarray
|
||||
        Numpy array with the values (can be of any dtype that supports the
|
||||
operation).
|
||||
mask : np.ndarray[bool]
|
||||
Boolean numpy array (True values indicate missing values).
|
||||
skipna : bool, default True
|
||||
Whether to skip NA.
|
||||
min_count : int, default 0
|
||||
The required number of valid values to perform the operation. If fewer than
|
||||
``min_count`` non-NA values are present the result will be NA.
|
||||
axis : int, optional, default None
|
||||
"""
|
||||
if not skipna:
|
||||
if mask.any() or check_below_min_count(values.shape, None, min_count):
|
||||
return libmissing.NA
|
||||
else:
|
||||
return func(values, axis=axis, **kwargs)
|
||||
else:
|
||||
if check_below_min_count(values.shape, mask, min_count) and (
|
||||
axis is None or values.ndim == 1
|
||||
):
|
||||
return libmissing.NA
|
||||
|
||||
if values.dtype == np.dtype(object):
|
||||
# object dtype does not support `where` without passing an initial
|
||||
values = values[~mask]
|
||||
return func(values, axis=axis, **kwargs)
|
||||
return func(values, where=~mask, axis=axis, **kwargs)
|
||||
|
||||
|
||||
def sum(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
min_count: int = 0,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
return _reductions(
|
||||
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
|
||||
)
|
||||
|
||||
|
||||
def prod(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
min_count: int = 0,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
return _reductions(
|
||||
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
|
||||
)
|
||||
|
||||
|
||||
def _minmax(
|
||||
func: Callable,
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
"""
|
||||
Reduction for 1D masked array.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : np.min or np.max
|
||||
values : np.ndarray
|
||||
        Numpy array with the values (can be of any dtype that supports the
|
||||
operation).
|
||||
mask : np.ndarray[bool]
|
||||
Boolean numpy array (True values indicate missing values).
|
||||
skipna : bool, default True
|
||||
Whether to skip NA.
|
||||
axis : int, optional, default None
|
||||
"""
|
||||
if not skipna:
|
||||
if mask.any() or not values.size:
|
||||
# min/max with empty array raise in numpy, pandas returns NA
|
||||
return libmissing.NA
|
||||
else:
|
||||
return func(values, axis=axis)
|
||||
else:
|
||||
subset = values[~mask]
|
||||
if subset.size:
|
||||
return func(subset, axis=axis)
|
||||
else:
|
||||
# min/max with empty array raise in numpy, pandas returns NA
|
||||
return libmissing.NA
|
||||
|
||||
|
||||
def min(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)
|
||||
|
||||
|
||||
def max(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)
|
||||
|
||||
|
||||
def mean(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
if not values.size or mask.all():
|
||||
return libmissing.NA
|
||||
return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis)
|
||||
|
||||
|
||||
def var(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
ddof: int = 1,
|
||||
):
|
||||
if not values.size or mask.all():
|
||||
return libmissing.NA
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", RuntimeWarning)
|
||||
return _reductions(
|
||||
np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
|
||||
)
|
||||
|
||||
|
||||
def std(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
ddof: int = 1,
|
||||
):
|
||||
if not values.size or mask.all():
|
||||
return libmissing.NA
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", RuntimeWarning)
|
||||
return _reductions(
|
||||
np.std, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
|
||||
)
|
||||
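A minimal sketch of the masked reductions above (internal API, assumed importable as pandas.core.array_algos.masked_reductions):

import numpy as np

from pandas.core.array_algos import masked_reductions as mask_red

vals = np.array([1.0, 2.0, 4.0])
mask = np.array([False, False, True])  # last value is NA
mask_red.sum(values=vals, mask=mask, skipna=True)   # -> 3.0 (NA excluded)
mask_red.sum(values=vals, mask=mask, skipna=False)  # -> pd.NA, a value is missing
mask_red.mean(values=vals, mask=mask, skipna=True)  # -> 1.5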
149
lib/python3.11/site-packages/pandas/core/array_algos/putmask.py
Normal file
@ -0,0 +1,149 @@
|
||||
"""
|
||||
EA-compatible analogue to np.putmask
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import lib
|
||||
|
||||
from pandas.core.dtypes.cast import infer_dtype_from
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
|
||||
from pandas.core.arrays import ExtensionArray
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas import MultiIndex
|
||||
|
||||
|
||||
def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None:
|
||||
"""
|
||||
ExtensionArray-compatible implementation of np.putmask. The main
|
||||
difference is we do not handle repeating or truncating like numpy.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values: np.ndarray or ExtensionArray
|
||||
mask : np.ndarray[bool]
|
||||
We assume extract_bool_array has already been called.
|
||||
value : Any
|
||||
"""
|
||||
|
||||
if (
|
||||
not isinstance(values, np.ndarray)
|
||||
or (values.dtype == object and not lib.is_scalar(value))
|
||||
# GH#43424: np.putmask raises TypeError if we cannot cast between types with
|
||||
# rule = "safe", a stricter guarantee we may not have here
|
||||
or (
|
||||
isinstance(value, np.ndarray) and not np.can_cast(value.dtype, values.dtype)
|
||||
)
|
||||
):
|
||||
# GH#19266 using np.putmask gives unexpected results with listlike value
|
||||
# along with object dtype
|
||||
if is_list_like(value) and len(value) == len(values):
|
||||
values[mask] = value[mask]
|
||||
else:
|
||||
values[mask] = value
|
||||
else:
|
||||
# GH#37833 np.putmask is more performant than __setitem__
|
||||
np.putmask(values, mask, value)
|
||||
|
||||
|
||||
def putmask_without_repeat(
|
||||
values: np.ndarray, mask: npt.NDArray[np.bool_], new: Any
|
||||
) -> None:
|
||||
"""
|
||||
np.putmask will truncate or repeat if `new` is a listlike with
|
||||
len(new) != len(values). We require an exact match.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray
|
||||
mask : np.ndarray[bool]
|
||||
new : Any
|
||||
"""
|
||||
if getattr(new, "ndim", 0) >= 1:
|
||||
new = new.astype(values.dtype, copy=False)
|
||||
|
||||
# TODO: this prob needs some better checking for 2D cases
|
||||
nlocs = mask.sum()
|
||||
if nlocs > 0 and is_list_like(new) and getattr(new, "ndim", 1) == 1:
|
||||
shape = np.shape(new)
|
||||
        # np.shape compat in case setitem_datetimelike_compat
        # changed the arraylike to a list, e.g. test_where_dt64_2d
|
||||
if nlocs == shape[-1]:
|
||||
# GH#30567
|
||||
# If length of ``new`` is less than the length of ``values``,
|
||||
# `np.putmask` would first repeat the ``new`` array and then
|
||||
# assign the masked values hence produces incorrect result.
|
||||
            # `np.place` on the other hand uses the ``new`` values as it is
|
||||
# to place in the masked locations of ``values``
|
||||
np.place(values, mask, new)
|
||||
# i.e. values[mask] = new
|
||||
elif mask.shape[-1] == shape[-1] or shape[-1] == 1:
|
||||
np.putmask(values, mask, new)
|
||||
else:
|
||||
raise ValueError("cannot assign mismatch length to masked array")
|
||||
else:
|
||||
np.putmask(values, mask, new)
|
||||
|
||||
|
||||
def validate_putmask(
|
||||
values: ArrayLike | MultiIndex, mask: np.ndarray
|
||||
) -> tuple[npt.NDArray[np.bool_], bool]:
|
||||
"""
|
||||
Validate mask and check if this putmask operation is a no-op.
|
||||
"""
|
||||
mask = extract_bool_array(mask)
|
||||
if mask.shape != values.shape:
|
||||
raise ValueError("putmask: mask and data must be the same size")
|
||||
|
||||
noop = not mask.any()
|
||||
return mask, noop
|
||||
|
||||
|
||||
def extract_bool_array(mask: ArrayLike) -> npt.NDArray[np.bool_]:
|
||||
"""
|
||||
If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
|
||||
"""
|
||||
if isinstance(mask, ExtensionArray):
|
||||
# We could have BooleanArray, Sparse[bool], ...
|
||||
# Except for BooleanArray, this is equivalent to just
|
||||
# np.asarray(mask, dtype=bool)
|
||||
mask = mask.to_numpy(dtype=bool, na_value=False)
|
||||
|
||||
mask = np.asarray(mask, dtype=bool)
|
||||
return mask
|
||||
|
||||
|
||||
def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray
|
||||
num_set : int
|
||||
For putmask, this is mask.sum()
|
||||
other : Any
|
||||
"""
|
||||
if values.dtype == object:
|
||||
dtype, _ = infer_dtype_from(other)
|
||||
|
||||
if lib.is_np_dtype(dtype, "mM"):
|
||||
# https://github.com/numpy/numpy/issues/12550
|
||||
# timedelta64 will incorrectly cast to int
|
||||
if not is_list_like(other):
|
||||
other = [other] * num_set
|
||||
else:
|
||||
other = list(other)
|
||||
|
||||
return other
|
||||
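A minimal sketch of the putmask helpers above (internal API; validate_putmask checks the mask shape, putmask_inplace dispatches between np.putmask and __setitem__):

import numpy as np

from pandas.core.array_algos.putmask import putmask_inplace, validate_putmask

arr = np.array([1.0, 2.0, 3.0])
mask, noop = validate_putmask(arr, np.array([True, False, True]))
if not noop:
    putmask_inplace(arr, mask, 9.0)
# arr -> [9.0, 2.0, 9.0]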
226
lib/python3.11/site-packages/pandas/core/array_algos/quantile.py
Normal file
@ -0,0 +1,226 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.missing import (
|
||||
isna,
|
||||
na_value_for_dtype,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
Scalar,
|
||||
npt,
|
||||
)
|
||||
|
||||
|
||||
def quantile_compat(
|
||||
values: ArrayLike, qs: npt.NDArray[np.float64], interpolation: str
|
||||
) -> ArrayLike:
|
||||
"""
|
||||
Compute the quantiles of the given values for each quantile in `qs`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray or ExtensionArray
|
||||
qs : np.ndarray[float64]
|
||||
interpolation : str
|
||||
|
||||
Returns
|
||||
-------
|
||||
np.ndarray or ExtensionArray
|
||||
"""
|
||||
if isinstance(values, np.ndarray):
|
||||
fill_value = na_value_for_dtype(values.dtype, compat=False)
|
||||
mask = isna(values)
|
||||
return quantile_with_mask(values, mask, fill_value, qs, interpolation)
|
||||
else:
|
||||
return values._quantile(qs, interpolation)
|
||||
|
||||
|
||||
def quantile_with_mask(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
fill_value,
|
||||
qs: npt.NDArray[np.float64],
|
||||
interpolation: str,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Compute the quantiles of the given values for each quantile in `qs`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray
|
||||
For ExtensionArray, this is _values_for_factorize()[0]
|
||||
mask : np.ndarray[bool]
|
||||
mask = isna(values)
|
||||
        For ExtensionArray, this is computed before calling _values_for_factorize
|
||||
fill_value : Scalar
|
||||
        The value to fill NA entries with
|
||||
For ExtensionArray, this is _values_for_factorize()[1]
|
||||
qs : np.ndarray[float64]
|
||||
interpolation : str
|
||||
Type of interpolation
|
||||
|
||||
Returns
|
||||
-------
|
||||
np.ndarray
|
||||
|
||||
Notes
|
||||
-----
|
||||
Assumes values is already 2D. For ExtensionArray this means np.atleast_2d
|
||||
has been called on _values_for_factorize()[0]
|
||||
|
||||
Quantile is computed along axis=1.
|
||||
"""
|
||||
assert values.shape == mask.shape
|
||||
if values.ndim == 1:
|
||||
# unsqueeze, operate, re-squeeze
|
||||
values = np.atleast_2d(values)
|
||||
mask = np.atleast_2d(mask)
|
||||
res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
|
||||
return res_values[0]
|
||||
|
||||
assert values.ndim == 2
|
||||
|
||||
is_empty = values.shape[1] == 0
|
||||
|
||||
if is_empty:
|
||||
# create the array of na_values
|
||||
# 2d len(values) * len(qs)
|
||||
flat = np.array([fill_value] * len(qs))
|
||||
result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
|
||||
else:
|
||||
result = _nanpercentile(
|
||||
values,
|
||||
qs * 100.0,
|
||||
na_value=fill_value,
|
||||
mask=mask,
|
||||
interpolation=interpolation,
|
||||
)
|
||||
|
||||
result = np.asarray(result)
|
||||
result = result.T
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _nanpercentile_1d(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
qs: npt.NDArray[np.float64],
|
||||
na_value: Scalar,
|
||||
interpolation: str,
|
||||
) -> Scalar | np.ndarray:
|
||||
"""
|
||||
Wrapper for np.percentile that skips missing values, specialized to
|
||||
1-dimensional case.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : array over which to find quantiles
|
||||
mask : ndarray[bool]
|
||||
locations in values that should be considered missing
|
||||
qs : np.ndarray[float64] of quantile indices to find
|
||||
na_value : scalar
|
||||
value to return for empty or all-null values
|
||||
interpolation : str
|
||||
|
||||
Returns
|
||||
-------
|
||||
quantiles : scalar or array
|
||||
"""
|
||||
# mask is Union[ExtensionArray, ndarray]
|
||||
values = values[~mask]
|
||||
|
||||
if len(values) == 0:
|
||||
# Can't pass dtype=values.dtype here bc we might have na_value=np.nan
|
||||
# with values.dtype=int64 see test_quantile_empty
|
||||
# equiv: 'np.array([na_value] * len(qs))' but much faster
|
||||
return np.full(len(qs), na_value)
|
||||
|
||||
return np.percentile(
|
||||
values,
|
||||
qs,
|
||||
# error: No overload variant of "percentile" matches argument
|
||||
# types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]"
|
||||
# , "Dict[str, str]" [call-overload]
|
||||
method=interpolation, # type: ignore[call-overload]
|
||||
)
|
||||
|
||||
|
||||
def _nanpercentile(
|
||||
values: np.ndarray,
|
||||
qs: npt.NDArray[np.float64],
|
||||
*,
|
||||
na_value,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
interpolation: str,
|
||||
):
|
||||
"""
|
||||
Wrapper for np.percentile that skips missing values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray[ndim=2] over which to find quantiles
|
||||
qs : np.ndarray[float64] of quantile indices to find
|
||||
na_value : scalar
|
||||
value to return for empty or all-null values
|
||||
mask : np.ndarray[bool]
|
||||
locations in values that should be considered missing
|
||||
interpolation : str
|
||||
|
||||
Returns
|
||||
-------
|
||||
quantiles : scalar or array
|
||||
"""
|
||||
|
||||
if values.dtype.kind in "mM":
|
||||
# need to cast to integer to avoid rounding errors in numpy
|
||||
result = _nanpercentile(
|
||||
values.view("i8"),
|
||||
qs=qs,
|
||||
na_value=na_value.view("i8"),
|
||||
mask=mask,
|
||||
interpolation=interpolation,
|
||||
)
|
||||
|
||||
# Note: we have to do `astype` and not view because in general we
|
||||
# have float result at this point, not i8
|
||||
return result.astype(values.dtype)
|
||||
|
||||
if mask.any():
|
||||
# Caller is responsible for ensuring mask shape match
|
||||
assert mask.shape == values.shape
|
||||
result = [
|
||||
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
|
||||
for (val, m) in zip(list(values), list(mask))
|
||||
]
|
||||
if values.dtype.kind == "f":
|
||||
# preserve itemsize
|
||||
result = np.asarray(result, dtype=values.dtype).T
|
||||
else:
|
||||
result = np.asarray(result).T
|
||||
if (
|
||||
result.dtype != values.dtype
|
||||
and not mask.all()
|
||||
and (result == result.astype(values.dtype, copy=False)).all()
|
||||
):
|
||||
# mask.all() will never get cast back to int
|
||||
            # e.g. values is integer dtype and result is floating dtype,
|
||||
# only cast back to integer dtype if result values are all-integer.
|
||||
result = result.astype(values.dtype, copy=False)
|
||||
return result
|
||||
else:
|
||||
return np.percentile(
|
||||
values,
|
||||
qs,
|
||||
axis=1,
|
||||
# error: No overload variant of "percentile" matches argument types
|
||||
# "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]",
|
||||
# "int", "Dict[str, str]" [call-overload]
|
||||
method=interpolation, # type: ignore[call-overload]
|
||||
)
|
||||
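A minimal sketch of the quantile helpers above (internal API); NaNs are masked out before the percentile computation:

import numpy as np

from pandas.core.array_algos.quantile import quantile_compat

vals = np.array([1.0, np.nan, 3.0, 4.0])
qs = np.array([0.5, 1.0])
quantile_compat(vals, qs, interpolation="linear")  # -> array([3., 4.])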
155
lib/python3.11/site-packages/pandas/core/array_algos/replace.py
Normal file
@ -0,0 +1,155 @@
|
||||
"""
|
||||
Methods used by Block.replace and related methods.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import operator
|
||||
import re
|
||||
from re import Pattern
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool,
|
||||
is_re,
|
||||
is_re_compilable,
|
||||
)
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
Scalar,
|
||||
npt,
|
||||
)
|
||||
|
||||
|
||||
def should_use_regex(regex: bool, to_replace: Any) -> bool:
|
||||
"""
|
||||
Decide whether to treat `to_replace` as a regular expression.
|
||||
"""
|
||||
if is_re(to_replace):
|
||||
regex = True
|
||||
|
||||
regex = regex and is_re_compilable(to_replace)
|
||||
|
||||
# Don't use regex if the pattern is empty.
|
||||
regex = regex and re.compile(to_replace).pattern != ""
|
||||
return regex
|
||||
|
||||
|
||||
def compare_or_regex_search(
|
||||
a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_]
|
||||
) -> ArrayLike:
|
||||
"""
|
||||
Compare two array-like inputs of the same shape or two scalar values
|
||||
|
||||
Calls operator.eq or re.search, depending on regex argument. If regex is
|
||||
True, perform an element-wise regex matching.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a : array-like
|
||||
b : scalar or regex pattern
|
||||
regex : bool
|
||||
mask : np.ndarray[bool]
|
||||
|
||||
Returns
|
||||
-------
|
||||
mask : array-like of bool
|
||||
"""
|
||||
if isna(b):
|
||||
return ~mask
|
||||
|
||||
def _check_comparison_types(
|
||||
result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern
|
||||
):
|
||||
"""
|
||||
Raises an error if the two arrays (a,b) cannot be compared.
|
||||
Otherwise, returns the comparison result as expected.
|
||||
"""
|
||||
if is_bool(result) and isinstance(a, np.ndarray):
|
||||
type_names = [type(a).__name__, type(b).__name__]
|
||||
|
||||
type_names[0] = f"ndarray(dtype={a.dtype})"
|
||||
|
||||
raise TypeError(
|
||||
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
|
||||
)
|
||||
|
||||
if not regex or not should_use_regex(regex, b):
|
||||
# TODO: should use missing.mask_missing?
|
||||
op = lambda x: operator.eq(x, b)
|
||||
else:
|
||||
op = np.vectorize(
|
||||
lambda x: bool(re.search(b, x))
|
||||
if isinstance(x, str) and isinstance(b, (str, Pattern))
|
||||
else False
|
||||
)
|
||||
|
||||
# GH#32621 use mask to avoid comparing to NAs
|
||||
if isinstance(a, np.ndarray) and mask is not None:
|
||||
a = a[mask]
|
||||
result = op(a)
|
||||
|
||||
if isinstance(result, np.ndarray):
|
||||
            # The shape of the mask can differ from that of the result
            # since we may compare only a subset of a's or b's elements
|
||||
tmp = np.zeros(mask.shape, dtype=np.bool_)
|
||||
np.place(tmp, mask, result)
|
||||
result = tmp
|
||||
else:
|
||||
result = op(a)
|
||||
|
||||
_check_comparison_types(result, a, b)
|
||||
return result
|
||||
|
||||
|
||||
def replace_regex(
|
||||
values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None
|
||||
) -> None:
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
values : ArrayLike
|
||||
Object dtype.
|
||||
rx : re.Pattern
|
||||
value : Any
|
||||
mask : np.ndarray[bool], optional
|
||||
|
||||
Notes
|
||||
-----
|
||||
Alters values in-place.
|
||||
"""
|
||||
|
||||
# deal with replacing values with objects (strings) that match but
|
||||
# whose replacement is not a string (numeric, nan, object)
|
||||
if isna(value) or not isinstance(value, str):
|
||||
|
||||
def re_replacer(s):
|
||||
if is_re(rx) and isinstance(s, str):
|
||||
return value if rx.search(s) is not None else s
|
||||
else:
|
||||
return s
|
||||
|
||||
else:
|
||||
        # value is guaranteed to be a string here; s can be either a string
        # or null. If it's null it gets returned unchanged.
|
||||
def re_replacer(s):
|
||||
if is_re(rx) and isinstance(s, str):
|
||||
return rx.sub(value, s)
|
||||
else:
|
||||
return s
|
||||
|
||||
f = np.vectorize(re_replacer, otypes=[np.object_])
|
||||
|
||||
if mask is None:
|
||||
values[:] = f(values)
|
||||
else:
|
||||
if values.ndim != mask.ndim:
|
||||
mask = np.broadcast_to(mask, values.shape)
|
||||
values[mask] = f(values[mask])
|
||||
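A minimal sketch of replace_regex above (internal API); the replacement happens in place on an object-dtype array and nulls pass through unchanged:

import re

import numpy as np

from pandas.core.array_algos.replace import replace_regex

vals = np.array(["foo", "bar", None], dtype=object)
replace_regex(vals, re.compile(r"^ba"), "qux_", mask=None)
# vals -> ['foo', 'qux_r', None]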
594
lib/python3.11/site-packages/pandas/core/array_algos/take.py
Normal file
@ -0,0 +1,594 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
algos as libalgos,
|
||||
lib,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.cast import maybe_promote
|
||||
from pandas.core.dtypes.common import (
|
||||
ensure_platform_int,
|
||||
is_1d_only_ea_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.missing import na_value_for_dtype
|
||||
|
||||
from pandas.core.construction import ensure_wrapped_if_datetimelike
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
AxisInt,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
|
||||
from pandas.core.arrays.base import ExtensionArray
|
||||
|
||||
|
||||
@overload
|
||||
def take_nd(
|
||||
arr: np.ndarray,
|
||||
indexer,
|
||||
axis: AxisInt = ...,
|
||||
fill_value=...,
|
||||
allow_fill: bool = ...,
|
||||
) -> np.ndarray:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def take_nd(
|
||||
arr: ExtensionArray,
|
||||
indexer,
|
||||
axis: AxisInt = ...,
|
||||
fill_value=...,
|
||||
allow_fill: bool = ...,
|
||||
) -> ArrayLike:
|
||||
...
|
||||
|
||||
|
||||
def take_nd(
|
||||
arr: ArrayLike,
|
||||
indexer,
|
||||
axis: AxisInt = 0,
|
||||
fill_value=lib.no_default,
|
||||
allow_fill: bool = True,
|
||||
) -> ArrayLike:
|
||||
"""
|
||||
Specialized Cython take which sets NaN values in one pass
|
||||
|
||||
This dispatches to ``take`` defined on ExtensionArrays.
|
||||
|
||||
Note: this function assumes that the indexer is a valid(ated) indexer with
|
||||
no out of bound indices.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arr : np.ndarray or ExtensionArray
|
||||
Input array.
|
||||
indexer : ndarray
|
||||
1-D array of indices to take, subarrays corresponding to -1 value
|
||||
        indices are filled with fill_value
|
||||
axis : int, default 0
|
||||
Axis to take from
|
||||
fill_value : any, default np.nan
|
||||
Fill value to replace -1 values with
|
||||
allow_fill : bool, default True
|
||||
If False, indexer is assumed to contain no -1 values so no filling
|
||||
will be done. This short-circuits computation of a mask. Result is
|
||||
undefined if allow_fill == False and -1 is present in indexer.
|
||||
|
||||
Returns
|
||||
-------
|
||||
subarray : np.ndarray or ExtensionArray
|
||||
May be the same type as the input, or cast to an ndarray.
|
||||
"""
|
||||
if fill_value is lib.no_default:
|
||||
fill_value = na_value_for_dtype(arr.dtype, compat=False)
|
||||
elif lib.is_np_dtype(arr.dtype, "mM"):
|
||||
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
|
||||
if arr.dtype != dtype:
|
||||
# EA.take is strict about returning a new object of the same type
|
||||
# so for that case cast upfront
|
||||
arr = arr.astype(dtype)
|
||||
|
||||
if not isinstance(arr, np.ndarray):
|
||||
# i.e. ExtensionArray,
|
||||
# includes for EA to catch DatetimeArray, TimedeltaArray
|
||||
if not is_1d_only_ea_dtype(arr.dtype):
|
||||
# i.e. DatetimeArray, TimedeltaArray
|
||||
arr = cast("NDArrayBackedExtensionArray", arr)
|
||||
return arr.take(
|
||||
indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
|
||||
)
|
||||
|
||||
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
|
||||
|
||||
arr = np.asarray(arr)
|
||||
return _take_nd_ndarray(arr, indexer, axis, fill_value, allow_fill)
|
||||
|
||||
|
||||
def _take_nd_ndarray(
|
||||
arr: np.ndarray,
|
||||
indexer: npt.NDArray[np.intp] | None,
|
||||
axis: AxisInt,
|
||||
fill_value,
|
||||
allow_fill: bool,
|
||||
) -> np.ndarray:
|
||||
if indexer is None:
|
||||
indexer = np.arange(arr.shape[axis], dtype=np.intp)
|
||||
dtype, fill_value = arr.dtype, arr.dtype.type()
|
||||
else:
|
||||
indexer = ensure_platform_int(indexer)
|
||||
|
||||
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
|
||||
arr, indexer, fill_value, allow_fill
|
||||
)
|
||||
|
||||
flip_order = False
|
||||
if arr.ndim == 2 and arr.flags.f_contiguous:
|
||||
flip_order = True
|
||||
|
||||
if flip_order:
|
||||
arr = arr.T
|
||||
axis = arr.ndim - axis - 1
|
||||
|
||||
# at this point, it's guaranteed that dtype can hold both the arr values
|
||||
# and the fill_value
|
||||
out_shape_ = list(arr.shape)
|
||||
out_shape_[axis] = len(indexer)
|
||||
out_shape = tuple(out_shape_)
|
||||
if arr.flags.f_contiguous and axis == arr.ndim - 1:
|
||||
# minor tweak that can make an order-of-magnitude difference
|
||||
# for dataframes initialized directly from 2-d ndarrays
|
||||
# (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its
|
||||
# f-contiguous transpose)
|
||||
out = np.empty(out_shape, dtype=dtype, order="F")
|
||||
else:
|
||||
out = np.empty(out_shape, dtype=dtype)
|
||||
|
||||
func = _get_take_nd_function(
|
||||
arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
|
||||
)
|
||||
func(arr, indexer, out, fill_value)
|
||||
|
||||
if flip_order:
|
||||
out = out.T
|
||||
return out
|
||||
|
||||
|
||||
def take_1d(
|
||||
arr: ArrayLike,
|
||||
indexer: npt.NDArray[np.intp],
|
||||
fill_value=None,
|
||||
allow_fill: bool = True,
|
||||
mask: npt.NDArray[np.bool_] | None = None,
|
||||
) -> ArrayLike:
|
||||
"""
|
||||
Specialized version for 1D arrays. Differences compared to `take_nd`:
|
||||
|
||||
- Assumes input array has already been converted to numpy array / EA
|
||||
- Assumes indexer is already guaranteed to be intp dtype ndarray
|
||||
- Only works for 1D arrays
|
||||
|
||||
To ensure the lowest possible overhead.
|
||||
|
||||
Note: similarly to `take_nd`, this function assumes that the indexer is
|
||||
a valid(ated) indexer with no out of bound indices.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arr : np.ndarray or ExtensionArray
|
||||
Input array.
|
||||
indexer : ndarray
|
||||
1-D array of indices to take (validated indices, intp dtype).
|
||||
fill_value : any, default np.nan
|
||||
Fill value to replace -1 values with
|
||||
allow_fill : bool, default True
|
||||
If False, indexer is assumed to contain no -1 values so no filling
|
||||
will be done. This short-circuits computation of a mask. Result is
|
||||
undefined if allow_fill == False and -1 is present in indexer.
|
||||
mask : np.ndarray, optional, default None
|
||||
If `allow_fill` is True, and the mask (where indexer == -1) is already
|
||||
known, it can be passed to avoid recomputation.
|
||||
"""
|
||||
if not isinstance(arr, np.ndarray):
|
||||
# ExtensionArray -> dispatch to their method
|
||||
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
|
||||
|
||||
if not allow_fill:
|
||||
return arr.take(indexer)
|
||||
|
||||
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
|
||||
arr, indexer, fill_value, True, mask
|
||||
)
|
||||
|
||||
# at this point, it's guaranteed that dtype can hold both the arr values
|
||||
# and the fill_value
|
||||
out = np.empty(indexer.shape, dtype=dtype)
|
||||
|
||||
func = _get_take_nd_function(
|
||||
arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info
|
||||
)
|
||||
func(arr, indexer, out, fill_value)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def take_2d_multi(
|
||||
arr: np.ndarray,
|
||||
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
|
||||
fill_value=np.nan,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Specialized Cython take which sets NaN values in one pass.
|
||||
"""
|
||||
# This is only called from one place in DataFrame._reindex_multi,
|
||||
# so we know indexer is well-behaved.
|
||||
assert indexer is not None
|
||||
assert indexer[0] is not None
|
||||
assert indexer[1] is not None
|
||||
|
||||
row_idx, col_idx = indexer
|
||||
|
||||
row_idx = ensure_platform_int(row_idx)
|
||||
col_idx = ensure_platform_int(col_idx)
|
||||
indexer = row_idx, col_idx
|
||||
mask_info = None
|
||||
|
||||
# check for promotion based on types only (do this first because
|
||||
# it's faster than computing a mask)
|
||||
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
|
||||
if dtype != arr.dtype:
|
||||
# check if promotion is actually required based on indexer
|
||||
row_mask = row_idx == -1
|
||||
col_mask = col_idx == -1
|
||||
row_needs = row_mask.any()
|
||||
col_needs = col_mask.any()
|
||||
mask_info = (row_mask, col_mask), (row_needs, col_needs)
|
||||
|
||||
if not (row_needs or col_needs):
|
||||
# if not, then depromote, set fill_value to dummy
|
||||
# (it won't be used but we don't want the cython code
|
||||
# to crash when trying to cast it to dtype)
|
||||
dtype, fill_value = arr.dtype, arr.dtype.type()
|
||||
|
||||
# at this point, it's guaranteed that dtype can hold both the arr values
|
||||
# and the fill_value
|
||||
out_shape = len(row_idx), len(col_idx)
|
||||
out = np.empty(out_shape, dtype=dtype)
|
||||
|
||||
func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
|
||||
if func is None and arr.dtype != out.dtype:
|
||||
func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
|
||||
if func is not None:
|
||||
func = _convert_wrapper(func, out.dtype)
|
||||
|
||||
if func is not None:
|
||||
func(arr, indexer, out=out, fill_value=fill_value)
|
||||
else:
|
||||
# test_reindex_multi
|
||||
_take_2d_multi_object(
|
||||
arr, indexer, out, fill_value=fill_value, mask_info=mask_info
|
||||
)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
@functools.lru_cache
|
||||
def _get_take_nd_function_cached(
|
||||
ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: AxisInt
|
||||
):
|
||||
"""
|
||||
Part of _get_take_nd_function below that doesn't need `mask_info` and thus
|
||||
can be cached (mask_info potentially contains a numpy ndarray which is not
|
||||
hashable and thus cannot be used as argument for cached function).
|
||||
"""
|
||||
tup = (arr_dtype.name, out_dtype.name)
|
||||
if ndim == 1:
|
||||
func = _take_1d_dict.get(tup, None)
|
||||
elif ndim == 2:
|
||||
if axis == 0:
|
||||
func = _take_2d_axis0_dict.get(tup, None)
|
||||
else:
|
||||
func = _take_2d_axis1_dict.get(tup, None)
|
||||
if func is not None:
|
||||
return func
|
||||
|
||||
# We get here with string, uint, float16, and complex dtypes that could
|
||||
# potentially be handled in algos_take_helper.
|
||||
# Also a couple with (M8[ns], object) and (m8[ns], object)
|
||||
tup = (out_dtype.name, out_dtype.name)
|
||||
if ndim == 1:
|
||||
func = _take_1d_dict.get(tup, None)
|
||||
elif ndim == 2:
|
||||
if axis == 0:
|
||||
func = _take_2d_axis0_dict.get(tup, None)
|
||||
else:
|
||||
func = _take_2d_axis1_dict.get(tup, None)
|
||||
if func is not None:
|
||||
func = _convert_wrapper(func, out_dtype)
|
||||
return func
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _get_take_nd_function(
|
||||
ndim: int,
|
||||
arr_dtype: np.dtype,
|
||||
out_dtype: np.dtype,
|
||||
axis: AxisInt = 0,
|
||||
mask_info=None,
|
||||
):
|
||||
"""
|
||||
Get the appropriate "take" implementation for the given dimension, axis
|
||||
and dtypes.
|
||||
"""
|
||||
func = None
|
||||
if ndim <= 2:
|
||||
# for this part we don't need `mask_info` -> use the cached algo lookup
|
||||
func = _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis)
|
||||
|
||||
if func is None:
|
||||
|
||||
def func(arr, indexer, out, fill_value=np.nan) -> None:
|
||||
indexer = ensure_platform_int(indexer)
|
||||
_take_nd_object(
|
||||
arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
|
||||
)
|
||||
|
||||
return func
|
||||
|
||||
|
||||
def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
|
||||
def wrapper(
|
||||
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
|
||||
) -> None:
|
||||
if arr_dtype is not None:
|
||||
arr = arr.view(arr_dtype)
|
||||
if out_dtype is not None:
|
||||
out = out.view(out_dtype)
|
||||
if fill_wrap is not None:
|
||||
# FIXME: if we get here with dt64/td64 we need to be sure we have
|
||||
# matching resos
|
||||
if fill_value.dtype.kind == "m":
|
||||
fill_value = fill_value.astype("m8[ns]")
|
||||
else:
|
||||
fill_value = fill_value.astype("M8[ns]")
|
||||
fill_value = fill_wrap(fill_value)
|
||||
|
||||
f(arr, indexer, out, fill_value=fill_value)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def _convert_wrapper(f, conv_dtype):
|
||||
def wrapper(
|
||||
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
|
||||
) -> None:
|
||||
if conv_dtype == object:
|
||||
# GH#39755 avoid casting dt64/td64 to integers
|
||||
arr = ensure_wrapped_if_datetimelike(arr)
|
||||
arr = arr.astype(conv_dtype)
|
||||
f(arr, indexer, out, fill_value=fill_value)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
_take_1d_dict = {
|
||||
("int8", "int8"): libalgos.take_1d_int8_int8,
|
||||
("int8", "int32"): libalgos.take_1d_int8_int32,
|
||||
("int8", "int64"): libalgos.take_1d_int8_int64,
|
||||
("int8", "float64"): libalgos.take_1d_int8_float64,
|
||||
("int16", "int16"): libalgos.take_1d_int16_int16,
|
||||
("int16", "int32"): libalgos.take_1d_int16_int32,
|
||||
("int16", "int64"): libalgos.take_1d_int16_int64,
|
||||
("int16", "float64"): libalgos.take_1d_int16_float64,
|
||||
("int32", "int32"): libalgos.take_1d_int32_int32,
|
||||
("int32", "int64"): libalgos.take_1d_int32_int64,
|
||||
("int32", "float64"): libalgos.take_1d_int32_float64,
|
||||
("int64", "int64"): libalgos.take_1d_int64_int64,
|
||||
("int64", "float64"): libalgos.take_1d_int64_float64,
|
||||
("float32", "float32"): libalgos.take_1d_float32_float32,
|
||||
("float32", "float64"): libalgos.take_1d_float32_float64,
|
||||
("float64", "float64"): libalgos.take_1d_float64_float64,
|
||||
("object", "object"): libalgos.take_1d_object_object,
|
||||
("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8),
|
||||
("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None),
|
||||
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
|
||||
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
|
||||
),
|
||||
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
|
||||
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
|
||||
),
|
||||
}
|
||||
|
||||
_take_2d_axis0_dict = {
|
||||
("int8", "int8"): libalgos.take_2d_axis0_int8_int8,
|
||||
("int8", "int32"): libalgos.take_2d_axis0_int8_int32,
|
||||
("int8", "int64"): libalgos.take_2d_axis0_int8_int64,
|
||||
("int8", "float64"): libalgos.take_2d_axis0_int8_float64,
|
||||
("int16", "int16"): libalgos.take_2d_axis0_int16_int16,
|
||||
("int16", "int32"): libalgos.take_2d_axis0_int16_int32,
|
||||
("int16", "int64"): libalgos.take_2d_axis0_int16_int64,
|
||||
("int16", "float64"): libalgos.take_2d_axis0_int16_float64,
|
||||
("int32", "int32"): libalgos.take_2d_axis0_int32_int32,
|
||||
("int32", "int64"): libalgos.take_2d_axis0_int32_int64,
|
||||
("int32", "float64"): libalgos.take_2d_axis0_int32_float64,
|
||||
("int64", "int64"): libalgos.take_2d_axis0_int64_int64,
|
||||
("int64", "float64"): libalgos.take_2d_axis0_int64_float64,
|
||||
("float32", "float32"): libalgos.take_2d_axis0_float32_float32,
|
||||
("float32", "float64"): libalgos.take_2d_axis0_float32_float64,
|
||||
("float64", "float64"): libalgos.take_2d_axis0_float64_float64,
|
||||
("object", "object"): libalgos.take_2d_axis0_object_object,
|
||||
("bool", "bool"): _view_wrapper(
|
||||
libalgos.take_2d_axis0_bool_bool, np.uint8, np.uint8
|
||||
),
|
||||
("bool", "object"): _view_wrapper(
|
||||
libalgos.take_2d_axis0_bool_object, np.uint8, None
|
||||
),
|
||||
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
}
|
||||
|
||||
_take_2d_axis1_dict = {
|
||||
("int8", "int8"): libalgos.take_2d_axis1_int8_int8,
|
||||
("int8", "int32"): libalgos.take_2d_axis1_int8_int32,
|
||||
("int8", "int64"): libalgos.take_2d_axis1_int8_int64,
|
||||
("int8", "float64"): libalgos.take_2d_axis1_int8_float64,
|
||||
("int16", "int16"): libalgos.take_2d_axis1_int16_int16,
|
||||
("int16", "int32"): libalgos.take_2d_axis1_int16_int32,
|
||||
("int16", "int64"): libalgos.take_2d_axis1_int16_int64,
|
||||
("int16", "float64"): libalgos.take_2d_axis1_int16_float64,
|
||||
("int32", "int32"): libalgos.take_2d_axis1_int32_int32,
|
||||
("int32", "int64"): libalgos.take_2d_axis1_int32_int64,
|
||||
("int32", "float64"): libalgos.take_2d_axis1_int32_float64,
|
||||
("int64", "int64"): libalgos.take_2d_axis1_int64_int64,
|
||||
("int64", "float64"): libalgos.take_2d_axis1_int64_float64,
|
||||
("float32", "float32"): libalgos.take_2d_axis1_float32_float32,
|
||||
("float32", "float64"): libalgos.take_2d_axis1_float32_float64,
|
||||
("float64", "float64"): libalgos.take_2d_axis1_float64_float64,
|
||||
("object", "object"): libalgos.take_2d_axis1_object_object,
|
||||
("bool", "bool"): _view_wrapper(
|
||||
libalgos.take_2d_axis1_bool_bool, np.uint8, np.uint8
|
||||
),
|
||||
("bool", "object"): _view_wrapper(
|
||||
libalgos.take_2d_axis1_bool_object, np.uint8, None
|
||||
),
|
||||
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
}
|
||||
|
||||
_take_2d_multi_dict = {
|
||||
("int8", "int8"): libalgos.take_2d_multi_int8_int8,
|
||||
("int8", "int32"): libalgos.take_2d_multi_int8_int32,
|
||||
("int8", "int64"): libalgos.take_2d_multi_int8_int64,
|
||||
("int8", "float64"): libalgos.take_2d_multi_int8_float64,
|
||||
("int16", "int16"): libalgos.take_2d_multi_int16_int16,
|
||||
("int16", "int32"): libalgos.take_2d_multi_int16_int32,
|
||||
("int16", "int64"): libalgos.take_2d_multi_int16_int64,
|
||||
("int16", "float64"): libalgos.take_2d_multi_int16_float64,
|
||||
("int32", "int32"): libalgos.take_2d_multi_int32_int32,
|
||||
("int32", "int64"): libalgos.take_2d_multi_int32_int64,
|
||||
("int32", "float64"): libalgos.take_2d_multi_int32_float64,
|
||||
("int64", "int64"): libalgos.take_2d_multi_int64_int64,
|
||||
("int64", "float64"): libalgos.take_2d_multi_int64_float64,
|
||||
("float32", "float32"): libalgos.take_2d_multi_float32_float32,
|
||||
("float32", "float64"): libalgos.take_2d_multi_float32_float64,
|
||||
("float64", "float64"): libalgos.take_2d_multi_float64_float64,
|
||||
("object", "object"): libalgos.take_2d_multi_object_object,
|
||||
("bool", "bool"): _view_wrapper(
|
||||
libalgos.take_2d_multi_bool_bool, np.uint8, np.uint8
|
||||
),
|
||||
("bool", "object"): _view_wrapper(
|
||||
libalgos.take_2d_multi_bool_object, np.uint8, None
|
||||
),
|
||||
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _take_nd_object(
|
||||
arr: np.ndarray,
|
||||
indexer: npt.NDArray[np.intp],
|
||||
out: np.ndarray,
|
||||
axis: AxisInt,
|
||||
fill_value,
|
||||
mask_info,
|
||||
) -> None:
|
||||
if mask_info is not None:
|
||||
mask, needs_masking = mask_info
|
||||
else:
|
||||
mask = indexer == -1
|
||||
needs_masking = mask.any()
|
||||
if arr.dtype != out.dtype:
|
||||
arr = arr.astype(out.dtype)
|
||||
if arr.shape[axis] > 0:
|
||||
arr.take(indexer, axis=axis, out=out)
|
||||
if needs_masking:
|
||||
outindexer = [slice(None)] * arr.ndim
|
||||
outindexer[axis] = mask
|
||||
out[tuple(outindexer)] = fill_value
|
||||
|
||||
|
||||
def _take_2d_multi_object(
|
||||
arr: np.ndarray,
|
||||
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
|
||||
out: np.ndarray,
|
||||
fill_value,
|
||||
mask_info,
|
||||
) -> None:
|
||||
# this is not ideal, performance-wise, but it's better than raising
|
||||
# an exception (best to optimize in Cython to avoid getting here)
|
||||
row_idx, col_idx = indexer # both np.intp
|
||||
if mask_info is not None:
|
||||
(row_mask, col_mask), (row_needs, col_needs) = mask_info
|
||||
else:
|
||||
row_mask = row_idx == -1
|
||||
col_mask = col_idx == -1
|
||||
row_needs = row_mask.any()
|
||||
col_needs = col_mask.any()
|
||||
if fill_value is not None:
|
||||
if row_needs:
|
||||
out[row_mask, :] = fill_value
|
||||
if col_needs:
|
||||
out[:, col_mask] = fill_value
|
||||
for i, u_ in enumerate(row_idx):
|
||||
if u_ != -1:
|
||||
for j, v in enumerate(col_idx):
|
||||
if v != -1:
|
||||
out[i, j] = arr[u_, v]
|
||||
|
||||
|
||||
def _take_preprocess_indexer_and_fill_value(
|
||||
arr: np.ndarray,
|
||||
indexer: npt.NDArray[np.intp],
|
||||
fill_value,
|
||||
allow_fill: bool,
|
||||
mask: npt.NDArray[np.bool_] | None = None,
|
||||
):
|
||||
mask_info: tuple[np.ndarray | None, bool] | None = None
|
||||
|
||||
if not allow_fill:
|
||||
dtype, fill_value = arr.dtype, arr.dtype.type()
|
||||
mask_info = None, False
|
||||
else:
|
||||
# check for promotion based on types only (do this first because
|
||||
# it's faster than computing a mask)
|
||||
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
|
||||
if dtype != arr.dtype:
|
||||
# check if promotion is actually required based on indexer
|
||||
if mask is not None:
|
||||
needs_masking = True
|
||||
else:
|
||||
mask = indexer == -1
|
||||
needs_masking = bool(mask.any())
|
||||
mask_info = mask, needs_masking
|
||||
if not needs_masking:
|
||||
# if not, then depromote, set fill_value to dummy
|
||||
# (it won't be used but we don't want the cython code
|
||||
# to crash when trying to cast it to dtype)
|
||||
dtype, fill_value = arr.dtype, arr.dtype.type()
|
||||
|
||||
return dtype, fill_value, mask_info
|
||||
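
# Editor's note: a minimal sketch (not part of the original file) of why the
# promotion check above is done lazily. `maybe_promote` proposes a widened
# dtype whenever the fill value would not fit, but the widened dtype is only
# kept when the indexer actually contains -1 (i.e. a fill is needed):
# >>> arr = np.array([1, 2, 3], dtype=np.int64)
# >>> maybe_promote(arr.dtype, np.nan)  # promotion candidate
# (dtype('float64'), nan)
# >>> _take_preprocess_indexer_and_fill_value(
# ...     arr, np.array([0, 2], dtype=np.intp), np.nan, allow_fill=True
# ... )[0]  # no -1 in the indexer, so we depromote back to the original dtype
# dtype('int64')
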
@ -0,0 +1,50 @@
"""
transforms.py is for shape-preserving functions.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
    from pandas._typing import (
        AxisInt,
        Scalar,
    )


def shift(
    values: np.ndarray, periods: int, axis: AxisInt, fill_value: Scalar
) -> np.ndarray:
    new_values = values

    if periods == 0 or values.size == 0:
        return new_values.copy()

    # make sure array sent to np.roll is c_contiguous
    f_ordered = values.flags.f_contiguous
    if f_ordered:
        new_values = new_values.T
        axis = new_values.ndim - axis - 1

    if new_values.size:
        new_values = np.roll(
            new_values,
            np.intp(periods),
            axis=axis,
        )

    axis_indexer = [slice(None)] * values.ndim
    if periods > 0:
        axis_indexer[axis] = slice(None, periods)
    else:
        axis_indexer[axis] = slice(periods, None)
    new_values[tuple(axis_indexer)] = fill_value

    # restore original order
    if f_ordered:
        new_values = new_values.T

    return new_values
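
# Editor's note: a small usage sketch (not part of the original file).
# `shift` rolls the array along `axis` and then overwrites the slots that
# wrapped around with `fill_value`, preserving the input's shape:
# >>> shift(np.array([1, 2, 3, 4]), periods=1, axis=0, fill_value=-1)
# array([-1,  1,  2,  3])
# >>> shift(np.array([1, 2, 3, 4]), periods=-1, axis=0, fill_value=-1)
# array([ 2,  3,  4, -1])
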
530
lib/python3.11/site-packages/pandas/core/arraylike.py
Normal file
@ -0,0 +1,530 @@
"""
Methods that can be shared by many array-like classes or subclasses:
    Series
    Index
    ExtensionArray
"""
from __future__ import annotations

import operator
from typing import Any

import numpy as np

from pandas._libs import lib
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op

from pandas.core.dtypes.generic import ABCNDFrame

from pandas.core import roperator
from pandas.core.construction import extract_array
from pandas.core.ops.common import unpack_zerodim_and_defer

REDUCTION_ALIASES = {
    "maximum": "max",
    "minimum": "min",
    "add": "sum",
    "multiply": "prod",
}


class OpsMixin:
    # -------------------------------------------------------------
    # Comparisons

    def _cmp_method(self, other, op):
        return NotImplemented

    @unpack_zerodim_and_defer("__eq__")
    def __eq__(self, other):
        return self._cmp_method(other, operator.eq)

    @unpack_zerodim_and_defer("__ne__")
    def __ne__(self, other):
        return self._cmp_method(other, operator.ne)

    @unpack_zerodim_and_defer("__lt__")
    def __lt__(self, other):
        return self._cmp_method(other, operator.lt)

    @unpack_zerodim_and_defer("__le__")
    def __le__(self, other):
        return self._cmp_method(other, operator.le)

    @unpack_zerodim_and_defer("__gt__")
    def __gt__(self, other):
        return self._cmp_method(other, operator.gt)

    @unpack_zerodim_and_defer("__ge__")
    def __ge__(self, other):
        return self._cmp_method(other, operator.ge)

    # -------------------------------------------------------------
    # Logical Methods

    def _logical_method(self, other, op):
        return NotImplemented

    @unpack_zerodim_and_defer("__and__")
    def __and__(self, other):
        return self._logical_method(other, operator.and_)

    @unpack_zerodim_and_defer("__rand__")
    def __rand__(self, other):
        return self._logical_method(other, roperator.rand_)

    @unpack_zerodim_and_defer("__or__")
    def __or__(self, other):
        return self._logical_method(other, operator.or_)

    @unpack_zerodim_and_defer("__ror__")
    def __ror__(self, other):
        return self._logical_method(other, roperator.ror_)

    @unpack_zerodim_and_defer("__xor__")
    def __xor__(self, other):
        return self._logical_method(other, operator.xor)

    @unpack_zerodim_and_defer("__rxor__")
    def __rxor__(self, other):
        return self._logical_method(other, roperator.rxor)

    # -------------------------------------------------------------
    # Arithmetic Methods

    def _arith_method(self, other, op):
        return NotImplemented

    @unpack_zerodim_and_defer("__add__")
    def __add__(self, other):
        """
        Get Addition of DataFrame and other, column-wise.

        Equivalent to ``DataFrame.add(other)``.

        Parameters
        ----------
        other : scalar, sequence, Series, dict or DataFrame
            Object to be added to the DataFrame.

        Returns
        -------
        DataFrame
            The result of adding ``other`` to DataFrame.

        See Also
        --------
        DataFrame.add : Add a DataFrame and another object, with option for index-
            or column-oriented addition.

        Examples
        --------
        >>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]},
        ...                   index=['elk', 'moose'])
        >>> df
               height  weight
        elk       1.5     500
        moose     2.6     800

        Adding a scalar affects all rows and columns.

        >>> df[['height', 'weight']] + 1.5
               height  weight
        elk       3.0   501.5
        moose     4.1   801.5

        Each element of a list is added to a column of the DataFrame, in order.

        >>> df[['height', 'weight']] + [0.5, 1.5]
               height  weight
        elk       2.0   501.5
        moose     3.1   801.5

        Keys of a dictionary are aligned to the DataFrame, based on column names;
        each value in the dictionary is added to the corresponding column.

        >>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5}
               height  weight
        elk       2.0   501.5
        moose     3.1   801.5

        When `other` is a :class:`Series`, the index of `other` is aligned with the
        columns of the DataFrame.

        >>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height'])
        >>> df[['height', 'weight']] + s1
               height  weight
        elk       3.0   500.5
        moose     4.1   800.5

        Even when the index of `other` is the same as the index of the DataFrame,
        the :class:`Series` will not be reoriented. If index-wise alignment is desired,
        :meth:`DataFrame.add` should be used with `axis='index'`.

        >>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose'])
        >>> df[['height', 'weight']] + s2
               elk  height  moose  weight
        elk    NaN     NaN    NaN     NaN
        moose  NaN     NaN    NaN     NaN

        >>> df[['height', 'weight']].add(s2, axis='index')
               height  weight
        elk       2.0   500.5
        moose     4.1   801.5

        When `other` is a :class:`DataFrame`, both column names and the
        index are aligned.

        >>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]},
        ...                      index=['elk', 'moose', 'deer'])
        >>> df[['height', 'weight']] + other
               height  weight
        deer      NaN     NaN
        elk       1.7     NaN
        moose     3.0     NaN
        """
        return self._arith_method(other, operator.add)

    @unpack_zerodim_and_defer("__radd__")
    def __radd__(self, other):
        return self._arith_method(other, roperator.radd)

    @unpack_zerodim_and_defer("__sub__")
    def __sub__(self, other):
        return self._arith_method(other, operator.sub)

    @unpack_zerodim_and_defer("__rsub__")
    def __rsub__(self, other):
        return self._arith_method(other, roperator.rsub)

    @unpack_zerodim_and_defer("__mul__")
    def __mul__(self, other):
        return self._arith_method(other, operator.mul)

    @unpack_zerodim_and_defer("__rmul__")
    def __rmul__(self, other):
        return self._arith_method(other, roperator.rmul)

    @unpack_zerodim_and_defer("__truediv__")
    def __truediv__(self, other):
        return self._arith_method(other, operator.truediv)

    @unpack_zerodim_and_defer("__rtruediv__")
    def __rtruediv__(self, other):
        return self._arith_method(other, roperator.rtruediv)

    @unpack_zerodim_and_defer("__floordiv__")
    def __floordiv__(self, other):
        return self._arith_method(other, operator.floordiv)

    @unpack_zerodim_and_defer("__rfloordiv__")
    def __rfloordiv__(self, other):
        return self._arith_method(other, roperator.rfloordiv)

    @unpack_zerodim_and_defer("__mod__")
    def __mod__(self, other):
        return self._arith_method(other, operator.mod)

    @unpack_zerodim_and_defer("__rmod__")
    def __rmod__(self, other):
        return self._arith_method(other, roperator.rmod)

    @unpack_zerodim_and_defer("__divmod__")
    def __divmod__(self, other):
        return self._arith_method(other, divmod)

    @unpack_zerodim_and_defer("__rdivmod__")
    def __rdivmod__(self, other):
        return self._arith_method(other, roperator.rdivmod)

    @unpack_zerodim_and_defer("__pow__")
    def __pow__(self, other):
        return self._arith_method(other, operator.pow)

    @unpack_zerodim_and_defer("__rpow__")
    def __rpow__(self, other):
        return self._arith_method(other, roperator.rpow)


# -----------------------------------------------------------------------------
# Helpers to implement __array_ufunc__


def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any):
    """
    Compatibility with numpy ufuncs.

    See also
    --------
    numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__
    """
    from pandas.core.frame import (
        DataFrame,
        Series,
    )
    from pandas.core.generic import NDFrame
    from pandas.core.internals import (
        ArrayManager,
        BlockManager,
    )

    cls = type(self)

    kwargs = _standardize_out_kwarg(**kwargs)

    # for binary ops, use our custom dunder methods
    result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs)
    if result is not NotImplemented:
        return result

    # Determine if we should defer.
    no_defer = (
        np.ndarray.__array_ufunc__,
        cls.__array_ufunc__,
    )

    for item in inputs:
        higher_priority = (
            hasattr(item, "__array_priority__")
            and item.__array_priority__ > self.__array_priority__
        )
        has_array_ufunc = (
            hasattr(item, "__array_ufunc__")
            and type(item).__array_ufunc__ not in no_defer
            and not isinstance(item, self._HANDLED_TYPES)
        )
        if higher_priority or has_array_ufunc:
            return NotImplemented

    # align all the inputs.
    types = tuple(type(x) for x in inputs)
    alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]

    if len(alignable) > 1:
        # This triggers alignment.
        # At the moment, there aren't any ufuncs with more than two inputs
        # so this ends up just being x1.index | x2.index, but we write
        # it to handle *args.
        set_types = set(types)
        if len(set_types) > 1 and {DataFrame, Series}.issubset(set_types):
            # We currently don't handle ufunc(DataFrame, Series)
            # well. Previously this raised an internal ValueError. We might
            # support it someday, so raise a NotImplementedError.
            raise NotImplementedError(
                f"Cannot apply ufunc {ufunc} to mixed DataFrame and Series inputs."
            )
        axes = self.axes
        for obj in alignable[1:]:
            # this relies on the fact that we aren't handling mixed
            # series / frame ufuncs.
            for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
                axes[i] = ax1.union(ax2)

        reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
        inputs = tuple(
            x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
            for x, t in zip(inputs, types)
        )
    else:
        reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))

    if self.ndim == 1:
        names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
        name = names[0] if len(set(names)) == 1 else None
        reconstruct_kwargs = {"name": name}
    else:
        reconstruct_kwargs = {}

    def reconstruct(result):
        if ufunc.nout > 1:
            # np.modf, np.frexp, np.divmod
            return tuple(_reconstruct(x) for x in result)

        return _reconstruct(result)

    def _reconstruct(result):
        if lib.is_scalar(result):
            return result

        if result.ndim != self.ndim:
            if method == "outer":
                raise NotImplementedError
            return result
        if isinstance(result, (BlockManager, ArrayManager)):
            # we went through BlockManager.apply e.g. np.sqrt
            result = self._constructor_from_mgr(result, axes=result.axes)
        else:
            # we converted an array, lost our axes
            result = self._constructor(
                result, **reconstruct_axes, **reconstruct_kwargs, copy=False
            )
        # TODO: When we support multiple values in __finalize__, this
        # should pass alignable to `__finalize__` instead of self.
        # Then `np.add(a, b)` would consider attrs from both a and b
        # when a and b are NDFrames.
        if len(alignable) == 1:
            result = result.__finalize__(self)
        return result

    if "out" in kwargs:
        # e.g. test_multiindex_get_loc
        result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs)
        return reconstruct(result)

    if method == "reduce":
        # e.g. test.series.test_ufunc.test_reduce
        result = dispatch_reduction_ufunc(self, ufunc, method, *inputs, **kwargs)
        if result is not NotImplemented:
            return result

    # We still get here with kwargs `axis` for e.g. np.maximum.accumulate
    # and `dtype` and `keepdims` for np.ptp

    if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1):
        # Just give up on preserving types in the complex case.
        # In theory we could preserve them in these cases:
        #  * nout>1 is doable if BlockManager.apply took nout and
        #    returned a Tuple[BlockManager].
        #  * len(inputs) > 1 is doable when we know that we have
        #    aligned blocks / dtypes.

        # e.g. my_ufunc, modf, logaddexp, heaviside, subtract, add
        inputs = tuple(np.asarray(x) for x in inputs)
        # Note: we can't use default_array_ufunc here bc reindexing means
        # that `self` may not be among `inputs`
        result = getattr(ufunc, method)(*inputs, **kwargs)
    elif self.ndim == 1:
        # ufunc(series, ...)
        inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
        result = getattr(ufunc, method)(*inputs, **kwargs)
    else:
        # ufunc(dataframe)
        if method == "__call__" and not kwargs:
            # for np.<ufunc>(..) calls
            # kwargs cannot necessarily be handled block-by-block, so only
            # take this path if there are no kwargs
            mgr = inputs[0]._mgr
            result = mgr.apply(getattr(ufunc, method))
        else:
            # otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..))
            # Those can have an axis keyword and thus can't be called block-by-block
            result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)
            # e.g. np.negative (only one reached), with "where" and "out" in kwargs

    result = reconstruct(result)
    return result


def _standardize_out_kwarg(**kwargs) -> dict:
    """
    If kwargs contain "out1" and "out2", replace that with a tuple "out"

    np.divmod, np.modf, np.frexp can have either `out=(out1, out2)` or
    `out1=out1, out2=out2`
    """
    if "out" not in kwargs and "out1" in kwargs and "out2" in kwargs:
        out1 = kwargs.pop("out1")
        out2 = kwargs.pop("out2")
        out = (out1, out2)
        kwargs["out"] = out
    return kwargs
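
# Editor's note: a small sketch (not part of the original file) of the
# normalization above; separate `out1`/`out2` keywords collapse into a
# single `out` tuple before dispatch:
# >>> a, b = np.empty(2), np.empty(2)
# >>> kwargs = _standardize_out_kwarg(out1=a, out2=b)
# >>> list(kwargs)
# ['out']
# >>> kwargs["out"][0] is a and kwargs["out"][1] is b
# True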


def dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    If we have an `out` keyword, then call the ufunc without `out` and then
    set the result into the given `out`.
    """

    # Note: we assume _standardize_out_kwarg has already been called.
    out = kwargs.pop("out")
    where = kwargs.pop("where", None)

    result = getattr(ufunc, method)(*inputs, **kwargs)

    if result is NotImplemented:
        return NotImplemented

    if isinstance(result, tuple):
        # i.e. np.divmod, np.modf, np.frexp
        if not isinstance(out, tuple) or len(out) != len(result):
            raise NotImplementedError

        for arr, res in zip(out, result):
            _assign_where(arr, res, where)

        return out

    if isinstance(out, tuple):
        if len(out) == 1:
            out = out[0]
        else:
            raise NotImplementedError

    _assign_where(out, result, where)
    return out


def _assign_where(out, result, where) -> None:
    """
    Set a ufunc result into 'out', masking with a 'where' argument if necessary.
    """
    if where is None:
        # no 'where' arg passed to ufunc
        out[:] = result
    else:
        np.putmask(out, where, result)
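
# Editor's note: a tiny sketch (not part of the original file) of the masked
# assignment performed by _assign_where; only positions where `where` is True
# receive the ufunc result:
# >>> out = np.zeros(3)
# >>> _assign_where(out, np.array([1.0, 2.0, 3.0]), np.array([True, False, True]))
# >>> out
# array([1., 0., 3.])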


def default_array_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    Fallback to the behavior we would get if we did not define __array_ufunc__.

    Notes
    -----
    We are assuming that `self` is among `inputs`.
    """
    if not any(x is self for x in inputs):
        raise NotImplementedError

    new_inputs = [x if x is not self else np.asarray(x) for x in inputs]

    return getattr(ufunc, method)(*new_inputs, **kwargs)


def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    Dispatch ufunc reductions to self's reduction methods.
    """
    assert method == "reduce"

    if len(inputs) != 1 or inputs[0] is not self:
        return NotImplemented

    if ufunc.__name__ not in REDUCTION_ALIASES:
        return NotImplemented

    method_name = REDUCTION_ALIASES[ufunc.__name__]

    # NB: we are assuming that min/max represent minimum/maximum methods,
    # which would not be accurate for e.g. Timestamp.min
    if not hasattr(self, method_name):
        return NotImplemented

    if self.ndim > 1:
        if isinstance(self, ABCNDFrame):
            # TODO: test cases where this doesn't hold, i.e. 2D DTA/TDA
            kwargs["numeric_only"] = False

        if "axis" not in kwargs:
            # For DataFrame reductions we don't want the default axis=0
            # Note: np.min is not a ufunc, but uses array_function_dispatch,
            # so calls DataFrame.min (without ever getting here) with the np.min
            # default of axis=None, which DataFrame.min catches and changes to axis=0.
            # np.minimum.reduce(df) gets here bc axis is not in kwargs,
            # so we set axis=0 to match the behavior of np.minimum.reduce(df.values)
            kwargs["axis"] = 0

    # By default, numpy's reductions do not skip NaNs, so we have to
    # pass skipna=False
    return getattr(self, method_name)(skipna=False, **kwargs)
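
# Editor's note: a short sketch (not part of the original file) of the
# reduction dispatch above, assuming pandas is imported as `pd`;
# np.add.reduce on a Series routes to Series.sum(skipna=False), so NaNs
# propagate exactly as they would through numpy:
# >>> np.add.reduce(pd.Series([1.0, 2.0]))
# 3.0
# >>> np.add.reduce(pd.Series([1.0, float("nan")]))
# nan
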
43
lib/python3.11/site-packages/pandas/core/arrays/__init__.py
Normal file
@ -0,0 +1,43 @@
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionOpsMixin,
    ExtensionScalarOpsMixin,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.datetimes import DatetimeArray
from pandas.core.arrays.floating import FloatingArray
from pandas.core.arrays.integer import IntegerArray
from pandas.core.arrays.interval import IntervalArray
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.arrays.period import (
    PeriodArray,
    period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray

__all__ = [
    "ArrowExtensionArray",
    "ExtensionArray",
    "ExtensionOpsMixin",
    "ExtensionScalarOpsMixin",
    "ArrowStringArray",
    "BaseMaskedArray",
    "BooleanArray",
    "Categorical",
    "DatetimeArray",
    "FloatingArray",
    "IntegerArray",
    "IntervalArray",
    "NumpyExtensionArray",
    "PeriodArray",
    "period_array",
    "SparseArray",
    "StringArray",
    "TimedeltaArray",
]
@ -0,0 +1,362 @@
from __future__ import annotations

from functools import partial
import re
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
)

import numpy as np

from pandas._libs import lib
from pandas.compat import (
    pa_version_under10p1,
    pa_version_under11p0,
    pa_version_under13p0,
    pa_version_under17p0,
)

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc

if TYPE_CHECKING:
    from collections.abc import Callable

    from pandas._typing import (
        Scalar,
        Self,
    )


class ArrowStringArrayMixin:
    _pa_array: pa.ChunkedArray

    def __init__(self, *args, **kwargs) -> None:
        raise NotImplementedError

    def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
        # Convert a bool-dtype result to the appropriate result type
        raise NotImplementedError

    def _convert_int_result(self, result):
        # Convert an integer-dtype result to the appropriate result type
        raise NotImplementedError

    def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
        raise NotImplementedError

    def _str_len(self):
        result = pc.utf8_length(self._pa_array)
        return self._convert_int_result(result)

    def _str_lower(self) -> Self:
        return type(self)(pc.utf8_lower(self._pa_array))

    def _str_upper(self) -> Self:
        return type(self)(pc.utf8_upper(self._pa_array))

    def _str_strip(self, to_strip=None) -> Self:
        if to_strip is None:
            result = pc.utf8_trim_whitespace(self._pa_array)
        else:
            result = pc.utf8_trim(self._pa_array, characters=to_strip)
        return type(self)(result)

    def _str_lstrip(self, to_strip=None) -> Self:
        if to_strip is None:
            result = pc.utf8_ltrim_whitespace(self._pa_array)
        else:
            result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
        return type(self)(result)

    def _str_rstrip(self, to_strip=None) -> Self:
        if to_strip is None:
            result = pc.utf8_rtrim_whitespace(self._pa_array)
        else:
            result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
        return type(self)(result)

    def _str_pad(
        self,
        width: int,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        if side == "left":
            pa_pad = pc.utf8_lpad
        elif side == "right":
            pa_pad = pc.utf8_rpad
        elif side == "both":
            if pa_version_under17p0:
                # GH#59624 fall back to object dtype
                from pandas import array as pd_array

                obj_arr = self.astype(object, copy=False)  # type: ignore[attr-defined]
                obj = pd_array(obj_arr, dtype=object)
                result = obj._str_pad(width, side, fillchar)  # type: ignore[attr-defined]
                return type(self)._from_sequence(result, dtype=self.dtype)  # type: ignore[attr-defined]
            else:
                # GH#54792
                # https://github.com/apache/arrow/issues/15053#issuecomment-2317032347
                lean_left = (width % 2) == 0
                pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left)
        else:
            raise ValueError(
                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
            )
        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
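
    # Editor's note: a short illustration (not part of the original file) of
    # the parity workaround above. Python's str.center puts the extra pad
    # character on the right when the padding is uneven, whereas Arrow's
    # utf8_center reportedly leans the other way, so `lean_left_on_odd_padding`
    # is chosen from the requested width to match Python's behavior:
    # >>> "ab".center(5, "*")  # odd width, extra pad character goes right
    # '*ab**'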

    def _str_get(self, i: int):
        lengths = pc.utf8_length(self._pa_array)
        if i >= 0:
            out_of_bounds = pc.greater_equal(i, lengths)
            start = i
            stop = i + 1
            step = 1
        else:
            out_of_bounds = pc.greater(-i, lengths)
            start = i
            stop = i - 1
            step = -1
        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
        selected = pc.utf8_slice_codeunits(
            self._pa_array, start=start, stop=stop, step=step
        )
        null_value = pa.scalar(None, type=self._pa_array.type)
        result = pc.if_else(not_out_of_bounds, selected, null_value)
        return type(self)(result)

    def _str_slice(
        self, start: int | None = None, stop: int | None = None, step: int | None = None
    ):
        if pa_version_under11p0:
            # GH#59724
            result = self._apply_elementwise(lambda val: val[start:stop:step])
            return type(self)(pa.chunked_array(result, type=self._pa_array.type))
        if start is None:
            if step is not None and step < 0:
                # GH#59710
                start = -1
            else:
                start = 0
        if step is None:
            step = 1
        return type(self)(
            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
        )

    def _str_slice_replace(
        self, start: int | None = None, stop: int | None = None, repl: str | None = None
    ):
        if repl is None:
            repl = ""
        if start is None:
            start = 0
        if stop is None:
            stop = np.iinfo(np.int64).max
        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ) -> Self:
        if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
            raise NotImplementedError(
                "replace is not supported with a re.Pattern, callable repl, "
                "case=False, or flags!=0"
            )

        func = pc.replace_substring_regex if regex else pc.replace_substring
        # https://github.com/apache/arrow/issues/39149
        # GH 56404, unexpected behavior with negative max_replacements with pyarrow.
        pa_max_replacements = None if n < 0 else n
        result = func(
            self._pa_array,
            pattern=pat,
            replacement=repl,
            max_replacements=pa_max_replacements,
        )
        return type(self)(result)

    def _str_capitalize(self) -> Self:
        return type(self)(pc.utf8_capitalize(self._pa_array))

    def _str_title(self):
        return type(self)(pc.utf8_title(self._pa_array))

    def _str_swapcase(self):
        return type(self)(pc.utf8_swapcase(self._pa_array))

    def _str_removeprefix(self, prefix: str):
        if not pa_version_under13p0:
            starts_with = pc.starts_with(self._pa_array, pattern=prefix)
            removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
            result = pc.if_else(starts_with, removed, self._pa_array)
            return type(self)(result)
        predicate = lambda val: val.removeprefix(prefix)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_removesuffix(self, suffix: str):
        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
        result = pc.if_else(ends_with, removed, self._pa_array)
        return type(self)(result)

    def _str_startswith(
        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
    ):
        if isinstance(pat, str):
            result = pc.starts_with(self._pa_array, pattern=pat)
        else:
            if len(pat) == 0:
                # For empty tuple we return null for missing values and False
                # for valid values.
                result = pc.if_else(pc.is_null(self._pa_array), None, False)
            else:
                result = pc.starts_with(self._pa_array, pattern=pat[0])

                for p in pat[1:]:
                    result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
        return self._convert_bool_result(result, na=na, method_name="startswith")

    def _str_endswith(
        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
    ):
        if isinstance(pat, str):
            result = pc.ends_with(self._pa_array, pattern=pat)
        else:
            if len(pat) == 0:
                # For empty tuple we return null for missing values and False
                # for valid values.
                result = pc.if_else(pc.is_null(self._pa_array), None, False)
            else:
                result = pc.ends_with(self._pa_array, pattern=pat[0])

                for p in pat[1:]:
                    result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
        return self._convert_bool_result(result, na=na, method_name="endswith")

    def _str_isalnum(self):
        result = pc.utf8_is_alnum(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isalpha(self):
        result = pc.utf8_is_alpha(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isdecimal(self):
        result = pc.utf8_is_decimal(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isdigit(self):
        result = pc.utf8_is_digit(self._pa_array)
        return self._convert_bool_result(result)

    def _str_islower(self):
        result = pc.utf8_is_lower(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isnumeric(self):
        result = pc.utf8_is_numeric(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isspace(self):
        result = pc.utf8_is_space(self._pa_array)
        return self._convert_bool_result(result)

    def _str_istitle(self):
        result = pc.utf8_is_title(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isupper(self):
        result = pc.utf8_is_upper(self._pa_array)
        return self._convert_bool_result(result)

    def _str_contains(
        self,
        pat,
        case: bool = True,
        flags: int = 0,
        na: Scalar | lib.NoDefault = lib.no_default,
        regex: bool = True,
    ):
        if flags:
            raise NotImplementedError(f"contains not implemented with {flags=}")

        if regex:
            pa_contains = pc.match_substring_regex
        else:
            pa_contains = pc.match_substring
        result = pa_contains(self._pa_array, pat, ignore_case=not case)
        return self._convert_bool_result(result, na=na, method_name="contains")

    def _str_match(
        self,
        pat: str | re.Pattern,
        case: bool = True,
        flags: int = 0,
        na: Scalar | lib.NoDefault = lib.no_default,
    ):
        if isinstance(pat, re.Pattern):
            # GH#61952
            pat = pat.pattern
        if isinstance(pat, str) and not pat.startswith("^"):
            pat = f"^{pat}"
        return self._str_contains(pat, case, flags, na, regex=True)

    def _str_fullmatch(
        self,
        pat: str | re.Pattern,
        case: bool = True,
        flags: int = 0,
        na: Scalar | lib.NoDefault = lib.no_default,
    ):
        if isinstance(pat, re.Pattern):
            # GH#61952
            pat = pat.pattern
        if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
            pat = f"{pat}$"
        return self._str_match(pat, case, flags, na)

    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
        if (
            pa_version_under13p0
            and not (start != 0 and end is not None)
            and not (start == 0 and end is None)
        ):
            # GH#59562
            res_list = self._apply_elementwise(lambda val: val.find(sub, start, end))
            return self._convert_int_result(pa.chunked_array(res_list))

        if (start == 0 or start is None) and end is None:
            result = pc.find_substring(self._pa_array, sub)
        else:
            if sub == "":
                # GH#56792
                res_list = self._apply_elementwise(
                    lambda val: val.find(sub, start, end)
                )
                return self._convert_int_result(pa.chunked_array(res_list))
            if start is None:
                start_offset = 0
                start = 0
            elif start < 0:
                start_offset = pc.add(start, pc.utf8_length(self._pa_array))
                start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset)
            else:
                start_offset = start
            slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
            result = pc.find_substring(slices, sub)
            found = pc.not_equal(result, pa.scalar(-1, type=result.type))
            offset_result = pc.add(result, start_offset)
            result = pc.if_else(found, offset_result, -1)
        return self._convert_int_result(result)
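
    # Editor's note: a brief worked example (not part of the original file)
    # of the offset arithmetic above. For "hello".find("l", -3), the slice
    # searched is "llo"; pc.find_substring returns 0 within that slice, and
    # start_offset = -3 + len("hello") = 2, so the reported index is 2,
    # matching Python:
    # >>> "hello".find("l", -3)
    # 2
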
544
lib/python3.11/site-packages/pandas/core/arrays/_mixins.py
Normal file
@ -0,0 +1,544 @@
from __future__ import annotations

from functools import wraps
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
    cast,
    overload,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.tslibs import is_supported_dtype
from pandas._typing import (
    ArrayLike,
    AxisInt,
    Dtype,
    F,
    FillnaOptions,
    PositionalIndexer2D,
    PositionalIndexerTuple,
    ScalarIndexer,
    Self,
    SequenceIndexer,
    Shape,
    TakeIndexer,
    npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
    validate_bool_kwarg,
    validate_fillna_kwargs,
    validate_insert_loc,
)

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import (
    DatetimeTZDtype,
    ExtensionDtype,
    PeriodDtype,
)
from pandas.core.dtypes.missing import array_equivalent

from pandas.core import missing
from pandas.core.algorithms import (
    take,
    unique,
    value_counts_internal as value_counts,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays.base import ExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.sorting import nargminmax

if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        NumpySorter,
        NumpyValueArrayLike,
    )

    from pandas import Series


def ravel_compat(meth: F) -> F:
    """
    Decorator to ravel a 2D array before passing it to a cython operation,
    then reshape the result to our own shape.
    """

    @wraps(meth)
    def method(self, *args, **kwargs):
        if self.ndim == 1:
            return meth(self, *args, **kwargs)

        flags = self._ndarray.flags
        flat = self.ravel("K")
        result = meth(flat, *args, **kwargs)
        order = "F" if flags.f_contiguous else "C"
        return result.reshape(self.shape, order=order)

    return cast(F, method)
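
# Editor's note: a minimal sketch (not part of the original file) of the
# ravel/reshape round-trip the decorator performs, shown here on a plain
# ndarray rather than an ExtensionArray:
# >>> arr = np.arange(6).reshape(2, 3)
# >>> flat = arr.ravel("K")                # 1D view in memory order
# >>> flat.reshape(arr.shape, order="C")   # restored to the original shape
# array([[0, 1, 2],
#        [3, 4, 5]])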


class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
    """
    ExtensionArray that is backed by a single NumPy ndarray.
    """

    _ndarray: np.ndarray

    # scalar used to denote NA value inside our self._ndarray, e.g. -1
    # for Categorical, iNaT for Period. Outside of object dtype,
    # self.isna() should be exactly locations in self._ndarray with
    # _internal_fill_value.
    _internal_fill_value: Any

    def _box_func(self, x):
        """
        Wrap numpy type in our dtype.type if necessary.
        """
        return x

    def _validate_scalar(self, value):
        # used by NDArrayBackedExtensionIndex.insert
        raise AbstractMethodError(self)

    # ------------------------------------------------------------------------

    def view(self, dtype: Dtype | None = None) -> ArrayLike:
        # We handle datetime64, datetime64tz, timedelta64, and period
        # dtypes here. Everything else we pass through to the underlying
        # ndarray.
        if dtype is None or dtype is self.dtype:
            return self._from_backing_data(self._ndarray)

        if isinstance(dtype, type):
            # we sometimes pass non-dtype objects, e.g np.ndarray;
            # pass those through to the underlying ndarray
            return self._ndarray.view(dtype)

        dtype = pandas_dtype(dtype)
        arr = self._ndarray

        if isinstance(dtype, PeriodDtype):
            cls = dtype.construct_array_type()
            return cls(arr.view("i8"), dtype=dtype)
        elif isinstance(dtype, DatetimeTZDtype):
            dt_cls = dtype.construct_array_type()
            dt64_values = arr.view(f"M8[{dtype.unit}]")
            return dt_cls._simple_new(dt64_values, dtype=dtype)
        elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
            from pandas.core.arrays import DatetimeArray

            dt64_values = arr.view(dtype)
            return DatetimeArray._simple_new(dt64_values, dtype=dtype)

        elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
            from pandas.core.arrays import TimedeltaArray

            td64_values = arr.view(dtype)
            return TimedeltaArray._simple_new(td64_values, dtype=dtype)

        # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
        # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
        # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
        # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
        return arr.view(dtype=dtype)  # type: ignore[arg-type]

    def take(
        self,
        indices: TakeIndexer,
        *,
        allow_fill: bool = False,
        fill_value: Any = None,
        axis: AxisInt = 0,
    ) -> Self:
        if allow_fill:
            fill_value = self._validate_scalar(fill_value)

        new_data = take(
            self._ndarray,
            indices,
            allow_fill=allow_fill,
            fill_value=fill_value,
            axis=axis,
        )
        return self._from_backing_data(new_data)

    # ------------------------------------------------------------------------

    def equals(self, other) -> bool:
        if type(self) is not type(other):
            return False
        if self.dtype != other.dtype:
            return False
        return bool(array_equivalent(self._ndarray, other._ndarray, dtype_equal=True))

    @classmethod
    def _from_factorized(cls, values, original):
        assert values.dtype == original._ndarray.dtype
        return original._from_backing_data(values)

    def _values_for_argsort(self) -> np.ndarray:
        return self._ndarray

    def _values_for_factorize(self):
        return self._ndarray, self._internal_fill_value

    def _hash_pandas_object(
        self, *, encoding: str, hash_key: str, categorize: bool
    ) -> npt.NDArray[np.uint64]:
        from pandas.core.util.hashing import hash_array

        values = self._ndarray
        return hash_array(
            values, encoding=encoding, hash_key=hash_key, categorize=categorize
        )

    # Signature of "argmin" incompatible with supertype "ExtensionArray"
    def argmin(self, axis: AxisInt = 0, skipna: bool = True):  # type: ignore[override]
        # override base class by adding axis keyword
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return nargminmax(self, "argmin", axis=axis)

    # Signature of "argmax" incompatible with supertype "ExtensionArray"
    def argmax(self, axis: AxisInt = 0, skipna: bool = True):  # type: ignore[override]
        # override base class by adding axis keyword
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return nargminmax(self, "argmax", axis=axis)

    def unique(self) -> Self:
        new_data = unique(self._ndarray)
        return self._from_backing_data(new_data)

    @classmethod
    @doc(ExtensionArray._concat_same_type)
    def _concat_same_type(
        cls,
        to_concat: Sequence[Self],
        axis: AxisInt = 0,
    ) -> Self:
        if not lib.dtypes_all_equal([x.dtype for x in to_concat]):
            dtypes = {str(x.dtype) for x in to_concat}
            raise ValueError("to_concat must have the same dtype", dtypes)

        return super()._concat_same_type(to_concat, axis=axis)

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter | None = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        npvalue = self._validate_setitem_value(value)
        return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)

    @doc(ExtensionArray.shift)
    def shift(self, periods: int = 1, fill_value=None):
        # NB: shift is always along axis=0
        axis = 0
        fill_value = self._validate_scalar(fill_value)
        new_values = shift(self._ndarray, periods, axis, fill_value)

        return self._from_backing_data(new_values)

    def __setitem__(self, key, value) -> None:
        key = check_array_indexer(self, key)
        value = self._validate_setitem_value(value)
        self._ndarray[key] = value

    def _validate_setitem_value(self, value):
        return value

    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self,
        key: SequenceIndexer | PositionalIndexerTuple,
    ) -> Self:
        ...

    def __getitem__(
        self,
        key: PositionalIndexer2D,
    ) -> Self | Any:
        if lib.is_integer(key):
            # fast-path
            result = self._ndarray[key]
            if self.ndim == 1:
                return self._box_func(result)
            return self._from_backing_data(result)

        # error: Incompatible types in assignment (expression has type "ExtensionArray",
        # variable has type "Union[int, slice, ndarray]")
        key = extract_array(key, extract_numpy=True)  # type: ignore[assignment]
        key = check_array_indexer(self, key)
        result = self._ndarray[key]
        if lib.is_scalar(result):
            return self._box_func(result)

        result = self._from_backing_data(result)
        return result

    def _fill_mask_inplace(
        self, method: str, limit: int | None, mask: npt.NDArray[np.bool_]
    ) -> None:
        # (for now) when self.ndim == 2, we assume axis=0
        func = missing.get_fill_func(method, ndim=self.ndim)
        func(self._ndarray.T, limit=limit, mask=mask.T)

    def _pad_or_backfill(
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        mask = self.isna()
        if mask.any():
            # (for now) when self.ndim == 2, we assume axis=0
            func = missing.get_fill_func(method, ndim=self.ndim)

            npvalues = self._ndarray.T
            if copy:
                npvalues = npvalues.copy()
            func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T)
            npvalues = npvalues.T

            if copy:
                new_values = self._from_backing_data(npvalues)
            else:
                new_values = self

        else:
            if copy:
                new_values = self.copy()
            else:
                new_values = self
        return new_values

    @doc(ExtensionArray.fillna)
    def fillna(
        self, value=None, method=None, limit: int | None = None, copy: bool = True
    ) -> Self:
        value, method = validate_fillna_kwargs(
            value, method, validate_scalar_dict_value=False
        )

        mask = self.isna()
        # error: Argument 2 to "check_value_size" has incompatible type
        # "ExtensionArray"; expected "ndarray"
        value = missing.check_value_size(
            value, mask, len(self)  # type: ignore[arg-type]
        )

        if mask.any():
            if method is not None:
                # (for now) when self.ndim == 2, we assume axis=0
                func = missing.get_fill_func(method, ndim=self.ndim)
                npvalues = self._ndarray.T
                if copy:
                    npvalues = npvalues.copy()
                func(npvalues, limit=limit, mask=mask.T)
                npvalues = npvalues.T

                # TODO: NumpyExtensionArray didn't used to copy, need tests
                # for this
                new_values = self._from_backing_data(npvalues)
            else:
                # fill with value
                if copy:
                    new_values = self.copy()
                else:
                    new_values = self[:]
                new_values[mask] = value
        else:
            # We validate the fill_value even if there is nothing to fill
            if value is not None:
                self._validate_setitem_value(value)

            if not copy:
                new_values = self[:]
            else:
                new_values = self.copy()
        return new_values

    # ------------------------------------------------------------------------
    # Reductions

    def _wrap_reduction_result(self, axis: AxisInt | None, result):
        if axis is None or self.ndim == 1:
            return self._box_func(result)
        return self._from_backing_data(result)

    # ------------------------------------------------------------------------
    # __array_function__ methods

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        """
        Analogue to np.putmask(self, mask, value)

        Parameters
        ----------
        mask : np.ndarray[bool]
        value : scalar or listlike

        Raises
        ------
        TypeError
            If value cannot be cast to self.dtype.
        """
        value = self._validate_setitem_value(value)

        np.putmask(self._ndarray, mask, value)

    def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self:
        """
        Analogue to np.where(mask, self, value)

        Parameters
        ----------
        mask : np.ndarray[bool]
        value : scalar or listlike

        Raises
        ------
        TypeError
            If value cannot be cast to self.dtype.
        """
        value = self._validate_setitem_value(value)

        res_values = np.where(mask, self._ndarray, value)
        if res_values.dtype != self._ndarray.dtype:
            raise AssertionError(
                # GH#56410
                "Something has gone wrong, please report a bug at "
                "github.com/pandas-dev/pandas/"
            )
        return self._from_backing_data(res_values)

    # ------------------------------------------------------------------------
    # Index compat methods

    def insert(self, loc: int, item) -> Self:
        """
        Make new ExtensionArray inserting new item at location. Follows
        Python list.append semantics for negative values.

        Parameters
        ----------
        loc : int
        item : object

        Returns
        -------
        type(self)
        """
        loc = validate_insert_loc(loc, len(self))

        code = self._validate_scalar(item)

        new_vals = np.concatenate(
            (
                self._ndarray[:loc],
                np.asarray([code], dtype=self._ndarray.dtype),
                self._ndarray[loc:],
            )
        )
        return self._from_backing_data(new_vals)

    # ------------------------------------------------------------------------
    # Additional array methods
    # These are not part of the EA API, but we implement them because
    # pandas assumes they're there.

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NA values.

        Returns
        -------
        Series
        """
        if self.ndim != 1:
            raise NotImplementedError

        from pandas import (
            Index,
            Series,
        )

        if dropna:
            # error: Unsupported operand type for ~ ("ExtensionArray")
            values = self[~self.isna()]._ndarray  # type: ignore[operator]
        else:
            values = self._ndarray

        result = value_counts(values, sort=False, dropna=dropna)

        index_arr = self._from_backing_data(np.asarray(result.index._data))
        index = Index(index_arr, name=result.index.name)
        return Series(result._values, index=index, name=result.name, copy=False)

    def _quantile(
        self,
        qs: npt.NDArray[np.float64],
        interpolation: str,
    ) -> Self:
        # TODO: disable for Categorical if not ordered?

        mask = np.asarray(self.isna())
        arr = self._ndarray
        fill_value = self._internal_fill_value

        res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
        if res_values.dtype == self._ndarray.dtype:
            return self._from_backing_data(res_values)
        else:
            # e.g. test_quantile_empty we are empty integer dtype and res_values
            # has floating dtype
            # TODO: technically __init__ isn't defined here.
            # Should we raise NotImplementedError and handle this on NumpyEA?
            return type(self)(res_values)  # type: ignore[call-arg]

    # ------------------------------------------------------------------------
    # numpy-like methods

    @classmethod
    def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
        """
        Analogous to np.empty(shape, dtype=dtype)

        Parameters
        ----------
        shape : tuple[int]
        dtype : ExtensionDtype
        """
        # The base implementation uses a naive approach to find the dtype
        # for the backing ndarray
        arr = cls._from_sequence([], dtype=dtype)
        backing = np.empty(shape, dtype=arr._ndarray.dtype)
        return arr._from_backing_data(backing)
207
lib/python3.11/site-packages/pandas/core/arrays/_ranges.py
Normal file
@ -0,0 +1,207 @@
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas._libs.lib import i8max
from pandas._libs.tslibs import (
    BaseOffset,
    OutOfBoundsDatetime,
    Timedelta,
    Timestamp,
    iNaT,
)

if TYPE_CHECKING:
    from pandas._typing import npt


def generate_regular_range(
    start: Timestamp | Timedelta | None,
    end: Timestamp | Timedelta | None,
    periods: int | None,
    freq: BaseOffset,
    unit: str = "ns",
) -> npt.NDArray[np.intp]:
    """
    Generate a range of dates or timestamps with the spans between dates
    described by the given `freq` DateOffset.

    Parameters
    ----------
    start : Timedelta, Timestamp or None
        First point of produced date range.
    end : Timedelta, Timestamp or None
        Last point of produced date range.
    periods : int or None
        Number of periods in produced date range.
    freq : Tick
        Describes space between dates in produced date range.
    unit : str, default "ns"
        The resolution the output is meant to represent.

    Returns
    -------
    ndarray[np.int64]
        Representing the given resolution.
    """
    istart = start._value if start is not None else None
    iend = end._value if end is not None else None
    freq.nanos  # raises if non-fixed frequency
    td = Timedelta(freq)
    b: int
    e: int
    try:
        td = td.as_unit(unit, round_ok=False)
    except ValueError as err:
        raise ValueError(
            f"freq={freq} is incompatible with unit={unit}. "
            "Use a lower freq or a higher unit instead."
        ) from err
    stride = int(td._value)

    if periods is None and istart is not None and iend is not None:
        b = istart
        # cannot just use e = Timestamp(end) + 1 because arange breaks when
        # stride is too large, see GH10887
        e = b + (iend - b) // stride * stride + stride // 2 + 1
    elif istart is not None and periods is not None:
        b = istart
        e = _generate_range_overflow_safe(b, periods, stride, side="start")
    elif iend is not None and periods is not None:
        e = iend + stride
        b = _generate_range_overflow_safe(e, periods, stride, side="end")
    else:
        raise ValueError(
            "at least 'start' or 'end' should be specified if a 'period' is given."
        )

    with np.errstate(over="raise"):
        # If the range is sufficiently large, np.arange may overflow
        # and incorrectly return an empty array if not caught.
        try:
            values = np.arange(b, e, stride, dtype=np.int64)
        except FloatingPointError:
            xdr = [b]
            while xdr[-1] != e:
                xdr.append(xdr[-1] + stride)
            values = np.array(xdr[:-1], dtype=np.int64)
    return values
|
||||
|
||||
def _generate_range_overflow_safe(
|
||||
endpoint: int, periods: int, stride: int, side: str = "start"
|
||||
) -> int:
|
||||
"""
|
||||
Calculate the second endpoint for passing to np.arange, checking
|
||||
to avoid an integer overflow. Catch OverflowError and re-raise
|
||||
as OutOfBoundsDatetime.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
endpoint : int
|
||||
nanosecond timestamp of the known endpoint of the desired range
|
||||
periods : int
|
||||
number of periods in the desired range
|
||||
stride : int
|
||||
nanoseconds between periods in the desired range
|
||||
side : {'start', 'end'}
|
||||
which end of the range `endpoint` refers to
|
||||
|
||||
Returns
|
||||
-------
|
||||
other_end : int
|
||||
|
||||
Raises
|
||||
------
|
||||
OutOfBoundsDatetime
|
||||
"""
|
||||
# GH#14187 raise instead of incorrectly wrapping around
|
||||
assert side in ["start", "end"]
|
||||
|
||||
i64max = np.uint64(i8max)
|
||||
msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
|
||||
|
||||
with np.errstate(over="raise"):
|
||||
# if periods * strides cannot be multiplied within the *uint64* bounds,
|
||||
# we cannot salvage the operation by recursing, so raise
|
||||
try:
|
||||
addend = np.uint64(periods) * np.uint64(np.abs(stride))
|
||||
except FloatingPointError as err:
|
||||
raise OutOfBoundsDatetime(msg) from err
|
||||
|
||||
if np.abs(addend) <= i64max:
|
||||
# relatively easy case without casting concerns
|
||||
return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
|
||||
|
||||
elif (endpoint > 0 and side == "start" and stride > 0) or (
|
||||
endpoint < 0 < stride and side == "end"
|
||||
):
|
||||
# no chance of not-overflowing
|
||||
raise OutOfBoundsDatetime(msg)
|
||||
|
||||
elif side == "end" and endpoint - stride <= i64max < endpoint:
|
||||
# in _generate_regular_range we added `stride` thereby overflowing
|
||||
# the bounds. Adjust to fix this.
|
||||
return _generate_range_overflow_safe(
|
||||
endpoint - stride, periods - 1, stride, side
|
||||
)
|
||||
|
||||
# split into smaller pieces
|
||||
mid_periods = periods // 2
|
||||
remaining = periods - mid_periods
|
||||
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
|
||||
|
||||
midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side))
|
||||
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
|
||||
|
||||
|
||||
def _generate_range_overflow_safe_signed(
|
||||
endpoint: int, periods: int, stride: int, side: str
|
||||
) -> int:
|
||||
"""
|
||||
A special case for _generate_range_overflow_safe where `periods * stride`
|
||||
can be calculated without overflowing int64 bounds.
|
||||
"""
|
||||
assert side in ["start", "end"]
|
||||
if side == "end":
|
||||
stride *= -1
|
||||
|
||||
with np.errstate(over="raise"):
|
||||
addend = np.int64(periods) * np.int64(stride)
|
||||
try:
|
||||
# easy case with no overflows
|
||||
result = np.int64(endpoint) + addend
|
||||
if result == iNaT:
|
||||
# Putting this into a DatetimeArray/TimedeltaArray
|
||||
# would incorrectly be interpreted as NaT
|
||||
raise OverflowError
|
||||
return int(result)
|
||||
except (FloatingPointError, OverflowError):
|
||||
            # with endpoint negative and addend positive we risk
            # FloatingPointError; with reversed signs we risk OverflowError
            pass

    # if stride and endpoint had opposite signs, then endpoint + addend
    # should never overflow. so they must have the same signs
    assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)

    if stride > 0:
        # watch out for very special case in which we just slightly
        # exceed implementation bounds, but when passing the result to
        # np.arange will get a result slightly within the bounds

        uresult = np.uint64(endpoint) + np.uint64(addend)
        i64max = np.uint64(i8max)
        assert uresult > i64max
        if uresult <= i64max + np.uint64(stride):
            return int(uresult)

    raise OutOfBoundsDatetime(
        f"Cannot generate range with {side}={endpoint} and periods={periods}"
    )
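Editor's note: a sanity sketch of the endpoint arithmetic above, with values chosen to stay well inside int64 bounds (the functions above exist to guard the boundary cases). For side="start", the other endpoint is simply endpoint + periods * stride, and np.arange then yields exactly `periods` ticks.

import numpy as np

start, periods, stride = 0, 5, 3_600_000_000_000  # one hour in nanoseconds
end = start + periods * stride  # what _generate_range_overflow_safe computes
print(np.arange(start, end, stride, dtype=np.int64))
# [0 3600000000000 7200000000000 10800000000000 14400000000000]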
63
lib/python3.11/site-packages/pandas/core/arrays/_utils.py
Normal file
@ -0,0 +1,63 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
)

import numpy as np

from pandas._libs import lib
from pandas.errors import LossySetitemError

from pandas.core.dtypes.cast import np_can_hold_element
from pandas.core.dtypes.common import is_numeric_dtype

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        npt,
    )


def to_numpy_dtype_inference(
    arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
) -> tuple[npt.DTypeLike, Any]:
    if dtype is None and is_numeric_dtype(arr.dtype):
        dtype_given = False
        if hasna:
            if arr.dtype.kind == "b":
                dtype = np.dtype(np.object_)
            else:
                if arr.dtype.kind in "iu":
                    dtype = np.dtype(np.float64)
                else:
                    dtype = arr.dtype.numpy_dtype  # type: ignore[union-attr]
                if na_value is lib.no_default:
                    na_value = np.nan
        else:
            dtype = arr.dtype.numpy_dtype  # type: ignore[union-attr]
    elif dtype is not None:
        dtype = np.dtype(dtype)
        dtype_given = True
    else:
        dtype_given = True

    if na_value is lib.no_default:
        if dtype is None or not hasna:
            na_value = arr.dtype.na_value
        elif dtype.kind == "f":  # type: ignore[union-attr]
            na_value = np.nan
        elif dtype.kind == "M":  # type: ignore[union-attr]
            na_value = np.datetime64("nat")
        elif dtype.kind == "m":  # type: ignore[union-attr]
            na_value = np.timedelta64("nat")
        else:
            na_value = arr.dtype.na_value

    if not dtype_given and hasna:
        try:
            np_can_hold_element(dtype, na_value)  # type: ignore[arg-type]
        except LossySetitemError:
            dtype = np.dtype(np.object_)
    return dtype, na_value
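Editor's note: the effect of to_numpy_dtype_inference is visible through the public to_numpy on masked arrays; a small sketch assuming default arguments (no dtype or na_value passed):

import pandas as pd

print(pd.array([1, 2, None]).to_numpy().dtype)  # float64: ints widen to hold NaN
print(pd.array([True, None]).to_numpy().dtype)  # object: bools cannot widen losslessly
print(pd.array([1, 2, 3]).to_numpy().dtype)     # int64: no NA, numpy kind preserved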
@ -0,0 +1,7 @@
from pandas.core.arrays.arrow.accessors import (
    ListAccessor,
    StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray

__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]
@ -0,0 +1,50 @@
from __future__ import annotations

import numpy as np
import pyarrow


def pyarrow_array_to_numpy_and_mask(
    arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
    """
    Convert a primitive pyarrow.Array to a numpy array and boolean mask based
    on the buffers of the Array.

    At the moment pyarrow.BooleanArray is not supported.

    Parameters
    ----------
    arr : pyarrow.Array
    dtype : numpy.dtype

    Returns
    -------
    (data, mask)
        Tuple of two numpy arrays with the raw data (with specified dtype) and
        a boolean mask (validity mask, so False means missing)
    """
    dtype = np.dtype(dtype)

    if pyarrow.types.is_null(arr.type):
        # No initialization of data is needed since everything is null
        data = np.empty(len(arr), dtype=dtype)
        mask = np.zeros(len(arr), dtype=bool)
        return data, mask
    buflist = arr.buffers()
    # Since Arrow buffers might contain padding and the data might be offset,
    # the buffer gets sliced here before handing it to numpy.
    # See also https://github.com/pandas-dev/pandas/issues/40896
    offset = arr.offset * dtype.itemsize
    length = len(arr) * dtype.itemsize
    data_buf = buflist[1][offset : offset + length]
    data = np.frombuffer(data_buf, dtype=dtype)
    bitmask = buflist[0]
    if bitmask is not None:
        mask = pyarrow.BooleanArray.from_buffers(
            pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
        )
        mask = np.asarray(mask)
    else:
        mask = np.ones(len(arr), dtype=bool)
    return data, mask
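Editor's note: a usage sketch of the helper above. This is internal API; the import path follows this vendored tree.

import numpy as np
import pyarrow as pa
from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask

arr = pa.array([1, None, 3], type=pa.int64())
data, mask = pyarrow_array_to_numpy_and_mask(arr, np.dtype(np.int64))
print(mask)        # [ True False  True] -- validity mask, so False means missing
print(data[mask])  # [1 3]; the slot under mask==False holds unspecified bytes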
@ -0,0 +1,473 @@
"""Accessors for arrow-backed data."""

from __future__ import annotations

from abc import (
    ABCMeta,
    abstractmethod,
)
from typing import (
    TYPE_CHECKING,
    cast,
)

from pandas.compat import (
    pa_version_under10p1,
    pa_version_under11p0,
)

from pandas.core.dtypes.common import is_list_like

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.dtypes.dtypes import ArrowDtype

if TYPE_CHECKING:
    from collections.abc import Iterator

    from pandas import (
        DataFrame,
        Series,
    )


class ArrowAccessor(metaclass=ABCMeta):
    @abstractmethod
    def __init__(self, data, validation_msg: str) -> None:
        self._data = data
        self._validation_msg = validation_msg
        self._validate(data)

    @abstractmethod
    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        pass

    def _validate(self, data):
        dtype = data.dtype
        if pa_version_under10p1 or not isinstance(dtype, ArrowDtype):
            # Raise AttributeError so that inspect can handle non-struct Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

        if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
            # Raise AttributeError so that inspect can handle invalid Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

    @property
    def _pa_array(self):
        return self._data.array._pa_array


class ListAccessor(ArrowAccessor):
    """
    Accessor object for list data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow list data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg="Can only use the '.list' accessor with "
            "'list[pyarrow]' dtype, not {dtype}.",
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        return (
            pa.types.is_list(pyarrow_dtype)
            or pa.types.is_fixed_size_list(pyarrow_dtype)
            or pa.types.is_large_list(pyarrow_dtype)
        )

    def len(self) -> Series:
        """
        Return the length of each list in the Series.

        Returns
        -------
        pandas.Series
            The length of each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.len()
        0    3
        1    1
        dtype: int32[pyarrow]
        """
        from pandas import Series

        value_lengths = pc.list_value_length(self._pa_array)
        return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))

    def __getitem__(self, key: int | slice) -> Series:
        """
        Index or slice lists in the Series.

        Parameters
        ----------
        key : int | slice
            Index or slice of indices to access from each list.

        Returns
        -------
        pandas.Series
            The list at requested index.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list[0]
        0    1
        1    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        if isinstance(key, int):
            # TODO: Support negative key but pyarrow does not allow
            # element index to be an array.
            # if key < 0:
            #     key = pc.add(key, pc.list_value_length(self._pa_array))
            element = pc.list_element(self._pa_array, key)
            return Series(element, dtype=ArrowDtype(element.type))
        elif isinstance(key, slice):
            if pa_version_under11p0:
                raise NotImplementedError(
                    f"List slice not supported by pyarrow {pa.__version__}."
                )

            # TODO: Support negative start/stop/step, ideally this would be added
            # upstream in pyarrow.
            start, stop, step = key.start, key.stop, key.step
            if start is None:
                # TODO: When adding negative step support
                # this should be set to the last element of the array
                # when step is negative.
                start = 0
            if step is None:
                step = 1
            sliced = pc.list_slice(self._pa_array, start, stop, step)
            return Series(sliced, dtype=ArrowDtype(sliced.type))
        else:
            raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

    def __iter__(self) -> Iterator:
        raise TypeError(f"'{type(self).__name__}' object is not iterable")

    def flatten(self) -> Series:
        """
        Flatten list values.

        Returns
        -------
        pandas.Series
            The data from all lists in the series flattened.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.flatten()
        0    1
        1    2
        2    3
        3    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        flattened = pc.list_flatten(self._pa_array)
        return Series(flattened, dtype=ArrowDtype(flattened.type))


class StructAccessor(ArrowAccessor):
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg=(
                "Can only use the '.struct' accessor with 'struct[pyarrow]' "
                "dtype, not {dtype}."
            ),
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        return pa.types.is_struct(pyarrow_dtype)

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        pa_type = self._data.dtype.pyarrow_dtype
        types = [ArrowDtype(struct.type) for struct in pa_type]
        names = [struct.name for struct in pa_type]
        return Series(types, index=Index(names))

    def field(
        self,
        name_or_index: list[str]
        | list[bytes]
        | list[int]
        | pc.Expression
        | bytes
        | str
        | int,
    ) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | bytes | int | expression | list
            Name or index of the child field to extract.

            For list-like inputs, this will index into a nested
            struct.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        See Also
        --------
        Series.struct.explode : Return all child fields as a DataFrame.

        Notes
        -----
        The name of the resulting Series will be set using the following
        rules:

        - For string, bytes, or integer `name_or_index` (or a list of these, for
          a nested selection), the Series name is set to the selected
          field's name.
        - For a :class:`pyarrow.compute.Expression`, this is set to
          the string form of the expression.
        - For list-like `name_or_index`, the name will be set to the
          name of the final field selected.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]

        Or an expression

        >>> import pyarrow.compute as pc
        >>> s.struct.field(pc.field("project"))
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        For nested struct types, you can pass a list of values to index
        multiple levels:

        >>> version_type = pa.struct([
        ...     ("major", pa.int64()),
        ...     ("minor", pa.int64()),
        ... ])
        >>> s = pd.Series(
        ...     [
        ...         {"version": {"major": 1, "minor": 5}, "project": "pandas"},
        ...         {"version": {"major": 2, "minor": 1}, "project": "pandas"},
        ...         {"version": {"major": 1, "minor": 26}, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", version_type), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.field(["version", "minor"])
        0     5
        1     1
        2    26
        Name: minor, dtype: int64[pyarrow]
        >>> s.struct.field([0, 0])
        0    1
        1    2
        2    1
        Name: major, dtype: int64[pyarrow]
        """
        from pandas import Series

        def get_name(
            level_name_or_index: list[str]
            | list[bytes]
            | list[int]
            | pc.Expression
            | bytes
            | str
            | int,
            data: pa.ChunkedArray,
        ):
            if isinstance(level_name_or_index, int):
                name = data.type.field(level_name_or_index).name
            elif isinstance(level_name_or_index, (str, bytes)):
                name = level_name_or_index
            elif isinstance(level_name_or_index, pc.Expression):
                name = str(level_name_or_index)
            elif is_list_like(level_name_or_index):
                # For nested input like [2, 1, 2]
                # iteratively get the struct and field name. The last
                # one is used for the name of the index.
                level_name_or_index = list(reversed(level_name_or_index))
                selected = data
                while level_name_or_index:
                    # we need the cast, otherwise mypy complains about
                    # getting ints, bytes, or str here, which isn't possible.
                    level_name_or_index = cast(list, level_name_or_index)
                    name_or_index = level_name_or_index.pop()
                    name = get_name(name_or_index, selected)
                    selected = selected.type.field(selected.type.get_field_index(name))
                    name = selected.name
            else:
                raise ValueError(
                    "name_or_index must be an int, str, bytes, "
                    "pyarrow.compute.Expression, or list of those"
                )
            return name

        pa_arr = self._data.array._pa_array
        name = get_name(name_or_index, pa_arr)
        field_arr = pc.struct_field(pa_arr, name_or_index)

        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._data.index,
            name=name,
        )

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

        pa_type = self._pa_array.type
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )
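Editor's note: a compact end-to-end sketch of the two accessors defined above (requires pyarrow >= 10.0.1, per the version gate at the top of the module):

import pyarrow as pa
import pandas as pd

s = pd.Series(
    [{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"}],
    dtype=pd.ArrowDtype(
        pa.struct([("version", pa.int64()), ("project", pa.string())])
    ),
)
print(s.struct.field("project").tolist())   # ['pandas', 'numpy']
print(s.struct.explode().columns.tolist())  # ['version', 'project']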
2946
lib/python3.11/site-packages/pandas/core/arrays/arrow/array.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,174 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING

import pyarrow

from pandas.compat import pa_version_under14p1

from pandas.core.dtypes.dtypes import (
    IntervalDtype,
    PeriodDtype,
)

from pandas.core.arrays.interval import VALID_CLOSED

if TYPE_CHECKING:
    from pandas._typing import IntervalClosedType


class ArrowPeriodType(pyarrow.ExtensionType):
    def __init__(self, freq) -> None:
        # attributes need to be set first before calling
        # super init (as that calls serialize)
        self._freq = freq
        pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")

    @property
    def freq(self):
        return self._freq

    def __arrow_ext_serialize__(self) -> bytes:
        metadata = {"freq": self.freq}
        return json.dumps(metadata).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
        metadata = json.loads(serialized.decode())
        return ArrowPeriodType(metadata["freq"])

    def __eq__(self, other):
        if isinstance(other, pyarrow.BaseExtensionType):
            return type(self) == type(other) and self.freq == other.freq
        else:
            return NotImplemented

    def __ne__(self, other) -> bool:
        return not self == other

    def __hash__(self) -> int:
        return hash((str(self), self.freq))

    def to_pandas_dtype(self) -> PeriodDtype:
        return PeriodDtype(freq=self.freq)


# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)


class ArrowIntervalType(pyarrow.ExtensionType):
    def __init__(self, subtype, closed: IntervalClosedType) -> None:
        # attributes need to be set first before calling
        # super init (as that calls serialize)
        assert closed in VALID_CLOSED
        self._closed: IntervalClosedType = closed
        if not isinstance(subtype, pyarrow.DataType):
            subtype = pyarrow.type_for_alias(str(subtype))
        self._subtype = subtype

        storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
        pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")

    @property
    def subtype(self):
        return self._subtype

    @property
    def closed(self) -> IntervalClosedType:
        return self._closed

    def __arrow_ext_serialize__(self) -> bytes:
        metadata = {"subtype": str(self.subtype), "closed": self.closed}
        return json.dumps(metadata).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
        metadata = json.loads(serialized.decode())
        subtype = pyarrow.type_for_alias(metadata["subtype"])
        closed = metadata["closed"]
        return ArrowIntervalType(subtype, closed)

    def __eq__(self, other):
        if isinstance(other, pyarrow.BaseExtensionType):
            return (
                type(self) == type(other)
                and self.subtype == other.subtype
                and self.closed == other.closed
            )
        else:
            return NotImplemented

    def __ne__(self, other) -> bool:
        return not self == other

    def __hash__(self) -> int:
        return hash((str(self), str(self.subtype), self.closed))

    def to_pandas_dtype(self) -> IntervalDtype:
        return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)


# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)


_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}

Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:

- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
  `import pyarrow_hotfix; pyarrow_hotfix.uninstall()`

We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
"""


def patch_pyarrow():
    # starting from pyarrow 14.0.1, it has its own mechanism
    if not pa_version_under14p1:
        return

    # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
    if getattr(pyarrow, "_hotfix_installed", False):
        return

    class ForbiddenExtensionType(pyarrow.ExtensionType):
        def __arrow_ext_serialize__(self):
            return b""

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            import io
            import pickletools

            out = io.StringIO()
            pickletools.dis(serialized, out)
            raise RuntimeError(
                _ERROR_MSG.format(
                    storage_type=storage_type,
                    serialized=serialized,
                    pickle_disassembly=out.getvalue(),
                )
            )

    pyarrow.unregister_extension_type("arrow.py_extension_type")
    pyarrow.register_extension_type(
        ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
    )

    pyarrow._hotfix_installed = True


patch_pyarrow()
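Editor's note: the serialize/deserialize pair above round-trips through plain JSON metadata; a short sketch (internal API, import path per this vendored tree):

import pyarrow
from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

typ = ArrowIntervalType(pyarrow.int64(), "left")
payload = typ.__arrow_ext_serialize__()  # b'{"subtype": "int64", "closed": "left"}'
same = ArrowIntervalType.__arrow_ext_deserialize__(typ.storage_type, payload)
assert typ == same and typ.to_pandas_dtype() == same.to_pandas_dtype()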
2609
lib/python3.11/site-packages/pandas/core/arrays/base.py
Normal file
File diff suppressed because it is too large
407
lib/python3.11/site-packages/pandas/core/arrays/boolean.py
Normal file
@ -0,0 +1,407 @@
from __future__ import annotations

import numbers
from typing import (
    TYPE_CHECKING,
    ClassVar,
    cast,
)

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna

from pandas.core import ops
from pandas.core.array_algos import masked_accumulations
from pandas.core.arrays.masked import (
    BaseMaskedArray,
    BaseMaskedDtype,
)

if TYPE_CHECKING:
    import pyarrow

    from pandas._typing import (
        Dtype,
        DtypeObj,
        Self,
        npt,
        type_t,
    )


@register_extension_dtype
class BooleanDtype(BaseMaskedDtype):
    """
    Extension dtype for boolean data.

    .. warning::

       BooleanDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.BooleanDtype()
    BooleanDtype
    """

    name: ClassVar[str] = "boolean"

    # https://github.com/python/mypy/issues/4125
    # error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
    @property
    def type(self) -> type:  # type: ignore[override]
        return np.bool_

    @property
    def kind(self) -> str:
        return "b"

    @property
    def numpy_dtype(self) -> np.dtype:
        return np.dtype("bool")

    @classmethod
    def construct_array_type(cls) -> type_t[BooleanArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return BooleanArray

    def __repr__(self) -> str:
        return "BooleanDtype"

    @property
    def _is_boolean(self) -> bool:
        return True

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BooleanArray:
        """
        Construct BooleanArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type):
            raise TypeError(f"Expected array of boolean type, got {array.type} instead")

        if isinstance(array, pyarrow.Array):
            chunks = [array]
            length = len(array)
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks
            length = array.length()

        if pyarrow.types.is_null(array.type):
            mask = np.ones(length, dtype=bool)
            # No need to init data, since all null
            data = np.empty(length, dtype=bool)
            return BooleanArray(data, mask)

        results = []
        for arr in chunks:
            buflist = arr.buffers()
            data = pyarrow.BooleanArray.from_buffers(
                arr.type, len(arr), [None, buflist[1]], offset=arr.offset
            ).to_numpy(zero_copy_only=False)
            if arr.null_count != 0:
                mask = pyarrow.BooleanArray.from_buffers(
                    arr.type, len(arr), [None, buflist[0]], offset=arr.offset
                ).to_numpy(zero_copy_only=False)
                mask = ~mask
            else:
                mask = np.zeros(len(arr), dtype=bool)

            bool_arr = BooleanArray(data, mask)
            results.append(bool_arr)

        if not results:
            return BooleanArray(
                np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
            )
        else:
            return BooleanArray._concat_same_type(results)

def coerce_to_array(
    values, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)
    """
    if isinstance(values, BooleanArray):
        if mask is not None:
            raise ValueError("cannot pass mask for BooleanArray input")
        values, mask = values._data, values._mask
        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    mask_values = None
    if isinstance(values, np.ndarray) and values.dtype == np.bool_:
        if copy:
            values = values.copy()
    elif isinstance(values, np.ndarray) and values.dtype.kind in "iufcb":
        mask_values = isna(values)

        values_bool = np.zeros(len(values), dtype=bool)
        values_bool[~mask_values] = values[~mask_values].astype(bool)

        if not np.all(
            values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
        ):
            raise TypeError("Need to pass bool-like values")

        values = values_bool
    else:
        values_object = np.asarray(values, dtype=object)

        inferred_dtype = lib.infer_dtype(values_object, skipna=True)
        integer_like = ("floating", "integer", "mixed-integer-float")
        if inferred_dtype not in ("boolean", "empty") + integer_like:
            raise TypeError("Need to pass bool-like values")

        # mypy does not narrow the type of mask_values to npt.NDArray[np.bool_]
        # within this branch, it assumes it can also be None
        mask_values = cast("npt.NDArray[np.bool_]", isna(values_object))
        values = np.zeros(len(values), dtype=bool)
        values[~mask_values] = values_object[~mask_values].astype(bool)

        # if the values were integer-like, validate they were actually 0/1's
        if (inferred_dtype in integer_like) and not (
            np.all(
                values[~mask_values].astype(float)
                == values_object[~mask_values].astype(float)
            )
        ):
            raise TypeError("Need to pass bool-like values")

    if mask is None and mask_values is None:
        mask = np.zeros(values.shape, dtype=bool)
    elif mask is None:
        mask = mask_values
    else:
        if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
            if mask_values is not None:
                mask = mask | mask_values
            else:
                if copy:
                    mask = mask.copy()
        else:
            mask = np.array(mask, dtype=bool)
            if mask_values is not None:
                mask = mask | mask_values

    if values.shape != mask.shape:
        raise ValueError("values.shape and mask.shape must match")

    return values, mask
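
# Editor's sketch (not part of the original module): coerce_to_array accepts
# bools, 0/1 integer-likes, and object arrays with missing values, filling
# missing slots under the mask:
#
#   values, mask = coerce_to_array([True, False, None])
#   values -> array([ True, False, False])   # filled under the mask
#   mask   -> array([False, False,  True])   # True marks missing
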
class BooleanArray(BaseMaskedArray):
    """
    Array of boolean (True/False) data with missing values.

    This is a pandas Extension array for boolean data, under the hood
    represented by 2 numpy arrays: a boolean array with the data and
    a boolean array with the mask (True indicating missing).

    BooleanArray implements Kleene logic (sometimes called three-value
    logic) for logical operations. See :ref:`boolean.kleene` for more.

    To construct a BooleanArray from generic array-like input, use
    :func:`pandas.array` specifying ``dtype="boolean"`` (see examples
    below).

    .. warning::

       BooleanArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d boolean-dtype array with the data.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values (True
        indicates missing).
    copy : bool, default False
        Whether to copy the `values` and `mask` arrays.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    BooleanArray

    Examples
    --------
    Create a BooleanArray with :func:`pandas.array`:

    >>> pd.array([True, False, None], dtype="boolean")
    <BooleanArray>
    [True, False, <NA>]
    Length: 3, dtype: boolean
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = False
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "bool", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = True  # type: ignore[assignment]
    _falsey_value = False  # type: ignore[assignment]
    _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
    _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}

    @classmethod
    def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self:
        result = super()._simple_new(values, mask)
        result._dtype = BooleanDtype()
        return result

    def __init__(
        self, values: np.ndarray, mask: np.ndarray, copy: bool = False
    ) -> None:
        if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
            raise TypeError(
                "values should be boolean numpy array. Use "
                "the 'pd.array' function instead"
            )
        self._dtype = BooleanDtype()
        super().__init__(values, mask, copy=copy)

    @property
    def dtype(self) -> BooleanDtype:
        return self._dtype

    @classmethod
    def _from_sequence_of_strings(
        cls,
        strings: list[str],
        *,
        dtype: Dtype | None = None,
        copy: bool = False,
        true_values: list[str] | None = None,
        false_values: list[str] | None = None,
    ) -> BooleanArray:
        true_values_union = cls._TRUE_VALUES.union(true_values or [])
        false_values_union = cls._FALSE_VALUES.union(false_values or [])

        def map_string(s) -> bool:
            if s in true_values_union:
                return True
            elif s in false_values_union:
                return False
            else:
                raise ValueError(f"{s} cannot be cast to bool")

        scalars = np.array(strings, dtype=object)
        mask = isna(scalars)
        scalars[~mask] = list(map(map_string, scalars[~mask]))
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        if dtype:
            assert dtype == "boolean"
        return coerce_to_array(value, copy=copy)

    def _logical_method(self, other, op):
        assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
        other_is_scalar = lib.is_scalar(other)
        mask = None

        if isinstance(other, BooleanArray):
            other, mask = other._data, other._mask
        elif is_list_like(other):
            other = np.asarray(other, dtype="bool")
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            other, mask = coerce_to_array(other, copy=False)
        elif isinstance(other, np.bool_):
            other = other.item()

        if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
            raise TypeError(
                "'other' should be pandas.NA or a bool. "
                f"Got {type(other).__name__} instead."
            )

        if not other_is_scalar and len(self) != len(other):
            raise ValueError("Lengths must match")

        if op.__name__ in {"or_", "ror_"}:
            result, mask = ops.kleene_or(self._data, other, self._mask, mask)
        elif op.__name__ in {"and_", "rand_"}:
            result, mask = ops.kleene_and(self._data, other, self._mask, mask)
        else:
            # i.e. xor, rxor
            result, mask = ops.kleene_xor(self._data, other, self._mask, mask)

        # i.e. BooleanArray
        return self._maybe_mask_result(result, mask)

    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> BaseMaskedArray:
        data = self._data
        mask = self._mask
        if name in ("cummin", "cummax"):
            op = getattr(masked_accumulations, name)
            data, mask = op(data, mask, skipna=skipna, **kwargs)
            return self._simple_new(data, mask)
        else:
            from pandas.core.arrays import IntegerArray

            return IntegerArray(data.astype(int), mask)._accumulate(
                name, skipna=skipna, **kwargs
            )
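Editor's note: `_logical_method` above is where the documented Kleene semantics live; a short demonstration using only public API:

import pandas as pd

a = pd.array([True, False, None], dtype="boolean")
print(a | True)   # [True, True, True]   -- True absorbs NA under Kleene OR
print(a & True)   # [True, False, <NA>]  -- AND with True just propagates
print(a ^ pd.NA)  # [<NA>, <NA>, <NA>]   -- XOR with NA is always NA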
3111
lib/python3.11/site-packages/pandas/core/arrays/categorical.py
Normal file
File diff suppressed because it is too large
2583
lib/python3.11/site-packages/pandas/core/arrays/datetimelike.py
Normal file
File diff suppressed because it is too large
2837
lib/python3.11/site-packages/pandas/core/arrays/datetimes.py
Normal file
File diff suppressed because it is too large
173
lib/python3.11/site-packages/pandas/core/arrays/floating.py
Normal file
@ -0,0 +1,173 @@
from __future__ import annotations

from typing import ClassVar

import numpy as np

from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_float_dtype

from pandas.core.arrays.numeric import (
    NumericArray,
    NumericDtype,
)


class FloatingDtype(NumericDtype):
    """
    An ExtensionDtype to hold a single size of floating dtype.

    These specific implementations are subclasses of the non-public
    FloatingDtype. For example we have Float32Dtype to represent float32.

    The attributes name & type are set when these subclasses are created.
    """

    _default_np_dtype = np.dtype(np.float64)
    _checker = is_float_dtype

    @classmethod
    def construct_array_type(cls) -> type[FloatingArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatingArray

    @classmethod
    def _get_dtype_mapping(cls) -> dict[np.dtype, FloatingDtype]:
        return NUMPY_FLOAT_TO_DTYPE

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless.
        """
        # This is really only here for compatibility with IntegerDtype
        return values.astype(dtype, copy=copy)


class FloatingArray(NumericArray):
    """
    Array of floating (optional missing) values.

    .. warning::

       FloatingArray is currently experimental, and its API or internal
       implementation may change without warning. Especially the behaviour
       regarding NaN (distinct from NA missing values) is subject to change.

    We represent a FloatingArray with 2 numpy arrays:

    - data: contains a numpy float array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct a FloatingArray from generic array-like input, use
    :func:`pandas.array` with one of the float dtypes (see examples).
    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d float-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    FloatingArray

    Examples
    --------
    Create a FloatingArray with :func:`pandas.array`:

    >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([0.1, None, 0.3], dtype="Float32")
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32
    """

    _dtype_cls = FloatingDtype

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = np.nan
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "float", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = 1.0  # type: ignore[assignment]
    _falsey_value = 0.0  # type: ignore[assignment]


_dtype_docstring = """
An ExtensionDtype for {dtype} data.

This dtype uses ``pd.NA`` as missing value indicator.

Attributes
----------
None

Methods
-------
None

Examples
--------
For Float32Dtype:

>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float32Dtype())
>>> ser.dtype
Float32Dtype()

For Float64Dtype:

>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float64Dtype())
>>> ser.dtype
Float64Dtype()
"""

# create the Dtype


@register_extension_dtype
class Float32Dtype(FloatingDtype):
    type = np.float32
    name: ClassVar[str] = "Float32"
    __doc__ = _dtype_docstring.format(dtype="float32")


@register_extension_dtype
class Float64Dtype(FloatingDtype):
    type = np.float64
    name: ClassVar[str] = "Float64"
    __doc__ = _dtype_docstring.format(dtype="float64")


NUMPY_FLOAT_TO_DTYPE: dict[np.dtype, FloatingDtype] = {
    np.dtype(np.float32): Float32Dtype(),
    np.dtype(np.float64): Float64Dtype(),
}
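Editor's note: NUMPY_FLOAT_TO_DTYPE is what routes plain numpy floats into these masked dtypes; a quick sketch of both entry points:

import numpy as np
import pandas as pd

print(pd.array(np.array([0.5, 1.5], dtype=np.float32)).dtype)  # Float32
print(pd.array([0.5, None], dtype="Float64").dtype)            # Float64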
272
lib/python3.11/site-packages/pandas/core/arrays/integer.py
Normal file
@ -0,0 +1,272 @@
from __future__ import annotations

from typing import ClassVar

import numpy as np

from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_integer_dtype

from pandas.core.arrays.numeric import (
    NumericArray,
    NumericDtype,
)


class IntegerDtype(NumericDtype):
    """
    An ExtensionDtype to hold a single size & kind of integer dtype.

    These specific implementations are subclasses of the non-public
    IntegerDtype. For example, we have Int8Dtype to represent signed int 8s.

    The attributes name & type are set when these subclasses are created.
    """

    _default_np_dtype = np.dtype(np.int64)
    _checker = is_integer_dtype

    @classmethod
    def construct_array_type(cls) -> type[IntegerArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return IntegerArray

    @classmethod
    def _get_dtype_mapping(cls) -> dict[np.dtype, IntegerDtype]:
        return NUMPY_INT_TO_DTYPE

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless. e.g. if 'values'
        has a floating dtype, each value must be an integer.
        """
        try:
            return values.astype(dtype, casting="safe", copy=copy)
        except TypeError as err:
            casted = values.astype(dtype, copy=copy)
            if (casted == values).all():
                return casted

            raise TypeError(
                f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
            ) from err


class IntegerArray(NumericArray):
    """
    Array of integer (optional missing) values.

    Uses :attr:`pandas.NA` as the missing value.

    .. warning::

       IntegerArray is currently experimental, and its API or internal
       implementation may change without warning.

    We represent an IntegerArray with 2 numpy arrays:

    - data: contains a numpy integer array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct an IntegerArray from generic array-like input, use
    :func:`pandas.array` with one of the integer dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d integer-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    IntegerArray

    Examples
    --------
    Create an IntegerArray with :func:`pandas.array`.

    >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
    >>> int_array
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([1, None, 3], dtype='Int32')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    >>> pd.array([1, None, 3], dtype='UInt16')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: UInt16
    """

    _dtype_cls = IntegerDtype

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = 1
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "int", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = 1  # type: ignore[assignment]
    _falsey_value = 0  # type: ignore[assignment]


_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.

Uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`.

Attributes
----------
None

Methods
-------
None

Examples
--------
For Int8Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int8Dtype())
>>> ser.dtype
Int8Dtype()

For Int16Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int16Dtype())
>>> ser.dtype
Int16Dtype()

For Int32Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int32Dtype())
>>> ser.dtype
Int32Dtype()

For Int64Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int64Dtype())
>>> ser.dtype
Int64Dtype()

For UInt8Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt8Dtype())
>>> ser.dtype
UInt8Dtype()

For UInt16Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt16Dtype())
>>> ser.dtype
UInt16Dtype()

For UInt32Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt32Dtype())
>>> ser.dtype
UInt32Dtype()

For UInt64Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt64Dtype())
>>> ser.dtype
UInt64Dtype()
"""

# create the Dtype


@register_extension_dtype
class Int8Dtype(IntegerDtype):
    type = np.int8
    name: ClassVar[str] = "Int8"
    __doc__ = _dtype_docstring.format(dtype="int8")


@register_extension_dtype
class Int16Dtype(IntegerDtype):
    type = np.int16
    name: ClassVar[str] = "Int16"
    __doc__ = _dtype_docstring.format(dtype="int16")


@register_extension_dtype
class Int32Dtype(IntegerDtype):
    type = np.int32
    name: ClassVar[str] = "Int32"
    __doc__ = _dtype_docstring.format(dtype="int32")


@register_extension_dtype
class Int64Dtype(IntegerDtype):
    type = np.int64
    name: ClassVar[str] = "Int64"
    __doc__ = _dtype_docstring.format(dtype="int64")


@register_extension_dtype
class UInt8Dtype(IntegerDtype):
    type = np.uint8
    name: ClassVar[str] = "UInt8"
    __doc__ = _dtype_docstring.format(dtype="uint8")


@register_extension_dtype
class UInt16Dtype(IntegerDtype):
    type = np.uint16
    name: ClassVar[str] = "UInt16"
    __doc__ = _dtype_docstring.format(dtype="uint16")


@register_extension_dtype
class UInt32Dtype(IntegerDtype):
    type = np.uint32
    name: ClassVar[str] = "UInt32"
    __doc__ = _dtype_docstring.format(dtype="uint32")


@register_extension_dtype
class UInt64Dtype(IntegerDtype):
    type = np.uint64
    name: ClassVar[str] = "UInt64"
    __doc__ = _dtype_docstring.format(dtype="uint64")


NUMPY_INT_TO_DTYPE: dict[np.dtype, IntegerDtype] = {
    np.dtype(np.int8): Int8Dtype(),
    np.dtype(np.int16): Int16Dtype(),
    np.dtype(np.int32): Int32Dtype(),
    np.dtype(np.int64): Int64Dtype(),
    np.dtype(np.uint8): UInt8Dtype(),
    np.dtype(np.uint16): UInt16Dtype(),
    np.dtype(np.uint32): UInt32Dtype(),
    np.dtype(np.uint64): UInt64Dtype(),
}
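Editor's note: IntegerDtype._safe_cast above is what enforces losslessness when float input is handed to an integer dtype; seen through the public constructor:

import pandas as pd

print(pd.array([1.0, 2.0], dtype="Int64"))  # lossless float -> Int64 is accepted
try:
    pd.array([1.5], dtype="Int64")
except TypeError as err:
    print(err)  # cannot safely cast non-equivalent float64 to int64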
1930
lib/python3.11/site-packages/pandas/core/arrays/interval.py
Normal file
File diff suppressed because it is too large
1669
lib/python3.11/site-packages/pandas/core/arrays/masked.py
Normal file
File diff suppressed because it is too large
286
lib/python3.11/site-packages/pandas/core/arrays/numeric.py
Normal file
@ -0,0 +1,286 @@
from __future__ import annotations

import numbers
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
    is_integer_dtype,
    is_string_dtype,
    pandas_dtype,
)

from pandas.core.arrays.masked import (
    BaseMaskedArray,
    BaseMaskedDtype,
)

if TYPE_CHECKING:
    from collections.abc import Mapping

    import pyarrow

    from pandas._typing import (
        Dtype,
        DtypeObj,
        Self,
        npt,
    )


class NumericDtype(BaseMaskedDtype):
    _default_np_dtype: np.dtype
    _checker: Callable[[Any], bool]  # is_foo_dtype

    def __repr__(self) -> str:
        return f"{self.name}Dtype()"

    @cache_readonly
    def is_signed_integer(self) -> bool:
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        return self.kind == "u"

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseMaskedArray:
        """
        Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays.arrow._arrow_utils import (
            pyarrow_array_to_numpy_and_mask,
        )

        array_class = self.construct_array_type()

        pyarrow_type = pyarrow.from_numpy_dtype(self.type)
        if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null(
            array.type
        ):
# test_from_arrow_type_error raise for string, but allow
|
||||
# through itemsize conversion GH#31896
|
||||
rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
|
||||
if rt_dtype.kind not in "iuf":
|
||||
# Could allow "c" or potentially disallow float<->int conversion,
|
||||
# but at the moment we specifically test that uint<->int works
|
||||
raise TypeError(
|
||||
f"Expected array of {self} type, got {array.type} instead"
|
||||
)
|
||||
|
||||
array = array.cast(pyarrow_type)
|
||||
|
||||
if isinstance(array, pyarrow.ChunkedArray):
|
||||
# TODO this "if" can be removed when requiring pyarrow >= 10.0, which fixed
|
||||
# combine_chunks for empty arrays https://github.com/apache/arrow/pull/13757
|
||||
if array.num_chunks == 0:
|
||||
array = pyarrow.array([], type=array.type)
|
||||
else:
|
||||
array = array.combine_chunks()
|
||||
|
||||
data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
|
||||
return array_class(data.copy(), ~mask, copy=False)
|
||||
|
||||
@classmethod
|
||||
def _get_dtype_mapping(cls) -> Mapping[np.dtype, NumericDtype]:
|
||||
raise AbstractMethodError(cls)
|
||||
|
||||
@classmethod
|
||||
def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
|
||||
"""
|
||||
Convert a string representation or a numpy dtype to NumericDtype.
|
||||
"""
|
||||
if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
|
||||
# Avoid DeprecationWarning from NumPy about np.dtype("Int64")
|
||||
# https://github.com/numpy/numpy/pull/7476
|
||||
dtype = dtype.lower()
|
||||
|
||||
if not isinstance(dtype, NumericDtype):
|
||||
mapping = cls._get_dtype_mapping()
|
||||
try:
|
||||
dtype = mapping[np.dtype(dtype)]
|
||||
except KeyError as err:
|
||||
raise ValueError(f"invalid dtype specified {dtype}") from err
|
||||
return dtype
|
||||
|
||||
@classmethod
|
||||
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
|
||||
"""
|
||||
Safely cast the values to the given dtype.
|
||||
|
||||
"safe" in this context means the casting is lossless.
|
||||
"""
|
||||
raise AbstractMethodError(cls)
|
||||
|
||||
|
||||
def _coerce_to_data_and_mask(
|
||||
values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype
|
||||
):
|
||||
checker = dtype_cls._checker
|
||||
|
||||
mask = None
|
||||
inferred_type = None
|
||||
|
||||
if dtype is None and hasattr(values, "dtype"):
|
||||
if checker(values.dtype):
|
||||
dtype = values.dtype
|
||||
|
||||
if dtype is not None:
|
||||
dtype = dtype_cls._standardize_dtype(dtype)
|
||||
|
||||
cls = dtype_cls.construct_array_type()
|
||||
if isinstance(values, cls):
|
||||
values, mask = values._data, values._mask
|
||||
if dtype is not None:
|
||||
values = values.astype(dtype.numpy_dtype, copy=False)
|
||||
|
||||
if copy:
|
||||
values = values.copy()
|
||||
mask = mask.copy()
|
||||
return values, mask, dtype, inferred_type
|
||||
|
||||
original = values
|
||||
if not copy:
|
||||
values = np.asarray(values)
|
||||
else:
|
||||
values = np.array(values, copy=copy)
|
||||
inferred_type = None
|
||||
if values.dtype == object or is_string_dtype(values.dtype):
|
||||
inferred_type = lib.infer_dtype(values, skipna=True)
|
||||
if inferred_type == "boolean" and dtype is None:
|
||||
name = dtype_cls.__name__.strip("_")
|
||||
raise TypeError(f"{values.dtype} cannot be converted to {name}")
|
||||
|
||||
elif values.dtype.kind == "b" and checker(dtype):
|
||||
if not copy:
|
||||
values = np.asarray(values, dtype=default_dtype)
|
||||
else:
|
||||
values = np.array(values, dtype=default_dtype, copy=copy)
|
||||
|
||||
elif values.dtype.kind not in "iuf":
|
||||
name = dtype_cls.__name__.strip("_")
|
||||
raise TypeError(f"{values.dtype} cannot be converted to {name}")
|
||||
|
||||
if values.ndim != 1:
|
||||
raise TypeError("values must be a 1D list-like")
|
||||
|
||||
if mask is None:
|
||||
if values.dtype.kind in "iu":
|
||||
# fastpath
|
||||
mask = np.zeros(len(values), dtype=np.bool_)
|
||||
else:
|
||||
mask = libmissing.is_numeric_na(values)
|
||||
else:
|
||||
assert len(mask) == len(values)
|
||||
|
||||
if mask.ndim != 1:
|
||||
raise TypeError("mask must be a 1D list-like")
|
||||
|
||||
# infer dtype if needed
|
||||
if dtype is None:
|
||||
dtype = default_dtype
|
||||
else:
|
||||
dtype = dtype.numpy_dtype
|
||||
|
||||
if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
|
||||
if mask.all():
|
||||
values = np.ones(values.shape, dtype=dtype)
|
||||
else:
|
||||
idx = np.nanargmax(values)
|
||||
if int(values[idx]) != original[idx]:
|
||||
# We have ints that lost precision during the cast.
|
||||
inferred_type = lib.infer_dtype(original, skipna=True)
|
||||
if (
|
||||
inferred_type not in ["floating", "mixed-integer-float"]
|
||||
and not mask.any()
|
||||
):
|
||||
values = np.asarray(original, dtype=dtype)
|
||||
else:
|
||||
values = np.asarray(original, dtype="object")
|
||||
|
||||
# we copy as need to coerce here
|
||||
if mask.any():
|
||||
values = values.copy()
|
||||
values[mask] = cls._internal_fill_value
|
||||
if inferred_type in ("string", "unicode"):
|
||||
# casts from str are always safe since they raise
|
||||
# a ValueError if the str cannot be parsed into a float
|
||||
values = values.astype(dtype, copy=copy)
|
||||
else:
|
||||
values = dtype_cls._safe_cast(values, dtype, copy=False)
|
||||
|
||||
return values, mask, dtype, inferred_type
|
||||
|
||||
|
||||
class NumericArray(BaseMaskedArray):
|
||||
"""
|
||||
Base class for IntegerArray and FloatingArray.
|
||||
"""
|
||||
|
||||
_dtype_cls: type[NumericDtype]
|
||||
|
||||
def __init__(
|
||||
self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
|
||||
) -> None:
|
||||
checker = self._dtype_cls._checker
|
||||
if not (isinstance(values, np.ndarray) and checker(values.dtype)):
|
||||
descr = (
|
||||
"floating"
|
||||
if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap]
|
||||
else "integer"
|
||||
)
|
||||
raise TypeError(
|
||||
f"values should be {descr} numpy array. Use "
|
||||
"the 'pd.array' function instead"
|
||||
)
|
||||
if values.dtype == np.float16:
|
||||
# If we don't raise here, then accessing self.dtype would raise
|
||||
raise TypeError("FloatingArray does not support np.float16 dtype.")
|
||||
|
||||
super().__init__(values, mask, copy=copy)
|
||||
|
||||
@cache_readonly
|
||||
def dtype(self) -> NumericDtype:
|
||||
mapping = self._dtype_cls._get_dtype_mapping()
|
||||
return mapping[self._data.dtype]
|
||||
|
||||
@classmethod
|
||||
def _coerce_to_array(
|
||||
cls, value, *, dtype: DtypeObj, copy: bool = False
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
dtype_cls = cls._dtype_cls
|
||||
default_dtype = dtype_cls._default_np_dtype
|
||||
values, mask, _, _ = _coerce_to_data_and_mask(
|
||||
value, dtype, copy, dtype_cls, default_dtype
|
||||
)
|
||||
return values, mask
|
||||
|
||||
@classmethod
|
||||
def _from_sequence_of_strings(
|
||||
cls, strings, *, dtype: Dtype | None = None, copy: bool = False
|
||||
) -> Self:
|
||||
from pandas.core.tools.numeric import to_numeric
|
||||
|
||||
scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
|
||||
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
|
||||
|
||||
_HANDLED_TYPES = (np.ndarray, numbers.Number)
|
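
# Illustrative sketch (editor's note, not part of the diff): the coercion path
# above is what pd.array goes through for nullable numeric dtypes. Assuming
# pandas is installed:
#
#     import pandas as pd
#
#     arr = pd.array([1, 2, None], dtype="Int64")   # -> IntegerArray + mask
#     assert arr.isna().tolist() == [False, False, True]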
574
lib/python3.11/site-packages/pandas/core/arrays/numpy_.py
Normal file
@ -0,0 +1,574 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs import is_supported_dtype
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.missing import isna

from pandas.core import (
    arraylike,
    missing,
    nanops,
    ops,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.strings.object_array import ObjectStringArrayMixin

if TYPE_CHECKING:
    from collections.abc import Callable

    from pandas._typing import (
        AxisInt,
        Dtype,
        FillnaOptions,
        InterpolateOptions,
        NpDtype,
        Scalar,
        Self,
        npt,
    )

    from pandas import Index


# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
class NumpyExtensionArray(  # type: ignore[misc]
    OpsMixin,
    NDArrayBackedExtensionArray,
    ObjectStringArrayMixin,
):
    """
    A pandas ExtensionArray for NumPy data.

    This is mostly for internal compatibility, and is not especially
    useful on its own.

    Parameters
    ----------
    values : ndarray
        The NumPy ndarray to wrap. Must be 1-dimensional.
    copy : bool, default False
        Whether to copy `values`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3]))
    <NumpyExtensionArray>
    [0, 1, 2, 3]
    Length: 4, dtype: int64
    """

    # If you're wondering why pd.Series(cls) doesn't put the array in an
    # ExtensionBlock, search for `ABCNumpyExtensionArray`. We check for
    # that _typ to ensure that users don't unnecessarily use EAs inside
    # pandas internals, which turns off things like block consolidation.
    _typ = "npy_extension"
    __array_priority__ = 1000
    _ndarray: np.ndarray
    _dtype: NumpyEADtype
    _internal_fill_value = np.nan

    # ------------------------------------------------------------------------
    # Constructors

    def __init__(
        self, values: np.ndarray | NumpyExtensionArray, copy: bool = False
    ) -> None:
        if isinstance(values, type(self)):
            values = values._ndarray
        if not isinstance(values, np.ndarray):
            raise ValueError(
                f"'values' must be a NumPy array, not {type(values).__name__}"
            )

        if values.ndim == 0:
            # Technically we support 2, but do not advertise that fact.
            raise ValueError("NumpyExtensionArray must be 1-dimensional.")

        if copy:
            values = values.copy()

        dtype = NumpyEADtype(values.dtype)
        super().__init__(values, dtype)

    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
    ) -> NumpyExtensionArray:
        if isinstance(dtype, NumpyEADtype):
            dtype = dtype._dtype

        # error: Argument "dtype" to "asarray" has incompatible type
        # "Union[ExtensionDtype, str, dtype[Any], dtype[floating[_64Bit]], Type[object],
        # None]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
        # _DTypeDict, Tuple[Any, Any]]]"
        result = np.asarray(scalars, dtype=dtype)  # type: ignore[arg-type]
        if (
            result.ndim > 1
            and not hasattr(scalars, "dtype")
            and (dtype is None or dtype == object)
        ):
            # e.g. list-of-tuples
            result = construct_1d_object_array_from_listlike(scalars)

        if copy and result is scalars:
            result = result.copy()
        return cls(result)

    # ------------------------------------------------------------------------
    # Data

    @property
    def dtype(self) -> NumpyEADtype:
        return self._dtype

    # ------------------------------------------------------------------------
    # NumPy Array Interface

    def __array__(
        self, dtype: NpDtype | None = None, copy: bool | None = None
    ) -> np.ndarray:
        if copy is not None:
            # Note: branch avoids `copy=None` for NumPy 1.x support
            return np.array(self._ndarray, dtype=dtype, copy=copy)
        return np.asarray(self._ndarray, dtype=dtype)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        # Lightly modified version of
        # https://numpy.org/doc/stable/reference/generated/numpy.lib.mixins.NDArrayOperatorsMixin.html
        # The primary modification is not boxing scalar return values
        # in NumpyExtensionArray, since pandas' ExtensionArrays are 1-d.
        out = kwargs.get("out", ())

        result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. test_ufunc_unary
            return arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        # Defer to the implementation of the ufunc on unwrapped values.
        inputs = tuple(
            x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in inputs
        )
        if out:
            kwargs["out"] = tuple(
                x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in out
            )
        result = getattr(ufunc, method)(*inputs, **kwargs)

        if ufunc.nout > 1:
            # multiple return values; re-box array-like results
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        elif method == "reduce":
            if isinstance(result, np.ndarray):
                # e.g. test_np_reduce_2d
                return type(self)(result)

            # e.g. test_np_max_nested_tuples
            return result
        else:
            # one return value; re-box array-like results
            return type(self)(result)

    # ------------------------------------------------------------------------
    # Pandas ExtensionArray Interface

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self

        result = astype_array(self._ndarray, dtype=dtype, copy=copy)
        return result

    def isna(self) -> np.ndarray:
        return isna(self._ndarray)

    def _validate_scalar(self, fill_value):
        if fill_value is None:
            # Primarily for subclasses
            fill_value = self.dtype.na_value
        return fill_value

    def _values_for_factorize(self) -> tuple[np.ndarray, float | None]:
        if self.dtype.kind in "iub":
            fv = None
        else:
            fv = np.nan
        return self._ndarray, fv

    # Base EA class (and all other EA classes) don't have limit_area keyword
    # This can be removed here as well when the interpolate ffill/bfill method
    # deprecation is enforced
    def _pad_or_backfill(
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        """
        ffill or bfill along axis=0.
        """
        if copy:
            out_data = self._ndarray.copy()
        else:
            out_data = self._ndarray

        meth = missing.clean_fill_method(method)
        missing.pad_or_backfill_inplace(
            out_data.T,
            method=meth,
            axis=0,
            limit=limit,
            limit_area=limit_area,
        )

        if not copy:
            return self
        return type(self)._simple_new(out_data, dtype=self.dtype)

    def interpolate(
        self,
        *,
        method: InterpolateOptions,
        axis: int,
        index: Index,
        limit,
        limit_direction,
        limit_area,
        copy: bool,
        **kwargs,
    ) -> Self:
        """
        See NDFrame.interpolate.__doc__.
        """
        # NB: we return type(self) even if copy=False
        if not self.dtype._is_numeric:
            raise TypeError(f"Cannot interpolate with {self.dtype} dtype")

        if not copy:
            out_data = self._ndarray
        else:
            out_data = self._ndarray.copy()

        # TODO: assert we have floating dtype?
        missing.interpolate_2d_inplace(
            out_data,
            method=method,
            axis=axis,
            index=index,
            limit=limit,
            limit_direction=limit_direction,
            limit_area=limit_area,
            **kwargs,
        )
        if not copy:
            return self
        return type(self)._simple_new(out_data, dtype=self.dtype)

    # ------------------------------------------------------------------------
    # Reductions

    def any(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_any((), {"out": out, "keepdims": keepdims})
        result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def all(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_all((), {"out": out, "keepdims": keepdims})
        result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def min(
        self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        nv.validate_min((), kwargs)
        result = nanops.nanmin(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(
        self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        nv.validate_max((), kwargs)
        result = nanops.nanmax(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def sum(
        self,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        min_count: int = 0,
        **kwargs,
    ) -> Scalar:
        nv.validate_sum((), kwargs)
        result = nanops.nansum(
            self._ndarray, axis=axis, skipna=skipna, min_count=min_count
        )
        return self._wrap_reduction_result(axis, result)

    def prod(
        self,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        min_count: int = 0,
        **kwargs,
    ) -> Scalar:
        nv.validate_prod((), kwargs)
        result = nanops.nanprod(
            self._ndarray, axis=axis, skipna=skipna, min_count=min_count
        )
        return self._wrap_reduction_result(axis, result)

    def mean(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims})
        result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def median(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        overwrite_input: bool = False,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_median(
            (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}
        )
        result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def std(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
        )
        result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def var(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var"
        )
        result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def sem(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem"
        )
        result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def kurt(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt"
        )
        result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def skew(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew"
        )
        result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    # ------------------------------------------------------------------------
    # Additional Methods

    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        mask = self.isna()
        if na_value is not lib.no_default and mask.any():
            result = self._ndarray.copy()
            result[mask] = na_value
        else:
            result = self._ndarray

        result = np.asarray(result, dtype=dtype)

        if copy and result is self._ndarray:
            result = result.copy()

        return result

    # ------------------------------------------------------------------------
    # Ops

    def __invert__(self) -> NumpyExtensionArray:
        return type(self)(~self._ndarray)

    def __neg__(self) -> NumpyExtensionArray:
        return type(self)(-self._ndarray)

    def __pos__(self) -> NumpyExtensionArray:
        return type(self)(+self._ndarray)

    def __abs__(self) -> NumpyExtensionArray:
        return type(self)(abs(self._ndarray))

    def _cmp_method(self, other, op):
        if isinstance(other, NumpyExtensionArray):
            other = other._ndarray

        other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
        pd_op = ops.get_array_op(op)
        other = ensure_wrapped_if_datetimelike(other)
        result = pd_op(self._ndarray, other)

        if op is divmod or op is ops.rdivmod:
            a, b = result
            if isinstance(a, np.ndarray):
                # for e.g. op vs TimedeltaArray, we may already
                # have an ExtensionArray, in which case we do not wrap
                return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b)
            return a, b

        if isinstance(result, np.ndarray):
            # for e.g. multiplication vs TimedeltaArray, we may already
            # have an ExtensionArray, in which case we do not wrap
            return self._wrap_ndarray_result(result)
        return result

    _arith_method = _cmp_method

    def _wrap_ndarray_result(self, result: np.ndarray):
        # If we have timedelta64[ns] result, return a TimedeltaArray instead
        # of a NumpyExtensionArray
        if result.dtype.kind == "m" and is_supported_dtype(result.dtype):
            from pandas.core.arrays import TimedeltaArray

            return TimedeltaArray._simple_new(result, dtype=result.dtype)
        return type(self)(result)

    def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
        # NEP 51: https://github.com/numpy/numpy/pull/22449
        if self.dtype.kind in "SU":
            return "'{}'".format
        elif self.dtype == "object":
            return repr
        else:
            return str
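
# Illustrative sketch (editor's note, not part of the diff): wrapping an
# ndarray and exercising the nanops-backed reductions defined above. Assuming
# pandas and numpy are installed:
#
#     import numpy as np
#     import pandas as pd
#
#     arr = pd.arrays.NumpyExtensionArray(np.array([1.0, np.nan, 3.0]))
#     arr.mean()              # 2.0 -- NaN skipped via nanops
#     arr.mean(skipna=False)  # nan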
1331
lib/python3.11/site-packages/pandas/core/arrays/period.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,19 @@
from pandas.core.arrays.sparse.accessor import (
    SparseAccessor,
    SparseFrameAccessor,
)
from pandas.core.arrays.sparse.array import (
    BlockIndex,
    IntIndex,
    SparseArray,
    make_sparse_index,
)

__all__ = [
    "BlockIndex",
    "IntIndex",
    "make_sparse_index",
    "SparseAccessor",
    "SparseArray",
    "SparseFrameAccessor",
]
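
# Usage sketch (editor's note, not part of the diff): the re-exports above make
# these names importable from the subpackage root:
#
#     from pandas.core.arrays.sparse import SparseArray
#
#     sa = SparseArray([0, 0, 1, 2])
#     sa.density  # 0.5 -- two of four values differ from the fill value 0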
@ -0,0 +1,414 @@
"""Sparse accessor"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.dtypes import SparseDtype

from pandas.core.accessor import (
    PandasDelegate,
    delegate_names,
)
from pandas.core.arrays.sparse.array import SparseArray

if TYPE_CHECKING:
    from pandas import (
        DataFrame,
        Series,
    )


class BaseAccessor:
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None) -> None:
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        raise NotImplementedError


@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for sparse Series data, e.g. created from other sparse matrix data types.

    Examples
    --------
    >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
    >>> ser.sparse.density
    0.6
    >>> ser.sparse.sp_values
    array([2, 2, 2])
    """

    def _validate(self, data):
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name: str, *args, **kwargs):
        return getattr(self._parent.array, name)

    def _delegate_method(self, name: str, *args, **kwargs):
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            raise ValueError

    @classmethod
    def from_coo(cls, A, dense_index: bool = False) -> Series:
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        result = coo_to_sparse_series(A, dense_index=dense_index)
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self) -> Series:
        """
        Convert a Series from sparse values to dense.

        Returns
        -------
        Series:
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
            copy=False,
        )


class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 2, 0, 0],
    ...                    "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
    >>> df.sparse.density
    0.5
    """

    def _validate(self, data):
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3, dtype=float)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0    0    0
        1    0  1.0    0
        2    0    0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False).
        # Indices may already be sorted in scipy, in which case this adds
        # a small overhead.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self) -> DataFrame:
        """
        Convert a DataFrame with sparse values to dense.

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense() for k, v in self._parent.items()}
        return DataFrame(data, index=self._parent.index, columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        Returns
        -------
        scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.to_coo()
        <COOrdinate sparse matrix of dtype 'int64'
            with 2 stored elements and shape (4, 1)>
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            row = sp_arr.sp_index.indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.density
        0.5
        """
        tmp = np.mean([column.array.density for _, column in self._parent.items()])
        return tmp

    @staticmethod
    def _prep_index(data, index, columns):
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns
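
# Round-trip sketch (editor's note, not part of the diff), assuming scipy is
# installed:
#
#     import pandas as pd
#     import scipy.sparse
#
#     mat = scipy.sparse.eye(3, format="csc")
#     df = pd.DataFrame.sparse.from_spmatrix(mat)   # sparse-backed DataFrame
#     coo = df.sparse.to_coo()                      # back to a scipy COO matrix
#     assert (coo.toarray() == mat.toarray()).all()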
1945
lib/python3.11/site-packages/pandas/core/arrays/sparse/array.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,207 @@
"""
Interaction with scipy.sparse matrices.

Currently only includes to_coo helpers.
"""
from __future__ import annotations

from typing import TYPE_CHECKING

from pandas._libs import lib

from pandas.core.dtypes.missing import notna

from pandas.core.algorithms import factorize
from pandas.core.indexes.api import MultiIndex
from pandas.core.series import Series

if TYPE_CHECKING:
    from collections.abc import Iterable

    import numpy as np
    import scipy.sparse

    from pandas._typing import (
        IndexLabel,
        npt,
    )


def _check_is_partition(parts: Iterable, whole: Iterable):
    whole = set(whole)
    parts = [set(x) for x in parts]
    if set.intersection(*parts) != set():
        raise ValueError("Is not a partition because intersection is not null.")
    if set.union(*parts) != whole:
        raise ValueError("Is not a partition because union is not the whole.")


def _levels_to_axis(
    ss,
    levels: tuple[int] | list[int],
    valid_ilocs: npt.NDArray[np.intp],
    sort_labels: bool = False,
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
    """
    For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
    where `ax_coords` are the coordinates along one of the two axes of the
    destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
    which correspond to these coordinates.

    Parameters
    ----------
    ss : Series
    levels : tuple/list
    valid_ilocs : numpy.ndarray
        Array of integer positions of valid values for the sparse matrix in ss.
    sort_labels : bool, default False
        Sort the axis labels before forming the sparse matrix. When `levels`
        refers to a single level, set to True for a faster execution.

    Returns
    -------
    ax_coords : numpy.ndarray (axis coordinates)
    ax_labels : list (axis labels)
    """
    # Since the labels are sorted in `Index.levels`, when we wish to sort and
    # there is only one level of the MultiIndex for this axis, the desired
    # output can be obtained in the following simpler, more efficient way.
    if sort_labels and len(levels) == 1:
        ax_coords = ss.index.codes[levels[0]][valid_ilocs]
        ax_labels = ss.index.levels[levels[0]]

    else:
        levels_values = lib.fast_zip(
            [ss.index.get_level_values(lvl).to_numpy() for lvl in levels]
        )
        codes, ax_labels = factorize(levels_values, sort=sort_labels)
        ax_coords = codes[valid_ilocs]

    ax_labels = ax_labels.tolist()
    return ax_coords, ax_labels


def _to_ijv(
    ss,
    row_levels: tuple[int] | list[int] = (0,),
    column_levels: tuple[int] | list[int] = (1,),
    sort_labels: bool = False,
) -> tuple[
    np.ndarray,
    npt.NDArray[np.intp],
    npt.NDArray[np.intp],
    list[IndexLabel],
    list[IndexLabel],
]:
    """
    For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
    jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
    constructor, and ilabels and jlabels are the row and column labels
    respectively.

    Parameters
    ----------
    ss : Series
    row_levels : tuple/list
    column_levels : tuple/list
    sort_labels : bool, default False
        Sort the row and column labels before forming the sparse matrix.
        When `row_levels` and/or `column_levels` refer to a single level,
        set to `True` for a faster execution.

    Returns
    -------
    values : numpy.ndarray
        Valid values to populate a sparse matrix, extracted from
        ss.
    i_coords : numpy.ndarray (row coordinates of the values)
    j_coords : numpy.ndarray (column coordinates of the values)
    i_labels : list (row labels)
    j_labels : list (column labels)
    """
    # index and column levels must be a partition of the index
    _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
    # From the sparse Series, get the integer indices and data for valid sparse
    # entries.
    sp_vals = ss.array.sp_values
    na_mask = notna(sp_vals)
    values = sp_vals[na_mask]
    valid_ilocs = ss.array.sp_index.indices[na_mask]

    i_coords, i_labels = _levels_to_axis(
        ss, row_levels, valid_ilocs, sort_labels=sort_labels
    )

    j_coords, j_labels = _levels_to_axis(
        ss, column_levels, valid_ilocs, sort_labels=sort_labels
    )

    return values, i_coords, j_coords, i_labels, j_labels


def sparse_series_to_coo(
    ss: Series,
    row_levels: Iterable[int] = (0,),
    column_levels: Iterable[int] = (1,),
    sort_labels: bool = False,
) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
    """
    Convert a sparse Series to a scipy.sparse.coo_matrix using index
    levels row_levels, column_levels as the row and column
    labels respectively. Returns the sparse_matrix, row and column labels.
    """
    import scipy.sparse

    if ss.index.nlevels < 2:
        raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
    if not ss.index.is_unique:
        raise ValueError(
            "Duplicate index entries are not allowed in to_coo transformation."
        )

    # to keep things simple, only rely on integer indexing (not labels)
    row_levels = [ss.index._get_level_number(x) for x in row_levels]
    column_levels = [ss.index._get_level_number(x) for x in column_levels]

    v, i, j, rows, columns = _to_ijv(
        ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
    )
    sparse_matrix = scipy.sparse.coo_matrix(
        (v, (i, j)), shape=(len(rows), len(columns))
    )
    return sparse_matrix, rows, columns


def coo_to_sparse_series(
    A: scipy.sparse.coo_matrix, dense_index: bool = False
) -> Series:
    """
    Convert a scipy.sparse.coo_matrix to a Series with type sparse.

    Parameters
    ----------
    A : scipy.sparse.coo_matrix
    dense_index : bool, default False

    Returns
    -------
    Series

    Raises
    ------
    TypeError if A is not a coo_matrix
    """
    from pandas import SparseDtype

    try:
        ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)), copy=False)
    except AttributeError as err:
        raise TypeError(
            f"Expected coo_matrix. Got {type(A).__name__} instead."
        ) from err
    ser = ser.sort_index()
    ser = ser.astype(SparseDtype(ser.dtype))
    if dense_index:
        ind = MultiIndex.from_product([A.row, A.col])
        ser = ser.reindex(ind)
    return ser
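
# Behavioral note (editor's sketch, not part of the diff): row_levels and
# column_levels must partition the MultiIndex levels, which is what
# _check_is_partition above enforces:
#
#     _check_is_partition([(0,), (1,)], range(2))    # ok: {0} | {1} == {0, 1}
#     _check_is_partition([(0, 1), (1,)], range(2))  # raises ValueError (overlap)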
1131
lib/python3.11/site-packages/pandas/core/arrays/string_.py
Normal file
File diff suppressed because it is too large
495
lib/python3.11/site-packages/pandas/core/arrays/string_arrow.py
Normal file
@ -0,0 +1,495 @@
from __future__ import annotations

import operator
import re
from typing import (
    TYPE_CHECKING,
    Callable,
    Union,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas.compat import (
    pa_version_under10p1,
    pa_version_under13p0,
    pa_version_under16p0,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    is_scalar,
    pandas_dtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import Float64Dtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.numeric import NumericDtype
from pandas.core.arrays.string_ import (
    BaseStringArray,
    StringDtype,
)
from pandas.core.strings.object_array import ObjectStringArrayMixin

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc


if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        ArrayLike,
        Dtype,
        Self,
        npt,
    )

    from pandas import Series


ArrowStringScalarOrNAT = Union[str, libmissing.NAType]


def _chk_pyarrow_available() -> None:
    if pa_version_under10p1:
        msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
        raise ImportError(msg)


def _is_string_view(typ):
    return not pa_version_under16p0 and pa.types.is_string_view(typ)


# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
# ObjectStringArrayMixin because we want to have the object-dtype based methods as
# fallback for the ones that pyarrow doesn't yet support


class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringArray):
    """
    Extension array for string data in a ``pyarrow.ChunkedArray``.

    .. warning::

       ArrowStringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray
        The array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating an ArrowStringArray.
    Series.str
        The string methods are available on Series backed by
        an ArrowStringArray.

    Notes
    -----
    ArrowStringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
    <ArrowStringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string
    """

    # error: Incompatible types in assignment (expression has type "StringDtype",
    # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
    _dtype: StringDtype  # type: ignore[assignment]
    _storage = "pyarrow"
    _na_value: libmissing.NAType | float = libmissing.NA

    def __init__(self, values) -> None:
        _chk_pyarrow_available()
        if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
            pa.types.is_string(values.type)
            or _is_string_view(values.type)
            or (
                pa.types.is_dictionary(values.type)
                and (
                    pa.types.is_string(values.type.value_type)
                    or pa.types.is_large_string(values.type.value_type)
                    or _is_string_view(values.type.value_type)
                )
            )
        ):
            values = pc.cast(values, pa.large_string())

        super().__init__(values)
        self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)

        if not pa.types.is_large_string(self._pa_array.type):
            raise ValueError(
                "ArrowStringArray requires a PyArrow (chunked) array of "
                "large_string type"
            )

    @classmethod
    def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
        pa_scalar = super()._box_pa_scalar(value, pa_type)
        if pa.types.is_string(pa_scalar.type) and pa_type is None:
            pa_scalar = pc.cast(pa_scalar, pa.large_string())
        return pa_scalar

    @classmethod
    def _box_pa_array(
        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
    ) -> pa.Array | pa.ChunkedArray:
        pa_array = super()._box_pa_array(value, pa_type)
        if pa.types.is_string(pa_array.type) and pa_type is None:
            pa_array = pc.cast(pa_array, pa.large_string())
        return pa_array

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._pa_array)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        from pandas.core.arrays.masked import BaseMaskedArray

        _chk_pyarrow_available()

        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype in ensure_string_array and
            # numerical issues with Float32Dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            return cls(pa.array(result, mask=na_values, type=pa.large_string()))
        elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
            return cls(pc.cast(scalars, pa.large_string()))

        # convert non-na-likes to str
        result = lib.ensure_string_array(scalars, copy=copy)
        return cls(pa.array(result, type=pa.large_string(), from_pandas=True))

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, dtype: Dtype | None = None, copy: bool = False
    ):
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @property
    def dtype(self) -> StringDtype:  # type: ignore[override]
        """
        An instance of 'string[pyarrow]'.
        """
        return self._dtype

    def insert(self, loc: int, item) -> ArrowStringArray:
        if self.dtype.na_value is np.nan and item is np.nan:
            item = libmissing.NA
        if not isinstance(item, str) and item is not libmissing.NA:
            raise TypeError(
                f"Invalid value '{item}' for dtype 'str'. Value should be a "
                f"string or missing value, got '{type(item).__name__}' instead."
            )
        return super().insert(loc, item)

    def _convert_bool_result(self, values, na=lib.no_default, method_name=None):
        if na is not lib.no_default and not isna(na) and not isinstance(na, bool):
            # GH#59561
            warnings.warn(
                f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated "
                "and will raise in a future version.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            na = bool(na)

        if self.dtype.na_value is np.nan:
            if na is lib.no_default or isna(na):
                # NaN propagates as False
                values = values.fill_null(False)
            else:
                values = values.fill_null(na)
            return values.to_numpy()
        else:
            if na is not lib.no_default and not isna(
                na
            ):  # pyright: ignore [reportGeneralTypeIssues]
                values = values.fill_null(na)

        return BooleanDtype().__from_arrow__(values)

    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        if is_scalar(value):
            if isna(value):
                value = None
            elif not isinstance(value, str):
                raise TypeError(
                    f"Invalid value '{value}' for dtype 'str'. Value should be a "
                    f"string or missing value, got '{type(value).__name__}' instead."
                )
        else:
            value = np.array(value, dtype=object, copy=True)
            value[isna(value)] = None
            for v in value:
                if not (v is None or isinstance(v, str)):
                    raise TypeError(
                        "Invalid value for dtype 'str'. Value should be a "
                        "string or missing value (or array of those)."
                    )
        return super()._maybe_convert_setitem_value(value)

    def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
        value_set = [
            pa_scalar.as_py()
            for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
            if pa_scalar.type in (pa.string(), pa.null(), pa.large_string())
        ]

        # short-circuit to return all False array.
        if not len(value_set):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(
            self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type)
        )
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif isinstance(dtype, NumericDtype):
            data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
            return dtype.__from_arrow__(data)
        elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
            return self.to_numpy(dtype=dtype, na_value=np.nan)

        return super().astype(dtype, copy=copy)

    @property
    def _data(self):
        # dask accesses ._data directly
        warnings.warn(
            f"{type(self).__name__}._data is deprecated and will be removed "
            "in a future version, use ._pa_array instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._pa_array

    # ------------------------------------------------------------------------
    # String methods interface

    _str_isalnum = ArrowStringArrayMixin._str_isalnum
    _str_isalpha = ArrowStringArrayMixin._str_isalpha
    _str_isdecimal = ArrowStringArrayMixin._str_isdecimal
    _str_isdigit = ArrowStringArrayMixin._str_isdigit
    _str_islower = ArrowStringArrayMixin._str_islower
    _str_isnumeric = ArrowStringArrayMixin._str_isnumeric
    _str_isspace = ArrowStringArrayMixin._str_isspace
    _str_istitle = ArrowStringArrayMixin._str_istitle
    _str_isupper = ArrowStringArrayMixin._str_isupper

    _str_map = BaseStringArray._str_map
    _str_startswith = ArrowStringArrayMixin._str_startswith
    _str_endswith = ArrowStringArrayMixin._str_endswith
    _str_pad = ArrowStringArrayMixin._str_pad
    _str_match = ArrowStringArrayMixin._str_match
    _str_fullmatch = ArrowStringArrayMixin._str_fullmatch
    _str_lower = ArrowStringArrayMixin._str_lower
    _str_upper = ArrowStringArrayMixin._str_upper
    _str_strip = ArrowStringArrayMixin._str_strip
    _str_lstrip = ArrowStringArrayMixin._str_lstrip
    _str_rstrip = ArrowStringArrayMixin._str_rstrip
    _str_removesuffix = ArrowStringArrayMixin._str_removesuffix
    _str_get = ArrowStringArrayMixin._str_get
    _str_capitalize = ArrowStringArrayMixin._str_capitalize
    _str_title = ArrowStringArrayMixin._str_title
    _str_swapcase = ArrowStringArrayMixin._str_swapcase
    _str_slice_replace = ArrowStringArrayMixin._str_slice_replace
    _str_len = ArrowStringArrayMixin._str_len
    _str_slice = ArrowStringArrayMixin._str_slice

    def _str_contains(
        self,
        pat,
        case: bool = True,
        flags: int = 0,
        na=lib.no_default,
        regex: bool = True,
    ):
        if flags:
            return super()._str_contains(pat, case, flags, na, regex)
        if isinstance(pat, re.Pattern):
            pat = pat.pattern

        return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
||||
):
|
||||
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
|
||||
return super()._str_replace(pat, repl, n, case, flags, regex)
|
||||
|
||||
return ArrowStringArrayMixin._str_replace(
|
||||
self, pat, repl, n, case, flags, regex
|
||||
)
|
||||
|
||||
def _str_repeat(self, repeats: int | Sequence[int]):
|
||||
if not isinstance(repeats, int):
|
||||
return super()._str_repeat(repeats)
|
||||
else:
|
||||
return ArrowExtensionArray._str_repeat(self, repeats=repeats)
|
||||
|
||||
def _str_removeprefix(self, prefix: str):
|
||||
if not pa_version_under13p0:
|
||||
return ArrowStringArrayMixin._str_removeprefix(self, prefix)
|
||||
return super()._str_removeprefix(prefix)
|
||||
|
||||
def _str_count(self, pat: str, flags: int = 0):
|
||||
if flags:
|
||||
return super()._str_count(pat, flags)
|
||||
result = pc.count_substring_regex(self._pa_array, pat)
|
||||
return self._convert_int_result(result)
|
||||
|
||||
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
|
||||
if (
|
||||
pa_version_under13p0
|
||||
and not (start != 0 and end is not None)
|
||||
and not (start == 0 and end is None)
|
||||
):
|
||||
# GH#59562
|
||||
return super()._str_find(sub, start, end)
|
||||
return ArrowStringArrayMixin._str_find(self, sub, start, end)
|
||||
|
||||
def _str_get_dummies(self, sep: str = "|"):
|
||||
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)
|
||||
if len(labels) == 0:
|
||||
return np.empty(shape=(0, 0), dtype=np.int64), labels
|
||||
dummies = np.vstack(dummies_pa.to_numpy())
|
||||
return dummies.astype(np.int64, copy=False), labels
|
||||
|
||||
def _convert_int_result(self, result):
|
||||
if self.dtype.na_value is np.nan:
|
||||
if isinstance(result, pa.Array):
|
||||
result = result.to_numpy(zero_copy_only=False)
|
||||
else:
|
||||
result = result.to_numpy()
|
||||
if result.dtype == np.int32:
|
||||
result = result.astype(np.int64)
|
||||
return result
|
||||
|
||||
return Int64Dtype().__from_arrow__(result)
|
||||
|
||||
def _convert_rank_result(self, result):
|
||||
if self.dtype.na_value is np.nan:
|
||||
if isinstance(result, pa.Array):
|
||||
result = result.to_numpy(zero_copy_only=False)
|
||||
else:
|
||||
result = result.to_numpy()
|
||||
return result.astype("float64", copy=False)
|
||||
|
||||
return Float64Dtype().__from_arrow__(result)
|
||||
|
||||
def _reduce(
|
||||
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
|
||||
):
|
||||
if self.dtype.na_value is np.nan and name in ["any", "all"]:
|
||||
if not skipna:
|
||||
nas = pc.is_null(self._pa_array)
|
||||
arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
|
||||
else:
|
||||
arr = pc.not_equal(self._pa_array, "")
|
||||
result = ArrowExtensionArray(arr)._reduce(
|
||||
name, skipna=skipna, keepdims=keepdims, **kwargs
|
||||
)
|
||||
if keepdims:
|
||||
# ArrowExtensionArray will return a length-1 bool[pyarrow] array
|
||||
return result.astype(np.bool_)
|
||||
return result
|
||||
|
||||
if name in ("min", "max", "sum", "argmin", "argmax"):
|
||||
result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
|
||||
else:
|
||||
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
|
||||
|
||||
if name in ("argmin", "argmax") and isinstance(result, pa.Array):
|
||||
return self._convert_int_result(result)
|
||||
elif isinstance(result, pa.Array):
|
||||
return type(self)(result)
|
||||
else:
|
||||
return result
|
||||
|
||||
def value_counts(self, dropna: bool = True) -> Series:
|
||||
result = super().value_counts(dropna=dropna)
|
||||
if self.dtype.na_value is np.nan:
|
||||
res_values = result._values.to_numpy()
|
||||
return result._constructor(
|
||||
res_values, index=result.index, name=result.name, copy=False
|
||||
)
|
||||
return result
|
||||
|
||||
def _cmp_method(self, other, op):
|
||||
if (
|
||||
isinstance(other, (BaseStringArray, ArrowExtensionArray))
|
||||
and self.dtype.na_value is not libmissing.NA
|
||||
and other.dtype.na_value is libmissing.NA
|
||||
):
|
||||
# NA has priority of NaN semantics
|
||||
return NotImplemented
|
||||
|
||||
result = super()._cmp_method(other, op)
|
||||
if self.dtype.na_value is np.nan:
|
||||
if op == operator.ne:
|
||||
return result.to_numpy(np.bool_, na_value=True)
|
||||
else:
|
||||
return result.to_numpy(np.bool_, na_value=False)
|
||||
return result
|
||||
|
||||
def __pos__(self) -> Self:
|
||||
raise TypeError(f"bad operand type for unary +: '{self.dtype}'")
|
||||
|
||||
|
||||
class ArrowStringArrayNumpySemantics(ArrowStringArray):
|
||||
_na_value = np.nan
|
||||
1185
lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py
Normal file
File diff suppressed because it is too large
1400
lib/python3.11/site-packages/pandas/core/base.py
Normal file
File diff suppressed because it is too large
657
lib/python3.11/site-packages/pandas/core/common.py
Normal file
@ -0,0 +1,657 @@
"""
Misc tools for implementing data structures

Note: pandas.core.common is *not* part of the public API.
"""
from __future__ import annotations

import builtins
from collections import (
    abc,
    defaultdict,
)
from collections.abc import (
    Collection,
    Generator,
    Hashable,
    Iterable,
    Sequence,
)
import contextlib
from functools import partial
import inspect
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    cast,
    overload,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas.compat.numpy import np_version_gte1p24

from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_integer,
)
from pandas.core.dtypes.generic import (
    ABCExtensionArray,
    ABCIndex,
    ABCMultiIndex,
    ABCSeries,
)
from pandas.core.dtypes.inference import iterable_not_string

if TYPE_CHECKING:
    from pandas._typing import (
        AnyArrayLike,
        ArrayLike,
        NpDtype,
        RandomState,
        T,
    )

    from pandas import Index


def flatten(line):
    """
    Flatten an arbitrarily nested sequence.

    Parameters
    ----------
    line : sequence
        The non-string sequence to flatten

    Notes
    -----
    This doesn't treat strings as sequences.

    Returns
    -------
    flattened : generator
    """
    for element in line:
        if iterable_not_string(element):
            yield from flatten(element)
        else:
            yield element


def consensus_name_attr(objs):
    name = objs[0].name
    for obj in objs[1:]:
        try:
            if obj.name != name:
                name = None
        except ValueError:
            name = None
    return name


def is_bool_indexer(key: Any) -> bool:
    """
    Check whether `key` is a valid boolean indexer.

    Parameters
    ----------
    key : Any
        Only list-likes may be considered boolean indexers.
        All other types are not considered a boolean indexer.
        For array-like input, boolean ndarrays or ExtensionArrays
        with ``_is_boolean`` set are considered boolean indexers.

    Returns
    -------
    bool
        Whether `key` is a valid boolean indexer.

    Raises
    ------
    ValueError
        When the array is an object-dtype ndarray or ExtensionArray
        and contains missing values.

    See Also
    --------
    check_array_indexer : Check that `key` is a valid array to index,
        and convert to an ndarray.
    """
    if isinstance(
        key, (ABCSeries, np.ndarray, ABCIndex, ABCExtensionArray)
    ) and not isinstance(key, ABCMultiIndex):
        if key.dtype == np.object_:
            key_array = np.asarray(key)

            if not lib.is_bool_array(key_array):
                na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
                if lib.is_bool_array(key_array, skipna=True):
                    # Don't raise on e.g. ["A", "B", np.nan], see
                    # test_loc_getitem_list_of_labels_categoricalindex_with_na
                    raise ValueError(na_msg)
                return False
            return True
        elif is_bool_dtype(key.dtype):
            return True
    elif isinstance(key, list):
        # check if np.array(key).dtype would be bool
        if len(key) > 0:
            if type(key) is not list:  # noqa: E721
                # GH#42461 cython will raise TypeError if we pass a subclass
                key = list(key)
            return lib.is_bool_list(key)

    return False
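
# Editor's illustrative sketch (not part of pandas): the three outcomes of
# is_bool_indexer -- True for genuine boolean masks, False for other
# list-likes, and a ValueError for an object-dtype mask containing missing
# values. Relies only on this module's own imports.
def _example_is_bool_indexer() -> None:
    assert is_bool_indexer([True, False])
    assert is_bool_indexer(np.array([True, False]))
    assert not is_bool_indexer(["a", "b"])
    try:
        is_bool_indexer(np.array([True, np.nan], dtype=object))
    except ValueError:
        pass  # NA in an otherwise-boolean mask is rejected, not coerced
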
def cast_scalar_indexer(val):
    """
    Disallow indexing with a float key, even if that key is a round number.

    Parameters
    ----------
    val : scalar

    Returns
    -------
    outval : scalar
    """
    # assumes lib.is_scalar(val)
    if lib.is_float(val) and val.is_integer():
        raise IndexError(
            # GH#34193
            "Indexing with a float is no longer supported. Manually convert "
            "to an integer key instead."
        )
    return val


def not_none(*args):
    """
    Returns a generator consisting of the arguments that are not None.
    """
    return (arg for arg in args if arg is not None)


def any_none(*args) -> bool:
    """
    Returns a boolean indicating if any argument is None.
    """
    return any(arg is None for arg in args)


def all_none(*args) -> bool:
    """
    Returns a boolean indicating if all arguments are None.
    """
    return all(arg is None for arg in args)


def any_not_none(*args) -> bool:
    """
    Returns a boolean indicating if any argument is not None.
    """
    return any(arg is not None for arg in args)


def all_not_none(*args) -> bool:
    """
    Returns a boolean indicating if all arguments are not None.
    """
    return all(arg is not None for arg in args)


def count_not_none(*args) -> int:
    """
    Returns the count of arguments that are not None.
    """
    return sum(x is not None for x in args)


@overload
def asarray_tuplesafe(
    values: ArrayLike | list | tuple | zip, dtype: NpDtype | None = ...
) -> np.ndarray:
    # ExtensionArray can only be returned when values is an Index, all other iterables
    # will return np.ndarray. Unfortunately "all other" cannot be encoded in a type
    # signature, so instead we special-case some common types.
    ...


@overload
def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = ...) -> ArrayLike:
    ...


def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLike:
    if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")):
        values = list(values)
    elif isinstance(values, ABCIndex):
        return values._values
    elif isinstance(values, ABCSeries):
        return values._values

    if isinstance(values, list) and dtype in [np.object_, object]:
        return construct_1d_object_array_from_listlike(values)

    try:
        with warnings.catch_warnings():
            # Can remove warning filter once NumPy 1.24 is min version
            if not np_version_gte1p24:
                warnings.simplefilter("ignore", np.VisibleDeprecationWarning)
            result = np.asarray(values, dtype=dtype)
    except ValueError:
        # Using try/except since it's more performant than checking is_list_like
        # over each element
        # error: Argument 1 to "construct_1d_object_array_from_listlike"
        # has incompatible type "Iterable[Any]"; expected "Sized"
        return construct_1d_object_array_from_listlike(values)  # type: ignore[arg-type]

    if issubclass(result.dtype.type, str):
        result = np.asarray(values, dtype=object)

    if result.ndim == 2:
        # Avoid building an array of arrays:
        values = [tuple(x) for x in values]
        result = construct_1d_object_array_from_listlike(values)

    return result
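
# Editor's illustrative sketch (not part of pandas): tuples are kept as
# scalar elements instead of being broadcast into a 2-D array, which is
# what makes this helper "tuple safe" for Index construction.
def _example_asarray_tuplesafe() -> None:
    result = asarray_tuplesafe([(1, 2), (3, 4)])
    assert result.dtype == object
    assert result.shape == (2,)  # not (2, 2)
    assert result[0] == (1, 2)
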
def index_labels_to_array(
    labels: np.ndarray | Iterable, dtype: NpDtype | None = None
) -> np.ndarray:
    """
    Transform label or iterable of labels to array, for use in Index.

    Parameters
    ----------
    dtype : dtype
        If specified, use as dtype of the resulting array, otherwise infer.

    Returns
    -------
    array
    """
    if isinstance(labels, (str, tuple)):
        labels = [labels]

    if not isinstance(labels, (list, np.ndarray)):
        try:
            labels = list(labels)
        except TypeError:  # non-iterable
            labels = [labels]

    labels = asarray_tuplesafe(labels, dtype=dtype)

    return labels


def maybe_make_list(obj):
    if obj is not None and not isinstance(obj, (tuple, list)):
        return [obj]
    return obj


def maybe_iterable_to_list(obj: Iterable[T] | T) -> Collection[T] | T:
    """
    If obj is Iterable but not list-like, consume into list.
    """
    if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized):
        return list(obj)
    obj = cast(Collection, obj)
    return obj


def is_null_slice(obj) -> bool:
    """
    We have a null slice.
    """
    return (
        isinstance(obj, slice)
        and obj.start is None
        and obj.stop is None
        and obj.step is None
    )


def is_empty_slice(obj) -> bool:
    """
    We have an empty slice, e.g. no values are selected.
    """
    return (
        isinstance(obj, slice)
        and obj.start is not None
        and obj.stop is not None
        and obj.start == obj.stop
    )


def is_true_slices(line) -> list[bool]:
    """
    Find non-trivial slices in "line": return a list of booleans with same length.
    """
    return [isinstance(k, slice) and not is_null_slice(k) for k in line]


# TODO: used only once in indexing; belongs elsewhere?
def is_full_slice(obj, line: int) -> bool:
    """
    We have a full length slice.
    """
    return (
        isinstance(obj, slice)
        and obj.start == 0
        and obj.stop == line
        and obj.step is None
    )


def get_callable_name(obj):
    # typical case has name
    if hasattr(obj, "__name__"):
        return getattr(obj, "__name__")
    # some objects don't; could recurse
    if isinstance(obj, partial):
        return get_callable_name(obj.func)
    # fall back to class name
    if callable(obj):
        return type(obj).__name__
    # everything failed (probably because the argument
    # wasn't actually callable); we return None
    # instead of the empty string in this case to allow
    # distinguishing between no name and a name of ''
    return None
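
# Editor's illustrative sketch (not part of pandas): the four lookup paths
# of get_callable_name -- __name__, functools.partial recursion, class name,
# and the None fallback for non-callables.
def _example_get_callable_name() -> None:
    assert get_callable_name(len) == "len"
    assert get_callable_name(partial(len)) == "len"  # recurses into .func

    class Doubler:
        def __call__(self, x):
            return 2 * x

    assert get_callable_name(Doubler()) == "Doubler"
    assert get_callable_name("not callable") is None
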
def apply_if_callable(maybe_callable, obj, **kwargs):
    """
    Evaluate possibly callable input using obj and kwargs if it is callable,
    otherwise return as it is.

    Parameters
    ----------
    maybe_callable : possibly a callable
    obj : NDFrame
    **kwargs
    """
    if callable(maybe_callable):
        return maybe_callable(obj, **kwargs)

    return maybe_callable


def standardize_mapping(into):
    """
    Helper function to standardize a supplied mapping.

    Parameters
    ----------
    into : instance or subclass of collections.abc.Mapping
        Must be a class, an initialized collections.defaultdict,
        or an instance of a collections.abc.Mapping subclass.

    Returns
    -------
    mapping : a collections.abc.Mapping subclass or other constructor
        a callable object that can accept an iterator to create
        the desired Mapping.

    See Also
    --------
    DataFrame.to_dict
    Series.to_dict
    """
    if not inspect.isclass(into):
        if isinstance(into, defaultdict):
            return partial(defaultdict, into.default_factory)
        into = type(into)
    if not issubclass(into, abc.Mapping):
        raise TypeError(f"unsupported type: {into}")
    if into == defaultdict:
        raise TypeError("to_dict() only accepts initialized defaultdicts")
    return into
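
# Editor's illustrative sketch (not part of pandas): how to_dict()'s `into`
# argument is normalized. An initialized defaultdict becomes a partial that
# remembers its default_factory; a bare defaultdict class is rejected.
def _example_standardize_mapping() -> None:
    assert standardize_mapping(dict) is dict
    ctor = standardize_mapping(defaultdict(list))
    assert ctor().default_factory is list  # partial(defaultdict, list)
    try:
        standardize_mapping(defaultdict)
    except TypeError:
        pass  # an uninitialized defaultdict has no default_factory to keep
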
@overload
def random_state(state: np.random.Generator) -> np.random.Generator:
    ...


@overload
def random_state(
    state: int | np.ndarray | np.random.BitGenerator | np.random.RandomState | None,
) -> np.random.RandomState:
    ...


def random_state(state: RandomState | None = None):
    """
    Helper function for processing random_state arguments.

    Parameters
    ----------
    state : int, array-like, BitGenerator, Generator, np.random.RandomState, None.
        If it receives an int, array-like, or BitGenerator, passes it to
        np.random.RandomState() as seed.
        If it receives an np.random RandomState or Generator, just returns that
        unchanged.
        If it receives `None`, returns np.random.
        If it receives anything else, raises an informative ValueError.

        Default None.

    Returns
    -------
    np.random.RandomState or np.random.Generator. If state is None, returns np.random

    """
    if is_integer(state) or isinstance(state, (np.ndarray, np.random.BitGenerator)):
        return np.random.RandomState(state)
    elif isinstance(state, np.random.RandomState):
        return state
    elif isinstance(state, np.random.Generator):
        return state
    elif state is None:
        return np.random
    else:
        raise ValueError(
            "random_state must be an integer, array-like, a BitGenerator, Generator, "
            "a numpy RandomState, or None"
        )
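
# Editor's illustrative sketch (not part of pandas): the normalization rules
# documented above -- seeds construct a new RandomState, existing generators
# pass through unchanged, and None means the global np.random module.
def _example_random_state() -> None:
    assert isinstance(random_state(42), np.random.RandomState)
    gen = np.random.default_rng(0)
    assert random_state(gen) is gen  # Generator passes through untouched
    assert random_state(None) is np.random
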
def pipe(
    obj, func: Callable[..., T] | tuple[Callable[..., T], str], *args, **kwargs
) -> T:
    """
    Apply a function ``func`` to object ``obj`` either by passing obj as the
    first argument to the function or, in the case that the func is a tuple,
    interpret the first element of the tuple as a function and pass the obj to
    that function as a keyword argument whose key is the value of the second
    element of the tuple.

    Parameters
    ----------
    func : callable or tuple of (callable, str)
        Function to apply to this object or, alternatively, a
        ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
        string indicating the keyword of ``callable`` that expects the
        object.
    *args : iterable, optional
        Positional arguments passed into ``func``.
    **kwargs : dict, optional
        A dictionary of keyword arguments passed into ``func``.

    Returns
    -------
    object : the return type of ``func``.
    """
    if isinstance(func, tuple):
        func, target = func
        if target in kwargs:
            msg = f"{target} is both the pipe target and a keyword argument"
            raise ValueError(msg)
        kwargs[target] = obj
        return func(*args, **kwargs)
    else:
        return func(obj, *args, **kwargs)
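
# Editor's illustrative sketch (not part of pandas): the two calling
# conventions of pipe. With a bare callable the object becomes the first
# positional argument; with a (callable, keyword) tuple it is injected
# under that keyword instead.
def _example_pipe() -> None:
    def subtract(a, b):
        return a - b

    assert pipe(10, subtract, 3) == 7  # subtract(10, 3)
    assert pipe(10, (subtract, "b"), 3) == -7  # subtract(3, b=10)
    try:
        pipe(10, (subtract, "b"), 3, b=1)  # keyword clash is rejected
    except ValueError:
        pass
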
def get_rename_function(mapper):
    """
    Returns a function that will map names/labels, depending on whether mapper
    is a dict, Series or just a function.
    """

    def f(x):
        if x in mapper:
            return mapper[x]
        else:
            return x

    return f if isinstance(mapper, (abc.Mapping, ABCSeries)) else mapper


def convert_to_list_like(
    values: Hashable | Iterable | AnyArrayLike,
) -> list | AnyArrayLike:
    """
    Convert list-like or scalar input to list-like. List, numpy and pandas array-like
    inputs are returned unmodified whereas others are converted to list.
    """
    if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)):
        return values
    elif isinstance(values, abc.Iterable) and not isinstance(values, str):
        return list(values)

    return [values]


@contextlib.contextmanager
def temp_setattr(
    obj, attr: str, value, condition: bool = True
) -> Generator[None, None, None]:
    """
    Temporarily set attribute on an object.

    Parameters
    ----------
    obj : object
        Object whose attribute will be modified.
    attr : str
        Attribute to modify.
    value : Any
        Value to temporarily set attribute to.
    condition : bool, default True
        Whether to set the attribute. Provided in order to not have to
        conditionally use this context manager.

    Yields
    ------
    object : obj with modified attribute.
    """
    if condition:
        old_value = getattr(obj, attr)
        setattr(obj, attr, value)
    try:
        yield obj
    finally:
        if condition:
            setattr(obj, attr, old_value)
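
# Editor's illustrative sketch (not part of pandas): the attribute is swapped
# only inside the with-block and restored even if the body raises; with
# condition=False the context manager is a no-op passthrough.
def _example_temp_setattr() -> None:
    class Config:
        verbose = False

    cfg = Config()
    with temp_setattr(cfg, "verbose", True):
        assert cfg.verbose is True
    assert cfg.verbose is False  # restored on exit

    with temp_setattr(cfg, "verbose", True, condition=False):
        assert cfg.verbose is False  # untouched when condition is False
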
def require_length_match(data, index: Index) -> None:
    """
    Check the length of data matches the length of the index.
    """
    if len(data) != len(index):
        raise ValueError(
            "Length of values "
            f"({len(data)}) "
            "does not match length of index "
            f"({len(index)})"
        )


# the ufuncs np.maximum.reduce and np.minimum.reduce default to axis=0,
# whereas np.min and np.max (which directly call obj.min and obj.max)
# default to axis=None.
_builtin_table = {
    builtins.sum: np.sum,
    builtins.max: np.maximum.reduce,
    builtins.min: np.minimum.reduce,
}

# GH#53425: Only for deprecation
_builtin_table_alias = {
    builtins.sum: "np.sum",
    builtins.max: "np.maximum.reduce",
    builtins.min: "np.minimum.reduce",
}

_cython_table = {
    builtins.sum: "sum",
    builtins.max: "max",
    builtins.min: "min",
    np.all: "all",
    np.any: "any",
    np.sum: "sum",
    np.nansum: "sum",
    np.mean: "mean",
    np.nanmean: "mean",
    np.prod: "prod",
    np.nanprod: "prod",
    np.std: "std",
    np.nanstd: "std",
    np.var: "var",
    np.nanvar: "var",
    np.median: "median",
    np.nanmedian: "median",
    np.max: "max",
    np.nanmax: "max",
    np.min: "min",
    np.nanmin: "min",
    np.cumprod: "cumprod",
    np.nancumprod: "cumprod",
    np.cumsum: "cumsum",
    np.nancumsum: "cumsum",
}


def get_cython_func(arg: Callable) -> str | None:
    """
    If we define an internal function for this argument, return it.
    """
    return _cython_table.get(arg)


def is_builtin_func(arg):
    """
    If we define a builtin function for this argument, return it,
    otherwise return the arg.
    """
    return _builtin_table.get(arg, arg)


def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]:
    """
    If a name is missing then replace it by level_n, where n is the count

    .. versionadded:: 1.4.0

    Parameters
    ----------
    names : list-like
        list of column names or None values.

    Returns
    -------
    list
        list of column names with the None values replaced.
    """
    return [f"level_{i}" if name is None else name for i, name in enumerate(names)]
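
# Editor's illustrative sketch (not part of pandas): None entries are
# replaced positionally, so the substitute name records the level number,
# not the count of missing names seen so far.
def _example_fill_missing_names() -> None:
    assert fill_missing_names([None, "b", None]) == ["level_0", "b", "level_2"]
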
213
lib/python3.11/site-packages/pandas/core/computation/align.py
Normal file
@ -0,0 +1,213 @@
"""
Core eval alignment algorithms.
"""
from __future__ import annotations

from functools import (
    partial,
    wraps,
)
from typing import (
    TYPE_CHECKING,
    Callable,
)
import warnings

import numpy as np

from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)

from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.computation.common import result_type_many

if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import F

    from pandas.core.generic import NDFrame
    from pandas.core.indexes.api import Index


def _align_core_single_unary_op(
    term,
) -> tuple[partial | type[NDFrame], dict[str, Index] | None]:
    typ: partial | type[NDFrame]
    axes: dict[str, Index] | None = None

    if isinstance(term.value, np.ndarray):
        typ = partial(np.asanyarray, dtype=term.value.dtype)
    else:
        typ = type(term.value)
        if hasattr(term.value, "axes"):
            axes = _zip_axes_from_type(typ, term.value.axes)

    return typ, axes


def _zip_axes_from_type(
    typ: type[NDFrame], new_axes: Sequence[Index]
) -> dict[str, Index]:
    return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)}


def _any_pandas_objects(terms) -> bool:
    """
    Check a sequence of terms for instances of PandasObject.
    """
    return any(isinstance(term.value, PandasObject) for term in terms)


def _filter_special_cases(f) -> Callable[[F], F]:
    @wraps(f)
    def wrapper(terms):
        # single unary operand
        if len(terms) == 1:
            return _align_core_single_unary_op(terms[0])

        term_values = (term.value for term in terms)

        # we don't have any pandas objects
        if not _any_pandas_objects(terms):
            return result_type_many(*term_values), None

        return f(terms)

    return wrapper


@_filter_special_cases
def _align_core(terms):
    term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
    term_dims = [terms[i].value.ndim for i in term_index]

    from pandas import Series

    ndims = Series(dict(zip(term_index, term_dims)))

    # initial axes are the axes of the largest-axis'd term
    biggest = terms[ndims.idxmax()].value
    typ = biggest._constructor
    axes = biggest.axes
    naxes = len(axes)
    gt_than_one_axis = naxes > 1

    for value in (terms[i].value for i in term_index):
        is_series = isinstance(value, ABCSeries)
        is_series_and_gt_one_axis = is_series and gt_than_one_axis

        for axis, items in enumerate(value.axes):
            if is_series_and_gt_one_axis:
                ax, itm = naxes - 1, value.index
            else:
                ax, itm = axis, items

            if not axes[ax].is_(itm):
                axes[ax] = axes[ax].union(itm)

    for i, ndim in ndims.items():
        for axis, items in zip(range(ndim), axes):
            ti = terms[i].value

            if hasattr(ti, "reindex"):
                transpose = isinstance(ti, ABCSeries) and naxes > 1
                reindexer = axes[naxes - 1] if transpose else items

                term_axis_size = len(ti.axes[axis])
                reindexer_size = len(reindexer)

                ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
                if ordm >= 1 and reindexer_size >= 10000:
                    w = (
                        f"Alignment difference on axis {axis} is larger "
                        f"than an order of magnitude on term {repr(terms[i].name)}, "
                        f"by more than {ordm:.4g}; performance may suffer."
                    )
                    warnings.warn(
                        w, category=PerformanceWarning, stacklevel=find_stack_level()
                    )

                obj = ti.reindex(reindexer, axis=axis, copy=False)
                terms[i].update(obj)

        terms[i].update(terms[i].value.values)

    return typ, _zip_axes_from_type(typ, axes)


def align_terms(terms):
    """
    Align a set of terms.
    """
    try:
        # flatten the parse tree (a nested list, really)
        terms = list(com.flatten(terms))
    except TypeError:
        # can't iterate so it must just be a constant or single variable
        if isinstance(terms.value, (ABCSeries, ABCDataFrame)):
            typ = type(terms.value)
            return typ, _zip_axes_from_type(typ, terms.value.axes)
        return np.result_type(terms.type), None

    # if all resolved variables are numeric scalars
    if all(term.is_scalar for term in terms):
        return result_type_many(*(term.value for term in terms)).type, None

    # perform the main alignment
    typ, axes = _align_core(terms)
    return typ, axes


def reconstruct_object(typ, obj, axes, dtype):
    """
    Reconstruct an object given its type, raw value, and possibly empty
    (None) axes.

    Parameters
    ----------
    typ : object
        A type
    obj : object
        The value to use in the type constructor
    axes : dict
        The axes to use to construct the resulting pandas object

    Returns
    -------
    ret : typ
        An object of type ``typ`` with the value `obj` and possible axes
        `axes`.
    """
    try:
        typ = typ.type
    except AttributeError:
        pass

    res_t = np.result_type(obj.dtype, dtype)

    if not isinstance(typ, partial) and issubclass(typ, PandasObject):
        return typ(obj, dtype=res_t, **axes)

    # special case for pathological things like ~True/~False
    if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
        ret_value = res_t.type(obj)
    else:
        ret_value = typ(obj).astype(res_t)
        # The condition is to distinguish 0-dim array (returned in case of
        # scalar) and 1 element array
        # e.g. np.array(0) and np.array([0])
        if (
            len(obj.shape) == 1
            and len(obj) == 1
            and not isinstance(ret_value, np.ndarray)
        ):
            ret_value = np.array([ret_value]).astype(res_t)

    return ret_value
@ -0,0 +1,2 @@
__all__ = ["eval"]
from pandas.core.computation.eval import eval
@ -0,0 +1,8 @@
from __future__ import annotations

from pandas.compat._optional import import_optional_dependency

ne = import_optional_dependency("numexpr", errors="warn")
NUMEXPR_INSTALLED = ne is not None

__all__ = ["NUMEXPR_INSTALLED"]
@ -0,0 +1,48 @@
from __future__ import annotations

from functools import reduce

import numpy as np

from pandas._config import get_option


def ensure_decoded(s) -> str:
    """
    If we have bytes, decode them to unicode.
    """
    if isinstance(s, (np.bytes_, bytes)):
        s = s.decode(get_option("display.encoding"))
    return s
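
# Editor's illustrative sketch (not part of pandas): bytes are decoded with
# the configured display encoding (utf-8 by default); str input is returned
# unchanged.
def _example_ensure_decoded() -> None:
    assert ensure_decoded(b"x > 1") == "x > 1"
    assert ensure_decoded("x > 1") == "x > 1"
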
def result_type_many(*arrays_and_dtypes):
    """
    Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)
    argument limit.
    """
    try:
        return np.result_type(*arrays_and_dtypes)
    except ValueError:
        # we have > NPY_MAXARGS terms in our expression
        return reduce(np.result_type, arrays_and_dtypes)
    except TypeError:
        from pandas.core.dtypes.cast import find_common_type
        from pandas.core.dtypes.common import is_extension_array_dtype

        arr_and_dtypes = list(arrays_and_dtypes)
        ea_dtypes, non_ea_dtypes = [], []
        for arr_or_dtype in arr_and_dtypes:
            if is_extension_array_dtype(arr_or_dtype):
                ea_dtypes.append(arr_or_dtype)
            else:
                non_ea_dtypes.append(arr_or_dtype)

        if non_ea_dtypes:
            try:
                np_dtype = np.result_type(*non_ea_dtypes)
            except ValueError:
                np_dtype = reduce(np.result_type, arrays_and_dtypes)
            return find_common_type(ea_dtypes + [np_dtype])

        return find_common_type(ea_dtypes)
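
# Editor's illustrative sketch (not part of pandas): when np.result_type
# rejects more than NPY_MAXARGS arguments, the wrapper folds them pairwise
# with reduce; either path yields the same common dtype.
def _example_result_type_many() -> None:
    dtypes = [np.dtype("int8")] * 40 + [np.dtype("float64")]
    assert result_type_many(*dtypes) == np.dtype("float64")
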
143
lib/python3.11/site-packages/pandas/core/computation/engines.py
Normal file
@ -0,0 +1,143 @@
"""
Engine classes for :func:`~pandas.eval`
"""
from __future__ import annotations

import abc
from typing import TYPE_CHECKING

from pandas.errors import NumExprClobberingError

from pandas.core.computation.align import (
    align_terms,
    reconstruct_object,
)
from pandas.core.computation.ops import (
    MATHOPS,
    REDUCTIONS,
)

from pandas.io.formats import printing

if TYPE_CHECKING:
    from pandas.core.computation.expr import Expr

_ne_builtins = frozenset(MATHOPS + REDUCTIONS)


def _check_ne_builtin_clash(expr: Expr) -> None:
    """
    Attempt to prevent foot-shooting in a helpful way.

    Parameters
    ----------
    expr : Expr
        Terms can contain variable names that clash with numexpr builtins.
    """
    names = expr.names
    overlap = names & _ne_builtins

    if overlap:
        s = ", ".join([repr(x) for x in overlap])
        raise NumExprClobberingError(
            f'Variables in expression "{expr}" overlap with builtins: ({s})'
        )


class AbstractEngine(metaclass=abc.ABCMeta):
    """Object serving as a base class for all engines."""

    has_neg_frac = False

    def __init__(self, expr) -> None:
        self.expr = expr
        self.aligned_axes = None
        self.result_type = None

    def convert(self) -> str:
        """
        Convert an expression for evaluation.

        Defaults to return the expression as a string.
        """
        return printing.pprint_thing(self.expr)

    def evaluate(self) -> object:
        """
        Run the engine on the expression.

        This method performs alignment which is necessary no matter what engine
        is being used, thus its implementation is in the base class.

        Returns
        -------
        object
            The result of the passed expression.
        """
        if not self._is_aligned:
            self.result_type, self.aligned_axes = align_terms(self.expr.terms)

        # make sure no names in resolvers and locals/globals clash
        res = self._evaluate()
        return reconstruct_object(
            self.result_type, res, self.aligned_axes, self.expr.terms.return_type
        )

    @property
    def _is_aligned(self) -> bool:
        return self.aligned_axes is not None and self.result_type is not None

    @abc.abstractmethod
    def _evaluate(self):
        """
        Return an evaluated expression.

        Parameters
        ----------
        env : Scope
            The local and global environment in which to evaluate an
            expression.

        Notes
        -----
        Must be implemented by subclasses.
        """


class NumExprEngine(AbstractEngine):
    """NumExpr engine class"""

    has_neg_frac = True

    def _evaluate(self):
        import numexpr as ne

        # convert the expression to a valid numexpr expression
        s = self.convert()

        env = self.expr.env
        scope = env.full_scope
        _check_ne_builtin_clash(self.expr)
        return ne.evaluate(s, local_dict=scope)


class PythonEngine(AbstractEngine):
    """
    Evaluate an expression in Python space.

    Mostly for testing purposes.
    """

    has_neg_frac = False

    def evaluate(self):
        return self.expr()

    def _evaluate(self) -> None:
        pass


ENGINES: dict[str, type[AbstractEngine]] = {
    "numexpr": NumExprEngine,
    "python": PythonEngine,
}
421
lib/python3.11/site-packages/pandas/core/computation/eval.py
Normal file
@ -0,0 +1,421 @@
"""
Top level ``eval`` module.
"""
from __future__ import annotations

import tokenize
from typing import TYPE_CHECKING
import warnings

from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.common import (
    is_extension_array_dtype,
    is_string_dtype,
)

from pandas.core.computation.engines import ENGINES
from pandas.core.computation.expr import (
    PARSERS,
    Expr,
)
from pandas.core.computation.parsing import tokenize_string
from pandas.core.computation.scope import ensure_scope
from pandas.core.generic import NDFrame

from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from pandas.core.computation.ops import BinOp


def _check_engine(engine: str | None) -> str:
    """
    Make sure a valid engine is passed.

    Parameters
    ----------
    engine : str
        String to validate.

    Raises
    ------
    KeyError
        * If an invalid engine is passed.
    ImportError
        * If numexpr was requested but doesn't exist.

    Returns
    -------
    str
        Engine name.
    """
    from pandas.core.computation.check import NUMEXPR_INSTALLED
    from pandas.core.computation.expressions import USE_NUMEXPR

    if engine is None:
        engine = "numexpr" if USE_NUMEXPR else "python"

    if engine not in ENGINES:
        valid_engines = list(ENGINES.keys())
        raise KeyError(
            f"Invalid engine '{engine}' passed, valid engines are {valid_engines}"
        )

    # TODO: validate this in a more general way (thinking of future engines
    # that won't necessarily be import-able)
    # Could potentially be done on engine instantiation
    if engine == "numexpr" and not NUMEXPR_INSTALLED:
        raise ImportError(
            "'numexpr' is not installed or an unsupported version. Cannot use "
            "engine='numexpr' for query/eval if 'numexpr' is not installed"
        )

    return engine


def _check_parser(parser: str):
    """
    Make sure a valid parser is passed.

    Parameters
    ----------
    parser : str

    Raises
    ------
    KeyError
        * If an invalid parser is passed
    """
    if parser not in PARSERS:
        raise KeyError(
            f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}"
        )


def _check_resolvers(resolvers):
    if resolvers is not None:
        for resolver in resolvers:
            if not hasattr(resolver, "__getitem__"):
                name = type(resolver).__name__
                raise TypeError(
                    f"Resolver of type '{name}' does not "
                    "implement the __getitem__ method"
                )


def _check_expression(expr):
    """
    Make sure an expression is not an empty string.

    Parameters
    ----------
    expr : object
        An object that can be converted to a string

    Raises
    ------
    ValueError
        * If expr is an empty string
    """
    if not expr:
        raise ValueError("expr cannot be an empty string")


def _convert_expression(expr) -> str:
    """
    Convert an object to an expression.

    This function converts an object to an expression (a unicode string) and
    checks to make sure it isn't empty after conversion. This is used to
    convert operators to their string representation for recursive calls to
    :func:`~pandas.eval`.

    Parameters
    ----------
    expr : object
        The object to be converted to a string.

    Returns
    -------
    str
        The string representation of an object.

    Raises
    ------
    ValueError
        * If the expression is empty.
    """
    s = pprint_thing(expr)
    _check_expression(s)
    return s
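
# Editor's illustrative sketch (not part of pandas): _convert_expression
# stringifies whatever it is given and rejects anything that stringifies
# to an empty expression.
def _example_convert_expression() -> None:
    assert _convert_expression("a + b") == "a + b"
    try:
        _convert_expression("")
    except ValueError:
        pass  # empty expressions are refused up front
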
def _check_for_locals(expr: str, stack_level: int, parser: str):
    at_top_of_stack = stack_level == 0
    not_pandas_parser = parser != "pandas"

    if not_pandas_parser:
        msg = "The '@' prefix is only supported by the pandas parser"
    elif at_top_of_stack:
        msg = (
            "The '@' prefix is not allowed in top-level eval calls.\n"
            "please refer to your variables by name without the '@' prefix."
        )

    if at_top_of_stack or not_pandas_parser:
        for toknum, tokval in tokenize_string(expr):
            if toknum == tokenize.OP and tokval == "@":
                raise SyntaxError(msg)


def eval(
    expr: str | BinOp,  # we leave BinOp out of the docstr bc it isn't for users
    parser: str = "pandas",
    engine: str | None = None,
    local_dict=None,
    global_dict=None,
    resolvers=(),
    level: int = 0,
    target=None,
    inplace: bool = False,
):
    """
    Evaluate a Python expression as a string using various backends.

    The following arithmetic operations are supported: ``+``, ``-``, ``*``,
    ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
    boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
    Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
    :keyword:`or`, and :keyword:`not` with the same semantics as the
    corresponding bitwise operators. :class:`~pandas.Series` and
    :class:`~pandas.DataFrame` objects are supported and behave as they would
    with plain ol' Python evaluation.

    Parameters
    ----------
    expr : str
        The expression to evaluate. This string cannot contain any Python
        `statements
        <https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
        only Python `expressions
        <https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
    parser : {'pandas', 'python'}, default 'pandas'
        The parser to use to construct the syntax tree from the expression. The
        default of ``'pandas'`` parses code slightly different than standard
        Python. Alternatively, you can parse an expression using the
        ``'python'`` parser to retain strict Python semantics. See the
        :ref:`enhancing performance <enhancingperf.eval>` documentation for
        more details.
    engine : {'python', 'numexpr'}, default 'numexpr'

        The engine used to evaluate the expression. Supported engines are

        - None : tries to use ``numexpr``, falls back to ``python``
        - ``'numexpr'`` : This default engine evaluates pandas objects using
          numexpr for large speed ups in complex expressions with large frames.
        - ``'python'`` : Performs operations as if you had ``eval``'d in top
          level python. This engine is generally not that useful.

        More backends may be available in the future.
    local_dict : dict or None, optional
        A dictionary of local variables, taken from locals() by default.
    global_dict : dict or None, optional
        A dictionary of global variables, taken from globals() by default.
    resolvers : list of dict-like or None, optional
        A list of objects implementing the ``__getitem__`` special method that
        you can use to inject an additional collection of namespaces to use for
        variable lookup. For example, this is used in the
        :meth:`~DataFrame.query` method to inject the
        ``DataFrame.index`` and ``DataFrame.columns``
        variables that refer to their respective :class:`~pandas.DataFrame`
        instance attributes.
    level : int, optional
        The number of prior stack frames to traverse and add to the current
        scope. Most users will **not** need to change this parameter.
    target : object, optional, default None
        This is the target object for assignment. It is used when there is
        variable assignment in the expression. If so, then `target` must
        support item assignment with string keys, and if a copy is being
        returned, it must also support `.copy()`.
    inplace : bool, default False
        If `target` is provided, and the expression mutates `target`, whether
        to modify `target` inplace. Otherwise, return a copy of `target` with
        the mutation.

    Returns
    -------
    ndarray, numeric scalar, DataFrame, Series, or None
        The completion value of evaluating the given code or None if ``inplace=True``.

    Raises
    ------
    ValueError
        There are many instances where such an error can be raised:

        - `target=None`, but the expression is multiline.
        - The expression is multiline, but not all of them have item assignment.
          An example of such an arrangement is this:

          a = b + 1
          a + 2

          Here, there are expressions on different lines, making it multiline,
          but the last line has no variable assigned to the output of `a + 2`.
        - `inplace=True`, but the expression is missing item assignment.
        - Item assignment is provided, but the `target` does not support
          string item assignment.
        - Item assignment is provided and `inplace=False`, but the `target`
          does not support the `.copy()` method

    See Also
    --------
    DataFrame.query : Evaluates a boolean expression to query the columns
        of a frame.
    DataFrame.eval : Evaluate a string describing operations on
        DataFrame columns.

    Notes
    -----
    The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
    recursively cast to ``float64``.

    See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
    more details.

    Examples
    --------
    >>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]})
    >>> df
      animal  age
    0    dog   10
    1    pig   20

    We can add a new column using ``pd.eval``:

    >>> pd.eval("double_age = df.age * 2", target=df)
      animal  age  double_age
    0    dog   10          20
    1    pig   20          40
    """
    inplace = validate_bool_kwarg(inplace, "inplace")

    exprs: list[str | BinOp]
    if isinstance(expr, str):
        _check_expression(expr)
        exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
    else:
        # ops.BinOp; for internal compat, not intended to be passed by users
        exprs = [expr]
    multi_line = len(exprs) > 1

    if multi_line and target is None:
        raise ValueError(
            "multi-line expressions are only valid in the "
            "context of data, use DataFrame.eval"
        )
    engine = _check_engine(engine)
    _check_parser(parser)
    _check_resolvers(resolvers)

    ret = None
    first_expr = True
    target_modified = False

    for expr in exprs:
        expr = _convert_expression(expr)
        _check_for_locals(expr, level, parser)

        # get our (possibly passed-in) scope
        env = ensure_scope(
            level + 1,
            global_dict=global_dict,
            local_dict=local_dict,
            resolvers=resolvers,
            target=target,
        )

        parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)

        if engine == "numexpr" and (
            (
                is_extension_array_dtype(parsed_expr.terms.return_type)
                and not is_string_dtype(parsed_expr.terms.return_type)
            )
            or getattr(parsed_expr.terms, "operand_types", None) is not None
            and any(
                (is_extension_array_dtype(elem) and not is_string_dtype(elem))
                for elem in parsed_expr.terms.operand_types
            )
        ):
            warnings.warn(
                "Engine has switched to 'python' because numexpr does not support "
                "extension array dtypes. Please set your engine to python manually.",
                RuntimeWarning,
                stacklevel=find_stack_level(),
            )
            engine = "python"

        # construct the engine and evaluate the parsed expression
        eng = ENGINES[engine]
        eng_inst = eng(parsed_expr)
        ret = eng_inst.evaluate()

        if parsed_expr.assigner is None:
            if multi_line:
                raise ValueError(
                    "Multi-line expressions are only valid "
                    "if all expressions contain an assignment"
                )
            if inplace:
                raise ValueError("Cannot operate inplace if there is no assignment")

        # assign if needed
        assigner = parsed_expr.assigner
        if env.target is not None and assigner is not None:
            target_modified = True

            # if returning a copy, copy only on the first assignment
            if not inplace and first_expr:
                try:
                    target = env.target
                    if isinstance(target, NDFrame):
                        target = target.copy(deep=None)
                    else:
                        target = target.copy()
                except AttributeError as err:
                    raise ValueError("Cannot return a copy of the target") from err
            else:
                target = env.target

            # TypeError is most commonly raised (e.g. int, list), but you
            # get IndexError if you try to do this assignment on np.ndarray.
            # we will ignore numpy warnings here; e.g. if trying
            # to use a non-numeric indexer
            try:
                if inplace and isinstance(target, NDFrame):
                    target.loc[:, assigner] = ret
                else:
                    target[assigner] = ret  # pyright: ignore[reportGeneralTypeIssues]
            except (TypeError, IndexError) as err:
                raise ValueError("Cannot assign expression output to target") from err

            if not resolvers:
                resolvers = ({assigner: ret},)
            else:
                # existing resolver needs to be updated to handle
                # case of mutating existing column in copy
                for resolver in resolvers:
                    if assigner in resolver:
                        resolver[assigner] = ret
                        break
                else:
                    resolvers += ({assigner: ret},)

            ret = None
            first_expr = False

    # We want to exclude `inplace=None` as being False.
    if inplace is False:
        return target if target_modified else ret
840
lib/python3.11/site-packages/pandas/core/computation/expr.py
Normal file
@ -0,0 +1,840 @@
"""
:func:`~pandas.eval` parsers.
"""
from __future__ import annotations

import ast
from functools import (
    partial,
    reduce,
)
from keyword import iskeyword
import tokenize
from typing import (
    Callable,
    ClassVar,
    TypeVar,
)

import numpy as np

from pandas.errors import UndefinedVariableError

from pandas.core.dtypes.common import is_string_dtype

import pandas.core.common as com
from pandas.core.computation.ops import (
    ARITH_OPS_SYMS,
    BOOL_OPS_SYMS,
    CMP_OPS_SYMS,
    LOCAL_TAG,
    MATHOPS,
    REDUCTIONS,
    UNARY_OPS_SYMS,
    BinOp,
    Constant,
    FuncNode,
    Op,
    Term,
    UnaryOp,
    is_term,
)
from pandas.core.computation.parsing import (
    clean_backtick_quoted_toks,
    tokenize_string,
)
from pandas.core.computation.scope import Scope

from pandas.io.formats import printing


def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Rewrite the assignment operator for PyTables expressions that use ``=``
    as a substitute for ``==``.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tuple of int, str
        Either the input token or the replacement values
    """
    toknum, tokval = tok
    return toknum, "==" if tokval == "=" else tokval
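

# Illustrative sketch (not part of the original file): `_rewrite_assign`
# operates on single (toknum, tokval) pairs, so a bare "=" becomes "==" and
# everything else passes through unchanged.
def _demo_rewrite_assign() -> None:
    assert _rewrite_assign((tokenize.OP, "=")) == (tokenize.OP, "==")
    assert _rewrite_assign((tokenize.NAME, "a")) == (tokenize.NAME, "a")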


def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
    precedence is changed to boolean precedence.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tuple of int, str
        Either the input token or the replacement values
    """
    toknum, tokval = tok
    if toknum == tokenize.OP:
        if tokval == "&":
            return tokenize.NAME, "and"
        elif tokval == "|":
            return tokenize.NAME, "or"
        return toknum, tokval
    return toknum, tokval


def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Replace local variables with a syntactically valid name.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tuple of int, str
        Either the input token or the replacement values

    Notes
    -----
    This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
    ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
    is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
    """
    toknum, tokval = tok
    if toknum == tokenize.OP and tokval == "@":
        return tokenize.OP, LOCAL_TAG
    return toknum, tokval


def _compose2(f, g):
    """
    Compose 2 callables.
    """
    return lambda *args, **kwargs: f(g(*args, **kwargs))


def _compose(*funcs):
    """
    Compose 2 or more callables.
    """
    assert len(funcs) > 1, "At least 2 callables must be passed to compose"
    return reduce(_compose2, funcs)


def _preparse(
    source: str,
    f=_compose(
        _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
    ),
) -> str:
    """
    Compose a collection of tokenization functions.

    Parameters
    ----------
    source : str
        A Python source code string
    f : callable
        This takes a tuple of (toknum, tokval) as its argument and returns a
        tuple with the same structure but possibly different elements. Defaults
        to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
        ``_replace_locals``.

    Returns
    -------
    str
        Valid Python source code

    Notes
    -----
    The `f` parameter can be any callable that takes *and* returns input of the
    form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
    the ``tokenize`` module and ``tokval`` is a string.
    """
    assert callable(f), "f must be callable"
    return tokenize.untokenize(f(x) for x in tokenize_string(source))
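

# Illustrative sketch (not part of the original file): the default pipeline
# turns the pandas dialect into plain Python before ast.parse sees it; exact
# whitespace in the output is up to tokenize.untokenize, so we only assert
# on the rewritten operators here.
def _demo_preparse() -> None:
    out = _preparse("a & b | ~c")
    assert "and" in out and "or" in out  # '&'/'|' rewritten to boolean ops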


def _is_type(t):
    """
    Factory for a type checking function of type ``t`` or tuple of types.
    """
    return lambda x: isinstance(x.value, t)


_is_list = _is_type(list)
_is_str = _is_type(str)


# partition all AST nodes
_all_nodes = frozenset(
    node
    for node in (getattr(ast, name) for name in dir(ast))
    if isinstance(node, type) and issubclass(node, ast.AST)
)


def _filter_nodes(superclass, all_nodes=_all_nodes):
    """
    Return the names of AST nodes that are subclasses of ``superclass``.
    """
    node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass))
    return frozenset(node_names)


_all_node_names = frozenset(x.__name__ for x in _all_nodes)
_mod_nodes = _filter_nodes(ast.mod)
_stmt_nodes = _filter_nodes(ast.stmt)
_expr_nodes = _filter_nodes(ast.expr)
_expr_context_nodes = _filter_nodes(ast.expr_context)
_boolop_nodes = _filter_nodes(ast.boolop)
_operator_nodes = _filter_nodes(ast.operator)
_unary_op_nodes = _filter_nodes(ast.unaryop)
_cmp_op_nodes = _filter_nodes(ast.cmpop)
_comprehension_nodes = _filter_nodes(ast.comprehension)
_handler_nodes = _filter_nodes(ast.excepthandler)
_arguments_nodes = _filter_nodes(ast.arguments)
_keyword_nodes = _filter_nodes(ast.keyword)
_alias_nodes = _filter_nodes(ast.alias)


# nodes that we don't support directly but are needed for parsing
_hacked_nodes = frozenset(["Assign", "Module", "Expr"])


_unsupported_expr_nodes = frozenset(
    [
        "Yield",
        "GeneratorExp",
        "IfExp",
        "DictComp",
        "SetComp",
        "Repr",
        "Lambda",
        "Set",
        "AST",
        "Is",
        "IsNot",
    ]
)

# these nodes are low priority or won't ever be supported (e.g., AST)
_unsupported_nodes = (
    _stmt_nodes
    | _mod_nodes
    | _handler_nodes
    | _arguments_nodes
    | _keyword_nodes
    | _alias_nodes
    | _expr_context_nodes
    | _unsupported_expr_nodes
) - _hacked_nodes

# in some cases we rewrite assignment (`=`) into an equality comparison, and
# we don't want `stmt` and friends in there, so keep only the classes whose
# names are capitalized
_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
intersection = _unsupported_nodes & _base_supported_nodes
_msg = f"cannot both support and not support {intersection}"
assert not intersection, _msg


def _node_not_implemented(node_name: str) -> Callable[..., None]:
    """
    Return a function that raises a NotImplementedError with a passed node name.
    """

    def f(self, *args, **kwargs):
        raise NotImplementedError(f"'{node_name}' nodes are not implemented")

    return f


# should be bound by BaseExprVisitor but that creates a circular dependency:
# _T is used in disallow, but disallow is used to define BaseExprVisitor
# https://github.com/microsoft/pyright/issues/2315
_T = TypeVar("_T")


def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]:
    """
    Decorator to disallow certain nodes from parsing. Raises a
    NotImplementedError instead.

    Returns
    -------
    callable
    """

    def disallowed(cls: type[_T]) -> type[_T]:
        # error: "Type[_T]" has no attribute "unsupported_nodes"
        cls.unsupported_nodes = ()  # type: ignore[attr-defined]
        for node in nodes:
            new_method = _node_not_implemented(node)
            name = f"visit_{node}"
            # error: "Type[_T]" has no attribute "unsupported_nodes"
            cls.unsupported_nodes += (name,)  # type: ignore[attr-defined]
            setattr(cls, name, new_method)
        return cls

    return disallowed
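

# Illustrative sketch (not part of the original file): after decoration, an
# unsupported node raises as soon as it is visited. The demo class below is
# hypothetical; with the visitors defined later this is why, e.g.,
# pd.eval("lambda x: x") raises NotImplementedError.
def _demo_disallow() -> None:
    @disallow(frozenset(["Lambda"]))
    class _V:
        pass

    try:
        _V().visit_Lambda()
        raise AssertionError("expected NotImplementedError")
    except NotImplementedError:
        pass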


def _op_maker(op_class, op_symbol):
    """
    Return a function to create an op class with its symbol already passed.

    Returns
    -------
    callable
    """

    def f(self, node, *args, **kwargs):
        """
        Return a partial function with an Op subclass with an operator already passed.

        Returns
        -------
        callable
        """
        return partial(op_class, op_symbol, *args, **kwargs)

    return f


_op_classes = {"binary": BinOp, "unary": UnaryOp}


def add_ops(op_classes):
    """
    Decorator to add default implementation of ops.
    """

    def f(cls):
        for op_attr_name, op_class in op_classes.items():
            ops = getattr(cls, f"{op_attr_name}_ops")
            ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map")
            for op in ops:
                op_node = ops_map[op]
                if op_node is not None:
                    made_op = _op_maker(op_class, op)
                    setattr(cls, f"visit_{op_node}", made_op)
        return cls

    return f
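

# Illustrative sketch (not part of the original file): @add_ops walks, e.g.,
# binary_ops = (">", "<", ...) together with binary_op_nodes = ("Gt", "Lt", ...)
# and installs visit_Gt, visit_Lt, ... on the class, each returning
# partial(BinOp, symbol). BaseExprVisitor is defined just below.
def _demo_add_ops() -> None:
    assert callable(getattr(BaseExprVisitor, "visit_Gt"))
    assert callable(getattr(BaseExprVisitor, "visit_Add"))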


@disallow(_unsupported_nodes)
@add_ops(_op_classes)
class BaseExprVisitor(ast.NodeVisitor):
    """
    Custom ast walker. Parsers of other engines should subclass this class
    if necessary.

    Parameters
    ----------
    env : Scope
    engine : str
    parser : str
    preparser : callable
    """

    const_type: ClassVar[type[Term]] = Constant
    term_type: ClassVar[type[Term]] = Term

    binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS
    binary_op_nodes = (
        "Gt",
        "Lt",
        "GtE",
        "LtE",
        "Eq",
        "NotEq",
        "In",
        "NotIn",
        "BitAnd",
        "BitOr",
        "And",
        "Or",
        "Add",
        "Sub",
        "Mult",
        "Div",
        "Pow",
        "FloorDiv",
        "Mod",
    )
    binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))

    unary_ops = UNARY_OPS_SYMS
    unary_op_nodes = "UAdd", "USub", "Invert", "Not"
    unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))

    rewrite_map = {
        ast.Eq: ast.In,
        ast.NotEq: ast.NotIn,
        ast.In: ast.In,
        ast.NotIn: ast.NotIn,
    }

    unsupported_nodes: tuple[str, ...]

    def __init__(self, env, engine, parser, preparser=_preparse) -> None:
        self.env = env
        self.engine = engine
        self.parser = parser
        self.preparser = preparser
        self.assigner = None

    def visit(self, node, **kwargs):
        if isinstance(node, str):
            clean = self.preparser(node)
            try:
                node = ast.fix_missing_locations(ast.parse(clean))
            except SyntaxError as e:
                if any(iskeyword(x) for x in clean.split()):
                    e.msg = "Python keyword not valid identifier in numexpr query"
                raise e

        method = f"visit_{type(node).__name__}"
        visitor = getattr(self, method)
        return visitor(node, **kwargs)

    def visit_Module(self, node, **kwargs):
        if len(node.body) != 1:
            raise SyntaxError("only a single expression is allowed")
        expr = node.body[0]
        return self.visit(expr, **kwargs)

    def visit_Expr(self, node, **kwargs):
        return self.visit(node.value, **kwargs)

    def _rewrite_membership_op(self, node, left, right):
        # node.op is an operator *instance*; the rewrite is keyed on its type
        op_instance = node.op
        op_type = type(op_instance)

        # must be two terms and the comparison operator must be ==/!=/in/not in
        if is_term(left) and is_term(right) and op_type in self.rewrite_map:
            left_list, right_list = map(_is_list, (left, right))
            left_str, right_str = map(_is_str, (left, right))

            # if there are any strings or lists in the expression
            if left_list or right_list or left_str or right_str:
                op_instance = self.rewrite_map[op_type]()

                # pop the string variable out of locals and replace it with a list
                # of one string, kind of a hack
                if right_str:
                    name = self.env.add_tmp([right.value])
                    right = self.term_type(name, self.env)

                if left_str:
                    name = self.env.add_tmp([left.value])
                    left = self.term_type(name, self.env)

        op = self.visit(op_instance)
        return op, op_instance, left, right

    def _maybe_transform_eq_ne(self, node, left=None, right=None):
        if left is None:
            left = self.visit(node.left, side="left")
        if right is None:
            right = self.visit(node.right, side="right")
        op, op_class, left, right = self._rewrite_membership_op(node, left, right)
        return op, op_class, left, right

    def _maybe_downcast_constants(self, left, right):
        f32 = np.dtype(np.float32)
        if (
            left.is_scalar
            and hasattr(left, "value")
            and not right.is_scalar
            and right.return_type == f32
        ):
            # right is a float32 array, left is a scalar
            name = self.env.add_tmp(np.float32(left.value))
            left = self.term_type(name, self.env)
        if (
            right.is_scalar
            and hasattr(right, "value")
            and not left.is_scalar
            and left.return_type == f32
        ):
            # left is a float32 array, right is a scalar
            name = self.env.add_tmp(np.float32(right.value))
            right = self.term_type(name, self.env)

        return left, right

    def _maybe_eval(self, binop, eval_in_python):
        # eval `in` and `not in` (for now) in "partial" python space
        # things that can be evaluated in "eval" space will be turned into
        # temporary variables. for example,
        # [1,2] in a + 2 * b
        # in that case a + 2 * b will be evaluated using numexpr, and the "in"
        # call will be evaluated using isin (in python space)
        return binop.evaluate(
            self.env, self.engine, self.parser, self.term_type, eval_in_python
        )

    def _maybe_evaluate_binop(
        self,
        op,
        op_class,
        lhs,
        rhs,
        eval_in_python=("in", "not in"),
        maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
    ):
        res = op(lhs, rhs)

        if res.has_invalid_return_type:
            raise TypeError(
                f"unsupported operand type(s) for {res.op}: "
                f"'{lhs.type}' and '{rhs.type}'"
            )

        if self.engine != "pytables" and (
            res.op in CMP_OPS_SYMS
            and getattr(lhs, "is_datetime", False)
            or getattr(rhs, "is_datetime", False)
        ):
            # all date ops must be done in python because numexpr doesn't work
            # well with NaT
            return self._maybe_eval(res, self.binary_ops)

        if res.op in eval_in_python:
            # "in"/"not in" ops are always evaluated in python
            return self._maybe_eval(res, eval_in_python)
        elif self.engine != "pytables":
            if (
                getattr(lhs, "return_type", None) == object
                or is_string_dtype(getattr(lhs, "return_type", None))
                or getattr(rhs, "return_type", None) == object
                or is_string_dtype(getattr(rhs, "return_type", None))
            ):
                # evaluate "==" and "!=" in python if either of our operands
                # has an object or string return type
                return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
        return res

    def visit_BinOp(self, node, **kwargs):
        op, op_class, left, right = self._maybe_transform_eq_ne(node)
        left, right = self._maybe_downcast_constants(left, right)
        return self._maybe_evaluate_binop(op, op_class, left, right)

    def visit_UnaryOp(self, node, **kwargs):
        op = self.visit(node.op)
        operand = self.visit(node.operand)
        return op(operand)

    def visit_Name(self, node, **kwargs) -> Term:
        return self.term_type(node.id, self.env, **kwargs)

    # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min
    def visit_NameConstant(self, node, **kwargs) -> Term:
        return self.const_type(node.value, self.env)

    # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min
    def visit_Num(self, node, **kwargs) -> Term:
        return self.const_type(node.value, self.env)

    def visit_Constant(self, node, **kwargs) -> Term:
        return self.const_type(node.value, self.env)

    # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min
    def visit_Str(self, node, **kwargs) -> Term:
        name = self.env.add_tmp(node.s)
        return self.term_type(name, self.env)

    def visit_List(self, node, **kwargs) -> Term:
        name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
        return self.term_type(name, self.env)

    visit_Tuple = visit_List

    def visit_Index(self, node, **kwargs):
        """df.index[4]"""
        return self.visit(node.value)

    def visit_Subscript(self, node, **kwargs) -> Term:
        from pandas import eval as pd_eval

        value = self.visit(node.value)
        slobj = self.visit(node.slice)
        result = pd_eval(
            slobj, local_dict=self.env, engine=self.engine, parser=self.parser
        )
        try:
            # a Term instance
            v = value.value[result]
        except AttributeError:
            # an Op instance
            lhs = pd_eval(
                value, local_dict=self.env, engine=self.engine, parser=self.parser
            )
            v = lhs[result]
        name = self.env.add_tmp(v)
        return self.term_type(name, env=self.env)

    def visit_Slice(self, node, **kwargs) -> slice:
        """df.index[slice(4,6)]"""
        lower = node.lower
        if lower is not None:
            lower = self.visit(lower).value
        upper = node.upper
        if upper is not None:
            upper = self.visit(upper).value
        step = node.step
        if step is not None:
            step = self.visit(step).value

        return slice(lower, upper, step)

    def visit_Assign(self, node, **kwargs):
        """
        support a single assignment node, like

        c = a + b

        set the assigner at the top level, must be a Name node which
        might or might not exist in the resolvers

        """
        if len(node.targets) != 1:
            raise SyntaxError("can only assign a single expression")
        if not isinstance(node.targets[0], ast.Name):
            raise SyntaxError("left hand side of an assignment must be a single name")
        if self.env.target is None:
            raise ValueError("cannot assign without a target object")

        try:
            assigner = self.visit(node.targets[0], **kwargs)
        except UndefinedVariableError:
            assigner = node.targets[0].id

        self.assigner = getattr(assigner, "name", assigner)
        if self.assigner is None:
            raise SyntaxError(
                "left hand side of an assignment must be a single resolvable name"
            )

        return self.visit(node.value, **kwargs)

    def visit_Attribute(self, node, **kwargs):
        attr = node.attr
        value = node.value

        ctx = node.ctx
        if isinstance(ctx, ast.Load):
            # resolve the value
            resolved = self.visit(value).value
            try:
                v = getattr(resolved, attr)
                name = self.env.add_tmp(v)
                return self.term_type(name, self.env)
            except AttributeError:
                # something like datetime.datetime where scope is overridden
                if isinstance(value, ast.Name) and value.id == attr:
                    return resolved
                raise

        raise ValueError(f"Invalid Attribute context {type(ctx).__name__}")

    def visit_Call(self, node, side=None, **kwargs):
        if isinstance(node.func, ast.Attribute) and node.func.attr != "__call__":
            res = self.visit_Attribute(node.func)
        elif not isinstance(node.func, ast.Name):
            raise TypeError("Only named functions are supported")
        else:
            try:
                res = self.visit(node.func)
            except UndefinedVariableError:
                # Check if this is a supported function name
                try:
                    res = FuncNode(node.func.id)
                except ValueError:
                    # Raise original error
                    raise

        if res is None:
            # error: "expr" has no attribute "id"
            raise ValueError(
                f"Invalid function call {node.func.id}"  # type: ignore[attr-defined]
            )
        if hasattr(res, "value"):
            res = res.value

        if isinstance(res, FuncNode):
            new_args = [self.visit(arg) for arg in node.args]

            if node.keywords:
                raise TypeError(
                    f'Function "{res.name}" does not support keyword arguments'
                )

            return res(*new_args)

        else:
            new_args = [self.visit(arg)(self.env) for arg in node.args]

            for key in node.keywords:
                if not isinstance(key, ast.keyword):
                    # error: "expr" has no attribute "id"
                    raise ValueError(
                        "keyword error in function call "
                        f"'{node.func.id}'"  # type: ignore[attr-defined]
                    )

                if key.arg:
                    kwargs[key.arg] = self.visit(key.value)(self.env)

            name = self.env.add_tmp(res(*new_args, **kwargs))
            return self.term_type(name=name, env=self.env)

    def translate_In(self, op):
        return op

    def visit_Compare(self, node, **kwargs):
        ops = node.ops
        comps = node.comparators

        # base case: we have something like a CMP b
        if len(comps) == 1:
            op = self.translate_In(ops[0])
            binop = ast.BinOp(op=op, left=node.left, right=comps[0])
            return self.visit(binop)

        # recursive case: we have a chained comparison, a CMP b CMP c, etc.
        left = node.left
        values = []
        for op, comp in zip(ops, comps):
            new_node = self.visit(
                ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])
            )
            left = comp
            values.append(new_node)
        return self.visit(ast.BoolOp(op=ast.And(), values=values))
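
    # Illustrative note (not part of the original file): chained comparisons
    # are decomposed into pairwise binary comparisons joined with And, so the
    # engines only ever see binary operators. At the user level both
    # spellings therefore agree:
    #
    #     df.query("1 < a <= 2")
    #     df[(1 < df.a) & (df.a <= 2)]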

    def _try_visit_binop(self, bop):
        if isinstance(bop, (Op, Term)):
            return bop
        return self.visit(bop)

    def visit_BoolOp(self, node, **kwargs):
        def visitor(x, y):
            lhs = self._try_visit_binop(x)
            rhs = self._try_visit_binop(y)

            op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs)
            return self._maybe_evaluate_binop(op, node.op, lhs, rhs)

        operands = node.values
        return reduce(visitor, operands)


_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"])
_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS)


@disallow(
    (_unsupported_nodes | _python_not_supported)
    - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"]))
)
class PandasExprVisitor(BaseExprVisitor):
    def __init__(
        self,
        env,
        engine,
        parser,
        preparser=partial(
            _preparse,
            f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
        ),
    ) -> None:
        super().__init__(env, engine, parser, preparser)


@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"]))
class PythonExprVisitor(BaseExprVisitor):
    def __init__(
        self, env, engine, parser, preparser=lambda source, f=None: source
    ) -> None:
        super().__init__(env, engine, parser, preparser=preparser)


class Expr:
    """
    Object encapsulating an expression.

    Parameters
    ----------
    expr : str
    engine : str, optional, default 'numexpr'
    parser : str, optional, default 'pandas'
    env : Scope, optional, default None
    level : int, optional, default 0
    """

    env: Scope
    engine: str
    parser: str

    def __init__(
        self,
        expr,
        engine: str = "numexpr",
        parser: str = "pandas",
        env: Scope | None = None,
        level: int = 0,
    ) -> None:
        self.expr = expr
        self.env = env or Scope(level=level + 1)
        self.engine = engine
        self.parser = parser
        self._visitor = PARSERS[parser](self.env, self.engine, self.parser)
        self.terms = self.parse()

    @property
    def assigner(self):
        return getattr(self._visitor, "assigner", None)

    def __call__(self):
        return self.terms(self.env)

    def __repr__(self) -> str:
        return printing.pprint_thing(self.terms)

    def __len__(self) -> int:
        return len(self.expr)

    def parse(self):
        """
        Parse an expression.
        """
        return self._visitor.visit(self.expr)

    @property
    def names(self):
        """
        Get the names in an expression.
        """
        if is_term(self.terms):
            return frozenset([self.terms.name])
        return frozenset(term.name for term in com.flatten(self.terms))


PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor}
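

# Illustrative sketch (not part of the original file): Expr ties a parser to
# a Scope; calling the parsed object evaluates the term tree. The 'pandas'
# parser with the 'numexpr' engine is the default path used by
# DataFrame.query/eval; the pure-Python pair is shown here for simplicity.
def _demo_expr() -> None:
    ex = Expr("1 + 2 * 3", engine="python", parser="python")
    assert ex() == 7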
286
lib/python3.11/site-packages/pandas/core/computation/expressions.py
Normal file
@ -0,0 +1,286 @@
"""
Expressions
-----------

Offer fast expression evaluation through numexpr

"""
from __future__ import annotations

import operator
from typing import TYPE_CHECKING
import warnings

import numpy as np

from pandas._config import get_option

from pandas.util._exceptions import find_stack_level

from pandas.core import roperator
from pandas.core.computation.check import NUMEXPR_INSTALLED

if NUMEXPR_INSTALLED:
    import numexpr as ne

if TYPE_CHECKING:
    from pandas._typing import FuncType

_TEST_MODE: bool | None = None
_TEST_RESULT: list[bool] = []
USE_NUMEXPR = NUMEXPR_INSTALLED
_evaluate: FuncType | None = None
_where: FuncType | None = None

# the set of dtypes that we will allow to pass to numexpr
_ALLOWED_DTYPES = {
    "evaluate": {"int64", "int32", "float64", "float32", "bool"},
    "where": {"int64", "float64", "bool"},
}

# the minimum number of elements for which we will use numexpr
_MIN_ELEMENTS = 1_000_000


def set_use_numexpr(v: bool = True) -> None:
    # set/unset to use numexpr
    global USE_NUMEXPR
    if NUMEXPR_INSTALLED:
        USE_NUMEXPR = v

    # choose what we are going to do
    global _evaluate, _where

    _evaluate = _evaluate_numexpr if USE_NUMEXPR else _evaluate_standard
    _where = _where_numexpr if USE_NUMEXPR else _where_standard


def set_numexpr_threads(n=None) -> None:
    # if we are using numexpr, set the threads to n
    # otherwise reset
    if NUMEXPR_INSTALLED and USE_NUMEXPR:
        if n is None:
            n = ne.detect_number_of_cores()
        ne.set_num_threads(n)


def _evaluate_standard(op, op_str, a, b):
    """
    Standard evaluation.
    """
    if _TEST_MODE:
        _store_test_result(False)
    return op(a, b)


def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
    """return a boolean if we WILL be using numexpr"""
    if op_str is not None:
        # required min elements (otherwise we are adding overhead)
        if a.size > _MIN_ELEMENTS:
            # check for dtype compatibility
            dtypes: set[str] = set()
            for o in [a, b]:
                # ndarray and Series Case
                if hasattr(o, "dtype"):
                    dtypes |= {o.dtype.name}

            # allowed are a superset
            if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes:
                return True

    return False
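

# Illustrative sketch (not part of the original file): the element-count gate
# is strict (`>`), so arrays at exactly _MIN_ELEMENTS still take the standard
# path; only larger operands with allowed dtypes are offloaded to numexpr.
def _demo_can_use_numexpr() -> None:
    small = np.zeros(10)
    big = np.zeros(_MIN_ELEMENTS + 1)
    assert not _can_use_numexpr(operator.add, "+", small, small, "evaluate")
    assert _can_use_numexpr(operator.add, "+", big, big, "evaluate")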


def _evaluate_numexpr(op, op_str, a, b):
    result = None

    if _can_use_numexpr(op, op_str, a, b, "evaluate"):
        is_reversed = op.__name__.strip("_").startswith("r")
        if is_reversed:
            # we were originally called by a reversed op method
            a, b = b, a

        a_value = a
        b_value = b

        try:
            result = ne.evaluate(
                f"a_value {op_str} b_value",
                local_dict={"a_value": a_value, "b_value": b_value},
                casting="safe",
            )
        except TypeError:
            # numexpr raises eg for array ** array with integers
            # (https://github.com/pydata/numexpr/issues/379)
            pass
        except NotImplementedError:
            if _bool_arith_fallback(op_str, a, b):
                pass
            else:
                raise

        if is_reversed:
            # reverse order to original for fallback
            a, b = b, a

    if _TEST_MODE:
        _store_test_result(result is not None)

    if result is None:
        result = _evaluate_standard(op, op_str, a, b)

    return result


_op_str_mapping = {
    operator.add: "+",
    roperator.radd: "+",
    operator.mul: "*",
    roperator.rmul: "*",
    operator.sub: "-",
    roperator.rsub: "-",
    operator.truediv: "/",
    roperator.rtruediv: "/",
    # floordiv not supported by numexpr 2.x
    operator.floordiv: None,
    roperator.rfloordiv: None,
    # we require Python semantics for mod of negative for backwards compatibility
    # see https://github.com/pydata/numexpr/issues/365
    # so sticking with unaccelerated for now GH#36552
    operator.mod: None,
    roperator.rmod: None,
    operator.pow: "**",
    roperator.rpow: "**",
    operator.eq: "==",
    operator.ne: "!=",
    operator.le: "<=",
    operator.lt: "<",
    operator.ge: ">=",
    operator.gt: ">",
    operator.and_: "&",
    roperator.rand_: "&",
    operator.or_: "|",
    roperator.ror_: "|",
    operator.xor: "^",
    roperator.rxor: "^",
    divmod: None,
    roperator.rdivmod: None,
}


def _where_standard(cond, a, b):
    # Caller is responsible for extracting ndarray if necessary
    return np.where(cond, a, b)


def _where_numexpr(cond, a, b):
    # Caller is responsible for extracting ndarray if necessary
    result = None

    if _can_use_numexpr(None, "where", a, b, "where"):
        result = ne.evaluate(
            "where(cond_value, a_value, b_value)",
            local_dict={"cond_value": cond, "a_value": a, "b_value": b},
            casting="safe",
        )

    if result is None:
        result = _where_standard(cond, a, b)

    return result


# turn myself on
set_use_numexpr(get_option("compute.use_numexpr"))


def _has_bool_dtype(x):
    try:
        return x.dtype == bool
    except AttributeError:
        return isinstance(x, (bool, np.bool_))


_BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"}


def _bool_arith_fallback(op_str, a, b) -> bool:
    """
    Check if we should fall back to the Python `_evaluate_standard` in case
    of an operation that numexpr does not support, which is the case for
    some boolean ops.
    """
    if _has_bool_dtype(a) and _has_bool_dtype(b):
        if op_str in _BOOL_OP_UNSUPPORTED:
            warnings.warn(
                f"evaluating in Python space because the {repr(op_str)} "
                "operator is not supported by numexpr for the bool dtype, "
                f"use {repr(_BOOL_OP_UNSUPPORTED[op_str])} instead.",
                stacklevel=find_stack_level(),
            )
            return True
    return False
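

# Illustrative sketch (not part of the original file): for bool operands the
# arithmetic spellings are rejected by numexpr, so the check warns (pointing
# at the bitwise equivalent) and signals the caller to fall back.
def _demo_bool_fallback() -> None:
    x = np.array([True, False])
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # suppress the advisory warning here
        assert _bool_arith_fallback("+", x, x)       # use '|' instead
        assert not _bool_arith_fallback("==", x, x)  # comparisons are fine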


def evaluate(op, a, b, use_numexpr: bool = True):
    """
    Evaluate and return the expression of the op on a and b.

    Parameters
    ----------
    op : the operator callable
    a : left operand
    b : right operand
    use_numexpr : bool, default True
        Whether to try to use numexpr.
    """
    op_str = _op_str_mapping[op]
    if op_str is not None:
        if use_numexpr:
            # error: "None" not callable
            return _evaluate(op, op_str, a, b)  # type: ignore[misc]
    return _evaluate_standard(op, op_str, a, b)
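

# Illustrative sketch (not part of the original file): callers pass the
# operator function itself; the mapping above decides whether a numexpr
# spelling exists for it (e.g. floordiv maps to None and always runs in numpy).
def _demo_evaluate() -> None:
    a = np.arange(4.0)
    b = np.ones(4)
    out = evaluate(operator.add, a, b)
    assert (out == a + b).all()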


def where(cond, a, b, use_numexpr: bool = True):
    """
    Evaluate the where condition cond on a and b.

    Parameters
    ----------
    cond : np.ndarray[bool]
    a : return if cond is True
    b : return if cond is False
    use_numexpr : bool, default True
        Whether to try to use numexpr.
    """
    assert _where is not None
    return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b)


def set_test_mode(v: bool = True) -> None:
    """
    Keeps track of whether numexpr was used.

    Stores an additional ``True`` for every successful use of evaluate with
    numexpr since the last ``get_test_result``.
    """
    global _TEST_MODE, _TEST_RESULT
    _TEST_MODE = v
    _TEST_RESULT = []


def _store_test_result(used_numexpr: bool) -> None:
    if used_numexpr:
        _TEST_RESULT.append(used_numexpr)


def get_test_result() -> list[bool]:
    """
    Get test result and reset test_results.
    """
    global _TEST_RESULT
    res = _TEST_RESULT
    _TEST_RESULT = []
    return res
572
lib/python3.11/site-packages/pandas/core/computation/ops.py
Normal file
@ -0,0 +1,572 @@
"""
Operator classes for eval.
"""

from __future__ import annotations

from datetime import datetime
from functools import partial
import operator
from typing import (
    TYPE_CHECKING,
    Callable,
    Literal,
)

import numpy as np

from pandas._libs.tslibs import Timestamp

from pandas.core.dtypes.common import (
    is_list_like,
    is_scalar,
)

import pandas.core.common as com
from pandas.core.computation.common import (
    ensure_decoded,
    result_type_many,
)
from pandas.core.computation.scope import DEFAULT_GLOBALS

from pandas.io.formats.printing import (
    pprint_thing,
    pprint_thing_encoded,
)

if TYPE_CHECKING:
    from collections.abc import (
        Iterable,
        Iterator,
    )

REDUCTIONS = ("sum", "prod", "min", "max")

_unary_math_ops = (
    "sin",
    "cos",
    "exp",
    "log",
    "expm1",
    "log1p",
    "sqrt",
    "sinh",
    "cosh",
    "tanh",
    "arcsin",
    "arccos",
    "arctan",
    "arccosh",
    "arcsinh",
    "arctanh",
    "abs",
    "log10",
    "floor",
    "ceil",
)
_binary_math_ops = ("arctan2",)

MATHOPS = _unary_math_ops + _binary_math_ops


LOCAL_TAG = "__pd_eval_local_"


class Term:
    def __new__(cls, name, env, side=None, encoding=None):
        klass = Constant if not isinstance(name, str) else cls
        # error: Argument 2 for "super" not an instance of argument 1
        supr_new = super(Term, klass).__new__  # type: ignore[misc]
        return supr_new(klass)

    is_local: bool

    def __init__(self, name, env, side=None, encoding=None) -> None:
        # name is a str for Term, but may be something else for subclasses
        self._name = name
        self.env = env
        self.side = side
        tname = str(name)
        self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS
        self._value = self._resolve_name()
        self.encoding = encoding

    @property
    def local_name(self) -> str:
        return self.name.replace(LOCAL_TAG, "")

    def __repr__(self) -> str:
        return pprint_thing(self.name)

    def __call__(self, *args, **kwargs):
        return self.value

    def evaluate(self, *args, **kwargs) -> Term:
        return self

    def _resolve_name(self):
        local_name = str(self.local_name)
        is_local = self.is_local
        if local_name in self.env.scope and isinstance(
            self.env.scope[local_name], type
        ):
            is_local = False

        res = self.env.resolve(local_name, is_local=is_local)
        self.update(res)

        if hasattr(res, "ndim") and res.ndim > 2:
            raise NotImplementedError(
                "N-dimensional objects, where N > 2, are not supported with eval"
            )
        return res

    def update(self, value) -> None:
        """
        search order for local (i.e., @variable) variables:

        scope, key_variable
        [('locals', 'local_name'),
         ('globals', 'local_name'),
         ('locals', 'key'),
         ('globals', 'key')]
        """
        key = self.name

        # if it's a variable name (otherwise a constant)
        if isinstance(key, str):
            self.env.swapkey(self.local_name, key, new_value=value)

        self.value = value

    @property
    def is_scalar(self) -> bool:
        return is_scalar(self._value)

    @property
    def type(self):
        try:
            # potentially very slow for large, mixed dtype frames
            return self._value.values.dtype
        except AttributeError:
            try:
                # ndarray
                return self._value.dtype
            except AttributeError:
                # scalar
                return type(self._value)

    return_type = type

    @property
    def raw(self) -> str:
        return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})"

    @property
    def is_datetime(self) -> bool:
        try:
            t = self.type.type
        except AttributeError:
            t = self.type

        return issubclass(t, (datetime, np.datetime64))

    @property
    def value(self):
        return self._value

    @value.setter
    def value(self, new_value) -> None:
        self._value = new_value

    @property
    def name(self):
        return self._name

    @property
    def ndim(self) -> int:
        return self._value.ndim


class Constant(Term):
    def _resolve_name(self):
        return self._name

    @property
    def name(self):
        return self.value

    def __repr__(self) -> str:
        # in python 2 str() of float
        # can truncate shorter than repr()
        return repr(self.name)


_bool_op_map = {"not": "~", "and": "&", "or": "|"}


class Op:
    """
    Hold an operator of arbitrary arity.
    """

    op: str

    def __init__(self, op: str, operands: Iterable[Term | Op], encoding=None) -> None:
        self.op = _bool_op_map.get(op, op)
        self.operands = operands
        self.encoding = encoding

    def __iter__(self) -> Iterator:
        return iter(self.operands)

    def __repr__(self) -> str:
        """
        Print a generic n-ary operator and its operands using infix notation.
        """
        # recurse over the operands
        parened = (f"({pprint_thing(opr)})" for opr in self.operands)
        return pprint_thing(f" {self.op} ".join(parened))

    @property
    def return_type(self):
        # clobber types to bool if the op is a boolean operator
        if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS):
            return np.bool_
        return result_type_many(*(term.type for term in com.flatten(self)))

    @property
    def has_invalid_return_type(self) -> bool:
        types = self.operand_types
        obj_dtype_set = frozenset([np.dtype("object")])
        return self.return_type == object and types - obj_dtype_set

    @property
    def operand_types(self):
        return frozenset(term.type for term in com.flatten(self))

    @property
    def is_scalar(self) -> bool:
        return all(operand.is_scalar for operand in self.operands)

    @property
    def is_datetime(self) -> bool:
        try:
            t = self.return_type.type
        except AttributeError:
            t = self.return_type

        return issubclass(t, (datetime, np.datetime64))


def _in(x, y):
    """
    Compute the vectorized membership of ``x in y`` if possible, otherwise
    use Python.
    """
    try:
        return x.isin(y)
    except AttributeError:
        if is_list_like(x):
            try:
                return y.isin(x)
            except AttributeError:
                pass
        return x in y


def _not_in(x, y):
    """
    Compute the vectorized membership of ``x not in y`` if possible,
    otherwise use Python.
    """
    try:
        return ~x.isin(y)
    except AttributeError:
        if is_list_like(x):
            try:
                return ~y.isin(x)
            except AttributeError:
                pass
        return x not in y
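

# Illustrative sketch (not part of the original file): membership prefers a
# vectorized Series.isin and only falls back to Python's `in` for scalars.
def _demo_membership() -> None:
    import pandas as pd

    s = pd.Series([1, 2, 3])
    assert _in(s, [2, 3]).tolist() == [False, True, True]
    assert _in(2, [1, 2])  # scalar: plain `x in y`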


CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in")
_cmp_ops_funcs = (
    operator.gt,
    operator.lt,
    operator.ge,
    operator.le,
    operator.eq,
    operator.ne,
    _in,
    _not_in,
)
_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs))

BOOL_OPS_SYMS = ("&", "|", "and", "or")
_bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_)
_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs))

ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%")
_arith_ops_funcs = (
    operator.add,
    operator.sub,
    operator.mul,
    operator.truediv,
    operator.pow,
    operator.floordiv,
    operator.mod,
)
_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs))

SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%")
_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod)
_special_case_arith_ops_dict = dict(
    zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs)
)

_binary_ops_dict = {}

for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict):
    _binary_ops_dict.update(d)


def is_term(obj) -> bool:
    return isinstance(obj, Term)


class BinOp(Op):
    """
    Hold a binary operator and its operands.

    Parameters
    ----------
    op : str
    lhs : Term or Op
    rhs : Term or Op
    """

    def __init__(self, op: str, lhs, rhs) -> None:
        super().__init__(op, (lhs, rhs))
        self.lhs = lhs
        self.rhs = rhs

        self._disallow_scalar_only_bool_ops()

        self.convert_values()

        try:
            self.func = _binary_ops_dict[op]
        except KeyError as err:
            # has to be made a list for python3
            keys = list(_binary_ops_dict.keys())
            raise ValueError(
                f"Invalid binary operator {repr(op)}, valid operators are {keys}"
            ) from err

    def __call__(self, env):
        """
        Recursively evaluate an expression in Python space.

        Parameters
        ----------
        env : Scope

        Returns
        -------
        object
            The result of an evaluated expression.
        """
        # recurse over the left/right nodes
        left = self.lhs(env)
        right = self.rhs(env)

        return self.func(left, right)

    def evaluate(self, env, engine: str, parser, term_type, eval_in_python):
        """
        Evaluate a binary operation *before* being passed to the engine.

        Parameters
        ----------
        env : Scope
        engine : str
        parser : str
        term_type : type
        eval_in_python : list

        Returns
        -------
        term_type
            The "pre-evaluated" expression as an instance of ``term_type``
        """
        if engine == "python":
            res = self(env)
        else:
            # recurse over the left/right nodes

            left = self.lhs.evaluate(
                env,
                engine=engine,
                parser=parser,
                term_type=term_type,
                eval_in_python=eval_in_python,
            )

            right = self.rhs.evaluate(
                env,
                engine=engine,
                parser=parser,
                term_type=term_type,
                eval_in_python=eval_in_python,
            )

            # base cases
            if self.op in eval_in_python:
                res = self.func(left.value, right.value)
            else:
                from pandas.core.computation.eval import eval

                res = eval(self, local_dict=env, engine=engine, parser=parser)

        name = env.add_tmp(res)
        return term_type(name, env=env)

    def convert_values(self) -> None:
        """
        Convert datetimes to a comparable value in an expression.
        """

        def stringify(value):
            encoder: Callable
            if self.encoding is not None:
                encoder = partial(pprint_thing_encoded, encoding=self.encoding)
            else:
                encoder = pprint_thing
            return encoder(value)

        lhs, rhs = self.lhs, self.rhs

        if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar:
            v = rhs.value
            if isinstance(v, (int, float)):
                v = stringify(v)
            v = Timestamp(ensure_decoded(v))
            if v.tz is not None:
                v = v.tz_convert("UTC")
            self.rhs.update(v)

        if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar:
            v = lhs.value
            if isinstance(v, (int, float)):
                v = stringify(v)
            v = Timestamp(ensure_decoded(v))
            if v.tz is not None:
                v = v.tz_convert("UTC")
            self.lhs.update(v)

    def _disallow_scalar_only_bool_ops(self):
        rhs = self.rhs
        lhs = self.lhs

        # GH#24883 unwrap dtype if necessary to ensure we have a type object
        rhs_rt = rhs.return_type
        rhs_rt = getattr(rhs_rt, "type", rhs_rt)
        lhs_rt = lhs.return_type
        lhs_rt = getattr(lhs_rt, "type", lhs_rt)
        if (
            (lhs.is_scalar or rhs.is_scalar)
            and self.op in _bool_ops_dict
            and (
                not (
                    issubclass(rhs_rt, (bool, np.bool_))
                    and issubclass(lhs_rt, (bool, np.bool_))
                )
            )
        ):
            raise NotImplementedError("cannot evaluate scalar only bool ops")


def isnumeric(dtype) -> bool:
    return issubclass(np.dtype(dtype).type, np.number)


UNARY_OPS_SYMS = ("+", "-", "~", "not")
_unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert)
_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs))


class UnaryOp(Op):
    """
    Hold a unary operator and its operands.

    Parameters
    ----------
    op : str
        The token used to represent the operator.
    operand : Term or Op
        The Term or Op operand to the operator.

    Raises
    ------
    ValueError
        * If no function associated with the passed operator token is found.
    """

    def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None:
        super().__init__(op, (operand,))
        self.operand = operand

        try:
            self.func = _unary_ops_dict[op]
        except KeyError as err:
            raise ValueError(
                f"Invalid unary operator {repr(op)}, "
                f"valid operators are {UNARY_OPS_SYMS}"
            ) from err

    def __call__(self, env) -> MathCall:
        operand = self.operand(env)
        # error: Cannot call function of unknown type
        return self.func(operand)  # type: ignore[operator]

    def __repr__(self) -> str:
        return pprint_thing(f"{self.op}({self.operand})")

    @property
    def return_type(self) -> np.dtype:
        operand = self.operand
        if operand.return_type == np.dtype("bool"):
            return np.dtype("bool")
        if isinstance(operand, Op) and (
            operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict
        ):
            return np.dtype("bool")
        return np.dtype("int")


class MathCall(Op):
    def __init__(self, func, args) -> None:
        super().__init__(func.name, args)
        self.func = func

    def __call__(self, env):
        # error: "Op" not callable
        operands = [op(env) for op in self.operands]  # type: ignore[operator]
        return self.func.func(*operands)

    def __repr__(self) -> str:
        operands = map(str, self.operands)
        return pprint_thing(f"{self.op}({','.join(operands)})")


class FuncNode:
    def __init__(self, name: str) -> None:
        if name not in MATHOPS:
            raise ValueError(f'"{name}" is not a supported function')
        self.name = name
        self.func = getattr(np, name)

    def __call__(self, *args) -> MathCall:
        return MathCall(self, args)
198
lib/python3.11/site-packages/pandas/core/computation/parsing.py
Normal file
@ -0,0 +1,198 @@
"""
:func:`~pandas.eval` source string parsing functions
"""
from __future__ import annotations

from io import StringIO
from keyword import iskeyword
import token
import tokenize
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterator,
    )

# A token value Python's tokenizer probably will never use.
BACKTICK_QUOTED_STRING = 100


def create_valid_python_identifier(name: str) -> str:
    """
    Create valid Python identifiers from any string.

    Check if name contains any special characters. If it contains any
    special characters, the special characters will be replaced by
    a special string and a prefix is added.

    Raises
    ------
    SyntaxError
        If the returned name is not a valid Python identifier, raise an
        exception. This can happen if there is a hash character (#) in the
        name, as the tokenizer will then terminate and not find the backtick.
        It can also happen for characters that fall outside the range of
        (U+0001..U+007F).
    """
    if name.isidentifier() and not iskeyword(name):
        return name

    # Create a dict with the special characters and their replacement string.
    # EXACT_TOKEN_TYPES contains these special characters
    # token.tok_name contains a readable description of the replacement string.
    special_characters_replacements = {
        char: f"_{token.tok_name[tokval]}_"
        for char, tokval in (tokenize.EXACT_TOKEN_TYPES.items())
    }
    special_characters_replacements.update(
        {
            " ": "_",
            "?": "_QUESTIONMARK_",
            "!": "_EXCLAMATIONMARK_",
            "$": "_DOLLARSIGN_",
            "€": "_EUROSIGN_",
            "°": "_DEGREESIGN_",
            # Including quotes works, but there are exceptions.
            "'": "_SINGLEQUOTE_",
            '"': "_DOUBLEQUOTE_",
            # Currently not possible. Terminates parser and won't find backtick.
            # "#": "_HASH_",
        }
    )

    name = "".join([special_characters_replacements.get(char, char) for char in name])
    name = f"BACKTICK_QUOTED_STRING_{name}"

    if not name.isidentifier():
        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")

    return name
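

# Illustrative sketch (not part of the original file): spaces and known
# special characters are replaced, and the prefix marks the mangled name.
def _demo_identifier() -> None:
    out = create_valid_python_identifier("my col!")
    assert out == "BACKTICK_QUOTED_STRING_my_col_EXCLAMATIONMARK_"
    assert out.isidentifier()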


def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Clean up a column name if surrounded by backticks.

    Backtick quoted strings are indicated by a certain tokval value. If a
    string is a backtick quoted token it will be processed by
    :func:`_create_valid_python_identifier` so that the parser can find this
    string when the query is executed.
    In this case the tok will get the NAME tokval.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tok : Tuple[int, str]
        Either the input token or the replacement values
    """
    toknum, tokval = tok
    if toknum == BACKTICK_QUOTED_STRING:
        return tokenize.NAME, create_valid_python_identifier(tokval)
    return toknum, tokval


def clean_column_name(name: Hashable) -> Hashable:
    """
    Function to emulate the cleaning of a backtick quoted name.

    This function emulates what happens to an identifier when it is parsed
    as Python code inside a backtick quoted string and then cleaned
    (stripped of any special characters).

    Parameters
    ----------
    name : hashable
        Name to be cleaned.

    Returns
    -------
    name : hashable
        Returns the name after tokenizing and cleaning.

    Notes
    -----
    In some cases, a name cannot be converted to a valid Python identifier.
    In that case :func:`tokenize_string` raises a SyntaxError, and we just
    return the name unmodified.

    If such a name was used in the query string (which makes the query call
    impossible), an error will be raised by
    :func:`tokenize_backtick_quoted_string` instead, which is not caught and
    propagates to the user level.
    """
    try:
        tokenized = tokenize_string(f"`{name}`")
        tokval = next(tokenized)[1]
        return create_valid_python_identifier(tokval)
    except SyntaxError:
        return name
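

# Illustrative sketch (not part of the original file): this is the same
# mangling DataFrame.query applies to backticked column names, e.g.
# df.query("`1st place` > 0") resolves the column via the cleaned name.
def _demo_clean_column_name() -> None:
    assert clean_column_name("1st place") == "BACKTICK_QUOTED_STRING_1st_place"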


def tokenize_backtick_quoted_string(
    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
    """
    Creates a token from a backtick quoted string.

    Moves the token_generator forwards till right after the next backtick.

    Parameters
    ----------
    token_generator : Iterator[tokenize.TokenInfo]
        The generator that yields the tokens of the source string (Tuple[int, str]).
        The generator is at the first token after the backtick (`)

    source : str
        The Python source code string.

    string_start : int
        This is the start of backtick quoted string inside the source string.

    Returns
    -------
    tok : Tuple[int, str]
        The token that represents the backtick quoted string.
        The integer is equal to BACKTICK_QUOTED_STRING (100).
    """
    for _, tokval, start, _, _ in token_generator:
        if tokval == "`":
            string_end = start[1]
            break

    return BACKTICK_QUOTED_STRING, source[string_start:string_end]


def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
    """
    Tokenize a Python source code string.

    Parameters
    ----------
    source : str
        The Python source code string.

    Returns
    -------
    tok_generator : Iterator[Tuple[int, str]]
        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
    """
    line_reader = StringIO(source).readline
    token_generator = tokenize.generate_tokens(line_reader)

    # Loop over all tokens till a backtick (`) is found.
    # Then, take all tokens till the next backtick to form a backtick quoted string
    for toknum, tokval, start, _, _ in token_generator:
        if tokval == "`":
            try:
                yield tokenize_backtick_quoted_string(
                    token_generator, source, string_start=start[1] + 1
                )
            except Exception as err:
                raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
        else:
            yield toknum, tokval
666
lib/python3.11/site-packages/pandas/core/computation/pytables.py
Normal file
@ -0,0 +1,666 @@
""" manage PyTables query interface via Expressions """
from __future__ import annotations

import ast
from decimal import (
    Decimal,
    InvalidOperation,
)
from functools import partial
from typing import (
    TYPE_CHECKING,
    Any,
    ClassVar,
)

import numpy as np

from pandas._libs.tslibs import (
    Timedelta,
    Timestamp,
)
from pandas.errors import UndefinedVariableError

from pandas.core.dtypes.common import is_list_like

import pandas.core.common as com
from pandas.core.computation import (
    expr,
    ops,
    scope as _scope,
)
from pandas.core.computation.common import ensure_decoded
from pandas.core.computation.expr import BaseExprVisitor
from pandas.core.computation.ops import is_term
from pandas.core.construction import extract_array
from pandas.core.indexes.base import Index

from pandas.io.formats.printing import (
    pprint_thing,
    pprint_thing_encoded,
)

if TYPE_CHECKING:
    from pandas._typing import (
        Self,
        npt,
    )


class PyTablesScope(_scope.Scope):
    __slots__ = ("queryables",)

    queryables: dict[str, Any]

    def __init__(
        self,
        level: int,
        global_dict=None,
        local_dict=None,
        queryables: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict)
        self.queryables = queryables or {}


class Term(ops.Term):
    env: PyTablesScope

    def __new__(cls, name, env, side=None, encoding=None):
        if isinstance(name, str):
            klass = cls
        else:
            klass = Constant
        return object.__new__(klass)

    def __init__(self, name, env: PyTablesScope, side=None, encoding=None) -> None:
        super().__init__(name, env, side=side, encoding=encoding)

    def _resolve_name(self):
        # the left-hand side must be one of the queryables
        if self.side == "left":
            # Note: The behavior of __new__ ensures that self.name is a str here
            if self.name not in self.env.queryables:
                raise NameError(f"name {repr(self.name)} is not defined")
            return self.name

        # resolve the rhs (and allow it to be None)
        try:
            return self.env.resolve(self.name, is_local=False)
        except UndefinedVariableError:
            return self.name

    # read-only property overwriting read/write property
    @property  # type: ignore[misc]
    def value(self):
        return self._value


class Constant(Term):
    def __init__(self, name, env: PyTablesScope, side=None, encoding=None) -> None:
        assert isinstance(env, PyTablesScope), type(env)
        super().__init__(name, env, side=side, encoding=encoding)

    def _resolve_name(self):
        return self._name
|
||||
|
||||
|
||||
class BinOp(ops.BinOp):
|
||||
_max_selectors = 31
|
||||
|
||||
op: str
|
||||
queryables: dict[str, Any]
|
||||
condition: str | None
|
||||
|
||||
def __init__(self, op: str, lhs, rhs, queryables: dict[str, Any], encoding) -> None:
|
||||
super().__init__(op, lhs, rhs)
|
||||
self.queryables = queryables
|
||||
self.encoding = encoding
|
||||
self.condition = None
|
||||
|
||||
def _disallow_scalar_only_bool_ops(self) -> None:
|
||||
pass
|
||||
|
||||
def prune(self, klass):
|
||||
def pr(left, right):
|
||||
"""create and return a new specialized BinOp from myself"""
|
||||
if left is None:
|
||||
return right
|
||||
elif right is None:
|
||||
return left
|
||||
|
||||
k = klass
|
||||
if isinstance(left, ConditionBinOp):
|
||||
if isinstance(right, ConditionBinOp):
|
||||
k = JointConditionBinOp
|
||||
elif isinstance(left, k):
|
||||
return left
|
||||
elif isinstance(right, k):
|
||||
return right
|
||||
|
||||
elif isinstance(left, FilterBinOp):
|
||||
if isinstance(right, FilterBinOp):
|
||||
k = JointFilterBinOp
|
||||
elif isinstance(left, k):
|
||||
return left
|
||||
elif isinstance(right, k):
|
||||
return right
|
||||
|
||||
return k(
|
||||
self.op, left, right, queryables=self.queryables, encoding=self.encoding
|
||||
).evaluate()
|
||||
|
||||
left, right = self.lhs, self.rhs
|
||||
|
||||
if is_term(left) and is_term(right):
|
||||
res = pr(left.value, right.value)
|
||||
elif not is_term(left) and is_term(right):
|
||||
res = pr(left.prune(klass), right.value)
|
||||
elif is_term(left) and not is_term(right):
|
||||
res = pr(left.value, right.prune(klass))
|
||||
elif not (is_term(left) or is_term(right)):
|
||||
res = pr(left.prune(klass), right.prune(klass))
|
||||
|
||||
return res
|
||||
|
||||
def conform(self, rhs):
|
||||
"""inplace conform rhs"""
|
||||
if not is_list_like(rhs):
|
||||
rhs = [rhs]
|
||||
if isinstance(rhs, np.ndarray):
|
||||
rhs = rhs.ravel()
|
||||
return rhs
|
||||
|
||||
@property
|
||||
def is_valid(self) -> bool:
|
||||
"""return True if this is a valid field"""
|
||||
return self.lhs in self.queryables
|
||||
|
||||
@property
|
||||
def is_in_table(self) -> bool:
|
||||
"""
|
||||
return True if this is a valid column name for generation (e.g. an
|
||||
actual column in the table)
|
||||
"""
|
||||
return self.queryables.get(self.lhs) is not None
|
||||
|
||||
@property
|
||||
def kind(self):
|
||||
"""the kind of my field"""
|
||||
return getattr(self.queryables.get(self.lhs), "kind", None)
|
||||
|
||||
@property
|
||||
def meta(self):
|
||||
"""the meta of my field"""
|
||||
return getattr(self.queryables.get(self.lhs), "meta", None)
|
||||
|
||||
@property
|
||||
def metadata(self):
|
||||
"""the metadata of my field"""
|
||||
return getattr(self.queryables.get(self.lhs), "metadata", None)
|
||||
|
||||
def generate(self, v) -> str:
|
||||
"""create and return the op string for this TermValue"""
|
||||
val = v.tostring(self.encoding)
|
||||
return f"({self.lhs} {self.op} {val})"
|
||||
|
||||
def convert_value(self, v) -> TermValue:
|
||||
"""
|
||||
convert the expression that is in the term to something that is
|
||||
accepted by pytables
|
||||
"""
|
||||
|
||||
def stringify(value):
|
||||
if self.encoding is not None:
|
||||
return pprint_thing_encoded(value, encoding=self.encoding)
|
||||
return pprint_thing(value)
|
||||
|
||||
kind = ensure_decoded(self.kind)
|
||||
meta = ensure_decoded(self.meta)
|
||||
if kind == "datetime" or (kind and kind.startswith("datetime64")):
|
||||
if isinstance(v, (int, float)):
|
||||
v = stringify(v)
|
||||
v = ensure_decoded(v)
|
||||
v = Timestamp(v).as_unit("ns")
|
||||
if v.tz is not None:
|
||||
v = v.tz_convert("UTC")
|
||||
return TermValue(v, v._value, kind)
|
||||
elif kind in ("timedelta64", "timedelta"):
|
||||
if isinstance(v, str):
|
||||
v = Timedelta(v)
|
||||
else:
|
||||
v = Timedelta(v, unit="s")
|
||||
v = v.as_unit("ns")._value
|
||||
return TermValue(int(v), v, kind)
|
||||
elif meta == "category":
|
||||
metadata = extract_array(self.metadata, extract_numpy=True)
|
||||
result: npt.NDArray[np.intp] | np.intp | int
|
||||
if v not in metadata:
|
||||
result = -1
|
||||
else:
|
||||
result = metadata.searchsorted(v, side="left")
|
||||
return TermValue(result, result, "integer")
|
||||
elif kind == "integer":
|
||||
try:
|
||||
v_dec = Decimal(v)
|
||||
except InvalidOperation:
|
||||
# GH 54186
|
||||
# convert v to float to raise float's ValueError
|
||||
float(v)
|
||||
else:
|
||||
v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))
|
||||
return TermValue(v, v, kind)
|
||||
elif kind == "float":
|
||||
v = float(v)
|
||||
return TermValue(v, v, kind)
|
||||
elif kind == "bool":
|
||||
if isinstance(v, str):
|
||||
v = v.strip().lower() not in [
|
||||
"false",
|
||||
"f",
|
||||
"no",
|
||||
"n",
|
||||
"none",
|
||||
"0",
|
||||
"[]",
|
||||
"{}",
|
||||
"",
|
||||
]
|
||||
else:
|
||||
v = bool(v)
|
||||
return TermValue(v, v, kind)
|
||||
elif isinstance(v, str):
|
||||
# string quoting
|
||||
return TermValue(v, stringify(v), "string")
|
||||
else:
|
||||
raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column")
|
||||
|
||||
def convert_values(self) -> None:
|
||||
pass
|
||||
|
||||
|
||||
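
# Illustration: a hedged sketch of what convert_value does for a "datetime64"
# column -- the right-hand side is normalized to a nanosecond Timestamp
# (converted to UTC if tz-aware) and PyTables compares against its integer
# value:
#
#     from pandas import Timestamp
#
#     v = Timestamp("2012-02-01").as_unit("ns")
#     v._value  # nanoseconds since the epoch, the `converted` TermValue
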
class FilterBinOp(BinOp):
    filter: tuple[Any, Any, Index] | None = None

    def __repr__(self) -> str:
        if self.filter is None:
            return "Filter: Not Initialized"
        return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]")

    def invert(self) -> Self:
        """invert the filter"""
        if self.filter is not None:
            self.filter = (
                self.filter[0],
                self.generate_filter_op(invert=True),
                self.filter[2],
            )
        return self

    def format(self):
        """return the actual filter format"""
        return [self.filter]

    # error: Signature of "evaluate" incompatible with supertype "BinOp"
    def evaluate(self) -> Self | None:  # type: ignore[override]
        if not self.is_valid:
            raise ValueError(f"query term is not valid [{self}]")

        rhs = self.conform(self.rhs)
        values = list(rhs)

        if self.is_in_table:
            # if too many values to create the expression, use a filter instead
            if self.op in ["==", "!="] and len(values) > self._max_selectors:
                filter_op = self.generate_filter_op()
                self.filter = (self.lhs, filter_op, Index(values))

                return self
            return None

        # equality conditions
        if self.op in ["==", "!="]:
            filter_op = self.generate_filter_op()
            self.filter = (self.lhs, filter_op, Index(values))

        else:
            raise TypeError(
                f"passing a filterable condition to a non-table indexer [{self}]"
            )

        return self

    def generate_filter_op(self, invert: bool = False):
        if (self.op == "!=" and not invert) or (self.op == "==" and invert):
            return lambda axis, vals: ~axis.isin(vals)
        else:
            return lambda axis, vals: axis.isin(vals)


class JointFilterBinOp(FilterBinOp):
    def format(self):
        raise NotImplementedError("unable to collapse Joint Filters")

    # error: Signature of "evaluate" incompatible with supertype "BinOp"
    def evaluate(self) -> Self:  # type: ignore[override]
        return self


class ConditionBinOp(BinOp):
    def __repr__(self) -> str:
        return pprint_thing(f"[Condition : [{self.condition}]]")

    def invert(self):
        """invert the condition"""
        # if self.condition is not None:
        #     self.condition = "~(%s)" % self.condition
        # return self
        raise NotImplementedError(
            "cannot use an invert condition when passing to numexpr"
        )

    def format(self):
        """return the actual ne format"""
        return self.condition

    # error: Signature of "evaluate" incompatible with supertype "BinOp"
    def evaluate(self) -> Self | None:  # type: ignore[override]
        if not self.is_valid:
            raise ValueError(f"query term is not valid [{self}]")

        # convert values if we are in the table
        if not self.is_in_table:
            return None

        rhs = self.conform(self.rhs)
        values = [self.convert_value(v) for v in rhs]

        # equality conditions
        if self.op in ["==", "!="]:
            # too many values to create the expression?
            if len(values) <= self._max_selectors:
                vs = [self.generate(v) for v in values]
                self.condition = f"({' | '.join(vs)})"

            # use a filter after reading
            else:
                return None
        else:
            self.condition = self.generate(values[0])

        return self
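
# Illustration: a hedged sketch of how ConditionBinOp.evaluate assembles a
# numexpr condition for an `==` comparison -- each converted value becomes
# "(lhs == val)" via generate(), and up to _max_selectors (31) pieces are
# OR-ed together:
#
#     values = [1, 2, 3]
#     vs = [f"(A == {v})" for v in values]
#     f"({' | '.join(vs)})"  # -> ((A == 1) | (A == 2) | (A == 3))
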
class JointConditionBinOp(ConditionBinOp):
    # error: Signature of "evaluate" incompatible with supertype "BinOp"
    def evaluate(self) -> Self:  # type: ignore[override]
        self.condition = f"({self.lhs.condition} {self.op} {self.rhs.condition})"
        return self


class UnaryOp(ops.UnaryOp):
    def prune(self, klass):
        if self.op != "~":
            raise NotImplementedError("UnaryOp only support invert type ops")

        operand = self.operand
        operand = operand.prune(klass)

        if operand is not None and (
            issubclass(klass, ConditionBinOp)
            and operand.condition is not None
            or not issubclass(klass, ConditionBinOp)
            and issubclass(klass, FilterBinOp)
            and operand.filter is not None
        ):
            return operand.invert()
        return None


class PyTablesExprVisitor(BaseExprVisitor):
    const_type: ClassVar[type[ops.Term]] = Constant
    term_type: ClassVar[type[Term]] = Term

    def __init__(self, env, engine, parser, **kwargs) -> None:
        super().__init__(env, engine, parser)
        for bin_op in self.binary_ops:
            bin_node = self.binary_op_nodes_map[bin_op]
            setattr(
                self,
                f"visit_{bin_node}",
                lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs),
            )

    def visit_UnaryOp(self, node, **kwargs) -> ops.Term | UnaryOp | None:
        if isinstance(node.op, (ast.Not, ast.Invert)):
            return UnaryOp("~", self.visit(node.operand))
        elif isinstance(node.op, ast.USub):
            return self.const_type(-self.visit(node.operand).value, self.env)
        elif isinstance(node.op, ast.UAdd):
            raise NotImplementedError("Unary addition not supported")
        # TODO: return None might never be reached
        return None

    def visit_Index(self, node, **kwargs):
        return self.visit(node.value).value

    def visit_Assign(self, node, **kwargs):
        cmpr = ast.Compare(
            ops=[ast.Eq()], left=node.targets[0], comparators=[node.value]
        )
        return self.visit(cmpr)

    def visit_Subscript(self, node, **kwargs) -> ops.Term:
        # only allow simple subscripts

        value = self.visit(node.value)
        slobj = self.visit(node.slice)
        try:
            value = value.value
        except AttributeError:
            pass

        if isinstance(slobj, Term):
            # In py39 np.ndarray lookups with Term containing int raise
            slobj = slobj.value

        try:
            return self.const_type(value[slobj], self.env)
        except TypeError as err:
            raise ValueError(
                f"cannot subscript {repr(value)} with {repr(slobj)}"
            ) from err

    def visit_Attribute(self, node, **kwargs):
        attr = node.attr
        value = node.value

        ctx = type(node.ctx)
        if ctx == ast.Load:
            # resolve the value
            resolved = self.visit(value)

            # try to get the value to see if we are another expression
            try:
                resolved = resolved.value
            except AttributeError:
                pass

            try:
                return self.term_type(getattr(resolved, attr), self.env)
            except AttributeError:
                # something like datetime.datetime where scope is overridden
                if isinstance(value, ast.Name) and value.id == attr:
                    return resolved

        raise ValueError(f"Invalid Attribute context {ctx.__name__}")

    def translate_In(self, op):
        return ast.Eq() if isinstance(op, ast.In) else op

    def _rewrite_membership_op(self, node, left, right):
        return self.visit(node.op), node.op, left, right


def _validate_where(w):
    """
    Validate that the where statement is of the right type.

    The type may either be String, Expr, or list-like of Exprs.

    Parameters
    ----------
    w : String term expression, Expr, or list-like of Exprs.

    Returns
    -------
    where : The original where clause if the check was successful.

    Raises
    ------
    TypeError : An invalid data type was passed in for w (e.g. dict).
    """
    if not (isinstance(w, (PyTablesExpr, str)) or is_list_like(w)):
        raise TypeError(
            "where must be passed as a string, PyTablesExpr, "
            "or list-like of PyTablesExpr"
        )

    return w


class PyTablesExpr(expr.Expr):
    """
    Hold a pytables-like expression, comprised of possibly multiple 'terms'.

    Parameters
    ----------
    where : string term expression, PyTablesExpr, or list-like of PyTablesExprs
    queryables : a "kinds" map (dict of column name -> kind), or None if column
        is non-indexable
    encoding : an encoding that will encode the query terms

    Returns
    -------
    a PyTablesExpr object

    Examples
    --------
    'index>=date'
    "columns=['A', 'D']"
    'columns=A'
    'columns==A'
    "~(columns=['A','B'])"
    'index>df.index[3] & string="bar"'
    '(index>df.index[3] & index<=df.index[6]) | string="bar"'
    "ts>=Timestamp('2012-02-01')"
    "major_axis>=20130101"
    """

    _visitor: PyTablesExprVisitor | None
    env: PyTablesScope
    expr: str

    def __init__(
        self,
        where,
        queryables: dict[str, Any] | None = None,
        encoding=None,
        scope_level: int = 0,
    ) -> None:
        where = _validate_where(where)

        self.encoding = encoding
        self.condition = None
        self.filter = None
        self.terms = None
        self._visitor = None

        # capture the environment if needed
        local_dict: _scope.DeepChainMap[Any, Any] | None = None

        if isinstance(where, PyTablesExpr):
            local_dict = where.env.scope
            _where = where.expr

        elif is_list_like(where):
            where = list(where)
            for idx, w in enumerate(where):
                if isinstance(w, PyTablesExpr):
                    local_dict = w.env.scope
                else:
                    where[idx] = _validate_where(w)
            _where = " & ".join([f"({w})" for w in com.flatten(where)])
        else:
            # _validate_where ensures we otherwise have a string
            _where = where

        self.expr = _where
        self.env = PyTablesScope(scope_level + 1, local_dict=local_dict)

        if queryables is not None and isinstance(self.expr, str):
            self.env.queryables.update(queryables)
            self._visitor = PyTablesExprVisitor(
                self.env,
                queryables=queryables,
                parser="pytables",
                engine="pytables",
                encoding=encoding,
            )
            self.terms = self.parse()

    def __repr__(self) -> str:
        if self.terms is not None:
            return pprint_thing(self.terms)
        return pprint_thing(self.expr)

    def evaluate(self):
        """create and return the numexpr condition and filter"""
        try:
            self.condition = self.terms.prune(ConditionBinOp)
        except AttributeError as err:
            raise ValueError(
                f"cannot process expression [{self.expr}], [{self}] "
                "is not a valid condition"
            ) from err
        try:
            self.filter = self.terms.prune(FilterBinOp)
        except AttributeError as err:
            raise ValueError(
                f"cannot process expression [{self.expr}], [{self}] "
                "is not a valid filter"
            ) from err

        return self.condition, self.filter


class TermValue:
    """hold a term value that we use to construct a condition/filter"""

    def __init__(self, value, converted, kind: str) -> None:
        assert isinstance(kind, str), kind
        self.value = value
        self.converted = converted
        self.kind = kind

    def tostring(self, encoding) -> str:
        """quote the string if not encoded else encode and return"""
        if self.kind == "string":
            if encoding is not None:
                return str(self.converted)
            return f'"{self.converted}"'
        elif self.kind == "float":
            # python 2 str(float) is not always
            # round-trippable so use repr()
            return repr(self.converted)
        return str(self.converted)
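
# Illustration: a hedged sketch of TermValue.tostring -- string terms are
# double-quoted unless an encoding was applied, and floats use repr() so the
# value round-trips:
#
#     TermValue("bar", "bar", "string").tostring(encoding=None)  # -> '"bar"'
#     TermValue(0.1, 0.1, "float").tostring(encoding=None)       # -> '0.1'
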
def maybe_expression(s) -> bool:
    """loose checking if s is a pytables-acceptable expression"""
    if not isinstance(s, str):
        return False
    operations = PyTablesExprVisitor.binary_ops + PyTablesExprVisitor.unary_ops + ("=",)

    # make sure we have an op at least
    return any(op in s for op in operations)
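

# Illustration: PyTablesExpr is what parses the `where` strings accepted by
# HDFStore.select. A hedged usage sketch (requires the optional PyTables
# dependency, "tables"):
#
#     import pandas as pd
#
#     df = pd.DataFrame({"a": range(10)})
#     df.to_hdf("demo.h5", key="df", format="table", data_columns=True)
#     with pd.HDFStore("demo.h5") as store:
#         # the where string is compiled into a numexpr condition
#         store.select("df", where="a > 6")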
355
lib/python3.11/site-packages/pandas/core/computation/scope.py
Normal file
@@ -0,0 +1,355 @@
"""
Module for scope operations
"""
from __future__ import annotations

from collections import ChainMap
import datetime
import inspect
from io import StringIO
import itertools
import pprint
import struct
import sys
from typing import TypeVar

import numpy as np

from pandas._libs.tslibs import Timestamp
from pandas.errors import UndefinedVariableError

_KT = TypeVar("_KT")
_VT = TypeVar("_VT")


# https://docs.python.org/3/library/collections.html#chainmap-examples-and-recipes
class DeepChainMap(ChainMap[_KT, _VT]):
    """
    Variant of ChainMap that allows direct updates to inner scopes.

    Only works when all passed mappings are mutable.
    """

    def __setitem__(self, key: _KT, value: _VT) -> None:
        for mapping in self.maps:
            if key in mapping:
                mapping[key] = value
                return
        self.maps[0][key] = value

    def __delitem__(self, key: _KT) -> None:
        """
        Raises
        ------
        KeyError
            If `key` doesn't exist.
        """
        for mapping in self.maps:
            if key in mapping:
                del mapping[key]
                return
        raise KeyError(key)
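
# Illustration: a hedged sketch of what DeepChainMap changes relative to a
# plain collections.ChainMap -- assignment updates the innermost mapping that
# already holds the key instead of always writing to maps[0]:
#
#     inner, outer = {"x": 1}, {"y": 2}
#     dcm = DeepChainMap(outer, inner)
#     dcm["x"] = 10     # updates `inner` in place
#     inner             # -> {"x": 10}
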
def ensure_scope(
    level: int, global_dict=None, local_dict=None, resolvers=(), target=None
) -> Scope:
    """Ensure that we are grabbing the correct scope."""
    return Scope(
        level + 1,
        global_dict=global_dict,
        local_dict=local_dict,
        resolvers=resolvers,
        target=target,
    )


def _replacer(x) -> str:
    """
    Replace a number with its hexadecimal representation. Used to tag
    temporary variables with their calling scope's id.
    """
    # get the hex repr of the binary char and remove 0x and pad by pad_size
    # zeros
    try:
        hexin = ord(x)
    except TypeError:
        # bytes literals masquerade as ints when iterating in py3
        hexin = x

    return hex(hexin)


def _raw_hex_id(obj) -> str:
    """Return the padded hexadecimal id of ``obj``."""
    # interpret as a pointer since that's really what id returns
    packed = struct.pack("@P", id(obj))
    return "".join([_replacer(x) for x in packed])


DEFAULT_GLOBALS = {
    "Timestamp": Timestamp,
    "datetime": datetime.datetime,
    "True": True,
    "False": False,
    "list": list,
    "tuple": tuple,
    "inf": np.inf,
    "Inf": np.inf,
}


def _get_pretty_string(obj) -> str:
    """
    Return a prettier version of obj.

    Parameters
    ----------
    obj : object
        Object to pretty print

    Returns
    -------
    str
        Pretty print object repr
    """
    sio = StringIO()
    pprint.pprint(obj, stream=sio)
    return sio.getvalue()


class Scope:
    """
    Object to hold scope, with a few bells to deal with some custom syntax
    and contexts added by pandas.

    Parameters
    ----------
    level : int
    global_dict : dict or None, optional, default None
    local_dict : dict or Scope or None, optional, default None
    resolvers : list-like or None, optional, default None
    target : object

    Attributes
    ----------
    level : int
    scope : DeepChainMap
    target : object
    temps : dict
    """

    __slots__ = ["level", "scope", "target", "resolvers", "temps"]
    level: int
    scope: DeepChainMap
    resolvers: DeepChainMap
    temps: dict

    def __init__(
        self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None
    ) -> None:
        self.level = level + 1

        # shallow copy because we don't want to keep filling this up with what
        # was there before if there are multiple calls to Scope/_ensure_scope
        self.scope = DeepChainMap(DEFAULT_GLOBALS.copy())
        self.target = target

        if isinstance(local_dict, Scope):
            self.scope.update(local_dict.scope)
            if local_dict.target is not None:
                self.target = local_dict.target
            self._update(local_dict.level)

        frame = sys._getframe(self.level)

        try:
            # shallow copy here because we don't want to replace what's in
            # scope when we align terms (alignment accesses the underlying
            # numpy array of pandas objects)
            scope_global = self.scope.new_child(
                (global_dict if global_dict is not None else frame.f_globals).copy()
            )
            self.scope = DeepChainMap(scope_global)
            if not isinstance(local_dict, Scope):
                scope_local = self.scope.new_child(
                    (local_dict if local_dict is not None else frame.f_locals).copy()
                )
                self.scope = DeepChainMap(scope_local)
        finally:
            del frame

        # assumes that resolvers are going from outermost scope to inner
        if isinstance(local_dict, Scope):
            resolvers += tuple(local_dict.resolvers.maps)
        self.resolvers = DeepChainMap(*resolvers)
        self.temps = {}

    def __repr__(self) -> str:
        scope_keys = _get_pretty_string(list(self.scope.keys()))
        res_keys = _get_pretty_string(list(self.resolvers.keys()))
        return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})"

    @property
    def has_resolvers(self) -> bool:
        """
        Return whether we have any extra scope.

        For example, DataFrames pass their columns as resolvers during calls to
        ``DataFrame.eval()`` and ``DataFrame.query()``.

        Returns
        -------
        hr : bool
        """
        return bool(len(self.resolvers))

    def resolve(self, key: str, is_local: bool):
        """
        Resolve a variable name in a possibly local context.

        Parameters
        ----------
        key : str
            A variable name
        is_local : bool
            Flag indicating whether the variable is local or not (prefixed with
            the '@' symbol)

        Returns
        -------
        value : object
            The value of a particular variable
        """
        try:
            # only look for locals in outer scope
            if is_local:
                return self.scope[key]

            # not a local variable so check in resolvers if we have them
            if self.has_resolvers:
                return self.resolvers[key]

            # if we're here that means that we have no locals and we also have
            # no resolvers
            assert not is_local and not self.has_resolvers
            return self.scope[key]
        except KeyError:
            try:
                # last ditch effort we look in temporaries
                # these are created when parsing indexing expressions
                # e.g., df[df > 0]
                return self.temps[key]
            except KeyError as err:
                raise UndefinedVariableError(key, is_local) from err
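
    # Illustration: resolve() is what backs '@name' references in
    # DataFrame.query/eval. A hedged usage sketch:
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"a": [1, 2, 3]})
    #     limit = 1
    #     df.query("a > @limit")  # '@limit' resolves to the caller's local
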
    def swapkey(self, old_key: str, new_key: str, new_value=None) -> None:
        """
        Replace a variable name with a potentially new value.

        Parameters
        ----------
        old_key : str
            Current variable name to replace
        new_key : str
            New variable name to replace `old_key` with
        new_value : object
            Value to be replaced along with the possible renaming
        """
        if self.has_resolvers:
            maps = self.resolvers.maps + self.scope.maps
        else:
            maps = self.scope.maps

        maps.append(self.temps)

        for mapping in maps:
            if old_key in mapping:
                mapping[new_key] = new_value
                return

    def _get_vars(self, stack, scopes: list[str]) -> None:
        """
        Get specifically scoped variables from a list of stack frames.

        Parameters
        ----------
        stack : list
            A list of stack frames as returned by ``inspect.stack()``
        scopes : sequence of strings
            A sequence containing valid stack frame attribute names that
            evaluate to a dictionary. For example, ('locals', 'globals')
        """
        variables = itertools.product(scopes, stack)
        for scope, (frame, _, _, _, _, _) in variables:
            try:
                d = getattr(frame, f"f_{scope}")
                self.scope = DeepChainMap(self.scope.new_child(d))
            finally:
                # won't remove it, but DECREF it
                # in Py3 this probably isn't necessary since frame won't be
                # in scope after the loop
                del frame

    def _update(self, level: int) -> None:
        """
        Update the current scope by going back `level` levels.

        Parameters
        ----------
        level : int
        """
        sl = level + 1

        # add sl frames to the scope starting with the
        # most distant and overwriting with more current
        # makes sure that we can capture variable scope
        stack = inspect.stack()

        try:
            self._get_vars(stack[:sl], scopes=["locals"])
        finally:
            del stack[:], stack

    def add_tmp(self, value) -> str:
        """
        Add a temporary variable to the scope.

        Parameters
        ----------
        value : object
            An arbitrary object to be assigned to a temporary variable.

        Returns
        -------
        str
            The name of the temporary variable created.
        """
        name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}"

        # add to inner most scope
        assert name not in self.temps
        self.temps[name] = value
        assert name in self.temps

        # only increment if the variable gets put in the scope
        return name

    @property
    def ntemps(self) -> int:
        """The number of temporary variables in this scope"""
        return len(self.temps)

    @property
    def full_scope(self) -> DeepChainMap:
        """
        Return the full scope for use with passing to engines transparently
        as a mapping.

        Returns
        -------
        vars : DeepChainMap
            All variables in this scope.
        """
        maps = [self.temps] + self.resolvers.maps + self.scope.maps
        return DeepChainMap(*maps)
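

# Illustration: the Scope machinery above is what lets pd.eval pull variables
# from explicit dictionaries as well as from the caller's frame. A hedged
# usage sketch:
#
#     import pandas as pd
#
#     pd.eval("x + y", local_dict={"x": 1, "y": 41})  # -> 42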
941
lib/python3.11/site-packages/pandas/core/config_init.py
Normal file
@@ -0,0 +1,941 @@
"""
This module is imported from the pandas package __init__.py file
in order to ensure that the core.config options registered here will
be available as soon as the user loads the package. If register_option
is invoked inside specific modules, they will not be registered until that
module is imported, which may or may not be a problem.

If you need to make sure options are available even before a certain
module is imported, register them here rather than in the module.

"""
from __future__ import annotations

import os
from typing import (
    Any,
    Callable,
)

import pandas._config.config as cf
from pandas._config.config import (
    is_bool,
    is_callable,
    is_instance_factory,
    is_int,
    is_nonnegative_int,
    is_one_of_factory,
    is_str,
    is_text,
)

# compute

use_bottleneck_doc = """
: bool
    Use the bottleneck library to accelerate computation if it is installed,
    the default is True
    Valid values: False,True
"""


def use_bottleneck_cb(key) -> None:
    from pandas.core import nanops

    nanops.set_use_bottleneck(cf.get_option(key))


use_numexpr_doc = """
: bool
    Use the numexpr library to accelerate computation if it is installed,
    the default is True
    Valid values: False,True
"""


def use_numexpr_cb(key) -> None:
    from pandas.core.computation import expressions

    expressions.set_use_numexpr(cf.get_option(key))


use_numba_doc = """
: bool
    Use the numba engine option for select operations if it is installed,
    the default is False
    Valid values: False,True
"""


def use_numba_cb(key) -> None:
    from pandas.core.util import numba_

    numba_.set_use_numba(cf.get_option(key))


with cf.config_prefix("compute"):
    cf.register_option(
        "use_bottleneck",
        True,
        use_bottleneck_doc,
        validator=is_bool,
        cb=use_bottleneck_cb,
    )
    cf.register_option(
        "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb
    )
    cf.register_option(
        "use_numba", False, use_numba_doc, validator=is_bool, cb=use_numba_cb
    )
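
# Illustration: the registrations above are what make these options visible
# through the public pandas API immediately after import. A hedged sketch:
#
#     import pandas as pd
#
#     pd.get_option("compute.use_numexpr")            # -> True (the default)
#     pd.set_option("compute.use_bottleneck", False)  # triggers the callback
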
#
# options from the "display" namespace

pc_precision_doc = """
: int
    Floating point output precision in terms of number of places after the
    decimal, for regular formatting as well as scientific notation. Similar
    to ``precision`` in :meth:`numpy.set_printoptions`.
"""

pc_colspace_doc = """
: int
    Default space for DataFrame columns.
"""

pc_max_rows_doc = """
: int
    If max_rows is exceeded, switch to truncate view. Depending on
    `large_repr`, objects are either centrally truncated or printed as
    a summary view. 'None' value means unlimited.

    In case python/IPython is running in a terminal and `large_repr`
    equals 'truncate' this can be set to 0 and pandas will auto-detect
    the height of the terminal and print a truncated object which fits
    the screen height. The IPython notebook, IPython qtconsole, or
    IDLE do not run in a terminal and hence it is not possible to do
    correct auto-detection.
"""

pc_min_rows_doc = """
: int
    The numbers of rows to show in a truncated view (when `max_rows` is
    exceeded). Ignored when `max_rows` is set to None or 0. When set to
    None, follows the value of `max_rows`.
"""

pc_max_cols_doc = """
: int
    If max_cols is exceeded, switch to truncate view. Depending on
    `large_repr`, objects are either centrally truncated or printed as
    a summary view. 'None' value means unlimited.

    In case python/IPython is running in a terminal and `large_repr`
    equals 'truncate' this can be set to 0 or None and pandas will auto-detect
    the width of the terminal and print a truncated object which fits
    the screen width. The IPython notebook, IPython qtconsole, or IDLE
    do not run in a terminal and hence it is not possible to do
    correct auto-detection and defaults to 20.
"""

pc_max_categories_doc = """
: int
    This sets the maximum number of categories pandas should output when
    printing out a `Categorical` or a Series of dtype "category".
"""

pc_max_info_cols_doc = """
: int
    max_info_columns is used in DataFrame.info method to decide if
    per column information will be printed.
"""

pc_nb_repr_h_doc = """
: boolean
    When True, IPython notebook will use html representation for
    pandas objects (if it is available).
"""

pc_pprint_nest_depth = """
: int
    Controls the number of nested levels to process when pretty-printing
"""

pc_multi_sparse_doc = """
: boolean
    "sparsify" MultiIndex display (don't display repeated
    elements in outer levels within groups)
"""

float_format_doc = """
: callable
    The callable should accept a floating point number and return
    a string with the desired format of the number. This is used
    in some places like SeriesFormatter.
    See formats.format.EngFormatter for an example.
"""

max_colwidth_doc = """
: int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
"""

colheader_justify_doc = """
: 'left'/'right'
    Controls the justification of column headers. Used by DataFrameFormatter.
"""

pc_expand_repr_doc = """
: boolean
    Whether to print out the full DataFrame repr for wide DataFrames across
    multiple lines, `max_columns` is still respected, but the output will
    wrap-around across multiple "pages" if its width exceeds `display.width`.
"""

pc_show_dimensions_doc = """
: boolean or 'truncate'
    Whether to print out dimensions at the end of DataFrame repr.
    If 'truncate' is specified, only print out the dimensions if the
    frame is truncated (e.g. not display all rows and/or columns)
"""

pc_east_asian_width_doc = """
: boolean
    Whether to use the Unicode East Asian Width to calculate the display text
    width.
    Enabling this may affect performance (default: False)
"""

pc_ambiguous_as_wide_doc = """
: boolean
    Whether to handle Unicode characters belonging to Ambiguous as Wide (width=2)
    (default: False)
"""

pc_table_schema_doc = """
: boolean
    Whether to publish a Table Schema representation for frontends
    that support it.
    (default: False)
"""

pc_html_border_doc = """
: int
    A ``border=value`` attribute is inserted in the ``<table>`` tag
    for the DataFrame HTML repr.
"""

pc_html_use_mathjax_doc = """\
: boolean
    When True, Jupyter notebook will process table contents using MathJax,
    rendering mathematical expressions enclosed by the dollar symbol.
    (default: True)
"""

pc_max_dir_items = """\
: int
    The number of items that will be added to `dir(...)`. 'None' value means
    unlimited. Because dir is cached, changing this option will not immediately
    affect already existing dataframes until a column is deleted or added.

    This is for instance used to suggest columns from a dataframe to tab
    completion.
"""

pc_width_doc = """
: int
    Width of the display in characters. In case python/IPython is running in
    a terminal this can be set to None and pandas will correctly auto-detect
    the width.
    Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a
    terminal and hence it is not possible to correctly detect the width.
"""

pc_chop_threshold_doc = """
: float or None
    if set to a float value, all float values smaller than the given threshold
    will be displayed as exactly 0 by repr and friends.
"""

pc_max_seq_items = """
: int or None
    When pretty-printing a long sequence, no more than `max_seq_items`
    will be printed. If items are omitted, they will be denoted by the
    addition of "..." to the resulting string.

    If set to None, the number of items to be printed is unlimited.
"""

pc_max_info_rows_doc = """
: int
    df.info() will usually show null-counts for each column.
    For large frames this can be quite slow. max_info_rows and max_info_cols
    limit this null check only to frames with smaller dimensions than
    specified.
"""

pc_large_repr_doc = """
: 'truncate'/'info'
    For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can
    show a truncated table, or switch to the view from
    df.info() (the behaviour in earlier versions of pandas).
"""

pc_memory_usage_doc = """
: bool, string or None
    This specifies if the memory usage of a DataFrame should be displayed when
    df.info() is called. Valid values True,False,'deep'
"""


def table_schema_cb(key) -> None:
    from pandas.io.formats.printing import enable_data_resource_formatter

    enable_data_resource_formatter(cf.get_option(key))


def is_terminal() -> bool:
    """
    Detect if Python is running in a terminal.

    Returns True if Python is running in a terminal or False if not.
    """
    try:
        # error: Name 'get_ipython' is not defined
        ip = get_ipython()  # type: ignore[name-defined]
    except NameError:  # assume standard Python interpreter in a terminal
        return True
    else:
        if hasattr(ip, "kernel"):  # IPython as a Jupyter kernel
            return False
        else:  # IPython in a terminal
            return True


with cf.config_prefix("display"):
    cf.register_option("precision", 6, pc_precision_doc, validator=is_nonnegative_int)
    cf.register_option(
        "float_format",
        None,
        float_format_doc,
        validator=is_one_of_factory([None, is_callable]),
    )
    cf.register_option(
        "max_info_rows",
        1690785,
        pc_max_info_rows_doc,
        validator=is_int,
    )
    cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int)
    cf.register_option(
        "min_rows",
        10,
        pc_min_rows_doc,
        validator=is_instance_factory([type(None), int]),
    )
    cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int)

    cf.register_option(
        "max_colwidth",
        50,
        max_colwidth_doc,
        validator=is_nonnegative_int,
    )
    if is_terminal():
        max_cols = 0  # automatically determine optimal number of columns
    else:
        max_cols = 20  # cannot determine optimal number of columns
    cf.register_option(
        "max_columns", max_cols, pc_max_cols_doc, validator=is_nonnegative_int
    )
    cf.register_option(
        "large_repr",
        "truncate",
        pc_large_repr_doc,
        validator=is_one_of_factory(["truncate", "info"]),
    )
    cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int)
    cf.register_option(
        "colheader_justify", "right", colheader_justify_doc, validator=is_text
    )
    cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool)
    cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int)
    cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool)
    cf.register_option("expand_frame_repr", True, pc_expand_repr_doc)
    cf.register_option(
        "show_dimensions",
        "truncate",
        pc_show_dimensions_doc,
        validator=is_one_of_factory([True, False, "truncate"]),
    )
    cf.register_option("chop_threshold", None, pc_chop_threshold_doc)
    cf.register_option("max_seq_items", 100, pc_max_seq_items)
    cf.register_option(
        "width", 80, pc_width_doc, validator=is_instance_factory([type(None), int])
    )
    cf.register_option(
        "memory_usage",
        True,
        pc_memory_usage_doc,
        validator=is_one_of_factory([None, True, False, "deep"]),
    )
    cf.register_option(
        "unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool
    )
    cf.register_option(
        "unicode.ambiguous_as_wide", False, pc_ambiguous_as_wide_doc, validator=is_bool
    )
    cf.register_option(
        "html.table_schema",
        False,
        pc_table_schema_doc,
        validator=is_bool,
        cb=table_schema_cb,
    )
    cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int)
    cf.register_option(
        "html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool
    )
    cf.register_option(
        "max_dir_items", 100, pc_max_dir_items, validator=is_nonnegative_int
    )
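
# Illustration: a hedged sketch of the display options registered above in
# action -- option_context scopes a temporary override:
#
#     import pandas as pd
#
#     df = pd.DataFrame({"a": range(100)})
#     with pd.option_context("display.max_rows", 6, "display.min_rows", 4):
#         print(df)  # truncated repr controlled by max_rows/min_rows
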
tc_sim_interactive_doc = """
: boolean
    Whether to simulate interactive mode for purposes of testing
"""

with cf.config_prefix("mode"):
    cf.register_option("sim_interactive", False, tc_sim_interactive_doc)

use_inf_as_na_doc = """
: boolean
    True means treat None, NaN, INF, -INF as NA (old way),
    False means None and NaN are null, but INF, -INF are not NA
    (new way).

    This option is deprecated in pandas 2.1.0 and will be removed in 3.0.
"""

# We don't want to start importing everything at the global context level
# or we'll hit circular deps.


def use_inf_as_na_cb(key) -> None:
    # TODO(3.0): enforcing this deprecation will close GH#52501
    from pandas.core.dtypes.missing import _use_inf_as_na

    _use_inf_as_na(key)


with cf.config_prefix("mode"):
    cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb)

cf.deprecate_option(
    # GH#51684
    "mode.use_inf_as_na",
    "use_inf_as_na option is deprecated and will be removed in a future "
    "version. Convert inf values to NaN before operating instead.",
)

data_manager_doc = """
: string
    Internal data manager type; can be "block" or "array". Defaults to "block",
    unless overridden by the 'PANDAS_DATA_MANAGER' environment variable (needs
    to be set before pandas is imported).
"""


with cf.config_prefix("mode"):
    cf.register_option(
        "data_manager",
        # Get the default from an environment variable, if set, otherwise defaults
        # to "block". This environment variable can be set for testing.
        os.environ.get("PANDAS_DATA_MANAGER", "block"),
        data_manager_doc,
        validator=is_one_of_factory(["block", "array"]),
    )

cf.deprecate_option(
    # GH#55043
    "mode.data_manager",
    "data_manager option is deprecated and will be removed in a future "
    "version. Only the BlockManager will be available.",
)


# TODO better name?
copy_on_write_doc = """
: bool
    Use new copy-view behaviour using Copy-on-Write. Defaults to False,
    unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable
    (if set to "1" for True, needs to be set before pandas is imported).
"""


with cf.config_prefix("mode"):
    cf.register_option(
        "copy_on_write",
        # Get the default from an environment variable, if set, otherwise defaults
        # to False. This environment variable can be set for testing.
        "warn"
        if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn"
        else os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
        copy_on_write_doc,
        validator=is_one_of_factory([True, False, "warn"]),
    )
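
# Illustration: a hedged sketch of enabling Copy-on-Write via the option
# registered above (the env var must be set before import to change the
# default):
#
#     import pandas as pd
#
#     pd.set_option("mode.copy_on_write", True)
#     df = pd.DataFrame({"a": [1, 2, 3]})
#     view = df["a"]
#     view.iloc[0] = 99     # under CoW this writes to a copy, not to df
#     df["a"].iloc[0]       # -> still 1
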
# user warnings
chained_assignment = """
: string
    Raise an exception, warn, or take no action if trying to use chained
    assignment. The default is warn.
"""

with cf.config_prefix("mode"):
    cf.register_option(
        "chained_assignment",
        "warn",
        chained_assignment,
        validator=is_one_of_factory([None, "warn", "raise"]),
    )


string_storage_doc = """
: string
    The default storage for StringDtype.
"""


def is_valid_string_storage(value: Any) -> None:
    legal_values = ["auto", "python", "pyarrow"]
    if value not in legal_values:
        msg = "Value must be one of auto|python|pyarrow"
        if value == "pyarrow_numpy":
            # TODO: we can remove extra message after 3.0
            msg += (
                ". 'pyarrow_numpy' was specified, but this option should be "
                "enabled using pandas.options.future.infer_string instead"
            )
        raise ValueError(msg)


with cf.config_prefix("mode"):
    cf.register_option(
        "string_storage",
        "auto",
        string_storage_doc,
        # validator=is_one_of_factory(["python", "pyarrow"]),
        validator=is_valid_string_storage,
    )


# Set up the io.excel specific reader configuration.
reader_engine_doc = """
: string
    The default Excel reader engine for '{ext}' files. Available options:
    auto, {others}.
"""

_xls_options = ["xlrd", "calamine"]
_xlsm_options = ["xlrd", "openpyxl", "calamine"]
_xlsx_options = ["xlrd", "openpyxl", "calamine"]
_ods_options = ["odf", "calamine"]
_xlsb_options = ["pyxlsb", "calamine"]


with cf.config_prefix("io.excel.xls"):
    cf.register_option(
        "reader",
        "auto",
        reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
        validator=is_one_of_factory(_xls_options + ["auto"]),
    )

with cf.config_prefix("io.excel.xlsm"):
    cf.register_option(
        "reader",
        "auto",
        reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
        validator=is_one_of_factory(_xlsm_options + ["auto"]),
    )


with cf.config_prefix("io.excel.xlsx"):
    cf.register_option(
        "reader",
        "auto",
        reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
        validator=is_one_of_factory(_xlsx_options + ["auto"]),
    )


with cf.config_prefix("io.excel.ods"):
    cf.register_option(
        "reader",
        "auto",
        reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
        validator=is_one_of_factory(_ods_options + ["auto"]),
    )

with cf.config_prefix("io.excel.xlsb"):
    cf.register_option(
        "reader",
        "auto",
        reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)),
        validator=is_one_of_factory(_xlsb_options + ["auto"]),
    )

# Set up the io.excel specific writer configuration.
writer_engine_doc = """
: string
    The default Excel writer engine for '{ext}' files. Available options:
    auto, {others}.
"""

_xlsm_options = ["openpyxl"]
_xlsx_options = ["openpyxl", "xlsxwriter"]
_ods_options = ["odf"]


with cf.config_prefix("io.excel.xlsm"):
    cf.register_option(
        "writer",
        "auto",
        writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
        validator=str,
    )


with cf.config_prefix("io.excel.xlsx"):
    cf.register_option(
        "writer",
        "auto",
        writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
        validator=str,
    )


with cf.config_prefix("io.excel.ods"):
    cf.register_option(
        "writer",
        "auto",
        writer_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
        validator=str,
    )


# Set up the io.parquet specific configuration.
parquet_engine_doc = """
: string
    The default parquet reader/writer engine. Available options:
    'auto', 'pyarrow', 'fastparquet', the default is 'auto'
"""

with cf.config_prefix("io.parquet"):
    cf.register_option(
        "engine",
        "auto",
        parquet_engine_doc,
        validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]),
    )


# Set up the io.sql specific configuration.
sql_engine_doc = """
: string
    The default sql reader/writer engine. Available options:
    'auto', 'sqlalchemy', the default is 'auto'
"""

with cf.config_prefix("io.sql"):
    cf.register_option(
        "engine",
        "auto",
        sql_engine_doc,
        validator=is_one_of_factory(["auto", "sqlalchemy"]),
    )

# --------
# Plotting
# --------

plotting_backend_doc = """
: str
    The plotting backend to use. The default value is "matplotlib", the
    backend provided with pandas. Other backends can be specified by
    providing the name of the module that implements the backend.
"""


def register_plotting_backend_cb(key) -> None:
    if key == "matplotlib":
        # We defer matplotlib validation, since it's the default
        return
    from pandas.plotting._core import _get_plot_backend

    _get_plot_backend(key)


with cf.config_prefix("plotting"):
    cf.register_option(
        "backend",
        defval="matplotlib",
        doc=plotting_backend_doc,
        validator=register_plotting_backend_cb,
    )


register_converter_doc = """
: bool or 'auto'.
    Whether to register converters with matplotlib's units registry for
    dates, times, datetimes, and Periods. Toggling to False will remove
    the converters, restoring any converters that pandas overwrote.
"""


def register_converter_cb(key) -> None:
    from pandas.plotting import (
        deregister_matplotlib_converters,
        register_matplotlib_converters,
    )

    if cf.get_option(key):
        register_matplotlib_converters()
    else:
        deregister_matplotlib_converters()


with cf.config_prefix("plotting.matplotlib"):
    cf.register_option(
        "register_converters",
        "auto",
        register_converter_doc,
        validator=is_one_of_factory(["auto", True, False]),
        cb=register_converter_cb,
    )

# ------
# Styler
# ------

styler_sparse_index_doc = """
: bool
    Whether to sparsify the display of a hierarchical index. Setting to False will
    display each explicit level element in a hierarchical key for each row.
"""

styler_sparse_columns_doc = """
: bool
    Whether to sparsify the display of hierarchical columns. Setting to False will
    display each explicit level element in a hierarchical key for each column.
"""

styler_render_repr = """
: str
    Determine which output to use in Jupyter Notebook in {"html", "latex"}.
"""

styler_max_elements = """
: int
    The maximum number of data-cell (<td>) elements that will be rendered before
    trimming will occur over columns, rows or both if needed.
"""

styler_max_rows = """
: int, optional
    The maximum number of rows that will be rendered. May still be reduced to
    satisfy ``max_elements``, which takes precedence.
"""

styler_max_columns = """
: int, optional
    The maximum number of columns that will be rendered. May still be reduced to
    satisfy ``max_elements``, which takes precedence.
"""

styler_precision = """
: int
    The precision for floats and complex numbers.
"""

styler_decimal = """
: str
    The character representation for the decimal separator for floats and complex.
"""

styler_thousands = """
: str, optional
    The character representation for thousands separator for floats, int and complex.
"""

styler_na_rep = """
: str, optional
    The string representation for values identified as missing.
"""

styler_escape = """
: str, optional
    Whether to escape certain characters according to the given context; html or latex.
"""

styler_formatter = """
: str, callable, dict, optional
    A formatter object to be used as default within ``Styler.format``.
"""

styler_multirow_align = """
: {"c", "t", "b"}
    The specifier for vertical alignment of sparsified LaTeX multirows.
"""

styler_multicol_align = r"""
: {"r", "c", "l", "naive-l", "naive-r"}
    The specifier for horizontal alignment of sparsified LaTeX multicolumns. Pipe
    decorators can also be added to non-naive values to draw vertical
    rules, e.g. "\|r" will draw a rule on the left side of right aligned merged cells.
"""

styler_hrules = """
: bool
    Whether to add horizontal rules on top and bottom and below the headers.
"""

styler_environment = """
: str
    The environment to replace ``\\begin{table}``. If "longtable" is used, results
    in a specific longtable environment format.
"""

styler_encoding = """
: str
    The encoding used for output HTML and LaTeX files.
"""

styler_mathjax = """
: bool
    If False will render special CSS classes to table attributes that indicate Mathjax
    will not be used in Jupyter Notebook.
"""

with cf.config_prefix("styler"):
    cf.register_option("sparse.index", True, styler_sparse_index_doc, validator=is_bool)

    cf.register_option(
        "sparse.columns", True, styler_sparse_columns_doc, validator=is_bool
    )

    cf.register_option(
        "render.repr",
        "html",
        styler_render_repr,
        validator=is_one_of_factory(["html", "latex"]),
    )

    cf.register_option(
        "render.max_elements",
        2**18,
        styler_max_elements,
        validator=is_nonnegative_int,
    )

    cf.register_option(
        "render.max_rows",
        None,
        styler_max_rows,
        validator=is_nonnegative_int,
    )

    cf.register_option(
        "render.max_columns",
        None,
        styler_max_columns,
        validator=is_nonnegative_int,
    )

    cf.register_option("render.encoding", "utf-8", styler_encoding, validator=is_str)

    cf.register_option("format.decimal", ".", styler_decimal, validator=is_str)

    cf.register_option(
        "format.precision", 6, styler_precision, validator=is_nonnegative_int
    )

    cf.register_option(
        "format.thousands",
        None,
        styler_thousands,
        validator=is_instance_factory([type(None), str]),
    )

    cf.register_option(
        "format.na_rep",
        None,
        styler_na_rep,
        validator=is_instance_factory([type(None), str]),
    )

    cf.register_option(
        "format.escape",
        None,
        styler_escape,
        validator=is_one_of_factory([None, "html", "latex", "latex-math"]),
    )

    cf.register_option(
        "format.formatter",
        None,
        styler_formatter,
        validator=is_instance_factory([type(None), dict, Callable, str]),
    )

    cf.register_option("html.mathjax", True, styler_mathjax, validator=is_bool)

    cf.register_option(
        "latex.multirow_align",
        "c",
        styler_multirow_align,
        validator=is_one_of_factory(["c", "t", "b", "naive"]),
    )

    val_mca = ["r", "|r|", "|r", "r|", "c", "|c|", "|c", "c|", "l", "|l|", "|l", "l|"]
    val_mca += ["naive-l", "naive-r"]
    cf.register_option(
        "latex.multicol_align",
        "r",
        styler_multicol_align,
        validator=is_one_of_factory(val_mca),
    )

    cf.register_option("latex.hrules", False, styler_hrules, validator=is_bool)

    cf.register_option(
        "latex.environment",
        None,
        styler_environment,
        validator=is_instance_factory([type(None), str]),
    )
with cf.config_prefix("future"):
|
||||
cf.register_option(
|
||||
"infer_string",
|
||||
True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False,
|
||||
"Whether to infer sequence of str objects as pyarrow string "
|
||||
"dtype, which will be the default in pandas 3.0 "
|
||||
"(at which point this option will be deprecated).",
|
||||
validator=is_one_of_factory([True, False]),
|
||||
)
|
||||
|
||||
cf.register_option(
|
||||
"no_silent_downcasting",
|
||||
False,
|
||||
"Whether to opt-in to the future behavior which will *not* silently "
|
||||
"downcast results from Series and DataFrame `where`, `mask`, and `clip` "
|
||||
"methods. "
|
||||
"Silent downcasting will be removed in pandas 3.0 "
|
||||
"(at which point this option will be deprecated).",
|
||||
validator=is_one_of_factory([True, False]),
|
||||
)
|
||||
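
# A minimal usage sketch (editorial addition; assumes a pandas build with the
# options registered above). The keys combine the config prefix with the
# option name and are accessed through the public options API:
#
#   >>> import pandas as pd
#   >>> pd.set_option("styler.format.precision", 3)
#   >>> with pd.option_context("future.no_silent_downcasting", True):
#   ...     pd.get_option("future.no_silent_downcasting")
#   True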
821
lib/python3.11/site-packages/pandas/core/construction.py
Normal file
@ -0,0 +1,821 @@
"""
Constructor functions intended to be shared by pd.array, Series.__init__,
and Index.__new__.

These should not depend on core.internals.
"""
from __future__ import annotations

from collections.abc import Sequence
from typing import (
    TYPE_CHECKING,
    Optional,
    Union,
    cast,
    overload,
)
import warnings

import numpy as np
from numpy import ma

from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas._libs.tslibs import (
    Period,
    get_supported_dtype,
    is_supported_dtype,
)
from pandas._typing import (
    AnyArrayLike,
    ArrayLike,
    Dtype,
    DtypeObj,
    T,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import (
    construct_1d_arraylike_from_scalar,
    construct_1d_object_array_from_listlike,
    maybe_cast_to_datetime,
    maybe_cast_to_integer_array,
    maybe_convert_platform,
    maybe_infer_to_datetimelike,
    maybe_promote,
)
from pandas.core.dtypes.common import (
    is_list_like,
    is_object_dtype,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCExtensionArray,
    ABCIndex,
    ABCSeries,
)
from pandas.core.dtypes.missing import isna

import pandas.core.common as com

if TYPE_CHECKING:
    from pandas import (
        Index,
        Series,
    )
    from pandas.core.arrays.base import ExtensionArray


def array(
    data: Sequence[object] | AnyArrayLike,
    dtype: Dtype | None = None,
    copy: bool = True,
) -> ExtensionArray:
    """
    Create an array.

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =======================================
        Scalar Type                    Array Type
        ============================== =======================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        :class:`int`                   :class:`pandas.arrays.IntegerArray`
        :class:`float`                 :class:`pandas.arrays.FloatingArray`
        :class:`str`                   :class:`pandas.arrays.StringArray` or
                                       :class:`pandas.arrays.ArrowStringArray`
        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
        ============================== =======================================

        The ExtensionArray created when the scalar type is :class:`str` is determined by
        ``pd.options.mode.string_storage`` if the dtype is not explicitly given.

        For all other cases, NumPy's usual inference rules will be used.
    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.NumpyExtensionArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.NumpyExtensionArray` backed by a
    NumPy array.

    >>> pd.array(['a', 'b'], dtype=str)
    <NumpyExtensionArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <NumpyExtensionArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
    passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
    rather than a ``NumpyExtensionArray``. This is for symmetry with the case of
    timezone-aware data, which NumPy does not natively support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1h", "2h"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['0 days 01:00:00', '0 days 02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    If a dtype is not specified, pandas will infer the best dtype from the values.
    See the description of `dtype` for the types pandas infers for.

    >>> pd.array([1, 2])
    <IntegerArray>
    [1, 2]
    Length: 2, dtype: Int64

    >>> pd.array([1, 2, np.nan])
    <IntegerArray>
    [1, 2, <NA>]
    Length: 3, dtype: Int64

    >>> pd.array([1.1, 2.2])
    <FloatingArray>
    [1.1, 2.2]
    Length: 2, dtype: Float64

    >>> pd.array(["a", None, "c"])
    <StringArray>
    ['a', <NA>, 'c']
    Length: 3, dtype: string

    >>> with pd.option_context("string_storage", "pyarrow"):
    ...     arr = pd.array(["a", None, "c"])
    ...
    >>> arr
    <ArrowStringArray>
    ['a', <NA>, 'c']
    Length: 3, dtype: string

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    ['a', 'b', 'a']
    Categories (2, object): ['a', 'b']

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    ['a', 'b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']

    If pandas does not infer a dedicated extension type a
    :class:`arrays.NumpyExtensionArray` is returned.

    >>> pd.array([1 + 1j, 3 + 2j])
    <NumpyExtensionArray>
    [(1+1j), (3+2j)]
    Length: 2, dtype: complex128

    As mentioned in the "Notes" section, new extension types may be added
    in the future (by pandas or 3rd party libraries), causing the return
    value to no longer be a :class:`arrays.NumpyExtensionArray`. Specify the
    `dtype` as a NumPy dtype if you need to ensure there's no future change in
    behavior.

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <NumpyExtensionArray>
    [1, 2]
    Length: 2, dtype: int32

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    from pandas.core.arrays import (
        BooleanArray,
        DatetimeArray,
        ExtensionArray,
        FloatingArray,
        IntegerArray,
        IntervalArray,
        NumpyExtensionArray,
        PeriodArray,
        TimedeltaArray,
    )
    from pandas.core.arrays.string_ import StringDtype

    if lib.is_scalar(data):
        msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
        raise ValueError(msg)
    elif isinstance(data, ABCDataFrame):
        raise TypeError("Cannot pass DataFrame to 'pandas.array'")

    if dtype is None and isinstance(data, (ABCSeries, ABCIndex, ExtensionArray)):
        # Note: we exclude np.ndarray here, will do type inference on it
        dtype = data.dtype

    data = extract_array(data, extract_numpy=True)

    # this returns None for not-found dtypes.
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ExtensionArray) and (dtype is None or data.dtype == dtype):
        # e.g. TimedeltaArray[s], avoid casting to NumpyExtensionArray
        if copy:
            return data.copy()
        return data

    if isinstance(dtype, ExtensionDtype):
        cls = dtype.construct_array_type()
        return cls._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        inferred_dtype = lib.infer_dtype(data, skipna=True)
        if inferred_dtype == "period":
            period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data)
            return PeriodArray._from_sequence(period_data, copy=copy)

        elif inferred_dtype == "interval":
            return IntervalArray(data, copy=copy)

        elif inferred_dtype.startswith("datetime"):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to NumpyExtensionArray
                pass

        elif inferred_dtype.startswith("timedelta"):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)

        elif inferred_dtype == "string":
            # StringArray/ArrowStringArray depending on pd.options.mode.string_storage
            dtype = StringDtype()
            cls = dtype.construct_array_type()
            return cls._from_sequence(data, dtype=dtype, copy=copy)

        elif inferred_dtype == "integer":
            return IntegerArray._from_sequence(data, copy=copy)
        elif inferred_dtype == "empty" and not hasattr(data, "dtype") and not len(data):
            return FloatingArray._from_sequence(data, copy=copy)
        elif (
            inferred_dtype in ("floating", "mixed-integer-float")
            and getattr(data, "dtype", None) != np.float16
        ):
            # GH#44715 Exclude np.float16 bc FloatingArray does not support it;
            # we will fall back to NumpyExtensionArray.
            return FloatingArray._from_sequence(data, copy=copy)

        elif inferred_dtype == "boolean":
            return BooleanArray._from_sequence(data, dtype="boolean", copy=copy)

    # Pandas overrides NumPy for
    # 1. datetime64[ns,us,ms,s]
    # 2. timedelta64[ns,us,ms,s]
    # so that a DatetimeArray is returned.
    if lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    if lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    elif lib.is_np_dtype(dtype, "mM"):
        warnings.warn(
            r"datetime64 and timedelta64 dtype resolutions other than "
            r"'s', 'ms', 'us', and 'ns' are deprecated. "
            r"In future releases passing unsupported resolutions will "
            r"raise an exception.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    return NumpyExtensionArray._from_sequence(data, dtype=dtype, copy=copy)

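# A short, hedged illustration of the dispatch above (editorial addition;
# behavior as documented in the docstring, exact reprs may vary by version):
#
#   >>> import pandas as pd
#   >>> pd.array([1, 2, None]).dtype       # integer inference -> IntegerArray
#   Int64Dtype()
#   >>> pd.array([1.5, 2.5]).dtype         # floating inference -> FloatingArray
#   Float64Dtype()
#   >>> pd.array([1, 2], dtype="float32")  # NumPy dtype -> NumpyExtensionArray
#   <NumpyExtensionArray>
#   [1.0, 2.0]
#   Length: 2, dtype: float32
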
_typs = frozenset(
    {
        "index",
        "rangeindex",
        "multiindex",
        "datetimeindex",
        "timedeltaindex",
        "periodindex",
        "categoricalindex",
        "intervalindex",
        "series",
    }
)


@overload
def extract_array(
    obj: Series | Index, extract_numpy: bool = ..., extract_range: bool = ...
) -> ArrayLike:
    ...


@overload
def extract_array(
    obj: T, extract_numpy: bool = ..., extract_range: bool = ...
) -> T | ArrayLike:
    ...


def extract_array(
    obj: T, extract_numpy: bool = False, extract_range: bool = False
) -> T | ArrayLike:
    """
    Extract the ndarray or ExtensionArray from a Series or Index.

    For all other types, `obj` is just returned as is.

    Parameters
    ----------
    obj : object
        For Series / Index, the underlying ExtensionArray is unboxed.

    extract_numpy : bool, default False
        Whether to extract the ndarray from a NumpyExtensionArray.

    extract_range : bool, default False
        If we have a RangeIndex, return range._values if True
        (which is a materialized integer ndarray), otherwise return unchanged.

    Returns
    -------
    arr : object

    Examples
    --------
    >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
    ['a', 'b', 'c']
    Categories (3, object): ['a', 'b', 'c']

    Other objects like lists, arrays, and DataFrames are just passed through.

    >>> extract_array([1, 2, 3])
    [1, 2, 3]

    For an ndarray-backed Series / Index the ndarray is returned.

    >>> extract_array(pd.Series([1, 2, 3]))
    array([1, 2, 3])

    To extract all the way down to the ndarray, pass ``extract_numpy=True``.

    >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
    array([1, 2, 3])
    """
    typ = getattr(obj, "_typ", None)
    if typ in _typs:
        # i.e. isinstance(obj, (ABCIndex, ABCSeries))
        if typ == "rangeindex":
            if extract_range:
                # error: "T" has no attribute "_values"
                return obj._values  # type: ignore[attr-defined]
            return obj

        # error: "T" has no attribute "_values"
        return obj._values  # type: ignore[attr-defined]

    elif extract_numpy and typ == "npy_extension":
        # i.e. isinstance(obj, ABCNumpyExtensionArray)
        # error: "T" has no attribute "to_numpy"
        return obj.to_numpy()  # type: ignore[attr-defined]

    return obj

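# Hedged usage sketch for extract_array (a private helper; the import path is
# an assumption that may change between pandas versions):
#
#   >>> import pandas as pd
#   >>> from pandas.core.construction import extract_array
#   >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
#   array([1, 2, 3])
#   >>> extract_array(pd.RangeIndex(3), extract_range=True)
#   array([0, 1, 2])
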
def ensure_wrapped_if_datetimelike(arr):
    """
    Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.
    """
    if isinstance(arr, np.ndarray):
        if arr.dtype.kind == "M":
            from pandas.core.arrays import DatetimeArray

            dtype = get_supported_dtype(arr.dtype)
            return DatetimeArray._from_sequence(arr, dtype=dtype)

        elif arr.dtype.kind == "m":
            from pandas.core.arrays import TimedeltaArray

            dtype = get_supported_dtype(arr.dtype)
            return TimedeltaArray._from_sequence(arr, dtype=dtype)

    return arr


def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray:
    """
    Convert numpy MaskedArray to ensure mask is softened.
    """
    mask = ma.getmaskarray(data)
    if mask.any():
        dtype, fill_value = maybe_promote(data.dtype, np.nan)
        dtype = cast(np.dtype, dtype)
        data = ma.asarray(data.astype(dtype, copy=True))
        data.soften_mask()  # set hardmask False if it was True
        data[mask] = fill_value
    else:
        data = data.copy()
    return data

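# Illustration of the two helpers above (editorial addition; private API, so
# a sketch rather than a stable contract):
#
#   >>> import numpy as np
#   >>> from pandas.core.construction import (
#   ...     ensure_wrapped_if_datetimelike, sanitize_masked_array)
#   >>> type(ensure_wrapped_if_datetimelike(np.array(["2020"], dtype="M8[ns]"))).__name__
#   'DatetimeArray'
#   >>> sanitize_masked_array(np.ma.masked_array([1, 2, 3], mask=[False, True, False]))
#   array([ 1., nan,  3.])
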
def sanitize_array(
    data,
    index: Index | None,
    dtype: DtypeObj | None = None,
    copy: bool = False,
    *,
    allow_2d: bool = False,
) -> ArrayLike:
    """
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.

    Parameters
    ----------
    data : Any
    index : Index or None, default None
    dtype : np.dtype, ExtensionDtype, or None, default None
    copy : bool, default False
    allow_2d : bool, default False
        If False, raise if we have a 2D Arraylike.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    original_dtype = dtype
    if isinstance(data, ma.MaskedArray):
        data = sanitize_masked_array(data)

    if isinstance(dtype, NumpyEADtype):
        # Avoid ending up with a NumpyExtensionArray
        dtype = dtype.numpy_dtype

    object_index = False
    if isinstance(data, ABCIndex) and data.dtype == object and dtype is None:
        object_index = True

    # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray
    data = extract_array(data, extract_numpy=True, extract_range=True)

    if isinstance(data, np.ndarray) and data.ndim == 0:
        if dtype is None:
            dtype = data.dtype
        data = lib.item_from_zerodim(data)
    elif isinstance(data, range):
        # GH#16804
        data = range_to_ndarray(data)
        copy = False

    if not is_list_like(data):
        if index is None:
            raise ValueError("index must be specified when data is not list-like")
        if isinstance(data, str) and using_string_dtype() and original_dtype is None:
            from pandas.core.arrays.string_ import StringDtype

            dtype = StringDtype(na_value=np.nan)
        data = construct_1d_arraylike_from_scalar(data, len(index), dtype)

        return data

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a NumpyExtensionArray
        # Until GH#49309 is fixed this check needs to come before the
        # ExtensionDtype check
        if dtype is not None:
            subarr = data.astype(dtype, copy=copy)
        elif copy:
            subarr = data.copy()
        else:
            subarr = data

    elif isinstance(dtype, ExtensionDtype):
        # create an extension array from its dtype
        _sanitize_non_ordered(data)
        cls = dtype.construct_array_type()
        if not hasattr(data, "__array__"):
            data = list(data)
        subarr = cls._from_sequence(data, dtype=dtype, copy=copy)

    # GH#846
    elif isinstance(data, np.ndarray):
        if isinstance(data, np.matrix):
            data = data.A

        if dtype is None:
            subarr = data
            if data.dtype == object:
                subarr = maybe_infer_to_datetimelike(data)
                if object_index and using_string_dtype() and is_string_dtype(subarr):
                    # Avoid inference when string option is set
                    subarr = data
            elif data.dtype.kind == "U" and using_string_dtype():
                from pandas.core.arrays.string_ import StringDtype

                dtype = StringDtype(na_value=np.nan)
                subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)

            if (
                subarr is data
                or (subarr.dtype == "str" and subarr.dtype.storage == "python")  # type: ignore[union-attr]
            ) and copy:
                subarr = subarr.copy()

        else:
            # we will try to copy by-definition here
            subarr = _try_cast(data, dtype, copy)

    elif hasattr(data, "__array__"):
        # e.g. dask array GH#38645
        if not copy:
            data = np.asarray(data)
        else:
            data = np.array(data, copy=copy)
        return sanitize_array(
            data,
            index=index,
            dtype=dtype,
            copy=False,
            allow_2d=allow_2d,
        )

    else:
        _sanitize_non_ordered(data)
        # materialize e.g. generators, convert e.g. tuples, abc.ValueView
        data = list(data)

        if len(data) == 0 and dtype is None:
            # We default to float64, matching numpy
            subarr = np.array([], dtype=np.float64)

        elif dtype is not None:
            subarr = _try_cast(data, dtype, copy)

        else:
            subarr = maybe_convert_platform(data)
            if subarr.dtype == object:
                subarr = cast(np.ndarray, subarr)
                subarr = maybe_infer_to_datetimelike(subarr)

    subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)

    if isinstance(subarr, np.ndarray):
        # at this point we should have dtype be None or subarr.dtype == dtype
        dtype = cast(np.dtype, dtype)
        subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)

    return subarr

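# sanitize_array is the workhorse behind Series/Index construction; a hedged
# sketch of its observable effect through the public constructor:
#
#   >>> import pandas as pd
#   >>> pd.Series({1, 2, 3})                   # sets are rejected as unordered
#   Traceback (most recent call last):
#   ...
#   TypeError: 'set' type is unordered
#   >>> pd.Series(5, index=range(3)).tolist()  # scalar broadcast over the index
#   [5, 5, 5]
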
def range_to_ndarray(rng: range) -> np.ndarray:
    """
    Cast a range object to ndarray.
    """
    # GH#30171 perf avoid realizing range as a list in np.array
    try:
        arr = np.arange(rng.start, rng.stop, rng.step, dtype="int64")
    except OverflowError:
        # GH#30173 handling for ranges that overflow int64
        if (rng.start >= 0 and rng.step > 0) or (rng.step < 0 <= rng.stop):
            try:
                arr = np.arange(rng.start, rng.stop, rng.step, dtype="uint64")
            except OverflowError:
                arr = construct_1d_object_array_from_listlike(list(rng))
        else:
            arr = construct_1d_object_array_from_listlike(list(rng))
    return arr

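# The int64 -> uint64 -> object fallback above can be observed through the
# public constructor (a sketch; exact dtypes assume a 64-bit platform):
#
#   >>> import pandas as pd
#   >>> pd.Series(range(3)).dtype
#   dtype('int64')
#   >>> pd.Series(range(2**63, 2**63 + 2)).dtype   # overflows int64 -> uint64
#   dtype('uint64')
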
def _sanitize_non_ordered(data) -> None:
    """
    Raise only for unordered sets, e.g., not for dict_keys
    """
    if isinstance(data, (set, frozenset)):
        raise TypeError(f"'{type(data).__name__}' type is unordered")


def _sanitize_ndim(
    result: ArrayLike,
    data,
    dtype: DtypeObj | None,
    index: Index | None,
    *,
    allow_2d: bool = False,
) -> ArrayLike:
    """
    Ensure we have a 1-dimensional result array.
    """
    if getattr(result, "ndim", 0) == 0:
        raise ValueError("result should be arraylike with ndim > 0")

    if result.ndim == 1:
        # the result that we want
        result = _maybe_repeat(result, index)

    elif result.ndim > 1:
        if isinstance(data, np.ndarray):
            if allow_2d:
                return result
            raise ValueError(
                f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead"
            )
        if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
            # i.e. NumpyEADtype("O")

            result = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
            cls = dtype.construct_array_type()
            result = cls._from_sequence(result, dtype=dtype)
        else:
            # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type
            # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str,
            # dtype[Any], None]"
            result = com.asarray_tuplesafe(data, dtype=dtype)  # type: ignore[arg-type]
    return result


def _sanitize_str_dtypes(
    result: np.ndarray, data, dtype: np.dtype | None, copy: bool
) -> np.ndarray:
    """
    Ensure we have a dtype that is supported by pandas.
    """

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(result.dtype.type, str):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, result has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                data = np.asarray(data, dtype=dtype)
            if not copy:
                result = np.asarray(data, dtype=object)
            else:
                result = np.array(data, dtype=object, copy=copy)
    return result


def _maybe_repeat(arr: ArrayLike, index: Index | None) -> ArrayLike:
    """
    If we have a length-1 array and an index describing how long we expect
    the result to be, repeat the array.
    """
    if index is not None:
        if 1 == len(arr) != len(index):
            arr = arr.repeat(len(index))
    return arr


def _try_cast(
    arr: list | np.ndarray,
    dtype: np.dtype,
    copy: bool,
) -> ArrayLike:
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype
    copy : bool
        If False, don't copy the data if not needed.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    is_ndarray = isinstance(arr, np.ndarray)

    if dtype == object:
        if not is_ndarray:
            subarr = construct_1d_object_array_from_listlike(arr)
            return subarr
        return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)

    elif dtype.kind == "U":
        # TODO: test cases with arr.dtype.kind in "mM"
        if is_ndarray:
            arr = cast(np.ndarray, arr)
            shape = arr.shape
            if arr.ndim > 1:
                arr = arr.ravel()
        else:
            shape = (len(arr),)
        return lib.ensure_string_array(arr, convert_na_value=False, copy=copy).reshape(
            shape
        )

    elif dtype.kind in "mM":
        return maybe_cast_to_datetime(arr, dtype)

    # GH#15832: Check if we are requesting a numeric dtype and
    # that we can convert the data to the requested dtype.
    elif dtype.kind in "iu":
        # this will raise if we have e.g. floats

        subarr = maybe_cast_to_integer_array(arr, dtype)
    elif not copy:
        subarr = np.asarray(arr, dtype=dtype)
    else:
        subarr = np.array(arr, dtype=dtype, copy=copy)

    return subarr
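
# End-to-end effect of _try_cast's integer branch (maybe_cast_to_integer_array
# raises rather than silently truncating floats); a hedged sketch, the exact
# error message may vary by pandas version:
#
#   >>> import pandas as pd
#   >>> pd.Series([1.5, 2.5], dtype="int64")
#   Traceback (most recent call last):
#   ...
#   ValueError: Trying to coerce float values to integers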
85
lib/python3.11/site-packages/pandas/core/dtypes/api.py
Normal file
@ -0,0 +1,85 @@
from pandas.core.dtypes.common import (
    is_any_real_numeric_dtype,
    is_array_like,
    is_bool,
    is_bool_dtype,
    is_categorical_dtype,
    is_complex,
    is_complex_dtype,
    is_datetime64_any_dtype,
    is_datetime64_dtype,
    is_datetime64_ns_dtype,
    is_datetime64tz_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_file_like,
    is_float,
    is_float_dtype,
    is_hashable,
    is_int64_dtype,
    is_integer,
    is_integer_dtype,
    is_interval,
    is_interval_dtype,
    is_iterator,
    is_list_like,
    is_named_tuple,
    is_number,
    is_numeric_dtype,
    is_object_dtype,
    is_period_dtype,
    is_re,
    is_re_compilable,
    is_scalar,
    is_signed_integer_dtype,
    is_sparse,
    is_string_dtype,
    is_timedelta64_dtype,
    is_timedelta64_ns_dtype,
    is_unsigned_integer_dtype,
    pandas_dtype,
)

__all__ = [
    "is_any_real_numeric_dtype",
    "is_array_like",
    "is_bool",
    "is_bool_dtype",
    "is_categorical_dtype",
    "is_complex",
    "is_complex_dtype",
    "is_datetime64_any_dtype",
    "is_datetime64_dtype",
    "is_datetime64_ns_dtype",
    "is_datetime64tz_dtype",
    "is_dict_like",
    "is_dtype_equal",
    "is_extension_array_dtype",
    "is_file_like",
    "is_float",
    "is_float_dtype",
    "is_hashable",
    "is_int64_dtype",
    "is_integer",
    "is_integer_dtype",
    "is_interval",
    "is_interval_dtype",
    "is_iterator",
    "is_list_like",
    "is_named_tuple",
    "is_number",
    "is_numeric_dtype",
    "is_object_dtype",
    "is_period_dtype",
    "is_re",
    "is_re_compilable",
    "is_scalar",
    "is_signed_integer_dtype",
    "is_sparse",
    "is_string_dtype",
    "is_timedelta64_dtype",
    "is_timedelta64_ns_dtype",
    "is_unsigned_integer_dtype",
    "pandas_dtype",
]
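
# These re-exports form part of the public pandas.api.types surface; a brief
# usage sketch (editorial addition):
#
#   >>> import pandas as pd
#   >>> from pandas.api.types import is_integer_dtype, is_list_like
#   >>> is_integer_dtype(pd.Series([1, 2]).dtype)
#   True
#   >>> is_list_like("a string")   # strings are not treated as list-like
#   False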
301
lib/python3.11/site-packages/pandas/core/dtypes/astype.py
Normal file
@ -0,0 +1,301 @@
"""
Functions for implementing 'astype' methods according to pandas conventions,
particularly ones that differ from numpy.
"""
from __future__ import annotations

import inspect
from typing import (
    TYPE_CHECKING,
    overload,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs.timedeltas import array_to_timedelta64
from pandas.errors import IntCastingNaNError

from pandas.core.dtypes.common import (
    is_object_dtype,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    ExtensionDtype,
    NumpyEADtype,
)

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        DtypeObj,
        IgnoreRaise,
    )

    from pandas.core.arrays import ExtensionArray

_dtype_obj = np.dtype(object)


@overload
def _astype_nansafe(
    arr: np.ndarray, dtype: np.dtype, copy: bool = ..., skipna: bool = ...
) -> np.ndarray:
    ...


@overload
def _astype_nansafe(
    arr: np.ndarray, dtype: ExtensionDtype, copy: bool = ..., skipna: bool = ...
) -> ExtensionArray:
    ...


def _astype_nansafe(
    arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False
) -> ArrayLike:
    """
    Cast the elements of an array to a given dtype in a nan-safe manner.

    Parameters
    ----------
    arr : ndarray
    dtype : np.dtype or ExtensionDtype
    copy : bool, default True
        If False, a view will be attempted but may fail, if
        e.g. the item sizes don't align.
    skipna : bool, default False
        Whether or not we should skip NaN when casting as a string-type.

    Raises
    ------
    ValueError
        The dtype was a datetime64/timedelta64 dtype, but it had no unit.
    """

    # dispatch on extension dtype if needed
    if isinstance(dtype, ExtensionDtype):
        return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)

    elif not isinstance(dtype, np.dtype):  # pragma: no cover
        raise ValueError("dtype must be np.dtype or ExtensionDtype")

    if arr.dtype.kind in "mM":
        from pandas.core.construction import ensure_wrapped_if_datetimelike

        arr = ensure_wrapped_if_datetimelike(arr)
        res = arr.astype(dtype, copy=copy)
        return np.asarray(res)

    if issubclass(dtype.type, str):
        shape = arr.shape
        if arr.ndim > 1:
            arr = arr.ravel()
        return lib.ensure_string_array(
            arr, skipna=skipna, convert_na_value=False
        ).reshape(shape)

    elif np.issubdtype(arr.dtype, np.floating) and dtype.kind in "iu":
        return _astype_float_to_int_nansafe(arr, dtype, copy)

    elif arr.dtype == object:
        # if we have a datetime/timedelta array of objects
        # then coerce to datetime64[ns] and use DatetimeArray.astype

        if lib.is_np_dtype(dtype, "M"):
            from pandas.core.arrays import DatetimeArray

            dta = DatetimeArray._from_sequence(arr, dtype=dtype)
            return dta._ndarray

        elif lib.is_np_dtype(dtype, "m"):
            from pandas.core.construction import ensure_wrapped_if_datetimelike

            # bc we know arr.dtype == object, this is equivalent to
            # `np.asarray(to_timedelta(arr))`, but using a lower-level API that
            # does not require a circular import.
            tdvals = array_to_timedelta64(arr).view("m8[ns]")

            tda = ensure_wrapped_if_datetimelike(tdvals)
            return tda.astype(dtype, copy=False)._ndarray

    if dtype.name in ("datetime64", "timedelta64"):
        msg = (
            f"The '{dtype.name}' dtype has no unit. Please pass in "
            f"'{dtype.name}[ns]' instead."
        )
        raise ValueError(msg)

    if copy or arr.dtype == object or dtype == object:
        # Explicit copy, or required since NumPy can't view from / to object.
        return arr.astype(dtype, copy=True)

    return arr.astype(dtype, copy=copy)

def _astype_float_to_int_nansafe(
    values: np.ndarray, dtype: np.dtype, copy: bool
) -> np.ndarray:
    """
    astype with a check preventing converting NaN to a meaningless integer value.
    """
    if not np.isfinite(values).all():
        raise IntCastingNaNError(
            "Cannot convert non-finite values (NA or inf) to integer"
        )
    if dtype.kind == "u":
        # GH#45151
        if not (values >= 0).all():
            raise ValueError(f"Cannot losslessly cast from {values.dtype} to {dtype}")
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        return values.astype(dtype, copy=copy)

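# Observable behavior of the guard above through the public API (a sketch):
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> pd.Series([1.0, np.nan]).astype("int64")
#   Traceback (most recent call last):
#   ...
#   pandas.errors.IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
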
def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike:
    """
    Cast array (ndarray or ExtensionArray) to the new dtype.

    Parameters
    ----------
    values : ndarray or ExtensionArray
    dtype : dtype object
    copy : bool, default False
        copy if indicated

    Returns
    -------
    ndarray or ExtensionArray
    """
    if values.dtype == dtype:
        if copy:
            return values.copy()
        return values

    if not isinstance(values, np.ndarray):
        # i.e. ExtensionArray
        values = values.astype(dtype, copy=copy)

    else:
        values = _astype_nansafe(values, dtype, copy=copy)

    # in pandas we don't store numpy str dtypes, so convert to object
    if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
        values = np.array(values, dtype=object)

    return values

def astype_array_safe(
    values: ArrayLike, dtype, copy: bool = False, errors: IgnoreRaise = "raise"
) -> ArrayLike:
    """
    Cast array (ndarray or ExtensionArray) to the new dtype.

    This basically is the implementation for DataFrame/Series.astype and
    includes all custom logic for pandas (NaN-safety, converting str to object,
    and not allowing ExtensionDtype classes in place of instances).

    Parameters
    ----------
    values : ndarray or ExtensionArray
    dtype : str, dtype convertible
    copy : bool, default False
        copy if indicated
    errors : str, {'raise', 'ignore'}, default 'raise'
        - ``raise`` : allow exceptions to be raised
        - ``ignore`` : suppress exceptions. On error return original object

    Returns
    -------
    ndarray or ExtensionArray
    """
    errors_legal_values = ("raise", "ignore")

    if errors not in errors_legal_values:
        invalid_arg = (
            "Expected value of kwarg 'errors' to be one of "
            f"{list(errors_legal_values)}. Supplied value is '{errors}'"
        )
        raise ValueError(invalid_arg)

    if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype):
        msg = (
            f"Expected an instance of {dtype.__name__}, "
            "but got the class instead. Try instantiating 'dtype'."
        )
        raise TypeError(msg)

    dtype = pandas_dtype(dtype)
    if isinstance(dtype, NumpyEADtype):
        # Ensure we don't end up with a NumpyExtensionArray
        dtype = dtype.numpy_dtype

    try:
        new_values = astype_array(values, dtype, copy=copy)
    except (ValueError, TypeError):
        # e.g. _astype_nansafe can fail on object-dtype of strings
        # trying to convert to float
        if errors == "ignore":
            new_values = values
        else:
            raise

    return new_values

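# The errors="ignore" path surfaces through Series/DataFrame.astype (note the
# keyword is deprecated in pandas 2.x); a hedged sketch:
#
#   >>> import pandas as pd
#   >>> ser = pd.Series(["1", "two"])
#   >>> ser.astype("int64", errors="ignore").tolist()  # failure -> original
#   ['1', 'two']
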
def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
    """Checks if astype avoided copying the data.

    Parameters
    ----------
    dtype : DtypeObj
        Original dtype.
    new_dtype : DtypeObj
        Target dtype.

    Returns
    -------
    True if new data is a view or not guaranteed to be a copy, False otherwise
    """
    if isinstance(dtype, np.dtype) and not isinstance(new_dtype, np.dtype):
        new_dtype, dtype = dtype, new_dtype

    if dtype == new_dtype:
        return True

    elif isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype):
        # Only equal numpy dtypes avoid a copy
        return False

    elif is_string_dtype(dtype) and is_string_dtype(new_dtype):
        # Potentially a view when converting from object to string
        return True

    elif is_object_dtype(dtype) and new_dtype.kind == "O":
        # When the underlying array has dtype object, we don't have to make a copy
        return True

    elif dtype.kind in "mM" and new_dtype.kind in "mM":
        dtype = getattr(dtype, "numpy_dtype", dtype)
        new_dtype = getattr(new_dtype, "numpy_dtype", new_dtype)
        return getattr(dtype, "unit", None) == getattr(new_dtype, "unit", None)

    numpy_dtype = getattr(dtype, "numpy_dtype", None)
    new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None)

    if numpy_dtype is None and isinstance(dtype, np.dtype):
        numpy_dtype = dtype

    if new_numpy_dtype is None and isinstance(new_dtype, np.dtype):
        new_numpy_dtype = new_dtype

    if numpy_dtype is not None and new_numpy_dtype is not None:
        # if both have NumPy dtype or one of them is a numpy dtype
        # they are only a view when the numpy dtypes are equal, e.g.
        # int64 -> Int64 or int64[pyarrow]
        # int64 -> Int32 copies
        return numpy_dtype == new_numpy_dtype

    # Assume this is a view since we don't know for sure if a copy was made
    return True
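
# astype_is_view feeds Copy-on-Write decisions; a sketch of the dtype-level
# contract (private API; signature and import path may change):
#
#   >>> import numpy as np
#   >>> from pandas.core.dtypes.astype import astype_is_view
#   >>> astype_is_view(np.dtype("int64"), np.dtype("int64"))
#   True
#   >>> astype_is_view(np.dtype("int64"), np.dtype("int32"))
#   False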
583
lib/python3.11/site-packages/pandas/core/dtypes/base.py
Normal file
@ -0,0 +1,583 @@
"""
Extend pandas with custom array types.
"""
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
    TypeVar,
    cast,
    overload,
)

import numpy as np

from pandas._libs import missing as libmissing
from pandas._libs.hashtable import object_hash
from pandas._libs.properties import cache_readonly
from pandas.errors import AbstractMethodError

from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCIndex,
    ABCSeries,
)

if TYPE_CHECKING:
    from pandas._typing import (
        DtypeObj,
        Self,
        Shape,
        npt,
        type_t,
    )

    from pandas import Index
    from pandas.core.arrays import ExtensionArray

    # To parameterize on same ExtensionDtype
    ExtensionDtypeT = TypeVar("ExtensionDtypeT", bound="ExtensionDtype")


class ExtensionDtype:
    """
    A custom data type, to be paired with an ExtensionArray.

    See Also
    --------
    extensions.register_extension_dtype: Register an ExtensionType
        with pandas as class decorator.
    extensions.ExtensionArray: Abstract base class for custom 1-D array types.

    Notes
    -----
    The interface includes the following abstract methods that must
    be implemented by subclasses:

    * type
    * name
    * construct_array_type

    The following attributes and methods influence the behavior of the dtype in
    pandas operations

    * _is_numeric
    * _is_boolean
    * _get_common_dtype

    The `na_value` class attribute can be used to set the default NA value
    for this type. :attr:`numpy.nan` is used by default.

    ExtensionDtypes are required to be hashable. The base class provides
    a default implementation, which relies on the ``_metadata`` class
    attribute. ``_metadata`` should be a tuple containing the strings
    that define your data type. For example, with ``PeriodDtype`` that's
    the ``freq`` attribute.

    **If you have a parametrized dtype you should set the ``_metadata``
    class property**.

    Ideally, the attributes in ``_metadata`` will match the
    parameters to your ``ExtensionDtype.__init__`` (if any). If any of
    the attributes in ``_metadata`` don't implement the standard
    ``__eq__`` or ``__hash__``, the default implementations here will not
    work.

    Examples
    --------

    For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method
    can be implemented: this method receives a pyarrow Array or ChunkedArray
    as only argument and is expected to return the appropriate pandas
    ExtensionArray for this dtype and the passed values:

    >>> import pyarrow
    >>> from pandas.api.extensions import ExtensionArray
    >>> class ExtensionDtype:
    ...     def __from_arrow__(
    ...         self,
    ...         array: pyarrow.Array | pyarrow.ChunkedArray
    ...     ) -> ExtensionArray:
    ...         ...

    This class does not inherit from 'abc.ABCMeta' for performance reasons.
    Methods and properties required by the interface raise
    ``pandas.errors.AbstractMethodError`` and no ``register`` method is
    provided for registering virtual subclasses.
    """

    _metadata: tuple[str, ...] = ()

    def __str__(self) -> str:
        return self.name

    def __eq__(self, other: object) -> bool:
        """
        Check whether 'other' is equal to self.

        By default, 'other' is considered equal if either

        * it's a string matching 'self.name'.
        * it's an instance of this type and all of the attributes
          in ``self._metadata`` are equal between `self` and `other`.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        if isinstance(other, str):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False
        if isinstance(other, type(self)):
            return all(
                getattr(self, attr) == getattr(other, attr) for attr in self._metadata
            )
        return False

    def __hash__(self) -> int:
        # for python>=3.10, different nan objects have different hashes
        # we need to avoid that and thus use hash function with old behavior
        return object_hash(tuple(getattr(self, attr) for attr in self._metadata))

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    @property
    def na_value(self) -> object:
        """
        Default NA value to use for this type.

        This is used in e.g. ExtensionArray.take. This should be the
        user-facing "boxed" version of the NA value, not the physical NA value
        for storage. e.g. for JSONArray, this is an empty dictionary.
        """
        return np.nan

    @property
    def type(self) -> type_t[Any]:
        """
        The scalar type for the array, e.g. ``int``

        It's expected ``ExtensionArray[item]`` returns an instance
        of ``ExtensionDtype.type`` for scalar ``item``, assuming
        that value is valid (not NA). NA values do not need to be
        instances of `type`.
        """
        raise AbstractMethodError(self)

    @property
    def kind(self) -> str:
        """
        A character code (one of 'biufcmMOSUV'), default 'O'

        This should match the NumPy dtype used when the array is
        converted to an ndarray, which is probably 'O' for object if
        the extension type cannot be represented as a built-in NumPy
        type.

        See Also
        --------
        numpy.dtype.kind
        """
        return "O"

    @property
    def name(self) -> str:
        """
        A string identifying the data type.

        Will be used for display in, e.g. ``Series.dtype``
        """
        raise AbstractMethodError(self)

    @property
    def names(self) -> list[str] | None:
        """
        Ordered list of field names, or None if there are no fields.

        This is for compatibility with NumPy arrays, and may be removed in the
        future.
        """
        return None

    @classmethod
    def construct_array_type(cls) -> type_t[ExtensionArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        raise AbstractMethodError(cls)

    def empty(self, shape: Shape) -> ExtensionArray:
        """
        Construct an ExtensionArray of this dtype with the given shape.

        Analogous to numpy.empty.

        Parameters
        ----------
        shape : int or tuple[int]

        Returns
        -------
        ExtensionArray
        """
        cls = self.construct_array_type()
        return cls._empty(shape, dtype=self)

    @classmethod
    def construct_from_string(cls, string: str) -> Self:
        r"""
        Construct this type from a string.

        This is useful mainly for data types that accept parameters.
        For example, a period dtype accepts a frequency parameter that
        can be set as ``period[h]`` (where ``h`` means hourly frequency).

        By default, in the abstract class, just the name of the type is
        expected. But subclasses can overwrite this method to accept
        parameters.

        Parameters
        ----------
        string : str
            The name of the type, for example ``category``.

        Returns
        -------
        ExtensionDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a class cannot be constructed from this 'string'.

        Examples
        --------
        For extension dtypes with arguments the following may be an
        adequate implementation.

        >>> import re
        >>> @classmethod
        ... def construct_from_string(cls, string):
        ...     pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
        ...     match = pattern.match(string)
        ...     if match:
        ...         return cls(**match.groupdict())
        ...     else:
        ...         raise TypeError(
        ...             f"Cannot construct a '{cls.__name__}' from '{string}'"
        ...         )
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        # error: Non-overlapping equality check (left operand type: "str", right
        # operand type: "Callable[[ExtensionDtype], str]")  [comparison-overlap]
        assert isinstance(cls.name, str), (cls, type(cls.name))
        if string != cls.name:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
        return cls()

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Check if we match 'dtype'.

        Parameters
        ----------
        dtype : object
            The object to check.

        Returns
        -------
        bool

        Notes
        -----
        The default implementation is True if

        1. ``cls.construct_from_string(dtype)`` is an instance
           of ``cls``.
        2. ``dtype`` is an object and is an instance of ``cls``
        3. ``dtype`` has a ``dtype`` attribute, and any of the above
           conditions is true for ``dtype.dtype``.
        """
        dtype = getattr(dtype, "dtype", dtype)

        if isinstance(dtype, (ABCSeries, ABCIndex, ABCDataFrame, np.dtype)):
            # https://github.com/pandas-dev/pandas/issues/22960
            # avoid passing data to `construct_from_string`. This could
            # cause a FutureWarning from numpy about failing elementwise
            # comparison from, e.g., comparing DataFrame == 'category'.
            return False
        elif dtype is None:
            return False
        elif isinstance(dtype, cls):
            return True
        if isinstance(dtype, str):
            try:
                return cls.construct_from_string(dtype) is not None
            except TypeError:
                return False
        return False

    @property
    def _is_numeric(self) -> bool:
        """
        Whether columns with this dtype should be considered numeric.

        By default ExtensionDtypes are assumed to be non-numeric.
        They'll be excluded from operations that exclude non-numeric
        columns, like (groupby) reductions, plotting, etc.
        """
        return False

    @property
    def _is_boolean(self) -> bool:
        """
        Whether this dtype should be considered boolean.

        By default, ExtensionDtypes are assumed to be non-boolean.
        Setting this to True will affect the behavior of several places,
        e.g.

        * is_bool
        * boolean indexing

        Returns
        -------
        bool
        """
        return False

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        """
        Return the common dtype, if one exists.

        Used in `find_common_type` implementation. This is for example used
        to determine the resulting dtype in a concat operation.

        If no common dtype exists, return None (which gives the other dtypes
        the chance to determine a common dtype). If all dtypes in the list
        return None, then the common dtype will be "object" dtype (this means
        it is never needed to return "object" dtype from this method itself).

        Parameters
        ----------
        dtypes : list of dtypes
            The dtypes for which to determine a common dtype. This is a list
            of np.dtype or ExtensionDtype instances.

        Returns
        -------
        Common dtype (np.dtype or ExtensionDtype) or None
        """
        if len(set(dtypes)) == 1:
            # only itself
            return self
        else:
            return None

    @property
    def _can_hold_na(self) -> bool:
        """
        Can arrays of this dtype hold NA values?
        """
        return True

    @property
    def _is_immutable(self) -> bool:
        """
        Can arrays with this dtype be modified with __setitem__? If not, return
        True.

        Immutable arrays are expected to raise TypeError on __setitem__ calls.
        """
        return False

    @cache_readonly
    def index_class(self) -> type_t[Index]:
        """
        The Index subclass to return from Index.__new__ when this dtype is
        encountered.
        """
        from pandas import Index

        return Index

    @property
    def _supports_2d(self) -> bool:
        """
        Do ExtensionArrays with this dtype support 2D arrays?

        Historically ExtensionArrays were limited to 1D. By returning True here,
        authors can indicate that their arrays support 2D instances. This can
        improve performance in some cases, particularly operations with `axis=1`.

        Arrays that support 2D values should:

        - implement Array.reshape
        - subclass the Dim2CompatTests in tests.extension.base
        - _concat_same_type should support `axis` keyword
        - _reduce and reductions should support `axis` keyword
        """
        return False

    @property
    def _can_fast_transpose(self) -> bool:
        """
        Is transposing an array with this dtype zero-copy?

        Only relevant for cases where _supports_2d is True.
        """
        return False


class StorageExtensionDtype(ExtensionDtype):
|
||||
"""ExtensionDtype that may be backed by more than one implementation."""
|
||||
|
||||
name: str
|
||||
_metadata = ("storage",)
|
||||
|
||||
def __init__(self, storage: str | None = None) -> None:
|
||||
self.storage = storage
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"{self.name}[{self.storage}]"
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.name
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if isinstance(other, str) and other == self.name:
|
||||
return True
|
||||
return super().__eq__(other)
|
||||
|
||||
def __hash__(self) -> int:
|
||||
# custom __eq__ so have to override __hash__
|
||||
return super().__hash__()
|
||||
|
||||
@property
|
||||
def na_value(self) -> libmissing.NAType:
|
||||
return libmissing.NA
|
||||
|
||||
|
||||
def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
|
||||
"""
|
||||
Register an ExtensionType with pandas as class decorator.
|
||||
|
||||
This enables operations like ``.astype(name)`` for the name
|
||||
of the ExtensionDtype.
|
||||
|
||||
Returns
|
||||
-------
|
||||
callable
|
||||
A class decorator.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype
|
||||
>>> @register_extension_dtype
|
||||
... class MyExtensionDtype(ExtensionDtype):
|
||||
... name = "myextension"
|
||||
"""
|
||||
_registry.register(cls)
|
||||
return cls
|
||||
|
||||
|
||||
class Registry:
|
||||
"""
|
||||
Registry for dtype inference.
|
||||
|
||||
The registry allows one to map a string repr of a extension
|
||||
dtype to an extension dtype. The string alias can be used in several
|
||||
places, including
|
||||
|
||||
* Series and Index constructors
|
||||
* :meth:`pandas.array`
|
||||
* :meth:`pandas.Series.astype`
|
||||
|
||||
Multiple extension types can be registered.
|
||||
These are tried in order.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.dtypes: list[type_t[ExtensionDtype]] = []
|
||||
|
||||
def register(self, dtype: type_t[ExtensionDtype]) -> None:
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
dtype : ExtensionDtype class
|
||||
"""
|
||||
if not issubclass(dtype, ExtensionDtype):
|
||||
raise ValueError("can only register pandas extension dtypes")
|
||||
|
||||
self.dtypes.append(dtype)
|
||||
|
||||
@overload
|
||||
def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find(self, dtype: str) -> ExtensionDtype | None:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find(
|
||||
self, dtype: npt.DTypeLike
|
||||
) -> type_t[ExtensionDtype] | ExtensionDtype | None:
|
||||
...
|
||||
|
||||
def find(
|
||||
self, dtype: type_t[ExtensionDtype] | ExtensionDtype | npt.DTypeLike
|
||||
) -> type_t[ExtensionDtype] | ExtensionDtype | None:
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
dtype : ExtensionDtype class or instance or str or numpy dtype or python type
|
||||
|
||||
Returns
|
||||
-------
|
||||
return the first matching dtype, otherwise return None
|
||||
"""
|
||||
if not isinstance(dtype, str):
|
||||
dtype_type: type_t
|
||||
if not isinstance(dtype, type):
|
||||
dtype_type = type(dtype)
|
||||
else:
|
||||
dtype_type = dtype
|
||||
if issubclass(dtype_type, ExtensionDtype):
|
||||
# cast needed here as mypy doesn't know we have figured
|
||||
# out it is an ExtensionDtype or type_t[ExtensionDtype]
|
||||
return cast("ExtensionDtype | type_t[ExtensionDtype]", dtype)
|
||||
|
||||
return None
|
||||
|
||||
for dtype_type in self.dtypes:
|
||||
try:
|
||||
return dtype_type.construct_from_string(dtype)
|
||||
except TypeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
_registry = Registry()
|
||||
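
# Illustrative sketch (not part of pandas' public API): how a registered string
# alias resolves through the module-level registry. Results assume a dtype
# class with ``name = "myextension"`` was registered as in the example above.
#
#     _registry.find("myextension")   # instance of MyExtensionDtype
#     _registry.find("not-a-dtype")   # None: every construct_from_string raised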
1988
lib/python3.11/site-packages/pandas/core/dtypes/cast.py
Normal file
File diff suppressed because it is too large
1766
lib/python3.11/site-packages/pandas/core/dtypes/common.py
Normal file
File diff suppressed because it is too large
348
lib/python3.11/site-packages/pandas/core/dtypes/concat.py
Normal file
@@ -0,0 +1,348 @@
"""
Utility functions related to concat.
"""
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    cast,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import (
    common_dtype_categorical_compat,
    find_common_type,
    np_find_common_type,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import (
    ABCCategoricalIndex,
    ABCSeries,
)

if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        ArrayLike,
        AxisInt,
        DtypeObj,
    )

    from pandas.core.arrays import (
        Categorical,
        ExtensionArray,
    )


def _is_nonempty(x, axis) -> bool:
    # filter empty arrays
    # 1-d dtypes always are included here
    if x.ndim <= axis:
        return True
    return x.shape[axis] > 0


def concat_compat(
    to_concat: Sequence[ArrayLike], axis: AxisInt = 0, ea_compat_axis: bool = False
) -> ArrayLike:
    """
    Provide concatenation of an array of arrays, each of which has a single
    'normalized' dtype (in that, for example, if it's object, then it is a
    non-datetimelike), and provide a combined dtype for the resulting array
    that preserves the overall dtype if possible.

    Parameters
    ----------
    to_concat : sequence of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]):
        # fastpath!
        obj = to_concat[0]
        if isinstance(obj, np.ndarray):
            to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
            return np.concatenate(to_concat_arrs, axis=axis)

        to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
        if ea_compat_axis:
            # We have 1D objects, that don't support axis keyword
            return obj._concat_same_type(to_concat_eas)
        elif axis == 0:
            return obj._concat_same_type(to_concat_eas)
        else:
            # e.g. DatetimeArray
            # NB: We are assuming here that ensure_wrapped_if_arraylike has
            #  been called where relevant.
            return obj._concat_same_type(
                # error: Unexpected keyword argument "axis" for "_concat_same_type"
                # of "ExtensionArray"
                to_concat_eas,
                axis=axis,  # type: ignore[call-arg]
            )

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    orig = to_concat
    non_empties = [x for x in to_concat if _is_nonempty(x, axis)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties)

    if len(to_concat) < len(orig):
        _, _, alt_dtype = _get_result_dtype(orig, non_empties)
        if alt_dtype != target_dtype:
            # GH#39122
            warnings.warn(
                "The behavior of array concatenation with empty entries is "
                "deprecated. In a future version, this will no longer exclude "
                "empty items when determining the result dtype. "
                "To retain the old behavior, exclude the empty entries before "
                "the concat operation.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    if target_dtype is not None:
        to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat]

    if not isinstance(to_concat[0], np.ndarray):
        # i.e. isinstance(to_concat[0], ExtensionArray)
        to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
        cls = type(to_concat[0])
        # GH#53640: eg. for datetime array, axis=1 but 0 is default
        # However, class method `_concat_same_type()` for some classes
        # may not support the `axis` keyword
        if ea_compat_axis or axis == 0:
            return cls._concat_same_type(to_concat_eas)
        else:
            return cls._concat_same_type(
                to_concat_eas,
                axis=axis,  # type: ignore[call-arg]
            )
    else:
        to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
        result = np.concatenate(to_concat_arrs, axis=axis)

        if not any_ea and "b" in kinds and result.dtype.kind in "iuf":
            # GH#39817 cast to object instead of casting bools to numeric
            result = result.astype(object, copy=False)
    return result
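
# Illustrative sketch (not part of pandas' public API): concat_compat is
# internal, but its dtype rules can be observed directly. The results shown
# follow from the branches above (including GH#39817), not from doctested
# output.
#
#     import numpy as np
#     from pandas.core.dtypes.concat import concat_compat
#
#     concat_compat([np.array([1, 2]), np.array([3, 4])]).dtype  # int64 fastpath
#     concat_compat([np.array([True]), np.array([1])]).dtype     # object, not int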


def _get_result_dtype(
    to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike]
) -> tuple[bool, set[str], DtypeObj | None]:
    target_dtype = None

    dtypes = {obj.dtype for obj in to_concat}
    kinds = {obj.dtype.kind for obj in to_concat}

    any_ea = any(not isinstance(x, np.ndarray) for x in to_concat)
    if any_ea:
        # i.e. any ExtensionArrays

        # we ignore axis here, as internally concatting with EAs is always
        # for axis=0
        if len(dtypes) != 1:
            target_dtype = find_common_type([x.dtype for x in to_concat])
            target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)

    elif not len(non_empties):
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        if len(kinds) != 1:
            if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}):
                # let numpy coerce
                pass
            else:
                # coerce to object
                target_dtype = np.dtype(object)
                kinds = {"o"}
    else:
        # error: Argument 1 to "np_find_common_type" has incompatible type
        # "*Set[Union[ExtensionDtype, Any]]"; expected "dtype[Any]"
        target_dtype = np_find_common_type(*dtypes)  # type: ignore[arg-type]

    return any_ea, kinds, target_dtype
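
# Illustrative sketch (not part of pandas): the all-empty branch above coerces
# mixed non-numeric kinds to object so NumPy does not default to float64.
#
#     _get_result_dtype(
#         [np.array([], dtype="i8"), np.array([], dtype="f8")], []
#     )  # -> (False, {"i", "f"}, None): all-numeric kinds, numpy coerces
#     _get_result_dtype(
#         [np.array([], dtype="i8"), np.array([], dtype=object)], []
#     )  # -> (False, {"o"}, dtype('O')): coerced to object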


def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x._values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not lib.dtypes_all_equal([obj.categories.dtype for obj in to_union]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            recode_for_categories(c.codes, c.categories, categories) for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    dtype = CategoricalDtype(categories=categories, ordered=ordered)
    return Categorical._simple_new(new_codes, dtype=dtype)
2348
lib/python3.11/site-packages/pandas/core/dtypes/dtypes.py
Normal file
File diff suppressed because it is too large
147
lib/python3.11/site-packages/pandas/core/dtypes/generic.py
Normal file
@@ -0,0 +1,147 @@
""" define generic base classes for pandas objects """
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Type,
    cast,
)

if TYPE_CHECKING:
    from pandas import (
        Categorical,
        CategoricalIndex,
        DataFrame,
        DatetimeIndex,
        Index,
        IntervalIndex,
        MultiIndex,
        PeriodIndex,
        RangeIndex,
        Series,
        TimedeltaIndex,
    )
    from pandas.core.arrays import (
        DatetimeArray,
        ExtensionArray,
        NumpyExtensionArray,
        PeriodArray,
        TimedeltaArray,
    )
    from pandas.core.generic import NDFrame


# define abstract base classes to enable isinstance type checking on our
# objects
def create_pandas_abc_type(name, attr, comp):
    def _check(inst) -> bool:
        return getattr(inst, attr, "_typ") in comp

    # https://github.com/python/mypy/issues/1006
    # error: 'classmethod' used with a non-method
    @classmethod  # type: ignore[misc]
    def _instancecheck(cls, inst) -> bool:
        return _check(inst) and not isinstance(inst, type)

    @classmethod  # type: ignore[misc]
    def _subclasscheck(cls, inst) -> bool:
        # Raise instead of returning False
        # This is consistent with default __subclasscheck__ behavior
        if not isinstance(inst, type):
            raise TypeError("issubclass() arg 1 must be a class")

        return _check(inst)

    dct = {"__instancecheck__": _instancecheck, "__subclasscheck__": _subclasscheck}
    meta = type("ABCBase", (type,), dct)
    return meta(name, (), dct)
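
# Illustrative sketch (not part of pandas): the metaclass above makes
# isinstance checks work off the `_typ` attribute, so no concrete pandas
# classes need importing at check time. `Fake` is a stand-in, not a real
# pandas type.
#
#     class Fake:
#         _typ = "series"
#
#     isinstance(Fake(), ABCSeries)  # True: duck-typed via _typ
#     isinstance(Fake, ABCSeries)    # False: classes themselves are excluded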


ABCRangeIndex = cast(
    "Type[RangeIndex]",
    create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)),
)
ABCMultiIndex = cast(
    "Type[MultiIndex]",
    create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)),
)
ABCDatetimeIndex = cast(
    "Type[DatetimeIndex]",
    create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)),
)
ABCTimedeltaIndex = cast(
    "Type[TimedeltaIndex]",
    create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)),
)
ABCPeriodIndex = cast(
    "Type[PeriodIndex]",
    create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)),
)
ABCCategoricalIndex = cast(
    "Type[CategoricalIndex]",
    create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)),
)
ABCIntervalIndex = cast(
    "Type[IntervalIndex]",
    create_pandas_abc_type("ABCIntervalIndex", "_typ", ("intervalindex",)),
)
ABCIndex = cast(
    "Type[Index]",
    create_pandas_abc_type(
        "ABCIndex",
        "_typ",
        {
            "index",
            "rangeindex",
            "multiindex",
            "datetimeindex",
            "timedeltaindex",
            "periodindex",
            "categoricalindex",
            "intervalindex",
        },
    ),
)


ABCNDFrame = cast(
    "Type[NDFrame]",
    create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")),
)
ABCSeries = cast(
    "Type[Series]",
    create_pandas_abc_type("ABCSeries", "_typ", ("series",)),
)
ABCDataFrame = cast(
    "Type[DataFrame]", create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",))
)

ABCCategorical = cast(
    "Type[Categorical]",
    create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")),
)
ABCDatetimeArray = cast(
    "Type[DatetimeArray]",
    create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")),
)
ABCTimedeltaArray = cast(
    "Type[TimedeltaArray]",
    create_pandas_abc_type("ABCTimedeltaArray", "_typ", ("timedeltaarray")),
)
ABCPeriodArray = cast(
    "Type[PeriodArray]",
    create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",)),
)
ABCExtensionArray = cast(
    "Type[ExtensionArray]",
    create_pandas_abc_type(
        "ABCExtensionArray",
        "_typ",
        # Note: IntervalArray and SparseArray are included bc they have _typ="extension"
        {"extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"},
    ),
)
ABCNumpyExtensionArray = cast(
    "Type[NumpyExtensionArray]",
    create_pandas_abc_type("ABCNumpyExtensionArray", "_typ", ("npy_extension",)),
)
437
lib/python3.11/site-packages/pandas/core/dtypes/inference.py
Normal file
@@ -0,0 +1,437 @@
""" basic inference routines """

from __future__ import annotations

from collections import abc
from numbers import Number
import re
from re import Pattern
from typing import TYPE_CHECKING

import numpy as np

from pandas._libs import lib

if TYPE_CHECKING:
    from collections.abc import Hashable

    from pandas._typing import TypeGuard

is_bool = lib.is_bool

is_integer = lib.is_integer

is_float = lib.is_float

is_complex = lib.is_complex

is_scalar = lib.is_scalar

is_decimal = lib.is_decimal

is_interval = lib.is_interval

is_list_like = lib.is_list_like

is_iterator = lib.is_iterator


def is_number(obj) -> TypeGuard[Number | np.number]:
    """
    Check if the object is a number.

    Returns True when the object is a number, and False if it is not.

    Parameters
    ----------
    obj : any type
        The object to check.

    Returns
    -------
    bool
        Whether `obj` is a number or not.

    See Also
    --------
    api.types.is_integer: Checks a subgroup of numbers.

    Examples
    --------
    >>> from pandas.api.types import is_number
    >>> is_number(1)
    True
    >>> is_number(7.15)
    True

    Booleans are valid because they are int subclass.

    >>> is_number(False)
    True

    >>> is_number("foo")
    False
    >>> is_number("5")
    False
    """
    return isinstance(obj, (Number, np.number))


def iterable_not_string(obj) -> bool:
    """
    Check if the object is an iterable but not a string.

    Parameters
    ----------
    obj : The object to check.

    Returns
    -------
    is_iter_not_string : bool
        Whether `obj` is a non-string iterable.

    Examples
    --------
    >>> iterable_not_string([1, 2, 3])
    True
    >>> iterable_not_string("foo")
    False
    >>> iterable_not_string(1)
    False
    """
    return isinstance(obj, abc.Iterable) and not isinstance(obj, str)


def is_file_like(obj) -> bool:
    """
    Check if the object is a file-like object.

    For objects to be considered file-like, they must
    be an iterator AND have either a `read` and/or `write`
    method as an attribute.

    Note: file-like objects must be iterable, but
    iterable objects need not be file-like.

    Parameters
    ----------
    obj : The object to check

    Returns
    -------
    bool
        Whether `obj` has file-like properties.

    Examples
    --------
    >>> import io
    >>> from pandas.api.types import is_file_like
    >>> buffer = io.StringIO("data")
    >>> is_file_like(buffer)
    True
    >>> is_file_like([1, 2, 3])
    False
    """
    if not (hasattr(obj, "read") or hasattr(obj, "write")):
        return False

    return bool(hasattr(obj, "__iter__"))


def is_re(obj) -> TypeGuard[Pattern]:
    """
    Check if the object is a regex pattern instance.

    Parameters
    ----------
    obj : The object to check

    Returns
    -------
    bool
        Whether `obj` is a regex pattern.

    Examples
    --------
    >>> from pandas.api.types import is_re
    >>> import re
    >>> is_re(re.compile(".*"))
    True
    >>> is_re("foo")
    False
    """
    return isinstance(obj, Pattern)


def is_re_compilable(obj) -> bool:
    """
    Check if the object can be compiled into a regex pattern instance.

    Parameters
    ----------
    obj : The object to check

    Returns
    -------
    bool
        Whether `obj` can be compiled as a regex pattern.

    Examples
    --------
    >>> from pandas.api.types import is_re_compilable
    >>> is_re_compilable(".*")
    True
    >>> is_re_compilable(1)
    False
    """
    try:
        re.compile(obj)
    except TypeError:
        return False
    else:
        return True


def is_array_like(obj) -> bool:
    """
    Check if the object is array-like.

    For an object to be considered array-like, it must be list-like and
    have a `dtype` attribute.

    Parameters
    ----------
    obj : The object to check

    Returns
    -------
    is_array_like : bool
        Whether `obj` has array-like properties.

    Examples
    --------
    >>> is_array_like(np.array([1, 2, 3]))
    True
    >>> is_array_like(pd.Series(["a", "b"]))
    True
    >>> is_array_like(pd.Index(["2016-01-01"]))
    True
    >>> is_array_like([1, 2, 3])
    False
    >>> is_array_like(("a", "b"))
    False
    """
    return is_list_like(obj) and hasattr(obj, "dtype")


def is_nested_list_like(obj) -> bool:
    """
    Check if the object is list-like, and that all of its elements
    are also list-like.

    Parameters
    ----------
    obj : The object to check

    Returns
    -------
    is_list_like : bool
        Whether `obj` has list-like properties.

    Examples
    --------
    >>> is_nested_list_like([[1, 2, 3]])
    True
    >>> is_nested_list_like([{1, 2, 3}, {1, 2, 3}])
    True
    >>> is_nested_list_like(["foo"])
    False
    >>> is_nested_list_like([])
    False
    >>> is_nested_list_like([[1, 2, 3], 1])
    False

    Notes
    -----
    This won't reliably detect whether a consumable iterator (e.g.
    a generator) is a nested-list-like without consuming the iterator.
    To avoid consuming it, we always return False if the outer container
    doesn't define `__len__`.

    See Also
    --------
    is_list_like
    """
    return (
        is_list_like(obj)
        and hasattr(obj, "__len__")
        and len(obj) > 0
        and all(is_list_like(item) for item in obj)
    )


def is_dict_like(obj) -> bool:
    """
    Check if the object is dict-like.

    Parameters
    ----------
    obj : The object to check

    Returns
    -------
    bool
        Whether `obj` has dict-like properties.

    Examples
    --------
    >>> from pandas.api.types import is_dict_like
    >>> is_dict_like({1: 2})
    True
    >>> is_dict_like([1, 2, 3])
    False
    >>> is_dict_like(dict)
    False
    >>> is_dict_like(dict())
    True
    """
    dict_like_attrs = ("__getitem__", "keys", "__contains__")
    return (
        all(hasattr(obj, attr) for attr in dict_like_attrs)
        # [GH 25196] exclude classes
        and not isinstance(obj, type)
    )


def is_named_tuple(obj) -> bool:
    """
    Check if the object is a named tuple.

    Parameters
    ----------
    obj : The object to check

    Returns
    -------
    bool
        Whether `obj` is a named tuple.

    Examples
    --------
    >>> from collections import namedtuple
    >>> from pandas.api.types import is_named_tuple
    >>> Point = namedtuple("Point", ["x", "y"])
    >>> p = Point(1, 2)
    >>>
    >>> is_named_tuple(p)
    True
    >>> is_named_tuple((1, 2))
    False
    """
    return isinstance(obj, abc.Sequence) and hasattr(obj, "_fields")


def is_hashable(obj) -> TypeGuard[Hashable]:
    """
    Return True if hash(obj) will succeed, False otherwise.

    Some types will pass a test against collections.abc.Hashable but fail when
    they are actually hashed with hash().

    Distinguish between these and other types by trying the call to hash() and
    seeing if they raise TypeError.

    Returns
    -------
    bool

    Examples
    --------
    >>> import collections
    >>> from pandas.api.types import is_hashable
    >>> a = ([],)
    >>> isinstance(a, collections.abc.Hashable)
    True
    >>> is_hashable(a)
    False
    """
    # Unfortunately, we can't use isinstance(obj, collections.abc.Hashable),
    # which can be faster than calling hash. That is because numpy scalars
    # fail this test.

    # Reconsider this decision once this numpy bug is fixed:
    # https://github.com/numpy/numpy/issues/5562

    try:
        hash(obj)
    except TypeError:
        return False
    else:
        return True


def is_sequence(obj) -> bool:
    """
    Check if the object is a sequence of objects.
    String types are not included as sequences here.

    Parameters
    ----------
    obj : The object to check

    Returns
    -------
    is_sequence : bool
        Whether `obj` is a sequence of objects.

    Examples
    --------
    >>> l = [1, 2, 3]
    >>>
    >>> is_sequence(l)
    True
    >>> is_sequence(iter(l))
    False
    """
    try:
        iter(obj)  # Can iterate over it.
        len(obj)  # Has a length associated with it.
        return not isinstance(obj, (str, bytes))
    except (TypeError, AttributeError):
        return False


def is_dataclass(item) -> bool:
    """
    Checks if the object is a data-class instance

    Parameters
    ----------
    item : object

    Returns
    -------
    is_dataclass : bool
        True if the item is an instance of a data-class,
        will return false if you pass the data class itself

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> is_dataclass(Point)
    False
    >>> is_dataclass(Point(0, 2))
    True

    """
    try:
        import dataclasses

        return dataclasses.is_dataclass(item) and not isinstance(item, type)
    except ImportError:
        return False
810
lib/python3.11/site-packages/pandas/core/dtypes/missing.py
Normal file
@@ -0,0 +1,810 @@
"""
missing types & inference
"""
from __future__ import annotations

from decimal import Decimal
from functools import partial
from typing import (
    TYPE_CHECKING,
    overload,
)
import warnings

import numpy as np

from pandas._config import get_option

from pandas._libs import lib
import pandas._libs.missing as libmissing
from pandas._libs.tslibs import (
    NaT,
    iNaT,
)

from pandas.core.dtypes.common import (
    DT64NS_DTYPE,
    TD64NS_DTYPE,
    ensure_object,
    is_scalar,
    is_string_or_object_np_dtype,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    DatetimeTZDtype,
    ExtensionDtype,
    IntervalDtype,
    PeriodDtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCExtensionArray,
    ABCIndex,
    ABCMultiIndex,
    ABCSeries,
)
from pandas.core.dtypes.inference import is_list_like

if TYPE_CHECKING:
    from re import Pattern

    from pandas._typing import (
        ArrayLike,
        DtypeObj,
        NDFrame,
        NDFrameT,
        Scalar,
        npt,
    )

    from pandas import Series
    from pandas.core.indexes.base import Index


isposinf_scalar = libmissing.isposinf_scalar
isneginf_scalar = libmissing.isneginf_scalar

nan_checker = np.isnan
INF_AS_NA = False
_dtype_object = np.dtype("object")
_dtype_str = np.dtype(str)


@overload
def isna(obj: Scalar | Pattern) -> bool:
    ...


@overload
def isna(
    obj: ArrayLike | Index | list,
) -> npt.NDArray[np.bool_]:
    ...


@overload
def isna(obj: NDFrameT) -> NDFrameT:
    ...


# handle unions
@overload
def isna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]:
    ...


@overload
def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
    ...


def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
    """
    Detect missing values for an array-like object.

    This function takes a scalar or array-like object and indicates
    whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN``
    in object arrays, ``NaT`` in datetimelike).

    Parameters
    ----------
    obj : scalar or array-like
        Object to check for null or missing values.

    Returns
    -------
    bool or array-like of bool
        For scalar input, returns a scalar boolean.
        For array input, returns an array of boolean indicating whether each
        corresponding element is missing.

    See Also
    --------
    notna : Boolean inverse of pandas.isna.
    Series.isna : Detect missing values in a Series.
    DataFrame.isna : Detect missing values in a DataFrame.
    Index.isna : Detect missing values in an Index.

    Examples
    --------
    Scalar arguments (including strings) result in a scalar boolean.

    >>> pd.isna('dog')
    False

    >>> pd.isna(pd.NA)
    True

    >>> pd.isna(np.nan)
    True

    ndarrays result in an ndarray of booleans.

    >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
    >>> array
    array([[ 1., nan,  3.],
           [ 4.,  5., nan]])
    >>> pd.isna(array)
    array([[False,  True, False],
           [False, False,  True]])

    For indexes, an ndarray of booleans is returned.

    >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
    ...                           "2017-07-08"])
    >>> index
    DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
                  dtype='datetime64[ns]', freq=None)
    >>> pd.isna(index)
    array([False, False,  True, False])

    For Series and DataFrame, the same type is returned, containing booleans.

    >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
    >>> df
         0     1    2
    0  ant   bee  cat
    1  dog  None  fly
    >>> pd.isna(df)
           0      1      2
    0  False  False  False
    1  False   True  False

    >>> pd.isna(df[1])
    0    False
    1     True
    Name: 1, dtype: bool
    """
    return _isna(obj)


isnull = isna


def _isna(obj, inf_as_na: bool = False):
    """
    Detect missing values, treating None, NaN or NA as null. Infinite
    values will also be treated as null if inf_as_na is True.

    Parameters
    ----------
    obj: ndarray or object value
        Input array or scalar value.
    inf_as_na: bool
        Whether to treat infinity as null.

    Returns
    -------
    boolean ndarray or boolean
    """
    if is_scalar(obj):
        return libmissing.checknull(obj, inf_as_na=inf_as_na)
    elif isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    elif isinstance(obj, type):
        return False
    elif isinstance(obj, (np.ndarray, ABCExtensionArray)):
        return _isna_array(obj, inf_as_na=inf_as_na)
    elif isinstance(obj, ABCIndex):
        # Try to use cached isna, which also short-circuits for integer dtypes
        # and avoids materializing RangeIndex._values
        if not obj._can_hold_na:
            return obj.isna()
        return _isna_array(obj._values, inf_as_na=inf_as_na)

    elif isinstance(obj, ABCSeries):
        result = _isna_array(obj._values, inf_as_na=inf_as_na)
        # box
        result = obj._constructor(result, index=obj.index, name=obj.name, copy=False)
        return result
    elif isinstance(obj, ABCDataFrame):
        return obj.isna()
    elif isinstance(obj, list):
        return _isna_array(np.asarray(obj, dtype=object), inf_as_na=inf_as_na)
    elif hasattr(obj, "__array__"):
        return _isna_array(np.asarray(obj), inf_as_na=inf_as_na)
    else:
        return False


def _use_inf_as_na(key) -> None:
    """
    Option change callback for na/inf behaviour.

    Choose which replacement for numpy.isnan / -numpy.isfinite is used.

    Parameters
    ----------
    flag: bool
        True means treat None, NaN, INF, -INF as null (old way),
        False means None and NaN are null, but INF, -INF are not null
        (new way).

    Notes
    -----
    This approach to setting global module values is discussed and
    approved here:

    * https://stackoverflow.com/questions/4859217/
      programmatically-creating-variables-in-python/4859312#4859312
    """
    inf_as_na = get_option(key)
    globals()["_isna"] = partial(_isna, inf_as_na=inf_as_na)
    if inf_as_na:
        globals()["nan_checker"] = lambda x: ~np.isfinite(x)
        globals()["INF_AS_NA"] = True
    else:
        globals()["nan_checker"] = np.isnan
        globals()["INF_AS_NA"] = False
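
# Illustrative sketch (not part of pandas): the callback above swaps module
# globals when the option changes, so `inf` handling flips process-wide. The
# option is deprecated in recent pandas, so treat this as historical behavior.
#
#     import numpy as np
#     import pandas as pd
#
#     pd.isna(np.inf)                 # False by default
#     with pd.option_context("mode.use_inf_as_na", True):
#         pd.isna(np.inf)             # True while the option is active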
|
||||
|
||||
def _isna_array(values: ArrayLike, inf_as_na: bool = False):
|
||||
"""
|
||||
Return an array indicating which values of the input array are NaN / NA.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj: ndarray or ExtensionArray
|
||||
The input array whose elements are to be checked.
|
||||
inf_as_na: bool
|
||||
Whether or not to treat infinite values as NA.
|
||||
|
||||
Returns
|
||||
-------
|
||||
array-like
|
||||
Array of boolean values denoting the NA status of each element.
|
||||
"""
|
||||
dtype = values.dtype
|
||||
|
||||
if not isinstance(values, np.ndarray):
|
||||
# i.e. ExtensionArray
|
||||
if inf_as_na and isinstance(dtype, CategoricalDtype):
|
||||
result = libmissing.isnaobj(values.to_numpy(), inf_as_na=inf_as_na)
|
||||
else:
|
||||
# error: Incompatible types in assignment (expression has type
|
||||
# "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has
|
||||
# type "ndarray[Any, dtype[bool_]]")
|
||||
result = values.isna() # type: ignore[assignment]
|
||||
elif isinstance(values, np.rec.recarray):
|
||||
# GH 48526
|
||||
result = _isna_recarray_dtype(values, inf_as_na=inf_as_na)
|
||||
elif is_string_or_object_np_dtype(values.dtype):
|
||||
result = _isna_string_dtype(values, inf_as_na=inf_as_na)
|
||||
elif dtype.kind in "mM":
|
||||
# this is the NaT pattern
|
||||
result = values.view("i8") == iNaT
|
||||
else:
|
||||
if inf_as_na:
|
||||
result = ~np.isfinite(values)
|
||||
else:
|
||||
result = np.isnan(values)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]:
|
||||
# Working around NumPy ticket 1542
|
||||
dtype = values.dtype
|
||||
|
||||
if dtype.kind in ("S", "U"):
|
||||
result = np.zeros(values.shape, dtype=bool)
|
||||
else:
|
||||
if values.ndim in {1, 2}:
|
||||
result = libmissing.isnaobj(values, inf_as_na=inf_as_na)
|
||||
else:
|
||||
# 0-D, reached via e.g. mask_missing
|
||||
result = libmissing.isnaobj(values.ravel(), inf_as_na=inf_as_na)
|
||||
result = result.reshape(values.shape)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _has_record_inf_value(record_as_array: np.ndarray) -> np.bool_:
|
||||
is_inf_in_record = np.zeros(len(record_as_array), dtype=bool)
|
||||
for i, value in enumerate(record_as_array):
|
||||
is_element_inf = False
|
||||
try:
|
||||
is_element_inf = np.isinf(value)
|
||||
except TypeError:
|
||||
is_element_inf = False
|
||||
is_inf_in_record[i] = is_element_inf
|
||||
|
||||
return np.any(is_inf_in_record)
|
||||
|
||||
|
||||
def _isna_recarray_dtype(
|
||||
values: np.rec.recarray, inf_as_na: bool
|
||||
) -> npt.NDArray[np.bool_]:
|
||||
result = np.zeros(values.shape, dtype=bool)
|
||||
for i, record in enumerate(values):
|
||||
record_as_array = np.array(record.tolist())
|
||||
does_record_contain_nan = isna_all(record_as_array)
|
||||
does_record_contain_inf = False
|
||||
if inf_as_na:
|
||||
does_record_contain_inf = bool(_has_record_inf_value(record_as_array))
|
||||
result[i] = np.any(
|
||||
np.logical_or(does_record_contain_nan, does_record_contain_inf)
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@overload
|
||||
def notna(obj: Scalar) -> bool:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def notna(
|
||||
obj: ArrayLike | Index | list,
|
||||
) -> npt.NDArray[np.bool_]:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def notna(obj: NDFrameT) -> NDFrameT:
|
||||
...
|
||||
|
||||
|
||||
# handle unions
|
||||
@overload
|
||||
def notna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
|
||||
...
|
||||
|
||||
|
||||
def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
|
||||
"""
|
||||
Detect non-missing values for an array-like object.
|
||||
|
||||
This function takes a scalar or array-like object and indicates
|
||||
whether values are valid (not missing, which is ``NaN`` in numeric
|
||||
arrays, ``None`` or ``NaN`` in object arrays, ``NaT`` in datetimelike).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj : array-like or object value
|
||||
Object to check for *not* null or *non*-missing values.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool or array-like of bool
|
||||
For scalar input, returns a scalar boolean.
|
||||
For array input, returns an array of boolean indicating whether each
|
||||
corresponding element is valid.
|
||||
|
||||
See Also
|
||||
--------
|
||||
isna : Boolean inverse of pandas.notna.
|
||||
Series.notna : Detect valid values in a Series.
|
||||
DataFrame.notna : Detect valid values in a DataFrame.
|
||||
Index.notna : Detect valid values in an Index.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Scalar arguments (including strings) result in a scalar boolean.
|
||||
|
||||
>>> pd.notna('dog')
|
||||
True
|
||||
|
||||
>>> pd.notna(pd.NA)
|
||||
False
|
||||
|
||||
>>> pd.notna(np.nan)
|
||||
False
|
||||
|
||||
ndarrays result in an ndarray of booleans.
|
||||
|
||||
>>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
|
||||
>>> array
|
||||
array([[ 1., nan, 3.],
|
||||
[ 4., 5., nan]])
|
||||
>>> pd.notna(array)
|
||||
array([[ True, False, True],
|
||||
[ True, True, False]])
|
||||
|
||||
For indexes, an ndarray of booleans is returned.
|
||||
|
||||
>>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
|
||||
... "2017-07-08"])
|
||||
>>> index
|
||||
DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
|
||||
dtype='datetime64[ns]', freq=None)
|
||||
>>> pd.notna(index)
|
||||
array([ True, True, False, True])
|
||||
|
||||
For Series and DataFrame, the same type is returned, containing booleans.
|
||||
|
||||
>>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
|
||||
>>> df
|
||||
0 1 2
|
||||
0 ant bee cat
|
||||
1 dog None fly
|
||||
>>> pd.notna(df)
|
||||
0 1 2
|
||||
0 True True True
|
||||
1 True False True
|
||||
|
||||
>>> pd.notna(df[1])
|
||||
0 True
|
||||
1 False
|
||||
Name: 1, dtype: bool
|
||||
"""
|
||||
res = isna(obj)
|
||||
if isinstance(res, bool):
|
||||
return not res
|
||||
return ~res
|
||||
|
||||
|
||||
notnull = notna
|
||||
|
||||
|
||||
def array_equivalent(
|
||||
left,
|
||||
right,
|
||||
strict_nan: bool = False,
|
||||
dtype_equal: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
True if two arrays, left and right, have equal non-NaN elements, and NaNs
|
||||
in corresponding locations. False otherwise. It is assumed that left and
|
||||
right are NumPy arrays of the same dtype. The behavior of this function
|
||||
(particularly with respect to NaNs) is not defined if the dtypes are
|
||||
different.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
left, right : ndarrays
|
||||
strict_nan : bool, default False
|
||||
If True, consider NaN and None to be different.
|
||||
dtype_equal : bool, default False
|
||||
Whether `left` and `right` are known to have the same dtype
|
||||
according to `is_dtype_equal`. Some methods like `BlockManager.equals`.
|
||||
require that the dtypes match. Setting this to ``True`` can improve
|
||||
performance, but will give different results for arrays that are
|
||||
equal but different dtypes.
|
||||
|
||||
Returns
|
||||
-------
|
||||
b : bool
|
||||
Returns True if the arrays are equivalent.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> array_equivalent(
|
||||
... np.array([1, 2, np.nan]),
|
||||
... np.array([1, 2, np.nan]))
|
||||
True
|
||||
>>> array_equivalent(
|
||||
... np.array([1, np.nan, 2]),
|
||||
... np.array([1, 2, np.nan]))
|
||||
False
|
||||
"""
|
||||
left, right = np.asarray(left), np.asarray(right)
|
||||
|
||||
# shape compat
|
||||
if left.shape != right.shape:
|
||||
return False
|
||||
|
||||
if dtype_equal:
|
||||
# fastpath when we require that the dtypes match (Block.equals)
|
||||
if left.dtype.kind in "fc":
|
||||
return _array_equivalent_float(left, right)
|
||||
elif left.dtype.kind in "mM":
|
||||
return _array_equivalent_datetimelike(left, right)
|
||||
elif is_string_or_object_np_dtype(left.dtype):
|
||||
# TODO: fastpath for pandas' StringDtype
|
||||
return _array_equivalent_object(left, right, strict_nan)
|
||||
else:
|
||||
return np.array_equal(left, right)
|
||||
|
||||
# Slow path when we allow comparing different dtypes.
|
||||
# Object arrays can contain None, NaN and NaT.
|
||||
# string dtypes must be come to this path for NumPy 1.7.1 compat
|
||||
if left.dtype.kind in "OSU" or right.dtype.kind in "OSU":
|
||||
# Note: `in "OSU"` is non-trivially faster than `in ["O", "S", "U"]`
|
||||
# or `in ("O", "S", "U")`
|
||||
return _array_equivalent_object(left, right, strict_nan)
|
||||
|
||||
# NaNs can occur in float and complex arrays.
|
||||
if left.dtype.kind in "fc":
|
||||
if not (left.size and right.size):
|
||||
return True
|
||||
return ((left == right) | (isna(left) & isna(right))).all()
|
||||
|
||||
elif left.dtype.kind in "mM" or right.dtype.kind in "mM":
|
||||
# datetime64, timedelta64, Period
|
||||
if left.dtype != right.dtype:
|
||||
return False
|
||||
|
||||
left = left.view("i8")
|
||||
right = right.view("i8")
|
||||
|
||||
# if we have structured dtypes, compare first
|
||||
if (
|
||||
left.dtype.type is np.void or right.dtype.type is np.void
|
||||
) and left.dtype != right.dtype:
|
||||
return False
|
||||
|
||||
return np.array_equal(left, right)
|
||||
|
||||
|
||||
def _array_equivalent_float(left: np.ndarray, right: np.ndarray) -> bool:
|
||||
return bool(((left == right) | (np.isnan(left) & np.isnan(right))).all())
|
||||
|
||||
|
||||
def _array_equivalent_datetimelike(left: np.ndarray, right: np.ndarray):
|
||||
return np.array_equal(left.view("i8"), right.view("i8"))
|
||||
|
||||
|
||||
def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool):
|
||||
left = ensure_object(left)
|
||||
right = ensure_object(right)
|
||||
|
||||
mask: npt.NDArray[np.bool_] | None = None
|
||||
if strict_nan:
|
||||
mask = isna(left) & isna(right)
|
||||
if not mask.any():
|
||||
mask = None
|
||||
|
||||
try:
|
||||
if mask is None:
|
||||
return lib.array_equivalent_object(left, right)
|
||||
if not lib.array_equivalent_object(left[~mask], right[~mask]):
|
||||
return False
|
||||
left_remaining = left[mask]
|
||||
right_remaining = right[mask]
|
||||
except ValueError:
|
||||
# can raise a ValueError if left and right cannot be
|
||||
# compared (e.g. nested arrays)
|
||||
left_remaining = left
|
||||
right_remaining = right
|
||||
|
||||
for left_value, right_value in zip(left_remaining, right_remaining):
|
||||
if left_value is NaT and right_value is not NaT:
|
||||
return False
|
||||
|
||||
elif left_value is libmissing.NA and right_value is not libmissing.NA:
|
||||
return False
|
||||
|
||||
elif isinstance(left_value, float) and np.isnan(left_value):
|
||||
if not isinstance(right_value, float) or not np.isnan(right_value):
|
||||
return False
|
||||
else:
|
||||
with warnings.catch_warnings():
|
||||
# suppress numpy's "elementwise comparison failed"
|
||||
warnings.simplefilter("ignore", DeprecationWarning)
|
||||
try:
|
||||
if np.any(np.asarray(left_value != right_value)):
|
||||
return False
|
||||
except TypeError as err:
|
||||
if "boolean value of NA is ambiguous" in str(err):
|
||||
return False
|
||||
raise
|
||||
except ValueError:
|
||||
# numpy can raise a ValueError if left and right cannot be
|
||||
# compared (e.g. nested arrays)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def array_equals(left: ArrayLike, right: ArrayLike) -> bool:
|
||||
"""
|
||||
ExtensionArray-compatible implementation of array_equivalent.
|
||||
"""
|
||||
if left.dtype != right.dtype:
|
||||
return False
|
||||
elif isinstance(left, ABCExtensionArray):
|
||||
return left.equals(right)
|
||||
else:
|
||||
return array_equivalent(left, right, dtype_equal=True)
|
||||
|
||||
|
||||
def infer_fill_value(val):
|
||||
"""
|
||||
infer the fill value for the nan/NaT from the provided
|
||||
scalar/ndarray/list-like if we are a NaT, return the correct dtyped
|
||||
element to provide proper block construction
|
||||
"""
|
||||
if not is_list_like(val):
|
||||
val = [val]
|
||||
val = np.asarray(val)
|
||||
if val.dtype.kind in "mM":
|
||||
return np.array("NaT", dtype=val.dtype)
|
||||
elif val.dtype == object:
|
||||
dtype = lib.infer_dtype(ensure_object(val), skipna=False)
|
||||
if dtype in ["datetime", "datetime64"]:
|
||||
return np.array("NaT", dtype=DT64NS_DTYPE)
|
||||
elif dtype in ["timedelta", "timedelta64"]:
|
||||
return np.array("NaT", dtype=TD64NS_DTYPE)
|
||||
return np.array(np.nan, dtype=object)
|
||||
elif val.dtype.kind == "U":
|
||||
return np.array(np.nan, dtype=val.dtype)
|
||||
return np.nan
|
||||
|
||||
|
||||
def construct_1d_array_from_inferred_fill_value(
|
||||
value: object, length: int
|
||||
) -> ArrayLike:
|
||||
# Find our empty_value dtype by constructing an array
|
||||
# from our value and doing a .take on it
|
||||
from pandas.core.algorithms import take_nd
|
||||
from pandas.core.construction import sanitize_array
|
||||
from pandas.core.indexes.base import Index
|
||||
|
||||
arr = sanitize_array(value, Index(range(1)), copy=False)
|
||||
taker = -1 * np.ones(length, dtype=np.intp)
|
||||
return take_nd(arr, taker)
|
||||
|
||||
|
||||
def maybe_fill(arr: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype.
|
||||
"""
|
||||
if arr.dtype.kind not in "iub":
|
||||
arr.fill(np.nan)
|
||||
return arr
|
||||
|
||||
|
||||
def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
|
||||
"""
|
||||
Return a dtype compat na value
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dtype : string / dtype
|
||||
compat : bool, default True
|
||||
|
||||
Returns
|
||||
-------
|
||||
np.dtype or a pandas dtype
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> na_value_for_dtype(np.dtype('int64'))
|
||||
0
|
||||
>>> na_value_for_dtype(np.dtype('int64'), compat=False)
|
||||
nan
|
||||
>>> na_value_for_dtype(np.dtype('float64'))
|
||||
nan
|
||||
>>> na_value_for_dtype(np.dtype('bool'))
|
||||
False
|
||||
>>> na_value_for_dtype(np.dtype('datetime64[ns]'))
|
||||
numpy.datetime64('NaT')
|
||||
"""
|
||||
|
||||
if isinstance(dtype, ExtensionDtype):
|
||||
return dtype.na_value
|
||||
elif dtype.kind in "mM":
|
||||
unit = np.datetime_data(dtype)[0]
|
||||
return dtype.type("NaT", unit)
|
||||
elif dtype.kind == "f":
|
||||
return np.nan
|
||||
elif dtype.kind in "iu":
|
||||
if compat:
|
||||
return 0
|
||||
return np.nan
|
||||
elif dtype.kind == "b":
|
||||
if compat:
|
||||
return False
|
||||
return np.nan
|
||||
return np.nan
|
||||
|
||||
|
||||
def remove_na_arraylike(arr: Series | Index | np.ndarray):
"""
Return array-like containing only true/non-NaN values, possibly empty.
"""
if isinstance(arr.dtype, ExtensionDtype):
return arr[notna(arr)]
else:
return arr[notna(np.asarray(arr))]


def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
"""
isna check that excludes incompatible dtypes

Parameters
----------
obj : object
dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype

Returns
-------
bool
"""
if not lib.is_scalar(obj) or not isna(obj):
return False
elif dtype.kind == "M":
if isinstance(dtype, np.dtype):
# i.e. not tzaware
return not isinstance(obj, (np.timedelta64, Decimal))
# we have to rule out tznaive dt64("NaT")
return not isinstance(obj, (np.timedelta64, np.datetime64, Decimal))
elif dtype.kind == "m":
return not isinstance(obj, (np.datetime64, Decimal))
elif dtype.kind in "iufc":
# Numeric
return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64))
elif dtype.kind == "b":
# We allow pd.NA, None, np.nan in BooleanArray (same as IntervalDtype)
return lib.is_float(obj) or obj is None or obj is libmissing.NA

elif dtype == _dtype_str:
# numpy string dtypes to avoid float np.nan
return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal, float))

elif dtype == _dtype_object:
# This is needed for Categorical, but is kind of weird
return True

elif isinstance(dtype, PeriodDtype):
return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))

elif isinstance(dtype, IntervalDtype):
return lib.is_float(obj) or obj is None or obj is libmissing.NA

elif isinstance(dtype, CategoricalDtype):
return is_valid_na_for_dtype(obj, dtype.categories.dtype)

# fallback, default to allowing NaN, None, NA, NaT
return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))


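The matrix of accepted NA markers is easiest to see with concrete values (a hedged restatement of two branches above, not additional behavior):

import numpy as np

# A timedelta64("NaT") is NA, but not a valid NA for a datetime64 dtype,
# so the kind == "M" branch rejects it while np.nan would pass.
td_nat = np.timedelta64("NaT")
print(isinstance(td_nat, np.timedelta64))  # True -> excluded for kind "M"
# For numeric dtypes ("iufc"), pd.NaT and the datetime/timedelta NaTs are
# rejected, leaving np.nan, None and pd.NA as the accepted markers.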
def isna_all(arr: ArrayLike) -> bool:
"""
Optimized equivalent to isna(arr).all()
"""
total_len = len(arr)

# Usually it's enough to check only a small fraction of values to see if
# a block is NOT null, chunks should help in such cases.
# parameters 1000 and 40 were chosen arbitrarily
chunk_len = max(total_len // 40, 1000)

dtype = arr.dtype
if lib.is_np_dtype(dtype, "f"):
checker = nan_checker

elif (lib.is_np_dtype(dtype, "mM")) or isinstance(
dtype, (DatetimeTZDtype, PeriodDtype)
):
# error: Incompatible types in assignment (expression has type
# "Callable[[Any], Any]", variable has type "ufunc")
checker = lambda x: np.asarray(x.view("i8")) == iNaT  # type: ignore[assignment]

else:
# error: Incompatible types in assignment (expression has type "Callable[[Any],
# Any]", variable has type "ufunc")
checker = lambda x: _isna_array(  # type: ignore[assignment]
x, inf_as_na=INF_AS_NA
)

return all(
checker(arr[i : i + chunk_len]).all() for i in range(0, total_len, chunk_len)
)
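The chunking trades a little Python overhead for an early exit: as soon as one chunk contains a non-null value, `all(...)` stops. A hedged standalone sketch of the same idea:

import numpy as np

arr = np.full(100_000, np.nan)
arr[50] = 1.0  # one non-null value near the front
chunk_len = max(len(arr) // 40, 1000)
result = all(
    np.isnan(arr[i : i + chunk_len]).all() for i in range(0, len(arr), chunk_len)
)
print(result)  # False, decided after scanning only the first chunk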
117
lib/python3.11/site-packages/pandas/core/flags.py
Normal file
117
lib/python3.11/site-packages/pandas/core/flags.py
Normal file
@ -0,0 +1,117 @@
from __future__ import annotations

from typing import TYPE_CHECKING
import weakref

if TYPE_CHECKING:
from pandas.core.generic import NDFrame


class Flags:
"""
Flags that apply to pandas objects.

Parameters
----------
obj : Series or DataFrame
The object these flags are associated with.
allows_duplicate_labels : bool, default True
Whether to allow duplicate labels in this object. By default,
duplicate labels are permitted. Setting this to ``False`` will
cause an :class:`errors.DuplicateLabelError` to be raised when
`index` (or columns for DataFrame) is not unique, or any
subsequent operation introduces duplicates.
See :ref:`duplicates.disallow` for more.

.. warning::

This is an experimental feature. Currently, many methods fail to
propagate the ``allows_duplicate_labels`` value. In future versions
it is expected that every method taking or returning one or more
DataFrame or Series objects will propagate ``allows_duplicate_labels``.

Examples
--------
Attributes can be set in two ways:

>>> df = pd.DataFrame()
>>> df.flags
<Flags(allows_duplicate_labels=True)>
>>> df.flags.allows_duplicate_labels = False
>>> df.flags
<Flags(allows_duplicate_labels=False)>

>>> df.flags['allows_duplicate_labels'] = True
>>> df.flags
<Flags(allows_duplicate_labels=True)>
"""

_keys: set[str] = {"allows_duplicate_labels"}

def __init__(self, obj: NDFrame, *, allows_duplicate_labels: bool) -> None:
self._allows_duplicate_labels = allows_duplicate_labels
self._obj = weakref.ref(obj)

@property
def allows_duplicate_labels(self) -> bool:
"""
Whether this object allows duplicate labels.

Setting ``allows_duplicate_labels=False`` ensures that the
index (and columns of a DataFrame) are unique. Most methods
that accept and return a Series or DataFrame will propagate
the value of ``allows_duplicate_labels``.

See :ref:`duplicates` for more.

See Also
--------
DataFrame.attrs : Set global metadata on this object.
DataFrame.set_flags : Set global flags on this object.

Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a'])
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Traceback (most recent call last):
...
pandas.errors.DuplicateLabelError: Index has duplicates.
positions
label
a [0, 1]
"""
return self._allows_duplicate_labels

@allows_duplicate_labels.setter
def allows_duplicate_labels(self, value: bool) -> None:
value = bool(value)
obj = self._obj()
if obj is None:
raise ValueError("This flag's object has been deleted.")

if not value:
for ax in obj.axes:
ax._maybe_check_unique()

self._allows_duplicate_labels = value

def __getitem__(self, key: str):
if key not in self._keys:
raise KeyError(key)

return getattr(self, key)

def __setitem__(self, key: str, value) -> None:
if key not in self._keys:
raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}")
setattr(self, key, value)

def __repr__(self) -> str:
return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>"

def __eq__(self, other) -> bool:
if isinstance(other, type(self)):
return self.allows_duplicate_labels == other.allows_duplicate_labels
return False
12704
lib/python3.11/site-packages/pandas/core/frame.py
Normal file
12704
lib/python3.11/site-packages/pandas/core/frame.py
Normal file
File diff suppressed because it is too large
13979
lib/python3.11/site-packages/pandas/core/generic.py
Normal file
13979
lib/python3.11/site-packages/pandas/core/generic.py
Normal file
File diff suppressed because it is too large
15
lib/python3.11/site-packages/pandas/core/groupby/__init__.py
Normal file
15
lib/python3.11/site-packages/pandas/core/groupby/__init__.py
Normal file
@ -0,0 +1,15 @@
from pandas.core.groupby.generic import (
DataFrameGroupBy,
NamedAgg,
SeriesGroupBy,
)
from pandas.core.groupby.groupby import GroupBy
from pandas.core.groupby.grouper import Grouper

__all__ = [
"DataFrameGroupBy",
"NamedAgg",
"SeriesGroupBy",
"GroupBy",
"Grouper",
]
121
lib/python3.11/site-packages/pandas/core/groupby/base.py
Normal file
121
lib/python3.11/site-packages/pandas/core/groupby/base.py
Normal file
@ -0,0 +1,121 @@
"""
Provide basic components for groupby.
"""
from __future__ import annotations

import dataclasses
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from collections.abc import Hashable


@dataclasses.dataclass(order=True, frozen=True)
class OutputKey:
label: Hashable
position: int


# special case to prevent duplicate plots when catching exceptions when
# forwarding methods from NDFrames
plotting_methods = frozenset(["plot", "hist"])

# cythonized transformations or canned "agg+broadcast", which do not
# require postprocessing of the result by transform.
cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"])

# List of aggregation/reduction functions.
# These map each group to a single numeric value
reduction_kernels = frozenset(
[
"all",
"any",
"corrwith",
"count",
"first",
"idxmax",
"idxmin",
"last",
"max",
"mean",
"median",
"min",
"nunique",
"prod",
# as long as `quantile`'s signature accepts only
# a single quantile value, it's a reduction.
# GH#27526 might change that.
"quantile",
"sem",
"size",
"skew",
"std",
"sum",
"var",
]
)

# List of transformation functions.
# a transformation is a function that, for each group,
# produces a result that has the same shape as the group.


transformation_kernels = frozenset(
[
"bfill",
"cumcount",
"cummax",
"cummin",
"cumprod",
"cumsum",
"diff",
"ffill",
"fillna",
"ngroup",
"pct_change",
"rank",
"shift",
]
)

# these are all the public methods on Grouper which don't belong
# in either of the above lists
groupby_other_methods = frozenset(
[
"agg",
"aggregate",
"apply",
"boxplot",
# corr and cov return ngroups*ncolumns rows, so they
# are neither a transformation nor a reduction
"corr",
"cov",
"describe",
"dtypes",
"expanding",
"ewm",
"filter",
"get_group",
"groups",
"head",
"hist",
"indices",
"ndim",
"ngroups",
"nth",
"ohlc",
"pipe",
"plot",
"resample",
"rolling",
"tail",
"take",
"transform",
"sample",
"value_counts",
]
)
# Valid values of `name` for `groupby.transform(name)`
# NOTE: do NOT edit this directly. New additions should be inserted
# into the appropriate list above.
transform_kernel_allowlist = reduction_kernels | transformation_kernels
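These frozensets drive validation in `groupby.transform`; a hedged sketch of how the allowlist is consulted, importing from the module path this file defines:

from pandas.core.groupby.base import (
    reduction_kernels,
    transformation_kernels,
    transform_kernel_allowlist,
)

name = "cumsum"
print(name in reduction_kernels)           # False: it keeps the group's shape
print(name in transformation_kernels)      # True
print(name in transform_kernel_allowlist)  # True: valid for groupby.transform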
87
lib/python3.11/site-packages/pandas/core/groupby/categorical.py
Normal file
87
lib/python3.11/site-packages/pandas/core/groupby/categorical.py
Normal file
@ -0,0 +1,87 @@
from __future__ import annotations

import numpy as np

from pandas.core.algorithms import unique1d
from pandas.core.arrays.categorical import (
Categorical,
CategoricalDtype,
recode_for_categories,
)


def recode_for_groupby(
c: Categorical, sort: bool, observed: bool
) -> tuple[Categorical, Categorical | None]:
"""
Code the categories to ensure we can groupby for categoricals.

If observed=True, we return a new Categorical with the observed
categories only.

If sort=False, return a copy of self, coded with categories as
returned by .unique(), followed by any categories not appearing in
the data. If sort=True, return self.

This method is needed solely to ensure the categorical index of the
GroupBy result has categories in the order of appearance in the data
(GH-8868).

Parameters
----------
c : Categorical
sort : bool
The value of the sort parameter groupby was called with.
observed : bool
Account only for the observed values

Returns
-------
Categorical
If sort=False, the new categories are set to the order of
appearance in codes (unless ordered=True, in which case the
original order is preserved), followed by any unrepresented
categories in the original order.
Categorical or None
If we are observed, return the original categorical, otherwise None
"""
# we only care about observed values
if observed:
# In cases with c.ordered, this is equivalent to
# return c.remove_unused_categories(), c

unique_codes = unique1d(c.codes)

take_codes = unique_codes[unique_codes != -1]
if sort:
take_codes = np.sort(take_codes)

# we recode according to the uniques
categories = c.categories.take(take_codes)
codes = recode_for_categories(c.codes, c.categories, categories)

# return a new categorical that maps our new codes
# and categories
dtype = CategoricalDtype(categories, ordered=c.ordered)
return Categorical._simple_new(codes, dtype=dtype), c

# Already sorted according to c.categories; all is fine
if sort:
return c, None

# sort=False should order groups in as-encountered order (GH-8868)

# xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
all_codes = np.arange(c.categories.nunique())
# GH 38140: exclude nan from indexer for categories
unique_notnan_codes = unique1d(c.codes[c.codes != -1])
if sort:
unique_notnan_codes = np.sort(unique_notnan_codes)
if len(all_codes) > len(unique_notnan_codes):
# GH 13179: All categories need to be present, even if missing from the data
missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
take_codes = np.concatenate((unique_notnan_codes, missing_codes))
else:
take_codes = unique_notnan_codes

return Categorical(c, c.unique().categories.take(take_codes)), None
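The effect of the `observed` branch is visible through the public API (a sketch; `recode_for_groupby` itself is internal):

import pandas as pd

c = pd.Categorical(["b", "a", "b"], categories=["a", "b", "c"])
s = pd.Series([1, 2, 3])
# observed=True drops the unused category "c" from the result index:
print(s.groupby(c, observed=True).sum().index.tolist())   # ['a', 'b']
print(s.groupby(c, observed=False).sum().index.tolist())  # ['a', 'b', 'c']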
2852
lib/python3.11/site-packages/pandas/core/groupby/generic.py
Normal file
2852
lib/python3.11/site-packages/pandas/core/groupby/generic.py
Normal file
File diff suppressed because it is too large
6003
lib/python3.11/site-packages/pandas/core/groupby/groupby.py
Normal file
6003
lib/python3.11/site-packages/pandas/core/groupby/groupby.py
Normal file
File diff suppressed because it is too large
1102
lib/python3.11/site-packages/pandas/core/groupby/grouper.py
Normal file
1102
lib/python3.11/site-packages/pandas/core/groupby/grouper.py
Normal file
File diff suppressed because it is too large
304
lib/python3.11/site-packages/pandas/core/groupby/indexing.py
Normal file
304
lib/python3.11/site-packages/pandas/core/groupby/indexing.py
Normal file
@ -0,0 +1,304 @@
from __future__ import annotations

from collections.abc import Iterable
from typing import (
TYPE_CHECKING,
Literal,
cast,
)

import numpy as np

from pandas.util._decorators import (
cache_readonly,
doc,
)

from pandas.core.dtypes.common import (
is_integer,
is_list_like,
)

if TYPE_CHECKING:
from pandas._typing import PositionalIndexer

from pandas import (
DataFrame,
Series,
)
from pandas.core.groupby import groupby


class GroupByIndexingMixin:
"""
Mixin for adding ._positional_selector to GroupBy.
"""

@cache_readonly
def _positional_selector(self) -> GroupByPositionalSelector:
"""
Return positional selection for each group.

``groupby._positional_selector[i:j]`` is similar to
``groupby.apply(lambda x: x.iloc[i:j])``
but much faster and preserves the original index and order.

``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head`
and :meth:`~GroupBy.tail`. For example:

- ``head(5)``
- ``_positional_selector[5:-5]``
- ``tail(5)``

together return all the rows.

Allowed inputs for the index are:

- An integer valued iterable, e.g. ``range(2, 4)``.
- A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``.

The output format is the same as :meth:`~GroupBy.head` and
:meth:`~GroupBy.tail`, namely
a subset of the ``DataFrame`` or ``Series`` with the index and order preserved.

Returns
-------
Series
The filtered subset of the original Series.
DataFrame
The filtered subset of the original DataFrame.

See Also
--------
DataFrame.iloc : Purely integer-location based indexing for selection by
position.
GroupBy.head : Return first n rows of each group.
GroupBy.tail : Return last n rows of each group.
GroupBy.nth : Take the nth row from each group if n is an int, or a
subset of rows, if n is a list of ints.

Notes
-----
- The slice step cannot be negative.
- If the index specification results in overlaps, the item is not duplicated.
- If the index specification changes the order of items, then
they are returned in their original order.
By contrast, ``DataFrame.iloc`` can change the row order.
- ``groupby()`` parameters such as as_index and dropna are ignored.

The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth`
with ``as_index=False`` are:

- Input to ``_positional_selector`` can include
one or more slices whereas ``nth``
just handles an integer or a list of integers.
- ``_positional_selector`` can accept a slice relative to the
last row of each group.
- ``_positional_selector`` does not have an equivalent to the
``nth()`` ``dropna`` parameter.

Examples
--------
>>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
... columns=["A", "B"])
>>> df.groupby("A")._positional_selector[1:2]
A B
1 a 2
4 b 5

>>> df.groupby("A")._positional_selector[1, -1]
A B
1 a 2
2 a 3
4 b 5
"""
if TYPE_CHECKING:
# pylint: disable-next=used-before-assignment
groupby_self = cast(groupby.GroupBy, self)
else:
groupby_self = self

return GroupByPositionalSelector(groupby_self)

def _make_mask_from_positional_indexer(
self,
arg: PositionalIndexer | tuple,
) -> np.ndarray:
if is_list_like(arg):
if all(is_integer(i) for i in cast(Iterable, arg)):
mask = self._make_mask_from_list(cast(Iterable[int], arg))
else:
mask = self._make_mask_from_tuple(cast(tuple, arg))

elif isinstance(arg, slice):
mask = self._make_mask_from_slice(arg)
elif is_integer(arg):
mask = self._make_mask_from_int(cast(int, arg))
else:
raise TypeError(
f"Invalid index {type(arg)}. "
"Must be integer, list-like, slice or a tuple of "
"integers and slices"
)

if isinstance(mask, bool):
if mask:
mask = self._ascending_count >= 0
else:
mask = self._ascending_count < 0

return cast(np.ndarray, mask)

def _make_mask_from_int(self, arg: int) -> np.ndarray:
if arg >= 0:
return self._ascending_count == arg
else:
return self._descending_count == (-arg - 1)

def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray:
positive = [arg for arg in args if arg >= 0]
negative = [-arg - 1 for arg in args if arg < 0]

mask: bool | np.ndarray = False

if positive:
mask |= np.isin(self._ascending_count, positive)

if negative:
mask |= np.isin(self._descending_count, negative)

return mask

def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray:
mask: bool | np.ndarray = False

for arg in args:
if is_integer(arg):
mask |= self._make_mask_from_int(cast(int, arg))
elif isinstance(arg, slice):
mask |= self._make_mask_from_slice(arg)
else:
raise ValueError(
f"Invalid argument {type(arg)}. Should be int or slice."
)

return mask

def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray:
start = arg.start
stop = arg.stop
step = arg.step

if step is not None and step < 0:
raise ValueError(f"Invalid step {step}. Must be non-negative")

mask: bool | np.ndarray = True

if step is None:
step = 1

if start is None:
if step > 1:
mask &= self._ascending_count % step == 0

elif start >= 0:
mask &= self._ascending_count >= start

if step > 1:
mask &= (self._ascending_count - start) % step == 0

else:
mask &= self._descending_count < -start

offset_array = self._descending_count + start + 1
limit_array = (
self._ascending_count + self._descending_count + (start + 1)
) < 0
offset_array = np.where(limit_array, self._ascending_count, offset_array)

mask &= offset_array % step == 0

if stop is not None:
if stop >= 0:
mask &= self._ascending_count < stop
else:
mask &= self._descending_count >= -stop

return mask

@cache_readonly
def _ascending_count(self) -> np.ndarray:
if TYPE_CHECKING:
groupby_self = cast(groupby.GroupBy, self)
else:
groupby_self = self

return groupby_self._cumcount_array()

@cache_readonly
def _descending_count(self) -> np.ndarray:
if TYPE_CHECKING:
groupby_self = cast(groupby.GroupBy, self)
else:
groupby_self = self

return groupby_self._cumcount_array(ascending=False)


@doc(GroupByIndexingMixin._positional_selector)
class GroupByPositionalSelector:
def __init__(self, groupby_object: groupby.GroupBy) -> None:
self.groupby_object = groupby_object

def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series:
"""
Select by positional index per group.

Implements GroupBy._positional_selector

Parameters
----------
arg : PositionalIndexer | tuple
Allowed values are:
- int
- int valued iterable such as list or range
- slice with step either None or positive
- tuple of integers and slices

Returns
-------
Series
The filtered subset of the original groupby Series.
DataFrame
The filtered subset of the original groupby DataFrame.

See Also
--------
DataFrame.iloc : Integer-location based indexing for selection by position.
GroupBy.head : Return first n rows of each group.
GroupBy.tail : Return last n rows of each group.
GroupBy._positional_selector : Return positional selection for each group.
GroupBy.nth : Take the nth row from each group if n is an int, or a
subset of rows, if n is a list of ints.
"""
mask = self.groupby_object._make_mask_from_positional_indexer(arg)
return self.groupby_object._mask_selected_obj(mask)


class GroupByNthSelector:
"""
Dynamically substituted for GroupBy.nth to enable both call and index
"""

def __init__(self, groupby_object: groupby.GroupBy) -> None:
self.groupby_object = groupby_object

def __call__(
self,
n: PositionalIndexer | tuple,
dropna: Literal["any", "all", None] = None,
) -> DataFrame | Series:
return self.groupby_object._nth(n, dropna)

def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series:
return self.groupby_object._nth(n)
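The masks above are built from per-group positional counters; the same counters are available publicly as `cumcount`, which makes the semantics easy to reproduce (hedged sketch):

import pandas as pd

df = pd.DataFrame({"A": ["a", "a", "a", "b", "b"], "B": range(5)})
g = df.groupby("A")
asc = g.cumcount()                  # 0,1,2,0,1: what _ascending_count holds
desc = g.cumcount(ascending=False)  # 2,1,0,1,0: what _descending_count holds
# Equivalent of _positional_selector[1:-1]: drop first and last row per group
print(df[(asc >= 1) & (desc >= 1)])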
181
lib/python3.11/site-packages/pandas/core/groupby/numba_.py
Normal file
181
lib/python3.11/site-packages/pandas/core/groupby/numba_.py
Normal file
@ -0,0 +1,181 @@
"""Common utilities for Numba operations with groupby ops"""
from __future__ import annotations

import functools
import inspect
from typing import (
TYPE_CHECKING,
Any,
Callable,
)

import numpy as np

from pandas.compat._optional import import_optional_dependency

from pandas.core.util.numba_ import (
NumbaUtilError,
jit_user_function,
)

if TYPE_CHECKING:
from pandas._typing import Scalar


def validate_udf(func: Callable) -> None:
"""
Validate user defined function for ops when using Numba with groupby ops.

The first signature arguments should include:

def f(values, index, ...):
...

Parameters
----------
func : function
user defined function

Returns
-------
None

Raises
------
NumbaUtilError
"""
if not callable(func):
raise NotImplementedError(
"Numba engine can only be used with a single function."
)
udf_signature = list(inspect.signature(func).parameters.keys())
expected_args = ["values", "index"]
min_number_args = len(expected_args)
if (
len(udf_signature) < min_number_args
or udf_signature[:min_number_args] != expected_args
):
raise NumbaUtilError(
f"The first {min_number_args} arguments to {func.__name__} must be "
f"{expected_args}"
)


@functools.cache
def generate_numba_agg_func(
func: Callable[..., Scalar],
nopython: bool,
nogil: bool,
parallel: bool,
) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
"""
Generate a numba jitted agg function specified by values from engine_kwargs.

1. jit the user's function
2. Return a groupby agg function with the jitted function inline

Configurations specified in engine_kwargs apply to both the user's
function _AND_ the groupby evaluation loop.

Parameters
----------
func : function
function to be applied to each group and will be JITed
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit

Returns
-------
Numba function
"""
numba_func = jit_user_function(func)
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")

@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def group_agg(
values: np.ndarray,
index: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
num_columns: int,
*args: Any,
) -> np.ndarray:
assert len(begin) == len(end)
num_groups = len(begin)

result = np.empty((num_groups, num_columns))
for i in numba.prange(num_groups):
group_index = index[begin[i] : end[i]]
for j in numba.prange(num_columns):
group = values[begin[i] : end[i], j]
result[i, j] = numba_func(group, group_index, *args)
return result

return group_agg


@functools.cache
def generate_numba_transform_func(
func: Callable[..., np.ndarray],
nopython: bool,
nogil: bool,
parallel: bool,
) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
"""
Generate a numba jitted transform function specified by values from engine_kwargs.

1. jit the user's function
2. Return a groupby transform function with the jitted function inline

Configurations specified in engine_kwargs apply to both the user's
function _AND_ the groupby evaluation loop.

Parameters
----------
func : function
function to be applied to each window and will be JITed
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit

Returns
-------
Numba function
"""
numba_func = jit_user_function(func)
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")

@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def group_transform(
values: np.ndarray,
index: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
num_columns: int,
*args: Any,
) -> np.ndarray:
assert len(begin) == len(end)
num_groups = len(begin)

result = np.empty((len(values), num_columns))
for i in numba.prange(num_groups):
group_index = index[begin[i] : end[i]]
for j in numba.prange(num_columns):
group = values[begin[i] : end[i], j]
result[begin[i] : end[i], j] = numba_func(group, group_index, *args)
return result

return group_transform
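In practice these generators sit behind `engine="numba"`; the only user-facing contract is the `(values, index, ...)` signature checked by `validate_udf`. A hedged usage sketch (requires the optional numba dependency):

import pandas as pd

def f(values, index):  # first two parameters must be named exactly this
    return values.sum()

df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1.0, 2.0, 3.0]})
print(df.groupby("key")["x"].agg(f, engine="numba"))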
1208
lib/python3.11/site-packages/pandas/core/groupby/ops.py
Normal file
1208
lib/python3.11/site-packages/pandas/core/groupby/ops.py
Normal file
File diff suppressed because it is too large
31
lib/python3.11/site-packages/pandas/core/indexers/__init__.py
Normal file
31
lib/python3.11/site-packages/pandas/core/indexers/__init__.py
Normal file
@ -0,0 +1,31 @@
from pandas.core.indexers.utils import (
check_array_indexer,
check_key_length,
check_setitem_lengths,
disallow_ndim_indexing,
is_empty_indexer,
is_list_like_indexer,
is_scalar_indexer,
is_valid_positional_slice,
length_of_indexer,
maybe_convert_indices,
unpack_1tuple,
unpack_tuple_and_ellipses,
validate_indices,
)

__all__ = [
"is_valid_positional_slice",
"is_list_like_indexer",
"is_scalar_indexer",
"is_empty_indexer",
"check_setitem_lengths",
"validate_indices",
"maybe_convert_indices",
"length_of_indexer",
"disallow_ndim_indexing",
"unpack_1tuple",
"check_key_length",
"check_array_indexer",
"unpack_tuple_and_ellipses",
]
453
lib/python3.11/site-packages/pandas/core/indexers/objects.py
Normal file
453
lib/python3.11/site-packages/pandas/core/indexers/objects.py
Normal file
@ -0,0 +1,453 @@
"""Indexer objects for computing start/end window bounds for rolling operations"""
from __future__ import annotations

from datetime import timedelta

import numpy as np

from pandas._libs.tslibs import BaseOffset
from pandas._libs.window.indexers import calculate_variable_window_bounds
from pandas.util._decorators import Appender

from pandas.core.dtypes.common import ensure_platform_int

from pandas.core.indexes.datetimes import DatetimeIndex

from pandas.tseries.offsets import Nano

get_window_bounds_doc = """
Computes the bounds of a window.

Parameters
----------
num_values : int, default 0
number of values that will be aggregated over
window_size : int, default 0
the number of rows in a window
min_periods : int, default None
min_periods passed from the top level rolling API
center : bool, default None
center passed from the top level rolling API
closed : str, default None
closed passed from the top level rolling API
step : int, default None
step passed from the top level rolling API
.. versionadded:: 1.5
win_type : str, default None
win_type passed from the top level rolling API

Returns
-------
A tuple of ndarray[int64]s, indicating the boundaries of each
window
"""


class BaseIndexer:
"""
Base class for window bounds calculations.

Examples
--------
>>> from pandas.api.indexers import BaseIndexer
>>> class CustomIndexer(BaseIndexer):
... def get_window_bounds(self, num_values, min_periods, center, closed, step):
... start = np.empty(num_values, dtype=np.int64)
... end = np.empty(num_values, dtype=np.int64)
... for i in range(num_values):
... start[i] = i
... end[i] = i + self.window_size
... return start, end
>>> df = pd.DataFrame({"values": range(5)})
>>> indexer = CustomIndexer(window_size=2)
>>> df.rolling(indexer).sum()
values
0 1.0
1 3.0
2 5.0
3 7.0
4 4.0
"""

def __init__(
self, index_array: np.ndarray | None = None, window_size: int = 0, **kwargs
) -> None:
self.index_array = index_array
self.window_size = window_size
# Set user defined kwargs as attributes that can be used in get_window_bounds
for key, value in kwargs.items():
setattr(self, key, value)

@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
raise NotImplementedError


class FixedWindowIndexer(BaseIndexer):
"""Creates window boundaries that are of fixed length."""

@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
if center or self.window_size == 0:
offset = (self.window_size - 1) // 2
else:
offset = 0

end = np.arange(1 + offset, num_values + 1 + offset, step, dtype="int64")
start = end - self.window_size
if closed in ["left", "both"]:
start -= 1
if closed in ["left", "neither"]:
end -= 1

end = np.clip(end, 0, num_values)
start = np.clip(start, 0, num_values)

return start, end


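A small check of the fixed-window arithmetic above, with `window_size=3`, `num_values=5` and default `step`/`closed` (hedged, standalone):

import numpy as np

window_size, num_values = 3, 5
end = np.arange(1, num_values + 1, dtype="int64")
start = np.clip(end - window_size, 0, num_values)
print([(int(s), int(e)) for s, e in zip(start, end)])
# [(0, 1), (0, 2), (0, 3), (1, 4), (2, 5)] -- trailing windows of width <= 3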
class VariableWindowIndexer(BaseIndexer):
"""Creates window boundaries that are of variable length, namely for time series."""

@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
# error: Argument 4 to "calculate_variable_window_bounds" has incompatible
# type "Optional[bool]"; expected "bool"
# error: Argument 6 to "calculate_variable_window_bounds" has incompatible
# type "Optional[ndarray]"; expected "ndarray"
return calculate_variable_window_bounds(
num_values,
self.window_size,
min_periods,
center,  # type: ignore[arg-type]
closed,
self.index_array,  # type: ignore[arg-type]
)


class VariableOffsetWindowIndexer(BaseIndexer):
"""
Calculate window boundaries based on a non-fixed offset such as a BusinessDay.

Examples
--------
>>> from pandas.api.indexers import VariableOffsetWindowIndexer
>>> df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10))
>>> offset = pd.offsets.BDay(1)
>>> indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset)
>>> df
0
2020-01-01 0
2020-01-02 1
2020-01-03 2
2020-01-04 3
2020-01-05 4
2020-01-06 5
2020-01-07 6
2020-01-08 7
2020-01-09 8
2020-01-10 9
>>> df.rolling(indexer).sum()
0
2020-01-01 0.0
2020-01-02 1.0
2020-01-03 2.0
2020-01-04 3.0
2020-01-05 7.0
2020-01-06 12.0
2020-01-07 6.0
2020-01-08 7.0
2020-01-09 8.0
2020-01-10 9.0
"""

def __init__(
self,
index_array: np.ndarray | None = None,
window_size: int = 0,
index: DatetimeIndex | None = None,
offset: BaseOffset | None = None,
**kwargs,
) -> None:
super().__init__(index_array, window_size, **kwargs)
if not isinstance(index, DatetimeIndex):
raise ValueError("index must be a DatetimeIndex.")
self.index = index
if not isinstance(offset, BaseOffset):
raise ValueError("offset must be a DateOffset-like object.")
self.offset = offset

@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
if step is not None:
raise NotImplementedError("step not implemented for variable offset window")
if num_values <= 0:
return np.empty(0, dtype="int64"), np.empty(0, dtype="int64")

# if windows is variable, default is 'right', otherwise default is 'both'
if closed is None:
closed = "right" if self.index is not None else "both"

right_closed = closed in ["right", "both"]
left_closed = closed in ["left", "both"]

if self.index[num_values - 1] < self.index[0]:
index_growth_sign = -1
else:
index_growth_sign = 1
offset_diff = index_growth_sign * self.offset

start = np.empty(num_values, dtype="int64")
start.fill(-1)
end = np.empty(num_values, dtype="int64")
end.fill(-1)

start[0] = 0

# right endpoint is closed
if right_closed:
end[0] = 1
# right endpoint is open
else:
end[0] = 0

zero = timedelta(0)
# start is start of slice interval (including)
# end is end of slice interval (not including)
for i in range(1, num_values):
end_bound = self.index[i]
start_bound = end_bound - offset_diff

# left endpoint is closed
if left_closed:
start_bound -= Nano(1)

# advance the start bound until we are
# within the constraint
start[i] = i
for j in range(start[i - 1], i):
start_diff = (self.index[j] - start_bound) * index_growth_sign
if start_diff > zero:
start[i] = j
break

# end bound is previous end
# or current index
end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign
if end_diff == zero and not right_closed:
end[i] = end[i - 1] + 1
elif end_diff <= zero:
end[i] = i + 1
else:
end[i] = end[i - 1]

# right endpoint is open
if not right_closed:
end[i] -= 1

return start, end


class ExpandingIndexer(BaseIndexer):
"""Calculate expanding window bounds, mimicking df.expanding()"""

@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
return (
np.zeros(num_values, dtype=np.int64),
np.arange(1, num_values + 1, dtype=np.int64),
)


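`ExpandingIndexer` above is the whole of `df.expanding()`: every window starts at row 0 and the end grows by one row. A hedged check through the public API:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])
# start = zeros, end = arange(1, n + 1) reproduces the expanding sums:
print(s.expanding().sum().tolist())  # [1.0, 3.0, 6.0, 10.0]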
class FixedForwardWindowIndexer(BaseIndexer):
"""
Creates window boundaries for fixed-length windows that include the current row.

Examples
--------
>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
>>> df
B
0 0.0
1 1.0
2 2.0
3 NaN
4 4.0

>>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2)
>>> df.rolling(window=indexer, min_periods=1).sum()
B
0 1.0
1 3.0
2 2.0
3 4.0
4 4.0
"""

@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
if center:
raise ValueError("Forward-looking windows can't have center=True")
if closed is not None:
raise ValueError(
"Forward-looking windows don't support setting the closed argument"
)
if step is None:
step = 1

start = np.arange(0, num_values, step, dtype="int64")
end = start + self.window_size
if self.window_size:
end = np.clip(end, 0, num_values)

return start, end


class GroupbyIndexer(BaseIndexer):
"""Calculate bounds to compute groupby rolling, mimicking df.groupby().rolling()"""

def __init__(
self,
index_array: np.ndarray | None = None,
window_size: int | BaseIndexer = 0,
groupby_indices: dict | None = None,
window_indexer: type[BaseIndexer] = BaseIndexer,
indexer_kwargs: dict | None = None,
**kwargs,
) -> None:
"""
Parameters
----------
index_array : np.ndarray or None
np.ndarray of the index of the original object that we are performing
a chained groupby operation over. This index has been pre-sorted relative to
the groups
window_size : int or BaseIndexer
window size during the windowing operation
groupby_indices : dict or None
dict of {group label: [positional index of rows belonging to the group]}
window_indexer : BaseIndexer
BaseIndexer class determining the start and end bounds of each group
indexer_kwargs : dict or None
Custom kwargs to be passed to window_indexer
**kwargs :
keyword arguments that will be available when get_window_bounds is called
"""
self.groupby_indices = groupby_indices or {}
self.window_indexer = window_indexer
self.indexer_kwargs = indexer_kwargs.copy() if indexer_kwargs else {}
super().__init__(
index_array=index_array,
window_size=self.indexer_kwargs.pop("window_size", window_size),
**kwargs,
)

@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
# 1) For each group, get the indices that belong to the group
# 2) Use the indices to calculate the start & end bounds of the window
# 3) Append the window bounds in group order
start_arrays = []
end_arrays = []
window_indices_start = 0
for key, indices in self.groupby_indices.items():
index_array: np.ndarray | None

if self.index_array is not None:
index_array = self.index_array.take(ensure_platform_int(indices))
else:
index_array = self.index_array
indexer = self.window_indexer(
index_array=index_array,
window_size=self.window_size,
**self.indexer_kwargs,
)
start, end = indexer.get_window_bounds(
len(indices), min_periods, center, closed, step
)
start = start.astype(np.int64)
end = end.astype(np.int64)
assert len(start) == len(
end
), "these should be equal in length from get_window_bounds"
# Cannot use groupby_indices as they might not be monotonic with the object
# we're rolling over
window_indices = np.arange(
window_indices_start, window_indices_start + len(indices)
)
window_indices_start += len(indices)
# Extend as we'll be slicing window like [start, end)
window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype(
np.int64, copy=False
)
start_arrays.append(window_indices.take(ensure_platform_int(start)))
end_arrays.append(window_indices.take(ensure_platform_int(end)))
if len(start_arrays) == 0:
return np.array([], dtype=np.int64), np.array([], dtype=np.int64)
start = np.concatenate(start_arrays)
end = np.concatenate(end_arrays)
return start, end


class ExponentialMovingWindowIndexer(BaseIndexer):
"""Calculate ewm window bounds (the entire window)"""

@Appender(get_window_bounds_doc)
def get_window_bounds(
self,
num_values: int = 0,
min_periods: int | None = None,
center: bool | None = None,
closed: str | None = None,
step: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64)
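`GroupbyIndexer` is the machinery behind grouped rolling: bounds are computed per group with the wrapped indexer, then re-expressed in the row order of the original frame. A public-API sketch:

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b", "b"], "x": [1.0, 2.0, 3.0, 4.0]})
# Windows never cross group boundaries, so the first "b" row restarts at 3.0:
print(df.groupby("g")["x"].rolling(2, min_periods=1).sum().tolist())
# [1.0, 3.0, 3.0, 7.0]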
553
lib/python3.11/site-packages/pandas/core/indexers/utils.py
Normal file
553
lib/python3.11/site-packages/pandas/core/indexers/utils.py
Normal file
@ -0,0 +1,553 @@
"""
Low-dependency indexing utilities.
"""
from __future__ import annotations

from typing import (
TYPE_CHECKING,
Any,
)

import numpy as np

from pandas._libs import lib

from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_integer,
is_integer_dtype,
is_list_like,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCIndex,
ABCSeries,
)

if TYPE_CHECKING:
from pandas._typing import AnyArrayLike

from pandas.core.frame import DataFrame
from pandas.core.indexes.base import Index

# -----------------------------------------------------------
# Indexer Identification


def is_valid_positional_slice(slc: slice) -> bool:
"""
Check if a slice object can be interpreted as a positional indexer.

Parameters
----------
slc : slice

Returns
-------
bool

Notes
-----
A valid positional slice may also be interpreted as a label-based slice
depending on the index being sliced.
"""
return (
lib.is_int_or_none(slc.start)
and lib.is_int_or_none(slc.stop)
and lib.is_int_or_none(slc.step)
)

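A quick illustration of the check (hedged; the helper is re-exported from pandas.core.indexers, as the __init__ module earlier in this commit shows):

from pandas.core.indexers import is_valid_positional_slice

print(is_valid_positional_slice(slice(1, 5, 2)))     # True
print(is_valid_positional_slice(slice(None, None)))  # True: all components None
print(is_valid_positional_slice(slice("a", "c")))    # False: label-based only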

def is_list_like_indexer(key) -> bool:
"""
Check if we have a list-like indexer that is *not* a NamedTuple.

Parameters
----------
key : object

Returns
-------
bool
"""
# allow a list_like, but exclude NamedTuples which can be indexers
return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple)


def is_scalar_indexer(indexer, ndim: int) -> bool:
"""
Return True if we are all scalar indexers.

Parameters
----------
indexer : object
ndim : int
Number of dimensions in the object being indexed.

Returns
-------
bool
"""
if ndim == 1 and is_integer(indexer):
# GH37748: allow indexer to be an integer for Series
return True
if isinstance(indexer, tuple) and len(indexer) == ndim:
return all(is_integer(x) for x in indexer)
return False


def is_empty_indexer(indexer) -> bool:
"""
Check if we have an empty indexer.

Parameters
----------
indexer : object

Returns
-------
bool
"""
if is_list_like(indexer) and not len(indexer):
return True
if not isinstance(indexer, tuple):
indexer = (indexer,)
return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer)


# -----------------------------------------------------------
# Indexer Validation


def check_setitem_lengths(indexer, value, values) -> bool:
"""
Validate that value and indexer are the same length.

A special case is allowed for when the indexer is a boolean array
and the number of true values equals the length of ``value``. In
this case, no exception is raised.

Parameters
----------
indexer : sequence
Key for the setitem.
value : array-like
Value for the setitem.
values : array-like
Values being set into.

Returns
-------
bool
Whether this is an empty listlike setting which is a no-op.

Raises
------
ValueError
When the indexer is an ndarray or list and the lengths don't match.
"""
no_op = False

if isinstance(indexer, (np.ndarray, list)):
# We can ignore other listlikes because they are either
# a) not necessarily 1-D indexers, e.g. tuple
# b) boolean indexers e.g. BoolArray
if is_list_like(value):
if len(indexer) != len(value) and values.ndim == 1:
# boolean with truth values == len of the value is ok too
if isinstance(indexer, list):
indexer = np.array(indexer)
if not (
isinstance(indexer, np.ndarray)
and indexer.dtype == np.bool_
and indexer.sum() == len(value)
):
raise ValueError(
"cannot set using a list-like indexer "
"with a different length than the value"
)
if not len(indexer):
no_op = True

elif isinstance(indexer, slice):
if is_list_like(value):
if len(value) != length_of_indexer(indexer, values) and values.ndim == 1:
# In case of two dimensional value is used row-wise and broadcasted
raise ValueError(
"cannot set using a slice indexer with a "
"different length than the value"
)
if not len(value):
no_op = True

return no_op


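The boolean special case above mirrors plain numpy semantics, where a mask with k True entries accepts a value of length k (hedged sketch):

import numpy as np

values = np.arange(5)
mask = np.array([True, False, True, False, False])
values[mask] = [10, 20]  # len(value) == mask.sum(), so this is allowed
print(values)  # [10  1 20  3  4]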
def validate_indices(indices: np.ndarray, n: int) -> None:
"""
Perform bounds-checking for an indexer.

-1 is allowed for indicating missing values.

Parameters
----------
indices : ndarray
n : int
Length of the array being indexed.

Raises
------
ValueError

Examples
--------
>>> validate_indices(np.array([1, 2]), 3)  # OK

>>> validate_indices(np.array([1, -2]), 3)
Traceback (most recent call last):
...
ValueError: negative dimensions are not allowed

>>> validate_indices(np.array([1, 2, 3]), 3)
Traceback (most recent call last):
...
IndexError: indices are out-of-bounds

>>> validate_indices(np.array([-1, -1]), 0)  # OK

>>> validate_indices(np.array([0, 1]), 0)
Traceback (most recent call last):
...
IndexError: indices are out-of-bounds
"""
if len(indices):
min_idx = indices.min()
if min_idx < -1:
msg = f"'indices' contains values less than allowed ({min_idx} < -1)"
raise ValueError(msg)

max_idx = indices.max()
if max_idx >= n:
raise IndexError("indices are out-of-bounds")


# -----------------------------------------------------------
# Indexer Conversion


def maybe_convert_indices(indices, n: int, verify: bool = True) -> np.ndarray:
"""
Attempt to convert indices into valid, positive indices.

If we have negative indices, translate to positive here.
If we have indices that are out-of-bounds, raise an IndexError.

Parameters
----------
indices : array-like
Array of indices that we are to convert.
n : int
Number of elements in the array that we are indexing.
verify : bool, default True
Check that all entries are between 0 and n - 1, inclusive.

Returns
-------
array-like
An array-like of positive indices that correspond to the ones
that were passed in initially to this function.

Raises
------
IndexError
One of the converted indices either exceeded the number of
elements (specified by `n`), or was still negative.
"""
if isinstance(indices, list):
indices = np.array(indices)
if len(indices) == 0:
# If `indices` is empty, np.array will return a float,
# and will cause indexing errors.
return np.empty(0, dtype=np.intp)

mask = indices < 0
if mask.any():
indices = indices.copy()
indices[mask] += n

if verify:
mask = (indices >= n) | (indices < 0)
if mask.any():
raise IndexError("indices are out-of-bounds")
return indices


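Hedged illustration of the conversion above, matching Python's negative-index convention (the helper is re-exported from pandas.core.indexers):

import numpy as np
from pandas.core.indexers import maybe_convert_indices

print(maybe_convert_indices(np.array([0, -1, -2]), n=5))  # [0 4 3]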
# -----------------------------------------------------------
# Unsorted


def length_of_indexer(indexer, target=None) -> int:
"""
Return the expected length of target[indexer]

Returns
-------
int
"""
if target is not None and isinstance(indexer, slice):
target_len = len(target)
start = indexer.start
stop = indexer.stop
step = indexer.step
if start is None:
start = 0
elif start < 0:
start += target_len
if stop is None or stop > target_len:
stop = target_len
elif stop < 0:
stop += target_len
if step is None:
step = 1
elif step < 0:
start, stop = stop + 1, start + 1
step = -step
return (stop - start + step - 1) // step
elif isinstance(indexer, (ABCSeries, ABCIndex, np.ndarray, list)):
if isinstance(indexer, list):
indexer = np.array(indexer)

if indexer.dtype == bool:
# GH#25774
return indexer.sum()
return len(indexer)
elif isinstance(indexer, range):
return (indexer.stop - indexer.start) // indexer.step
elif not is_list_like_indexer(indexer):
return 1
raise AssertionError("cannot find the length of the indexer")


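The slice arithmetic above agrees with what Python slicing actually yields (hedged check; the helper is re-exported from pandas.core.indexers):

from pandas.core.indexers import length_of_indexer

target = list(range(10))
print(length_of_indexer(slice(2, 9, 3), target))  # 3
print(len(target[2:9:3]))                         # also 3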
def disallow_ndim_indexing(result) -> None:
|
||||
"""
|
||||
Helper function to disallow multi-dimensional indexing on 1D Series/Index.
|
||||
|
||||
GH#27125 indexer like idx[:, None] expands dim, but we cannot do that
|
||||
and keep an index, so we used to return ndarray, which was deprecated
|
||||
in GH#30588.
|
||||
"""
|
||||
if np.ndim(result) > 1:
|
||||
raise ValueError(
|
||||
"Multi-dimensional indexing (e.g. `obj[:, None]`) is no longer "
|
||||
"supported. Convert to a numpy array before indexing instead."
|
||||
)
|
||||
|
||||
|
||||


def unpack_1tuple(tup):
    """
    If we have a length-1 tuple/list that contains a slice, unpack to just
    the slice.

    Notes
    -----
    The list case is deprecated.
    """
    if len(tup) == 1 and isinstance(tup[0], slice):
        # if we don't have a MultiIndex, we may still be able to handle
        # a 1-tuple.  see test_1tuple_without_multiindex

        if isinstance(tup, list):
            # GH#31299
            raise ValueError(
                "Indexing with a single-item list containing a "
                "slice is not allowed. Pass a tuple instead.",
            )

        return tup[0]
    return tup
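
# Illustrative sketch (editor's addition): only a 1-tuple wrapping a slice is
# unpacked; a single-item list with a slice is rejected outright.
#
#   >>> unpack_1tuple((slice(1, 3),))
#   slice(1, 3, None)
#   >>> unpack_1tuple((1,))
#   (1,)
#   >>> unpack_1tuple([slice(1, 3)])
#   Traceback (most recent call last):
#   ...
#   ValueError: Indexing with a single-item list containing a slice is not allowed. Pass a tuple instead.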


def check_key_length(columns: Index, key, value: DataFrame) -> None:
    """
    Checks if a key used as indexer has the same length as the columns it is
    associated with.

    Parameters
    ----------
    columns : Index
        The columns of the DataFrame to index.
    key : list-like
        The keys to index with.
    value : DataFrame
        The value to set for the keys.

    Raises
    ------
    ValueError
        If the length of key is not equal to the number of columns in value
        or if the number of columns referenced by key is not equal to number
        of columns.
    """
    if columns.is_unique:
        if len(value.columns) != len(key):
            raise ValueError("Columns must be same length as key")
    else:
        # Missing keys in columns are represented as -1
        if len(columns.get_indexer_non_unique(key)[0]) != len(value.columns):
            raise ValueError("Columns must be same length as key")
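
# Illustrative sketch (editor's addition): with unique columns the key must
# match the width of the assigned frame.
#
#   >>> df = pd.DataFrame({"a": [1], "b": [2]})
#   >>> check_key_length(df.columns, ["a"], pd.DataFrame({"x": [0], "y": [0]}))
#   Traceback (most recent call last):
#   ...
#   ValueError: Columns must be same length as key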


def unpack_tuple_and_ellipses(item: tuple):
    """
    Possibly unpack arr[..., n] to arr[n]
    """
    if len(item) > 1:
        # Note: we are assuming this indexing is being done on a 1D arraylike
        if item[0] is Ellipsis:
            item = item[1:]
        elif item[-1] is Ellipsis:
            item = item[:-1]

    if len(item) > 1:
        raise IndexError("too many indices for array.")

    item = item[0]
    return item
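
# Illustrative sketch (editor's addition): a leading or trailing Ellipsis is
# stripped before the remaining single index is unpacked.
#
#   >>> unpack_tuple_and_ellipses((Ellipsis, 3))
#   3
#   >>> unpack_tuple_and_ellipses((0, 1, 2))
#   Traceback (most recent call last):
#   ...
#   IndexError: too many indices for array.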


# -----------------------------------------------------------
# Public indexer validation


def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
    """
    Check if `indexer` is a valid array indexer for `array`.

    For a boolean mask, `array` and `indexer` are checked to have the same
    length. The dtype is validated, and if it is an integer or boolean
    ExtensionArray, it is checked if there are missing values present, and
    it is converted to the appropriate numpy array. Other dtypes will raise
    an error.

    Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed
    through as is.

    Parameters
    ----------
    array : array-like
        The array that is being indexed (only used for the length).
    indexer : array-like or list-like
        The array-like that's used to index. List-like input that is not yet
        a numpy array or an ExtensionArray is converted to one. Other input
        types are passed through as is.

    Returns
    -------
    numpy.ndarray
        The validated indexer as a numpy array that can be used to index.

    Raises
    ------
    IndexError
        When the lengths don't match.
    ValueError
        When `indexer` cannot be converted to a numpy ndarray to index
        (e.g. presence of missing values).

    See Also
    --------
    api.types.is_bool_dtype : Check if `key` is of boolean dtype.

    Examples
    --------
    When checking a boolean mask, a boolean ndarray is returned when the
    arguments are all valid.

    >>> mask = pd.array([True, False])
    >>> arr = pd.array([1, 2])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    array([ True, False])

    An IndexError is raised when the lengths don't match.

    >>> mask = pd.array([True, False, True])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    Traceback (most recent call last):
    ...
    IndexError: Boolean index has wrong length: 3 instead of 2.

    NA values in a boolean array are treated as False.

    >>> mask = pd.array([True, pd.NA])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    array([ True, False])

    A numpy boolean mask will get passed through (if the length is correct):

    >>> mask = np.array([True, False])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    array([ True, False])

    Similarly for integer indexers, an integer ndarray is returned when it is
    a valid indexer, otherwise an error is raised (for integer indexers, a
    matching length is not required):

    >>> indexer = pd.array([0, 2], dtype="Int64")
    >>> arr = pd.array([1, 2, 3])
    >>> pd.api.indexers.check_array_indexer(arr, indexer)
    array([0, 2])

    >>> indexer = pd.array([0, pd.NA], dtype="Int64")
    >>> pd.api.indexers.check_array_indexer(arr, indexer)
    Traceback (most recent call last):
    ...
    ValueError: Cannot index with an integer indexer containing NA values

    For non-integer/boolean dtypes, an appropriate error is raised:

    >>> indexer = np.array([0., 2.], dtype="float64")
    >>> pd.api.indexers.check_array_indexer(arr, indexer)
    Traceback (most recent call last):
    ...
    IndexError: arrays used as indices must be of integer or boolean type
    """
    from pandas.core.construction import array as pd_array

    # whatever is not an array-like is returned as-is (possible valid array
    # indexers that are not array-like: integer, slice, Ellipsis, None)
    # In this context, tuples are not considered as array-like, as they have
    # a specific meaning in indexing (multi-dimensional indexing)
    if is_list_like(indexer):
        if isinstance(indexer, tuple):
            return indexer
    else:
        return indexer

    # convert list-likes to array
    if not is_array_like(indexer):
        indexer = pd_array(indexer)
        if len(indexer) == 0:
            # empty list is converted to float array by pd.array
            indexer = np.array([], dtype=np.intp)

    dtype = indexer.dtype
    if is_bool_dtype(dtype):
        if isinstance(dtype, ExtensionDtype):
            indexer = indexer.to_numpy(dtype=bool, na_value=False)
        else:
            indexer = np.asarray(indexer, dtype=bool)

        # GH26658
        if len(indexer) != len(array):
            raise IndexError(
                f"Boolean index has wrong length: "
                f"{len(indexer)} instead of {len(array)}"
            )
    elif is_integer_dtype(dtype):
        try:
            indexer = np.asarray(indexer, dtype=np.intp)
        except ValueError as err:
            raise ValueError(
                "Cannot index with an integer indexer containing NA values"
            ) from err
    else:
        raise IndexError("arrays used as indices must be of integer or boolean type")

    return indexer
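
# Illustrative sketch (editor's addition): non-array-like indexers (scalars,
# slices, None) and tuples are passed through unchanged.
#
#   >>> arr = pd.array([1, 2, 3])
#   >>> check_array_indexer(arr, 0)
#   0
#   >>> check_array_indexer(arr, (0, 1))
#   (0, 1)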
643
lib/python3.11/site-packages/pandas/core/indexes/accessors.py
Normal file
@ -0,0 +1,643 @@
"""
datetimelike delegation
"""
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    cast,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    is_integer_dtype,
    is_list_like,
)
from pandas.core.dtypes.dtypes import (
    ArrowDtype,
    CategoricalDtype,
    DatetimeTZDtype,
    PeriodDtype,
)
from pandas.core.dtypes.generic import ABCSeries

from pandas.core.accessor import (
    PandasDelegate,
    delegate_names,
)
from pandas.core.arrays import (
    DatetimeArray,
    PeriodArray,
    TimedeltaArray,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray
from pandas.core.base import (
    NoNewAttributesMixin,
    PandasObject,
)
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex

if TYPE_CHECKING:
    from pandas import (
        DataFrame,
        Series,
    )


class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin):
    _hidden_attrs = PandasObject._hidden_attrs | {
        "orig",
        "name",
    }

    def __init__(self, data: Series, orig) -> None:
        if not isinstance(data, ABCSeries):
            raise TypeError(
                f"cannot convert an object of type {type(data)} to a datetimelike index"
            )

        self._parent = data
        self.orig = orig
        self.name = getattr(data, "name", None)
        self._freeze()

    def _get_values(self):
        data = self._parent
        if lib.is_np_dtype(data.dtype, "M"):
            return DatetimeIndex(data, copy=False, name=self.name)

        elif isinstance(data.dtype, DatetimeTZDtype):
            return DatetimeIndex(data, copy=False, name=self.name)

        elif lib.is_np_dtype(data.dtype, "m"):
            return TimedeltaIndex(data, copy=False, name=self.name)

        elif isinstance(data.dtype, PeriodDtype):
            return PeriodArray(data, copy=False)

        raise TypeError(
            f"cannot convert an object of type {type(data)} to a datetimelike index"
        )

    def _delegate_property_get(self, name: str):
        from pandas import Series

        values = self._get_values()

        result = getattr(values, name)

        # maybe need to upcast (ints)
        if isinstance(result, np.ndarray):
            if is_integer_dtype(result):
                result = result.astype("int64")
        elif not is_list_like(result):
            return result

        result = np.asarray(result)

        if self.orig is not None:
            index = self.orig.index
        else:
            index = self._parent.index
        # return the result as a Series
        result = Series(result, index=index, name=self.name).__finalize__(self._parent)

        # setting this object will show a SettingWithCopyWarning/Error
        result._is_copy = (
            "modifications to a property of a datetimelike "
            "object are not supported and are discarded. "
            "Change values on the original."
        )

        return result

    def _delegate_property_set(self, name: str, value, *args, **kwargs):
        raise ValueError(
            "modifications to a property of a datetimelike object are not supported. "
            "Change values on the original."
        )

    def _delegate_method(self, name: str, *args, **kwargs):
        from pandas import Series

        values = self._get_values()

        method = getattr(values, name)
        result = method(*args, **kwargs)

        if not is_list_like(result):
            return result

        result = Series(result, index=self._parent.index, name=self.name).__finalize__(
            self._parent
        )

        # setting this object will show a SettingWithCopyWarning/Error
        result._is_copy = (
            "modifications to a method of a datetimelike "
            "object are not supported and are discarded. "
            "Change values on the original."
        )

        return result
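
# Illustrative sketch (editor's addition): _get_values maps the parent
# Series' dtype onto the datetimelike container that actually implements
# the delegated attributes.
#
#   datetime64[ns] / datetime64[ns, tz]  ->  DatetimeIndex
#   timedelta64[ns]                      ->  TimedeltaIndex
#   period[freq]                         ->  PeriodArray
#   anything else                        ->  TypeError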


@delegate_names(
    delegate=ArrowExtensionArray,
    accessors=TimedeltaArray._datetimelike_ops,
    typ="property",
    accessor_mapping=lambda x: f"_dt_{x}",
    raise_on_missing=False,
)
@delegate_names(
    delegate=ArrowExtensionArray,
    accessors=TimedeltaArray._datetimelike_methods,
    typ="method",
    accessor_mapping=lambda x: f"_dt_{x}",
    raise_on_missing=False,
)
@delegate_names(
    delegate=ArrowExtensionArray,
    accessors=DatetimeArray._datetimelike_ops,
    typ="property",
    accessor_mapping=lambda x: f"_dt_{x}",
    raise_on_missing=False,
)
@delegate_names(
    delegate=ArrowExtensionArray,
    accessors=DatetimeArray._datetimelike_methods,
    typ="method",
    accessor_mapping=lambda x: f"_dt_{x}",
    raise_on_missing=False,
)
class ArrowTemporalProperties(PandasDelegate, PandasObject, NoNewAttributesMixin):
    def __init__(self, data: Series, orig) -> None:
        if not isinstance(data, ABCSeries):
            raise TypeError(
                f"cannot convert an object of type {type(data)} to a datetimelike index"
            )

        self._parent = data
        self._orig = orig
        self._freeze()

    def _delegate_property_get(self, name: str):
        if not hasattr(self._parent.array, f"_dt_{name}"):
            raise NotImplementedError(
                f"dt.{name} is not supported for {self._parent.dtype}"
            )
        result = getattr(self._parent.array, f"_dt_{name}")

        if not is_list_like(result):
            return result

        if self._orig is not None:
            index = self._orig.index
        else:
            index = self._parent.index
        # return the result as a Series, which is by definition a copy
        result = type(self._parent)(
            result, index=index, name=self._parent.name
        ).__finalize__(self._parent)

        return result

    def _delegate_method(self, name: str, *args, **kwargs):
        if not hasattr(self._parent.array, f"_dt_{name}"):
            raise NotImplementedError(
                f"dt.{name} is not supported for {self._parent.dtype}"
            )

        result = getattr(self._parent.array, f"_dt_{name}")(*args, **kwargs)

        if self._orig is not None:
            index = self._orig.index
        else:
            index = self._parent.index
        # return the result as a Series, which is by definition a copy
        result = type(self._parent)(
            result, index=index, name=self._parent.name
        ).__finalize__(self._parent)

        return result

    def to_pytimedelta(self):
        return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta()

    def to_pydatetime(self):
        # GH#20306
        warnings.warn(
            f"The behavior of {type(self).__name__}.to_pydatetime is deprecated, "
            "in a future version this will return a Series containing python "
            "datetime objects instead of an ndarray. To retain the old behavior, "
            "call `np.array` on the result",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return cast(ArrowExtensionArray, self._parent.array)._dt_to_pydatetime()

    def isocalendar(self) -> DataFrame:
        from pandas import DataFrame

        result = (
            cast(ArrowExtensionArray, self._parent.array)
            ._dt_isocalendar()
            ._pa_array.combine_chunks()
        )
        iso_calendar_df = DataFrame(
            {
                col: type(self._parent.array)(result.field(i))  # type: ignore[call-arg]
                for i, col in enumerate(["year", "week", "day"])
            }
        )
        return iso_calendar_df

    @property
    def components(self) -> DataFrame:
        from pandas import DataFrame

        components_df = DataFrame(
            {
                col: getattr(self._parent.array, f"_dt_{col}")
                for col in [
                    "days",
                    "hours",
                    "minutes",
                    "seconds",
                    "milliseconds",
                    "microseconds",
                    "nanoseconds",
                ]
            }
        )
        return components_df


@delegate_names(
    delegate=DatetimeArray,
    accessors=DatetimeArray._datetimelike_ops + ["unit"],
    typ="property",
)
@delegate_names(
    delegate=DatetimeArray,
    accessors=DatetimeArray._datetimelike_methods + ["as_unit"],
    typ="method",
)
class DatetimeProperties(Properties):
    """
    Accessor object for datetimelike properties of the Series values.

    Examples
    --------
    >>> seconds_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="s"))
    >>> seconds_series
    0   2000-01-01 00:00:00
    1   2000-01-01 00:00:01
    2   2000-01-01 00:00:02
    dtype: datetime64[ns]
    >>> seconds_series.dt.second
    0    0
    1    1
    2    2
    dtype: int32

    >>> hours_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h"))
    >>> hours_series
    0   2000-01-01 00:00:00
    1   2000-01-01 01:00:00
    2   2000-01-01 02:00:00
    dtype: datetime64[ns]
    >>> hours_series.dt.hour
    0    0
    1    1
    2    2
    dtype: int32

    >>> quarters_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="QE"))
    >>> quarters_series
    0   2000-03-31
    1   2000-06-30
    2   2000-09-30
    dtype: datetime64[ns]
    >>> quarters_series.dt.quarter
    0    1
    1    2
    2    3
    dtype: int32

    Returns a Series indexed like the original Series.
    Raises TypeError if the Series does not contain datetimelike values.
    """

    def to_pydatetime(self) -> np.ndarray:
        """
        Return the data as an array of :class:`datetime.datetime` objects.

        .. deprecated:: 2.1.0

            The current behavior of dt.to_pydatetime is deprecated.
            In a future version this will return a Series containing python
            datetime objects instead of an ndarray.

        Timezone information is retained if present.

        .. warning::

            Python's datetime uses microsecond resolution, which is lower than
            pandas (nanosecond). The values are truncated.

        Returns
        -------
        numpy.ndarray
            Object dtype array containing native Python datetime objects.

        See Also
        --------
        datetime.datetime : Standard library value for a datetime.

        Examples
        --------
        >>> s = pd.Series(pd.date_range('20180310', periods=2))
        >>> s
        0   2018-03-10
        1   2018-03-11
        dtype: datetime64[ns]

        >>> s.dt.to_pydatetime()
        array([datetime.datetime(2018, 3, 10, 0, 0),
               datetime.datetime(2018, 3, 11, 0, 0)], dtype=object)

        pandas' nanosecond precision is truncated to microseconds.

        >>> s = pd.Series(pd.date_range('20180310', periods=2, freq='ns'))
        >>> s
        0   2018-03-10 00:00:00.000000000
        1   2018-03-10 00:00:00.000000001
        dtype: datetime64[ns]

        >>> s.dt.to_pydatetime()
        array([datetime.datetime(2018, 3, 10, 0, 0),
               datetime.datetime(2018, 3, 10, 0, 0)], dtype=object)
        """
        # GH#20306
        warnings.warn(
            f"The behavior of {type(self).__name__}.to_pydatetime is deprecated, "
            "in a future version this will return a Series containing python "
            "datetime objects instead of an ndarray. To retain the old behavior, "
            "call `np.array` on the result",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._get_values().to_pydatetime()

    @property
    def freq(self):
        return self._get_values().inferred_freq

    def isocalendar(self) -> DataFrame:
        """
        Calculate year, week, and day according to the ISO 8601 standard.

        Returns
        -------
        DataFrame
            With columns year, week and day.

        See Also
        --------
        Timestamp.isocalendar : Function return a 3-tuple containing ISO year,
            week number, and weekday for the given Timestamp object.
        datetime.date.isocalendar : Return a named tuple object with
            three components: year, week and weekday.

        Examples
        --------
        >>> ser = pd.to_datetime(pd.Series(["2010-01-01", pd.NaT]))
        >>> ser.dt.isocalendar()
           year  week  day
        0  2009    53    5
        1  <NA>  <NA> <NA>
        >>> ser.dt.isocalendar().week
        0      53
        1    <NA>
        Name: week, dtype: UInt32
        """
        return self._get_values().isocalendar().set_index(self._parent.index)


@delegate_names(
    delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property"
)
@delegate_names(
    delegate=TimedeltaArray,
    accessors=TimedeltaArray._datetimelike_methods,
    typ="method",
)
class TimedeltaProperties(Properties):
    """
    Accessor object for datetimelike properties of the Series values.

    Returns a Series indexed like the original Series.
    Raises TypeError if the Series does not contain datetimelike values.

    Examples
    --------
    >>> seconds_series = pd.Series(
    ...     pd.timedelta_range(start="1 second", periods=3, freq="s")
    ... )
    >>> seconds_series
    0   0 days 00:00:01
    1   0 days 00:00:02
    2   0 days 00:00:03
    dtype: timedelta64[ns]
    >>> seconds_series.dt.seconds
    0    1
    1    2
    2    3
    dtype: int32
    """

    def to_pytimedelta(self) -> np.ndarray:
        """
        Return an array of native :class:`datetime.timedelta` objects.

        Python's standard `datetime` library uses a different representation
        for timedeltas. This method converts a Series of pandas Timedeltas
        to `datetime.timedelta` format with the same length as the original
        Series.

        Returns
        -------
        numpy.ndarray
            1D array containing data with `datetime.timedelta` type.

        See Also
        --------
        datetime.timedelta : A duration expressing the difference
            between two date, time, or datetime instances.

        Examples
        --------
        >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d"))
        >>> s
        0   0 days
        1   1 days
        2   2 days
        3   3 days
        4   4 days
        dtype: timedelta64[ns]

        >>> s.dt.to_pytimedelta()
        array([datetime.timedelta(0), datetime.timedelta(days=1),
               datetime.timedelta(days=2), datetime.timedelta(days=3),
               datetime.timedelta(days=4)], dtype=object)
        """
        return self._get_values().to_pytimedelta()

    @property
    def components(self):
        """
        Return a DataFrame of the components of the Timedeltas.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s'))
        >>> s
        0   0 days 00:00:00
        1   0 days 00:00:01
        2   0 days 00:00:02
        3   0 days 00:00:03
        4   0 days 00:00:04
        dtype: timedelta64[ns]
        >>> s.dt.components
           days  hours  minutes  seconds  milliseconds  microseconds  nanoseconds
        0     0      0        0        0             0             0            0
        1     0      0        0        1             0             0            0
        2     0      0        0        2             0             0            0
        3     0      0        0        3             0             0            0
        4     0      0        0        4             0             0            0
        """
        return (
            self._get_values()
            .components.set_index(self._parent.index)
            .__finalize__(self._parent)
        )

    @property
    def freq(self):
        return self._get_values().inferred_freq


@delegate_names(
    delegate=PeriodArray, accessors=PeriodArray._datetimelike_ops, typ="property"
)
@delegate_names(
    delegate=PeriodArray, accessors=PeriodArray._datetimelike_methods, typ="method"
)
class PeriodProperties(Properties):
    """
    Accessor object for datetimelike properties of the Series values.

    Returns a Series indexed like the original Series.
    Raises TypeError if the Series does not contain datetimelike values.

    Examples
    --------
    >>> seconds_series = pd.Series(
    ...     pd.period_range(
    ...         start="2000-01-01 00:00:00", end="2000-01-01 00:00:03", freq="s"
    ...     )
    ... )
    >>> seconds_series
    0    2000-01-01 00:00:00
    1    2000-01-01 00:00:01
    2    2000-01-01 00:00:02
    3    2000-01-01 00:00:03
    dtype: period[s]
    >>> seconds_series.dt.second
    0    0
    1    1
    2    2
    3    3
    dtype: int64

    >>> hours_series = pd.Series(
    ...     pd.period_range(start="2000-01-01 00:00", end="2000-01-01 03:00", freq="h")
    ... )
    >>> hours_series
    0    2000-01-01 00:00
    1    2000-01-01 01:00
    2    2000-01-01 02:00
    3    2000-01-01 03:00
    dtype: period[h]
    >>> hours_series.dt.hour
    0    0
    1    1
    2    2
    3    3
    dtype: int64

    >>> quarters_series = pd.Series(
    ...     pd.period_range(start="2000-01-01", end="2000-12-31", freq="Q-DEC")
    ... )
    >>> quarters_series
    0    2000Q1
    1    2000Q2
    2    2000Q3
    3    2000Q4
    dtype: period[Q-DEC]
    >>> quarters_series.dt.quarter
    0    1
    1    2
    2    3
    3    4
    dtype: int64
    """


class CombinedDatetimelikeProperties(
    DatetimeProperties, TimedeltaProperties, PeriodProperties
):
    def __new__(cls, data: Series):  # pyright: ignore[reportInconsistentConstructor]
        # CombinedDatetimelikeProperties isn't really instantiated. Instead
        # we need to choose which parent (datetime or timedelta) is
        # appropriate. Since we're checking the dtypes anyway, we'll just
        # do all the validation here.

        if not isinstance(data, ABCSeries):
            raise TypeError(
                f"cannot convert an object of type {type(data)} to a datetimelike index"
            )

        orig = data if isinstance(data.dtype, CategoricalDtype) else None
        if orig is not None:
            data = data._constructor(
                orig.array,
                name=orig.name,
                copy=False,
                dtype=orig._values.categories.dtype,
                index=orig.index,
            )

        if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm":
            return ArrowTemporalProperties(data, orig)
        if lib.is_np_dtype(data.dtype, "M"):
            return DatetimeProperties(data, orig)
        elif isinstance(data.dtype, DatetimeTZDtype):
            return DatetimeProperties(data, orig)
        elif lib.is_np_dtype(data.dtype, "m"):
            return TimedeltaProperties(data, orig)
        elif isinstance(data.dtype, PeriodDtype):
            return PeriodProperties(data, orig)

        raise AttributeError("Can only use .dt accessor with datetimelike values")
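
# Illustrative sketch (editor's addition): __new__ acts as a dispatcher, so
# `.dt` on a Series resolves to the accessor matching the dtype.
#
#   >>> type(pd.Series(pd.date_range("2000", periods=2)).dt).__name__
#   'DatetimeProperties'
#   >>> type(pd.Series(pd.timedelta_range("1D", periods=2)).dt).__name__
#   'TimedeltaProperties'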
388
lib/python3.11/site-packages/pandas/core/indexes/api.py
Normal file
@ -0,0 +1,388 @@
from __future__ import annotations

import textwrap
from typing import (
    TYPE_CHECKING,
    cast,
)

import numpy as np

from pandas._libs import (
    NaT,
    lib,
)
from pandas.errors import InvalidIndexError

from pandas.core.dtypes.cast import find_common_type

from pandas.core.algorithms import safe_sort
from pandas.core.indexes.base import (
    Index,
    _new_Index,
    ensure_index,
    ensure_index_from_sequences,
    get_unanimous_names,
)
from pandas.core.indexes.category import CategoricalIndex
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.interval import IntervalIndex
from pandas.core.indexes.multi import MultiIndex
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexes.range import RangeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex

if TYPE_CHECKING:
    from pandas._typing import Axis

_sort_msg = textwrap.dedent(
    """\
Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.
"""
)


__all__ = [
    "Index",
    "MultiIndex",
    "CategoricalIndex",
    "IntervalIndex",
    "RangeIndex",
    "InvalidIndexError",
    "TimedeltaIndex",
    "PeriodIndex",
    "DatetimeIndex",
    "_new_Index",
    "NaT",
    "ensure_index",
    "ensure_index_from_sequences",
    "get_objs_combined_axis",
    "union_indexes",
    "get_unanimous_names",
    "all_indexes_same",
    "default_index",
    "safe_sort_index",
]


def get_objs_combined_axis(
    objs,
    intersect: bool = False,
    axis: Axis = 0,
    sort: bool = True,
    copy: bool = False,
) -> Index:
    """
    Extract combined index: return intersection or union (depending on the
    value of "intersect") of indexes on given axis, or None if all objects
    lack indexes (e.g. they are numpy arrays).

    Parameters
    ----------
    objs : list
        Series or DataFrame objects, may be mix of the two.
    intersect : bool, default False
        If True, calculate the intersection between indexes. Otherwise,
        calculate the union.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        The axis to extract indexes from.
    sort : bool, default True
        Whether the result index should come out sorted or not.
    copy : bool, default False
        If True, return a copy of the combined index.

    Returns
    -------
    Index
    """
    obs_idxes = [obj._get_axis(axis) for obj in objs]
    return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy)


def _get_distinct_objs(objs: list[Index]) -> list[Index]:
    """
    Return a list with distinct elements of "objs" (different ids).
    Preserves order.
    """
    ids: set[int] = set()
    res = []
    for obj in objs:
        if id(obj) not in ids:
            ids.add(id(obj))
            res.append(obj)
    return res


def _get_combined_index(
    indexes: list[Index],
    intersect: bool = False,
    sort: bool = False,
    copy: bool = False,
) -> Index:
    """
    Return the union or intersection of indexes.

    Parameters
    ----------
    indexes : list of Index or list objects
        When intersect=True, do not accept list of lists.
    intersect : bool, default False
        If True, calculate the intersection between indexes. Otherwise,
        calculate the union.
    sort : bool, default False
        Whether the result index should come out sorted or not.
    copy : bool, default False
        If True, return a copy of the combined index.

    Returns
    -------
    Index
    """
    # TODO: handle index names!
    indexes = _get_distinct_objs(indexes)
    if len(indexes) == 0:
        index = Index([])
    elif len(indexes) == 1:
        index = indexes[0]
    elif intersect:
        index = indexes[0]
        for other in indexes[1:]:
            index = index.intersection(other)
    else:
        index = union_indexes(indexes, sort=False)
        index = ensure_index(index)

    if sort:
        index = safe_sort_index(index)
    # GH 29879
    if copy:
        index = index.copy()

    return index
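
# Illustrative sketch (editor's addition): intersect=True folds the indexes
# with Index.intersection; otherwise union_indexes combines them.
#
#   >>> _get_combined_index([pd.Index([1, 2]), pd.Index([2, 3])])
#   Index([1, 2, 3], dtype='int64')
#   >>> _get_combined_index([pd.Index([1, 2]), pd.Index([2, 3])], intersect=True)
#   Index([2], dtype='int64')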


def safe_sort_index(index: Index) -> Index:
    """
    Returns the sorted index

    We keep the dtypes and the name attributes.

    Parameters
    ----------
    index : an Index

    Returns
    -------
    Index
    """
    if index.is_monotonic_increasing:
        return index

    try:
        array_sorted = safe_sort(index)
    except TypeError:
        pass
    else:
        if isinstance(array_sorted, Index):
            return array_sorted

        array_sorted = cast(np.ndarray, array_sorted)
        if isinstance(index, MultiIndex):
            index = MultiIndex.from_tuples(array_sorted, names=index.names)
        else:
            index = Index(array_sorted, name=index.name, dtype=index.dtype)

    return index
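
# Illustrative sketch (editor's addition): a non-monotonic index comes back
# sorted; if safe_sort raises TypeError (incomparable elements), the index
# is returned unchanged.
#
#   >>> safe_sort_index(pd.Index([3, 1, 2]))
#   Index([1, 2, 3], dtype='int64')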


def union_indexes(indexes, sort: bool | None = True) -> Index:
    """
    Return the union of indexes.

    The behavior of sort and names is not consistent.

    Parameters
    ----------
    indexes : list of Index or list objects
    sort : bool, default True
        Whether the result index should come out sorted or not.

    Returns
    -------
    Index
    """
    if len(indexes) == 0:
        raise AssertionError("Must have at least 1 Index to union")
    if len(indexes) == 1:
        result = indexes[0]
        if isinstance(result, list):
            if not sort:
                result = Index(result)
            else:
                result = Index(sorted(result))
        return result

    indexes, kind = _sanitize_and_check(indexes)

    def _unique_indices(inds, dtype) -> Index:
        """
        Concatenate indices and remove duplicates.

        Parameters
        ----------
        inds : list of Index or list objects
        dtype : dtype to set for the resulting Index

        Returns
        -------
        Index
        """
        if all(isinstance(ind, Index) for ind in inds):
            inds = [ind.astype(dtype, copy=False) for ind in inds]
            result = inds[0].unique()
            other = inds[1].append(inds[2:])
            diff = other[result.get_indexer_for(other) == -1]
            if len(diff):
                result = result.append(diff.unique())
            if sort:
                result = result.sort_values()
            return result

        def conv(i):
            if isinstance(i, Index):
                i = i.tolist()
            return i

        return Index(
            lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort),
            dtype=dtype,
        )

    def _find_common_index_dtype(inds):
        """
        Finds a common type for the indexes to pass through to resulting index.

        Parameters
        ----------
        inds : list of Index or list objects

        Returns
        -------
        The common type or None if no indexes were given
        """
        dtypes = [idx.dtype for idx in inds if isinstance(idx, Index)]
        if dtypes:
            dtype = find_common_type(dtypes)
        else:
            dtype = None

        return dtype

    if kind == "special":
        result = indexes[0]

        dtis = [x for x in indexes if isinstance(x, DatetimeIndex)]
        dti_tzs = [x for x in dtis if x.tz is not None]
        if len(dti_tzs) not in [0, len(dtis)]:
            # TODO: this behavior is not tested (so may not be desired),
            #  but is kept in order to keep behavior the same when
            #  deprecating union_many
            #  test_frame_from_dict_with_mixed_indexes
            raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")

        if len(dtis) == len(indexes):
            sort = True
            result = indexes[0]

        elif len(dtis) > 1:
            # If we have mixed timezones, our casting behavior may depend on
            #  the order of indexes, which we don't want.
            sort = False

            # TODO: what about Categorical[dt64]?
            #  test_frame_from_dict_with_mixed_indexes
            indexes = [x.astype(object, copy=False) for x in indexes]
            result = indexes[0]

        for other in indexes[1:]:
            result = result.union(other, sort=None if sort else False)
        return result

    elif kind == "array":
        dtype = _find_common_index_dtype(indexes)
        index = indexes[0]
        if not all(index.equals(other) for other in indexes[1:]):
            index = _unique_indices(indexes, dtype)

        name = get_unanimous_names(*indexes)[0]
        if name != index.name:
            index = index.rename(name)
        return index
    else:  # kind='list'
        dtype = _find_common_index_dtype(indexes)
        return _unique_indices(indexes, dtype)
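
# Illustrative sketch (editor's addition): for plain Index inputs the union
# keeps first-seen order unless sorting is requested.
#
#   >>> union_indexes([pd.Index([1, 3]), pd.Index([2, 3])], sort=False)
#   Index([1, 3, 2], dtype='int64')
#   >>> union_indexes([pd.Index([1, 3]), pd.Index([2, 3])])
#   Index([1, 2, 3], dtype='int64')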


def _sanitize_and_check(indexes):
    """
    Verify the type of indexes and convert lists to Index.

    Cases:

    - [list, list, ...]: Return ([list, list, ...], 'list')
    - [list, Index, ...]: Return _sanitize_and_check([Index, Index, ...])
      Lists are sorted and converted to Index.
    - [Index, Index, ...]: Return ([Index, Index, ...], TYPE)
      TYPE = 'special' if at least one special type, 'array' otherwise.

    Parameters
    ----------
    indexes : list of Index or list objects

    Returns
    -------
    sanitized_indexes : list of Index or list objects
    type : {'list', 'array', 'special'}
    """
    kinds = list({type(index) for index in indexes})

    if list in kinds:
        if len(kinds) > 1:
            indexes = [
                Index(list(x)) if not isinstance(x, Index) else x for x in indexes
            ]
            kinds.remove(list)
        else:
            return indexes, "list"

    if len(kinds) > 1 or Index not in kinds:
        return indexes, "special"
    else:
        return indexes, "array"
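
# Illustrative sketch (editor's addition): the returned kind drives the
# branch taken in union_indexes.
#
#   >>> _sanitize_and_check([[1, 2], [2, 3]])
#   ([[1, 2], [2, 3]], 'list')
#   >>> _sanitize_and_check([pd.Index([1]), [2, 3]])[1]
#   'array'
#   >>> _sanitize_and_check([pd.Index([1]), pd.date_range("2000", periods=1)])[1]
#   'special'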


def all_indexes_same(indexes) -> bool:
    """
    Determine if all indexes contain the same elements.

    Parameters
    ----------
    indexes : iterable of Index objects

    Returns
    -------
    bool
        True if all indexes contain the same elements, False otherwise.
    """
    itr = iter(indexes)
    first = next(itr)
    return all(first.equals(index) for index in itr)


def default_index(n: int) -> RangeIndex:
    rng = range(n)
    return RangeIndex._simple_new(rng, name=None)
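
# Illustrative sketch (editor's addition): default_index builds the implicit
# 0..n-1 RangeIndex used when no index is supplied.
#
#   >>> default_index(3)
#   RangeIndex(start=0, stop=3, step=1)
#   >>> all_indexes_same([pd.Index([1, 2]), pd.Index([1, 2])])
#   True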
7943
lib/python3.11/site-packages/pandas/core/indexes/base.py
Normal file
File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff