2025-09-07 22:09:54 +02:00
parent e1b817252c
commit 2fc0d000b6
7796 changed files with 2159515 additions and 933 deletions


@@ -0,0 +1,43 @@
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.base import (
ExtensionArray,
ExtensionOpsMixin,
ExtensionScalarOpsMixin,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.datetimes import DatetimeArray
from pandas.core.arrays.floating import FloatingArray
from pandas.core.arrays.integer import IntegerArray
from pandas.core.arrays.interval import IntervalArray
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.arrays.period import (
PeriodArray,
period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray
__all__ = [
"ArrowExtensionArray",
"ExtensionArray",
"ExtensionOpsMixin",
"ExtensionScalarOpsMixin",
"ArrowStringArray",
"BaseMaskedArray",
"BooleanArray",
"Categorical",
"DatetimeArray",
"FloatingArray",
"IntegerArray",
"IntervalArray",
"NumpyExtensionArray",
"PeriodArray",
"period_array",
"SparseArray",
"StringArray",
"TimedeltaArray",
]
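A minimal sketch of how these re-exports are typically reached (assuming a pandas build that exposes this module; pd.array is the public constructor that dispatches to the classes listed above):

import pandas as pd
from pandas.core.arrays import IntegerArray, period_array

arr = pd.array([1, 2, None], dtype="Int64")   # dispatches to IntegerArray
print(isinstance(arr, IntegerArray))          # True
print(period_array(["2024-01", "2024-02"], freq="M").dtype)   # period[M]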


@@ -0,0 +1,362 @@
from __future__ import annotations
from functools import partial
import re
from typing import (
TYPE_CHECKING,
Any,
Literal,
)
import numpy as np
from pandas._libs import lib
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
pa_version_under13p0,
pa_version_under17p0,
)
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
if TYPE_CHECKING:
from collections.abc import Callable
from pandas._typing import (
Scalar,
Self,
)
class ArrowStringArrayMixin:
_pa_array: pa.ChunkedArray
def __init__(self, *args, **kwargs) -> None:
raise NotImplementedError
def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
# Convert a bool-dtype result to the appropriate result type
raise NotImplementedError
def _convert_int_result(self, result):
# Convert an integer-dtype result to the appropriate result type
raise NotImplementedError
def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
raise NotImplementedError
def _str_len(self):
result = pc.utf8_length(self._pa_array)
return self._convert_int_result(result)
def _str_lower(self) -> Self:
return type(self)(pc.utf8_lower(self._pa_array))
def _str_upper(self) -> Self:
return type(self)(pc.utf8_upper(self._pa_array))
def _str_strip(self, to_strip=None) -> Self:
if to_strip is None:
result = pc.utf8_trim_whitespace(self._pa_array)
else:
result = pc.utf8_trim(self._pa_array, characters=to_strip)
return type(self)(result)
def _str_lstrip(self, to_strip=None) -> Self:
if to_strip is None:
result = pc.utf8_ltrim_whitespace(self._pa_array)
else:
result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
return type(self)(result)
def _str_rstrip(self, to_strip=None) -> Self:
if to_strip is None:
result = pc.utf8_rtrim_whitespace(self._pa_array)
else:
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
return type(self)(result)
def _str_pad(
self,
width: int,
side: Literal["left", "right", "both"] = "left",
fillchar: str = " ",
):
if side == "left":
pa_pad = pc.utf8_lpad
elif side == "right":
pa_pad = pc.utf8_rpad
elif side == "both":
if pa_version_under17p0:
# GH#59624 fall back to object dtype
from pandas import array as pd_array
obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined]
obj = pd_array(obj_arr, dtype=object)
result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined]
return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined]
else:
# GH#54792
# https://github.com/apache/arrow/issues/15053#issuecomment-2317032347
lean_left = (width % 2) == 0
pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left)
else:
raise ValueError(
f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
)
return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
def _str_get(self, i: int):
lengths = pc.utf8_length(self._pa_array)
if i >= 0:
out_of_bounds = pc.greater_equal(i, lengths)
start = i
stop = i + 1
step = 1
else:
out_of_bounds = pc.greater(-i, lengths)
start = i
stop = i - 1
step = -1
not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
selected = pc.utf8_slice_codeunits(
self._pa_array, start=start, stop=stop, step=step
)
null_value = pa.scalar(None, type=self._pa_array.type)
result = pc.if_else(not_out_of_bounds, selected, null_value)
return type(self)(result)
def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
):
if pa_version_under11p0:
# GH#59724
result = self._apply_elementwise(lambda val: val[start:stop:step])
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
if start is None:
if step is not None and step < 0:
# GH#59710
start = -1
else:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)
def _str_slice_replace(
self, start: int | None = None, stop: int | None = None, repl: str | None = None
):
if repl is None:
repl = ""
if start is None:
start = 0
if stop is None:
stop = np.iinfo(np.int64).max
return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
def _str_replace(
self,
pat: str | re.Pattern,
repl: str | Callable,
n: int = -1,
case: bool = True,
flags: int = 0,
regex: bool = True,
) -> Self:
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
raise NotImplementedError(
"replace is not supported with a re.Pattern, callable repl, "
"case=False, or flags!=0"
)
func = pc.replace_substring_regex if regex else pc.replace_substring
# https://github.com/apache/arrow/issues/39149
# GH 56404, unexpected behavior with negative max_replacements with pyarrow.
pa_max_replacements = None if n < 0 else n
result = func(
self._pa_array,
pattern=pat,
replacement=repl,
max_replacements=pa_max_replacements,
)
return type(self)(result)
def _str_capitalize(self) -> Self:
return type(self)(pc.utf8_capitalize(self._pa_array))
def _str_title(self):
return type(self)(pc.utf8_title(self._pa_array))
def _str_swapcase(self):
return type(self)(pc.utf8_swapcase(self._pa_array))
def _str_removeprefix(self, prefix: str):
if not pa_version_under13p0:
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
result = pc.if_else(starts_with, removed, self._pa_array)
return type(self)(result)
predicate = lambda val: val.removeprefix(prefix)
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))
def _str_removesuffix(self, suffix: str):
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
result = pc.if_else(ends_with, removed, self._pa_array)
return type(self)(result)
def _str_startswith(
self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
):
if isinstance(pat, str):
result = pc.starts_with(self._pa_array, pattern=pat)
else:
if len(pat) == 0:
# For empty tuple we return null for missing values and False
# for valid values.
result = pc.if_else(pc.is_null(self._pa_array), None, False)
else:
result = pc.starts_with(self._pa_array, pattern=pat[0])
for p in pat[1:]:
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
return self._convert_bool_result(result, na=na, method_name="startswith")
def _str_endswith(
self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
):
if isinstance(pat, str):
result = pc.ends_with(self._pa_array, pattern=pat)
else:
if len(pat) == 0:
# For empty tuple we return null for missing values and False
# for valid values.
result = pc.if_else(pc.is_null(self._pa_array), None, False)
else:
result = pc.ends_with(self._pa_array, pattern=pat[0])
for p in pat[1:]:
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
return self._convert_bool_result(result, na=na, method_name="endswith")
def _str_isalnum(self):
result = pc.utf8_is_alnum(self._pa_array)
return self._convert_bool_result(result)
def _str_isalpha(self):
result = pc.utf8_is_alpha(self._pa_array)
return self._convert_bool_result(result)
def _str_isdecimal(self):
result = pc.utf8_is_decimal(self._pa_array)
return self._convert_bool_result(result)
def _str_isdigit(self):
result = pc.utf8_is_digit(self._pa_array)
return self._convert_bool_result(result)
def _str_islower(self):
result = pc.utf8_is_lower(self._pa_array)
return self._convert_bool_result(result)
def _str_isnumeric(self):
result = pc.utf8_is_numeric(self._pa_array)
return self._convert_bool_result(result)
def _str_isspace(self):
result = pc.utf8_is_space(self._pa_array)
return self._convert_bool_result(result)
def _str_istitle(self):
result = pc.utf8_is_title(self._pa_array)
return self._convert_bool_result(result)
def _str_isupper(self):
result = pc.utf8_is_upper(self._pa_array)
return self._convert_bool_result(result)
def _str_contains(
self,
pat,
case: bool = True,
flags: int = 0,
na: Scalar | lib.NoDefault = lib.no_default,
regex: bool = True,
):
if flags:
raise NotImplementedError(f"contains not implemented with {flags=}")
if regex:
pa_contains = pc.match_substring_regex
else:
pa_contains = pc.match_substring
result = pa_contains(self._pa_array, pat, ignore_case=not case)
return self._convert_bool_result(result, na=na, method_name="contains")
def _str_match(
self,
pat: str | re.Pattern,
case: bool = True,
flags: int = 0,
na: Scalar | lib.NoDefault = lib.no_default,
):
if isinstance(pat, re.Pattern):
# GH#61952
pat = pat.pattern
if isinstance(pat, str) and not pat.startswith("^"):
pat = f"^{pat}"
return self._str_contains(pat, case, flags, na, regex=True)
def _str_fullmatch(
self,
pat: str | re.Pattern,
case: bool = True,
flags: int = 0,
na: Scalar | lib.NoDefault = lib.no_default,
):
if isinstance(pat, re.Pattern):
# GH#61952
pat = pat.pattern
if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
pat = f"{pat}$"
return self._str_match(pat, case, flags, na)
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
if (
pa_version_under13p0
and not (start != 0 and end is not None)
and not (start == 0 and end is None)
):
# GH#59562
res_list = self._apply_elementwise(lambda val: val.find(sub, start, end))
return self._convert_int_result(pa.chunked_array(res_list))
if (start == 0 or start is None) and end is None:
result = pc.find_substring(self._pa_array, sub)
else:
if sub == "":
# GH#56792
res_list = self._apply_elementwise(
lambda val: val.find(sub, start, end)
)
return self._convert_int_result(pa.chunked_array(res_list))
if start is None:
start_offset = 0
start = 0
elif start < 0:
start_offset = pc.add(start, pc.utf8_length(self._pa_array))
start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset)
else:
start_offset = start
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
result = pc.find_substring(slices, sub)
found = pc.not_equal(result, pa.scalar(-1, type=result.type))
offset_result = pc.add(result, start_offset)
result = pc.if_else(found, offset_result, -1)
return self._convert_int_result(result)
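The mixin above is a thin wrapper over pyarrow.compute kernels; a small sketch of the underlying kernels it delegates to (assuming pyarrow >= 10.0.1 is installed, as the version guard at the top of the file requires):

import pyarrow as pa
import pyarrow.compute as pc

chunked = pa.chunked_array([["pandas", None, "arrow"]])
print(pc.utf8_length(chunked))                      # lengths: 6, null, 5
print(pc.utf8_upper(chunked))                       # "PANDAS", null, "ARROW"
print(pc.utf8_lpad(chunked, width=8, padding="*"))  # "**pandas", null, "***arrow"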


@@ -0,0 +1,544 @@
from __future__ import annotations
from functools import wraps
from typing import (
TYPE_CHECKING,
Any,
Literal,
cast,
overload,
)
import numpy as np
from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.tslibs import is_supported_dtype
from pandas._typing import (
ArrayLike,
AxisInt,
Dtype,
F,
FillnaOptions,
PositionalIndexer2D,
PositionalIndexerTuple,
ScalarIndexer,
Self,
SequenceIndexer,
Shape,
TakeIndexer,
npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
validate_insert_loc,
)
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
PeriodDtype,
)
from pandas.core.dtypes.missing import array_equivalent
from pandas.core import missing
from pandas.core.algorithms import (
take,
unique,
value_counts_internal as value_counts,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays.base import ExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.sorting import nargminmax
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
)
from pandas import Series
def ravel_compat(meth: F) -> F:
"""
Decorator to ravel a 2D array before passing it to a cython operation,
then reshape the result to our own shape.
"""
@wraps(meth)
def method(self, *args, **kwargs):
if self.ndim == 1:
return meth(self, *args, **kwargs)
flags = self._ndarray.flags
flat = self.ravel("K")
result = meth(flat, *args, **kwargs)
order = "F" if flags.f_contiguous else "C"
return result.reshape(self.shape, order=order)
return cast(F, method)
class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
"""
ExtensionArray that is backed by a single NumPy ndarray.
"""
_ndarray: np.ndarray
# scalar used to denote NA value inside our self._ndarray, e.g. -1
# for Categorical, iNaT for Period. Outside of object dtype,
# self.isna() should be exactly locations in self._ndarray with
# _internal_fill_value.
_internal_fill_value: Any
def _box_func(self, x):
"""
Wrap numpy type in our dtype.type if necessary.
"""
return x
def _validate_scalar(self, value):
# used by NDArrayBackedExtensionIndex.insert
raise AbstractMethodError(self)
# ------------------------------------------------------------------------
def view(self, dtype: Dtype | None = None) -> ArrayLike:
# We handle datetime64, datetime64tz, timedelta64, and period
# dtypes here. Everything else we pass through to the underlying
# ndarray.
if dtype is None or dtype is self.dtype:
return self._from_backing_data(self._ndarray)
if isinstance(dtype, type):
# we sometimes pass non-dtype objects, e.g. np.ndarray;
# pass those through to the underlying ndarray
return self._ndarray.view(dtype)
dtype = pandas_dtype(dtype)
arr = self._ndarray
if isinstance(dtype, PeriodDtype):
cls = dtype.construct_array_type()
return cls(arr.view("i8"), dtype=dtype)
elif isinstance(dtype, DatetimeTZDtype):
dt_cls = dtype.construct_array_type()
dt64_values = arr.view(f"M8[{dtype.unit}]")
return dt_cls._simple_new(dt64_values, dtype=dtype)
elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
from pandas.core.arrays import DatetimeArray
dt64_values = arr.view(dtype)
return DatetimeArray._simple_new(dt64_values, dtype=dtype)
elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
from pandas.core.arrays import TimedeltaArray
td64_values = arr.view(dtype)
return TimedeltaArray._simple_new(td64_values, dtype=dtype)
# error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
# type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
# type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
# Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
return arr.view(dtype=dtype) # type: ignore[arg-type]
def take(
self,
indices: TakeIndexer,
*,
allow_fill: bool = False,
fill_value: Any = None,
axis: AxisInt = 0,
) -> Self:
if allow_fill:
fill_value = self._validate_scalar(fill_value)
new_data = take(
self._ndarray,
indices,
allow_fill=allow_fill,
fill_value=fill_value,
axis=axis,
)
return self._from_backing_data(new_data)
# ------------------------------------------------------------------------
def equals(self, other) -> bool:
if type(self) is not type(other):
return False
if self.dtype != other.dtype:
return False
return bool(array_equivalent(self._ndarray, other._ndarray, dtype_equal=True))
@classmethod
def _from_factorized(cls, values, original):
assert values.dtype == original._ndarray.dtype
return original._from_backing_data(values)
def _values_for_argsort(self) -> np.ndarray:
return self._ndarray
def _values_for_factorize(self):
return self._ndarray, self._internal_fill_value
def _hash_pandas_object(
self, *, encoding: str, hash_key: str, categorize: bool
) -> npt.NDArray[np.uint64]:
from pandas.core.util.hashing import hash_array
values = self._ndarray
return hash_array(
values, encoding=encoding, hash_key=hash_key, categorize=categorize
)
# Signature of "argmin" incompatible with supertype "ExtensionArray"
def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self._hasna:
raise NotImplementedError
return nargminmax(self, "argmin", axis=axis)
# Signature of "argmax" incompatible with supertype "ExtensionArray"
def argmax(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self._hasna:
raise NotImplementedError
return nargminmax(self, "argmax", axis=axis)
def unique(self) -> Self:
new_data = unique(self._ndarray)
return self._from_backing_data(new_data)
@classmethod
@doc(ExtensionArray._concat_same_type)
def _concat_same_type(
cls,
to_concat: Sequence[Self],
axis: AxisInt = 0,
) -> Self:
if not lib.dtypes_all_equal([x.dtype for x in to_concat]):
dtypes = {str(x.dtype) for x in to_concat}
raise ValueError("to_concat must have the same dtype", dtypes)
return super()._concat_same_type(to_concat, axis=axis)
@doc(ExtensionArray.searchsorted)
def searchsorted(
self,
value: NumpyValueArrayLike | ExtensionArray,
side: Literal["left", "right"] = "left",
sorter: NumpySorter | None = None,
) -> npt.NDArray[np.intp] | np.intp:
npvalue = self._validate_setitem_value(value)
return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)
@doc(ExtensionArray.shift)
def shift(self, periods: int = 1, fill_value=None):
# NB: shift is always along axis=0
axis = 0
fill_value = self._validate_scalar(fill_value)
new_values = shift(self._ndarray, periods, axis, fill_value)
return self._from_backing_data(new_values)
def __setitem__(self, key, value) -> None:
key = check_array_indexer(self, key)
value = self._validate_setitem_value(value)
self._ndarray[key] = value
def _validate_setitem_value(self, value):
return value
@overload
def __getitem__(self, key: ScalarIndexer) -> Any:
...
@overload
def __getitem__(
self,
key: SequenceIndexer | PositionalIndexerTuple,
) -> Self:
...
def __getitem__(
self,
key: PositionalIndexer2D,
) -> Self | Any:
if lib.is_integer(key):
# fast-path
result = self._ndarray[key]
if self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# error: Incompatible types in assignment (expression has type "ExtensionArray",
# variable has type "Union[int, slice, ndarray]")
key = extract_array(key, extract_numpy=True) # type: ignore[assignment]
key = check_array_indexer(self, key)
result = self._ndarray[key]
if lib.is_scalar(result):
return self._box_func(result)
result = self._from_backing_data(result)
return result
def _fill_mask_inplace(
self, method: str, limit: int | None, mask: npt.NDArray[np.bool_]
) -> None:
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
func(self._ndarray.T, limit=limit, mask=mask.T)
def _pad_or_backfill(
self,
*,
method: FillnaOptions,
limit: int | None = None,
limit_area: Literal["inside", "outside"] | None = None,
copy: bool = True,
) -> Self:
mask = self.isna()
if mask.any():
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T)
npvalues = npvalues.T
if copy:
new_values = self._from_backing_data(npvalues)
else:
new_values = self
else:
if copy:
new_values = self.copy()
else:
new_values = self
return new_values
@doc(ExtensionArray.fillna)
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
mask = self.isna()
# error: Argument 2 to "check_value_size" has incompatible type
# "ExtensionArray"; expected "ndarray"
value = missing.check_value_size(
value, mask, len(self) # type: ignore[arg-type]
)
if mask.any():
if method is not None:
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, mask=mask.T)
npvalues = npvalues.T
# TODO: NumpyExtensionArray didn't use to copy; need tests
# for this
new_values = self._from_backing_data(npvalues)
else:
# fill with value
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
# We validate the fill_value even if there is nothing to fill
if value is not None:
self._validate_setitem_value(value)
if not copy:
new_values = self[:]
else:
new_values = self.copy()
return new_values
# ------------------------------------------------------------------------
# Reductions
def _wrap_reduction_result(self, axis: AxisInt | None, result):
if axis is None or self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# ------------------------------------------------------------------------
# __array_function__ methods
def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
"""
Analogue to np.putmask(self, mask, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
np.putmask(self._ndarray, mask, value)
def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self:
"""
Analogue to np.where(mask, self, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
res_values = np.where(mask, self._ndarray, value)
if res_values.dtype != self._ndarray.dtype:
raise AssertionError(
# GH#56410
"Something has gone wrong, please report a bug at "
"github.com/pandas-dev/pandas/"
)
return self._from_backing_data(res_values)
# ------------------------------------------------------------------------
# Index compat methods
def insert(self, loc: int, item) -> Self:
"""
Make new ExtensionArray inserting new item at location. Follows
Python list.append semantics for negative values.
Parameters
----------
loc : int
item : object
Returns
-------
type(self)
"""
loc = validate_insert_loc(loc, len(self))
code = self._validate_scalar(item)
new_vals = np.concatenate(
(
self._ndarray[:loc],
np.asarray([code], dtype=self._ndarray.dtype),
self._ndarray[loc:],
)
)
return self._from_backing_data(new_vals)
# ------------------------------------------------------------------------
# Additional array methods
# These are not part of the EA API, but we implement them because
# pandas assumes they're there.
def value_counts(self, dropna: bool = True) -> Series:
"""
Return a Series containing counts of unique values.
Parameters
----------
dropna : bool, default True
Don't include counts of NA values.
Returns
-------
Series
"""
if self.ndim != 1:
raise NotImplementedError
from pandas import (
Index,
Series,
)
if dropna:
# error: Unsupported operand type for ~ ("ExtensionArray")
values = self[~self.isna()]._ndarray # type: ignore[operator]
else:
values = self._ndarray
result = value_counts(values, sort=False, dropna=dropna)
index_arr = self._from_backing_data(np.asarray(result.index._data))
index = Index(index_arr, name=result.index.name)
return Series(result._values, index=index, name=result.name, copy=False)
def _quantile(
self,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> Self:
# TODO: disable for Categorical if not ordered?
mask = np.asarray(self.isna())
arr = self._ndarray
fill_value = self._internal_fill_value
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
if res_values.dtype == self._ndarray.dtype:
return self._from_backing_data(res_values)
else:
# e.g. test_quantile_empty we are empty integer dtype and res_values
# has floating dtype
# TODO: technically __init__ isn't defined here.
# Should we raise NotImplementedError and handle this on NumpyEA?
return type(self)(res_values) # type: ignore[call-arg]
# ------------------------------------------------------------------------
# numpy-like methods
@classmethod
def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
"""
Analogous to np.empty(shape, dtype=dtype)
Parameters
----------
shape : tuple[int]
dtype : ExtensionDtype
"""
# The base implementation uses a naive approach to find the dtype
# for the backing ndarray
arr = cls._from_sequence([], dtype=dtype)
backing = np.empty(shape, dtype=arr._ndarray.dtype)
return arr._from_backing_data(backing)
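Categorical, DatetimeArray, TimedeltaArray and PeriodArray all derive from this base; a short sketch exercising the shared take and insert machinery through Categorical (behavior per the methods above, not a separate API):

import pandas as pd

cat = pd.Categorical(["a", "b", "a"])
# allow_fill routes fill_value through _validate_scalar; None becomes NaN
print(cat.take([0, -1, 1], allow_fill=True, fill_value=None))  # values: 'a', NaN, 'b'
# insert follows list.insert semantics and returns a new array
print(cat.insert(1, "b"))                                      # values: 'a', 'b', 'b', 'a'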


@@ -0,0 +1,207 @@
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas._libs.lib import i8max
from pandas._libs.tslibs import (
BaseOffset,
OutOfBoundsDatetime,
Timedelta,
Timestamp,
iNaT,
)
if TYPE_CHECKING:
from pandas._typing import npt
def generate_regular_range(
start: Timestamp | Timedelta | None,
end: Timestamp | Timedelta | None,
periods: int | None,
freq: BaseOffset,
unit: str = "ns",
) -> npt.NDArray[np.intp]:
"""
Generate a range of dates or timestamps with the spans between dates
described by the given `freq` DateOffset.
Parameters
----------
start : Timedelta, Timestamp or None
First point of produced date range.
end : Timedelta, Timestamp or None
Last point of produced date range.
periods : int or None
Number of periods in produced date range.
freq : Tick
Describes space between dates in produced date range.
unit : str, default "ns"
The resolution the output is meant to represent.
Returns
-------
ndarray[np.int64]
Representing the given resolution.
"""
istart = start._value if start is not None else None
iend = end._value if end is not None else None
freq.nanos # raises if non-fixed frequency
td = Timedelta(freq)
b: int
e: int
try:
td = td.as_unit(unit, round_ok=False)
except ValueError as err:
raise ValueError(
f"freq={freq} is incompatible with unit={unit}. "
"Use a lower freq or a higher unit instead."
) from err
stride = int(td._value)
if periods is None and istart is not None and iend is not None:
b = istart
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = b + (iend - b) // stride * stride + stride // 2 + 1
elif istart is not None and periods is not None:
b = istart
e = _generate_range_overflow_safe(b, periods, stride, side="start")
elif iend is not None and periods is not None:
e = iend + stride
b = _generate_range_overflow_safe(e, periods, stride, side="end")
else:
raise ValueError(
"at least 'start' or 'end' should be specified if a 'period' is given."
)
with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
# and incorrectly return an empty array if not caught.
try:
values = np.arange(b, e, stride, dtype=np.int64)
except FloatingPointError:
xdr = [b]
while xdr[-1] != e:
xdr.append(xdr[-1] + stride)
values = np.array(xdr[:-1], dtype=np.int64)
return values
def _generate_range_overflow_safe(
endpoint: int, periods: int, stride: int, side: str = "start"
) -> int:
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
as OutOfBoundsDatetime.
Parameters
----------
endpoint : int
nanosecond timestamp of the known endpoint of the desired range
periods : int
number of periods in the desired range
stride : int
nanoseconds between periods in the desired range
side : {'start', 'end'}
which end of the range `endpoint` refers to
Returns
-------
other_end : int
Raises
------
OutOfBoundsDatetime
"""
# GH#14187 raise instead of incorrectly wrapping around
assert side in ["start", "end"]
i64max = np.uint64(i8max)
msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
with np.errstate(over="raise"):
# if periods * strides cannot be multiplied within the *uint64* bounds,
# we cannot salvage the operation by recursing, so raise
try:
addend = np.uint64(periods) * np.uint64(np.abs(stride))
except FloatingPointError as err:
raise OutOfBoundsDatetime(msg) from err
if np.abs(addend) <= i64max:
# relatively easy case without casting concerns
return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
elif (endpoint > 0 and side == "start" and stride > 0) or (
endpoint < 0 < stride and side == "end"
):
# no chance of not-overflowing
raise OutOfBoundsDatetime(msg)
elif side == "end" and endpoint - stride <= i64max < endpoint:
# in _generate_regular_range we added `stride` thereby overflowing
# the bounds. Adjust to fix this.
return _generate_range_overflow_safe(
endpoint - stride, periods - 1, stride, side
)
# split into smaller pieces
mid_periods = periods // 2
remaining = periods - mid_periods
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side))
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
def _generate_range_overflow_safe_signed(
endpoint: int, periods: int, stride: int, side: str
) -> int:
"""
A special case for _generate_range_overflow_safe where `periods * stride`
can be calculated without overflowing int64 bounds.
"""
assert side in ["start", "end"]
if side == "end":
stride *= -1
with np.errstate(over="raise"):
addend = np.int64(periods) * np.int64(stride)
try:
# easy case with no overflows
result = np.int64(endpoint) + addend
if result == iNaT:
# Putting this into a DatetimeArray/TimedeltaArray
# would incorrectly be interpreted as NaT
raise OverflowError
return int(result)
except (FloatingPointError, OverflowError):
# with endpoint negative and addend positive we risk
# FloatingPointError; with reversed signs we risk OverflowError
pass
# if stride and endpoint had opposite signs, then endpoint + addend
# should never overflow. so they must have the same signs
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
if stride > 0:
# watch out for very special case in which we just slightly
# exceed implementation bounds, but when passing the result to
# np.arange will get a result slightly within the bounds
uresult = np.uint64(endpoint) + np.uint64(addend)
i64max = np.uint64(i8max)
assert uresult > i64max
if uresult <= i64max + np.uint64(stride):
return int(uresult)
raise OutOfBoundsDatetime(
f"Cannot generate range with {side}={endpoint} and periods={periods}"
)
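A worked toy example of the endpoint arithmetic in generate_regular_range, with plain integers standing in for nanosecond values: the exclusive endpoint e = b + (iend - b) // stride * stride + stride // 2 + 1 keeps the last in-range point without letting np.arange overshoot.

import numpy as np

b, iend, stride = 0, 10, 3
e = b + (iend - b) // stride * stride + stride // 2 + 1   # 9 + 1 + 1 = 11
print(np.arange(b, e, stride, dtype=np.int64))            # [0 3 6 9]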


@@ -0,0 +1,63 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._libs import lib
from pandas.errors import LossySetitemError
from pandas.core.dtypes.cast import np_can_hold_element
from pandas.core.dtypes.common import is_numeric_dtype
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
npt,
)
def to_numpy_dtype_inference(
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
) -> tuple[npt.DTypeLike, Any]:
if dtype is None and is_numeric_dtype(arr.dtype):
dtype_given = False
if hasna:
if arr.dtype.kind == "b":
dtype = np.dtype(np.object_)
else:
if arr.dtype.kind in "iu":
dtype = np.dtype(np.float64)
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
na_value = np.nan
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
elif dtype is not None:
dtype = np.dtype(dtype)
dtype_given = True
else:
dtype_given = True
if na_value is lib.no_default:
if dtype is None or not hasna:
na_value = arr.dtype.na_value
elif dtype.kind == "f": # type: ignore[union-attr]
na_value = np.nan
elif dtype.kind == "M": # type: ignore[union-attr]
na_value = np.datetime64("nat")
elif dtype.kind == "m": # type: ignore[union-attr]
na_value = np.timedelta64("nat")
else:
na_value = arr.dtype.na_value
if not dtype_given and hasna:
try:
np_can_hold_element(dtype, na_value) # type: ignore[arg-type]
except LossySetitemError:
dtype = np.dtype(np.object_)
return dtype, na_value
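A sketch of the observable effect of to_numpy_dtype_inference when converting a nullable integer array: per the branches above, with missing values and no explicit dtype, integer kinds are widened to float64 and the default na_value becomes np.nan; without missing values the original integer dtype is kept.

import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")
print(arr.to_numpy())                                       # [ 1.  2. nan]
print(pd.array([1, 2, 3], dtype="Int64").to_numpy().dtype)  # int64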


@@ -0,0 +1,7 @@
from pandas.core.arrays.arrow.accessors import (
ListAccessor,
StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray
__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]
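A short sketch of the exported ArrowExtensionArray (assuming pyarrow is installed; the two accessors re-exported here are demonstrated in their own docstrings further down):

import pyarrow as pa
from pandas.core.arrays.arrow import ArrowExtensionArray

arr = ArrowExtensionArray(pa.array(["a", "b", None]))
print(arr.dtype)   # string[pyarrow]
print(len(arr))    # 3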


@@ -0,0 +1,50 @@
from __future__ import annotations
import numpy as np
import pyarrow
def pyarrow_array_to_numpy_and_mask(
arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
"""
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.
At the moment pyarrow.BooleanArray is not supported.
Parameters
----------
arr : pyarrow.Array
dtype : numpy.dtype
Returns
-------
(data, mask)
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
dtype = np.dtype(dtype)
if pyarrow.types.is_null(arr.type):
# No initialization of data is needed since everything is null
data = np.empty(len(arr), dtype=dtype)
mask = np.zeros(len(arr), dtype=bool)
return data, mask
buflist = arr.buffers()
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
# See also https://github.com/pandas-dev/pandas/issues/40896
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = buflist[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
)
mask = np.asarray(mask)
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask
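A sketch of the helper's output on a primitive Arrow array. The import path is an assumption, since the diff view omits file names; in pandas this helper lives in pandas.core.arrays._arrow_utils.

import numpy as np
import pyarrow as pa
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

arr = pa.array([1, 2, None, 4], type=pa.int64())
data, mask = pyarrow_array_to_numpy_and_mask(arr, np.dtype("int64"))
print(mask)        # [ True  True False  True]  (False marks the missing slot)
print(data[mask])  # [1 2 4]; the masked-out slot holds uninitialized bytes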


@@ -0,0 +1,473 @@
"""Accessors for arrow-backed data."""
from __future__ import annotations
from abc import (
ABCMeta,
abstractmethod,
)
from typing import (
TYPE_CHECKING,
cast,
)
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
)
from pandas.core.dtypes.common import is_list_like
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
from pandas.core.dtypes.dtypes import ArrowDtype
if TYPE_CHECKING:
from collections.abc import Iterator
from pandas import (
DataFrame,
Series,
)
class ArrowAccessor(metaclass=ABCMeta):
@abstractmethod
def __init__(self, data, validation_msg: str) -> None:
self._data = data
self._validation_msg = validation_msg
self._validate(data)
@abstractmethod
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
pass
def _validate(self, data):
dtype = data.dtype
if pa_version_under10p1 or not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
# Raise AttributeError so that inspect can handle invalid Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
@property
def _pa_array(self):
return self._data.array._pa_array
class ListAccessor(ArrowAccessor):
"""
Accessor object for list data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow list data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg="Can only use the '.list' accessor with "
"'list[pyarrow]' dtype, not {dtype}.",
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return (
pa.types.is_list(pyarrow_dtype)
or pa.types.is_fixed_size_list(pyarrow_dtype)
or pa.types.is_large_list(pyarrow_dtype)
)
def len(self) -> Series:
"""
Return the length of each list in the Series.
Returns
-------
pandas.Series
The length of each list.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.len()
0 3
1 1
dtype: int32[pyarrow]
"""
from pandas import Series
value_lengths = pc.list_value_length(self._pa_array)
return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
def __getitem__(self, key: int | slice) -> Series:
"""
Index or slice lists in the Series.
Parameters
----------
key : int | slice
Index or slice of indices to access from each list.
Returns
-------
pandas.Series
The list at requested index.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list[0]
0 1
1 3
dtype: int64[pyarrow]
"""
from pandas import Series
if isinstance(key, int):
# TODO: Support negative key but pyarrow does not allow
# element index to be an array.
# if key < 0:
# key = pc.add(key, pc.list_value_length(self._pa_array))
element = pc.list_element(self._pa_array, key)
return Series(element, dtype=ArrowDtype(element.type))
elif isinstance(key, slice):
if pa_version_under11p0:
raise NotImplementedError(
f"List slice not supported by pyarrow {pa.__version__}."
)
# TODO: Support negative start/stop/step, ideally this would be added
# upstream in pyarrow.
start, stop, step = key.start, key.stop, key.step
if start is None:
# TODO: When adding negative step support
# this should be set to the last element of the array
# when step is negative.
start = 0
if step is None:
step = 1
sliced = pc.list_slice(self._pa_array, start, stop, step)
return Series(sliced, dtype=ArrowDtype(sliced.type))
else:
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
def __iter__(self) -> Iterator:
raise TypeError(f"'{type(self).__name__}' object is not iterable")
def flatten(self) -> Series:
"""
Flatten list values.
Returns
-------
pandas.Series
The data from all lists in the series flattened.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.flatten()
0 1
1 2
2 3
3 3
dtype: int64[pyarrow]
"""
from pandas import Series
flattened = pc.list_flatten(self._pa_array)
return Series(flattened, dtype=ArrowDtype(flattened.type))
class StructAccessor(ArrowAccessor):
"""
Accessor object for structured data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow struct data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg=(
"Can only use the '.struct' accessor with 'struct[pyarrow]' "
"dtype, not {dtype}."
),
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return pa.types.is_struct(pyarrow_dtype)
@property
def dtypes(self) -> Series:
"""
Return the dtype object of each child field of the struct.
Returns
-------
pandas.Series
The data type of each child field.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.dtypes
version     int64[pyarrow]
project    string[pyarrow]
dtype: object
"""
from pandas import (
Index,
Series,
)
pa_type = self._data.dtype.pyarrow_dtype
types = [ArrowDtype(struct.type) for struct in pa_type]
names = [struct.name for struct in pa_type]
return Series(types, index=Index(names))
def field(
self,
name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
) -> Series:
"""
Extract a child field of a struct as a Series.
Parameters
----------
name_or_index : str | bytes | int | expression | list
Name or index of the child field to extract.
For list-like inputs, this will index into a nested
struct.
Returns
-------
pandas.Series
The data corresponding to the selected child field.
See Also
--------
Series.struct.explode : Return all child fields as a DataFrame.
Notes
-----
The name of the resulting Series will be set using the following
rules:
- For string, bytes, or integer `name_or_index` (or a list of these, for
a nested selection), the Series name is set to the selected
field's name.
- For a :class:`pyarrow.compute.Expression`, this is set to
the string form of the expression.
- For list-like `name_or_index`, the name will be set to the
name of the final field selected.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
Extract by field name.
>>> s.struct.field("project")
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
Extract by field index.
>>> s.struct.field(0)
0 1
1 2
2 1
Name: version, dtype: int64[pyarrow]
Or an expression
>>> import pyarrow.compute as pc
>>> s.struct.field(pc.field("project"))
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
For nested struct types, you can pass a list of values to index
multiple levels:
>>> version_type = pa.struct([
... ("major", pa.int64()),
... ("minor", pa.int64()),
... ])
>>> s = pd.Series(
... [
... {"version": {"major": 1, "minor": 5}, "project": "pandas"},
... {"version": {"major": 2, "minor": 1}, "project": "pandas"},
... {"version": {"major": 1, "minor": 26}, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", version_type), ("project", pa.string())]
... ))
... )
>>> s.struct.field(["version", "minor"])
0 5
1 1
2 26
Name: minor, dtype: int64[pyarrow]
>>> s.struct.field([0, 0])
0 1
1 2
2 1
Name: major, dtype: int64[pyarrow]
"""
from pandas import Series
def get_name(
level_name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
data: pa.ChunkedArray,
):
if isinstance(level_name_or_index, int):
name = data.type.field(level_name_or_index).name
elif isinstance(level_name_or_index, (str, bytes)):
name = level_name_or_index
elif isinstance(level_name_or_index, pc.Expression):
name = str(level_name_or_index)
elif is_list_like(level_name_or_index):
# For nested input like [2, 1, 2]
# iteratively get the struct and field name. The last
# one is used for the name of the index.
level_name_or_index = list(reversed(level_name_or_index))
selected = data
while level_name_or_index:
# we need the cast, otherwise mypy complains about
# getting ints, bytes, or str here, which isn't possible.
level_name_or_index = cast(list, level_name_or_index)
name_or_index = level_name_or_index.pop()
name = get_name(name_or_index, selected)
selected = selected.type.field(selected.type.get_field_index(name))
name = selected.name
else:
raise ValueError(
"name_or_index must be an int, str, bytes, "
"pyarrow.compute.Expression, or list of those"
)
return name
pa_arr = self._data.array._pa_array
name = get_name(name_or_index, pa_arr)
field_arr = pc.struct_field(pa_arr, name_or_index)
return Series(
field_arr,
dtype=ArrowDtype(field_arr.type),
index=self._data.index,
name=name,
)
def explode(self) -> DataFrame:
"""
Extract all child fields of a struct as a DataFrame.
Returns
-------
pandas.DataFrame
The data corresponding to all child fields.
See Also
--------
Series.struct.field : Return a single child field as a Series.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.explode()
   version project
0        1  pandas
1        2  pandas
2        1   numpy
"""
from pandas import concat
pa_type = self._pa_array.type
return concat(
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
)

File diff suppressed because it is too large


@@ -0,0 +1,174 @@
from __future__ import annotations
import json
from typing import TYPE_CHECKING
import pyarrow
from pandas.compat import pa_version_under14p1
from pandas.core.dtypes.dtypes import (
IntervalDtype,
PeriodDtype,
)
from pandas.core.arrays.interval import VALID_CLOSED
if TYPE_CHECKING:
from pandas._typing import IntervalClosedType
class ArrowPeriodType(pyarrow.ExtensionType):
def __init__(self, freq) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
self._freq = freq
pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")
@property
def freq(self):
return self._freq
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"freq": self.freq}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
metadata = json.loads(serialized.decode())
return ArrowPeriodType(metadata["freq"])
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return type(self) == type(other) and self.freq == other.freq
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), self.freq))
def to_pandas_dtype(self) -> PeriodDtype:
return PeriodDtype(freq=self.freq)
# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)
class ArrowIntervalType(pyarrow.ExtensionType):
def __init__(self, subtype, closed: IntervalClosedType) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
assert closed in VALID_CLOSED
self._closed: IntervalClosedType = closed
if not isinstance(subtype, pyarrow.DataType):
subtype = pyarrow.type_for_alias(str(subtype))
self._subtype = subtype
storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
@property
def subtype(self):
return self._subtype
@property
def closed(self) -> IntervalClosedType:
return self._closed
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"subtype": str(self.subtype), "closed": self.closed}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
metadata = json.loads(serialized.decode())
subtype = pyarrow.type_for_alias(metadata["subtype"])
closed = metadata["closed"]
return ArrowIntervalType(subtype, closed)
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return (
type(self) == type(other)
and self.subtype == other.subtype
and self.closed == other.closed
)
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), str(self.subtype), self.closed))
def to_pandas_dtype(self) -> IntervalDtype:
return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
"""
def patch_pyarrow():
# starting from pyarrow 14.0.1, it has its own mechanism
if not pa_version_under14p1:
return
# if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
if getattr(pyarrow, "_hotfix_installed", False):
return
class ForbiddenExtensionType(pyarrow.ExtensionType):
def __arrow_ext_serialize__(self):
return b""
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
import io
import pickletools
out = io.StringIO()
pickletools.dis(serialized, out)
raise RuntimeError(
_ERROR_MSG.format(
storage_type=storage_type,
serialized=serialized,
pickle_disassembly=out.getvalue(),
)
)
pyarrow.unregister_extension_type("arrow.py_extension_type")
pyarrow.register_extension_type(
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
)
pyarrow._hotfix_installed = True
patch_pyarrow()
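A sketch of the Arrow extension types defined above. The import path pandas.core.arrays.arrow.extension_types is an assumption based on where pandas keeps these types; the diff view does not show file names.

import pyarrow as pa
from pandas.core.arrays.arrow.extension_types import ArrowIntervalType, ArrowPeriodType

print(ArrowPeriodType("M").to_pandas_dtype())   # period[M]

itype = ArrowIntervalType(pa.int64(), "left")
print(itype.storage_type)                       # struct<left: int64, right: int64>
print(itype.__arrow_ext_serialize__())          # b'{"subtype": "int64", "closed": "left"}'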

File diff suppressed because it is too large


@@ -0,0 +1,407 @@
from __future__ import annotations
import numbers
from typing import (
TYPE_CHECKING,
ClassVar,
cast,
)
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna
from pandas.core import ops
from pandas.core.array_algos import masked_accumulations
from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
)
if TYPE_CHECKING:
import pyarrow
from pandas._typing import (
Dtype,
DtypeObj,
Self,
npt,
type_t,
)
@register_extension_dtype
class BooleanDtype(BaseMaskedDtype):
"""
Extension dtype for boolean data.
.. warning::
BooleanDtype is considered experimental. The implementation and
parts of the API may change without warning.
Attributes
----------
None
Methods
-------
None
Examples
--------
>>> pd.BooleanDtype()
BooleanDtype
"""
name: ClassVar[str] = "boolean"
# https://github.com/python/mypy/issues/4125
# error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
@property
def type(self) -> type: # type: ignore[override]
return np.bool_
@property
def kind(self) -> str:
return "b"
@property
def numpy_dtype(self) -> np.dtype:
return np.dtype("bool")
@classmethod
def construct_array_type(cls) -> type_t[BooleanArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return BooleanArray
def __repr__(self) -> str:
return "BooleanDtype"
@property
def _is_boolean(self) -> bool:
return True
@property
def _is_numeric(self) -> bool:
return True
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BooleanArray:
"""
Construct BooleanArray from pyarrow Array/ChunkedArray.
"""
import pyarrow
if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type):
raise TypeError(f"Expected array of boolean type, got {array.type} instead")
if isinstance(array, pyarrow.Array):
chunks = [array]
length = len(array)
else:
# pyarrow.ChunkedArray
chunks = array.chunks
length = array.length()
if pyarrow.types.is_null(array.type):
mask = np.ones(length, dtype=bool)
# No need to init data, since all null
data = np.empty(length, dtype=bool)
return BooleanArray(data, mask)
results = []
for arr in chunks:
buflist = arr.buffers()
data = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[1]], offset=arr.offset
).to_numpy(zero_copy_only=False)
if arr.null_count != 0:
mask = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[0]], offset=arr.offset
).to_numpy(zero_copy_only=False)
mask = ~mask
else:
mask = np.zeros(len(arr), dtype=bool)
bool_arr = BooleanArray(data, mask)
results.append(bool_arr)
if not results:
return BooleanArray(
np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
)
else:
return BooleanArray._concat_same_type(results)
def coerce_to_array(
values, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""
Coerce the input values array to numpy arrays with a mask.
Parameters
----------
values : 1D list-like
mask : bool 1D array, optional
copy : bool, default False
if True, copy the input
Returns
-------
tuple of (values, mask)
"""
if isinstance(values, BooleanArray):
if mask is not None:
raise ValueError("cannot pass mask for BooleanArray input")
values, mask = values._data, values._mask
if copy:
values = values.copy()
mask = mask.copy()
return values, mask
mask_values = None
if isinstance(values, np.ndarray) and values.dtype == np.bool_:
if copy:
values = values.copy()
elif isinstance(values, np.ndarray) and values.dtype.kind in "iufcb":
mask_values = isna(values)
values_bool = np.zeros(len(values), dtype=bool)
values_bool[~mask_values] = values[~mask_values].astype(bool)
if not np.all(
values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
):
raise TypeError("Need to pass bool-like values")
values = values_bool
else:
values_object = np.asarray(values, dtype=object)
inferred_dtype = lib.infer_dtype(values_object, skipna=True)
integer_like = ("floating", "integer", "mixed-integer-float")
if inferred_dtype not in ("boolean", "empty") + integer_like:
raise TypeError("Need to pass bool-like values")
# mypy does not narrow the type of mask_values to npt.NDArray[np.bool_]
# within this branch, it assumes it can also be None
mask_values = cast("npt.NDArray[np.bool_]", isna(values_object))
values = np.zeros(len(values), dtype=bool)
values[~mask_values] = values_object[~mask_values].astype(bool)
# if the values were integer-like, validate they were actually 0/1's
if (inferred_dtype in integer_like) and not (
np.all(
values[~mask_values].astype(float)
== values_object[~mask_values].astype(float)
)
):
raise TypeError("Need to pass bool-like values")
if mask is None and mask_values is None:
mask = np.zeros(values.shape, dtype=bool)
elif mask is None:
mask = mask_values
else:
if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
if mask_values is not None:
mask = mask | mask_values
else:
if copy:
mask = mask.copy()
else:
mask = np.array(mask, dtype=bool)
if mask_values is not None:
mask = mask | mask_values
if values.shape != mask.shape:
raise ValueError("values.shape and mask.shape must match")
return values, mask
class BooleanArray(BaseMaskedArray):
"""
Array of boolean (True/False) data with missing values.
This is a pandas Extension array for boolean data, under the hood
represented by 2 numpy arrays: a boolean array with the data and
a boolean array with the mask (True indicating missing).
BooleanArray implements Kleene logic (sometimes called three-value
logic) for logical operations. See :ref:`boolean.kleene` for more.
To construct a BooleanArray from generic array-like input, use
:func:`pandas.array` specifying ``dtype="boolean"`` (see examples
below).
.. warning::
BooleanArray is considered experimental. The implementation and
parts of the API may change without warning.
Parameters
----------
values : numpy.ndarray
A 1-d boolean-dtype array with the data.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values (True
indicates missing).
copy : bool, default False
Whether to copy the `values` and `mask` arrays.
Attributes
----------
None
Methods
-------
None
Returns
-------
BooleanArray
Examples
--------
Create a BooleanArray with :func:`pandas.array`:
>>> pd.array([True, False, None], dtype="boolean")
<BooleanArray>
[True, False, <NA>]
Length: 3, dtype: boolean
"""
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False
# Fill values used for any/all
# Incompatible types in assignment (expression has type "bool", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = True # type: ignore[assignment]
_falsey_value = False # type: ignore[assignment]
_TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
_FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}
@classmethod
def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self:
result = super()._simple_new(values, mask)
result._dtype = BooleanDtype()
return result
def __init__(
self, values: np.ndarray, mask: np.ndarray, copy: bool = False
) -> None:
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
raise TypeError(
"values should be boolean numpy array. Use "
"the 'pd.array' function instead"
)
self._dtype = BooleanDtype()
super().__init__(values, mask, copy=copy)
@property
def dtype(self) -> BooleanDtype:
return self._dtype
@classmethod
def _from_sequence_of_strings(
cls,
strings: list[str],
*,
dtype: Dtype | None = None,
copy: bool = False,
true_values: list[str] | None = None,
false_values: list[str] | None = None,
) -> BooleanArray:
true_values_union = cls._TRUE_VALUES.union(true_values or [])
false_values_union = cls._FALSE_VALUES.union(false_values or [])
def map_string(s) -> bool:
if s in true_values_union:
return True
elif s in false_values_union:
return False
else:
raise ValueError(f"{s} cannot be cast to bool")
scalars = np.array(strings, dtype=object)
mask = isna(scalars)
scalars[~mask] = list(map(map_string, scalars[~mask]))
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
@classmethod
def _coerce_to_array(
cls, value, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
if dtype:
assert dtype == "boolean"
return coerce_to_array(value, copy=copy)
def _logical_method(self, other, op):
assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
other_is_scalar = lib.is_scalar(other)
mask = None
if isinstance(other, BooleanArray):
other, mask = other._data, other._mask
elif is_list_like(other):
other = np.asarray(other, dtype="bool")
if other.ndim > 1:
raise NotImplementedError("can only perform ops with 1-d structures")
other, mask = coerce_to_array(other, copy=False)
elif isinstance(other, np.bool_):
other = other.item()
if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
raise TypeError(
"'other' should be pandas.NA or a bool. "
f"Got {type(other).__name__} instead."
)
if not other_is_scalar and len(self) != len(other):
raise ValueError("Lengths must match")
if op.__name__ in {"or_", "ror_"}:
result, mask = ops.kleene_or(self._data, other, self._mask, mask)
elif op.__name__ in {"and_", "rand_"}:
result, mask = ops.kleene_and(self._data, other, self._mask, mask)
else:
# i.e. xor, rxor
result, mask = ops.kleene_xor(self._data, other, self._mask, mask)
# i.e. BooleanArray
return self._maybe_mask_result(result, mask)
def _accumulate(
self, name: str, *, skipna: bool = True, **kwargs
) -> BaseMaskedArray:
data = self._data
mask = self._mask
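# cummin/cummax keep a boolean result, so they are computed directly on the
# masked data; cumsum/cumprod produce integers and are therefore delegated to
# IntegerArray below.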
if name in ("cummin", "cummax"):
op = getattr(masked_accumulations, name)
data, mask = op(data, mask, skipna=skipna, **kwargs)
return self._simple_new(data, mask)
else:
from pandas.core.arrays import IntegerArray
return IntegerArray(data.astype(int), mask)._accumulate(
name, skipna=skipna, **kwargs
)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,173 @@
from __future__ import annotations
from typing import ClassVar
import numpy as np
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_float_dtype
from pandas.core.arrays.numeric import (
NumericArray,
NumericDtype,
)
class FloatingDtype(NumericDtype):
"""
An ExtensionDtype to hold a single size of floating dtype.
These specific implementations are subclasses of the non-public
FloatingDtype. For example we have Float32Dtype to represent float32.
The attributes name & type are set when these subclasses are created.
"""
_default_np_dtype = np.dtype(np.float64)
_checker = is_float_dtype
@classmethod
def construct_array_type(cls) -> type[FloatingArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return FloatingArray
@classmethod
def _get_dtype_mapping(cls) -> dict[np.dtype, FloatingDtype]:
return NUMPY_FLOAT_TO_DTYPE
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.
"safe" in this context means the casting is lossless.
"""
# This is really only here for compatibility with IntegerDtype
return values.astype(dtype, copy=copy)
class FloatingArray(NumericArray):
"""
Array of floating-point values with optional missing values.
.. warning::
FloatingArray is currently experimental, and its API or internal
implementation may change without warning. Especially the behaviour
regarding NaN (distinct from NA missing values) is subject to change.
We represent a FloatingArray with 2 numpy arrays:
- data: contains a numpy float array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
To construct a FloatingArray from generic array-like input, use
:func:`pandas.array` with one of the float dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d float-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Attributes
----------
None
Methods
-------
None
Returns
-------
FloatingArray
Examples
--------
Create a FloatingArray with :func:`pandas.array`:
>>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
<FloatingArray>
[0.1, <NA>, 0.3]
Length: 3, dtype: Float32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([0.1, None, 0.3], dtype="Float32")
<FloatingArray>
[0.1, <NA>, 0.3]
Length: 3, dtype: Float32
"""
_dtype_cls = FloatingDtype
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = np.nan
# Fill values used for any/all
# Incompatible types in assignment (expression has type "float", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = 1.0 # type: ignore[assignment]
_falsey_value = 0.0 # type: ignore[assignment]
_dtype_docstring = """
An ExtensionDtype for {dtype} data.
This dtype uses ``pd.NA`` as missing value indicator.
Attributes
----------
None
Methods
-------
None
Examples
--------
For Float32Dtype:
>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float32Dtype())
>>> ser.dtype
Float32Dtype()
For Float64Dtype:
>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float64Dtype())
>>> ser.dtype
Float64Dtype()
"""
# create the Dtype
@register_extension_dtype
class Float32Dtype(FloatingDtype):
type = np.float32
name: ClassVar[str] = "Float32"
__doc__ = _dtype_docstring.format(dtype="float32")
@register_extension_dtype
class Float64Dtype(FloatingDtype):
type = np.float64
name: ClassVar[str] = "Float64"
__doc__ = _dtype_docstring.format(dtype="float64")
NUMPY_FLOAT_TO_DTYPE: dict[np.dtype, FloatingDtype] = {
np.dtype(np.float32): Float32Dtype(),
np.dtype(np.float64): Float64Dtype(),
}

View File

@ -0,0 +1,272 @@
from __future__ import annotations
from typing import ClassVar
import numpy as np
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_integer_dtype
from pandas.core.arrays.numeric import (
NumericArray,
NumericDtype,
)
class IntegerDtype(NumericDtype):
"""
An ExtensionDtype to hold a single size & kind of integer dtype.
These specific implementations are subclasses of the non-public
IntegerDtype. For example, we have Int8Dtype to represent signed 8-bit integers.
The attributes name & type are set when these subclasses are created.
"""
_default_np_dtype = np.dtype(np.int64)
_checker = is_integer_dtype
@classmethod
def construct_array_type(cls) -> type[IntegerArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return IntegerArray
@classmethod
def _get_dtype_mapping(cls) -> dict[np.dtype, IntegerDtype]:
return NUMPY_INT_TO_DTYPE
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.
"safe" in this context means the casting is lossless. e.g. if 'values'
has a floating dtype, each value must be an integer.
"""
try:
return values.astype(dtype, casting="safe", copy=copy)
except TypeError as err:
casted = values.astype(dtype, copy=copy)
if (casted == values).all():
return casted
raise TypeError(
f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
) from err
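# Illustrative sketch of the behaviour above: casting np.array([1.0, 2.0]) to
# int64 succeeds because the values round-trip exactly, while np.array([1.5])
# raises "cannot safely cast non-equivalent float64 to int64".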
class IntegerArray(NumericArray):
"""
Array of integer values with optional missing values.
Uses :attr:`pandas.NA` as the missing value.
.. warning::
IntegerArray is currently experimental, and its API or internal
implementation may change without warning.
We represent an IntegerArray with 2 numpy arrays:
- data: contains a numpy integer array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
To construct an IntegerArray from generic array-like input, use
:func:`pandas.array` with one of the integer dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d integer-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Attributes
----------
None
Methods
-------
None
Returns
-------
IntegerArray
Examples
--------
Create an IntegerArray with :func:`pandas.array`.
>>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
>>> int_array
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: Int32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([1, None, 3], dtype='Int32')
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: Int32
>>> pd.array([1, None, 3], dtype='UInt16')
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: UInt16
"""
_dtype_cls = IntegerDtype
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 1
# Fill values used for any/all
# Incompatible types in assignment (expression has type "int", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = 1 # type: ignore[assignment]
_falsey_value = 0 # type: ignore[assignment]
_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.
Uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`.
Attributes
----------
None
Methods
-------
None
Examples
--------
For Int8Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int8Dtype())
>>> ser.dtype
Int8Dtype()
For Int16Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int16Dtype())
>>> ser.dtype
Int16Dtype()
For Int32Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int32Dtype())
>>> ser.dtype
Int32Dtype()
For Int64Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int64Dtype())
>>> ser.dtype
Int64Dtype()
For UInt8Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt8Dtype())
>>> ser.dtype
UInt8Dtype()
For UInt16Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt16Dtype())
>>> ser.dtype
UInt16Dtype()
For UInt32Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt32Dtype())
>>> ser.dtype
UInt32Dtype()
For UInt64Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt64Dtype())
>>> ser.dtype
UInt64Dtype()
"""
# create the Dtype
@register_extension_dtype
class Int8Dtype(IntegerDtype):
type = np.int8
name: ClassVar[str] = "Int8"
__doc__ = _dtype_docstring.format(dtype="int8")
@register_extension_dtype
class Int16Dtype(IntegerDtype):
type = np.int16
name: ClassVar[str] = "Int16"
__doc__ = _dtype_docstring.format(dtype="int16")
@register_extension_dtype
class Int32Dtype(IntegerDtype):
type = np.int32
name: ClassVar[str] = "Int32"
__doc__ = _dtype_docstring.format(dtype="int32")
@register_extension_dtype
class Int64Dtype(IntegerDtype):
type = np.int64
name: ClassVar[str] = "Int64"
__doc__ = _dtype_docstring.format(dtype="int64")
@register_extension_dtype
class UInt8Dtype(IntegerDtype):
type = np.uint8
name: ClassVar[str] = "UInt8"
__doc__ = _dtype_docstring.format(dtype="uint8")
@register_extension_dtype
class UInt16Dtype(IntegerDtype):
type = np.uint16
name: ClassVar[str] = "UInt16"
__doc__ = _dtype_docstring.format(dtype="uint16")
@register_extension_dtype
class UInt32Dtype(IntegerDtype):
type = np.uint32
name: ClassVar[str] = "UInt32"
__doc__ = _dtype_docstring.format(dtype="uint32")
@register_extension_dtype
class UInt64Dtype(IntegerDtype):
type = np.uint64
name: ClassVar[str] = "UInt64"
__doc__ = _dtype_docstring.format(dtype="uint64")
NUMPY_INT_TO_DTYPE: dict[np.dtype, IntegerDtype] = {
np.dtype(np.int8): Int8Dtype(),
np.dtype(np.int16): Int16Dtype(),
np.dtype(np.int32): Int32Dtype(),
np.dtype(np.int64): Int64Dtype(),
np.dtype(np.uint8): UInt8Dtype(),
np.dtype(np.uint16): UInt16Dtype(),
np.dtype(np.uint32): UInt32Dtype(),
np.dtype(np.uint64): UInt64Dtype(),
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,286 @@
from __future__ import annotations
import numbers
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import (
is_integer_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
)
if TYPE_CHECKING:
from collections.abc import Mapping
import pyarrow
from pandas._typing import (
Dtype,
DtypeObj,
Self,
npt,
)
class NumericDtype(BaseMaskedDtype):
_default_np_dtype: np.dtype
_checker: Callable[[Any], bool] # is_foo_dtype
def __repr__(self) -> str:
return f"{self.name}Dtype()"
@cache_readonly
def is_signed_integer(self) -> bool:
return self.kind == "i"
@cache_readonly
def is_unsigned_integer(self) -> bool:
return self.kind == "u"
@property
def _is_numeric(self) -> bool:
return True
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BaseMaskedArray:
"""
Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
"""
import pyarrow
from pandas.core.arrays.arrow._arrow_utils import (
pyarrow_array_to_numpy_and_mask,
)
array_class = self.construct_array_type()
pyarrow_type = pyarrow.from_numpy_dtype(self.type)
if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null(
array.type
):
# test_from_arrow_type_error raises for string, but allow
# through itemsize conversion GH#31896
rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
if rt_dtype.kind not in "iuf":
# Could allow "c" or potentially disallow float<->int conversion,
# but at the moment we specifically test that uint<->int works
raise TypeError(
f"Expected array of {self} type, got {array.type} instead"
)
array = array.cast(pyarrow_type)
if isinstance(array, pyarrow.ChunkedArray):
# TODO this "if" can be removed when requiring pyarrow >= 10.0, which fixed
# combine_chunks for empty arrays https://github.com/apache/arrow/pull/13757
if array.num_chunks == 0:
array = pyarrow.array([], type=array.type)
else:
array = array.combine_chunks()
data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
return array_class(data.copy(), ~mask, copy=False)
@classmethod
def _get_dtype_mapping(cls) -> Mapping[np.dtype, NumericDtype]:
raise AbstractMethodError(cls)
@classmethod
def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
"""
Convert a string representation or a numpy dtype to NumericDtype.
"""
if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
# Avoid DeprecationWarning from NumPy about np.dtype("Int64")
# https://github.com/numpy/numpy/pull/7476
dtype = dtype.lower()
if not isinstance(dtype, NumericDtype):
mapping = cls._get_dtype_mapping()
try:
dtype = mapping[np.dtype(dtype)]
except KeyError as err:
raise ValueError(f"invalid dtype specified {dtype}") from err
return dtype
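# e.g. for IntegerDtype this maps "Int64" -> np.dtype("int64") -> Int64Dtype(),
# and for FloatingDtype "Float32" -> np.dtype("float32") -> Float32Dtype();
# dtypes not in the mapping raise ValueError (sketch of the lookup above).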
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.
"safe" in this context means the casting is lossless.
"""
raise AbstractMethodError(cls)
def _coerce_to_data_and_mask(
values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype
):
checker = dtype_cls._checker
mask = None
inferred_type = None
if dtype is None and hasattr(values, "dtype"):
if checker(values.dtype):
dtype = values.dtype
if dtype is not None:
dtype = dtype_cls._standardize_dtype(dtype)
cls = dtype_cls.construct_array_type()
if isinstance(values, cls):
values, mask = values._data, values._mask
if dtype is not None:
values = values.astype(dtype.numpy_dtype, copy=False)
if copy:
values = values.copy()
mask = mask.copy()
return values, mask, dtype, inferred_type
original = values
if not copy:
values = np.asarray(values)
else:
values = np.array(values, copy=copy)
inferred_type = None
if values.dtype == object or is_string_dtype(values.dtype):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "boolean" and dtype is None:
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")
elif values.dtype.kind == "b" and checker(dtype):
if not copy:
values = np.asarray(values, dtype=default_dtype)
else:
values = np.array(values, dtype=default_dtype, copy=copy)
elif values.dtype.kind not in "iuf":
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")
if values.ndim != 1:
raise TypeError("values must be a 1D list-like")
if mask is None:
if values.dtype.kind in "iu":
# fastpath
mask = np.zeros(len(values), dtype=np.bool_)
else:
mask = libmissing.is_numeric_na(values)
else:
assert len(mask) == len(values)
if mask.ndim != 1:
raise TypeError("mask must be a 1D list-like")
# infer dtype if needed
if dtype is None:
dtype = default_dtype
else:
dtype = dtype.numpy_dtype
if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
if mask.all():
values = np.ones(values.shape, dtype=dtype)
else:
idx = np.nanargmax(values)
if int(values[idx]) != original[idx]:
# We have ints that lost precision during the cast.
inferred_type = lib.infer_dtype(original, skipna=True)
if (
inferred_type not in ["floating", "mixed-integer-float"]
and not mask.any()
):
values = np.asarray(original, dtype=dtype)
else:
values = np.asarray(original, dtype="object")
# we copy as need to coerce here
if mask.any():
values = values.copy()
values[mask] = cls._internal_fill_value
if inferred_type in ("string", "unicode"):
# casts from str are always safe since they raise
# a ValueError if the str cannot be parsed into a float
values = values.astype(dtype, copy=copy)
else:
values = dtype_cls._safe_cast(values, dtype, copy=False)
return values, mask, dtype, inferred_type
class NumericArray(BaseMaskedArray):
"""
Base class for IntegerArray and FloatingArray.
"""
_dtype_cls: type[NumericDtype]
def __init__(
self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
) -> None:
checker = self._dtype_cls._checker
if not (isinstance(values, np.ndarray) and checker(values.dtype)):
descr = (
"floating"
if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap]
else "integer"
)
raise TypeError(
f"values should be {descr} numpy array. Use "
"the 'pd.array' function instead"
)
if values.dtype == np.float16:
# If we don't raise here, then accessing self.dtype would raise
raise TypeError("FloatingArray does not support np.float16 dtype.")
super().__init__(values, mask, copy=copy)
@cache_readonly
def dtype(self) -> NumericDtype:
mapping = self._dtype_cls._get_dtype_mapping()
return mapping[self._data.dtype]
@classmethod
def _coerce_to_array(
cls, value, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
dtype_cls = cls._dtype_cls
default_dtype = dtype_cls._default_np_dtype
values, mask, _, _ = _coerce_to_data_and_mask(
value, dtype, copy, dtype_cls, default_dtype
)
return values, mask
@classmethod
def _from_sequence_of_strings(
cls, strings, *, dtype: Dtype | None = None, copy: bool = False
) -> Self:
from pandas.core.tools.numeric import to_numeric
scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
_HANDLED_TYPES = (np.ndarray, numbers.Number)

View File

@ -0,0 +1,574 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
Literal,
)
import numpy as np
from pandas._libs import lib
from pandas._libs.tslibs import is_supported_dtype
from pandas.compat.numpy import function as nv
from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.missing import isna
from pandas.core import (
arraylike,
missing,
nanops,
ops,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.strings.object_array import ObjectStringArrayMixin
if TYPE_CHECKING:
from collections.abc import Callable
from pandas._typing import (
AxisInt,
Dtype,
FillnaOptions,
InterpolateOptions,
NpDtype,
Scalar,
Self,
npt,
)
from pandas import Index
# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
class NumpyExtensionArray( # type: ignore[misc]
OpsMixin,
NDArrayBackedExtensionArray,
ObjectStringArrayMixin,
):
"""
A pandas ExtensionArray for NumPy data.
This is mostly for internal compatibility, and is not especially
useful on its own.
Parameters
----------
values : ndarray
The NumPy ndarray to wrap. Must be 1-dimensional.
copy : bool, default False
Whether to copy `values`.
Attributes
----------
None
Methods
-------
None
Examples
--------
>>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3]))
<NumpyExtensionArray>
[0, 1, 2, 3]
Length: 4, dtype: int64
"""
# If you're wondering why pd.Series(cls) doesn't put the array in an
# ExtensionBlock, search for `ABCNumpyExtensionArray`. We check for
# that _typ to ensure that users don't unnecessarily use EAs inside
# pandas internals, which turns off things like block consolidation.
_typ = "npy_extension"
__array_priority__ = 1000
_ndarray: np.ndarray
_dtype: NumpyEADtype
_internal_fill_value = np.nan
# ------------------------------------------------------------------------
# Constructors
def __init__(
self, values: np.ndarray | NumpyExtensionArray, copy: bool = False
) -> None:
if isinstance(values, type(self)):
values = values._ndarray
if not isinstance(values, np.ndarray):
raise ValueError(
f"'values' must be a NumPy array, not {type(values).__name__}"
)
if values.ndim == 0:
# Technically we support 2, but do not advertise that fact.
raise ValueError("NumpyExtensionArray must be 1-dimensional.")
if copy:
values = values.copy()
dtype = NumpyEADtype(values.dtype)
super().__init__(values, dtype)
@classmethod
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> NumpyExtensionArray:
if isinstance(dtype, NumpyEADtype):
dtype = dtype._dtype
# error: Argument "dtype" to "asarray" has incompatible type
# "Union[ExtensionDtype, str, dtype[Any], dtype[floating[_64Bit]], Type[object],
# None]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
# Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
# _DTypeDict, Tuple[Any, Any]]]"
result = np.asarray(scalars, dtype=dtype) # type: ignore[arg-type]
if (
result.ndim > 1
and not hasattr(scalars, "dtype")
and (dtype is None or dtype == object)
):
# e.g. list-of-tuples
result = construct_1d_object_array_from_listlike(scalars)
if copy and result is scalars:
result = result.copy()
return cls(result)
# ------------------------------------------------------------------------
# Data
@property
def dtype(self) -> NumpyEADtype:
return self._dtype
# ------------------------------------------------------------------------
# NumPy Array Interface
def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
if copy is not None:
# Note: branch avoids `copy=None` for NumPy 1.x support
return np.array(self._ndarray, dtype=dtype, copy=copy)
return np.asarray(self._ndarray, dtype=dtype)
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# Lightly modified version of
# https://numpy.org/doc/stable/reference/generated/numpy.lib.mixins.NDArrayOperatorsMixin.html
# The primary modification is not boxing scalar return values
# in NumpyExtensionArray, since pandas' ExtensionArrays are 1-d.
out = kwargs.get("out", ())
result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
return result
if "out" in kwargs:
# e.g. test_ufunc_unary
return arraylike.dispatch_ufunc_with_out(
self, ufunc, method, *inputs, **kwargs
)
if method == "reduce":
result = arraylike.dispatch_reduction_ufunc(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
# e.g. tests.series.test_ufunc.TestNumpyReductions
return result
# Defer to the implementation of the ufunc on unwrapped values.
inputs = tuple(
x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in inputs
)
if out:
kwargs["out"] = tuple(
x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in out
)
result = getattr(ufunc, method)(*inputs, **kwargs)
if ufunc.nout > 1:
# multiple return values; re-box array-like results
return tuple(type(self)(x) for x in result)
elif method == "at":
# no return value
return None
elif method == "reduce":
if isinstance(result, np.ndarray):
# e.g. test_np_reduce_2d
return type(self)(result)
# e.g. test_np_max_nested_tuples
return result
else:
# one return value; re-box array-like results
return type(self)(result)
# ------------------------------------------------------------------------
# Pandas ExtensionArray Interface
def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)
if dtype == self.dtype:
if copy:
return self.copy()
return self
result = astype_array(self._ndarray, dtype=dtype, copy=copy)
return result
def isna(self) -> np.ndarray:
return isna(self._ndarray)
def _validate_scalar(self, fill_value):
if fill_value is None:
# Primarily for subclasses
fill_value = self.dtype.na_value
return fill_value
def _values_for_factorize(self) -> tuple[np.ndarray, float | None]:
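# integer, unsigned and bool dtypes cannot hold NaN, so no sentinel fill value
# is needed for them; other dtypes use NaN to mark missing entries during
# factorization.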
if self.dtype.kind in "iub":
fv = None
else:
fv = np.nan
return self._ndarray, fv
# Base EA class (and all other EA classes) don't have limit_area keyword
# This can be removed here as well when the interpolate ffill/bfill method
# deprecation is enforced
def _pad_or_backfill(
self,
*,
method: FillnaOptions,
limit: int | None = None,
limit_area: Literal["inside", "outside"] | None = None,
copy: bool = True,
) -> Self:
"""
ffill or bfill along axis=0.
"""
if copy:
out_data = self._ndarray.copy()
else:
out_data = self._ndarray
meth = missing.clean_fill_method(method)
missing.pad_or_backfill_inplace(
out_data.T,
method=meth,
axis=0,
limit=limit,
limit_area=limit_area,
)
if not copy:
return self
return type(self)._simple_new(out_data, dtype=self.dtype)
def interpolate(
self,
*,
method: InterpolateOptions,
axis: int,
index: Index,
limit,
limit_direction,
limit_area,
copy: bool,
**kwargs,
) -> Self:
"""
See NDFrame.interpolate.__doc__.
"""
# NB: we return type(self) even if copy=False
if not self.dtype._is_numeric:
raise TypeError(f"Cannot interpolate with {self.dtype} dtype")
if not copy:
out_data = self._ndarray
else:
out_data = self._ndarray.copy()
# TODO: assert we have floating dtype?
missing.interpolate_2d_inplace(
out_data,
method=method,
axis=axis,
index=index,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
**kwargs,
)
if not copy:
return self
return type(self)._simple_new(out_data, dtype=self.dtype)
# ------------------------------------------------------------------------
# Reductions
def any(
self,
*,
axis: AxisInt | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_any((), {"out": out, "keepdims": keepdims})
result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def all(
self,
*,
axis: AxisInt | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_all((), {"out": out, "keepdims": keepdims})
result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def min(
self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
) -> Scalar:
nv.validate_min((), kwargs)
result = nanops.nanmin(
values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
)
return self._wrap_reduction_result(axis, result)
def max(
self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
) -> Scalar:
nv.validate_max((), kwargs)
result = nanops.nanmax(
values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
)
return self._wrap_reduction_result(axis, result)
def sum(
self,
*,
axis: AxisInt | None = None,
skipna: bool = True,
min_count: int = 0,
**kwargs,
) -> Scalar:
nv.validate_sum((), kwargs)
result = nanops.nansum(
self._ndarray, axis=axis, skipna=skipna, min_count=min_count
)
return self._wrap_reduction_result(axis, result)
def prod(
self,
*,
axis: AxisInt | None = None,
skipna: bool = True,
min_count: int = 0,
**kwargs,
) -> Scalar:
nv.validate_prod((), kwargs)
result = nanops.nanprod(
self._ndarray, axis=axis, skipna=skipna, min_count=min_count
)
return self._wrap_reduction_result(axis, result)
def mean(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims})
result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def median(
self,
*,
axis: AxisInt | None = None,
out=None,
overwrite_input: bool = False,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_median(
(), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}
)
result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def std(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
ddof: int = 1,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
)
result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def var(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
ddof: int = 1,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var"
)
result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def sem(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
ddof: int = 1,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem"
)
result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def kurt(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt"
)
result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def skew(
self,
*,
axis: AxisInt | None = None,
dtype: NpDtype | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew"
)
result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
# ------------------------------------------------------------------------
# Additional Methods
def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
copy: bool = False,
na_value: object = lib.no_default,
) -> np.ndarray:
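# If an explicit na_value is given and missing entries exist, fill them on a
# copy of the data; otherwise work on the underlying ndarray and only copy at
# the end when the caller asked for a copy and none was made above.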
mask = self.isna()
if na_value is not lib.no_default and mask.any():
result = self._ndarray.copy()
result[mask] = na_value
else:
result = self._ndarray
result = np.asarray(result, dtype=dtype)
if copy and result is self._ndarray:
result = result.copy()
return result
# ------------------------------------------------------------------------
# Ops
def __invert__(self) -> NumpyExtensionArray:
return type(self)(~self._ndarray)
def __neg__(self) -> NumpyExtensionArray:
return type(self)(-self._ndarray)
def __pos__(self) -> NumpyExtensionArray:
return type(self)(+self._ndarray)
def __abs__(self) -> NumpyExtensionArray:
return type(self)(abs(self._ndarray))
def _cmp_method(self, other, op):
if isinstance(other, NumpyExtensionArray):
other = other._ndarray
other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
pd_op = ops.get_array_op(op)
other = ensure_wrapped_if_datetimelike(other)
result = pd_op(self._ndarray, other)
if op is divmod or op is ops.rdivmod:
a, b = result
if isinstance(a, np.ndarray):
# for e.g. op vs TimedeltaArray, we may already
# have an ExtensionArray, in which case we do not wrap
return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b)
return a, b
if isinstance(result, np.ndarray):
# for e.g. multiplication vs TimedeltaArray, we may already
# have an ExtensionArray, in which case we do not wrap
return self._wrap_ndarray_result(result)
return result
_arith_method = _cmp_method
def _wrap_ndarray_result(self, result: np.ndarray):
# If we have timedelta64[ns] result, return a TimedeltaArray instead
# of a NumpyExtensionArray
if result.dtype.kind == "m" and is_supported_dtype(result.dtype):
from pandas.core.arrays import TimedeltaArray
return TimedeltaArray._simple_new(result, dtype=result.dtype)
return type(self)(result)
def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
# NEP 51: https://github.com/numpy/numpy/pull/22449
if self.dtype.kind in "SU":
return "'{}'".format
elif self.dtype == "object":
return repr
else:
return str

File diff suppressed because it is too large

View File

@ -0,0 +1,19 @@
from pandas.core.arrays.sparse.accessor import (
SparseAccessor,
SparseFrameAccessor,
)
from pandas.core.arrays.sparse.array import (
BlockIndex,
IntIndex,
SparseArray,
make_sparse_index,
)
__all__ = [
"BlockIndex",
"IntIndex",
"make_sparse_index",
"SparseAccessor",
"SparseArray",
"SparseFrameAccessor",
]

View File

@ -0,0 +1,414 @@
"""Sparse accessor"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.dtypes import SparseDtype
from pandas.core.accessor import (
PandasDelegate,
delegate_names,
)
from pandas.core.arrays.sparse.array import SparseArray
if TYPE_CHECKING:
from pandas import (
DataFrame,
Series,
)
class BaseAccessor:
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
def __init__(self, data=None) -> None:
self._parent = data
self._validate(data)
def _validate(self, data):
raise NotImplementedError
@delegate_names(
SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
"""
Accessor for sparse data of a Series, including conversion from and to other sparse matrix data types.
Examples
--------
>>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
>>> ser.sparse.density
0.6
>>> ser.sparse.sp_values
array([2, 2, 2])
"""
def _validate(self, data):
if not isinstance(data.dtype, SparseDtype):
raise AttributeError(self._validation_msg)
def _delegate_property_get(self, name: str, *args, **kwargs):
return getattr(self._parent.array, name)
def _delegate_method(self, name: str, *args, **kwargs):
if name == "from_coo":
return self.from_coo(*args, **kwargs)
elif name == "to_coo":
return self.to_coo(*args, **kwargs)
else:
raise ValueError
@classmethod
def from_coo(cls, A, dense_index: bool = False) -> Series:
"""
Create a Series with sparse values from a scipy.sparse.coo_matrix.
Parameters
----------
A : scipy.sparse.coo_matrix
dense_index : bool, default False
If False (default), the index consists of only the
coords of the non-null entries of the original coo_matrix.
If True, the index consists of the full sorted
(row, col) coordinates of the coo_matrix.
Returns
-------
s : Series
A Series with sparse values.
Examples
--------
>>> from scipy import sparse
>>> A = sparse.coo_matrix(
... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
... )
>>> A
<COOrdinate sparse matrix of dtype 'float64'
with 3 stored elements and shape (3, 4)>
>>> A.todense()
matrix([[0., 0., 1., 2.],
[3., 0., 0., 0.],
[0., 0., 0., 0.]])
>>> ss = pd.Series.sparse.from_coo(A)
>>> ss
0 2 1.0
3 2.0
1 0 3.0
dtype: Sparse[float64, nan]
"""
from pandas import Series
from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series
result = coo_to_sparse_series(A, dense_index=dense_index)
result = Series(result.array, index=result.index, copy=False)
return result
def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
"""
Create a scipy.sparse.coo_matrix from a Series with MultiIndex.
Use row_levels and column_levels to determine the row and column
coordinates respectively. row_levels and column_levels are the names
(labels) or numbers of the levels. {row_levels, column_levels} must be
a partition of the MultiIndex level names (or numbers).
Parameters
----------
row_levels : tuple/list
column_levels : tuple/list
sort_labels : bool, default False
Sort the row and column labels before forming the sparse matrix.
When `row_levels` and/or `column_levels` refer to a single level,
set to `True` for a faster execution.
Returns
-------
y : scipy.sparse.coo_matrix
rows : list (row labels)
columns : list (column labels)
Examples
--------
>>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
>>> s.index = pd.MultiIndex.from_tuples(
... [
... (1, 2, "a", 0),
... (1, 2, "a", 1),
... (1, 1, "b", 0),
... (1, 1, "b", 1),
... (2, 1, "b", 0),
... (2, 1, "b", 1)
... ],
... names=["A", "B", "C", "D"],
... )
>>> s
A B C D
1 2 a 0 3.0
1 NaN
1 b 0 1.0
1 3.0
2 1 b 0 NaN
1 NaN
dtype: float64
>>> ss = s.astype("Sparse")
>>> ss
A B C D
1 2 a 0 3.0
1 NaN
1 b 0 1.0
1 3.0
2 1 b 0 NaN
1 NaN
dtype: Sparse[float64, nan]
>>> A, rows, columns = ss.sparse.to_coo(
... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
... )
>>> A
<COOrdinate sparse matrix of dtype 'float64'
with 3 stored elements and shape (3, 4)>
>>> A.todense()
matrix([[0., 0., 1., 3.],
[3., 0., 0., 0.],
[0., 0., 0., 0.]])
>>> rows
[(1, 1), (1, 2), (2, 1)]
>>> columns
[('a', 0), ('a', 1), ('b', 0), ('b', 1)]
"""
from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo
A, rows, columns = sparse_series_to_coo(
self._parent, row_levels, column_levels, sort_labels=sort_labels
)
return A, rows, columns
def to_dense(self) -> Series:
"""
Convert a Series from sparse values to dense.
Returns
-------
Series:
A Series with the same values, stored as a dense array.
Examples
--------
>>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
>>> series
0 0
1 1
2 0
dtype: Sparse[int64, 0]
>>> series.sparse.to_dense()
0 0
1 1
2 0
dtype: int64
"""
from pandas import Series
return Series(
self._parent.array.to_dense(),
index=self._parent.index,
name=self._parent.name,
copy=False,
)
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
"""
DataFrame accessor for sparse data.
Examples
--------
>>> df = pd.DataFrame({"a": [1, 2, 0, 0],
... "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
>>> df.sparse.density
0.5
"""
def _validate(self, data):
dtypes = data.dtypes
if not all(isinstance(t, SparseDtype) for t in dtypes):
raise AttributeError(self._validation_msg)
@classmethod
def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
"""
Create a new DataFrame from a scipy sparse matrix.
Parameters
----------
data : scipy.sparse.spmatrix
Must be convertible to csc format.
index, columns : Index, optional
Row and column labels to use for the resulting DataFrame.
Defaults to a RangeIndex.
Returns
-------
DataFrame
Each column of the DataFrame is stored as a
:class:`arrays.SparseArray`.
Examples
--------
>>> import scipy.sparse
>>> mat = scipy.sparse.eye(3, dtype=float)
>>> pd.DataFrame.sparse.from_spmatrix(mat)
0 1 2
0 1.0 0 0
1 0 1.0 0
2 0 0 1.0
"""
from pandas._libs.sparse import IntIndex
from pandas import DataFrame
data = data.tocsc()
index, columns = cls._prep_index(data, index, columns)
n_rows, n_columns = data.shape
# We need to make sure indices are sorted, as we create
# IntIndex with no input validation (i.e. check_integrity=False).
# Indices may already be sorted in scipy in which case this adds
# a small overhead.
data.sort_indices()
indices = data.indices
indptr = data.indptr
array_data = data.data
dtype = SparseDtype(array_data.dtype, 0)
arrays = []
for i in range(n_columns):
sl = slice(indptr[i], indptr[i + 1])
idx = IntIndex(n_rows, indices[sl], check_integrity=False)
arr = SparseArray._simple_new(array_data[sl], idx, dtype)
arrays.append(arr)
return DataFrame._from_arrays(
arrays, columns=columns, index=index, verify_integrity=False
)
def to_dense(self) -> DataFrame:
"""
Convert a DataFrame with sparse values to dense.
Returns
-------
DataFrame
A DataFrame with the same values stored as dense arrays.
Examples
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
>>> df.sparse.to_dense()
A
0 0
1 1
2 0
"""
from pandas import DataFrame
data = {k: v.array.to_dense() for k, v in self._parent.items()}
return DataFrame(data, index=self._parent.index, columns=self._parent.columns)
def to_coo(self):
"""
Return the contents of the frame as a sparse SciPy COO matrix.
Returns
-------
scipy.sparse.spmatrix
If the caller is heterogeneous and contains booleans or objects,
the result will be of dtype=object. See Notes.
Notes
-----
The dtype will be the lowest-common-denominator type (implicit
upcasting); that is to say if the dtypes (even of numeric types)
are mixed, the one that accommodates all will be chosen.
e.g. If the dtypes are float16 and float32, dtype will be upcast to
float32. By numpy.find_common_type convention, mixing int64 and
uint64 will result in a float64 dtype.
Examples
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
>>> df.sparse.to_coo()
<COOrdinate sparse matrix of dtype 'int64'
with 2 stored elements and shape (4, 1)>
"""
import_optional_dependency("scipy")
from scipy.sparse import coo_matrix
dtype = find_common_type(self._parent.dtypes.to_list())
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype
cols, rows, data = [], [], []
for col, (_, ser) in enumerate(self._parent.items()):
sp_arr = ser.array
if sp_arr.fill_value != 0:
raise ValueError("fill value must be 0 when converting to COO matrix")
row = sp_arr.sp_index.indices
cols.append(np.repeat(col, len(row)))
rows.append(row)
data.append(sp_arr.sp_values.astype(dtype, copy=False))
cols = np.concatenate(cols)
rows = np.concatenate(rows)
data = np.concatenate(data)
return coo_matrix((data, (rows, cols)), shape=self._parent.shape)
@property
def density(self) -> float:
"""
Ratio of non-sparse points to total (dense) data points.
Examples
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
>>> df.sparse.density
0.5
"""
tmp = np.mean([column.array.density for _, column in self._parent.items()])
return tmp
@staticmethod
def _prep_index(data, index, columns):
from pandas.core.indexes.api import (
default_index,
ensure_index,
)
N, K = data.shape
if index is None:
index = default_index(N)
else:
index = ensure_index(index)
if columns is None:
columns = default_index(K)
else:
columns = ensure_index(columns)
if len(columns) != K:
raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
if len(index) != N:
raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
return index, columns

File diff suppressed because it is too large

View File

@ -0,0 +1,207 @@
"""
Interaction with scipy.sparse matrices.
Currently only includes to_coo helpers.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from pandas._libs import lib
from pandas.core.dtypes.missing import notna
from pandas.core.algorithms import factorize
from pandas.core.indexes.api import MultiIndex
from pandas.core.series import Series
if TYPE_CHECKING:
from collections.abc import Iterable
import numpy as np
import scipy.sparse
from pandas._typing import (
IndexLabel,
npt,
)
def _check_is_partition(parts: Iterable, whole: Iterable):
whole = set(whole)
parts = [set(x) for x in parts]
if set.intersection(*parts) != set():
raise ValueError("Is not a partition because intersection is not null.")
if set.union(*parts) != whole:
raise ValueError("Is not a partition because union is not the whole.")
def _levels_to_axis(
ss,
levels: tuple[int] | list[int],
valid_ilocs: npt.NDArray[np.intp],
sort_labels: bool = False,
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
"""
For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
where `ax_coords` are the coordinates along one of the two axes of the
destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
which correspond to these coordinates.
Parameters
----------
ss : Series
levels : tuple/list
valid_ilocs : numpy.ndarray
Array of integer positions of valid values for the sparse matrix in ss.
sort_labels : bool, default False
Sort the axis labels before forming the sparse matrix. When `levels`
refers to a single level, set to True for a faster execution.
Returns
-------
ax_coords : numpy.ndarray (axis coordinates)
ax_labels : list (axis labels)
"""
# Since the labels are sorted in `Index.levels`, when we wish to sort and
# there is only one level of the MultiIndex for this axis, the desired
# output can be obtained in the following simpler, more efficient way.
if sort_labels and len(levels) == 1:
ax_coords = ss.index.codes[levels[0]][valid_ilocs]
ax_labels = ss.index.levels[levels[0]]
else:
levels_values = lib.fast_zip(
[ss.index.get_level_values(lvl).to_numpy() for lvl in levels]
)
codes, ax_labels = factorize(levels_values, sort=sort_labels)
ax_coords = codes[valid_ilocs]
ax_labels = ax_labels.tolist()
return ax_coords, ax_labels
def _to_ijv(
ss,
row_levels: tuple[int] | list[int] = (0,),
column_levels: tuple[int] | list[int] = (1,),
sort_labels: bool = False,
) -> tuple[
np.ndarray,
npt.NDArray[np.intp],
npt.NDArray[np.intp],
list[IndexLabel],
list[IndexLabel],
]:
"""
For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
constructor, and ilabels and jlabels are the row and column labels
respectively.
Parameters
----------
ss : Series
row_levels : tuple/list
column_levels : tuple/list
sort_labels : bool, default False
Sort the row and column labels before forming the sparse matrix.
When `row_levels` and/or `column_levels` refer to a single level,
set to `True` for a faster execution.
Returns
-------
values : numpy.ndarray
Valid values to populate a sparse matrix, extracted from
ss.
i_coords : numpy.ndarray (row coordinates of the values)
j_coords : numpy.ndarray (column coordinates of the values)
i_labels : list (row labels)
j_labels : list (column labels)
"""
# index and column levels must be a partition of the index
_check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
# From the sparse Series, get the integer indices and data for valid sparse
# entries.
sp_vals = ss.array.sp_values
na_mask = notna(sp_vals)
values = sp_vals[na_mask]
valid_ilocs = ss.array.sp_index.indices[na_mask]
i_coords, i_labels = _levels_to_axis(
ss, row_levels, valid_ilocs, sort_labels=sort_labels
)
j_coords, j_labels = _levels_to_axis(
ss, column_levels, valid_ilocs, sort_labels=sort_labels
)
return values, i_coords, j_coords, i_labels, j_labels
def sparse_series_to_coo(
ss: Series,
row_levels: Iterable[int] = (0,),
column_levels: Iterable[int] = (1,),
sort_labels: bool = False,
) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
"""
Convert a sparse Series to a scipy.sparse.coo_matrix using index
levels row_levels, column_levels as the row and column
labels respectively. Returns the sparse_matrix, row and column labels.
"""
import scipy.sparse
if ss.index.nlevels < 2:
raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
if not ss.index.is_unique:
raise ValueError(
"Duplicate index entries are not allowed in to_coo transformation."
)
# to keep things simple, only rely on integer indexing (not labels)
row_levels = [ss.index._get_level_number(x) for x in row_levels]
column_levels = [ss.index._get_level_number(x) for x in column_levels]
v, i, j, rows, columns = _to_ijv(
ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
)
sparse_matrix = scipy.sparse.coo_matrix(
(v, (i, j)), shape=(len(rows), len(columns))
)
return sparse_matrix, rows, columns
def coo_to_sparse_series(
A: scipy.sparse.coo_matrix, dense_index: bool = False
) -> Series:
"""
Convert a scipy.sparse.coo_matrix to a Series with type sparse.
Parameters
----------
A : scipy.sparse.coo_matrix
dense_index : bool, default False
Returns
-------
Series
Raises
------
TypeError if A is not a coo_matrix
"""
from pandas import SparseDtype
try:
ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)), copy=False)
except AttributeError as err:
raise TypeError(
f"Expected coo_matrix. Got {type(A).__name__} instead."
) from err
ser = ser.sort_index()
ser = ser.astype(SparseDtype(ser.dtype))
if dense_index:
ind = MultiIndex.from_product([A.row, A.col])
ser = ser.reindex(ind)
return ser

File diff suppressed because it is too large

View File

@ -0,0 +1,495 @@
from __future__ import annotations
import operator
import re
from typing import (
TYPE_CHECKING,
Callable,
Union,
)
import warnings
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.compat import (
pa_version_under10p1,
pa_version_under13p0,
pa_version_under16p0,
)
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import (
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import Float64Dtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.numeric import NumericDtype
from pandas.core.arrays.string_ import (
BaseStringArray,
StringDtype,
)
from pandas.core.strings.object_array import ObjectStringArrayMixin
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
ArrayLike,
Dtype,
Self,
npt,
)
from pandas import Series
ArrowStringScalarOrNAT = Union[str, libmissing.NAType]
def _chk_pyarrow_available() -> None:
if pa_version_under10p1:
msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
raise ImportError(msg)
def _is_string_view(typ):
return not pa_version_under16p0 and pa.types.is_string_view(typ)
# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
# ObjectStringArrayMixin because we want to have the object-dtype based methods as
# fallback for the ones that pyarrow doesn't yet support
class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringArray):
"""
Extension array for string data in a ``pyarrow.ChunkedArray``.
.. warning::
ArrowStringArray is considered experimental. The implementation and
parts of the API may change without warning.
Parameters
----------
values : pyarrow.Array or pyarrow.ChunkedArray
The array of data.
Attributes
----------
None
Methods
-------
None
See Also
--------
:func:`pandas.array`
The recommended function for creating an ArrowStringArray.
Series.str
The string methods are available on Series backed by
an ArrowStringArray.
Notes
-----
ArrowStringArray returns a BooleanArray for comparison methods.
Examples
--------
>>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
<ArrowStringArray>
['This is', 'some text', <NA>, 'data.']
Length: 4, dtype: string
"""
# error: Incompatible types in assignment (expression has type "StringDtype",
# base class "ArrowExtensionArray" defined the type as "ArrowDtype")
_dtype: StringDtype # type: ignore[assignment]
_storage = "pyarrow"
_na_value: libmissing.NAType | float = libmissing.NA
def __init__(self, values) -> None:
_chk_pyarrow_available()
if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
pa.types.is_string(values.type)
or _is_string_view(values.type)
or (
pa.types.is_dictionary(values.type)
and (
pa.types.is_string(values.type.value_type)
or pa.types.is_large_string(values.type.value_type)
or _is_string_view(values.type.value_type)
)
)
):
values = pc.cast(values, pa.large_string())
super().__init__(values)
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
if not pa.types.is_large_string(self._pa_array.type):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of "
"large_string type"
)
@classmethod
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
pa_scalar = super()._box_pa_scalar(value, pa_type)
if pa.types.is_string(pa_scalar.type) and pa_type is None:
pa_scalar = pc.cast(pa_scalar, pa.large_string())
return pa_scalar
@classmethod
def _box_pa_array(
cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
pa_array = super()._box_pa_array(value, pa_type)
if pa.types.is_string(pa_array.type) and pa_type is None:
pa_array = pc.cast(pa_array, pa.large_string())
return pa_array
def __len__(self) -> int:
"""
Length of this array.
Returns
-------
length : int
"""
return len(self._pa_array)
@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
from pandas.core.arrays.masked import BaseMaskedArray
_chk_pyarrow_available()
if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype in ensure_string_array and
# numerical issues with Float32Dtype
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
return cls(pa.array(result, mask=na_values, type=pa.large_string()))
elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
return cls(pc.cast(scalars, pa.large_string()))
# convert non-na-likes to str
result = lib.ensure_string_array(scalars, copy=copy)
return cls(pa.array(result, type=pa.large_string(), from_pandas=True))
@classmethod
def _from_sequence_of_strings(
cls, strings, dtype: Dtype | None = None, copy: bool = False
):
return cls._from_sequence(strings, dtype=dtype, copy=copy)
@property
def dtype(self) -> StringDtype: # type: ignore[override]
"""
An instance of 'string[pyarrow]'.
"""
return self._dtype
def insert(self, loc: int, item) -> ArrowStringArray:
if self.dtype.na_value is np.nan and item is np.nan:
item = libmissing.NA
if not isinstance(item, str) and item is not libmissing.NA:
raise TypeError(
f"Invalid value '{item}' for dtype 'str'. Value should be a "
f"string or missing value, got '{type(item).__name__}' instead."
)
return super().insert(loc, item)
def _convert_bool_result(self, values, na=lib.no_default, method_name=None):
if na is not lib.no_default and not isna(na) and not isinstance(na, bool):
# GH#59561
warnings.warn(
f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)
na = bool(na)
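# For the NaN-backed string dtype return a plain numpy bool array (missing
# entries filled with ``na``, defaulting to False); for the NA-backed dtype keep
# the mask and return a nullable BooleanArray instead.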
if self.dtype.na_value is np.nan:
if na is lib.no_default or isna(na):
# NaN propagates as False
values = values.fill_null(False)
else:
values = values.fill_null(na)
return values.to_numpy()
else:
if na is not lib.no_default and not isna(
na
): # pyright: ignore [reportGeneralTypeIssues]
values = values.fill_null(na)
return BooleanDtype().__from_arrow__(values)
def _maybe_convert_setitem_value(self, value):
"""Maybe convert value to be pyarrow compatible."""
if is_scalar(value):
if isna(value):
value = None
elif not isinstance(value, str):
raise TypeError(
f"Invalid value '{value}' for dtype 'str'. Value should be a "
f"string or missing value, got '{type(value).__name__}' instead."
)
else:
value = np.array(value, dtype=object, copy=True)
value[isna(value)] = None
for v in value:
if not (v is None or isinstance(v, str)):
raise TypeError(
"Invalid value for dtype 'str'. Value should be a "
"string or missing value (or array of those)."
)
return super()._maybe_convert_setitem_value(value)
def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
value_set = [
pa_scalar.as_py()
for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
if pa_scalar.type in (pa.string(), pa.null(), pa.large_string())
]
# short-circuit to return all False array.
if not len(value_set):
return np.zeros(len(self), dtype=bool)
result = pc.is_in(
self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type)
)
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)
if dtype == self.dtype:
if copy:
return self.copy()
return self
elif isinstance(dtype, NumericDtype):
data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
return dtype.__from_arrow__(data)
elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
return self.to_numpy(dtype=dtype, na_value=np.nan)
return super().astype(dtype, copy=copy)
@property
def _data(self):
        # dask accesses ._data directly
        warnings.warn(
            f"{type(self).__name__}._data is deprecated and will be removed "
            "in a future version; use ._pa_array instead",
FutureWarning,
stacklevel=find_stack_level(),
)
return self._pa_array
# ------------------------------------------------------------------------
# String methods interface
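    # Most str accessor methods delegate directly to the pyarrow-compute
    # implementations on ArrowStringArrayMixin; the overrides below fall back
    # to the object-dtype implementation when pyarrow cannot handle the
    # arguments (e.g. regex flags or callable replacements).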
_str_isalnum = ArrowStringArrayMixin._str_isalnum
_str_isalpha = ArrowStringArrayMixin._str_isalpha
_str_isdecimal = ArrowStringArrayMixin._str_isdecimal
_str_isdigit = ArrowStringArrayMixin._str_isdigit
_str_islower = ArrowStringArrayMixin._str_islower
_str_isnumeric = ArrowStringArrayMixin._str_isnumeric
_str_isspace = ArrowStringArrayMixin._str_isspace
_str_istitle = ArrowStringArrayMixin._str_istitle
_str_isupper = ArrowStringArrayMixin._str_isupper
_str_map = BaseStringArray._str_map
_str_startswith = ArrowStringArrayMixin._str_startswith
_str_endswith = ArrowStringArrayMixin._str_endswith
_str_pad = ArrowStringArrayMixin._str_pad
_str_match = ArrowStringArrayMixin._str_match
_str_fullmatch = ArrowStringArrayMixin._str_fullmatch
_str_lower = ArrowStringArrayMixin._str_lower
_str_upper = ArrowStringArrayMixin._str_upper
_str_strip = ArrowStringArrayMixin._str_strip
_str_lstrip = ArrowStringArrayMixin._str_lstrip
_str_rstrip = ArrowStringArrayMixin._str_rstrip
_str_removesuffix = ArrowStringArrayMixin._str_removesuffix
_str_get = ArrowStringArrayMixin._str_get
_str_capitalize = ArrowStringArrayMixin._str_capitalize
_str_title = ArrowStringArrayMixin._str_title
_str_swapcase = ArrowStringArrayMixin._str_swapcase
_str_slice_replace = ArrowStringArrayMixin._str_slice_replace
_str_len = ArrowStringArrayMixin._str_len
_str_slice = ArrowStringArrayMixin._str_slice
def _str_contains(
self,
pat,
case: bool = True,
flags: int = 0,
na=lib.no_default,
regex: bool = True,
):
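        # Regex flags are not supported by the pyarrow path, so fall back when
        # any are passed; compiled patterns are unwrapped to their pattern
        # string first.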
if flags:
return super()._str_contains(pat, case, flags, na, regex)
if isinstance(pat, re.Pattern):
pat = pat.pattern
return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)
def _str_replace(
self,
pat: str | re.Pattern,
repl: str | Callable,
n: int = -1,
case: bool = True,
flags: int = 0,
regex: bool = True,
):
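        # The pyarrow path only covers plain case-sensitive string patterns
        # with a string replacement; compiled patterns, callables, case=False
        # or regex flags fall back to the object-dtype implementation.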
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
return super()._str_replace(pat, repl, n, case, flags, regex)
return ArrowStringArrayMixin._str_replace(
self, pat, repl, n, case, flags, regex
)
def _str_repeat(self, repeats: int | Sequence[int]):
if not isinstance(repeats, int):
return super()._str_repeat(repeats)
else:
return ArrowExtensionArray._str_repeat(self, repeats=repeats)
def _str_removeprefix(self, prefix: str):
if not pa_version_under13p0:
return ArrowStringArrayMixin._str_removeprefix(self, prefix)
return super()._str_removeprefix(prefix)
def _str_count(self, pat: str, flags: int = 0):
if flags:
return super()._str_count(pat, flags)
result = pc.count_substring_regex(self._pa_array, pat)
return self._convert_int_result(result)
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
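        # Older pyarrow handles only some (start, end) combinations natively;
        # the remaining combinations fall back to the object-dtype
        # implementation (GH#59562).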
if (
pa_version_under13p0
and not (start != 0 and end is not None)
and not (start == 0 and end is None)
):
# GH#59562
return super()._str_find(sub, start, end)
return ArrowStringArrayMixin._str_find(self, sub, start, end)
def _str_get_dummies(self, sep: str = "|"):
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)
if len(labels) == 0:
return np.empty(shape=(0, 0), dtype=np.int64), labels
dummies = np.vstack(dummies_pa.to_numpy())
return dummies.astype(np.int64, copy=False), labels
def _convert_int_result(self, result):
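        # Integer results become plain numpy int64 under NaN semantics and a
        # nullable Int64 array under NA semantics.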
if self.dtype.na_value is np.nan:
if isinstance(result, pa.Array):
result = result.to_numpy(zero_copy_only=False)
else:
result = result.to_numpy()
if result.dtype == np.int32:
result = result.astype(np.int64)
return result
return Int64Dtype().__from_arrow__(result)
def _convert_rank_result(self, result):
if self.dtype.na_value is np.nan:
if isinstance(result, pa.Array):
result = result.to_numpy(zero_copy_only=False)
else:
result = result.to_numpy()
return result.astype("float64", copy=False)
return Float64Dtype().__from_arrow__(result)
def _reduce(
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
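        # Under NaN semantics, any/all treat a non-empty string as truthy;
        # with skipna=False nulls are combined via Kleene logic. min/max/sum/
        # argmin/argmax go through the shared _reduce_calc path, and all other
        # reductions raise.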
if self.dtype.na_value is np.nan and name in ["any", "all"]:
if not skipna:
nas = pc.is_null(self._pa_array)
arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
else:
arr = pc.not_equal(self._pa_array, "")
result = ArrowExtensionArray(arr)._reduce(
name, skipna=skipna, keepdims=keepdims, **kwargs
)
if keepdims:
# ArrowExtensionArray will return a length-1 bool[pyarrow] array
return result.astype(np.bool_)
return result
if name in ("min", "max", "sum", "argmin", "argmax"):
result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
else:
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
if name in ("argmin", "argmax") and isinstance(result, pa.Array):
return self._convert_int_result(result)
elif isinstance(result, pa.Array):
return type(self)(result)
else:
return result
def value_counts(self, dropna: bool = True) -> Series:
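        # Under NaN semantics return the counts backed by a plain numpy array
        # rather than a nullable Int64 array.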
result = super().value_counts(dropna=dropna)
if self.dtype.na_value is np.nan:
res_values = result._values.to_numpy()
return result._constructor(
res_values, index=result.index, name=result.name, copy=False
)
return result
def _cmp_method(self, other, op):
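        # Mixed comparisons defer to the NA-semantics operand; under NaN
        # semantics the pyarrow result is densified to numpy bool, with
        # missing values yielding True only for the != operator.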
if (
isinstance(other, (BaseStringArray, ArrowExtensionArray))
and self.dtype.na_value is not libmissing.NA
and other.dtype.na_value is libmissing.NA
):
            # NA has priority over NaN semantics
return NotImplemented
result = super()._cmp_method(other, op)
if self.dtype.na_value is np.nan:
if op == operator.ne:
return result.to_numpy(np.bool_, na_value=True)
else:
return result.to_numpy(np.bool_, na_value=False)
return result
def __pos__(self) -> Self:
raise TypeError(f"bad operand type for unary +: '{self.dtype}'")
class ArrowStringArrayNumpySemantics(ArrowStringArray):
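    # Same behaviour as ArrowStringArray, except that missing values are
    # represented by np.nan rather than pd.NA.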
    _na_value = np.nan