This commit is contained in: done
43  lib/python3.11/site-packages/pandas/core/arrays/__init__.py  Normal file
@@ -0,0 +1,43 @@
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionOpsMixin,
    ExtensionScalarOpsMixin,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.datetimes import DatetimeArray
from pandas.core.arrays.floating import FloatingArray
from pandas.core.arrays.integer import IntegerArray
from pandas.core.arrays.interval import IntervalArray
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.arrays.period import (
    PeriodArray,
    period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray

__all__ = [
    "ArrowExtensionArray",
    "ExtensionArray",
    "ExtensionOpsMixin",
    "ExtensionScalarOpsMixin",
    "ArrowStringArray",
    "BaseMaskedArray",
    "BooleanArray",
    "Categorical",
    "DatetimeArray",
    "FloatingArray",
    "IntegerArray",
    "IntervalArray",
    "NumpyExtensionArray",
    "PeriodArray",
    "period_array",
    "SparseArray",
    "StringArray",
    "TimedeltaArray",
]
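Usage sketch for the exports above (assumes a pandas 2.x install; pd.array and pd.arrays are the public entry points to these classes):

    # Construct a few of the array types re-exported by this __init__.
    import pandas as pd

    int_arr = pd.array([1, 2, None], dtype="Int64")      # IntegerArray
    cat = pd.Categorical(["a", "b", "a"])                # Categorical
    iv = pd.arrays.IntervalArray.from_breaks([0, 1, 2])  # IntervalArray

    print(type(int_arr).__name__)  # IntegerArray
    print(cat.codes)               # [0 1 0]
    print(iv.left.tolist())        # [0, 1]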
@@ -0,0 +1,362 @@
from __future__ import annotations

from functools import partial
import re
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
)

import numpy as np

from pandas._libs import lib
from pandas.compat import (
    pa_version_under10p1,
    pa_version_under11p0,
    pa_version_under13p0,
    pa_version_under17p0,
)

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc

if TYPE_CHECKING:
    from collections.abc import Callable

    from pandas._typing import (
        Scalar,
        Self,
    )


class ArrowStringArrayMixin:
    _pa_array: pa.ChunkedArray

    def __init__(self, *args, **kwargs) -> None:
        raise NotImplementedError

    def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
        # Convert a bool-dtype result to the appropriate result type
        raise NotImplementedError

    def _convert_int_result(self, result):
        # Convert an integer-dtype result to the appropriate result type
        raise NotImplementedError

    def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
        raise NotImplementedError

    def _str_len(self):
        result = pc.utf8_length(self._pa_array)
        return self._convert_int_result(result)

    def _str_lower(self) -> Self:
        return type(self)(pc.utf8_lower(self._pa_array))

    def _str_upper(self) -> Self:
        return type(self)(pc.utf8_upper(self._pa_array))

    def _str_strip(self, to_strip=None) -> Self:
        if to_strip is None:
            result = pc.utf8_trim_whitespace(self._pa_array)
        else:
            result = pc.utf8_trim(self._pa_array, characters=to_strip)
        return type(self)(result)

    def _str_lstrip(self, to_strip=None) -> Self:
        if to_strip is None:
            result = pc.utf8_ltrim_whitespace(self._pa_array)
        else:
            result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
        return type(self)(result)

    def _str_rstrip(self, to_strip=None) -> Self:
        if to_strip is None:
            result = pc.utf8_rtrim_whitespace(self._pa_array)
        else:
            result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
        return type(self)(result)

    def _str_pad(
        self,
        width: int,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        if side == "left":
            pa_pad = pc.utf8_lpad
        elif side == "right":
            pa_pad = pc.utf8_rpad
        elif side == "both":
            if pa_version_under17p0:
                # GH#59624 fall back to object dtype
                from pandas import array as pd_array

                obj_arr = self.astype(object, copy=False)  # type: ignore[attr-defined]
                obj = pd_array(obj_arr, dtype=object)
                result = obj._str_pad(width, side, fillchar)  # type: ignore[attr-defined]
                return type(self)._from_sequence(result, dtype=self.dtype)  # type: ignore[attr-defined]
            else:
                # GH#54792
                # https://github.com/apache/arrow/issues/15053#issuecomment-2317032347
                lean_left = (width % 2) == 0
                pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left)
        else:
            raise ValueError(
                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
            )
        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))

    def _str_get(self, i: int):
        lengths = pc.utf8_length(self._pa_array)
        if i >= 0:
            out_of_bounds = pc.greater_equal(i, lengths)
            start = i
            stop = i + 1
            step = 1
        else:
            out_of_bounds = pc.greater(-i, lengths)
            start = i
            stop = i - 1
            step = -1
        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
        selected = pc.utf8_slice_codeunits(
            self._pa_array, start=start, stop=stop, step=step
        )
        null_value = pa.scalar(None, type=self._pa_array.type)
        result = pc.if_else(not_out_of_bounds, selected, null_value)
        return type(self)(result)

    def _str_slice(
        self, start: int | None = None, stop: int | None = None, step: int | None = None
    ):
        if pa_version_under11p0:
            # GH#59724
            result = self._apply_elementwise(lambda val: val[start:stop:step])
            return type(self)(pa.chunked_array(result, type=self._pa_array.type))
        if start is None:
            if step is not None and step < 0:
                # GH#59710
                start = -1
            else:
                start = 0
        if step is None:
            step = 1
        return type(self)(
            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
        )

    def _str_slice_replace(
        self, start: int | None = None, stop: int | None = None, repl: str | None = None
    ):
        if repl is None:
            repl = ""
        if start is None:
            start = 0
        if stop is None:
            stop = np.iinfo(np.int64).max
        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ) -> Self:
        if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
            raise NotImplementedError(
                "replace is not supported with a re.Pattern, callable repl, "
                "case=False, or flags!=0"
            )

        func = pc.replace_substring_regex if regex else pc.replace_substring
        # https://github.com/apache/arrow/issues/39149
        # GH 56404, unexpected behavior with negative max_replacements with pyarrow.
        pa_max_replacements = None if n < 0 else n
        result = func(
            self._pa_array,
            pattern=pat,
            replacement=repl,
            max_replacements=pa_max_replacements,
        )
        return type(self)(result)

    def _str_capitalize(self) -> Self:
        return type(self)(pc.utf8_capitalize(self._pa_array))

    def _str_title(self):
        return type(self)(pc.utf8_title(self._pa_array))

    def _str_swapcase(self):
        return type(self)(pc.utf8_swapcase(self._pa_array))

    def _str_removeprefix(self, prefix: str):
        if not pa_version_under13p0:
            starts_with = pc.starts_with(self._pa_array, pattern=prefix)
            removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
            result = pc.if_else(starts_with, removed, self._pa_array)
            return type(self)(result)
        predicate = lambda val: val.removeprefix(prefix)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_removesuffix(self, suffix: str):
        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
        result = pc.if_else(ends_with, removed, self._pa_array)
        return type(self)(result)

    def _str_startswith(
        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
    ):
        if isinstance(pat, str):
            result = pc.starts_with(self._pa_array, pattern=pat)
        else:
            if len(pat) == 0:
                # For empty tuple we return null for missing values and False
                # for valid values.
                result = pc.if_else(pc.is_null(self._pa_array), None, False)
            else:
                result = pc.starts_with(self._pa_array, pattern=pat[0])

                for p in pat[1:]:
                    result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
        return self._convert_bool_result(result, na=na, method_name="startswith")

    def _str_endswith(
        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
    ):
        if isinstance(pat, str):
            result = pc.ends_with(self._pa_array, pattern=pat)
        else:
            if len(pat) == 0:
                # For empty tuple we return null for missing values and False
                # for valid values.
                result = pc.if_else(pc.is_null(self._pa_array), None, False)
            else:
                result = pc.ends_with(self._pa_array, pattern=pat[0])

                for p in pat[1:]:
                    result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
        return self._convert_bool_result(result, na=na, method_name="endswith")

    def _str_isalnum(self):
        result = pc.utf8_is_alnum(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isalpha(self):
        result = pc.utf8_is_alpha(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isdecimal(self):
        result = pc.utf8_is_decimal(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isdigit(self):
        result = pc.utf8_is_digit(self._pa_array)
        return self._convert_bool_result(result)

    def _str_islower(self):
        result = pc.utf8_is_lower(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isnumeric(self):
        result = pc.utf8_is_numeric(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isspace(self):
        result = pc.utf8_is_space(self._pa_array)
        return self._convert_bool_result(result)

    def _str_istitle(self):
        result = pc.utf8_is_title(self._pa_array)
        return self._convert_bool_result(result)

    def _str_isupper(self):
        result = pc.utf8_is_upper(self._pa_array)
        return self._convert_bool_result(result)

    def _str_contains(
        self,
        pat,
        case: bool = True,
        flags: int = 0,
        na: Scalar | lib.NoDefault = lib.no_default,
        regex: bool = True,
    ):
        if flags:
            raise NotImplementedError(f"contains not implemented with {flags=}")

        if regex:
            pa_contains = pc.match_substring_regex
        else:
            pa_contains = pc.match_substring
        result = pa_contains(self._pa_array, pat, ignore_case=not case)
        return self._convert_bool_result(result, na=na, method_name="contains")

    def _str_match(
        self,
        pat: str | re.Pattern,
        case: bool = True,
        flags: int = 0,
        na: Scalar | lib.NoDefault = lib.no_default,
    ):
        if isinstance(pat, re.Pattern):
            # GH#61952
            pat = pat.pattern
        if isinstance(pat, str) and not pat.startswith("^"):
            pat = f"^{pat}"
        return self._str_contains(pat, case, flags, na, regex=True)

    def _str_fullmatch(
        self,
        pat: str | re.Pattern,
        case: bool = True,
        flags: int = 0,
        na: Scalar | lib.NoDefault = lib.no_default,
    ):
        if isinstance(pat, re.Pattern):
            # GH#61952
            pat = pat.pattern
        if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
            pat = f"{pat}$"
        return self._str_match(pat, case, flags, na)

    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
        if (
            pa_version_under13p0
            and not (start != 0 and end is not None)
            and not (start == 0 and end is None)
        ):
            # GH#59562
            res_list = self._apply_elementwise(lambda val: val.find(sub, start, end))
            return self._convert_int_result(pa.chunked_array(res_list))

        if (start == 0 or start is None) and end is None:
            result = pc.find_substring(self._pa_array, sub)
        else:
            if sub == "":
                # GH#56792
                res_list = self._apply_elementwise(
                    lambda val: val.find(sub, start, end)
                )
                return self._convert_int_result(pa.chunked_array(res_list))
            if start is None:
                start_offset = 0
                start = 0
            elif start < 0:
                start_offset = pc.add(start, pc.utf8_length(self._pa_array))
                start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset)
            else:
                start_offset = start
            slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
            result = pc.find_substring(slices, sub)
            found = pc.not_equal(result, pa.scalar(-1, type=result.type))
            offset_result = pc.add(result, start_offset)
            result = pc.if_else(found, offset_result, -1)
        return self._convert_int_result(result)
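The mixin above is private; its _str_* methods are what Series.str dispatches to for pyarrow-backed strings. A rough sketch of the observable behavior (assumes pandas 2.x with pyarrow installed; exact return dtypes vary by version):

    import pandas as pd

    s = pd.Series(["apple", None, "banana"], dtype="string[pyarrow]")

    print(s.str.len().tolist())          # [5, <NA>, 6], routed through _str_len
    print(s.str.startswith(("a", "b")))  # tuple patterns are OR-ed, per _str_startswith
    print(s.str.pad(8, side="both").tolist())
    # centering pads via pc.utf8_center; on pyarrow < 17 it falls back to
    # object dtype as noted in _str_pad (GH#59624)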
544  lib/python3.11/site-packages/pandas/core/arrays/_mixins.py  Normal file
@@ -0,0 +1,544 @@
from __future__ import annotations

from functools import wraps
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
    cast,
    overload,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.tslibs import is_supported_dtype
from pandas._typing import (
    ArrayLike,
    AxisInt,
    Dtype,
    F,
    FillnaOptions,
    PositionalIndexer2D,
    PositionalIndexerTuple,
    ScalarIndexer,
    Self,
    SequenceIndexer,
    Shape,
    TakeIndexer,
    npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
    validate_bool_kwarg,
    validate_fillna_kwargs,
    validate_insert_loc,
)

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import (
    DatetimeTZDtype,
    ExtensionDtype,
    PeriodDtype,
)
from pandas.core.dtypes.missing import array_equivalent

from pandas.core import missing
from pandas.core.algorithms import (
    take,
    unique,
    value_counts_internal as value_counts,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays.base import ExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.sorting import nargminmax

if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        NumpySorter,
        NumpyValueArrayLike,
    )

    from pandas import Series


def ravel_compat(meth: F) -> F:
    """
    Decorator to ravel a 2D array before passing it to a cython operation,
    then reshape the result to our own shape.
    """

    @wraps(meth)
    def method(self, *args, **kwargs):
        if self.ndim == 1:
            return meth(self, *args, **kwargs)

        flags = self._ndarray.flags
        flat = self.ravel("K")
        result = meth(flat, *args, **kwargs)
        order = "F" if flags.f_contiguous else "C"
        return result.reshape(self.shape, order=order)

    return cast(F, method)


class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
    """
    ExtensionArray that is backed by a single NumPy ndarray.
    """

    _ndarray: np.ndarray

    # scalar used to denote NA value inside our self._ndarray, e.g. -1
    #  for Categorical, iNaT for Period. Outside of object dtype,
    #  self.isna() should be exactly locations in self._ndarray with
    #  _internal_fill_value.
    _internal_fill_value: Any

    def _box_func(self, x):
        """
        Wrap numpy type in our dtype.type if necessary.
        """
        return x

    def _validate_scalar(self, value):
        # used by NDArrayBackedExtensionIndex.insert
        raise AbstractMethodError(self)

    # ------------------------------------------------------------------------

    def view(self, dtype: Dtype | None = None) -> ArrayLike:
        # We handle datetime64, datetime64tz, timedelta64, and period
        #  dtypes here. Everything else we pass through to the underlying
        #  ndarray.
        if dtype is None or dtype is self.dtype:
            return self._from_backing_data(self._ndarray)

        if isinstance(dtype, type):
            # we sometimes pass non-dtype objects, e.g np.ndarray;
            #  pass those through to the underlying ndarray
            return self._ndarray.view(dtype)

        dtype = pandas_dtype(dtype)
        arr = self._ndarray

        if isinstance(dtype, PeriodDtype):
            cls = dtype.construct_array_type()
            return cls(arr.view("i8"), dtype=dtype)
        elif isinstance(dtype, DatetimeTZDtype):
            dt_cls = dtype.construct_array_type()
            dt64_values = arr.view(f"M8[{dtype.unit}]")
            return dt_cls._simple_new(dt64_values, dtype=dtype)
        elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
            from pandas.core.arrays import DatetimeArray

            dt64_values = arr.view(dtype)
            return DatetimeArray._simple_new(dt64_values, dtype=dtype)

        elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
            from pandas.core.arrays import TimedeltaArray

            td64_values = arr.view(dtype)
            return TimedeltaArray._simple_new(td64_values, dtype=dtype)

        # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
        # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
        # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
        # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
        return arr.view(dtype=dtype)  # type: ignore[arg-type]

    def take(
        self,
        indices: TakeIndexer,
        *,
        allow_fill: bool = False,
        fill_value: Any = None,
        axis: AxisInt = 0,
    ) -> Self:
        if allow_fill:
            fill_value = self._validate_scalar(fill_value)

        new_data = take(
            self._ndarray,
            indices,
            allow_fill=allow_fill,
            fill_value=fill_value,
            axis=axis,
        )
        return self._from_backing_data(new_data)

    # ------------------------------------------------------------------------

    def equals(self, other) -> bool:
        if type(self) is not type(other):
            return False
        if self.dtype != other.dtype:
            return False
        return bool(array_equivalent(self._ndarray, other._ndarray, dtype_equal=True))

    @classmethod
    def _from_factorized(cls, values, original):
        assert values.dtype == original._ndarray.dtype
        return original._from_backing_data(values)

    def _values_for_argsort(self) -> np.ndarray:
        return self._ndarray

    def _values_for_factorize(self):
        return self._ndarray, self._internal_fill_value

    def _hash_pandas_object(
        self, *, encoding: str, hash_key: str, categorize: bool
    ) -> npt.NDArray[np.uint64]:
        from pandas.core.util.hashing import hash_array

        values = self._ndarray
        return hash_array(
            values, encoding=encoding, hash_key=hash_key, categorize=categorize
        )

    # Signature of "argmin" incompatible with supertype "ExtensionArray"
    def argmin(self, axis: AxisInt = 0, skipna: bool = True):  # type: ignore[override]
        # override base class by adding axis keyword
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return nargminmax(self, "argmin", axis=axis)

    # Signature of "argmax" incompatible with supertype "ExtensionArray"
    def argmax(self, axis: AxisInt = 0, skipna: bool = True):  # type: ignore[override]
        # override base class by adding axis keyword
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return nargminmax(self, "argmax", axis=axis)

    def unique(self) -> Self:
        new_data = unique(self._ndarray)
        return self._from_backing_data(new_data)

    @classmethod
    @doc(ExtensionArray._concat_same_type)
    def _concat_same_type(
        cls,
        to_concat: Sequence[Self],
        axis: AxisInt = 0,
    ) -> Self:
        if not lib.dtypes_all_equal([x.dtype for x in to_concat]):
            dtypes = {str(x.dtype) for x in to_concat}
            raise ValueError("to_concat must have the same dtype", dtypes)

        return super()._concat_same_type(to_concat, axis=axis)

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter | None = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        npvalue = self._validate_setitem_value(value)
        return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)

    @doc(ExtensionArray.shift)
    def shift(self, periods: int = 1, fill_value=None):
        # NB: shift is always along axis=0
        axis = 0
        fill_value = self._validate_scalar(fill_value)
        new_values = shift(self._ndarray, periods, axis, fill_value)

        return self._from_backing_data(new_values)

    def __setitem__(self, key, value) -> None:
        key = check_array_indexer(self, key)
        value = self._validate_setitem_value(value)
        self._ndarray[key] = value

    def _validate_setitem_value(self, value):
        return value

    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self,
        key: SequenceIndexer | PositionalIndexerTuple,
    ) -> Self:
        ...

    def __getitem__(
        self,
        key: PositionalIndexer2D,
    ) -> Self | Any:
        if lib.is_integer(key):
            # fast-path
            result = self._ndarray[key]
            if self.ndim == 1:
                return self._box_func(result)
            return self._from_backing_data(result)

        # error: Incompatible types in assignment (expression has type "ExtensionArray",
        # variable has type "Union[int, slice, ndarray]")
        key = extract_array(key, extract_numpy=True)  # type: ignore[assignment]
        key = check_array_indexer(self, key)
        result = self._ndarray[key]
        if lib.is_scalar(result):
            return self._box_func(result)

        result = self._from_backing_data(result)
        return result

    def _fill_mask_inplace(
        self, method: str, limit: int | None, mask: npt.NDArray[np.bool_]
    ) -> None:
        # (for now) when self.ndim == 2, we assume axis=0
        func = missing.get_fill_func(method, ndim=self.ndim)
        func(self._ndarray.T, limit=limit, mask=mask.T)

    def _pad_or_backfill(
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        mask = self.isna()
        if mask.any():
            # (for now) when self.ndim == 2, we assume axis=0
            func = missing.get_fill_func(method, ndim=self.ndim)

            npvalues = self._ndarray.T
            if copy:
                npvalues = npvalues.copy()
            func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T)
            npvalues = npvalues.T

            if copy:
                new_values = self._from_backing_data(npvalues)
            else:
                new_values = self

        else:
            if copy:
                new_values = self.copy()
            else:
                new_values = self
        return new_values

    @doc(ExtensionArray.fillna)
    def fillna(
        self, value=None, method=None, limit: int | None = None, copy: bool = True
    ) -> Self:
        value, method = validate_fillna_kwargs(
            value, method, validate_scalar_dict_value=False
        )

        mask = self.isna()
        # error: Argument 2 to "check_value_size" has incompatible type
        # "ExtensionArray"; expected "ndarray"
        value = missing.check_value_size(
            value, mask, len(self)  # type: ignore[arg-type]
        )

        if mask.any():
            if method is not None:
                # (for now) when self.ndim == 2, we assume axis=0
                func = missing.get_fill_func(method, ndim=self.ndim)
                npvalues = self._ndarray.T
                if copy:
                    npvalues = npvalues.copy()
                func(npvalues, limit=limit, mask=mask.T)
                npvalues = npvalues.T

                # TODO: NumpyExtensionArray didn't used to copy, need tests
                #  for this
                new_values = self._from_backing_data(npvalues)
            else:
                # fill with value
                if copy:
                    new_values = self.copy()
                else:
                    new_values = self[:]
                new_values[mask] = value
        else:
            # We validate the fill_value even if there is nothing to fill
            if value is not None:
                self._validate_setitem_value(value)

            if not copy:
                new_values = self[:]
            else:
                new_values = self.copy()
        return new_values

    # ------------------------------------------------------------------------
    # Reductions

    def _wrap_reduction_result(self, axis: AxisInt | None, result):
        if axis is None or self.ndim == 1:
            return self._box_func(result)
        return self._from_backing_data(result)

    # ------------------------------------------------------------------------
    # __array_function__ methods

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        """
        Analogue to np.putmask(self, mask, value)

        Parameters
        ----------
        mask : np.ndarray[bool]
        value : scalar or listlike

        Raises
        ------
        TypeError
            If value cannot be cast to self.dtype.
        """
        value = self._validate_setitem_value(value)

        np.putmask(self._ndarray, mask, value)

    def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self:
        """
        Analogue to np.where(mask, self, value)

        Parameters
        ----------
        mask : np.ndarray[bool]
        value : scalar or listlike

        Raises
        ------
        TypeError
            If value cannot be cast to self.dtype.
        """
        value = self._validate_setitem_value(value)

        res_values = np.where(mask, self._ndarray, value)
        if res_values.dtype != self._ndarray.dtype:
            raise AssertionError(
                # GH#56410
                "Something has gone wrong, please report a bug at "
                "github.com/pandas-dev/pandas/"
            )
        return self._from_backing_data(res_values)

    # ------------------------------------------------------------------------
    # Index compat methods

    def insert(self, loc: int, item) -> Self:
        """
        Make new ExtensionArray inserting new item at location. Follows
        Python list.append semantics for negative values.

        Parameters
        ----------
        loc : int
        item : object

        Returns
        -------
        type(self)
        """
        loc = validate_insert_loc(loc, len(self))

        code = self._validate_scalar(item)

        new_vals = np.concatenate(
            (
                self._ndarray[:loc],
                np.asarray([code], dtype=self._ndarray.dtype),
                self._ndarray[loc:],
            )
        )
        return self._from_backing_data(new_vals)

    # ------------------------------------------------------------------------
    # Additional array methods
    #  These are not part of the EA API, but we implement them because
    #  pandas assumes they're there.

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NA values.

        Returns
        -------
        Series
        """
        if self.ndim != 1:
            raise NotImplementedError

        from pandas import (
            Index,
            Series,
        )

        if dropna:
            # error: Unsupported operand type for ~ ("ExtensionArray")
            values = self[~self.isna()]._ndarray  # type: ignore[operator]
        else:
            values = self._ndarray

        result = value_counts(values, sort=False, dropna=dropna)

        index_arr = self._from_backing_data(np.asarray(result.index._data))
        index = Index(index_arr, name=result.index.name)
        return Series(result._values, index=index, name=result.name, copy=False)

    def _quantile(
        self,
        qs: npt.NDArray[np.float64],
        interpolation: str,
    ) -> Self:
        # TODO: disable for Categorical if not ordered?

        mask = np.asarray(self.isna())
        arr = self._ndarray
        fill_value = self._internal_fill_value

        res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
        if res_values.dtype == self._ndarray.dtype:
            return self._from_backing_data(res_values)
        else:
            # e.g. test_quantile_empty we are empty integer dtype and res_values
            #  has floating dtype
            # TODO: technically __init__ isn't defined here.
            #  Should we raise NotImplementedError and handle this on NumpyEA?
            return type(self)(res_values)  # type: ignore[call-arg]

    # ------------------------------------------------------------------------
    # numpy-like methods

    @classmethod
    def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
        """
        Analogous to np.empty(shape, dtype=dtype)

        Parameters
        ----------
        shape : tuple[int]
        dtype : ExtensionDtype
        """
        # The base implementation uses a naive approach to find the dtype
        #  for the backing ndarray
        arr = cls._from_sequence([], dtype=dtype)
        backing = np.empty(shape, dtype=arr._ndarray.dtype)
        return arr._from_backing_data(backing)
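Categorical, DatetimeArray, TimedeltaArray and PeriodArray are the concrete subclasses of NDArrayBackedExtensionArray, so the methods above are easiest to exercise through them. A small sketch (pandas 2.x assumed):

    import pandas as pd

    cat = pd.Categorical(["a", "b", "a"])

    # take() with allow_fill=True routes the fill scalar through _validate_scalar
    print(cat.take([0, -1, 1], allow_fill=True, fill_value=None))

    # insert() concatenates the backing codes ndarray around loc
    print(cat.insert(1, "b"))

    # unique() operates directly on the backing ndarray of codes
    print(cat.unique())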
207  lib/python3.11/site-packages/pandas/core/arrays/_ranges.py  Normal file
@@ -0,0 +1,207 @@
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas._libs.lib import i8max
from pandas._libs.tslibs import (
    BaseOffset,
    OutOfBoundsDatetime,
    Timedelta,
    Timestamp,
    iNaT,
)

if TYPE_CHECKING:
    from pandas._typing import npt


def generate_regular_range(
    start: Timestamp | Timedelta | None,
    end: Timestamp | Timedelta | None,
    periods: int | None,
    freq: BaseOffset,
    unit: str = "ns",
) -> npt.NDArray[np.intp]:
    """
    Generate a range of dates or timestamps with the spans between dates
    described by the given `freq` DateOffset.

    Parameters
    ----------
    start : Timedelta, Timestamp or None
        First point of produced date range.
    end : Timedelta, Timestamp or None
        Last point of produced date range.
    periods : int or None
        Number of periods in produced date range.
    freq : Tick
        Describes space between dates in produced date range.
    unit : str, default "ns"
        The resolution the output is meant to represent.

    Returns
    -------
    ndarray[np.int64]
        Representing the given resolution.
    """
    istart = start._value if start is not None else None
    iend = end._value if end is not None else None
    freq.nanos  # raises if non-fixed frequency
    td = Timedelta(freq)
    b: int
    e: int
    try:
        td = td.as_unit(unit, round_ok=False)
    except ValueError as err:
        raise ValueError(
            f"freq={freq} is incompatible with unit={unit}. "
            "Use a lower freq or a higher unit instead."
        ) from err
    stride = int(td._value)

    if periods is None and istart is not None and iend is not None:
        b = istart
        # cannot just use e = Timestamp(end) + 1 because arange breaks when
        #  stride is too large, see GH10887
        e = b + (iend - b) // stride * stride + stride // 2 + 1
    elif istart is not None and periods is not None:
        b = istart
        e = _generate_range_overflow_safe(b, periods, stride, side="start")
    elif iend is not None and periods is not None:
        e = iend + stride
        b = _generate_range_overflow_safe(e, periods, stride, side="end")
    else:
        raise ValueError(
            "at least 'start' or 'end' should be specified if a 'period' is given."
        )

    with np.errstate(over="raise"):
        # If the range is sufficiently large, np.arange may overflow
        #  and incorrectly return an empty array if not caught.
        try:
            values = np.arange(b, e, stride, dtype=np.int64)
        except FloatingPointError:
            xdr = [b]
            while xdr[-1] != e:
                xdr.append(xdr[-1] + stride)
            values = np.array(xdr[:-1], dtype=np.int64)
    return values


def _generate_range_overflow_safe(
    endpoint: int, periods: int, stride: int, side: str = "start"
) -> int:
    """
    Calculate the second endpoint for passing to np.arange, checking
    to avoid an integer overflow. Catch OverflowError and re-raise
    as OutOfBoundsDatetime.

    Parameters
    ----------
    endpoint : int
        nanosecond timestamp of the known endpoint of the desired range
    periods : int
        number of periods in the desired range
    stride : int
        nanoseconds between periods in the desired range
    side : {'start', 'end'}
        which end of the range `endpoint` refers to

    Returns
    -------
    other_end : int

    Raises
    ------
    OutOfBoundsDatetime
    """
    # GH#14187 raise instead of incorrectly wrapping around
    assert side in ["start", "end"]

    i64max = np.uint64(i8max)
    msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"

    with np.errstate(over="raise"):
        # if periods * strides cannot be multiplied within the *uint64* bounds,
        #  we cannot salvage the operation by recursing, so raise
        try:
            addend = np.uint64(periods) * np.uint64(np.abs(stride))
        except FloatingPointError as err:
            raise OutOfBoundsDatetime(msg) from err

    if np.abs(addend) <= i64max:
        # relatively easy case without casting concerns
        return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)

    elif (endpoint > 0 and side == "start" and stride > 0) or (
        endpoint < 0 < stride and side == "end"
    ):
        # no chance of not-overflowing
        raise OutOfBoundsDatetime(msg)

    elif side == "end" and endpoint - stride <= i64max < endpoint:
        # in _generate_regular_range we added `stride` thereby overflowing
        #  the bounds. Adjust to fix this.
        return _generate_range_overflow_safe(
            endpoint - stride, periods - 1, stride, side
        )

    # split into smaller pieces
    mid_periods = periods // 2
    remaining = periods - mid_periods
    assert 0 < remaining < periods, (remaining, periods, endpoint, stride)

    midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side))
    return _generate_range_overflow_safe(midpoint, remaining, stride, side)


def _generate_range_overflow_safe_signed(
    endpoint: int, periods: int, stride: int, side: str
) -> int:
    """
    A special case for _generate_range_overflow_safe where `periods * stride`
    can be calculated without overflowing int64 bounds.
    """
    assert side in ["start", "end"]
    if side == "end":
        stride *= -1

    with np.errstate(over="raise"):
        addend = np.int64(periods) * np.int64(stride)
        try:
            # easy case with no overflows
            result = np.int64(endpoint) + addend
            if result == iNaT:
                # Putting this into a DatetimeArray/TimedeltaArray
                #  would incorrectly be interpreted as NaT
                raise OverflowError
            return int(result)
        except (FloatingPointError, OverflowError):
            # with endpoint negative and addend positive we risk
            #  FloatingPointError; with reversed signs we risk OverflowError
            pass

    # if stride and endpoint had opposite signs, then endpoint + addend
    #  should never overflow. so they must have the same signs
    assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)

    if stride > 0:
        # watch out for very special case in which we just slightly
        #  exceed implementation bounds, but when passing the result to
        #  np.arange will get a result slightly within the bounds

        uresult = np.uint64(endpoint) + np.uint64(addend)
        i64max = np.uint64(i8max)
        assert uresult > i64max
        if uresult <= i64max + np.uint64(stride):
            return int(uresult)

    raise OutOfBoundsDatetime(
        f"Cannot generate range with {side}={endpoint} and periods={periods}"
    )
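The endpoint arithmetic is easiest to see on a small example. A sketch calling the helper directly (it is private, so the import path may change; pandas 2.x assumed):

    import numpy as np
    from pandas import Timestamp
    from pandas.core.arrays._ranges import generate_regular_range
    from pandas.tseries.frequencies import to_offset

    # Five hourly points anchored at the start; the result holds the int64
    # epoch values, with a constant stride of one hour in nanoseconds.
    vals = generate_regular_range(
        start=Timestamp("2024-01-01"),
        end=None,
        periods=5,
        freq=to_offset("h"),  # "H" on pandas < 2.2
        unit="ns",
    )
    print(np.diff(vals))  # [3600000000000 ...], i.e. 1 hour in ns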
63  lib/python3.11/site-packages/pandas/core/arrays/_utils.py  Normal file
@@ -0,0 +1,63 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
)

import numpy as np

from pandas._libs import lib
from pandas.errors import LossySetitemError

from pandas.core.dtypes.cast import np_can_hold_element
from pandas.core.dtypes.common import is_numeric_dtype

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        npt,
    )


def to_numpy_dtype_inference(
    arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
) -> tuple[npt.DTypeLike, Any]:
    if dtype is None and is_numeric_dtype(arr.dtype):
        dtype_given = False
        if hasna:
            if arr.dtype.kind == "b":
                dtype = np.dtype(np.object_)
            else:
                if arr.dtype.kind in "iu":
                    dtype = np.dtype(np.float64)
                else:
                    dtype = arr.dtype.numpy_dtype  # type: ignore[union-attr]
                if na_value is lib.no_default:
                    na_value = np.nan
        else:
            dtype = arr.dtype.numpy_dtype  # type: ignore[union-attr]
    elif dtype is not None:
        dtype = np.dtype(dtype)
        dtype_given = True
    else:
        dtype_given = True

    if na_value is lib.no_default:
        if dtype is None or not hasna:
            na_value = arr.dtype.na_value
        elif dtype.kind == "f":  # type: ignore[union-attr]
            na_value = np.nan
        elif dtype.kind == "M":  # type: ignore[union-attr]
            na_value = np.datetime64("nat")
        elif dtype.kind == "m":  # type: ignore[union-attr]
            na_value = np.timedelta64("nat")
        else:
            na_value = arr.dtype.na_value

    if not dtype_given and hasna:
        try:
            np_can_hold_element(dtype, na_value)  # type: ignore[arg-type]
        except LossySetitemError:
            dtype = np.dtype(np.object_)
    return dtype, na_value
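This helper drives the .to_numpy() dtype inference for the masked and arrow-backed arrays (added around pandas 2.2, so older versions behave differently). A sketch of the branches above:

    import pandas as pd

    # integer kind with NAs present and no dtype given -> float64 + np.nan
    print(pd.array([1, 2, None], dtype="Int64").to_numpy())     # [ 1.  2. nan]

    # boolean kind with NAs cannot hold np.nan -> falls back to object
    print(pd.array([True, None], dtype="boolean").to_numpy())   # [True <NA>]

    # no NAs -> the exact numpy counterpart dtype is kept
    print(pd.array([1, 2], dtype="Int64").to_numpy().dtype)     # int64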
@@ -0,0 +1,7 @@
from pandas.core.arrays.arrow.accessors import (
    ListAccessor,
    StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray

__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]
@@ -0,0 +1,50 @@
from __future__ import annotations

import numpy as np
import pyarrow


def pyarrow_array_to_numpy_and_mask(
    arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
    """
    Convert a primitive pyarrow.Array to a numpy array and boolean mask based
    on the buffers of the Array.

    At the moment pyarrow.BooleanArray is not supported.

    Parameters
    ----------
    arr : pyarrow.Array
    dtype : numpy.dtype

    Returns
    -------
    (data, mask)
        Tuple of two numpy arrays with the raw data (with specified dtype) and
        a boolean mask (validity mask, so False means missing)
    """
    dtype = np.dtype(dtype)

    if pyarrow.types.is_null(arr.type):
        # No initialization of data is needed since everything is null
        data = np.empty(len(arr), dtype=dtype)
        mask = np.zeros(len(arr), dtype=bool)
        return data, mask
    buflist = arr.buffers()
    # Since Arrow buffers might contain padding and the data might be offset,
    # the buffer gets sliced here before handing it to numpy.
    # See also https://github.com/pandas-dev/pandas/issues/40896
    offset = arr.offset * dtype.itemsize
    length = len(arr) * dtype.itemsize
    data_buf = buflist[1][offset : offset + length]
    data = np.frombuffer(data_buf, dtype=dtype)
    bitmask = buflist[0]
    if bitmask is not None:
        mask = pyarrow.BooleanArray.from_buffers(
            pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
        )
        mask = np.asarray(mask)
    else:
        mask = np.ones(len(arr), dtype=bool)
    return data, mask
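Usage sketch (the module is private, so the import path below is an assumption that may change between versions):

    import numpy as np
    import pyarrow as pa
    from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask

    arr = pa.array([1, None, 3], type=pa.int64())
    data, mask = pyarrow_array_to_numpy_and_mask(arr, np.dtype("int64"))
    print(mask)  # [ True False  True] -- validity mask, False marks the null
    # data[1] is whatever bytes sit in the buffer for the null slot; callers
    # are expected to consult the mask before trusting those values.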
@@ -0,0 +1,473 @@
|
||||
"""Accessors for arrow-backed data."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import (
|
||||
ABCMeta,
|
||||
abstractmethod,
|
||||
)
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
|
||||
from pandas.compat import (
|
||||
pa_version_under10p1,
|
||||
pa_version_under11p0,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
|
||||
if not pa_version_under10p1:
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
|
||||
from pandas.core.dtypes.dtypes import ArrowDtype
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
|
||||
|
||||
class ArrowAccessor(metaclass=ABCMeta):
|
||||
@abstractmethod
|
||||
def __init__(self, data, validation_msg: str) -> None:
|
||||
self._data = data
|
||||
self._validation_msg = validation_msg
|
||||
self._validate(data)
|
||||
|
||||
@abstractmethod
|
||||
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
|
||||
pass
|
||||
|
||||
def _validate(self, data):
|
||||
dtype = data.dtype
|
||||
if pa_version_under10p1 or not isinstance(dtype, ArrowDtype):
|
||||
# Raise AttributeError so that inspect can handle non-struct Series.
|
||||
raise AttributeError(self._validation_msg.format(dtype=dtype))
|
||||
|
||||
if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
|
||||
# Raise AttributeError so that inspect can handle invalid Series.
|
||||
raise AttributeError(self._validation_msg.format(dtype=dtype))
|
||||
|
||||
@property
|
||||
def _pa_array(self):
|
||||
return self._data.array._pa_array
|
||||
|
||||
|
||||
class ListAccessor(ArrowAccessor):
|
||||
"""
|
||||
Accessor object for list data properties of the Series values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Series
|
||||
Series containing Arrow list data.
|
||||
"""
|
||||
|
||||
def __init__(self, data=None) -> None:
|
||||
super().__init__(
|
||||
data,
|
||||
validation_msg="Can only use the '.list' accessor with "
|
||||
"'list[pyarrow]' dtype, not {dtype}.",
|
||||
)
|
||||
|
||||
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
|
||||
return (
|
||||
pa.types.is_list(pyarrow_dtype)
|
||||
or pa.types.is_fixed_size_list(pyarrow_dtype)
|
||||
or pa.types.is_large_list(pyarrow_dtype)
|
||||
)
|
||||
|
||||
def len(self) -> Series:
|
||||
"""
|
||||
Return the length of each list in the Series.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The length of each list.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... [1, 2, 3],
|
||||
... [3],
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.list_(
|
||||
... pa.int64()
|
||||
... ))
|
||||
... )
|
||||
>>> s.list.len()
|
||||
0 3
|
||||
1 1
|
||||
dtype: int32[pyarrow]
|
||||
"""
|
||||
from pandas import Series
|
||||
|
||||
value_lengths = pc.list_value_length(self._pa_array)
|
||||
return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
|
||||
|
||||
def __getitem__(self, key: int | slice) -> Series:
|
||||
"""
|
||||
Index or slice lists in the Series.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
key : int | slice
|
||||
Index or slice of indices to access from each list.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The list at requested index.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... [1, 2, 3],
|
||||
... [3],
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.list_(
|
||||
... pa.int64()
|
||||
... ))
|
||||
... )
|
||||
>>> s.list[0]
|
||||
0 1
|
||||
1 3
|
||||
dtype: int64[pyarrow]
|
||||
"""
|
||||
from pandas import Series
|
||||
|
||||
if isinstance(key, int):
|
||||
# TODO: Support negative key but pyarrow does not allow
|
||||
# element index to be an array.
|
||||
# if key < 0:
|
||||
# key = pc.add(key, pc.list_value_length(self._pa_array))
|
||||
element = pc.list_element(self._pa_array, key)
|
||||
return Series(element, dtype=ArrowDtype(element.type))
|
||||
elif isinstance(key, slice):
|
||||
if pa_version_under11p0:
|
||||
raise NotImplementedError(
|
||||
f"List slice not supported by pyarrow {pa.__version__}."
|
||||
)
|
||||
|
||||
# TODO: Support negative start/stop/step, ideally this would be added
|
||||
# upstream in pyarrow.
|
||||
start, stop, step = key.start, key.stop, key.step
|
||||
if start is None:
|
||||
# TODO: When adding negative step support
|
||||
# this should be setto last element of array
|
||||
# when step is negative.
|
||||
start = 0
|
||||
if step is None:
|
||||
step = 1
|
||||
sliced = pc.list_slice(self._pa_array, start, stop, step)
|
||||
return Series(sliced, dtype=ArrowDtype(sliced.type))
|
||||
else:
|
||||
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
|
||||
|
||||
def __iter__(self) -> Iterator:
|
||||
raise TypeError(f"'{type(self).__name__}' object is not iterable")
|
||||
|
||||
def flatten(self) -> Series:
|
||||
"""
|
||||
Flatten list values.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The data from all lists in the series flattened.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... [1, 2, 3],
|
||||
... [3],
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.list_(
|
||||
... pa.int64()
|
||||
... ))
|
||||
... )
|
||||
>>> s.list.flatten()
|
||||
0 1
|
||||
1 2
|
||||
2 3
|
||||
3 3
|
||||
dtype: int64[pyarrow]
|
||||
"""
|
||||
from pandas import Series
|
||||
|
||||
flattened = pc.list_flatten(self._pa_array)
|
||||
return Series(flattened, dtype=ArrowDtype(flattened.type))
|
||||
|
||||
|
||||
class StructAccessor(ArrowAccessor):
|
||||
"""
|
||||
Accessor object for structured data properties of the Series values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Series
|
||||
Series containing Arrow struct data.
|
||||
"""
|
||||
|
||||
def __init__(self, data=None) -> None:
|
||||
super().__init__(
|
||||
data,
|
||||
validation_msg=(
|
||||
"Can only use the '.struct' accessor with 'struct[pyarrow]' "
|
||||
"dtype, not {dtype}."
|
||||
),
|
||||
)
|
||||
|
||||
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
|
||||
return pa.types.is_struct(pyarrow_dtype)
|
||||
|
||||
@property
|
||||
def dtypes(self) -> Series:
|
||||
"""
|
||||
Return the dtype object of each child field of the struct.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The data type of each child field.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... {"version": 1, "project": "pandas"},
|
||||
... {"version": 2, "project": "pandas"},
|
||||
... {"version": 1, "project": "numpy"},
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.struct(
|
||||
... [("version", pa.int64()), ("project", pa.string())]
|
||||
... ))
|
||||
... )
|
||||
>>> s.struct.dtypes
|
||||
version int64[pyarrow]
|
||||
project string[pyarrow]
|
||||
dtype: object
|
||||
"""
|
||||
from pandas import (
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
|
||||
pa_type = self._data.dtype.pyarrow_dtype
|
||||
types = [ArrowDtype(struct.type) for struct in pa_type]
|
||||
names = [struct.name for struct in pa_type]
|
||||
return Series(types, index=Index(names))
|
||||
|
||||
def field(
|
||||
self,
|
||||
name_or_index: list[str]
|
||||
| list[bytes]
|
||||
| list[int]
|
||||
| pc.Expression
|
||||
| bytes
|
||||
| str
|
||||
| int,
|
||||
) -> Series:
|
||||
"""
|
||||
Extract a child field of a struct as a Series.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name_or_index : str | bytes | int | expression | list
|
||||
Name or index of the child field to extract.
|
||||
|
||||
For list-like inputs, this will index into a nested
|
||||
struct.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The data corresponding to the selected child field.
|
||||
|
||||
See Also
|
||||
--------
|
||||
Series.struct.explode : Return all child fields as a DataFrame.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The name of the resulting Series will be set using the following
|
||||
rules:
|
||||
|
||||
- For string, bytes, or integer `name_or_index` (or a list of these, for
|
||||
a nested selection), the Series name is set to the selected
|
||||
field's name.
|
||||
- For a :class:`pyarrow.compute.Expression`, this is set to
|
||||
the string form of the expression.
|
||||
- For list-like `name_or_index`, the name will be set to the
|
||||
name of the final field selected.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... {"version": 1, "project": "pandas"},
|
||||
... {"version": 2, "project": "pandas"},
|
||||
... {"version": 1, "project": "numpy"},
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.struct(
|
||||
... [("version", pa.int64()), ("project", pa.string())]
|
||||
... ))
|
||||
... )
|
||||
|
||||
Extract by field name.
|
||||
|
||||
>>> s.struct.field("project")
|
||||
0 pandas
|
||||
1 pandas
|
||||
2 numpy
|
||||
Name: project, dtype: string[pyarrow]
|
||||
|
||||
Extract by field index.
|
||||
|
||||
>>> s.struct.field(0)
|
||||
0 1
|
||||
1 2
|
||||
2 1
|
||||
Name: version, dtype: int64[pyarrow]
|
||||
|
||||
Or an expression
|
||||
|
||||
>>> import pyarrow.compute as pc
|
||||
>>> s.struct.field(pc.field("project"))
|
||||
0 pandas
|
||||
1 pandas
|
||||
2 numpy
|
||||
Name: project, dtype: string[pyarrow]
|
||||
|
||||
        For nested struct types, you can pass a list of values to index
        multiple levels:

        >>> version_type = pa.struct([
        ...     ("major", pa.int64()),
        ...     ("minor", pa.int64()),
        ... ])
        >>> s = pd.Series(
        ...     [
        ...         {"version": {"major": 1, "minor": 5}, "project": "pandas"},
        ...         {"version": {"major": 2, "minor": 1}, "project": "pandas"},
        ...         {"version": {"major": 1, "minor": 26}, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", version_type), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.field(["version", "minor"])
        0     5
        1     1
        2    26
        Name: minor, dtype: int64[pyarrow]
        >>> s.struct.field([0, 0])
        0    1
        1    2
        2    1
        Name: major, dtype: int64[pyarrow]
        """
        from pandas import Series

        def get_name(
            level_name_or_index: list[str]
            | list[bytes]
            | list[int]
            | pc.Expression
            | bytes
            | str
            | int,
            data: pa.ChunkedArray,
        ):
            if isinstance(level_name_or_index, int):
                name = data.type.field(level_name_or_index).name
            elif isinstance(level_name_or_index, (str, bytes)):
                name = level_name_or_index
            elif isinstance(level_name_or_index, pc.Expression):
                name = str(level_name_or_index)
            elif is_list_like(level_name_or_index):
                # For nested input like [2, 1, 2]
                # iteratively get the struct and field name. The last
                # one is used for the name of the index.
                level_name_or_index = list(reversed(level_name_or_index))
                selected = data
                while level_name_or_index:
                    # we need the cast, otherwise mypy complains about
                    # getting ints, bytes, or str here, which isn't possible.
                    level_name_or_index = cast(list, level_name_or_index)
                    name_or_index = level_name_or_index.pop()
                    name = get_name(name_or_index, selected)
                    selected = selected.type.field(selected.type.get_field_index(name))
                    name = selected.name
            else:
                raise ValueError(
                    "name_or_index must be an int, str, bytes, "
                    "pyarrow.compute.Expression, or list of those"
                )
            return name

        pa_arr = self._data.array._pa_array
        name = get_name(name_or_index, pa_arr)
        field_arr = pc.struct_field(pa_arr, name_or_index)

        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._data.index,
            name=name,
        )

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

        pa_type = self._pa_array.type
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )
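
# Usage sketch for the struct accessor above, assuming pandas built with
# pyarrow: Series.struct.field passes the indexer straight through to
# pyarrow.compute.struct_field, so int, name, and nested-list lookups agree
# between the two layers.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array(
    [[{"version": {"major": 1, "minor": 5}}, {"version": {"major": 2, "minor": 1}}]]
)
minor = pc.struct_field(arr, ["version", "minor"])
assert minor.to_pylist() == [5, 1]
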
2946
lib/python3.11/site-packages/pandas/core/arrays/arrow/array.py
Normal file
File diff suppressed because it is too large
174
lib/python3.11/site-packages/pandas/core/arrays/arrow/extension_types.py
Normal file
@ -0,0 +1,174 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING

import pyarrow

from pandas.compat import pa_version_under14p1

from pandas.core.dtypes.dtypes import (
    IntervalDtype,
    PeriodDtype,
)

from pandas.core.arrays.interval import VALID_CLOSED

if TYPE_CHECKING:
    from pandas._typing import IntervalClosedType


class ArrowPeriodType(pyarrow.ExtensionType):
    def __init__(self, freq) -> None:
        # attributes need to be set first before calling
        # super init (as that calls serialize)
        self._freq = freq
        pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")

    @property
    def freq(self):
        return self._freq

    def __arrow_ext_serialize__(self) -> bytes:
        metadata = {"freq": self.freq}
        return json.dumps(metadata).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
        metadata = json.loads(serialized.decode())
        return ArrowPeriodType(metadata["freq"])

    def __eq__(self, other):
        if isinstance(other, pyarrow.BaseExtensionType):
            return type(self) == type(other) and self.freq == other.freq
        else:
            return NotImplemented

    def __ne__(self, other) -> bool:
        return not self == other

    def __hash__(self) -> int:
        return hash((str(self), self.freq))

    def to_pandas_dtype(self) -> PeriodDtype:
        return PeriodDtype(freq=self.freq)


# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)


class ArrowIntervalType(pyarrow.ExtensionType):
    def __init__(self, subtype, closed: IntervalClosedType) -> None:
        # attributes need to be set first before calling
        # super init (as that calls serialize)
        assert closed in VALID_CLOSED
        self._closed: IntervalClosedType = closed
        if not isinstance(subtype, pyarrow.DataType):
            subtype = pyarrow.type_for_alias(str(subtype))
        self._subtype = subtype

        storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
        pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")

    @property
    def subtype(self):
        return self._subtype

    @property
    def closed(self) -> IntervalClosedType:
        return self._closed

    def __arrow_ext_serialize__(self) -> bytes:
        metadata = {"subtype": str(self.subtype), "closed": self.closed}
        return json.dumps(metadata).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
        metadata = json.loads(serialized.decode())
        subtype = pyarrow.type_for_alias(metadata["subtype"])
        closed = metadata["closed"]
        return ArrowIntervalType(subtype, closed)

    def __eq__(self, other):
        if isinstance(other, pyarrow.BaseExtensionType):
            return (
                type(self) == type(other)
                and self.subtype == other.subtype
                and self.closed == other.closed
            )
        else:
            return NotImplemented

    def __ne__(self, other) -> bool:
        return not self == other

    def __hash__(self) -> int:
        return hash((str(self), str(self.subtype), self.closed))

    def to_pandas_dtype(self) -> IntervalDtype:
        return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)


# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)


_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}

Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:

- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
  `import pyarrow_hotfix; pyarrow_hotfix.uninstall()`

We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
"""


def patch_pyarrow():
    # starting from pyarrow 14.0.1, it has its own mechanism
    if not pa_version_under14p1:
        return

    # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
    if getattr(pyarrow, "_hotfix_installed", False):
        return

    class ForbiddenExtensionType(pyarrow.ExtensionType):
        def __arrow_ext_serialize__(self):
            return b""

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            import io
            import pickletools

            out = io.StringIO()
            pickletools.dis(serialized, out)
            raise RuntimeError(
                _ERROR_MSG.format(
                    storage_type=storage_type,
                    serialized=serialized,
                    pickle_disassembly=out.getvalue(),
                )
            )

    pyarrow.unregister_extension_type("arrow.py_extension_type")
    pyarrow.register_extension_type(
        ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
    )

    pyarrow._hotfix_installed = True


patch_pyarrow()
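
# Round-trip sketch of why these registrations matter, assuming pyarrow is
# installed: because "pandas.period" is registered above, a period Series
# survives Arrow conversion instead of degrading to its int64 storage.
import pandas as pd
import pyarrow

s = pd.Series(pd.period_range("2024-01", periods=3, freq="M"), name="p")
table = pyarrow.table({"p": s})  # stored as the registered extension type
assert table.to_pandas()["p"].dtype == s.dtype
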
2609
lib/python3.11/site-packages/pandas/core/arrays/base.py
Normal file
File diff suppressed because it is too large
407
lib/python3.11/site-packages/pandas/core/arrays/boolean.py
Normal file
@ -0,0 +1,407 @@
from __future__ import annotations

import numbers
from typing import (
    TYPE_CHECKING,
    ClassVar,
    cast,
)

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna

from pandas.core import ops
from pandas.core.array_algos import masked_accumulations
from pandas.core.arrays.masked import (
    BaseMaskedArray,
    BaseMaskedDtype,
)

if TYPE_CHECKING:
    import pyarrow

    from pandas._typing import (
        Dtype,
        DtypeObj,
        Self,
        npt,
        type_t,
    )


@register_extension_dtype
class BooleanDtype(BaseMaskedDtype):
    """
    Extension dtype for boolean data.

    .. warning::

       BooleanDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.BooleanDtype()
    BooleanDtype
    """

    name: ClassVar[str] = "boolean"

    # https://github.com/python/mypy/issues/4125
    # error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
    @property
    def type(self) -> type:  # type: ignore[override]
        return np.bool_

    @property
    def kind(self) -> str:
        return "b"

    @property
    def numpy_dtype(self) -> np.dtype:
        return np.dtype("bool")

    @classmethod
    def construct_array_type(cls) -> type_t[BooleanArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return BooleanArray

    def __repr__(self) -> str:
        return "BooleanDtype"

    @property
    def _is_boolean(self) -> bool:
        return True

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BooleanArray:
        """
        Construct BooleanArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type):
            raise TypeError(f"Expected array of boolean type, got {array.type} instead")

        if isinstance(array, pyarrow.Array):
            chunks = [array]
            length = len(array)
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks
            length = array.length()

        if pyarrow.types.is_null(array.type):
            mask = np.ones(length, dtype=bool)
            # No need to init data, since all null
            data = np.empty(length, dtype=bool)
            return BooleanArray(data, mask)

        results = []
        for arr in chunks:
            buflist = arr.buffers()
            data = pyarrow.BooleanArray.from_buffers(
                arr.type, len(arr), [None, buflist[1]], offset=arr.offset
            ).to_numpy(zero_copy_only=False)
            if arr.null_count != 0:
                mask = pyarrow.BooleanArray.from_buffers(
                    arr.type, len(arr), [None, buflist[0]], offset=arr.offset
                ).to_numpy(zero_copy_only=False)
                mask = ~mask
            else:
                mask = np.zeros(len(arr), dtype=bool)

            bool_arr = BooleanArray(data, mask)
            results.append(bool_arr)

        if not results:
            return BooleanArray(
                np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
            )
        else:
            return BooleanArray._concat_same_type(results)


def coerce_to_array(
    values, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)
    """
    if isinstance(values, BooleanArray):
        if mask is not None:
            raise ValueError("cannot pass mask for BooleanArray input")
        values, mask = values._data, values._mask
        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    mask_values = None
    if isinstance(values, np.ndarray) and values.dtype == np.bool_:
        if copy:
            values = values.copy()
    elif isinstance(values, np.ndarray) and values.dtype.kind in "iufcb":
        mask_values = isna(values)

        values_bool = np.zeros(len(values), dtype=bool)
        values_bool[~mask_values] = values[~mask_values].astype(bool)

        if not np.all(
            values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
        ):
            raise TypeError("Need to pass bool-like values")

        values = values_bool
    else:
        values_object = np.asarray(values, dtype=object)

        inferred_dtype = lib.infer_dtype(values_object, skipna=True)
        integer_like = ("floating", "integer", "mixed-integer-float")
        if inferred_dtype not in ("boolean", "empty") + integer_like:
            raise TypeError("Need to pass bool-like values")

        # mypy does not narrow the type of mask_values to npt.NDArray[np.bool_]
        # within this branch, it assumes it can also be None
        mask_values = cast("npt.NDArray[np.bool_]", isna(values_object))
        values = np.zeros(len(values), dtype=bool)
        values[~mask_values] = values_object[~mask_values].astype(bool)

        # if the values were integer-like, validate it were actually 0/1's
        if (inferred_dtype in integer_like) and not (
            np.all(
                values[~mask_values].astype(float)
                == values_object[~mask_values].astype(float)
            )
        ):
            raise TypeError("Need to pass bool-like values")

    if mask is None and mask_values is None:
        mask = np.zeros(values.shape, dtype=bool)
    elif mask is None:
        mask = mask_values
    else:
        if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
            if mask_values is not None:
                mask = mask | mask_values
            else:
                if copy:
                    mask = mask.copy()
        else:
            mask = np.array(mask, dtype=bool)
            if mask_values is not None:
                mask = mask | mask_values

    if values.shape != mask.shape:
        raise ValueError("values.shape and mask.shape must match")

    return values, mask


class BooleanArray(BaseMaskedArray):
    """
    Array of boolean (True/False) data with missing values.

    This is a pandas Extension array for boolean data, under the hood
    represented by 2 numpy arrays: a boolean array with the data and
    a boolean array with the mask (True indicating missing).

    BooleanArray implements Kleene logic (sometimes called three-value
    logic) for logical operations. See :ref:`boolean.kleene` for more.

    To construct a BooleanArray from generic array-like input, use
    :func:`pandas.array` specifying ``dtype="boolean"`` (see examples
    below).

    .. warning::

       BooleanArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d boolean-dtype array with the data.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values (True
        indicates missing).
    copy : bool, default False
        Whether to copy the `values` and `mask` arrays.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    BooleanArray

    Examples
    --------
    Create a BooleanArray with :func:`pandas.array`:

    >>> pd.array([True, False, None], dtype="boolean")
    <BooleanArray>
    [True, False, <NA>]
    Length: 3, dtype: boolean
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = False
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "bool", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = True  # type: ignore[assignment]
    _falsey_value = False  # type: ignore[assignment]
    _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
    _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}

    @classmethod
    def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self:
        result = super()._simple_new(values, mask)
        result._dtype = BooleanDtype()
        return result

    def __init__(
        self, values: np.ndarray, mask: np.ndarray, copy: bool = False
    ) -> None:
        if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
            raise TypeError(
                "values should be boolean numpy array. Use "
                "the 'pd.array' function instead"
            )
        self._dtype = BooleanDtype()
        super().__init__(values, mask, copy=copy)

    @property
    def dtype(self) -> BooleanDtype:
        return self._dtype

    @classmethod
    def _from_sequence_of_strings(
        cls,
        strings: list[str],
        *,
        dtype: Dtype | None = None,
        copy: bool = False,
        true_values: list[str] | None = None,
        false_values: list[str] | None = None,
    ) -> BooleanArray:
        true_values_union = cls._TRUE_VALUES.union(true_values or [])
        false_values_union = cls._FALSE_VALUES.union(false_values or [])

        def map_string(s) -> bool:
            if s in true_values_union:
                return True
            elif s in false_values_union:
                return False
            else:
                raise ValueError(f"{s} cannot be cast to bool")

        scalars = np.array(strings, dtype=object)
        mask = isna(scalars)
        scalars[~mask] = list(map(map_string, scalars[~mask]))
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        if dtype:
            assert dtype == "boolean"
        return coerce_to_array(value, copy=copy)

    def _logical_method(self, other, op):
        assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
        other_is_scalar = lib.is_scalar(other)
        mask = None

        if isinstance(other, BooleanArray):
            other, mask = other._data, other._mask
        elif is_list_like(other):
            other = np.asarray(other, dtype="bool")
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            other, mask = coerce_to_array(other, copy=False)
        elif isinstance(other, np.bool_):
            other = other.item()

        if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
            raise TypeError(
                "'other' should be pandas.NA or a bool. "
                f"Got {type(other).__name__} instead."
            )

        if not other_is_scalar and len(self) != len(other):
            raise ValueError("Lengths must match")

        if op.__name__ in {"or_", "ror_"}:
            result, mask = ops.kleene_or(self._data, other, self._mask, mask)
        elif op.__name__ in {"and_", "rand_"}:
            result, mask = ops.kleene_and(self._data, other, self._mask, mask)
        else:
            # i.e. xor, rxor
            result, mask = ops.kleene_xor(self._data, other, self._mask, mask)

        # i.e. BooleanArray
        return self._maybe_mask_result(result, mask)

    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> BaseMaskedArray:
        data = self._data
        mask = self._mask
        if name in ("cummin", "cummax"):
            op = getattr(masked_accumulations, name)
            data, mask = op(data, mask, skipna=skipna, **kwargs)
            return self._simple_new(data, mask)
        else:
            from pandas.core.arrays import IntegerArray

            return IntegerArray(data.astype(int), mask)._accumulate(
                name, skipna=skipna, **kwargs
            )
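
# Kleene-logic sketch for _logical_method above, assuming only pandas:
# NA | True is True (the missing value cannot change the outcome), while
# NA & True stays NA (the missing value still could).
import pandas as pd

s = pd.Series([True, False, None], dtype="boolean")
assert (s | True).tolist() == [True, True, True]
assert (s & True)[2] is pd.NA
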
3111
lib/python3.11/site-packages/pandas/core/arrays/categorical.py
Normal file
File diff suppressed because it is too large
2583
lib/python3.11/site-packages/pandas/core/arrays/datetimelike.py
Normal file
File diff suppressed because it is too large
2837
lib/python3.11/site-packages/pandas/core/arrays/datetimes.py
Normal file
File diff suppressed because it is too large
173
lib/python3.11/site-packages/pandas/core/arrays/floating.py
Normal file
@ -0,0 +1,173 @@
from __future__ import annotations

from typing import ClassVar

import numpy as np

from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_float_dtype

from pandas.core.arrays.numeric import (
    NumericArray,
    NumericDtype,
)


class FloatingDtype(NumericDtype):
    """
    An ExtensionDtype to hold a single size of floating dtype.

    These specific implementations are subclasses of the non-public
    FloatingDtype. For example we have Float32Dtype to represent float32.

    The attributes name & type are set when these subclasses are created.
    """

    _default_np_dtype = np.dtype(np.float64)
    _checker = is_float_dtype

    @classmethod
    def construct_array_type(cls) -> type[FloatingArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatingArray

    @classmethod
    def _get_dtype_mapping(cls) -> dict[np.dtype, FloatingDtype]:
        return NUMPY_FLOAT_TO_DTYPE

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless.
        """
        # This is really only here for compatibility with IntegerDtype
        return values.astype(dtype, copy=copy)


class FloatingArray(NumericArray):
    """
    Array of floating (optional missing) values.

    .. warning::

       FloatingArray is currently experimental, and its API or internal
       implementation may change without warning. Especially the behaviour
       regarding NaN (distinct from NA missing values) is subject to change.

    We represent a FloatingArray with 2 numpy arrays:

    - data: contains a numpy float array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct a FloatingArray from generic array-like input, use
    :func:`pandas.array` with one of the float dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d float-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    FloatingArray

    Examples
    --------
    Create a FloatingArray with :func:`pandas.array`:

    >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([0.1, None, 0.3], dtype="Float32")
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32
    """

    _dtype_cls = FloatingDtype

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = np.nan
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "float", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = 1.0  # type: ignore[assignment]
    _falsey_value = 0.0  # type: ignore[assignment]


_dtype_docstring = """
An ExtensionDtype for {dtype} data.

This dtype uses ``pd.NA`` as missing value indicator.

Attributes
----------
None

Methods
-------
None

Examples
--------
For Float32Dtype:

>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float32Dtype())
>>> ser.dtype
Float32Dtype()

For Float64Dtype:

>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float64Dtype())
>>> ser.dtype
Float64Dtype()
"""

# create the Dtype


@register_extension_dtype
class Float32Dtype(FloatingDtype):
    type = np.float32
    name: ClassVar[str] = "Float32"
    __doc__ = _dtype_docstring.format(dtype="float32")


@register_extension_dtype
class Float64Dtype(FloatingDtype):
    type = np.float64
    name: ClassVar[str] = "Float64"
    __doc__ = _dtype_docstring.format(dtype="float64")


NUMPY_FLOAT_TO_DTYPE: dict[np.dtype, FloatingDtype] = {
    np.dtype(np.float32): Float32Dtype(),
    np.dtype(np.float64): Float64Dtype(),
}
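
# Dtype-resolution sketch, assuming only pandas: pd.array consults
# NUMPY_FLOAT_TO_DTYPE (via FloatingDtype._get_dtype_mapping) to pair a
# backing float numpy array with the matching nullable dtype.
import numpy as np
import pandas as pd

arr = pd.array(np.array([0.1, 0.2], dtype=np.float32))
assert arr.dtype == pd.Float32Dtype()
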
272
lib/python3.11/site-packages/pandas/core/arrays/integer.py
Normal file
@ -0,0 +1,272 @@
from __future__ import annotations

from typing import ClassVar

import numpy as np

from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_integer_dtype

from pandas.core.arrays.numeric import (
    NumericArray,
    NumericDtype,
)


class IntegerDtype(NumericDtype):
    """
    An ExtensionDtype to hold a single size & kind of integer dtype.

    These specific implementations are subclasses of the non-public
    IntegerDtype. For example, we have Int8Dtype to represent signed int 8s.

    The attributes name & type are set when these subclasses are created.
    """

    _default_np_dtype = np.dtype(np.int64)
    _checker = is_integer_dtype

    @classmethod
    def construct_array_type(cls) -> type[IntegerArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return IntegerArray

    @classmethod
    def _get_dtype_mapping(cls) -> dict[np.dtype, IntegerDtype]:
        return NUMPY_INT_TO_DTYPE

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless. e.g. if 'values'
        has a floating dtype, each value must be an integer.
        """
        try:
            return values.astype(dtype, casting="safe", copy=copy)
        except TypeError as err:
            casted = values.astype(dtype, copy=copy)
            if (casted == values).all():
                return casted

            raise TypeError(
                f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
            ) from err


class IntegerArray(NumericArray):
    """
    Array of integer (optional missing) values.

    Uses :attr:`pandas.NA` as the missing value.

    .. warning::

       IntegerArray is currently experimental, and its API or internal
       implementation may change without warning.

    We represent an IntegerArray with 2 numpy arrays:

    - data: contains a numpy integer array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct an IntegerArray from generic array-like input, use
    :func:`pandas.array` with one of the integer dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d integer-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    IntegerArray

    Examples
    --------
    Create an IntegerArray with :func:`pandas.array`.

    >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
    >>> int_array
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([1, None, 3], dtype='Int32')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    >>> pd.array([1, None, 3], dtype='UInt16')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: UInt16
    """

    _dtype_cls = IntegerDtype

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = 1
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "int", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = 1  # type: ignore[assignment]
    _falsey_value = 0  # type: ignore[assignment]


_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.

Uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`.

Attributes
----------
None

Methods
-------
None

Examples
--------
For Int8Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int8Dtype())
>>> ser.dtype
Int8Dtype()

For Int16Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int16Dtype())
>>> ser.dtype
Int16Dtype()

For Int32Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int32Dtype())
>>> ser.dtype
Int32Dtype()

For Int64Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int64Dtype())
>>> ser.dtype
Int64Dtype()

For UInt8Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt8Dtype())
>>> ser.dtype
UInt8Dtype()

For UInt16Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt16Dtype())
>>> ser.dtype
UInt16Dtype()

For UInt32Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt32Dtype())
>>> ser.dtype
UInt32Dtype()

For UInt64Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt64Dtype())
>>> ser.dtype
UInt64Dtype()
"""

# create the Dtype


@register_extension_dtype
class Int8Dtype(IntegerDtype):
    type = np.int8
    name: ClassVar[str] = "Int8"
    __doc__ = _dtype_docstring.format(dtype="int8")


@register_extension_dtype
class Int16Dtype(IntegerDtype):
    type = np.int16
    name: ClassVar[str] = "Int16"
    __doc__ = _dtype_docstring.format(dtype="int16")


@register_extension_dtype
class Int32Dtype(IntegerDtype):
    type = np.int32
    name: ClassVar[str] = "Int32"
    __doc__ = _dtype_docstring.format(dtype="int32")


@register_extension_dtype
class Int64Dtype(IntegerDtype):
    type = np.int64
    name: ClassVar[str] = "Int64"
    __doc__ = _dtype_docstring.format(dtype="int64")


@register_extension_dtype
class UInt8Dtype(IntegerDtype):
    type = np.uint8
    name: ClassVar[str] = "UInt8"
    __doc__ = _dtype_docstring.format(dtype="uint8")


@register_extension_dtype
class UInt16Dtype(IntegerDtype):
    type = np.uint16
    name: ClassVar[str] = "UInt16"
    __doc__ = _dtype_docstring.format(dtype="uint16")


@register_extension_dtype
class UInt32Dtype(IntegerDtype):
    type = np.uint32
    name: ClassVar[str] = "UInt32"
    __doc__ = _dtype_docstring.format(dtype="uint32")


@register_extension_dtype
class UInt64Dtype(IntegerDtype):
    type = np.uint64
    name: ClassVar[str] = "UInt64"
    __doc__ = _dtype_docstring.format(dtype="uint64")


NUMPY_INT_TO_DTYPE: dict[np.dtype, IntegerDtype] = {
    np.dtype(np.int8): Int8Dtype(),
    np.dtype(np.int16): Int16Dtype(),
    np.dtype(np.int32): Int32Dtype(),
    np.dtype(np.int64): Int64Dtype(),
    np.dtype(np.uint8): UInt8Dtype(),
    np.dtype(np.uint16): UInt16Dtype(),
    np.dtype(np.uint32): UInt32Dtype(),
    np.dtype(np.uint64): UInt64Dtype(),
}
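
# Safe-cast sketch, assuming only pandas: IntegerDtype._safe_cast accepts
# float input when every value round-trips exactly, and raises otherwise.
import pandas as pd

pd.array([1.0, 2.0, None], dtype="Int64")  # lossless, so accepted
try:
    pd.array([1.5], dtype="Int64")
except TypeError as err:
    assert "cannot safely cast non-equivalent" in str(err)
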
1930
lib/python3.11/site-packages/pandas/core/arrays/interval.py
Normal file
File diff suppressed because it is too large
1669
lib/python3.11/site-packages/pandas/core/arrays/masked.py
Normal file
File diff suppressed because it is too large
286
lib/python3.11/site-packages/pandas/core/arrays/numeric.py
Normal file
@ -0,0 +1,286 @@
from __future__ import annotations

import numbers
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
    is_integer_dtype,
    is_string_dtype,
    pandas_dtype,
)

from pandas.core.arrays.masked import (
    BaseMaskedArray,
    BaseMaskedDtype,
)

if TYPE_CHECKING:
    from collections.abc import Mapping

    import pyarrow

    from pandas._typing import (
        Dtype,
        DtypeObj,
        Self,
        npt,
    )


class NumericDtype(BaseMaskedDtype):
    _default_np_dtype: np.dtype
    _checker: Callable[[Any], bool]  # is_foo_dtype

    def __repr__(self) -> str:
        return f"{self.name}Dtype()"

    @cache_readonly
    def is_signed_integer(self) -> bool:
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        return self.kind == "u"

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseMaskedArray:
        """
        Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays.arrow._arrow_utils import (
            pyarrow_array_to_numpy_and_mask,
        )

        array_class = self.construct_array_type()

        pyarrow_type = pyarrow.from_numpy_dtype(self.type)
        if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null(
            array.type
        ):
            # test_from_arrow_type_error raise for string, but allow
            # through itemsize conversion GH#31896
            rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
            if rt_dtype.kind not in "iuf":
                # Could allow "c" or potentially disallow float<->int conversion,
                # but at the moment we specifically test that uint<->int works
                raise TypeError(
                    f"Expected array of {self} type, got {array.type} instead"
                )

            array = array.cast(pyarrow_type)

        if isinstance(array, pyarrow.ChunkedArray):
            # TODO this "if" can be removed when requiring pyarrow >= 10.0, which fixed
            # combine_chunks for empty arrays https://github.com/apache/arrow/pull/13757
            if array.num_chunks == 0:
                array = pyarrow.array([], type=array.type)
            else:
                array = array.combine_chunks()

        data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
        return array_class(data.copy(), ~mask, copy=False)

    @classmethod
    def _get_dtype_mapping(cls) -> Mapping[np.dtype, NumericDtype]:
        raise AbstractMethodError(cls)

    @classmethod
    def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
        """
        Convert a string representation or a numpy dtype to NumericDtype.
        """
        if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not isinstance(dtype, NumericDtype):
            mapping = cls._get_dtype_mapping()
            try:
                dtype = mapping[np.dtype(dtype)]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err
        return dtype

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless.
        """
        raise AbstractMethodError(cls)


def _coerce_to_data_and_mask(
    values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype
):
    checker = dtype_cls._checker

    mask = None
    inferred_type = None

    if dtype is None and hasattr(values, "dtype"):
        if checker(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        dtype = dtype_cls._standardize_dtype(dtype)

    cls = dtype_cls.construct_array_type()
    if isinstance(values, cls):
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask, dtype, inferred_type

    original = values
    if not copy:
        values = np.asarray(values)
    else:
        values = np.array(values, copy=copy)
    inferred_type = None
    if values.dtype == object or is_string_dtype(values.dtype):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "boolean" and dtype is None:
            name = dtype_cls.__name__.strip("_")
            raise TypeError(f"{values.dtype} cannot be converted to {name}")

    elif values.dtype.kind == "b" and checker(dtype):
        if not copy:
            values = np.asarray(values, dtype=default_dtype)
        else:
            values = np.array(values, dtype=default_dtype, copy=copy)

    elif values.dtype.kind not in "iuf":
        name = dtype_cls.__name__.strip("_")
        raise TypeError(f"{values.dtype} cannot be converted to {name}")

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")

    if mask is None:
        if values.dtype.kind in "iu":
            # fastpath
            mask = np.zeros(len(values), dtype=np.bool_)
        else:
            mask = libmissing.is_numeric_na(values)
    else:
        assert len(mask) == len(values)

    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = default_dtype
    else:
        dtype = dtype.numpy_dtype

    if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
        if mask.all():
            values = np.ones(values.shape, dtype=dtype)
        else:
            idx = np.nanargmax(values)
            if int(values[idx]) != original[idx]:
                # We have ints that lost precision during the cast.
                inferred_type = lib.infer_dtype(original, skipna=True)
                if (
                    inferred_type not in ["floating", "mixed-integer-float"]
                    and not mask.any()
                ):
                    values = np.asarray(original, dtype=dtype)
                else:
                    values = np.asarray(original, dtype="object")

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        values[mask] = cls._internal_fill_value
    if inferred_type in ("string", "unicode"):
        # casts from str are always safe since they raise
        # a ValueError if the str cannot be parsed into a float
        values = values.astype(dtype, copy=copy)
    else:
        values = dtype_cls._safe_cast(values, dtype, copy=False)

    return values, mask, dtype, inferred_type


class NumericArray(BaseMaskedArray):
    """
    Base class for IntegerArray and FloatingArray.
    """

    _dtype_cls: type[NumericDtype]

    def __init__(
        self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
    ) -> None:
        checker = self._dtype_cls._checker
        if not (isinstance(values, np.ndarray) and checker(values.dtype)):
            descr = (
                "floating"
                if self._dtype_cls.kind == "f"  # type: ignore[comparison-overlap]
                else "integer"
            )
            raise TypeError(
                f"values should be {descr} numpy array. Use "
                "the 'pd.array' function instead"
            )
        if values.dtype == np.float16:
            # If we don't raise here, then accessing self.dtype would raise
            raise TypeError("FloatingArray does not support np.float16 dtype.")

        super().__init__(values, mask, copy=copy)

    @cache_readonly
    def dtype(self) -> NumericDtype:
        mapping = self._dtype_cls._get_dtype_mapping()
        return mapping[self._data.dtype]

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        dtype_cls = cls._dtype_cls
        default_dtype = dtype_cls._default_np_dtype
        values, mask, _, _ = _coerce_to_data_and_mask(
            value, dtype, copy, dtype_cls, default_dtype
        )
        return values, mask

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ) -> Self:
        from pandas.core.tools.numeric import to_numeric

        scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)
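
# Coercion sketch, assuming only pandas: pd.array funnels list-like input
# through _coerce_to_data_and_mask, producing a (values, mask) pair where
# masked slots hold _internal_fill_value rather than a real value.
import pandas as pd

arr = pd.array([1, None, 3], dtype="Int32")
assert arr._mask.tolist() == [False, True, False]
assert arr._data[1] == 1  # IntegerArray._internal_fill_value
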
574
lib/python3.11/site-packages/pandas/core/arrays/numpy_.py
Normal file
@ -0,0 +1,574 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs import is_supported_dtype
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.missing import isna

from pandas.core import (
    arraylike,
    missing,
    nanops,
    ops,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.strings.object_array import ObjectStringArrayMixin

if TYPE_CHECKING:
    from collections.abc import Callable

    from pandas._typing import (
        AxisInt,
        Dtype,
        FillnaOptions,
        InterpolateOptions,
        NpDtype,
        Scalar,
        Self,
        npt,
    )

    from pandas import Index


# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
class NumpyExtensionArray(  # type: ignore[misc]
    OpsMixin,
    NDArrayBackedExtensionArray,
    ObjectStringArrayMixin,
):
    """
    A pandas ExtensionArray for NumPy data.

    This is mostly for internal compatibility, and is not especially
    useful on its own.

    Parameters
    ----------
    values : ndarray
        The NumPy ndarray to wrap. Must be 1-dimensional.
    copy : bool, default False
        Whether to copy `values`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3]))
    <NumpyExtensionArray>
    [0, 1, 2, 3]
    Length: 4, dtype: int64
    """

    # If you're wondering why pd.Series(cls) doesn't put the array in an
    # ExtensionBlock, search for `ABCNumpyExtensionArray`. We check for
    # that _typ to ensure that users don't unnecessarily use EAs inside
    # pandas internals, which turns off things like block consolidation.
    _typ = "npy_extension"
    __array_priority__ = 1000
    _ndarray: np.ndarray
    _dtype: NumpyEADtype
    _internal_fill_value = np.nan

    # ------------------------------------------------------------------------
    # Constructors

    def __init__(
        self, values: np.ndarray | NumpyExtensionArray, copy: bool = False
    ) -> None:
        if isinstance(values, type(self)):
            values = values._ndarray
        if not isinstance(values, np.ndarray):
            raise ValueError(
                f"'values' must be a NumPy array, not {type(values).__name__}"
            )

        if values.ndim == 0:
            # Technically we support 2, but do not advertise that fact.
            raise ValueError("NumpyExtensionArray must be 1-dimensional.")

        if copy:
            values = values.copy()

        dtype = NumpyEADtype(values.dtype)
        super().__init__(values, dtype)

    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
    ) -> NumpyExtensionArray:
        if isinstance(dtype, NumpyEADtype):
            dtype = dtype._dtype

        # error: Argument "dtype" to "asarray" has incompatible type
        # "Union[ExtensionDtype, str, dtype[Any], dtype[floating[_64Bit]], Type[object],
        # None]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
        # _DTypeDict, Tuple[Any, Any]]]"
        result = np.asarray(scalars, dtype=dtype)  # type: ignore[arg-type]
        if (
            result.ndim > 1
            and not hasattr(scalars, "dtype")
            and (dtype is None or dtype == object)
        ):
            # e.g. list-of-tuples
            result = construct_1d_object_array_from_listlike(scalars)

        if copy and result is scalars:
            result = result.copy()
        return cls(result)

    # ------------------------------------------------------------------------
    # Data

    @property
    def dtype(self) -> NumpyEADtype:
        return self._dtype

    # ------------------------------------------------------------------------
    # NumPy Array Interface

    def __array__(
        self, dtype: NpDtype | None = None, copy: bool | None = None
    ) -> np.ndarray:
        if copy is not None:
            # Note: branch avoids `copy=None` for NumPy 1.x support
            return np.array(self._ndarray, dtype=dtype, copy=copy)
        return np.asarray(self._ndarray, dtype=dtype)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        # Lightly modified version of
        # https://numpy.org/doc/stable/reference/generated/numpy.lib.mixins.NDArrayOperatorsMixin.html
        # The primary modification is not boxing scalar return values
        # in NumpyExtensionArray, since pandas' ExtensionArrays are 1-d.
        out = kwargs.get("out", ())

        result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. test_ufunc_unary
            return arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        # Defer to the implementation of the ufunc on unwrapped values.
        inputs = tuple(
            x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in inputs
        )
        if out:
            kwargs["out"] = tuple(
                x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in out
            )
        result = getattr(ufunc, method)(*inputs, **kwargs)

        if ufunc.nout > 1:
            # multiple return values; re-box array-like results
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        elif method == "reduce":
            if isinstance(result, np.ndarray):
                # e.g. test_np_reduce_2d
                return type(self)(result)

            # e.g. test_np_max_nested_tuples
            return result
        else:
            # one return value; re-box array-like results
            return type(self)(result)
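
    # A small dispatch sketch for __array_ufunc__ above, assuming only numpy
    # and pandas:
    # >>> arr = pd.arrays.NumpyExtensionArray(np.array([1.0, 4.0]))
    # >>> np.sqrt(arr)        # one array result -> re-boxed as NumpyExtensionArray
    # >>> np.add.reduce(arr)  # reduction -> plain scalar 5.0, not re-boxed
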
    # ------------------------------------------------------------------------
    # Pandas ExtensionArray Interface

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self

        result = astype_array(self._ndarray, dtype=dtype, copy=copy)
        return result

    def isna(self) -> np.ndarray:
        return isna(self._ndarray)

    def _validate_scalar(self, fill_value):
        if fill_value is None:
            # Primarily for subclasses
            fill_value = self.dtype.na_value
        return fill_value

    def _values_for_factorize(self) -> tuple[np.ndarray, float | None]:
        if self.dtype.kind in "iub":
            fv = None
        else:
            fv = np.nan
        return self._ndarray, fv

    # Base EA class (and all other EA classes) don't have limit_area keyword
    # This can be removed here as well when the interpolate ffill/bfill method
    # deprecation is enforced
    def _pad_or_backfill(
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        """
        ffill or bfill along axis=0.
        """
        if copy:
            out_data = self._ndarray.copy()
        else:
            out_data = self._ndarray

        meth = missing.clean_fill_method(method)
        missing.pad_or_backfill_inplace(
            out_data.T,
            method=meth,
            axis=0,
            limit=limit,
            limit_area=limit_area,
        )

        if not copy:
            return self
        return type(self)._simple_new(out_data, dtype=self.dtype)

    def interpolate(
        self,
        *,
        method: InterpolateOptions,
        axis: int,
        index: Index,
        limit,
        limit_direction,
        limit_area,
        copy: bool,
        **kwargs,
    ) -> Self:
        """
        See NDFrame.interpolate.__doc__.
        """
        # NB: we return type(self) even if copy=False
        if not self.dtype._is_numeric:
            raise TypeError(f"Cannot interpolate with {self.dtype} dtype")

        if not copy:
            out_data = self._ndarray
        else:
            out_data = self._ndarray.copy()

        # TODO: assert we have floating dtype?
        missing.interpolate_2d_inplace(
            out_data,
            method=method,
            axis=axis,
            index=index,
            limit=limit,
            limit_direction=limit_direction,
            limit_area=limit_area,
            **kwargs,
        )
        if not copy:
            return self
        return type(self)._simple_new(out_data, dtype=self.dtype)

    # ------------------------------------------------------------------------
    # Reductions

    def any(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_any((), {"out": out, "keepdims": keepdims})
        result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def all(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_all((), {"out": out, "keepdims": keepdims})
        result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def min(
        self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        nv.validate_min((), kwargs)
        result = nanops.nanmin(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(
        self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        nv.validate_max((), kwargs)
        result = nanops.nanmax(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def sum(
        self,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        min_count: int = 0,
        **kwargs,
    ) -> Scalar:
        nv.validate_sum((), kwargs)
        result = nanops.nansum(
            self._ndarray, axis=axis, skipna=skipna, min_count=min_count
        )
        return self._wrap_reduction_result(axis, result)

    def prod(
        self,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        min_count: int = 0,
        **kwargs,
    ) -> Scalar:
        nv.validate_prod((), kwargs)
        result = nanops.nanprod(
            self._ndarray, axis=axis, skipna=skipna, min_count=min_count
        )
        return self._wrap_reduction_result(axis, result)

    def mean(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims})
        result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def median(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        overwrite_input: bool = False,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_median(
            (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}
        )
        result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def std(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
        )
        result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def var(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var"
        )
        result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def sem(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem"
        )
        result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def kurt(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt"
        )
        result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def skew(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew"
        )
        result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    # ------------------------------------------------------------------------
    # Additional Methods

    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        mask = self.isna()
        if na_value is not lib.no_default and mask.any():
            result = self._ndarray.copy()
            result[mask] = na_value
        else:
            result = self._ndarray

        result = np.asarray(result, dtype=dtype)

        if copy and result is self._ndarray:
            result = result.copy()

        return result

    # ------------------------------------------------------------------------
    # Ops

    def __invert__(self) -> NumpyExtensionArray:
        return type(self)(~self._ndarray)

    def __neg__(self) -> NumpyExtensionArray:
        return type(self)(-self._ndarray)

    def __pos__(self) -> NumpyExtensionArray:
        return type(self)(+self._ndarray)

    def __abs__(self) -> NumpyExtensionArray:
        return type(self)(abs(self._ndarray))

    def _cmp_method(self, other, op):
        if isinstance(other, NumpyExtensionArray):
            other = other._ndarray

        other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
        pd_op = ops.get_array_op(op)
        other = ensure_wrapped_if_datetimelike(other)
        result = pd_op(self._ndarray, other)

        if op is divmod or op is ops.rdivmod:
            a, b = result
            if isinstance(a, np.ndarray):
                # for e.g. op vs TimedeltaArray, we may already
                # have an ExtensionArray, in which case we do not wrap
                return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b)
            return a, b

        if isinstance(result, np.ndarray):
            # for e.g. multiplication vs TimedeltaArray, we may already
            # have an ExtensionArray, in which case we do not wrap
            return self._wrap_ndarray_result(result)
        return result

    _arith_method = _cmp_method

    def _wrap_ndarray_result(self, result: np.ndarray):
        # If we have timedelta64[ns] result, return a TimedeltaArray instead
|
||||
# of a NumpyExtensionArray
|
||||
if result.dtype.kind == "m" and is_supported_dtype(result.dtype):
|
||||
from pandas.core.arrays import TimedeltaArray
|
||||
|
||||
return TimedeltaArray._simple_new(result, dtype=result.dtype)
|
||||
return type(self)(result)
|
||||
|
||||
def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
|
||||
# NEP 51: https://github.com/numpy/numpy/pull/22449
|
||||
if self.dtype.kind in "SU":
|
||||
return "'{}'".format
|
||||
elif self.dtype == "object":
|
||||
return repr
|
||||
else:
|
||||
return str
|
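
For orientation, a brief usage sketch of the reductions and conversion defined above. This is an illustration, not part of the committed file; it assumes a pandas build that exposes this class as pd.arrays.NumpyExtensionArray, and the exact reprs may vary by version:

    >>> import numpy as np
    >>> import pandas as pd
    >>> arr = pd.arrays.NumpyExtensionArray(np.array([1.0, np.nan, 3.0]))
    >>> arr.sum()                    # -> 4.0 (skipna=True by default)
    >>> arr.sum(skipna=False)        # -> nan
    >>> arr.to_numpy(na_value=0.0)   # -> array([1., 0., 3.]); copies only when
    ...                              #    NA replacement or copy=True requires it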
1331
lib/python3.11/site-packages/pandas/core/arrays/period.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,19 @@
from pandas.core.arrays.sparse.accessor import (
    SparseAccessor,
    SparseFrameAccessor,
)
from pandas.core.arrays.sparse.array import (
    BlockIndex,
    IntIndex,
    SparseArray,
    make_sparse_index,
)

__all__ = [
    "BlockIndex",
    "IntIndex",
    "make_sparse_index",
    "SparseAccessor",
    "SparseArray",
    "SparseFrameAccessor",
]
@ -0,0 +1,414 @@
"""Sparse accessor"""

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.dtypes import SparseDtype

from pandas.core.accessor import (
    PandasDelegate,
    delegate_names,
)
from pandas.core.arrays.sparse.array import SparseArray

if TYPE_CHECKING:
    from pandas import (
        DataFrame,
        Series,
    )


class BaseAccessor:
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None) -> None:
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        raise NotImplementedError


@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for sparse data in a Series.

    Examples
    --------
    >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
    >>> ser.sparse.density
    0.6
    >>> ser.sparse.sp_values
    array([2, 2, 2])
    """

    def _validate(self, data):
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name: str, *args, **kwargs):
        return getattr(self._parent.array, name)

    def _delegate_method(self, name: str, *args, **kwargs):
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            raise ValueError

    @classmethod
    def from_coo(cls, A, dense_index: bool = False) -> Series:
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        result = coo_to_sparse_series(A, dense_index=dense_index)
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self) -> Series:
        """
        Convert a Series from sparse values to dense.

        Returns
        -------
        Series:
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
            copy=False,
        )


class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 2, 0, 0],
    ...                    "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
    >>> df.sparse.density
    0.5
    """

    def _validate(self, data):
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3, dtype=float)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0    0    0
        1    0  1.0    0
        2    0    0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False).
        # Indices may already be sorted in scipy, in which case this adds
        # a small overhead.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self) -> DataFrame:
        """
        Convert a DataFrame with sparse values to dense.

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense() for k, v in self._parent.items()}
        return DataFrame(data, index=self._parent.index, columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        Returns
        -------
        scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.to_coo()
        <COOrdinate sparse matrix of dtype 'int64'
            with 2 stored elements and shape (4, 1)>
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            row = sp_arr.sp_index.indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.density
        0.5
        """
        tmp = np.mean([column.array.density for _, column in self._parent.items()])
        return tmp

    @staticmethod
    def _prep_index(data, index, columns):
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns
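
A short round-trip sketch tying the accessors above together. This is an illustration, not part of the committed file; it assumes scipy is installed, and exact reprs vary across scipy/pandas versions:

    >>> import scipy.sparse
    >>> import pandas as pd
    >>> mat = scipy.sparse.eye(3, dtype=float)
    >>> df = pd.DataFrame.sparse.from_spmatrix(mat)  # each column is a SparseArray
    >>> df.sparse.density      # -> 0.333... (3 stored values out of 9)
    >>> coo = df.sparse.to_coo()     # back to a scipy COO; requires fill_value == 0
    >>> dense = df.sparse.to_dense() # materialize plain dense float64 columns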
1945
lib/python3.11/site-packages/pandas/core/arrays/sparse/array.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,207 @@
"""
Interaction with scipy.sparse matrices.

Currently only includes to_coo helpers.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

from pandas._libs import lib

from pandas.core.dtypes.missing import notna

from pandas.core.algorithms import factorize
from pandas.core.indexes.api import MultiIndex
from pandas.core.series import Series

if TYPE_CHECKING:
    from collections.abc import Iterable

    import numpy as np
    import scipy.sparse

    from pandas._typing import (
        IndexLabel,
        npt,
    )


def _check_is_partition(parts: Iterable, whole: Iterable):
    whole = set(whole)
    parts = [set(x) for x in parts]
    if set.intersection(*parts) != set():
        raise ValueError("Is not a partition because intersection is not null.")
    if set.union(*parts) != whole:
        raise ValueError("Is not a partition because union is not the whole.")


def _levels_to_axis(
    ss,
    levels: tuple[int] | list[int],
    valid_ilocs: npt.NDArray[np.intp],
    sort_labels: bool = False,
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
    """
    For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
    where `ax_coords` are the coordinates along one of the two axes of the
    destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
    which correspond to these coordinates.

    Parameters
    ----------
    ss : Series
    levels : tuple/list
    valid_ilocs : numpy.ndarray
        Array of integer positions of valid values for the sparse matrix in ss.
    sort_labels : bool, default False
        Sort the axis labels before forming the sparse matrix. When `levels`
        refers to a single level, set to True for a faster execution.

    Returns
    -------
    ax_coords : numpy.ndarray (axis coordinates)
    ax_labels : list (axis labels)
    """
    # Since the labels are sorted in `Index.levels`, when we wish to sort and
    # there is only one level of the MultiIndex for this axis, the desired
    # output can be obtained in the following simpler, more efficient way.
    if sort_labels and len(levels) == 1:
        ax_coords = ss.index.codes[levels[0]][valid_ilocs]
        ax_labels = ss.index.levels[levels[0]]

    else:
        levels_values = lib.fast_zip(
            [ss.index.get_level_values(lvl).to_numpy() for lvl in levels]
        )
        codes, ax_labels = factorize(levels_values, sort=sort_labels)
        ax_coords = codes[valid_ilocs]

    ax_labels = ax_labels.tolist()
    return ax_coords, ax_labels


def _to_ijv(
    ss,
    row_levels: tuple[int] | list[int] = (0,),
    column_levels: tuple[int] | list[int] = (1,),
    sort_labels: bool = False,
) -> tuple[
    np.ndarray,
    npt.NDArray[np.intp],
    npt.NDArray[np.intp],
    list[IndexLabel],
    list[IndexLabel],
]:
    """
    For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
    jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
    constructor, and ilabels and jlabels are the row and column labels
    respectively.

    Parameters
    ----------
    ss : Series
    row_levels : tuple/list
    column_levels : tuple/list
    sort_labels : bool, default False
        Sort the row and column labels before forming the sparse matrix.
        When `row_levels` and/or `column_levels` refer to a single level,
        set to `True` for a faster execution.

    Returns
    -------
    values : numpy.ndarray
        Valid values to populate a sparse matrix, extracted from
        ss.
    i_coords : numpy.ndarray (row coordinates of the values)
    j_coords : numpy.ndarray (column coordinates of the values)
    i_labels : list (row labels)
    j_labels : list (column labels)
    """
    # index and column levels must be a partition of the index
    _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
    # From the sparse Series, get the integer indices and data for valid sparse
    # entries.
    sp_vals = ss.array.sp_values
    na_mask = notna(sp_vals)
    values = sp_vals[na_mask]
    valid_ilocs = ss.array.sp_index.indices[na_mask]

    i_coords, i_labels = _levels_to_axis(
        ss, row_levels, valid_ilocs, sort_labels=sort_labels
    )

    j_coords, j_labels = _levels_to_axis(
        ss, column_levels, valid_ilocs, sort_labels=sort_labels
    )

    return values, i_coords, j_coords, i_labels, j_labels


def sparse_series_to_coo(
    ss: Series,
    row_levels: Iterable[int] = (0,),
    column_levels: Iterable[int] = (1,),
    sort_labels: bool = False,
) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
    """
    Convert a sparse Series to a scipy.sparse.coo_matrix using index
    levels row_levels, column_levels as the row and column
    labels respectively. Returns the sparse_matrix, row and column labels.
    """
    import scipy.sparse

    if ss.index.nlevels < 2:
        raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
    if not ss.index.is_unique:
        raise ValueError(
            "Duplicate index entries are not allowed in to_coo transformation."
        )

    # to keep things simple, only rely on integer indexing (not labels)
    row_levels = [ss.index._get_level_number(x) for x in row_levels]
    column_levels = [ss.index._get_level_number(x) for x in column_levels]

    v, i, j, rows, columns = _to_ijv(
        ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
    )
    sparse_matrix = scipy.sparse.coo_matrix(
        (v, (i, j)), shape=(len(rows), len(columns))
    )
    return sparse_matrix, rows, columns


def coo_to_sparse_series(
    A: scipy.sparse.coo_matrix, dense_index: bool = False
) -> Series:
    """
    Convert a scipy.sparse.coo_matrix to a Series with type sparse.

    Parameters
    ----------
    A : scipy.sparse.coo_matrix
    dense_index : bool, default False

    Returns
    -------
    Series

    Raises
    ------
    TypeError if A is not a coo_matrix
    """
    from pandas import SparseDtype

    try:
        ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)), copy=False)
    except AttributeError as err:
        raise TypeError(
            f"Expected coo_matrix. Got {type(A).__name__} instead."
        ) from err
    ser = ser.sort_index()
    ser = ser.astype(SparseDtype(ser.dtype))
    if dense_index:
        ind = MultiIndex.from_product([A.row, A.col])
        ser = ser.reindex(ind)
    return ser
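
To make the partition requirement enforced by _check_is_partition concrete, a small sketch via the public Series.sparse.to_coo API (the level names "r" and "c" are chosen purely for illustration):

    >>> import pandas as pd
    >>> idx = pd.MultiIndex.from_tuples([("a", 0), ("b", 1)], names=["r", "c"])
    >>> s = pd.Series([1.0, 2.0], index=idx, dtype="Sparse[float64]")
    >>> A, rows, cols = s.sparse.to_coo(row_levels=["r"], column_levels=["c"])
    >>> # row_levels and column_levels must split the index levels exactly;
    >>> # e.g. row_levels=["r", "c"] with column_levels=["c"] overlaps and raises
    >>> # ValueError("Is not a partition because intersection is not null.")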
1131
lib/python3.11/site-packages/pandas/core/arrays/string_.py
Normal file
File diff suppressed because it is too large
495
lib/python3.11/site-packages/pandas/core/arrays/string_arrow.py
Normal file
@ -0,0 +1,495 @@
from __future__ import annotations

import operator
import re
from typing import (
    TYPE_CHECKING,
    Callable,
    Union,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas.compat import (
    pa_version_under10p1,
    pa_version_under13p0,
    pa_version_under16p0,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    is_scalar,
    pandas_dtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import Float64Dtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.numeric import NumericDtype
from pandas.core.arrays.string_ import (
    BaseStringArray,
    StringDtype,
)
from pandas.core.strings.object_array import ObjectStringArrayMixin

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc


if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        ArrayLike,
        Dtype,
        Self,
        npt,
    )

    from pandas import Series


ArrowStringScalarOrNAT = Union[str, libmissing.NAType]


def _chk_pyarrow_available() -> None:
    if pa_version_under10p1:
        msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
        raise ImportError(msg)


def _is_string_view(typ):
    return not pa_version_under16p0 and pa.types.is_string_view(typ)


# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
# ObjectStringArrayMixin because we want to have the object-dtype based methods as
# fallback for the ones that pyarrow doesn't yet support


class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringArray):
    """
    Extension array for string data in a ``pyarrow.ChunkedArray``.

    .. warning::

       ArrowStringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray
        The array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating an ArrowStringArray.
    Series.str
        The string methods are available on Series backed by
        an ArrowStringArray.

    Notes
    -----
    ArrowStringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
    <ArrowStringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string
    """

    # error: Incompatible types in assignment (expression has type "StringDtype",
    # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
    _dtype: StringDtype  # type: ignore[assignment]
    _storage = "pyarrow"
    _na_value: libmissing.NAType | float = libmissing.NA

    def __init__(self, values) -> None:
        _chk_pyarrow_available()
        if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
            pa.types.is_string(values.type)
            or _is_string_view(values.type)
            or (
                pa.types.is_dictionary(values.type)
                and (
                    pa.types.is_string(values.type.value_type)
                    or pa.types.is_large_string(values.type.value_type)
                    or _is_string_view(values.type.value_type)
                )
            )
        ):
            values = pc.cast(values, pa.large_string())

        super().__init__(values)
        self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)

        if not pa.types.is_large_string(self._pa_array.type):
            raise ValueError(
                "ArrowStringArray requires a PyArrow (chunked) array of "
                "large_string type"
            )

    @classmethod
    def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
        pa_scalar = super()._box_pa_scalar(value, pa_type)
        if pa.types.is_string(pa_scalar.type) and pa_type is None:
            pa_scalar = pc.cast(pa_scalar, pa.large_string())
        return pa_scalar

    @classmethod
    def _box_pa_array(
        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
    ) -> pa.Array | pa.ChunkedArray:
        pa_array = super()._box_pa_array(value, pa_type)
        if pa.types.is_string(pa_array.type) and pa_type is None:
            pa_array = pc.cast(pa_array, pa.large_string())
        return pa_array

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._pa_array)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        from pandas.core.arrays.masked import BaseMaskedArray

        _chk_pyarrow_available()

        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype in ensure_string_array and
            # numerical issues with Float32Dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            return cls(pa.array(result, mask=na_values, type=pa.large_string()))
        elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
            return cls(pc.cast(scalars, pa.large_string()))

        # convert non-na-likes to str
        result = lib.ensure_string_array(scalars, copy=copy)
        return cls(pa.array(result, type=pa.large_string(), from_pandas=True))

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, dtype: Dtype | None = None, copy: bool = False
    ):
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @property
    def dtype(self) -> StringDtype:  # type: ignore[override]
        """
        An instance of 'string[pyarrow]'.
        """
        return self._dtype

    def insert(self, loc: int, item) -> ArrowStringArray:
        if self.dtype.na_value is np.nan and item is np.nan:
            item = libmissing.NA
        if not isinstance(item, str) and item is not libmissing.NA:
            raise TypeError(
                f"Invalid value '{item}' for dtype 'str'. Value should be a "
                f"string or missing value, got '{type(item).__name__}' instead."
            )
        return super().insert(loc, item)

    def _convert_bool_result(self, values, na=lib.no_default, method_name=None):
        if na is not lib.no_default and not isna(na) and not isinstance(na, bool):
            # GH#59561
            warnings.warn(
                f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated "
                "and will raise in a future version.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            na = bool(na)

        if self.dtype.na_value is np.nan:
            if na is lib.no_default or isna(na):
                # NaN propagates as False
                values = values.fill_null(False)
            else:
                values = values.fill_null(na)
            return values.to_numpy()
        else:
            if na is not lib.no_default and not isna(
                na
            ):  # pyright: ignore [reportGeneralTypeIssues]
                values = values.fill_null(na)
        return BooleanDtype().__from_arrow__(values)

    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        if is_scalar(value):
            if isna(value):
                value = None
            elif not isinstance(value, str):
                raise TypeError(
                    f"Invalid value '{value}' for dtype 'str'. Value should be a "
                    f"string or missing value, got '{type(value).__name__}' instead."
                )
        else:
            value = np.array(value, dtype=object, copy=True)
            value[isna(value)] = None
            for v in value:
                if not (v is None or isinstance(v, str)):
                    raise TypeError(
                        "Invalid value for dtype 'str'. Value should be a "
                        "string or missing value (or array of those)."
                    )
        return super()._maybe_convert_setitem_value(value)

    def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
        value_set = [
            pa_scalar.as_py()
            for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
            if pa_scalar.type in (pa.string(), pa.null(), pa.large_string())
        ]

        # short-circuit to return all False array.
        if not len(value_set):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(
            self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type)
        )
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert
        # nulls to False
        return np.array(result, dtype=np.bool_)

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif isinstance(dtype, NumericDtype):
            data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
            return dtype.__from_arrow__(data)
        elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
            return self.to_numpy(dtype=dtype, na_value=np.nan)

        return super().astype(dtype, copy=copy)

    @property
    def _data(self):
        # dask accesses ._data directly
        warnings.warn(
            f"{type(self).__name__}._data is deprecated and will be removed "
            "in a future version, use ._pa_array instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._pa_array

    # ------------------------------------------------------------------------
    # String methods interface

    _str_isalnum = ArrowStringArrayMixin._str_isalnum
    _str_isalpha = ArrowStringArrayMixin._str_isalpha
    _str_isdecimal = ArrowStringArrayMixin._str_isdecimal
    _str_isdigit = ArrowStringArrayMixin._str_isdigit
    _str_islower = ArrowStringArrayMixin._str_islower
    _str_isnumeric = ArrowStringArrayMixin._str_isnumeric
    _str_isspace = ArrowStringArrayMixin._str_isspace
    _str_istitle = ArrowStringArrayMixin._str_istitle
    _str_isupper = ArrowStringArrayMixin._str_isupper

    _str_map = BaseStringArray._str_map
    _str_startswith = ArrowStringArrayMixin._str_startswith
    _str_endswith = ArrowStringArrayMixin._str_endswith
    _str_pad = ArrowStringArrayMixin._str_pad
    _str_match = ArrowStringArrayMixin._str_match
    _str_fullmatch = ArrowStringArrayMixin._str_fullmatch
    _str_lower = ArrowStringArrayMixin._str_lower
    _str_upper = ArrowStringArrayMixin._str_upper
    _str_strip = ArrowStringArrayMixin._str_strip
    _str_lstrip = ArrowStringArrayMixin._str_lstrip
    _str_rstrip = ArrowStringArrayMixin._str_rstrip
    _str_removesuffix = ArrowStringArrayMixin._str_removesuffix
    _str_get = ArrowStringArrayMixin._str_get
    _str_capitalize = ArrowStringArrayMixin._str_capitalize
    _str_title = ArrowStringArrayMixin._str_title
    _str_swapcase = ArrowStringArrayMixin._str_swapcase
    _str_slice_replace = ArrowStringArrayMixin._str_slice_replace
    _str_len = ArrowStringArrayMixin._str_len
    _str_slice = ArrowStringArrayMixin._str_slice

    def _str_contains(
        self,
        pat,
        case: bool = True,
        flags: int = 0,
        na=lib.no_default,
        regex: bool = True,
    ):
        if flags:
            return super()._str_contains(pat, case, flags, na, regex)
        if isinstance(pat, re.Pattern):
            pat = pat.pattern

        return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
            return super()._str_replace(pat, repl, n, case, flags, regex)

        return ArrowStringArrayMixin._str_replace(
            self, pat, repl, n, case, flags, regex
        )

    def _str_repeat(self, repeats: int | Sequence[int]):
        if not isinstance(repeats, int):
            return super()._str_repeat(repeats)
        else:
            return ArrowExtensionArray._str_repeat(self, repeats=repeats)

    def _str_removeprefix(self, prefix: str):
        if not pa_version_under13p0:
            return ArrowStringArrayMixin._str_removeprefix(self, prefix)
        return super()._str_removeprefix(prefix)

    def _str_count(self, pat: str, flags: int = 0):
        if flags:
            return super()._str_count(pat, flags)
        result = pc.count_substring_regex(self._pa_array, pat)
        return self._convert_int_result(result)

    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
        if (
            pa_version_under13p0
            and not (start != 0 and end is not None)
            and not (start == 0 and end is None)
        ):
            # GH#59562
            return super()._str_find(sub, start, end)
        return ArrowStringArrayMixin._str_find(self, sub, start, end)

    def _str_get_dummies(self, sep: str = "|"):
        dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)
        if len(labels) == 0:
            return np.empty(shape=(0, 0), dtype=np.int64), labels
        dummies = np.vstack(dummies_pa.to_numpy())
        return dummies.astype(np.int64, copy=False), labels

    def _convert_int_result(self, result):
        if self.dtype.na_value is np.nan:
            if isinstance(result, pa.Array):
                result = result.to_numpy(zero_copy_only=False)
            else:
                result = result.to_numpy()
            if result.dtype == np.int32:
                result = result.astype(np.int64)
            return result

        return Int64Dtype().__from_arrow__(result)

    def _convert_rank_result(self, result):
        if self.dtype.na_value is np.nan:
            if isinstance(result, pa.Array):
                result = result.to_numpy(zero_copy_only=False)
            else:
                result = result.to_numpy()
            return result.astype("float64", copy=False)

        return Float64Dtype().__from_arrow__(result)

    def _reduce(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        if self.dtype.na_value is np.nan and name in ["any", "all"]:
            if not skipna:
                nas = pc.is_null(self._pa_array)
                arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
            else:
                arr = pc.not_equal(self._pa_array, "")
            result = ArrowExtensionArray(arr)._reduce(
                name, skipna=skipna, keepdims=keepdims, **kwargs
            )
            if keepdims:
                # ArrowExtensionArray will return a length-1 bool[pyarrow] array
                return result.astype(np.bool_)
            return result

        if name in ("min", "max", "sum", "argmin", "argmax"):
            result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
        else:
            raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

        if name in ("argmin", "argmax") and isinstance(result, pa.Array):
            return self._convert_int_result(result)
        elif isinstance(result, pa.Array):
            return type(self)(result)
        else:
            return result

    def value_counts(self, dropna: bool = True) -> Series:
        result = super().value_counts(dropna=dropna)
        if self.dtype.na_value is np.nan:
            res_values = result._values.to_numpy()
            return result._constructor(
                res_values, index=result.index, name=result.name, copy=False
            )
        return result

    def _cmp_method(self, other, op):
        if (
            isinstance(other, (BaseStringArray, ArrowExtensionArray))
            and self.dtype.na_value is not libmissing.NA
            and other.dtype.na_value is libmissing.NA
        ):
            # NA has priority over NaN semantics
            return NotImplemented

        result = super()._cmp_method(other, op)
        if self.dtype.na_value is np.nan:
            if op == operator.ne:
                return result.to_numpy(np.bool_, na_value=True)
            else:
                return result.to_numpy(np.bool_, na_value=False)
        return result

    def __pos__(self) -> Self:
        raise TypeError(f"bad operand type for unary +: '{self.dtype}'")


class ArrowStringArrayNumpySemantics(ArrowStringArray):
    _na_value = np.nan
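
A sketch of the mixin dispatch above in practice. This is an illustration, not part of the committed file; it assumes pyarrow>=10.0.1 is installed, as _chk_pyarrow_available requires:

    >>> import pandas as pd
    >>> s = pd.Series(
    ...     ["This is", "some text", None, "data."], dtype="string[pyarrow]"
    ... )
    >>> s.str.upper()          # runs pc.utf8_upper via ArrowStringArrayMixin
    >>> s.str.len()            # pc.utf8_length, wrapped by _convert_int_result
    >>> s.str.contains("text") # boolean result via _convert_bool_result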
1185
lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py
Normal file
File diff suppressed because it is too large