This commit is contained in:
2025-09-07 22:09:54 +02:00
parent e1b817252c
commit 2fc0d000b6
7796 changed files with 2159515 additions and 933 deletions

View File

@@ -0,0 +1,134 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.fixture
def data():
    """Provide a 100-element nullable boolean array that includes two missing entries."""
    pair = [True, False]
    values = pair * 4 + [np.nan] + pair * 44 + [np.nan] + pair
    return pd.array(values, dtype="boolean")
@pytest.fixture
def left_array():
    """Provide the left operand: three True, three False, then three missing values."""
    values = [True] * 3 + [False] * 3 + [None] * 3
    return pd.array(values, dtype="boolean")
@pytest.fixture
def right_array():
    """Provide the right operand: the triple (True, False, missing) repeated three times."""
    values = [True, False, None] * 3
    return pd.array(values, dtype="boolean")
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
    "opname, exp",
    [
        ("add", [True, True, None, True, False, None, None, None, None]),
        ("mul", [True, False, None, False, False, None, None, None, None]),
    ],
    ids=["add", "mul"],
)
def test_add_mul(left_array, right_array, opname, exp):
    """``+`` and ``*`` on two boolean arrays give the parametrized boolean result."""
    binary_op = getattr(operator, opname)
    tm.assert_extension_array_equal(
        binary_op(left_array, right_array),
        pd.array(exp, dtype="boolean"),
    )
def test_sub(left_array, right_array):
    """Subtracting two boolean arrays raises ``TypeError`` with numpy's message."""
    # The wording differs between numpy versions ("deprecated" vs "not supported").
    head = r"numpy boolean subtract, the `-` operator, is (?:deprecated|not supported), "
    tail = r"use the bitwise_xor, the `\^` operator, or the logical_xor function instead\."
    with pytest.raises(TypeError, match=head + tail):
        left_array - right_array
def test_div(left_array, right_array):
    """True division is rejected for masked boolean arrays, as for plain bool Series."""
    pattern = "operator '.*' not implemented for bool dtypes"

    # check that we are matching the non-masked Series behavior
    plain_left = pd.Series(left_array._data)
    plain_right = pd.Series(right_array._data)
    with pytest.raises(NotImplementedError, match=pattern):
        plain_left / plain_right

    with pytest.raises(NotImplementedError, match=pattern):
        left_array / right_array
@pytest.mark.parametrize(
    "opname",
    [
        "floordiv",
        "mod",
        "pow",
    ],
)
def test_op_int8(left_array, right_array, opname):
    """Check ``//``, ``%`` and ``**`` between two boolean arrays.

    Only ``mod`` is expected to succeed; its result is compared with the same
    operation applied after casting both operands to ``Int8``. ``floordiv``
    and ``pow`` are expected to raise ``NotImplementedError``.
    """
    op = getattr(operator, opname)

    if opname != "mod":
        msg = "operator '.*' not implemented for bool dtypes"
        with pytest.raises(NotImplementedError, match=msg):
            # The value is never used; only the raised exception matters.
            op(left_array, right_array)
        return

    result = op(left_array, right_array)
    expected = op(left_array.astype("Int8"), right_array.astype("Int8"))
    tm.assert_extension_array_equal(result, expected)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators):
    """Arithmetic between a boolean Series and incompatible operands raises ``TypeError``."""
    op_name = all_arithmetic_operators
    ser = pd.Series(data)
    bound_op = getattr(ser, op_name)

    # invalid scalars
    string_scalar_patterns = [
        "did not contain a loop with signature matching types",
        "BooleanArray cannot perform the operation",
        "not supported for the input types, and the inputs could not be safely coerced "
        "to any supported types according to the casting rule ''safe''",
        "not supported for dtype",
    ]
    with pytest.raises(TypeError, match="|".join(string_scalar_patterns)):
        bound_op("foo")

    timestamp_patterns = [
        r"unsupported operand type\(s\) for",
        "Concatenation operation is not implemented for NumPy arrays",
        "has no kernel",
        "not supported for dtype",
    ]
    with pytest.raises(TypeError, match="|".join(timestamp_patterns)):
        bound_op(pd.Timestamp("20180101"))

    # invalid array-likes
    if op_name in ("__mul__", "__rmul__"):
        # TODO(extension) numpy's mul with object array sees booleans as numbers
        return

    string_series_patterns = [
        r"unsupported operand type\(s\) for",
        "can only concatenate str",
        "not all arguments converted during string formatting",
        "has no kernel",
        "not implemented",
        "not supported for dtype",
    ]
    with pytest.raises(TypeError, match="|".join(string_series_patterns)):
        bound_op(pd.Series("foo", index=ser.index))

View File

@@ -0,0 +1,59 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_astype(using_infer_string):
    """``astype`` to numpy dtypes, with and without missing values."""
    # with missing values
    with_na = pd.array([True, False, None], dtype="boolean")

    with pytest.raises(ValueError, match="cannot convert NA to integer"):
        with_na.astype("int64")

    with pytest.raises(ValueError, match="cannot convert float NaN to"):
        with_na.astype("bool")

    as_float = with_na.astype("float64")
    tm.assert_numpy_array_equal(
        as_float, np.array([1, 0, np.nan], dtype="float64")
    )

    as_str = with_na.astype("str")
    if not using_infer_string:
        expected = np.array(["True", "False", "<NA>"], dtype=f"{tm.ENDIAN}U5")
        tm.assert_numpy_array_equal(as_str, expected)
    else:
        expected = pd.array(
            ["True", "False", None], dtype=pd.StringDtype(na_value=np.nan)
        )
        tm.assert_extension_array_equal(as_str, expected)

    # no missing values
    complete = pd.array([True, False, True], dtype="boolean")

    as_int = complete.astype("int64")
    tm.assert_numpy_array_equal(as_int, np.array([1, 0, 1], dtype="int64"))

    as_bool = complete.astype("bool")
    tm.assert_numpy_array_equal(
        as_bool, np.array([True, False, True], dtype="bool")
    )
def test_astype_to_boolean_array():
    """Casting to the boolean dtype, by name or by instance, yields an equal array."""
    original = pd.array([True, False, None], dtype="boolean")

    for target in ("boolean", pd.BooleanDtype()):
        converted = original.astype(target)
        tm.assert_extension_array_equal(converted, original)
def test_astype_to_integer_array():
    """Casting to ``Int64`` maps True/False to 1/0 and keeps the missing value."""
    source = pd.array([True, False, None], dtype="boolean")
    tm.assert_extension_array_equal(
        source.astype("Int64"),
        pd.array([1, 0, None], dtype="Int64"),
    )

View File

@@ -0,0 +1,60 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.tests.arrays.masked_shared import ComparisonOps
@pytest.fixture
def data():
    """Provide a 100-element nullable boolean array that includes two missing entries."""
    pair = [True, False]
    values = pair * 4 + [np.nan] + pair * 44 + [np.nan] + pair
    return pd.array(values, dtype="boolean")
@pytest.fixture
def dtype():
    """Provide a fresh ``BooleanDtype`` instance."""
    boolean_dtype = pd.BooleanDtype()
    return boolean_dtype
class TestComparisonOps(ComparisonOps):
    """Comparison-operator tests for the nullable boolean array."""

    def test_compare_scalar(self, data, comparison_op):
        """Compare the fixture array against the scalar ``True``."""
        self._compare_other(data, comparison_op, True)

    def test_compare_array(self, data, comparison_op):
        """Compare against all-True operands given as array, ndarray and Series."""
        all_true = [True] * len(data)
        operands = (
            pd.array(all_true, dtype="boolean"),
            np.array(all_true),
            pd.Series(all_true),
        )
        for other in operands:
            self._compare_other(data, comparison_op, other)

    @pytest.mark.parametrize("other", [True, False, pd.NA])
    def test_scalar(self, other, comparison_op, dtype):
        """Run the shared scalar test with boolean-specific scalars."""
        super().test_scalar(other, comparison_op, dtype)

    def test_array(self, comparison_op):
        """Array-vs-array comparison propagates the union of both masks."""
        left_values = [True] * 3 + [False] * 3 + [None] * 3
        right_values = [True, False, None] * 3
        op = comparison_op
        a = pd.array(left_values, dtype="boolean")
        b = pd.array(right_values, dtype="boolean")

        result = op(a, b)

        expected = BooleanArray(op(a._data, b._data), a._mask | b._mask)
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        result[0] = None
        tm.assert_extension_array_equal(a, pd.array(left_values, dtype="boolean"))
        tm.assert_extension_array_equal(b, pd.array(right_values, dtype="boolean"))

View File

@@ -0,0 +1,325 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.core.arrays.boolean import coerce_to_array
def test_boolean_array_constructor():
    """``BooleanArray(values, mask)`` builds the array and validates both inputs."""
    values = np.array([True, False, True, False], dtype="bool")
    mask = np.array([False, False, False, True], dtype="bool")

    built = BooleanArray(values, mask)
    tm.assert_extension_array_equal(
        built, pd.array([True, False, True, None], dtype="boolean")
    )

    values_msg = "values should be boolean numpy array"
    mask_msg = "mask should be boolean numpy array"
    shape_msg = "values.shape must match mask.shape"

    # (expected exception, message pattern, constructor arguments)
    rejected = [
        (TypeError, values_msg, (values.tolist(), mask)),
        (TypeError, mask_msg, (values, mask.tolist())),
        (TypeError, values_msg, (values.astype(int), mask)),
        (TypeError, mask_msg, (values, None)),
        (ValueError, shape_msg, (values.reshape(1, -1), mask)),
        (ValueError, shape_msg, (values, mask.reshape(1, -1))),
    ]
    for exc_type, pattern, args in rejected:
        with pytest.raises(exc_type, match=pattern):
            BooleanArray(*args)
def test_boolean_array_constructor_copy():
    """The constructor shares its inputs by default and copies them with ``copy=True``."""
    values = np.array([True, False, True, False], dtype="bool")
    mask = np.array([False, False, False, True], dtype="bool")

    shared = BooleanArray(values, mask)
    assert shared._data is values
    assert shared._mask is mask

    copied = BooleanArray(values, mask, copy=True)
    assert copied._data is not values
    assert copied._mask is not mask
def test_to_boolean_array():
    """``pd.array(..., dtype="boolean")`` accepts lists, bool ndarrays and object ndarrays."""
    complete = BooleanArray(
        np.array([True, False, True]), np.array([False, False, False])
    )
    complete_sources = (
        [True, False, True],
        np.array([True, False, True]),
        np.array([True, False, True], dtype=object),
    )
    for source in complete_sources:
        tm.assert_extension_array_equal(pd.array(source, dtype="boolean"), complete)

    # with missing values
    with_na = BooleanArray(
        np.array([True, False, True]), np.array([False, False, True])
    )
    na_sources = (
        [True, False, None],
        np.array([True, False, None], dtype=object),
    )
    for source in na_sources:
        tm.assert_extension_array_equal(pd.array(source, dtype="boolean"), with_na)
def test_to_boolean_array_all_none():
    """An all-``None`` input becomes a fully masked boolean array."""
    fully_masked = BooleanArray(
        np.array([True, True, True]), np.array([True, True, True])
    )
    for source in ([None, None, None], np.array([None, None, None], dtype=object)):
        tm.assert_extension_array_equal(
            pd.array(source, dtype="boolean"), fully_masked
        )
@pytest.mark.parametrize(
    "a, b",
    [
        ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]),
        ([True, np.nan], [True, None]),
        ([True, pd.NA], [True, None]),
        ([np.nan, np.nan], [None, None]),
        (np.array([np.nan, np.nan], dtype=float), [None, None]),
    ],
)
def test_to_boolean_array_missing_indicators(a, b):
    """``None``, ``np.nan`` and ``pd.NA`` are all treated as missing on construction."""
    tm.assert_extension_array_equal(
        pd.array(a, dtype="boolean"),
        pd.array(b, dtype="boolean"),
    )
@pytest.mark.parametrize(
    "values",
    [
        ["foo", "bar"],
        ["1", "2"],
        # "foo",
        [1, 2],
        [1.0, 2.0],
        pd.date_range("20130101", periods=2),
        np.array(["foo"]),
        np.array([1, 2]),
        np.array([1.0, 2.0]),
        [np.nan, {"a": 1}],
    ],
)
def test_to_boolean_array_error(values):
    """Inputs that are not bool-like are rejected with ``TypeError``."""
    # error in converting existing arrays to BooleanArray
    with pytest.raises(TypeError, match="Need to pass bool-like value"):
        pd.array(values, dtype="boolean")
def test_to_boolean_array_from_integer_array():
    """ndarrays holding only 0/1 (optionally with ``None``) convert to booleans."""
    cases = [
        (np.array([1, 0, 1, 0]), [True, False, True, False]),
        # with missing values
        (np.array([1, 0, 1, None]), [True, False, True, None]),
    ]
    for source, wanted in cases:
        tm.assert_extension_array_equal(
            pd.array(source, dtype="boolean"),
            pd.array(wanted, dtype="boolean"),
        )
def test_to_boolean_array_from_float_array():
    """Float ndarrays holding only 0.0/1.0 (optionally NaN) convert to booleans."""
    cases = [
        (np.array([1.0, 0.0, 1.0, 0.0]), [True, False, True, False]),
        # with missing values
        (np.array([1.0, 0.0, 1.0, np.nan]), [True, False, True, None]),
    ]
    for source, wanted in cases:
        tm.assert_extension_array_equal(
            pd.array(source, dtype="boolean"),
            pd.array(wanted, dtype="boolean"),
        )
def test_to_boolean_array_integer_like():
    """Plain lists of 0's and 1's (optionally with ``None``) convert to booleans."""
    cases = [
        # integers of 0's and 1's
        ([1, 0, 1, 0], [True, False, True, False]),
        # with missing values
        ([1, 0, 1, None], [True, False, True, None]),
    ]
    for source, wanted in cases:
        tm.assert_extension_array_equal(
            pd.array(source, dtype="boolean"),
            pd.array(wanted, dtype="boolean"),
        )
def test_coerce_to_array():
    """``coerce_to_array`` passes inputs through, merges masks and checks shapes."""
    # TODO this is currently not public API
    bools = np.array([True, False, True, False], dtype="bool")
    na_mask = np.array([False, False, False, True], dtype="bool")

    # default: the very same ndarray objects come back
    built = BooleanArray(*coerce_to_array(bools, mask=na_mask))
    tm.assert_extension_array_equal(built, BooleanArray(bools, na_mask))
    assert built._data is bools
    assert built._mask is na_mask

    # copy=True: equal content, different objects
    built = BooleanArray(*coerce_to_array(bools, mask=na_mask, copy=True))
    tm.assert_extension_array_equal(built, BooleanArray(bools, na_mask))
    assert built._data is not bools
    assert built._mask is not na_mask

    # mixed missing from values and mask
    mixed = [True, False, None, False]
    na_mask = np.array([False, False, False, True], dtype="bool")
    merged = BooleanArray(
        np.array([True, False, True, True]), np.array([False, False, True, True])
    )

    built = BooleanArray(*coerce_to_array(mixed, mask=na_mask))
    tm.assert_extension_array_equal(built, merged)

    built = BooleanArray(
        *coerce_to_array(np.array(mixed, dtype=object), mask=na_mask)
    )
    tm.assert_extension_array_equal(built, merged)

    built = BooleanArray(*coerce_to_array(mixed, mask=na_mask.tolist()))
    tm.assert_extension_array_equal(built, merged)

    # raise errors for wrong dimension
    bools = np.array([True, False, True, False], dtype="bool")
    na_mask = np.array([False, False, False, True], dtype="bool")
    shape_msg = "values.shape and mask.shape must match"

    # passing 2D values is OK as long as no mask
    coerce_to_array(bools.reshape(1, -1))

    with pytest.raises(ValueError, match=shape_msg):
        coerce_to_array(bools.reshape(1, -1), mask=na_mask)

    with pytest.raises(ValueError, match=shape_msg):
        coerce_to_array(bools, mask=na_mask.reshape(1, -1))
def test_coerce_to_array_from_boolean_array():
    """``coerce_to_array`` accepts a ``BooleanArray`` but not together with a mask."""
    # passing BooleanArray to coerce_to_array
    bools = np.array([True, False, True, False], dtype="bool")
    na_mask = np.array([False, False, False, True], dtype="bool")
    source = BooleanArray(bools, na_mask)

    # no copy
    rebuilt = BooleanArray(*coerce_to_array(source))
    tm.assert_extension_array_equal(rebuilt, source)
    assert rebuilt._data is source._data
    assert rebuilt._mask is source._mask

    # here the copy is requested from the BooleanArray constructor
    rebuilt = BooleanArray(*coerce_to_array(source), copy=True)
    tm.assert_extension_array_equal(rebuilt, source)
    assert rebuilt._data is not source._data
    assert rebuilt._mask is not source._mask

    with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"):
        coerce_to_array(source, mask=na_mask)
def test_coerce_to_numpy_array():
    """``np.array(arr)`` gives object dtype with NA present and bool dtype otherwise."""
    with_na = pd.array([True, False, None], dtype="boolean")
    complete = pd.array([True, False, True], dtype="boolean")

    # with missing values -> object dtype
    tm.assert_numpy_array_equal(
        np.array(with_na), np.array([True, False, pd.NA], dtype="object")
    )

    # no missing values -> bool dtype
    tm.assert_numpy_array_equal(
        np.array(complete), np.array([True, False, True], dtype="bool")
    )

    # force bool dtype
    tm.assert_numpy_array_equal(
        np.array(complete, dtype="bool"),
        np.array([True, False, True], dtype="bool"),
    )

    # with missing values will raise error
    with_na = pd.array([True, False, None], dtype="boolean")
    pattern = (
        "cannot convert to 'bool'-dtype NumPy array with missing values. "
        "Specify an appropriate 'na_value' for this dtype."
    )
    with pytest.raises(ValueError, match=pattern):
        np.array(with_na, dtype="bool")
def test_to_boolean_array_from_strings():
    """String spellings of true/false/1/0 are parsed; NaN becomes a masked entry."""
    strings = np.array(
        ["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object
    )
    parsed = BooleanArray._from_sequence_of_strings(strings, dtype="boolean")

    wanted_values = np.array([True, False, True, True, False, False, False])
    wanted_mask = np.array([False, False, False, False, False, False, True])
    tm.assert_extension_array_equal(parsed, BooleanArray(wanted_values, wanted_mask))
def test_to_boolean_array_from_strings_invalid_string():
    """A string that is not a recognised boolean spelling raises ``ValueError``."""
    unparsable = ["donkey"]
    with pytest.raises(ValueError, match="cannot be cast"):
        BooleanArray._from_sequence_of_strings(unparsable, dtype="boolean")
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
    """Exercise ``to_numpy`` on a boolean Series/array.

    Covers the default dtype, explicit dtypes, and ``na_value`` handling.
    ``box`` selects whether the data is wrapped in a ``pd.Series`` (True) or
    built with ``pd.array`` (False).
    """
    con = pd.Series if box else pd.array

    # default dtype: bool when nothing is missing ...
    arr = con([True, False, True], dtype="boolean")
    result = arr.to_numpy()
    expected = np.array([True, False, True], dtype="bool")
    tm.assert_numpy_array_equal(result, expected)

    # ... and object (holding pd.NA) when there are missing values
    arr = con([True, False, None], dtype="boolean")
    result = arr.to_numpy()
    expected = np.array([True, False, pd.NA], dtype="object")
    tm.assert_numpy_array_equal(result, expected)

    arr = con([True, False, None], dtype="boolean")
    result = arr.to_numpy(dtype="str")
    expected = np.array([True, False, pd.NA], dtype=f"{tm.ENDIAN}U5")
    tm.assert_numpy_array_equal(result, expected)

    # no missing values -> can convert to bool, otherwise raises
    arr = con([True, False, True], dtype="boolean")
    result = arr.to_numpy(dtype="bool")
    expected = np.array([True, False, True], dtype="bool")
    tm.assert_numpy_array_equal(result, expected)

    arr = con([True, False, None], dtype="boolean")
    with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"):
        # Only the exception matters here; the return value is not bound.
        arr.to_numpy(dtype="bool")

    # specify dtype and na_value
    arr = con([True, False, None], dtype="boolean")
    result = arr.to_numpy(dtype=object, na_value=None)
    expected = np.array([True, False, None], dtype="object")
    tm.assert_numpy_array_equal(result, expected)

    result = arr.to_numpy(dtype=bool, na_value=False)
    expected = np.array([True, False, False], dtype="bool")
    tm.assert_numpy_array_equal(result, expected)

    result = arr.to_numpy(dtype="int64", na_value=-99)
    expected = np.array([1, 0, -99], dtype="int64")
    tm.assert_numpy_array_equal(result, expected)

    result = arr.to_numpy(dtype="float64", na_value=np.nan)
    expected = np.array([1, 0, np.nan], dtype="float64")
    tm.assert_numpy_array_equal(result, expected)

    # converting to int or float without specifying na_value raises
    with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
        arr.to_numpy(dtype="int64")
def test_to_numpy_copy():
    """``to_numpy`` returns a view by default here and a copy when ``copy=True``."""
    # to_numpy can be zero-copy if no missing values
    shared = pd.array([True, False, True], dtype="boolean")
    view = shared.to_numpy(dtype=bool)
    view[0] = False
    # writing through the result changed the source array
    tm.assert_extension_array_equal(
        shared, pd.array([False, False, True], dtype="boolean")
    )

    isolated = pd.array([True, False, True], dtype="boolean")
    duplicate = isolated.to_numpy(dtype=bool, copy=True)
    duplicate[0] = False
    # the source array is untouched this time
    tm.assert_extension_array_equal(
        isolated, pd.array([True, False, True], dtype="boolean")
    )

View File

@@ -0,0 +1,126 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
    "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor]
)
def test_ufuncs_binary(ufunc):
    """Binary ufuncs keep the boolean dtype and re-mask the originally missing slots."""
    a = pd.array([True, False, None], dtype="boolean")

    def masked_array(raw):
        # Build the expected array, then blank out the positions missing in ``a``.
        out = pd.array(raw, dtype="boolean")
        out[a._mask] = np.nan
        return out

    # two BooleanArrays
    result = ufunc(a, a)
    tm.assert_extension_array_equal(result, masked_array(ufunc(a._data, a._data)))

    ser = pd.Series(a)
    result = ufunc(ser, a)
    expected = pd.Series(ufunc(a._data, a._data), dtype="boolean")
    expected[a._mask] = np.nan
    tm.assert_series_equal(result, expected)

    # Boolean with numpy array
    plain = np.array([True, True, False])
    result = ufunc(a, plain)
    tm.assert_extension_array_equal(result, masked_array(ufunc(a._data, plain)))

    result = ufunc(plain, a)
    tm.assert_extension_array_equal(result, masked_array(ufunc(plain, a._data)))

    # BooleanArray with scalar
    result = ufunc(a, True)
    tm.assert_extension_array_equal(result, masked_array(ufunc(a._data, True)))

    result = ufunc(True, a)
    tm.assert_extension_array_equal(result, masked_array(ufunc(True, a._data)))

    # not handled types
    pattern = r"operand type\(s\) all returned NotImplemented from __array_ufunc__"
    with pytest.raises(TypeError, match=pattern):
        ufunc(a, "test")
@pytest.mark.parametrize("ufunc", [np.logical_not])
def test_ufuncs_unary(ufunc):
    """Unary ufuncs keep the boolean dtype and the missing positions."""
    a = pd.array([True, False, None], dtype="boolean")
    raw_result = ufunc(a._data)

    array_expected = pd.array(raw_result, dtype="boolean")
    array_expected[a._mask] = np.nan
    tm.assert_extension_array_equal(ufunc(a), array_expected)

    series_expected = pd.Series(raw_result, dtype="boolean")
    series_expected[a._mask] = np.nan
    tm.assert_series_equal(ufunc(pd.Series(a)), series_expected)
def test_ufunc_numeric():
    """``np.sqrt`` on a boolean array produces a ``Float32`` masked array."""
    # np.sqrt on np.bool_ returns float16, which we upcast to Float32
    #  bc we do not have Float16
    source = pd.array([True, False, None], dtype="boolean")
    tm.assert_extension_array_equal(
        np.sqrt(source),
        pd.array([1, 0, None], dtype="Float32"),
    )
@pytest.mark.parametrize("values", [[True, False], [True, None]])
def test_ufunc_reduce_raises(values):
    """``np.add.reduce`` gives NA if the last element is NA, else the sum of the data.

    Despite the name, nothing is expected to raise here.
    """
    arr = pd.array(values, dtype="boolean")

    res = np.add.reduce(arr)
    expected = pd.NA if arr[-1] is pd.NA else arr._data.sum()

    tm.assert_almost_equal(res, expected)
def test_value_counts_na():
    """``value_counts`` counts NA only when ``dropna=False`` and keeps a boolean index."""
    arr = pd.array([True, False, pd.NA], dtype="boolean")

    # (dropna flag, expected counts, expected index)
    scenarios = [
        (False, [1, 1, 1], arr),
        (True, [1, 1], arr[:-1]),
    ]
    for dropna, counts, index in scenarios:
        result = arr.value_counts(dropna=dropna)
        expected = pd.Series(counts, index=index, dtype="Int64", name="count")
        assert expected.index.dtype == arr.dtype
        tm.assert_series_equal(result, expected)
def test_value_counts_with_normalize():
    """Normalized counts are ``Float64`` proportions over the non-missing values."""
    ser = pd.Series([True, False, pd.NA], dtype="boolean")

    result = ser.value_counts(normalize=True)

    raw_counts = pd.Series(
        [1, 1], index=ser[:-1], dtype="Float64", name="proportion"
    )
    expected = raw_counts / 2
    assert expected.index.dtype == "boolean"
    tm.assert_series_equal(result, expected)
def test_diff():
    """First-order ``diff`` on a boolean array and Series gives the expected booleans."""
    source = pd.array(
        [True, True, False, False, True, None, True, None, False], dtype="boolean"
    )
    wanted = pd.array(
        [None, False, True, False, True, None, None, None, None], dtype="boolean"
    )

    tm.assert_extension_array_equal(pd.core.algorithms.diff(source, 1), wanted)

    # the Series method must agree with the array-level algorithm
    tm.assert_series_equal(pd.Series(source).diff(), pd.Series(wanted))

View File

@@ -0,0 +1,13 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("na", [None, np.nan, pd.NA])
def test_setitem_missing_values(na):
    """Assigning ``None``, ``np.nan`` or ``pd.NA`` marks the element as missing."""
    target = pd.array([True, False, None], dtype="boolean")
    target[1] = na
    tm.assert_extension_array_equal(
        target, pd.array([True, None, None], dtype="boolean")
    )

View File

@@ -0,0 +1,254 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.core.ops.mask_ops import (
kleene_and,
kleene_or,
kleene_xor,
)
from pandas.tests.extension.base import BaseOpsUtil
class TestLogicalOps(BaseOpsUtil):
    """Tests for ``&``, ``|`` and ``^`` on nullable boolean arrays (Kleene logic)."""

    def test_numpy_scalars_ok(self, all_logical_operators):
        """Python bools and ``np.bool_`` scalars give the same result."""
        a = pd.array([True, False, None], dtype="boolean")
        op = getattr(a, all_logical_operators)

        tm.assert_extension_array_equal(op(True), op(np.bool_(True)))
        tm.assert_extension_array_equal(op(False), op(np.bool_(False)))

    def get_op_from_name(self, op_name):
        """Return the ``operator`` function for a dunder name such as ``__and__``.

        Names not found in ``operator`` are assumed to be reflected variants
        (``__rand__`` ...), for which a wrapper with swapped operands is returned.
        """
        short_opname = op_name.strip("_")
        # ``operator`` spells these ``and_``/``or_``; ``xor`` has no underscore.
        short_opname = short_opname if "xor" in short_opname else short_opname + "_"
        try:
            op = getattr(operator, short_opname)
        except AttributeError:
            # Assume it is the reverse operator
            rop = getattr(operator, short_opname[1:])

            def op(x, y):
                return rop(y, x)

        return op

    def test_empty_ok(self, all_logical_operators):
        """Logical ops on an empty array return an equal empty array."""
        a = pd.array([], dtype="boolean")
        op_name = all_logical_operators
        result = getattr(a, op_name)(True)
        tm.assert_extension_array_equal(a, result)

        result = getattr(a, op_name)(False)
        tm.assert_extension_array_equal(a, result)

        result = getattr(a, op_name)(pd.NA)
        tm.assert_extension_array_equal(a, result)

    @pytest.mark.parametrize(
        "other", ["a", pd.Timestamp(2017, 1, 1, 12), np.timedelta64(4)]
    )
    def test_eq_mismatched_type(self, other):
        """``==``/``!=`` against an unrelated scalar give all-False / all-True."""
        # GH-44499
        arr = pd.array([True, False])
        result = arr == other
        expected = pd.array([False, False])
        tm.assert_extension_array_equal(result, expected)

        result = arr != other
        expected = pd.array([True, True])
        tm.assert_extension_array_equal(result, expected)

    def test_logical_length_mismatch_raises(self, all_logical_operators):
        """Operands of a different length raise ``ValueError``."""
        op_name = all_logical_operators
        a = pd.array([True, False, None], dtype="boolean")
        msg = "Lengths must match"

        with pytest.raises(ValueError, match=msg):
            getattr(a, op_name)([True, False])

        with pytest.raises(ValueError, match=msg):
            getattr(a, op_name)(np.array([True, False]))

        with pytest.raises(ValueError, match=msg):
            getattr(a, op_name)(pd.array([True, False], dtype="boolean"))

    def test_logical_nan_raises(self, all_logical_operators):
        """A float NaN scalar operand raises ``TypeError``."""
        op_name = all_logical_operators
        a = pd.array([True, False, None], dtype="boolean")
        msg = "Got float instead"

        with pytest.raises(TypeError, match=msg):
            getattr(a, op_name)(np.nan)

    @pytest.mark.parametrize("other", ["a", 1])
    def test_non_bool_or_na_other_raises(self, other, all_logical_operators):
        """Scalars that are neither bool nor NA raise, naming the offending type."""
        a = pd.array([True, False], dtype="boolean")
        with pytest.raises(TypeError, match=type(other).__name__):
            getattr(a, all_logical_operators)(other)

    def test_kleene_or(self):
        """Array ``|`` array over all nine (True/False/NA) operand pairs."""
        # A clear test of behavior.
        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        b = pd.array([True, False, None] * 3, dtype="boolean")
        result = a | b
        expected = pd.array(
            [True, True, True, True, False, None, True, None, None], dtype="boolean"
        )
        tm.assert_extension_array_equal(result, expected)

        result = b | a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        )
        tm.assert_extension_array_equal(
            b, pd.array([True, False, None] * 3, dtype="boolean")
        )

    @pytest.mark.parametrize(
        "other, expected",
        [
            (pd.NA, [True, None, None]),
            (True, [True, True, True]),
            (np.bool_(True), [True, True, True]),
            (False, [True, False, None]),
            (np.bool_(False), [True, False, None]),
        ],
    )
    def test_kleene_or_scalar(self, other, expected):
        """Array ``|`` scalar, in both operand orders."""
        # TODO: test True & False
        a = pd.array([True, False, None], dtype="boolean")
        result = a | other
        expected = pd.array(expected, dtype="boolean")
        tm.assert_extension_array_equal(result, expected)

        result = other | a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True, False, None], dtype="boolean")
        )

    def test_kleene_and(self):
        """Array ``&`` array over all nine (True/False/NA) operand pairs."""
        # A clear test of behavior.
        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        b = pd.array([True, False, None] * 3, dtype="boolean")
        result = a & b
        expected = pd.array(
            [True, False, None, False, False, False, None, False, None], dtype="boolean"
        )
        tm.assert_extension_array_equal(result, expected)

        result = b & a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        )
        tm.assert_extension_array_equal(
            b, pd.array([True, False, None] * 3, dtype="boolean")
        )

    @pytest.mark.parametrize(
        "other, expected",
        [
            (pd.NA, [None, False, None]),
            (True, [True, False, None]),
            (False, [False, False, False]),
            (np.bool_(True), [True, False, None]),
            (np.bool_(False), [False, False, False]),
        ],
    )
    def test_kleene_and_scalar(self, other, expected):
        """Array ``&`` scalar, in both operand orders."""
        a = pd.array([True, False, None], dtype="boolean")
        result = a & other
        expected = pd.array(expected, dtype="boolean")
        tm.assert_extension_array_equal(result, expected)

        result = other & a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True, False, None], dtype="boolean")
        )

    def test_kleene_xor(self):
        """Array ``^`` array over all nine (True/False/NA) operand pairs."""
        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        b = pd.array([True, False, None] * 3, dtype="boolean")
        result = a ^ b
        expected = pd.array(
            [False, True, None, True, False, None, None, None, None], dtype="boolean"
        )
        tm.assert_extension_array_equal(result, expected)

        result = b ^ a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        )
        tm.assert_extension_array_equal(
            b, pd.array([True, False, None] * 3, dtype="boolean")
        )

    @pytest.mark.parametrize(
        "other, expected",
        [
            (pd.NA, [None, None, None]),
            (True, [False, True, None]),
            # plain ``False`` added so xor covers the same scalars as or/and
            (False, [True, False, None]),
            (np.bool_(True), [False, True, None]),
            (np.bool_(False), [True, False, None]),
        ],
    )
    def test_kleene_xor_scalar(self, other, expected):
        """Array ``^`` scalar, in both operand orders."""
        a = pd.array([True, False, None], dtype="boolean")
        result = a ^ other
        expected = pd.array(expected, dtype="boolean")
        tm.assert_extension_array_equal(result, expected)

        result = other ^ a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True, False, None], dtype="boolean")
        )

    @pytest.mark.parametrize("other", [True, False, pd.NA, [True, False, None] * 3])
    def test_no_masked_assumptions(self, other, all_logical_operators):
        """Results must not depend on the data stored under masked positions."""
        # The logical operations should not assume that masked values are False!
        a = BooleanArray(
            np.array([True, True, True, False, False, False, True, False, True]),
            np.array([False] * 6 + [True, True, True]),
        )
        b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        if isinstance(other, list):
            other = pd.array(other, dtype="boolean")

        result = getattr(a, all_logical_operators)(other)
        expected = getattr(b, all_logical_operators)(other)
        tm.assert_extension_array_equal(result, expected)

        if isinstance(other, BooleanArray):
            # Overwrite the data under the masks and check the result is unchanged.
            other._data[other._mask] = True
            a._data[a._mask] = False

            result = getattr(a, all_logical_operators)(other)
            expected = getattr(b, all_logical_operators)(other)
            tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("operation", [kleene_or, kleene_xor, kleene_and])
def test_error_both_scalar(operation):
    """The Kleene helpers reject a call in which both operands are scalars."""
    pattern = r"Either `left` or `right` need to be a np\.ndarray."
    # masks need to be non-None, otherwise it ends up in an infinite recursion
    left_mask = np.zeros(1)
    right_mask = np.zeros(1)
    with pytest.raises(TypeError, match=pattern):
        operation(True, True, left_mask, right_mask)

View File

@@ -0,0 +1,27 @@
import pandas as pd
import pandas._testing as tm
class TestUnaryOps:
    """Unary operator tests (``~`` and ``abs``) for the nullable boolean array."""

    def test_invert(self):
        """``~`` flips True/False and keeps NA, for arrays, Series and DataFrames."""
        labels = ["a", "b", "c"]
        a = pd.array([True, False, None], dtype="boolean")
        flipped = pd.array([False, True, None], dtype="boolean")
        tm.assert_extension_array_equal(~a, flipped)

        flipped_series = pd.Series(flipped, index=labels, name="name")
        tm.assert_series_equal(
            ~pd.Series(a, index=labels, name="name"), flipped_series
        )

        frame = pd.DataFrame({"A": a, "B": [True, False, False]}, index=labels)
        flipped_frame = pd.DataFrame(
            {"A": flipped_series, "B": [False, True, True]}, index=labels
        )
        tm.assert_frame_equal(~frame, flipped_frame)

    def test_abs(self):
        """``abs`` returns an array equal to its input."""
        # matching numpy behavior, abs is the identity function
        source = pd.array([True, False, None], dtype="boolean")
        tm.assert_extension_array_equal(abs(source), source)

View File

@@ -0,0 +1,62 @@
import numpy as np
import pytest
import pandas as pd
@pytest.fixture
def data():
    """Provide a 100-element nullable boolean array that includes two missing entries."""
    pair = [True, False]
    values = pair * 4 + [np.nan] + pair * 44 + [np.nan] + pair
    return pd.array(values, dtype="boolean")
@pytest.mark.parametrize(
    "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip",
    [
        ([True, pd.NA], True, True, True, pd.NA),
        ([False, pd.NA], False, False, pd.NA, False),
        ([pd.NA], False, True, pd.NA, pd.NA),
        ([], False, True, False, True),
        # GH-33253: all True / all False values buggy with skipna=False
        ([True, True], True, True, True, True),
        ([False, False], False, False, False, False),
    ],
)
def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip):
    """``any``/``all`` with and without ``skipna``, on both arrays and Series."""

    def as_numpy_scalar(flag):
        # the methods return numpy scalars
        return pd.NA if flag is pd.NA else np.bool_(flag)

    exp_any, exp_all, exp_any_noskip, exp_all_noskip = map(
        as_numpy_scalar, (exp_any, exp_all, exp_any_noskip, exp_all_noskip)
    )

    for con in (pd.array, pd.Series):
        a = con(values, dtype="boolean")
        assert a.any() is exp_any
        assert a.all() is exp_all
        assert a.any(skipna=False) is exp_any_noskip
        assert a.all(skipna=False) is exp_all_noskip

        assert np.any(a.any()) is exp_any
        assert np.all(a.all()) is exp_all
@pytest.mark.parametrize("dropna", [True, False])
def test_reductions_return_types(dropna, data, all_numeric_reductions):
    """Each numeric reduction on a boolean Series yields the expected scalar type."""
    op = all_numeric_reductions
    ser = pd.Series(data)
    if dropna:
        ser = ser.dropna()

    if op in ("sum", "prod"):
        expected_type = np.int_
    elif op == "count":
        # Oddly on the 32 bit build (but not Windows), this is intc (!= intp)
        expected_type = np.integer
    elif op in ("min", "max"):
        expected_type = np.bool_
    else:
        # "mean", "std", "var", "median", "kurt", "skew"
        expected_type = np.float64

    outcome = getattr(ser, op)()
    assert isinstance(outcome, expected_type)

View File

@ -0,0 +1,13 @@
import pandas as pd
def test_repr():
    """Check the repr of a boolean column as a DataFrame, a Series and an array.

    The expected strings carry the alignment padding pandas emits: values are
    right-justified under the column header, so the runs of spaces are
    significant and must not be collapsed.
    """
    df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")})

    # DataFrame: header and values right-aligned to the widest entry ("False")
    expected = "       A\n0   True\n1  False\n2   <NA>"
    assert repr(df) == expected

    # Series: same alignment with a trailing name/dtype footer
    expected = "0     True\n1    False\n2     <NA>\nName: A, dtype: boolean"
    assert repr(df.A) == expected

    # Extension array: no column alignment involved
    expected = "<BooleanArray>\n[True, False, <NA>]\nLength: 3, dtype: boolean"
    assert repr(df.A.array) == expected

View File

@ -0,0 +1,89 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]])
def test_factorize(categories, ordered):
    """factorize codes by first appearance and keeps the categorical dtype."""
    dtype_kwargs = {"categories": categories, "ordered": ordered}
    values = pd.Categorical(["b", "b", "a", "c", None], **dtype_kwargs)

    codes, uniques = pd.factorize(values)

    # the missing entry is coded as -1
    tm.assert_numpy_array_equal(codes, np.array([0, 0, 1, 2, -1], dtype=np.intp))
    tm.assert_categorical_equal(
        uniques, pd.Categorical(["b", "a", "c"], **dtype_kwargs)
    )
def test_factorized_sort():
    """With ``sort=True`` the uniques come back in category order."""
    values = pd.Categorical(["b", "b", None, "a"])

    codes, uniques = pd.factorize(values, sort=True)

    tm.assert_numpy_array_equal(codes, np.array([1, 1, -1, 0], dtype=np.intp))
    tm.assert_categorical_equal(uniques, pd.Categorical(["a", "b"]))
def test_factorized_sort_ordered():
    """Sorting an ordered Categorical follows its declared category order."""
    dtype_kwargs = {"categories": ["c", "b", "a"], "ordered": True}
    values = pd.Categorical(["b", "b", None, "a"], **dtype_kwargs)

    codes, uniques = pd.factorize(values, sort=True)

    tm.assert_numpy_array_equal(codes, np.array([0, 0, -1, 1], dtype=np.intp))
    tm.assert_categorical_equal(uniques, pd.Categorical(["b", "a"], **dtype_kwargs))
def test_isin_cats():
    """``isin`` matches category values and NaN; unknown values never match."""
    # GH2003
    cat = pd.Categorical(["a", "b", np.nan])
    cases = [
        (["a", np.nan], [True, False, True]),
        (["a", "c"], [True, False, False]),
    ]
    for candidates, mask in cases:
        result = cat.isin(candidates)
        expected = np.array(mask, dtype=bool)
        tm.assert_numpy_array_equal(expected, result)
@pytest.mark.parametrize("value", [[""], [None, ""], [pd.NaT, ""]])
def test_isin_cats_corner_cases(value):
    """An empty-string category is found even next to null candidates."""
    # GH36550
    result = pd.Categorical([""]).isin(value)
    expected = np.array([True], dtype=bool)
    tm.assert_numpy_array_equal(expected, result)
@pytest.mark.parametrize("empty", [[], pd.Series(dtype=object), np.array([])])
def test_isin_empty(empty):
    """``isin`` against an empty container is False for every element."""
    cat = pd.Categorical(["a", "b"])
    result = cat.isin(empty)
    tm.assert_numpy_array_equal(np.array([False, False], dtype=bool), result)
def test_diff():
    """``diff`` raises TypeError for categorical data in a Series and a DataFrame."""
    ser = pd.Series([1, 2, 3], dtype="category")
    df = ser.to_frame(name="A")

    msg = "Convert to a suitable dtype"
    for obj in (ser, df):
        with pytest.raises(TypeError, match=msg):
            obj.diff()

View File

@ -0,0 +1,355 @@
import re
import sys
import numpy as np
import pytest
from pandas.compat import PYPY
from pandas import (
Categorical,
CategoricalDtype,
DataFrame,
Index,
NaT,
Series,
date_range,
)
import pandas._testing as tm
from pandas.api.types import is_scalar
class TestCategoricalAnalytics:
    """Checks for reductions, lookups and size reporting on ``Categorical``."""

    @pytest.mark.parametrize("aggregation", ["min", "max"])
    def test_min_max_not_ordered_raises(self, aggregation):
        """min/max and the matching ufunc reduction raise for unordered data."""
        # unordered cats have no min/max
        unordered = Categorical(["a", "b", "c", "d"], ordered=False)
        msg = f"Categorical is not ordered for operation {aggregation}"

        reducer = getattr(unordered, aggregation)
        with pytest.raises(TypeError, match=msg):
            reducer()

        ufunc = {"min": np.minimum, "max": np.maximum}[aggregation]
        with pytest.raises(TypeError, match=msg):
            ufunc.reduce(unordered)

    def test_min_max_ordered(self, index_or_series_or_array):
        """min/max follow the declared category order."""
        values = ["a", "b", "c", "d"]

        obj = index_or_series_or_array(Categorical(values, ordered=True))
        lowest = obj.min()
        highest = obj.max()
        assert lowest == "a"
        assert highest == "d"
        assert np.minimum.reduce(obj) == "a"
        assert np.maximum.reduce(obj) == "d"
        # TODO: raises if we pass axis=0 (on Index and Categorical, not Series)

        # same values, categories declared in reverse
        reversed_cat = Categorical(
            values, categories=["d", "c", "b", "a"], ordered=True
        )
        obj = index_or_series_or_array(reversed_cat)
        lowest = obj.min()
        highest = obj.max()
        assert lowest == "d"
        assert highest == "a"
        assert np.minimum.reduce(obj) == "d"
        assert np.maximum.reduce(obj) == "a"

    def test_min_max_reduce(self):
        """``DataFrame.agg`` with min/max keeps the categorical dtype."""
        # GH52788
        cat = Categorical(["a", "b", "c", "d"], ordered=True)
        df = DataFrame(cat)

        for how, extreme in (("max", "d"), ("min", "a")):
            result = df.agg(how)
            expected = Series(Categorical([extreme], dtype=cat.dtype))
            tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "categories,expected",
        [
            (list("ABC"), np.nan),
            ([1, 2, 3], np.nan),
            pytest.param(
                Series(date_range("2020-01-01", periods=3), dtype="category"),
                NaT,
                marks=pytest.mark.xfail(
                    reason="https://github.com/pandas-dev/pandas/issues/29962"
                ),
            ),
        ],
    )
    @pytest.mark.parametrize("aggregation", ["min", "max"])
    def test_min_max_ordered_empty(self, categories, expected, aggregation):
        """min/max of an empty ordered Categorical give the missing-value marker."""
        # GH 30227
        empty = Categorical([], categories=categories, ordered=True)
        result = getattr(empty, aggregation)()
        assert result is expected

    @pytest.mark.parametrize(
        "values, categories",
        [(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])],
    )
    @pytest.mark.parametrize("skipna", [True, False])
    @pytest.mark.parametrize("function", ["min", "max"])
    def test_min_max_with_nan(self, values, categories, function, skipna):
        """NaN propagates unless ``skipna`` is set."""
        # GH 25303
        cat = Categorical(values, categories=categories, ordered=True)
        result = getattr(cat, function)(skipna=skipna)

        if skipna:
            # first category is the minimum, third the maximum
            position = 0 if function == "min" else 2
            assert result == categories[position]
        else:
            assert result is np.nan

    @pytest.mark.parametrize("function", ["min", "max"])
    @pytest.mark.parametrize("skipna", [True, False])
    def test_min_max_only_nan(self, function, skipna):
        """An all-NaN Categorical reduces to NaN for either ``skipna`` value."""
        # https://github.com/pandas-dev/pandas/issues/33450
        only_nan = Categorical([np.nan], categories=[1, 2], ordered=True)
        outcome = getattr(only_nan, function)(skipna=skipna)
        assert outcome is np.nan

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_numeric_only_min_max_raises(self, method):
        """``numeric_only`` is not an accepted keyword for min/max."""
        # GH 25303
        cat = Categorical(
            [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
        )
        reducer = getattr(cat, method)
        with pytest.raises(TypeError, match=".* got an unexpected keyword"):
            reducer(numeric_only=True)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_numpy_min_max_raises(self, method):
        """``np.min``/``np.max`` on an unordered Categorical raise with a hint."""
        cat = Categorical(["a", "b", "c", "b"], ordered=False)
        msg = (
            f"Categorical is not ordered for operation {method}\n"
            "you can use .as_ordered() to change the Categorical to an ordered one"
        )
        pattern = re.escape(msg)
        np_func = getattr(np, method)
        with pytest.raises(TypeError, match=pattern):
            np_func(cat)

    @pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"])
    @pytest.mark.parametrize("method", ["min", "max"])
    def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg):
        """numpy-only keywords are rejected with a ValueError."""
        cat = Categorical(["a", "b", "c", "b"], ordered=True)
        if kwarg == "axis":
            msg = r"`axis` must be fewer than the number of dimensions \(1\)"
        else:
            msg = (
                f"the '{kwarg}' parameter is not supported in the pandas implementation "
                f"of {method}"
            )
        np_func = getattr(np, method)
        with pytest.raises(ValueError, match=msg):
            np_func(cat, **{kwarg: 42})

    @pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")])
    def test_numpy_min_max_axis_equals_none(self, method, expected):
        """``axis=None`` is accepted by the numpy entry points."""
        cat = Categorical(["a", "b", "c", "b"], ordered=True)
        np_func = getattr(np, method)
        assert np_func(cat, axis=None) == expected

    @pytest.mark.parametrize(
        "values,categories,exp_mode",
        [
            ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
            ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
            ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
            ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
            ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
            ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
        ],
    )
    def test_mode(self, values, categories, exp_mode):
        """``Series.mode`` on categorical data returns a Categorical of the modes."""
        dtype_kwargs = {"categories": categories, "ordered": True}
        modes = Series(Categorical(values, **dtype_kwargs)).mode()._values
        tm.assert_categorical_equal(modes, Categorical(exp_mode, **dtype_kwargs))

    def test_searchsorted(self, ordered):
        """``searchsorted`` works on category order and rejects unknown values."""
        # https://github.com/pandas-dev/pandas/issues/8420
        # https://github.com/pandas-dev/pandas/issues/14522
        cat = Categorical(
            ["cheese", "milk", "apple", "bread", "bread"],
            categories=["cheese", "milk", "apple", "bread"],
            ordered=ordered,
        )
        ser = Series(cat)
        containers = (cat, ser)

        # Searching for single item argument, side='left' (default)
        for obj in containers:
            position = obj.searchsorted("apple")
            assert position == 2
            assert is_scalar(position)

        # Searching for single item array, side='left' (default)
        exp = np.array([3], dtype=np.intp)
        for obj in containers:
            tm.assert_numpy_array_equal(obj.searchsorted(["bread"]), exp)

        # Searching for several items array, side='right'
        exp = np.array([3, 5], dtype=np.intp)
        for obj in containers:
            positions = obj.searchsorted(["apple", "bread"], side="right")
            tm.assert_numpy_array_equal(positions, exp)

        # Searching for a single value that is not from the Categorical
        for obj in containers:
            with pytest.raises(TypeError, match="cucumber"):
                obj.searchsorted("cucumber")

        # Searching for multiple values one of each is not from the Categorical
        msg = (
            "Cannot setitem on a Categorical with a new category, "
            "set the categories first"
        )
        for obj in containers:
            with pytest.raises(TypeError, match=msg):
                obj.searchsorted(["bread", "cucumber"])

    def test_unique(self, ordered):
        """``unique`` keeps the dtype and lists values by first appearance."""
        # GH38140
        dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)

        cases = [
            (["a", "b", "c"], ["a", "b", "c"]),
            (["a", "b", "a", "a"], ["a", "b"]),
            (["c", "a", "b", "a", "a"], ["c", "a", "b"]),
            # missing values show up once in the result
            (["b", np.nan, "b", np.nan, "a"], ["b", np.nan, "a"]),
        ]
        for values, uniques in cases:
            result = Categorical(values, dtype=dtype).unique()
            tm.assert_categorical_equal(result, Categorical(uniques, dtype=dtype))

    def test_unique_index_series(self, ordered):
        """Categorical, Index and Series agree on ``unique``."""
        # GH38140
        dtype = CategoricalDtype([3, 2, 1], ordered=ordered)

        # values come back in order of first appearance, dtype unchanged
        cases = [
            ([3, 1, 2, 2, 1], [3, 1, 2]),
            ([1, 1, 2, 2], [1, 2]),
        ]
        for values, uniques in cases:
            c = Categorical(values, dtype=dtype)
            exp = Categorical(uniques, dtype=dtype)
            tm.assert_categorical_equal(c.unique(), exp)
            tm.assert_index_equal(Index(c).unique(), Index(exp))
            tm.assert_categorical_equal(Series(c).unique(), exp)

    def test_shift(self):
        """``shift`` moves values and pads the vacated slots with NaN."""
        # GH 9416
        cat = Categorical(["a", "b", "c", "d", "a"])

        # shift forward
        forward = cat.shift(1)
        forward_expected = Categorical([np.nan, "a", "b", "c", "d"])
        tm.assert_categorical_equal(forward, forward_expected)
        tm.assert_categorical_equal(cat[:-1], forward[1:])

        # shift back
        backward = cat.shift(-2)
        backward_expected = Categorical(
            ["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"]
        )
        tm.assert_categorical_equal(backward, backward_expected)
        tm.assert_categorical_equal(cat[2:], backward[:-2])

        # shift by zero
        tm.assert_categorical_equal(cat, cat.shift(0))

    def test_nbytes(self):
        """``nbytes`` counts the codes plus the categories."""
        cat = Categorical([1, 2, 3])
        # 3 int8s for values + 3 int64s for categories
        exp = 3 + 3 * 8
        assert cat.nbytes == exp

    def test_memory_usage(self, using_infer_string):
        """``memory_usage`` is at least ``nbytes`` and tracks ``sys.getsizeof``."""
        cat = Categorical([1, 2, 3])

        # .categories is an index, so we include the hashtable
        assert 0 < cat.nbytes <= cat.memory_usage()
        assert 0 < cat.nbytes <= cat.memory_usage(deep=True)

        cat = Categorical(["foo", "foo", "bar"])
        if using_infer_string and cat.categories.dtype.storage != "python":
            assert cat.memory_usage(deep=True) >= cat.nbytes
        else:
            assert cat.memory_usage(deep=True) > cat.nbytes

        if not PYPY:
            # sys.getsizeof will call the .memory_usage with
            # deep=True, and add on some GC overhead
            diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
            assert abs(diff) < 100

    def test_map(self):
        """``map`` transforms categories; a many-to-one mapping gives an Index."""
        ordered_cat = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
        result = ordered_cat.map(lambda x: x.lower(), na_action=None)
        exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
        tm.assert_categorical_equal(result, exp)

        unordered_cat = Categorical(
            list("ABABC"), categories=list("ABC"), ordered=False
        )
        result = unordered_cat.map(lambda x: x.lower(), na_action=None)
        exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
        tm.assert_categorical_equal(result, exp)

        result = unordered_cat.map(lambda x: 1, na_action=None)
        # GH 12766: Return an index not an array
        tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))

    @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
    def test_validate_inplace_raises(self, value):
        """A non-bool ``inplace`` argument is rejected by ``sort_values``."""
        cat = Categorical(["A", "B", "B", "C", "A"])
        msg = (
            'For argument "inplace" expected type bool, '
            f"received type {type(value).__name__}"
        )
        with pytest.raises(ValueError, match=msg):
            cat.sort_values(inplace=value)

    def test_quantile_empty(self):
        """``_quantile`` on an empty slice keeps the int8 codes dtype."""
        # make sure we have correct itemsize on resulting codes
        cat = Categorical(["A", "B"])
        quantiles = Index([0.0, 0.5])

        result = cat[:0]._quantile(quantiles, interpolation="linear")
        assert result._codes.dtype == np.int8

        expected = cat.take([-1, -1], allow_fill=True)
        tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,501 @@
import re
import numpy as np
import pytest
from pandas.compat import PY311
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
Index,
Series,
StringDtype,
)
import pandas._testing as tm
from pandas.core.arrays.categorical import recode_for_categories
class TestCategoricalAPI:
    """Tests for the public category-management API of ``Categorical``."""

    def test_to_list_deprecated(self):
        """``Categorical.to_list`` emits a FutureWarning."""
        # GH#51254
        cat1 = Categorical(list("acb"), ordered=False)
        msg = "Categorical.to_list is deprecated and will be removed"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            cat1.to_list()

    def test_ordered_api(self):
        """Inferred categories are sorted; explicit ones keep the given order."""
        # GH 9347
        cat1 = Categorical(list("acb"), ordered=False)
        tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"]))
        assert not cat1.ordered

        cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False)
        tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"]))
        assert not cat2.ordered

        cat3 = Categorical(list("acb"), ordered=True)
        tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"]))
        assert cat3.ordered

        cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True)
        tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"]))
        assert cat4.ordered

    def test_set_ordered(self):
        """The ordered flag changes through methods; the attribute has no setter."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        cat2 = cat.as_unordered()
        assert not cat2.ordered
        cat2 = cat.as_ordered()
        assert cat2.ordered

        assert cat2.set_ordered(True).ordered
        assert not cat2.set_ordered(False).ordered

        # removed in 0.19.0
        # the AttributeError wording differs depending on the PY311 flag
        msg = (
            "property 'ordered' of 'Categorical' object has no setter"
            if PY311
            else "can't set attribute"
        )
        with pytest.raises(AttributeError, match=msg):
            cat.ordered = True
        with pytest.raises(AttributeError, match=msg):
            cat.ordered = False

    def test_rename_categories(self):
        """``rename_categories`` returns a renamed copy and accepts callables."""
        cat = Categorical(["a", "b", "c", "a"])

        # inplace=False: the old one must not be changed
        res = cat.rename_categories([1, 2, 3])
        tm.assert_numpy_array_equal(
            res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
        )
        tm.assert_index_equal(res.categories, Index([1, 2, 3]))

        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        exp_cat = Index(["a", "b", "c"])
        tm.assert_index_equal(cat.categories, exp_cat)

        # GH18862 (let rename_categories take callables)
        result = cat.rename_categories(lambda x: x.upper())
        expected = Categorical(["A", "B", "C", "A"])
        tm.assert_categorical_equal(result, expected)

    @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
    def test_rename_categories_wrong_length_raises(self, new_categories):
        """A list of the wrong length is rejected by ``rename_categories``."""
        cat = Categorical(["a", "b", "c", "a"])
        msg = (
            "new categories need to have the same number of items as the "
            "old categories!"
        )
        with pytest.raises(ValueError, match=msg):
            cat.rename_categories(new_categories)

    def test_rename_categories_series(self):
        """A Series passed to ``rename_categories`` supplies the new categories."""
        # https://github.com/pandas-dev/pandas/issues/17981
        c = Categorical(["a", "b"])
        result = c.rename_categories(Series([0, 1], index=["a", "b"]))
        expected = Categorical([0, 1])
        tm.assert_categorical_equal(result, expected)

    def test_rename_categories_dict(self):
        """A dict renames the listed categories and leaves the others untouched."""
        # GH 17336
        cat = Categorical(["a", "b", "c", "d"])
        res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1})
        expected = Index([4, 3, 2, 1])
        tm.assert_index_equal(res.categories, expected)

        # Test for dicts of smaller length
        cat = Categorical(["a", "b", "c", "d"])
        res = cat.rename_categories({"a": 1, "c": 3})
        expected = Index([1, "b", 3, "d"])
        tm.assert_index_equal(res.categories, expected)

        # Test for dicts with bigger length
        cat = Categorical(["a", "b", "c", "d"])
        res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6})
        expected = Index([1, 2, 3, 4])
        tm.assert_index_equal(res.categories, expected)

        # Test for dicts with no items from old categories
        cat = Categorical(["a", "b", "c", "d"])
        res = cat.rename_categories({"f": 1, "g": 3})
        expected = Index(["a", "b", "c", "d"])
        tm.assert_index_equal(res.categories, expected)

    def test_reorder_categories(self):
        """``reorder_categories`` returns a reordered copy; the source is untouched."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(
            ["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True
        )

        res = cat.reorder_categories(["c", "b", "a"])
        # cat must be the same as before
        tm.assert_categorical_equal(cat, old)
        # only res is changed
        tm.assert_categorical_equal(res, new)

    @pytest.mark.parametrize(
        "new_categories",
        [
            ["a"],  # not all "old" included in "new"
            ["a", "b", "d"],  # still not all "old" in "new"
            ["a", "b", "c", "d"],  # all "old" included in "new", but too long
        ],
    )
    def test_reorder_categories_raises(self, new_categories):
        """The new order must hold exactly the existing categories."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        msg = "items in new_categories are not the same as in old categories"
        with pytest.raises(ValueError, match=msg):
            cat.reorder_categories(new_categories)

    def test_add_categories(self):
        """``add_categories`` appends categories given as a scalar or list-like."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(
            ["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True
        )

        res = cat.add_categories("d")
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        res = cat.add_categories(["d"])
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        # GH 9927
        cat = Categorical(list("abc"), ordered=True)
        expected = Categorical(list("abc"), categories=list("abcde"), ordered=True)
        # test with Series, np.array, index, list
        res = cat.add_categories(Series(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
        res = cat.add_categories(np.array(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
        res = cat.add_categories(Index(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
        res = cat.add_categories(["d", "e"])
        tm.assert_categorical_equal(res, expected)

    def test_add_categories_existing_raises(self):
        """Adding a category that already exists raises ValueError."""
        # new is in old categories
        cat = Categorical(["a", "b", "c", "d"], ordered=True)
        msg = re.escape("new categories must not include old categories: {'d'}")
        with pytest.raises(ValueError, match=msg):
            cat.add_categories(["d"])

    def test_add_categories_losing_dtype_information(self):
        """Extension dtypes of the categories survive ``add_categories``."""
        # GH#48812
        cat = Categorical(Series([1, 2], dtype="Int64"))
        ser = Series([4], dtype="Int64")
        result = cat.add_categories(ser)
        expected = Categorical(
            Series([1, 2], dtype="Int64"), categories=Series([1, 2, 4], dtype="Int64")
        )
        tm.assert_categorical_equal(result, expected)

        cat = Categorical(Series(["a", "b", "a"], dtype=StringDtype()))
        ser = Series(["d"], dtype=StringDtype())
        result = cat.add_categories(ser)
        expected = Categorical(
            Series(["a", "b", "a"], dtype=StringDtype()),
            categories=Series(["a", "b", "d"], dtype=StringDtype()),
        )
        tm.assert_categorical_equal(result, expected)

    def test_set_categories(self):
        """``set_categories`` recodes values, drops unlisted ones, handles ordering."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        exp_categories = Index(["c", "b", "a"])
        exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)

        cat = cat.set_categories(["c", "b", "a"])
        res = cat.set_categories(["a", "b", "c"])
        # cat must be the same as before
        tm.assert_index_equal(cat.categories, exp_categories)
        tm.assert_numpy_array_equal(cat.__array__(), exp_values)
        # only res is changed
        exp_categories_back = Index(["a", "b", "c"])
        tm.assert_index_equal(res.categories, exp_categories_back)
        tm.assert_numpy_array_equal(res.__array__(), exp_values)

        # not all "old" included in "new" -> all not included ones are now
        # np.nan
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        res = cat.set_categories(["a"])
        tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8))

        # still not all "old" in "new"
        res = cat.set_categories(["a", "b", "d"])
        tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8))
        tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))

        # all "old" included in "new"
        cat = cat.set_categories(["a", "b", "c", "d"])
        exp_categories = Index(["a", "b", "c", "d"])
        tm.assert_index_equal(cat.categories, exp_categories)

        # internals...
        c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
        tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8))
        tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))

        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(np.asarray(c), exp)

        # all "pointers" to '4' must be changed from 3 to 0,...
        c = c.set_categories([4, 3, 2, 1])

        # positions are changed
        tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8))

        # categories are now in new order
        tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))

        # output is the same
        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(np.asarray(c), exp)
        assert c.min() == 4
        assert c.max() == 1

        # set_categories should set the ordering if specified
        c2 = c.set_categories([4, 3, 2, 1], ordered=False)
        assert not c2.ordered

        tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))

        # set_categories should pass thru the ordering
        c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
        assert not c2.ordered

        tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))

    @pytest.mark.parametrize(
        "values, categories, new_categories",
        [
            # No NaNs, same cats, same order
            (["a", "b", "a"], ["a", "b"], ["a", "b"]),
            # No NaNs, same cats, different order
            (["a", "b", "a"], ["a", "b"], ["b", "a"]),
            # Same, unsorted
            (["b", "a", "a"], ["a", "b"], ["a", "b"]),
            # No NaNs, same cats, different order
            (["b", "a", "a"], ["a", "b"], ["b", "a"]),
            # NaNs: sorted/unsorted values crossed with same/reversed categories
            (["a", "b", "c"], ["a", "b"], ["a", "b"]),
            (["a", "b", "c"], ["a", "b"], ["b", "a"]),
            (["b", "a", "c"], ["a", "b"], ["a", "b"]),
            (["b", "a", "c"], ["a", "b"], ["b", "a"]),
            # Introduce NaNs: sorted/unsorted values crossed with keeping "a"/"b"
            (["a", "b", "c"], ["a", "b"], ["a"]),
            (["a", "b", "c"], ["a", "b"], ["b"]),
            (["b", "a", "c"], ["a", "b"], ["a"]),
            (["b", "a", "c"], ["a", "b"], ["b"]),
            # No overlap
            (["a", "b", "c"], ["a", "b"], ["d", "e"]),
        ],
    )
    @pytest.mark.parametrize("ordered", [True, False])
    def test_set_categories_many(self, values, categories, new_categories, ordered):
        """``set_categories`` matches constructing with the new categories directly."""
        c = Categorical(values, categories)
        expected = Categorical(values, new_categories, ordered)
        result = c.set_categories(new_categories, ordered=ordered)
        tm.assert_categorical_equal(result, expected)

    def test_set_categories_rename_less(self):
        """``rename=True`` with fewer categories turns the surplus values into NaN."""
        # GH 24675
        cat = Categorical(["A", "B"])
        result = cat.set_categories(["A"], rename=True)
        expected = Categorical(["A", np.nan])
        tm.assert_categorical_equal(result, expected)

    def test_set_categories_private(self):
        """``_set_categories`` swaps the categories in place, keeping the codes."""
        cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
        cat._set_categories(["a", "c", "d", "e"])
        expected = Categorical(["a", "c", "d"], categories=list("acde"))
        tm.assert_categorical_equal(cat, expected)

        # fastpath
        cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
        cat._set_categories(["a", "c", "d", "e"], fastpath=True)
        expected = Categorical(["a", "c", "d"], categories=list("acde"))
        tm.assert_categorical_equal(cat, expected)

    def test_remove_categories(self):
        """Removing a category turns its values into NaN in the returned copy."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True)

        res = cat.remove_categories("c")
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        res = cat.remove_categories(["c"])
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

    @pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]])
    def test_remove_categories_raises(self, removals):
        """Removing a category that does not exist raises ValueError."""
        cat = Categorical(["a", "b", "a"])
        message = re.escape("removals must all be in old categories: {'c'}")

        with pytest.raises(ValueError, match=message):
            cat.remove_categories(removals)

    def test_remove_unused_categories(self):
        """Unused categories are dropped while the values stay the same."""
        c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"])
        exp_categories_all = Index(["a", "b", "c", "d", "e"])
        exp_categories_dropped = Index(["a", "b", "c", "d"])

        tm.assert_index_equal(c.categories, exp_categories_all)

        res = c.remove_unused_categories()
        tm.assert_index_equal(res.categories, exp_categories_dropped)
        tm.assert_index_equal(c.categories, exp_categories_all)

        # with NaN values (GH11599)
        c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"])
        res = c.remove_unused_categories()
        tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"])))
        exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(res.codes, exp_codes)
        tm.assert_index_equal(c.categories, exp_categories_all)

        val = ["F", np.nan, "D", "B", "D", "F", np.nan]
        cat = Categorical(values=val, categories=list("ABCDEFG"))
        out = cat.remove_unused_categories()
        tm.assert_index_equal(out.categories, Index(["B", "D", "F"]))
        exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(out.codes, exp_codes)
        assert out.tolist() == val

        # larger, seeded random sample drawn from every other letter
        alpha = list("abcdefghijklmnopqrstuvwxyz")
        val = np.random.default_rng(2).choice(alpha[::2], 10000).astype("object")
        val[np.random.default_rng(2).choice(len(val), 100)] = np.nan

        cat = Categorical(values=val, categories=alpha)
        out = cat.remove_unused_categories()
        assert out.tolist() == val.tolist()
class TestCategoricalAPIWithFactor:
    """Checks of ``Categorical.describe`` on small hand-built inputs."""

    def test_describe(self):
        """``describe`` reports counts and frequencies per category."""
        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)

        # string type
        summary = factor.describe()
        assert factor.ordered
        expected = DataFrame(
            {"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]},
            index=CategoricalIndex(
                ["a", "b", "c"], name="categories", ordered=factor.ordered
            ),
        )
        tm.assert_frame_equal(summary, expected)

        # check unused categories
        widened = factor.copy().set_categories(["a", "b", "c", "d"])
        summary = widened.describe()
        expected = DataFrame(
            {"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]},
            index=CategoricalIndex(
                list("abcd"), ordered=factor.ordered, name="categories"
            ),
        )
        tm.assert_frame_equal(summary, expected)

        # check an integer one
        int_cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
        summary = int_cat.describe()
        expected = DataFrame(
            {"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]},
            index=CategoricalIndex(
                [1, 2, 3], ordered=int_cat.ordered, name="categories"
            ),
        )
        tm.assert_frame_equal(summary, expected)

        # https://github.com/pandas-dev/pandas/issues/3678
        # describe should work with NaN
        nan_cat = Categorical([np.nan, 1, 2, 2])
        summary = nan_cat.describe()
        expected = DataFrame(
            {"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]},
            index=CategoricalIndex(
                [1, 2, np.nan], categories=[1, 2], name="categories"
            ),
        )
        tm.assert_frame_equal(summary, expected)
class TestPrivateCategoricalAPI:
    """Checks on codes handling and the ``recode_for_categories`` helper."""

    def test_codes_immutable(self):
        """``codes`` is a read-only view; the Categorical itself stays writeable."""
        # Codes should be read only
        c = Categorical(["a", "b", "c", "a", np.nan])
        tm.assert_numpy_array_equal(c.codes, np.array([0, 1, 2, 0, -1], dtype="int8"))

        # Assignments to codes should raise
        if PY311:
            msg = "property 'codes' of 'Categorical' object has no setter"
        else:
            msg = "can't set attribute"
        with pytest.raises(AttributeError, match=msg):
            c.codes = np.array([0, 1, 2, 0, 1], dtype="int8")

        # changes in the codes array should raise
        view = c.codes
        with pytest.raises(ValueError, match="assignment destination is read-only"):
            view[4] = 1

        # But even after getting the codes, the original array should still be
        # writeable!
        c[4] = "a"
        tm.assert_numpy_array_equal(c.codes, np.array([0, 1, 2, 0, 0], dtype="int8"))

        c._codes[4] = 2
        tm.assert_numpy_array_equal(c.codes, np.array([0, 1, 2, 0, 2], dtype="int8"))

    @pytest.mark.parametrize(
        "codes, old, new, expected",
        [
            ([0, 1], ["a", "b"], ["a", "b"], [0, 1]),
            ([0, 1], ["b", "a"], ["b", "a"], [0, 1]),
            ([0, 1], ["a", "b"], ["b", "a"], [1, 0]),
            ([0, 1], ["b", "a"], ["a", "b"], [1, 0]),
            ([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]),
            ([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]),
            ([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]),
            ([-1, -1], [], ["a", "b"], [-1, -1]),
            ([1, 0], ["b", "a"], ["a", "b"], [0, 1]),
        ],
    )
    def test_recode_to_categories(self, codes, old, new, expected):
        """Codes are remapped from the old category index onto the new one."""
        old_codes = np.asanyarray(codes, dtype=np.int8)
        new_codes = np.asanyarray(expected, dtype=np.int8)

        result = recode_for_categories(old_codes, Index(old), Index(new))
        tm.assert_numpy_array_equal(result, new_codes)

    def test_recode_to_categories_large(self):
        """Recoding 1000 categories into reversed order gives int16 codes."""
        N = 1000
        codes = np.arange(N)
        old = Index(codes)

        # new categories are the old ones in reverse
        expected = np.arange(N - 1, -1, -1, dtype=np.int16)
        new = Index(expected)

        result = recode_for_categories(codes, old, new)
        tm.assert_numpy_array_equal(result, expected)

View File

@ -0,0 +1,155 @@
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalDtype,
CategoricalIndex,
DatetimeIndex,
Interval,
NaT,
Period,
Timestamp,
array,
to_datetime,
)
import pandas._testing as tm
class TestAstype:
@pytest.mark.parametrize("cls", [Categorical, CategoricalIndex])
@pytest.mark.parametrize("values", [[1, np.nan], [Timestamp("2000"), NaT]])
def test_astype_nan_to_int(self, cls, values):
    """Casting categorical data that contains a missing value to int raises."""
    # GH#28406
    container = cls(values)

    pattern = "Cannot (cast|convert)"
    with pytest.raises((ValueError, TypeError), match=pattern):
        container.astype(int)
@pytest.mark.parametrize(
"expected",
[
array(["2019", "2020"], dtype="datetime64[ns, UTC]"),
array([0, 0], dtype="timedelta64[ns]"),
array([Period("2019"), Period("2020")], dtype="period[Y-DEC]"),
array([Interval(0, 1), Interval(1, 2)], dtype="interval"),
array([1, np.nan], dtype="Int64"),
],
)
def test_astype_category_to_extension_dtype(self, expected):
# GH#28668
result = expected.astype("category").astype(expected.dtype)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"dtype, expected",
[
(
"datetime64[ns]",
np.array(["2015-01-01T00:00:00.000000000"], dtype="datetime64[ns]"),
),
(
"datetime64[ns, MET]",
DatetimeIndex([Timestamp("2015-01-01 00:00:00+0100", tz="MET")]).array,
),
],
)
def test_astype_to_datetime64(self, dtype, expected):
# GH#28448
result = Categorical(["2015-01-01"]).astype(dtype)
assert result == expected
def test_astype_str_int_categories_to_nullable_int(self):
# GH#39616
dtype = CategoricalDtype([str(i) for i in range(5)])
codes = np.random.default_rng(2).integers(5, size=20)
arr = Categorical.from_codes(codes, dtype=dtype)
res = arr.astype("Int64")
expected = array(codes, dtype="Int64")
tm.assert_extension_array_equal(res, expected)
def test_astype_str_int_categories_to_nullable_float(self):
# GH#39616
dtype = CategoricalDtype([str(i / 2) for i in range(5)])
codes = np.random.default_rng(2).integers(5, size=20)
arr = Categorical.from_codes(codes, dtype=dtype)
res = arr.astype("Float64")
expected = array(codes, dtype="Float64") / 2
tm.assert_extension_array_equal(res, expected)
@pytest.mark.parametrize("ordered", [True, False])
def test_astype(self, ordered):
# string
cat = Categorical(list("abbaaccc"), ordered=ordered)
result = cat.astype(object)
expected = np.array(cat)
tm.assert_numpy_array_equal(result, expected)
msg = r"Cannot cast object|str dtype to float64"
with pytest.raises(ValueError, match=msg):
cat.astype(float)
# numeric
cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
result = cat.astype(object)
expected = np.array(cat, dtype=object)
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(int)
expected = np.array(cat, dtype="int")
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(float)
expected = np.array(cat, dtype=float)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype_ordered", [True, False])
@pytest.mark.parametrize("cat_ordered", [True, False])
def test_astype_category(self, dtype_ordered, cat_ordered):
# GH#10696/GH#18593
data = list("abcaacbab")
cat = Categorical(data, categories=list("bac"), ordered=cat_ordered)
# standard categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered)
tm.assert_categorical_equal(result, expected)
# non-standard categories
dtype = CategoricalDtype(list("adc"), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)
if dtype_ordered is False:
# dtype='category' can't specify ordered, so only test once
result = cat.astype("category")
expected = cat
tm.assert_categorical_equal(result, expected)
def test_astype_object_datetime_categories(self):
# GH#40754
cat = Categorical(to_datetime(["2021-03-27", NaT]))
result = cat.astype(object)
expected = np.array([Timestamp("2021-03-27 00:00:00"), NaT], dtype="object")
tm.assert_numpy_array_equal(result, expected)
def test_astype_object_timestamp_categories(self):
# GH#18024
cat = Categorical([Timestamp("2014-01-01")])
result = cat.astype(object)
expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object")
tm.assert_numpy_array_equal(result, expected)
def test_astype_category_readonly_mask_values(self):
# GH#53658
arr = array([0, 1, 2], dtype="Int64")
arr._mask.flags["WRITEABLE"] = False
result = arr.astype("category")
expected = array([0, 1, 2], dtype="Int64").astype("category")
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,787 @@
from datetime import (
date,
datetime,
)
import numpy as np
import pytest
from pandas._config import using_string_dtype
from pandas.compat import HAS_PYARROW
from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DatetimeIndex,
Index,
Interval,
IntervalIndex,
MultiIndex,
NaT,
Series,
Timestamp,
date_range,
period_range,
timedelta_range,
)
import pandas._testing as tm
class TestCategoricalConstructors:
def test_fastpath_deprecated(self):
codes = np.array([1, 2, 3])
dtype = CategoricalDtype(categories=["a", "b", "c", "d"], ordered=False)
msg = "The 'fastpath' keyword in Categorical is deprecated"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
Categorical(codes, dtype=dtype, fastpath=True)
def test_categorical_from_cat_and_dtype_str_preserve_ordered(self):
# GH#49309 we should preserve orderedness in `res`
cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True)
res = Categorical(cat, dtype="category")
assert res.dtype.ordered
def test_categorical_disallows_scalar(self):
# GH#38433
with pytest.raises(TypeError, match="Categorical input must be list-like"):
Categorical("A", categories=["A", "B"])
def test_categorical_1d_only(self):
# ndim > 1
msg = "> 1 ndim Categorical are not supported at this time"
with pytest.raises(NotImplementedError, match=msg):
Categorical(np.array([list("abcd")]))
def test_validate_ordered(self):
# see gh-14058
exp_msg = "'ordered' must either be 'True' or 'False'"
exp_err = TypeError
# This should be a boolean.
ordered = np.array([0, 1, 2])
with pytest.raises(exp_err, match=exp_msg):
Categorical([1, 2, 3], ordered=ordered)
with pytest.raises(exp_err, match=exp_msg):
Categorical.from_codes(
[0, 0, 1], categories=["a", "b", "c"], ordered=ordered
)
def test_constructor_empty(self):
# GH 17248
c = Categorical([])
expected = Index([])
tm.assert_index_equal(c.categories, expected)
c = Categorical([], categories=[1, 2, 3])
expected = Index([1, 2, 3], dtype=np.int64)
tm.assert_index_equal(c.categories, expected)
def test_constructor_empty_boolean(self):
# see gh-22702
cat = Categorical([], categories=[True, False])
categories = sorted(cat.categories.tolist())
assert categories == [False, True]
def test_constructor_tuples(self):
values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
result = Categorical(values)
expected = Index([(1,), (1, 2)], tupleize_cols=False)
tm.assert_index_equal(result.categories, expected)
assert result.ordered is False
def test_constructor_tuples_datetimes(self):
# numpy will auto reshape when all of the tuples are the
# same len, so add an extra one with 2 items and slice it off
values = np.array(
[
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
("a", "b"),
],
dtype=object,
)[:-1]
result = Categorical(values)
expected = Index(
[(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)],
tupleize_cols=False,
)
tm.assert_index_equal(result.categories, expected)
def test_constructor_unsortable(self):
# it works!
arr = np.array([1, 2, 3, datetime.now()], dtype="O")
factor = Categorical(arr, ordered=False)
assert not factor.ordered
# this however will raise as cannot be sorted
msg = (
"'values' is not ordered, please explicitly specify the "
"categories order by passing in a categories argument."
)
with pytest.raises(TypeError, match=msg):
Categorical(arr, ordered=True)
def test_constructor_interval(self):
result = Categorical(
[Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True
)
ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)])
exp = Categorical(ii, ordered=True)
tm.assert_categorical_equal(result, exp)
tm.assert_index_equal(result.categories, ii)
def test_constructor(self):
exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
c1 = Categorical(exp_arr)
tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["c", "b", "a"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
# categories must be unique
msg = "Categorical categories must be unique"
with pytest.raises(ValueError, match=msg):
Categorical([1, 2], [1, 2, 2])
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ["a", "b", "b"])
# The default should be unordered
c1 = Categorical(["a", "b", "c", "a"])
assert not c1.ordered
# Categorical as input
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))
# Series of dtype category
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
# Series
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(Series(["a", "b", "c", "a"]))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"])
tm.assert_categorical_equal(c1, c2)
# This should result in integer categories, not float!
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
assert is_integer_dtype(cat.categories)
# https://github.com/pandas-dev/pandas/issues/3678
cat = Categorical([np.nan, 1, 2, 3])
assert is_integer_dtype(cat.categories)
# this should result in floats
cat = Categorical([np.nan, 1, 2.0, 3])
assert is_float_dtype(cat.categories)
cat = Categorical([np.nan, 1.0, 2.0, 3.0])
assert is_float_dtype(cat.categories)
# This doesn't work -> this would probably need some kind of "remember
# the original type" feature to try to cast the array interface result
# to...
# vals = np.asarray(cat[cat.notna()])
# assert is_integer_dtype(vals)
# corner cases
cat = Categorical([1])
assert len(cat.categories) == 1
assert cat.categories[0] == 1
assert len(cat.codes) == 1
assert cat.codes[0] == 0
cat = Categorical(["a"])
assert len(cat.categories) == 1
assert cat.categories[0] == "a"
assert len(cat.codes) == 1
assert cat.codes[0] == 0
# two arrays
# - when the first is an integer dtype and the second is not
# - when the resulting codes are all -1/NaN
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])
# the next one are from the old docs
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
cat = Categorical([1, 2], categories=[1, 2, 3])
# this is a legitimate constructor
with tm.assert_produces_warning(None):
Categorical(np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True)
def test_constructor_with_existing_categories(self):
# GH25318: constructing with pd.Series used to bogusly skip recoding
# categories
c0 = Categorical(["a", "b", "c", "a"])
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
c2 = Categorical(c0, categories=c1.categories)
tm.assert_categorical_equal(c1, c2)
c3 = Categorical(Series(c0), categories=c1.categories)
tm.assert_categorical_equal(c1, c3)
def test_constructor_not_sequence(self):
# https://github.com/pandas-dev/pandas/issues/16022
msg = r"^Parameter 'categories' must be list-like, was"
with pytest.raises(TypeError, match=msg):
Categorical(["a", "b"], categories="a")
def test_constructor_with_null(self):
# Cannot have NaN in categories
msg = "Categorical categories cannot be null"
with pytest.raises(ValueError, match=msg):
Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical(
DatetimeIndex(["nat", "20160101"]),
categories=[NaT, Timestamp("20160101")],
)
def test_constructor_with_index(self):
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(ci.values, Categorical(ci))
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(
ci.values, Categorical(ci.astype(object), categories=ci.categories)
)
def test_constructor_with_generator(self):
# This was raising an Error in isna(single_val).any() because isna
# returned a scalar for a generator
exp = Categorical([0, 1, 2])
cat = Categorical(x for x in [0, 1, 2])
tm.assert_categorical_equal(cat, exp)
cat = Categorical(range(3))
tm.assert_categorical_equal(cat, exp)
MultiIndex.from_product([range(5), ["a", "b", "c"]])
# check that categories accept generators and sequences
cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
tm.assert_categorical_equal(cat, exp)
cat = Categorical([0, 1, 2], categories=range(3))
tm.assert_categorical_equal(cat, exp)
def test_constructor_with_rangeindex(self):
# RangeIndex is preserved in Categories
rng = Index(range(3))
cat = Categorical(rng)
tm.assert_index_equal(cat.categories, rng, exact=True)
cat = Categorical([1, 2, 0], categories=rng)
tm.assert_index_equal(cat.categories, rng, exact=True)
@pytest.mark.parametrize(
"dtl",
[
date_range("1995-01-01 00:00:00", periods=5, freq="s"),
date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"),
timedelta_range("1 day", periods=5, freq="s"),
],
)
def test_constructor_with_datetimelike(self, dtl):
# see gh-12077
# constructor with a datetimelike and NaT
s = Series(dtl)
c = Categorical(s)
expected = type(dtl)(s)
expected._data.freq = None
tm.assert_index_equal(c.categories, expected)
tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8"))
# with NaT
s2 = s.copy()
s2.iloc[-1] = NaT
c = Categorical(s2)
expected = type(dtl)(s2.dropna())
expected._data.freq = None
tm.assert_index_equal(c.categories, expected)
exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
tm.assert_numpy_array_equal(c.codes, exp)
result = repr(c)
assert "NaT" in result
def test_constructor_from_index_series_datetimetz(self):
idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern")
idx = idx._with_freq(None) # freq not preserved in result.categories
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_date_objects(self):
# we dont cast date objects to timestamps, matching Index constructor
v = date.today()
cat = Categorical([v, v])
assert cat.categories.dtype == object
assert type(cat.categories[0]) is date
def test_constructor_from_index_series_timedelta(self):
idx = timedelta_range("1 days", freq="D", periods=3)
idx = idx._with_freq(None) # freq not preserved in result.categories
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_from_index_series_period(self):
idx = period_range("2015-01-01", freq="D", periods=3)
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
@pytest.mark.parametrize(
"values",
[
np.array([1.0, 1.2, 1.8, np.nan]),
np.array([1, 2, 3], dtype="int64"),
["a", "b", "c", np.nan],
[pd.Period("2014-01"), pd.Period("2014-02"), NaT],
[Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT],
[
Timestamp("2014-01-01", tz="US/Eastern"),
Timestamp("2014-01-02", tz="US/Eastern"),
NaT,
],
],
)
def test_constructor_invariant(self, values):
# GH 14190
c = Categorical(values)
c2 = Categorical(c)
tm.assert_categorical_equal(c, c2)
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_with_dtype(self, ordered):
categories = ["b", "a", "c"]
dtype = CategoricalDtype(categories, ordered=ordered)
result = Categorical(["a", "b", "a", "c"], dtype=dtype)
expected = Categorical(
["a", "b", "a", "c"], categories=categories, ordered=ordered
)
tm.assert_categorical_equal(result, expected)
assert result.ordered is ordered
def test_constructor_dtype_and_others_raises(self):
dtype = CategoricalDtype(["a", "b"], ordered=True)
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], categories=["a", "b"], dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=True, dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=False, dtype=dtype)
@pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_str_category(self, categories, ordered):
result = Categorical(
["a", "b"], categories=categories, ordered=ordered, dtype="category"
)
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")
@pytest.mark.xfail(
using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings"
)
def test_constructor_np_strs(self):
# GH#31499 Hashtable.map_locations needs to work on np.str_ objects
cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
assert all(isinstance(x, np.str_) for x in cat.categories)
def test_constructor_from_categorical_with_dtype(self):
dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use dtype.categories, not values.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_with_unknown_dtype(self):
dtype = CategoricalDtype(None, ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use values.categories, not dtype.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "d"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_string(self):
values = Categorical(["a", "b", "d"])
# use categories, ordered
result = Categorical(
values, categories=["a", "b", "c"], ordered=True, dtype="category"
)
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
# No string
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
tm.assert_categorical_equal(result, expected)
def test_constructor_with_categorical_categories(self):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("klass", [lambda x: np.array(x, dtype=object), list])
def test_construction_with_null(self, klass, nulls_fixture):
# https://github.com/pandas-dev/pandas/issues/31927
values = klass(["a", nulls_fixture, "b"])
result = Categorical(values)
dtype = CategoricalDtype(["a", "b"])
codes = [0, -1, 1]
expected = Categorical.from_codes(codes=codes, dtype=dtype)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("validate", [True, False])
def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype, validate):
# GH#39649
cats = pd.array(range(5), dtype=any_numeric_ea_dtype)
codes = np.random.default_rng(2).integers(5, size=3)
dtype = CategoricalDtype(cats)
arr = Categorical.from_codes(codes, dtype=dtype, validate=validate)
assert arr.categories.dtype == cats.dtype
tm.assert_index_equal(arr.categories, Index(cats))
def test_from_codes_empty(self):
cat = ["a", "b", "c"]
result = Categorical.from_codes([], categories=cat)
expected = Categorical([], categories=cat)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("validate", [True, False])
def test_from_codes_validate(self, validate):
# GH53122
dtype = CategoricalDtype(["a", "b"])
if validate:
with pytest.raises(ValueError, match="codes need to be between "):
Categorical.from_codes([4, 5], dtype=dtype, validate=validate)
else:
# passes, though has incorrect codes, but that's the user responsibility
Categorical.from_codes([4, 5], dtype=dtype, validate=validate)
def test_from_codes_too_few_categories(self):
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be between "
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], dtype=dtype)
def test_from_codes_non_int_codes(self):
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], dtype=dtype)
def test_from_codes_non_unique_categories(self):
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
def test_from_codes_nan_cat_included(self):
with pytest.raises(ValueError, match="Categorical categories cannot be null"):
Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
def test_from_codes_too_negative(self):
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = r"codes need to be between -1 and len\(categories\)-1"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], dtype=dtype)
def test_from_codes(self):
dtype = CategoricalDtype(categories=["a", "b", "c"])
exp = Categorical(["a", "b", "c"], ordered=False)
res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
tm.assert_categorical_equal(exp, res)
res = Categorical.from_codes([0, 1, 2], dtype=dtype)
tm.assert_categorical_equal(exp, res)
@pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
def test_from_codes_with_categorical_categories(self, klass):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical.from_codes([0, 1], categories=klass(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
def test_from_codes_with_non_unique_categorical_categories(self, klass):
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1], klass(["a", "b", "a"]))
def test_from_codes_with_nan_code(self):
# GH21767
codes = [1, 2, np.nan]
dtype = CategoricalDtype(categories=["a", "b", "c"])
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, categories=dtype.categories)
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)
@pytest.mark.parametrize("codes", [[1.0, 2.0, 0], [1.1, 2.0, 0]])
def test_from_codes_with_float(self, codes):
# GH21767
# float codes should raise even if values are equal to integers
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, dtype=dtype)
def test_from_codes_with_dtype_raises(self):
msg = "Cannot specify"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"])
)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"])
)
def test_from_codes_neither(self):
msg = "Both were None"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1])
def test_from_codes_with_nullable_int(self):
codes = pd.array([0, 1], dtype="Int64")
categories = ["a", "b"]
result = Categorical.from_codes(codes, categories=categories)
expected = Categorical.from_codes(codes.to_numpy(int), categories=categories)
tm.assert_categorical_equal(result, expected)
def test_from_codes_with_nullable_int_na_raises(self):
codes = pd.array([0, None], dtype="Int64")
categories = ["a", "b"]
msg = "codes cannot contain NA values"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, categories=categories)
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories(self, dtype):
cats = ["a", "b"]
codes = np.array([0, 0, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes(codes, cats)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories_sorts(self, dtype):
cats = ["b", "a"]
codes = np.array([0, 1, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_dtype(self):
cats = ["a", "b", "d"]
codes = np.array([0, 1, 0, 2], dtype="i8")
dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical(
["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_coerces(self):
cats = ["1", "2", "bad"]
codes = np.array([0, 0, 1, 2], dtype="i8")
dtype = CategoricalDtype([1, 2])
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical([1, 1, 2, np.nan])
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("ordered", [None, True, False])
def test_construction_with_ordered(self, ordered):
# GH 9347, 9190
cat = Categorical([0, 1, 2], ordered=ordered)
assert cat.ordered == bool(ordered)
def test_constructor_imaginary(self):
values = [1, 2, 3 + 1j]
c1 = Categorical(values)
tm.assert_index_equal(c1.categories, Index(values))
tm.assert_numpy_array_equal(np.array(c1), np.array(values))
def test_constructor_string_and_tuples(self):
# GH 21416
c = Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
expected_index = Index([("a", "b"), ("b", "a"), "c"])
assert c.categories.equals(expected_index)
def test_interval(self):
idx = pd.interval_range(0, 10, periods=10)
cat = Categorical(idx, categories=idx)
expected_codes = np.arange(10, dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# infer categories
cat = Categorical(idx)
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# list values
cat = Categorical(list(idx))
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# list values, categories
cat = Categorical(list(idx), categories=list(idx))
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# shuffled
values = idx.take([1, 2, 0])
cat = Categorical(values, categories=idx)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8"))
tm.assert_index_equal(cat.categories, idx)
# extra
values = pd.interval_range(8, 11, periods=3)
cat = Categorical(values, categories=idx)
expected_codes = np.array([8, 9, -1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# overlapping
idx = IntervalIndex([Interval(0, 2), Interval(0, 1)])
cat = Categorical(idx, categories=idx)
expected_codes = np.array([0, 1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
def test_categorical_extension_array_nullable(self, nulls_fixture):
# GH:
arr = pd.arrays.StringArray._from_sequence(
[nulls_fixture] * 2, dtype=pd.StringDtype()
)
result = Categorical(arr)
assert arr.dtype == result.categories.dtype
expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
tm.assert_categorical_equal(result, expected)
def test_from_sequence_copy(self):
cat = Categorical(np.arange(5).repeat(2))
result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=False)
# more generally, we'd be OK with a view
assert result._codes is cat._codes
result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=True)
assert not tm.shares_memory(result, cat)
def test_constructor_datetime64_non_nano(self):
categories = np.arange(10).view("M8[D]")
values = categories[::2].copy()
cat = Categorical(values, categories=categories)
assert (cat == values).all()
def test_constructor_preserves_freq(self):
# GH33830 freq retention in categorical
dti = date_range("2016-01-01", periods=5)
expected = dti.freq
cat = Categorical(dti)
result = cat.categories.freq
assert expected == result

View File

@ -0,0 +1,139 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas import (
Categorical,
CategoricalIndex,
Index,
IntervalIndex,
Series,
Timestamp,
)
import pandas._testing as tm
class TestCategoricalDtypes:
    """Dtype-related behaviour of ``Categorical``: matching, recoding, codes width."""

    def test_categories_match_up_to_permutation(self):
        """Same categories in a different order match; differing ``ordered`` does not."""
        # test dtype comparisons between cats
        c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)
        c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False)
        c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True)
        assert c1._categories_match_up_to_permutation(c1)
        assert c2._categories_match_up_to_permutation(c2)
        assert c3._categories_match_up_to_permutation(c3)
        assert c1._categories_match_up_to_permutation(c2)
        assert not c1._categories_match_up_to_permutation(c3)
        assert not c1._categories_match_up_to_permutation(Index(list("aabca")))
        assert not c1._categories_match_up_to_permutation(c1.astype(object))
        assert c1._categories_match_up_to_permutation(CategoricalIndex(c1))
        assert c1._categories_match_up_to_permutation(
            CategoricalIndex(c1, categories=list("cab"))
        )
        assert not c1._categories_match_up_to_permutation(
            CategoricalIndex(c1, ordered=True)
        )

        # GH 16659
        s1 = Series(c1)
        s2 = Series(c2)
        s3 = Series(c3)
        assert c1._categories_match_up_to_permutation(s1)
        assert c2._categories_match_up_to_permutation(s2)
        assert c3._categories_match_up_to_permutation(s3)
        assert c1._categories_match_up_to_permutation(s2)
        assert not c1._categories_match_up_to_permutation(s3)
        assert not c1._categories_match_up_to_permutation(s1.astype(object))

    def test_set_dtype_same(self):
        """Setting an equal dtype returns an equal Categorical."""
        c = Categorical(["a", "b", "c"])
        result = c._set_dtype(CategoricalDtype(["a", "b", "c"]))
        tm.assert_categorical_equal(result, c)

    def test_set_dtype_new_categories(self):
        """Appending a category leaves the existing codes untouched."""
        c = Categorical(["a", "b", "c"])
        result = c._set_dtype(CategoricalDtype(list("abcd")))
        tm.assert_numpy_array_equal(result.codes, c.codes)
        tm.assert_index_equal(result.dtype.categories, Index(list("abcd")))

    @pytest.mark.parametrize(
        "values, categories, new_categories",
        [
            # No NaNs, same cats, same order
            (["a", "b", "a"], ["a", "b"], ["a", "b"]),
            # No NaNs, same cats, different order
            (["a", "b", "a"], ["a", "b"], ["b", "a"]),
            # Same, unsorted
            (["b", "a", "a"], ["a", "b"], ["a", "b"]),
            # No NaNs, same cats, different order
            (["b", "a", "a"], ["a", "b"], ["b", "a"]),
            # NaNs
            (["a", "b", "c"], ["a", "b"], ["a", "b"]),
            (["a", "b", "c"], ["a", "b"], ["b", "a"]),
            (["b", "a", "c"], ["a", "b"], ["a", "b"]),
            # was an exact duplicate of the previous case; the reordered-categories
            # variant for unsorted values was never exercised
            (["b", "a", "c"], ["a", "b"], ["b", "a"]),
            # Introduce NaNs
            (["a", "b", "c"], ["a", "b"], ["a"]),
            (["a", "b", "c"], ["a", "b"], ["b"]),
            (["b", "a", "c"], ["a", "b"], ["a"]),
            # was an exact duplicate of the previous case; keep-only-"b" for
            # unsorted values was never exercised
            (["b", "a", "c"], ["a", "b"], ["b"]),
            # No overlap
            (["a", "b", "c"], ["a", "b"], ["d", "e"]),
        ],
    )
    @pytest.mark.parametrize("ordered", [True, False])
    def test_set_dtype_many(self, values, categories, new_categories, ordered):
        """``_set_dtype`` must agree with constructing directly with the new dtype."""
        c = Categorical(values, categories)
        expected = Categorical(values, new_categories, ordered)
        result = c._set_dtype(expected.dtype)
        tm.assert_categorical_equal(result, expected)

    def test_set_dtype_no_overlap(self):
        """With no shared categories every value becomes missing."""
        c = Categorical(["a", "b", "c"], ["d", "e"])
        result = c._set_dtype(CategoricalDtype(["a", "b"]))
        expected = Categorical([None, None, None], categories=["a", "b"])
        tm.assert_categorical_equal(result, expected)

    def test_codes_dtypes(self):
        """The codes dtype widens and narrows with the number of categories."""
        # GH 8453
        result = Categorical(["foo", "bar", "baz"])
        assert result.codes.dtype == "int8"

        result = Categorical([f"foo{i:05d}" for i in range(400)])
        assert result.codes.dtype == "int16"

        result = Categorical([f"foo{i:05d}" for i in range(40000)])
        assert result.codes.dtype == "int32"

        # adding cats
        result = Categorical(["foo", "bar", "baz"])
        assert result.codes.dtype == "int8"
        result = result.add_categories([f"foo{i:05d}" for i in range(400)])
        assert result.codes.dtype == "int16"

        # removing cats
        result = result.remove_categories([f"foo{i:05d}" for i in range(300)])
        assert result.codes.dtype == "int8"

    def test_iter_python_types(self):
        """Iteration and ``tolist`` yield Python ``int`` objects."""
        # GH-19909
        cat = Categorical([1, 2])
        assert isinstance(next(iter(cat)), int)
        assert isinstance(cat.tolist()[0], int)

    def test_iter_python_types_datetime(self):
        """Iteration and ``tolist`` yield ``Timestamp`` objects for datetimes."""
        cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")])
        assert isinstance(next(iter(cat)), Timestamp)
        assert isinstance(cat.tolist()[0], Timestamp)

    def test_interval_index_category(self):
        """Interval categories keep their ``uint64`` subtype."""
        # GH 38316
        index = IntervalIndex.from_breaks(np.arange(3, dtype="uint64"))

        result = CategoricalIndex(index).dtype.categories
        expected = IntervalIndex.from_arrays(
            [0, 1], [1, 2], dtype="interval[uint64, right]"
        )
        tm.assert_index_equal(result, expected)

View File

@ -0,0 +1,388 @@
import math
import numpy as np
import pytest
from pandas import (
NA,
Categorical,
CategoricalIndex,
Index,
Interval,
IntervalIndex,
NaT,
PeriodIndex,
Series,
Timedelta,
Timestamp,
)
import pandas._testing as tm
import pandas.core.common as com
class TestCategoricalIndexingWithFactor:
    """``__getitem__`` / ``__setitem__`` checks on small ``Categorical`` objects."""

    def test_getitem(self):
        """Scalar, positional-list and boolean-mask lookups."""
        factor = Categorical(list("abbaaccc"), ordered=True)
        assert factor[0] == "a"
        assert factor[-1] == "c"

        by_position = factor[[0, 1, 2]]
        tm.assert_numpy_array_equal(
            by_position._codes, np.array([0, 1, 1], dtype=np.int8)
        )

        by_mask = factor[np.asarray(factor) == "c"]
        tm.assert_numpy_array_equal(
            by_mask._codes, np.array([2, 2, 2], dtype=np.int8)
        )

    def test_setitem(self):
        """Positional and boolean-mask assignment of existing categories."""
        factor = Categorical(list("abbaaccc"), ordered=True)

        # int/positional
        positional = factor.copy()
        positional[0] = "b"
        assert positional[0] == "b"
        positional[-1] = "a"
        assert positional[-1] == "a"

        # boolean
        masked = factor.copy()
        ends = np.zeros(len(masked), dtype="bool")
        ends[[0, -1]] = True
        masked[ends] = "c"

        expected = Categorical(list("cbbaaccc"), ordered=True)
        tm.assert_categorical_equal(masked, expected)

    @pytest.mark.parametrize(
        "other",
        [Categorical(["b", "a"]), Categorical(["b", "a"], categories=["b", "a"])],
    )
    def test_setitem_same_but_unordered(self, other):
        """Unordered categoricals with the same category set can be assigned."""
        # GH-24142
        target = Categorical(["a", "b"], categories=["a", "b"])
        first_only = np.array([True, False])
        target[first_only] = other[first_only]

        tm.assert_categorical_equal(
            target, Categorical(["b", "b"], categories=["a", "b"])
        )

    @pytest.mark.parametrize(
        "other",
        [
            Categorical(["b", "a"], categories=["b", "a", "c"]),
            Categorical(["b", "a"], categories=["a", "b", "c"]),
            Categorical(["a", "a"], categories=["a"]),
            Categorical(["b", "b"], categories=["b"]),
        ],
    )
    def test_setitem_different_unordered_raises(self, other):
        """A different category set is rejected with ``TypeError``."""
        # GH-24142
        target = Categorical(["a", "b"], categories=["a", "b"])
        first_only = np.array([True, False])
        msg = "Cannot set a Categorical with another, without identical categories"
        with pytest.raises(TypeError, match=msg):
            target[first_only] = other[first_only]

    @pytest.mark.parametrize(
        "other",
        [
            Categorical(["b", "a"]),
            Categorical(["b", "a"], categories=["b", "a"], ordered=True),
            Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True),
        ],
    )
    def test_setitem_same_ordered_raises(self, other):
        """An ordered target rejects each of the parametrized ``other`` values."""
        # GH-24142
        target = Categorical(["a", "b"], categories=["a", "b"], ordered=True)
        first_only = np.array([True, False])
        msg = "Cannot set a Categorical with another, without identical categories"
        with pytest.raises(TypeError, match=msg):
            target[first_only] = other[first_only]

    def test_setitem_tuple(self):
        """Tuple-valued categories can be assigned element to element."""
        # GH#20439
        cat = Categorical([(0, 1), (0, 2), (0, 1)])

        # This should not raise
        cat[1] = cat[0]
        assert cat[1] == (0, 1)

    def test_setitem_listlike(self):
        """An int64 array indexer beyond the int8 range is honoured on assignment."""
        # GH#9469
        # properly coerce the input indexers
        raw = np.random.default_rng(2).integers(0, 5, size=150000).astype(np.int8)
        cat = Categorical(raw).add_categories([-1000])
        position = np.array([100000]).astype(np.int64)
        cat[position] = -1000

        # we are asserting the code result here
        # which maps to the -1000 category
        observed = cat.codes[position]
        tm.assert_numpy_array_equal(observed, np.array([5], dtype="int8"))
class TestCategoricalIndexing:
    """Slicing, period categories, ``get_indexer`` and ``Series.where`` checks."""

    def test_getitem_slice(self):
        """An integer key returns a scalar; a slice returns a Categorical."""
        cat = Categorical(list("abcdabc"))
        assert cat[3] == "d"

        window = cat[3:5]
        expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"])
        tm.assert_categorical_equal(window, expected)

    def test_getitem_listlike(self):
        """Indexing codes directly equals indexing first and then taking codes."""
        # GH 9469
        # properly coerce the input indexers
        raw = np.random.default_rng(2).integers(0, 5, size=150000).astype(np.int8)
        c = Categorical(raw)
        from_codes = c.codes[np.array([100000]).astype(np.int64)]
        from_getitem = c[np.array([100000]).astype(np.int64)].codes
        tm.assert_numpy_array_equal(from_codes, from_getitem)

    def test_periodindex(self):
        """Check codes and categories of Categoricals built from PeriodIndex input."""
        months_sorted = PeriodIndex(
            ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"],
            freq="M",
        )
        cat_sorted = Categorical(months_sorted)
        str(cat_sorted)  # result unused; presumably a repr smoke check
        tm.assert_numpy_array_equal(
            cat_sorted._codes, np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
        )
        tm.assert_index_equal(
            cat_sorted.categories,
            PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M"),
        )

        months_shuffled = PeriodIndex(
            ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"],
            freq="M",
        )
        cat_shuffled = Categorical(months_shuffled, ordered=True)
        str(cat_shuffled)  # result unused; presumably a repr smoke check
        tm.assert_numpy_array_equal(
            cat_shuffled._codes, np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
        )
        tm.assert_index_equal(
            cat_shuffled.categories,
            PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M"),
        )

        descending = [
            "2013-12",
            "2013-11",
            "2013-10",
            "2013-09",
            "2013-08",
            "2013-07",
            "2013-05",
        ]
        ascending = [
            "2013-05",
            "2013-07",
            "2013-08",
            "2013-09",
            "2013-10",
            "2013-11",
            "2013-12",
        ]
        cat_descending = Categorical(PeriodIndex(descending, freq="M"), ordered=True)
        tm.assert_numpy_array_equal(
            cat_descending._codes, np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
        )
        tm.assert_index_equal(
            cat_descending.categories, PeriodIndex(ascending, freq="M")
        )

    @pytest.mark.parametrize(
        "null_val",
        [None, np.nan, NaT, NA, math.nan, "NaT", "nat", "NAT", "nan", "NaN", "NAN"],
    )
    def test_periodindex_on_null_types(self, null_val):
        """Every listed null spelling becomes ``NaT`` in a PeriodIndex."""
        # GH 46673
        result = PeriodIndex(["2022-04-06", "2022-04-07", null_val], freq="D")
        expected = PeriodIndex(
            ["2022-04-06", "2022-04-07", "NaT"], dtype="period[D]"
        )
        assert result[2] is NaT
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
    def test_categories_assignments_wrong_length_raises(self, new_categories):
        """Renaming with too many or too few categories raises ``ValueError``."""
        cat = Categorical(["a", "b", "c", "a"])
        msg = (
            "new categories need to have the same number of items "
            "as the old categories!"
        )
        with pytest.raises(ValueError, match=msg):
            cat.rename_categories(new_categories)

    # Combinations of sorted/unique:
    @pytest.mark.parametrize(
        "idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]]
    )
    # Combinations of missing/unique
    @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
    @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
    @pytest.mark.parametrize("dtype", [None, "category", "key"])
    def test_get_indexer_non_unique(self, idx_values, key_values, key_class, dtype):
        """A categorical key must index exactly like its plain list of values."""
        # GH 21448
        key = key_class(key_values, categories=range(1, 5))
        # "key" is a placeholder meaning "reuse the key's own categorical dtype"
        index_dtype = key.dtype if dtype == "key" else dtype

        # Test for flat index and CategoricalIndex with same/different cats:
        idx = Index(idx_values, dtype=index_dtype)
        expected, exp_miss = idx.get_indexer_non_unique(key_values)
        result, res_miss = idx.get_indexer_non_unique(key)

        tm.assert_numpy_array_equal(expected, result)
        tm.assert_numpy_array_equal(exp_miss, res_miss)

        exp_unique = idx.unique().get_indexer(key_values)
        res_unique = idx.unique().get_indexer(key)
        tm.assert_numpy_array_equal(res_unique, exp_unique)

    def test_where_unobserved_nan(self):
        """Masked-out positions become missing; categories are kept."""
        partly_masked = Series(Categorical(["a", "b"])).where([True, False])
        tm.assert_series_equal(
            partly_masked, Series(Categorical(["a", None], categories=["a", "b"]))
        )

        # all NA
        fully_masked = Series(Categorical(["a", "b"])).where([False, False])
        tm.assert_series_equal(
            fully_masked, Series(Categorical([None, None], categories=["a", "b"]))
        )

    def test_where_unobserved_categories(self):
        """A scalar replacement that is an existing category is accepted."""
        ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
        replaced = ser.where([True, True, False], other="b")
        wanted = Series(Categorical(["a", "b", "b"], categories=ser.cat.categories))
        tm.assert_series_equal(replaced, wanted)

    def test_where_other_categorical(self):
        """A Categorical ``other`` with reordered categories is accepted."""
        ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
        other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"])
        replaced = ser.where([True, False, True], other)
        wanted = Series(Categorical(["a", "c", "c"], dtype=ser.dtype))
        tm.assert_series_equal(replaced, wanted)

    def test_where_new_category_raises(self):
        """Replacing with a value outside the categories raises ``TypeError``."""
        ser = Series(Categorical(["a", "b", "c"]))
        msg = "Cannot setitem on a Categorical with a new category"
        with pytest.raises(TypeError, match=msg):
            ser.where([True, False, True], "d")

    def test_where_ordered_differs_rasies(self):
        """Ordered categoricals whose category order differs are rejected."""
        ordered_values = Categorical(
            ["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True
        )
        ser = Series(ordered_values)
        other = Categorical(
            ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True
        )
        with pytest.raises(TypeError, match="without identical categories"):
            ser.where([True, False, True], other)
class TestContains:
    """Membership (``in``) checks against ``Categorical``."""

    def test_contains(self):
        """Values are found, codes are not, NaN only when actually present."""
        # GH#21508
        without_nan = Categorical(list("aabbca"), categories=list("cab"))

        assert "b" in without_nan
        assert "z" not in without_nan
        assert np.nan not in without_nan
        with pytest.raises(TypeError, match="unhashable type: 'list'"):
            assert [1] in without_nan

        # assert codes NOT in index
        assert 0 not in without_nan
        assert 1 not in without_nan

        with_nan = Categorical([*"aabbca", np.nan], categories=list("cab"))
        assert np.nan in with_nan

    @pytest.mark.parametrize(
        "item, expected",
        [
            (Interval(0, 1), True),
            (1.5, True),
            (Interval(0.5, 1.5), False),
            ("a", False),
            (Timestamp(1), False),
            (Timedelta(1), False),
        ],
        ids=str,
    )
    def test_contains_interval(self, item, expected):
        """Membership against interval categories returns a real ``bool``."""
        # GH#23705
        cat = Categorical(IntervalIndex.from_breaks(range(3)))
        found = item in cat
        assert found is expected

    def test_contains_list(self):
        """List keys are unhashable and raise instead of returning False."""
        # GH#21729
        cat = Categorical([1, 2, 3])

        assert "a" not in cat

        for unhashable in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                unhashable in cat
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean(index):
    """A boolean Categorical/CategoricalIndex masks like its object-dtype form."""
    ser = Series(range(3))
    mask = Categorical([True, False, True])
    if index:
        mask = CategoricalIndex(mask)

    assert com.is_bool_indexer(mask)
    selected = ser[mask]
    reference = ser[mask.astype("object")]
    tm.assert_series_equal(selected, reference)
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean_na_treated_as_false(index):
    """Masking with a missing entry equals masking with it filled as ``False``."""
    # https://github.com/pandas-dev/pandas/issues/31503
    ser = Series(range(3))
    mask = Categorical([True, False, None])
    if index:
        mask = CategoricalIndex(mask)

    selected = ser[mask]
    reference = ser[mask.fillna(False)]

    tm.assert_series_equal(selected, reference)
@pytest.fixture
def non_coercible_categorical(monkeypatch):
    """
    Replace ``Categorical.__array__`` with a function that always raises.

    The patch is applied inside a ``monkeypatch.context()`` block wrapped
    around the ``yield``, so it is active while the requesting test runs.

    Raises
    ------
    ValueError
        When Categorical.__array__ is called.
    """

    # TODO(Categorical): identify other places where this may be
    # useful and move to a conftest.py
    def array(self, dtype=None):
        raise ValueError("I cannot be converted.")

    with monkeypatch.context() as patcher:
        patcher.setattr(Categorical, "__array__", array)
        yield
def test_series_at():
    """``Series.at`` on categorical data returns the stored value."""
    ser = Series(Categorical(["a", "b", "c"]))
    first = ser.at[0]
    assert first == "a"

View File

@ -0,0 +1,154 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
Index,
Series,
)
import pandas._testing as tm
@pytest.fixture(params=[None, "ignore"])
def na_action(request):
    """Parametrized fixture yielding ``None`` and ``"ignore"`` in turn."""
    current = request.param
    return current
@pytest.mark.parametrize(
    "data, categories",
    [
        (list("abcbca"), list("cab")),
        (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)),
    ],
    ids=["string", "interval"],
)
def test_map_str(data, categories, ordered, na_action):
    """Mapping ``str`` keeps the result categorical with the same orderedness."""
    # GH 31202 - override base class since we want to maintain categorical/ordered
    cat = Categorical(data, categories=categories, ordered=ordered)
    mapped = cat.map(str, na_action=na_action)

    str_values = map(str, data)
    str_categories = map(str, categories)
    expected = Categorical(str_values, categories=str_categories, ordered=ordered)
    tm.assert_categorical_equal(mapped, expected)
def test_map(na_action):
cat = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
result = cat.map(lambda x: x.lower(), na_action=na_action)
exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
tm.assert_categorical_equal(result, exp)
cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
result = cat.map(lambda x: x.lower(), na_action=na_action)
exp = Categorical(list("ababc"), categories=list("bac"), ordered=False)
tm.assert_categorical_equal(result, exp)
# GH 12766: Return an index not an array
result = cat.map(lambda x: 1, na_action=na_action)
exp = Index(np.array([1] * 5, dtype=np.int64))
tm.assert_index_equal(result, exp)
# change categories dtype
cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
def f(x):
return {"A": 10, "B": 20, "C": 30}.get(x)
result = cat.map(f, na_action=na_action)
exp = Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False)
tm.assert_categorical_equal(result, exp)
mapper = Series([10, 20, 30], index=["A", "B", "C"])
result = cat.map(mapper, na_action=na_action)
tm.assert_categorical_equal(result, exp)
result = cat.map({"A": 10, "B": 20, "C": 30}, na_action=na_action)
tm.assert_categorical_equal(result, exp)
@pytest.mark.parametrize(
    ("data", "f", "expected"),
    (
        ([1, 1, np.nan], pd.isna, Index([False, False, True])),
        ([1, 2, np.nan], pd.isna, Index([False, False, True])),
        ([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
        ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
        (
            [1, 1, np.nan],
            Series([False, False]),
            Categorical([False, False, np.nan]),
        ),
        (
            [1, 2, np.nan],
            Series([False] * 3),
            Index([False, False, np.nan]),
        ),
    ),
)
def test_map_with_nan_none(data, f, expected):  # GH 24241
    """Map data containing NaN with ``na_action=None`` and compare to ``expected``."""
    mapped = Categorical(data).map(f, na_action=None)
    # the expected container type decides which assertion helper applies
    check = (
        tm.assert_categorical_equal
        if isinstance(expected, Categorical)
        else tm.assert_index_equal
    )
    check(mapped, expected)
@pytest.mark.parametrize(
    ("data", "f", "expected"),
    (
        ([1, 1, np.nan], pd.isna, Categorical([False, False, np.nan])),
        ([1, 2, np.nan], pd.isna, Index([False, False, np.nan])),
        ([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
        ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
        (
            [1, 1, np.nan],
            Series([False, False]),
            Categorical([False, False, np.nan]),
        ),
        (
            [1, 2, np.nan],
            Series([False, False, False]),
            Index([False, False, np.nan]),
        ),
    ),
)
def test_map_with_nan_ignore(data, f, expected):  # GH 24241
    """Map data containing NaN with ``na_action="ignore"`` and compare to ``expected``.

    The assertion helper is chosen from the type of ``expected`` (as in
    ``test_map_with_nan_none``) rather than from the input data, so adding a
    case cannot silently pick the wrong comparison.
    """
    values = Categorical(data)
    result = values.map(f, na_action="ignore")
    if isinstance(expected, Categorical):
        tm.assert_categorical_equal(result, expected)
    else:
        tm.assert_index_equal(result, expected)
def test_map_with_dict_or_series(na_action):
orig_values = ["a", "B", 1, "a"]
new_values = ["one", 2, 3.0, "one"]
cat = Categorical(orig_values)
mapper = Series(new_values[:-1], index=orig_values[:-1])
result = cat.map(mapper, na_action=na_action)
# Order of categories in result can be different
expected = Categorical(new_values, categories=[3.0, 2, "one"])
tm.assert_categorical_equal(result, expected)
mapper = dict(zip(orig_values[:-1], new_values[:-1]))
result = cat.map(mapper, na_action=na_action)
# Order of categories in result can be different
tm.assert_categorical_equal(result, expected)
def test_map_na_action_no_default_deprecated():
# GH51645
cat = Categorical(["a", "b", "c"])
msg = (
"The default value of 'ignore' for the `na_action` parameter in "
"pandas.Categorical.map is deprecated and will be "
"changed to 'None' in a future version. Please set na_action to the "
"desired value to avoid seeing this warning"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
cat.map(lambda x: x)

View File

@ -0,0 +1,216 @@
import collections
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
Series,
isna,
)
import pandas._testing as tm
class TestCategoricalMissing:
    """Missing-value behaviour of ``Categorical``: detection, assignment, fillna."""

    def test_isna(self):
        """``isna`` flags the NaN entry only."""
        cat = Categorical(["a", "b", np.nan])
        tm.assert_numpy_array_equal(cat.isna(), np.array([False, False, True]))

    def test_na_flags_int_categories(self):
        """Values outside the integer categories (-1 here) are reported missing."""
        # #1457
        categories = list(range(10))
        labels = np.random.default_rng(2).integers(0, 10, 20)
        labels[::5] = -1

        cat = Categorical(labels, categories)
        repr(cat)  # result unused; presumably a repr smoke check

        tm.assert_numpy_array_equal(isna(cat), labels == -1)

    def test_nan_handling(self):
        """NaN is stored as code -1 and never becomes a category."""
        # Nans are represented as -1 in codes
        initial_codes = np.array([0, 1, -1, 0], dtype=np.int8)

        c = Categorical(["a", "b", np.nan, "a"])
        tm.assert_index_equal(c.categories, Index(["a", "b"]))
        tm.assert_numpy_array_equal(c._codes, initial_codes)

        c[1] = np.nan
        tm.assert_index_equal(c.categories, Index(["a", "b"]))
        tm.assert_numpy_array_equal(
            c._codes, np.array([0, -1, -1, 0], dtype=np.int8)
        )

        # A freshly built Categorical is checked again; the assertions are the
        # same as for the first construction above.
        c = Categorical(["a", "b", np.nan, "a"])
        tm.assert_index_equal(c.categories, Index(["a", "b"]))
        tm.assert_numpy_array_equal(c._codes, initial_codes)

    def test_set_dtype_nans(self):
        """Recoding keeps NaN as -1 and maps dropped categories to -1 too."""
        c = Categorical(["a", "b", np.nan])
        recoded = c._set_dtype(CategoricalDtype(["a", "c"]))
        tm.assert_numpy_array_equal(
            recoded.codes, np.array([0, -1, -1], dtype="int8")
        )

    def test_set_item_nan(self):
        """Assigning NaN keeps the full category list."""
        cat = Categorical([1, 2, 3])
        cat[1] = np.nan

        tm.assert_categorical_equal(
            cat, Categorical([1, np.nan, 3], categories=[1, 2, 3])
        )

    @pytest.mark.parametrize(
        "fillna_kwargs, msg",
        [
            (
                {"value": 1, "method": "ffill"},
                "Cannot specify both 'value' and 'method'.",
            ),
            ({}, "Must specify a fill 'value' or 'method'."),
            ({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
            (
                {"value": Series([1, 2, 3, 4, "a"])},
                "Cannot setitem on a Categorical with a new category",
            ),
        ],
    )
    def test_fillna_raises(self, fillna_kwargs, msg):
        """Invalid ``fillna`` arguments raise with the expected type and message."""
        # https://github.com/pandas-dev/pandas/issues/19682
        # https://github.com/pandas-dev/pandas/issues/13628
        cat = Categorical([1, 2, 3, None, None])

        # a lone bad ``value`` is a TypeError; argument-combination errors are
        # ValueErrors
        only_value = len(fillna_kwargs) == 1 and "value" in fillna_kwargs
        err = TypeError if only_value else ValueError

        with pytest.raises(err, match=msg):
            cat.fillna(**fillna_kwargs)

    @pytest.mark.parametrize("named", [True, False])
    def test_fillna_iterable_category(self, named):
        """Tuple-like categories can be used as fill values."""
        # https://github.com/pandas-dev/pandas/issues/21097
        if named:
            Point = collections.namedtuple("Point", "x y")
        else:
            Point = lambda *args: args  # tuple

        with_origin = Categorical(
            np.array([Point(0, 0), Point(0, 1), None], dtype=object)
        )
        filled = with_origin.fillna(Point(0, 0))
        tm.assert_categorical_equal(
            filled, Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])
        )

        # Case where the Point is not among our categories; the test expects
        # TypeError here (the original note contrasts this with
        # NotImplementedError, GH#41914)
        without_origin = Categorical(
            np.array([Point(1, 0), Point(0, 1), None], dtype=object)
        )
        msg = "Cannot setitem on a Categorical with a new category"
        with pytest.raises(TypeError, match=msg):
            without_origin.fillna(Point(0, 0))

    def test_fillna_array(self):
        """Array-like fill values are accepted and the original is not modified."""
        # accept Categorical or ndarray value if it holds appropriate values
        cat = Categorical(["A", "B", "C", None, None])

        cat_filler = cat.fillna("C")
        from_cat = cat.fillna(cat_filler)
        tm.assert_categorical_equal(from_cat, cat_filler)
        assert isna(cat[-1])  # didn't modify original inplace

        array_filler = np.array(["A", "B", "C", "B", "A"])
        from_array = cat.fillna(array_filler)
        tm.assert_categorical_equal(
            from_array, Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype)
        )
        assert isna(cat[-1])  # didn't modify original inplace

    @pytest.mark.parametrize(
        "values, expected",
        [
            ([1, 2, 3], np.array([False, False, False])),
            ([1, 2, np.nan], np.array([False, False, True])),
            ([1, 2, np.inf], np.array([False, False, True])),
            ([1, 2, pd.NA], np.array([False, False, True])),
        ],
    )
    def test_use_inf_as_na(self, values, expected):
        """With ``use_inf_as_na`` on, ``.isna()`` treats ``inf`` as missing."""
        # https://github.com/pandas-dev/pandas/issues/33594
        msg = "use_inf_as_na option is deprecated"
        with tm.assert_produces_warning(
            FutureWarning, match=msg
        ), pd.option_context("mode.use_inf_as_na", True):
            cat = Categorical(values)
            tm.assert_numpy_array_equal(cat.isna(), expected)

            expected_ser = Series(expected)
            tm.assert_series_equal(Series(cat).isna(), expected_ser)

            expected_frame = DataFrame(expected_ser)
            tm.assert_frame_equal(DataFrame(cat).isna(), expected_frame)

    @pytest.mark.parametrize(
        "values, expected",
        [
            ([1, 2, 3], np.array([False, False, False])),
            ([1, 2, np.nan], np.array([False, False, True])),
            ([1, 2, np.inf], np.array([False, False, True])),
            ([1, 2, pd.NA], np.array([False, False, True])),
        ],
    )
    def test_use_inf_as_na_outside_context(self, values, expected):
        """Same check, with the Categorical built before the option is enabled."""
        # https://github.com/pandas-dev/pandas/issues/33594
        # Using isna directly for Categorical will fail in general here
        cat = Categorical(values)

        msg = "use_inf_as_na option is deprecated"
        with tm.assert_produces_warning(
            FutureWarning, match=msg
        ), pd.option_context("mode.use_inf_as_na", True):
            tm.assert_numpy_array_equal(isna(cat), expected)

            expected_ser = Series(expected)
            tm.assert_series_equal(isna(Series(cat)), expected_ser)

            expected_frame = DataFrame(expected_ser)
            tm.assert_frame_equal(isna(DataFrame(cat)), expected_frame)

    @pytest.mark.parametrize(
        "a1, a2, categories",
        [
            (["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]),
            ([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]),
        ],
    )
    def test_compare_categorical_with_missing(self, a1, a2, categories):
        """``!=`` / ``==`` on categorical Series match the non-categorical result."""
        # GH 28384
        cat_type = CategoricalDtype(categories)
        left = Series(a1, dtype=cat_type)
        right = Series(a2, dtype=cat_type)

        # !=
        tm.assert_series_equal(left != right, Series(a1) != Series(a2))

        # ==
        tm.assert_series_equal(left == right, Series(a1) == Series(a2))

    @pytest.mark.parametrize(
        "na_value, dtype",
        [
            (pd.NaT, "datetime64[ns]"),
            (None, "float64"),
            (np.nan, "float64"),
            (pd.NA, "float64"),
        ],
    )
    def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
        """All-missing input gives empty categories of the expected dtype."""
        # GH#44900
        all_missing = Categorical([na_value, na_value])
        tm.assert_index_equal(all_missing.categories, Index([], dtype=dtype))

View File

@ -0,0 +1,414 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestCategoricalOpsWithFactor:
    """Comparison operators on plain ``Categorical`` objects."""

    def test_categories_none_comparisons(self):
        """A Categorical compares equal to itself."""
        factor = Categorical(list("abbaaccc"), ordered=True)
        tm.assert_categorical_equal(factor, factor)

    def test_comparisons(self):
        """Check scalar, categorical and array-like comparisons on ordered data."""
        factor = Categorical(list("abbaaccc"), ordered=True)

        # each scalar comparison must select the same rows as the same
        # comparison on the plain ndarray of values
        scalar_checks = [
            lambda x: x == "a",
            lambda x: x != "a",
            lambda x: x < "c",
            lambda x: x > "a",
            lambda x: x >= "b",
            lambda x: x <= "b",
        ]
        for compare in scalar_checks:
            result = factor[compare(factor)]
            expected = factor[compare(np.asarray(factor))]
            tm.assert_categorical_equal(result, expected)

        n = len(factor)

        shuffled = factor[np.random.default_rng(2).permutation(n)]
        tm.assert_numpy_array_equal(
            factor == shuffled, np.asarray(factor) == np.asarray(shuffled)
        )

        tm.assert_numpy_array_equal(factor == "d", np.zeros(len(factor), dtype=bool))

        # comparisons with categoricals
        cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
        cat_rev_base = Categorical(
            ["b", "b", "b"], categories=["c", "b", "a"], ordered=True
        )
        cat = Categorical(["a", "b", "c"], ordered=True)
        cat_base = Categorical(["b", "b", "b"], categories=cat.categories, ordered=True)

        # comparisons need to take categories ordering into account
        tm.assert_numpy_array_equal(
            cat_rev > cat_rev_base, np.array([True, False, False])
        )
        tm.assert_numpy_array_equal(
            cat_rev < cat_rev_base, np.array([False, False, True])
        )
        tm.assert_numpy_array_equal(cat > cat_base, np.array([False, False, True]))

        # Only categories with same categories can be compared
        msg = "Categoricals can only be compared if 'categories' are the same"
        with pytest.raises(TypeError, match=msg):
            cat > cat_rev

        cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"])

        with pytest.raises(TypeError, match=msg):
            cat_rev > cat_rev_base2

        # Only categories with same ordering information can be compared
        cat_unordered = cat.set_ordered(False)
        assert not (cat > cat).any()

        with pytest.raises(TypeError, match=msg):
            cat > cat_unordered

        # comparison (in both directions) with Series will raise
        s = Series(["b", "b", "b"], dtype=object)
        msg = (
            "Cannot compare a Categorical for op __gt__ with type "
            r"<class 'numpy\.ndarray'>"
        )
        for left in (cat, cat_rev):
            with pytest.raises(TypeError, match=msg):
                left > s
        for right in (cat, cat_rev):
            with pytest.raises(TypeError, match=msg):
                s < right

        # comparison with numpy.array will raise in both direction, but only on
        # newer numpy versions
        a = np.array(["b", "b", "b"], dtype=object)
        for left in (cat, cat_rev):
            with pytest.raises(TypeError, match=msg):
                left > a

        # Make sure that unequal comparison take the categories order in
        # account
        cat_rev = Categorical(list("abc"), categories=list("cba"), ordered=True)
        above_b = np.array([True, False, False])
        tm.assert_numpy_array_equal(cat_rev > "b", above_b)

        # check that zero-dim array gets unboxed
        tm.assert_numpy_array_equal(cat_rev > np.array("b"), above_b)
class TestCategoricalOps:
@pytest.mark.parametrize(
    "categories",
    [["a", "b"], [0, 1], [Timestamp("2019"), Timestamp("2020")]],
)
def test_not_equal_with_na(self, categories):
    """``!=`` is True at every position, including the one holding code -1."""
    # https://github.com/pandas-dev/pandas/issues/32276
    with_missing = Categorical.from_codes([-1, 0], categories=categories)
    fully_set = Categorical.from_codes([0, 1], categories=categories)

    differs = with_missing != fully_set
    assert differs.all()
def test_compare_frame(self):
# GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
data = ["a", "b", 2, "a"]
cat = Categorical(data)
df = DataFrame(cat)
result = cat == df.T
expected = DataFrame([[True, True, True, True]])
tm.assert_frame_equal(result, expected)
result = cat[::-1] != df.T
expected = DataFrame([[False, True, True, False]])
tm.assert_frame_equal(result, expected)
def test_compare_frame_raises(self, comparison_op):
    """Comparing against the untransposed (4x1) frame raises ``ValueError``."""
    # alignment raises unless we transpose
    cat = Categorical(["a", "b", 2, "a"])
    column_frame = DataFrame(cat)
    msg = "Unable to coerce to Series, length must be 1: given 4"
    with pytest.raises(ValueError, match=msg):
        comparison_op(cat, column_frame)
def test_datetime_categorical_comparison(self):
dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True)
tm.assert_numpy_array_equal(dt_cat > dt_cat[0], np.array([False, True, True]))
tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, np.array([False, True, True]))
def test_reflected_comparison_with_scalars(self):
# GH8658
cat = Categorical([1, 2, 3], ordered=True)
tm.assert_numpy_array_equal(cat > cat[0], np.array([False, True, True]))
tm.assert_numpy_array_equal(cat[0] < cat, np.array([False, True, True]))
def test_comparison_with_unknown_scalars(self):
    """Ordering against a non-category scalar raises; ``==`` / ``!=`` do not."""
    # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
    # and following comparisons with scalars not in categories should raise
    # for unequal comps, but not for equal/not equal
    cat = Categorical([1, 2, 3], ordered=True)
    unknown = 4

    msg = "Invalid comparison between dtype=category and int"
    with pytest.raises(TypeError, match=msg):
        cat < unknown
    with pytest.raises(TypeError, match=msg):
        cat > unknown
    with pytest.raises(TypeError, match=msg):
        unknown < cat
    with pytest.raises(TypeError, match=msg):
        unknown > cat

    tm.assert_numpy_array_equal(cat == unknown, np.array([False, False, False]))
    tm.assert_numpy_array_equal(cat != unknown, np.array([True, True, True]))
def test_comparison_with_tuple(self):
cat = Categorical(np.array(["foo", (0, 1), 3, (0, 1)], dtype=object))
result = cat == "foo"
expected = np.array([True, False, False, False], dtype=bool)
tm.assert_numpy_array_equal(result, expected)
result = cat == (0, 1)
expected = np.array([False, True, False, True], dtype=bool)
tm.assert_numpy_array_equal(result, expected)
result = cat != (0, 1)
tm.assert_numpy_array_equal(result, ~expected)
@pytest.mark.filterwarnings("ignore::RuntimeWarning")
def test_comparison_of_ordered_categorical_with_nan_to_scalar(
    self, compare_operators_no_eq_ne
):
    """Ordering ops against a scalar must match the same op on the ndarray."""
    # https://github.com/pandas-dev/pandas/issues/26504
    # BUG: fix ordered categorical comparison with missing values (#26504 )
    # and following comparisons with scalars in categories with missing
    # values should be evaluated as False
    op_name = compare_operators_no_eq_ne
    cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
    scalar = 2

    # reference result: the same dunder applied to the plain ndarray
    expected = getattr(np.array(cat), op_name)(scalar)
    actual = getattr(cat, op_name)(scalar)
    tm.assert_numpy_array_equal(actual, expected)
@pytest.mark.filterwarnings("ignore::RuntimeWarning")
def test_comparison_of_ordered_categorical_with_nan_to_listlike(
    self, compare_operators_no_eq_ne
):
    """Ordering ops against an all-2 Categorical must match ndarray-vs-2."""
    # https://github.com/pandas-dev/pandas/issues/26504
    # and following comparisons of missing values in ordered Categorical
    # with listlike should be evaluated as False
    op_name = compare_operators_no_eq_ne
    cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
    other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True)

    # reference result: the same dunder on the plain ndarray against scalar 2
    expected = getattr(np.array(cat), op_name)(2)
    actual = getattr(cat, op_name)(other)
    tm.assert_numpy_array_equal(actual, expected)
@pytest.mark.parametrize(
    "data,reverse,base",
    [(list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])],
)
def test_comparisons(self, data, reverse, base):
    """Check ordering comparisons on categorical Series, valid and invalid."""
    cat_rev = Series(Categorical(data, categories=reverse, ordered=True))
    cat_rev_base = Series(Categorical(base, categories=reverse, ordered=True))
    cat = Series(Categorical(data, ordered=True))
    cat_base = Series(
        Categorical(base, categories=cat.cat.categories, ordered=True)
    )
    s = Series(base, dtype=object if base == list("bbb") else None)
    a = np.array(base)

    # comparisons need to take categories ordering into account
    tm.assert_series_equal(cat_rev > cat_rev_base, Series([True, False, False]))
    tm.assert_series_equal(cat_rev < cat_rev_base, Series([False, False, True]))
    tm.assert_series_equal(cat > cat_base, Series([False, False, True]))

    scalar = base[1]

    above = cat > scalar
    tm.assert_series_equal(above, Series([False, False, True]))
    tm.assert_numpy_array_equal(above.values, cat.values > scalar)

    above_rev = cat_rev > scalar
    tm.assert_series_equal(above_rev, Series([True, False, False]))
    tm.assert_numpy_array_equal(above_rev.values, cat_rev.values > scalar)

    # Only categories with same categories can be compared
    msg = "Categoricals can only be compared if 'categories' are the same"
    with pytest.raises(TypeError, match=msg):
        cat > cat_rev

    # categorical cannot be compared to Series or numpy array, and also
    # not the other way around
    msg = (
        "Cannot compare a Categorical for op __gt__ with type "
        r"<class 'numpy\.ndarray'>"
    )
    for other in (s, a):
        for categorical in (cat, cat_rev):
            with pytest.raises(TypeError, match=msg):
                categorical > other
    for other in (s, a):
        for categorical in (cat, cat_rev):
            with pytest.raises(TypeError, match=msg):
                other < categorical
@pytest.mark.parametrize(
"ctor",
[
lambda *args, **kwargs: Categorical(*args, **kwargs),
lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
],
)
def test_unordered_different_order_equal(self, ctor):
# https://github.com/pandas-dev/pandas/issues/16014
c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False)
c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False)
assert (c1 == c2).all()
c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False)
c2 = ctor(["b", "a"], categories=["b", "a"], ordered=False)
assert (c1 != c2).all()
c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False)
c2 = ctor(["b", "b"], categories=["b", "a"], ordered=False)
assert (c1 != c2).all()
c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False)
c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False)
result = c1 == c2
tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))
def test_unordered_different_categories_raises(self):
c1 = Categorical(["a", "b"], categories=["a", "b"], ordered=False)
c2 = Categorical(["a", "c"], categories=["c", "a"], ordered=False)
with pytest.raises(TypeError, match=("Categoricals can only be compared")):
c1 == c2
def test_compare_different_lengths(self):
c1 = Categorical([], categories=["a", "b"])
c2 = Categorical([], categories=["a"])
msg = "Categoricals can only be compared if 'categories' are the same."
with pytest.raises(TypeError, match=msg):
c1 == c2
def test_compare_unordered_different_order(self):
# https://github.com/pandas-dev/pandas/issues/16603#issuecomment-
# 349290078
a = Categorical(["a"], categories=["a", "b"])
b = Categorical(["b"], categories=["b", "a"])
assert not a.equals(b)
def test_numeric_like_ops(self):
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
cat_labels = Categorical(labels, labels)
df = df.sort_values(by=["value"], ascending=True)
df["value_group"] = pd.cut(
df.value, range(0, 10500, 500), right=False, labels=cat_labels
)
# numeric ops should not succeed
for op, str_rep in [
("__add__", r"\+"),
("__sub__", "-"),
("__mul__", r"\*"),
("__truediv__", "/"),
]:
msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
with pytest.raises(TypeError, match=msg):
getattr(df, op)(df)
# reduction ops should not succeed (unless specifically defined, e.g.
# min/max)
s = df["value_group"]
for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]:
msg = f"does not support reduction '{op}'"
with pytest.raises(TypeError, match=msg):
getattr(s, op)(numeric_only=False)
def test_numeric_like_ops_series(self):
# numpy ops
s = Series(Categorical([1, 2, 3, 4]))
with pytest.raises(TypeError, match="does not support reduction 'sum'"):
np.sum(s)
@pytest.mark.parametrize(
"op, str_rep",
[
("__add__", r"\+"),
("__sub__", "-"),
("__mul__", r"\*"),
("__truediv__", "/"),
],
)
def test_numeric_like_ops_series_arith(self, op, str_rep):
# numeric ops on a Series
s = Series(Categorical([1, 2, 3, 4]))
msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
with pytest.raises(TypeError, match=msg):
getattr(s, op)(2)
def test_numeric_like_ops_series_invalid(self):
# invalid ufunc
s = Series(Categorical([1, 2, 3, 4]))
msg = "Object with dtype category cannot perform the numpy op log"
with pytest.raises(TypeError, match=msg):
np.log(s)

View File

@ -0,0 +1,111 @@
import pytest
import pandas as pd
from pandas import Categorical
import pandas._testing as tm
@pytest.mark.parametrize(
    "to_replace,value,expected,flip_categories",
    [
        # one-to-one
        (1, 2, [2, 2, 3], False),
        (1, 4, [4, 2, 3], False),
        (4, 1, [1, 2, 3], False),
        (5, 6, [1, 2, 3], False),
        # many-to-one
        ([1], 2, [2, 2, 3], False),
        ([1, 2], 3, [3, 3, 3], False),
        ([1, 2], 4, [4, 4, 3], False),
        ((1, 2, 4), 5, [5, 5, 3], False),
        ((5, 6), 2, [1, 2, 3], False),
        ([1], [2], [2, 2, 3], False),
        ([1, 4], [5, 2], [5, 2, 3], False),
        # GH49404: overlap between to_replace and value
        ([1, 2, 3], [2, 3, 4], [2, 3, 4], False),
        # GH50872, GH46884: replace with null
        (1, None, [None, 2, 3], False),
        (1, pd.NA, [None, 2, 3], False),
        # check_categorical sorts categories, which crashes on mixed dtypes
        (3, "4", [1, 2, "4"], False),
        ([1, 2, "3"], "5", ["5", "5", 3], True),
    ],
)
@pytest.mark.filterwarnings(
    "ignore:.*with CategoricalDtype is deprecated:FutureWarning"
)
def test_replace_categorical_series(to_replace, value, expected, flip_categories):
    """``Series.replace`` on a categorical Series, both copying and in place.

    ``expected`` lists the values after replacement; ``flip_categories``
    reverses the expected category order before comparing.
    """
    # GH 31720
    original = pd.Series([1, 2, 3], dtype="category")
    replaced_copy = original.replace(to_replace, value)
    original.replace(to_replace, value, inplace=True)

    want = pd.Series(expected, dtype="category")
    if flip_categories:
        reversed_categories = want.cat.categories[::-1]
        want = want.cat.set_categories(reversed_categories)

    # the copying and the in-place call must give the same outcome
    for got in (replaced_copy, original):
        tm.assert_series_equal(want, got, check_category_order=False)
@pytest.mark.parametrize(
"to_replace, value, result, expected_error_msg",
[
("b", "c", ["a", "c"], "Categorical.categories are different"),
("c", "d", ["a", "b"], None),
# https://github.com/pandas-dev/pandas/issues/33288
("a", "a", ["a", "b"], None),
("b", None, ["a", None], "Categorical.categories length are different"),
],
)
def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if expected_error_msg is not None else None
with tm.assert_produces_warning(warn, match=msg):
result = pd.Series(cat, copy=False).replace(to_replace, value)._values
tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
with pytest.raises(AssertionError, match=expected_error_msg):
# ensure non-inplace call does not affect original
tm.assert_categorical_equal(cat, expected)
ser = pd.Series(cat, copy=False)
with tm.assert_produces_warning(warn, match=msg):
ser.replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)
def test_replace_categorical_ea_dtype():
# GH49404
cat = Categorical(pd.array(["a", "b"], dtype="string"))
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
tm.assert_categorical_equal(result, expected)
def test_replace_maintain_ordering():
# GH51016
dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
ser = pd.Series([0, 1, 2], dtype=dtype)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.replace(0, 2)
expected_dtype = pd.CategoricalDtype([1, 2], ordered=True)
expected = pd.Series([2, 1, 2], dtype=expected_dtype)
tm.assert_series_equal(expected, result, check_category_order=True)

View File

@ -0,0 +1,545 @@
import numpy as np
import pytest
from pandas._config import using_string_dtype
from pandas import (
Categorical,
CategoricalDtype,
CategoricalIndex,
Index,
Series,
date_range,
option_context,
period_range,
timedelta_range,
)
class TestCategoricalReprWithFactor:
def test_print(self, using_infer_string):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
dtype = "str" if using_infer_string else "object"
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
f"Categories (3, {dtype}): ['a' < 'b' < 'c']",
]
expected = "\n".join(expected)
actual = repr(factor)
assert actual == expected
class TestCategoricalRepr:
def test_big_print(self):
codes = np.array([0, 1, 2, 0, 1, 2] * 100)
dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object))
factor = Categorical.from_codes(codes, dtype=dtype)
expected = [
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
"Length: 600",
"Categories (3, object): ['a', 'b', 'c']",
]
expected = "\n".join(expected)
actual = repr(factor)
assert actual == expected
def test_empty_print(self):
factor = Categorical([], Index(["a", "b", "c"], dtype=object))
expected = "[], Categories (3, object): ['a', 'b', 'c']"
actual = repr(factor)
assert actual == expected
assert expected == actual
factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True)
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
actual = repr(factor)
assert expected == actual
factor = Categorical([], [])
expected = "[], Categories (0, object): []"
assert expected == repr(factor)
def test_print_none_width(self):
    """Check the repr of a categorical Series when ``display.width`` is None."""
    # GH10087
    a = Series(Categorical([1, 2, 3, 4]))
    # NOTE(review): index and value are separated by a single space in this
    # literal; presumably the real Series repr pads with more spaces --
    # confirm the whitespace was not collapsed.
    exp = (
        "0 1\n1 2\n2 3\n3 4\n"
        "dtype: category\nCategories (4, int64): [1, 2, 3, 4]"
    )
    # the comparison runs while the option override is active
    with option_context("display.width", None):
        assert exp == repr(a)
@pytest.mark.skipif(
    using_string_dtype(),
    reason="Change once infer_string is set to True by default",
)
def test_unicode_print(self):
    """Check the elided repr of ASCII and Japanese string Categoricals.

    Each Categorical holds 60 values drawn from three categories. The test is
    skipped when ``using_string_dtype()`` returns a true value.
    """
    c = Categorical(["aaaaa", "bb", "cccc"] * 20)
    expected = """\
['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc']
Length: 60
Categories (3, object): ['aaaaa', 'bb', 'cccc']"""
    assert repr(c) == expected
    c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
    expected = """\
['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']"""  # noqa: E501
    assert repr(c) == expected
    # The east-asian-width option should not affect the Categorical repr,
    # because that repr does not depend on the display width.
    with option_context("display.unicode.east_asian_width", True):
        c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
        expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']"""  # noqa: E501
        assert repr(c) == expected
def test_categorical_repr(self):
c = Categorical([1, 2, 3])
exp = """[1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 4, 5] * 10)
exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1, 2, 3, 4, 5]"""
assert repr(c) == exp
c = Categorical(np.arange(20, dtype=np.int64))
exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]"""
assert repr(c) == exp
def test_categorical_repr_ordered(self):
c = Categorical([1, 2, 3], ordered=True)
exp = """[1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True)
exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1 < 2 < 3 < 4 < 5]"""
assert repr(c) == exp
c = Categorical(np.arange(20, dtype=np.int64), ordered=True)
exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]"""
assert repr(c) == exp
def test_categorical_repr_datetime(self):
    """Check the repr of unordered Categoricals built from hourly datetimes.

    Covers naive and US/Eastern-aware values, each once with 5 values and
    once with the index appended to itself (10 values, 5 categories).
    """
    # NOTE(review): the wrapped "Categories" pieces below start with a single
    # space; presumably the real repr indents continuation lines further --
    # confirm the literals' whitespace was not collapsed.
    idx = date_range("2011-01-01 09:00", freq="h", periods=5)
    c = Categorical(idx)
    exp = (
        "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
        "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n"
        "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
        "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
        " 2011-01-01 12:00:00, "
        "2011-01-01 13:00:00]"
        ""
    )
    assert repr(c) == exp
    # duplicated values, same five categories
    c = Categorical(idx.append(idx), categories=idx)
    exp = (
        "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
        "2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, "
        "2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, "
        "2011-01-01 13:00:00]\n"
        "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
        "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
        " 2011-01-01 12:00:00, "
        "2011-01-01 13:00:00]"
    )
    assert repr(c) == exp
    # timezone-aware values
    idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
    c = Categorical(idx)
    exp = (
        "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
        "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
        "2011-01-01 13:00:00-05:00]\n"
        "Categories (5, datetime64[ns, US/Eastern]): "
        "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
        " "
        "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
        " "
        "2011-01-01 13:00:00-05:00]"
    )
    assert repr(c) == exp
    # timezone-aware, duplicated values
    c = Categorical(idx.append(idx), categories=idx)
    exp = (
        "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
        "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
        "2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, "
        "2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, "
        "2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n"
        "Categories (5, datetime64[ns, US/Eastern]): "
        "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
        " "
        "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
        " "
        "2011-01-01 13:00:00-05:00]"
    )
    assert repr(c) == exp
def test_categorical_repr_datetime_ordered(self):
    """Check the repr of ordered Categoricals built from hourly datetimes.

    Covers naive and US/Eastern-aware values, each once with 5 values and
    once with the index appended to itself.
    """
    # NOTE(review): continuation lines inside the expected strings carry no
    # leading indentation here; presumably the real repr indents them --
    # confirm the literals' whitespace was not stripped.
    idx = date_range("2011-01-01 09:00", freq="h", periods=5)
    c = Categorical(idx, ordered=True)
    exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]"""  # noqa: E501
    assert repr(c) == exp
    # duplicated values, same five categories
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]"""  # noqa: E501
    assert repr(c) == exp
    # timezone-aware values
    idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
    c = Categorical(idx, ordered=True)
    exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]"""  # noqa: E501
    assert repr(c) == exp
    # timezone-aware, duplicated values
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]"""  # noqa: E501
    assert repr(c) == exp
def test_categorical_repr_int_with_nan(self):
    """Check the repr of integer categoricals that contain a missing value.

    The missing value is expected to print as ``NaN`` and not to be counted
    as a category, both for a Categorical and for a categorical Series.
    """
    c = Categorical([1, 2, np.nan])
    c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]"""
    assert repr(c) == c_exp
    # same data via an object Series cast to category
    s = Series([1, 2, np.nan], dtype="object").astype("category")
    # NOTE(review): index and value are separated by a single space in this
    # literal; presumably the real Series repr pads the column -- confirm
    # the whitespace was not collapsed.
    s_exp = """0 1\n1 2\n2 NaN
dtype: category
Categories (2, int64): [1, 2]"""
    assert repr(s) == s_exp
def test_categorical_repr_period(self):
    """Check the repr of unordered Categoricals built from period ranges.

    Covers hourly and monthly periods, each once with 5 values and once with
    the index appended to itself.
    """
    # NOTE(review): continuation lines inside the hourly expected strings carry
    # no leading indentation here; presumably the real repr indents them --
    # confirm the literals' whitespace was not stripped.
    idx = period_range("2011-01-01 09:00", freq="h", periods=5)
    c = Categorical(idx)
    exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]"""  # noqa: E501
    assert repr(c) == exp
    # duplicated values, same five categories
    c = Categorical(idx.append(idx), categories=idx)
    exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]"""  # noqa: E501
    assert repr(c) == exp
    # monthly periods
    idx = period_range("2011-01", freq="M", periods=5)
    c = Categorical(idx)
    exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
    assert repr(c) == exp
    # monthly periods, duplicated values
    c = Categorical(idx.append(idx), categories=idx)
    exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""  # noqa: E501
    assert repr(c) == exp
def test_categorical_repr_period_ordered(self):
    """Check the repr of ordered Categoricals built from period ranges.

    Covers hourly and monthly periods, each once with 5 values and once with
    the index appended to itself.
    """
    # NOTE(review): continuation lines inside the hourly expected strings carry
    # no leading indentation here; presumably the real repr indents them --
    # confirm the literals' whitespace was not stripped.
    idx = period_range("2011-01-01 09:00", freq="h", periods=5)
    c = Categorical(idx, ordered=True)
    exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]"""  # noqa: E501
    assert repr(c) == exp
    # duplicated values, same five categories
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]"""  # noqa: E501
    assert repr(c) == exp
    # monthly periods
    idx = period_range("2011-01", freq="M", periods=5)
    c = Categorical(idx, ordered=True)
    exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
    assert repr(c) == exp
    # monthly periods, duplicated values
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""  # noqa: E501
    assert repr(c) == exp
def test_categorical_repr_timedelta(self):
    """Check the repr of unordered Categoricals built from timedelta ranges.

    Covers a 5-element whole-day range and a 20-element range starting at one
    hour, each also with the index appended to itself.
    """
    idx = timedelta_range("1 days", periods=5)
    c = Categorical(idx)
    exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
    assert repr(c) == exp
    # duplicated values, same five categories
    c = Categorical(idx.append(idx), categories=idx)
    exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""  # noqa: E501
    assert repr(c) == exp
    # 20 categories: both the values and the categories are elided
    # NOTE(review): continuation lines of the expected "Categories" text carry
    # no leading indentation here; presumably the real repr indents them --
    # confirm the literals' whitespace was not stripped.
    idx = timedelta_range("1 hours", periods=20)
    c = Categorical(idx)
    exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]"""  # noqa: E501
    assert repr(c) == exp
    # 40 values over the same 20 categories
    c = Categorical(idx.append(idx), categories=idx)
    exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]"""  # noqa: E501
    assert repr(c) == exp
def test_categorical_repr_timedelta_ordered(self):
    """Check the repr of ordered Categoricals built from timedelta ranges.

    Covers a 5-element whole-day range and a 20-element range starting at one
    hour, each also with the index appended to itself.
    """
    idx = timedelta_range("1 days", periods=5)
    c = Categorical(idx, ordered=True)
    exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]"""
    assert repr(c) == exp
    # duplicated values, same five categories
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]"""  # noqa: E501
    assert repr(c) == exp
    # 20 categories: both the values and the categories are elided
    # NOTE(review): continuation lines of the expected "Categories" text carry
    # no leading indentation here; presumably the real repr indents them --
    # confirm the literals' whitespace was not stripped.
    idx = timedelta_range("1 hours", periods=20)
    c = Categorical(idx, ordered=True)
    exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]"""  # noqa: E501
    assert repr(c) == exp
    # 40 values over the same 20 categories
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]"""  # noqa: E501
    assert repr(c) == exp
def test_categorical_index_repr(self):
idx = CategoricalIndex(Categorical([1, 2, 3]))
exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa: E501
assert repr(idx) == exp
i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64)))
exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_ordered(self):
i = CategoricalIndex(Categorical([1, 2, 3], ordered=True))
exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64), ordered=True))
exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_datetime(self):
    """Check the repr of an unordered CategoricalIndex of hourly datetimes.

    Covers naive and US/Eastern-aware values (5 entries each).
    """
    # NOTE(review): continuation lines inside the expected strings carry no
    # leading indentation here; presumably the real repr indents them --
    # confirm the literals' whitespace was not stripped.
    idx = date_range("2011-01-01 09:00", freq="h", periods=5)
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    # timezone-aware values
    idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
def test_categorical_index_repr_datetime_ordered(self):
    """Check the repr of an ordered CategoricalIndex of hourly datetimes.

    Covers naive values, US/Eastern-aware values, and the aware index
    appended to itself (10 entries, 5 categories).
    """
    # NOTE(review): continuation lines inside the expected strings carry no
    # leading indentation here; presumably the real repr indents them --
    # confirm the literals' whitespace was not stripped.
    idx = date_range("2011-01-01 09:00", freq="h", periods=5)
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    # timezone-aware values
    idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    # timezone-aware, duplicated values
    i = CategoricalIndex(Categorical(idx.append(idx), ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00',
'2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00',
'2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
def test_categorical_index_repr_period(self):
    """Check the repr of an unordered CategoricalIndex built from periods.

    Hourly periods are checked at several lengths (1, 2, 3, 5 and the
    5-element index appended to itself), followed by 5 monthly periods.
    """
    # NOTE(review): continuation lines inside the multi-line expected strings
    # carry no leading indentation here; presumably the real repr indents
    # them -- confirm the literals' whitespace was not stripped.
    # exercise several index lengths
    idx = period_range("2011-01-01 09:00", freq="h", periods=1)
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    idx = period_range("2011-01-01 09:00", freq="h", periods=2)
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    idx = period_range("2011-01-01 09:00", freq="h", periods=3)
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    idx = period_range("2011-01-01 09:00", freq="h", periods=5)
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    # duplicated values, same five categories
    i = CategoricalIndex(Categorical(idx.append(idx)))
    exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00',
'2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00',
'2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    # monthly periods
    idx = period_range("2011-01", freq="M", periods=5)
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
def test_categorical_index_repr_period_ordered(self):
    """Check the repr of an ordered CategoricalIndex built from periods.

    Covers 5 hourly periods and 5 monthly periods.
    """
    # NOTE(review): continuation lines inside the hourly expected string carry
    # no leading indentation here; presumably the real repr indents them --
    # confirm the literal's whitespace was not stripped.
    idx = period_range("2011-01-01 09:00", freq="h", periods=5)
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    # monthly periods
    idx = period_range("2011-01", freq="M", periods=5)
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
def test_categorical_index_repr_timedelta(self):
    """Check the repr of an unordered CategoricalIndex built from timedeltas.

    Covers 5 whole-day values and a 10-element range starting at one hour.
    """
    idx = timedelta_range("1 days", periods=5)
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    # ten categories: the categories list is elided in the expected repr
    # NOTE(review): continuation lines inside the expected string carry no
    # leading indentation here; presumably the real repr indents them --
    # confirm the literal's whitespace was not stripped.
    idx = timedelta_range("1 hours", periods=10)
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=False, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
def test_categorical_index_repr_timedelta_ordered(self):
    """Check the repr of an ordered CategoricalIndex built from timedeltas.

    Covers 5 whole-day values and a 10-element range starting at one hour.
    """
    idx = timedelta_range("1 days", periods=5)
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=True, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
    # ten categories: the categories list is elided in the expected repr
    # NOTE(review): continuation lines inside the expected string carry no
    # leading indentation here; presumably the real repr indents them --
    # confirm the literal's whitespace was not stripped.
    idx = timedelta_range("1 hours", periods=10)
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=True, dtype='category')"""  # noqa: E501
    assert repr(i) == exp
def test_categorical_str_repr(self):
# GH 33676
result = repr(Categorical([1, "2", 3, 4]))
expected = "[1, '2', 3, 4]\nCategories (4, object): [1, 3, 4, '2']"
assert result == expected

View File

@ -0,0 +1,128 @@
import numpy as np
import pytest
from pandas import (
Categorical,
Index,
)
import pandas._testing as tm
class TestCategoricalSort:
    """Tests for ordering operations on ``Categorical``.

    Covers ``argsort`` (method and ``np.argsort``), ``sort_values`` with and
    without ``inplace``, and ``na_position`` handling.
    """

    def test_argsort(self):
        """``argsort`` gives ascending positions; descending is the reverse."""
        c = Categorical([5, 3, 1, 4, 2], ordered=True)

        expected = np.array([2, 4, 1, 3, 0])
        tm.assert_numpy_array_equal(
            c.argsort(ascending=True), expected, check_dtype=False
        )

        expected = expected[::-1]
        tm.assert_numpy_array_equal(
            c.argsort(ascending=False), expected, check_dtype=False
        )

    def test_numpy_argsort(self):
        """``np.argsort`` works on a Categorical but rejects ``axis``/``order``."""
        c = Categorical([5, 3, 1, 4, 2], ordered=True)

        expected = np.array([2, 4, 1, 3, 0])
        tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False)

        tm.assert_numpy_array_equal(
            np.argsort(c, kind="mergesort"), expected, check_dtype=False
        )

        msg = "the 'axis' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.argsort(c, axis=0)

        msg = "the 'order' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.argsort(c, order="C")

    def test_sort_values(self):
        """``sort_values`` orders values and leaves the categories untouched."""
        # unordered cats are sortable
        cat = Categorical(["a", "b", "b", "a"], ordered=False)
        cat.sort_values()

        cat = Categorical(["a", "c", "b", "d"], ordered=True)

        # sort_values
        res = cat.sort_values()
        exp = np.array(["a", "b", "c", "d"], dtype=object)
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, cat.categories)

        cat = Categorical(
            ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True
        )

        res = cat.sort_values()
        exp = np.array(["a", "b", "c", "d"], dtype=object)
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, cat.categories)

        res = cat.sort_values(ascending=False)
        exp = np.array(["d", "c", "b", "a"], dtype=object)
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, cat.categories)

        # sort (inplace order)
        cat1 = cat.copy()
        orig_codes = cat1._codes
        cat1.sort_values(inplace=True)
        # the in-place sort must keep using the same codes object
        assert cat1._codes is orig_codes
        exp = np.array(["a", "b", "c", "d"], dtype=object)
        tm.assert_numpy_array_equal(cat1.__array__(), exp)
        # Validate the object that was sorted in place; checking ``res`` here
        # would only repeat the descending-sort assertion made above.
        tm.assert_index_equal(cat1.categories, cat.categories)

        # reverse
        cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
        res = cat.sort_values(ascending=False)
        exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
        exp_categories = Index(["a", "b", "c", "d"])
        tm.assert_numpy_array_equal(res.__array__(), exp_val)
        tm.assert_index_equal(res.categories, exp_categories)

    def test_sort_values_na_position(self):
        """Missing values go first or last according to ``na_position``."""
        # see gh-12882
        cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
        exp_categories = Index([2, 5])

        exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
        res = cat.sort_values()  # default arguments
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
        res = cat.sort_values(ascending=True, na_position="first")
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
        res = cat.sort_values(ascending=False, na_position="first")
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
        res = cat.sort_values(ascending=True, na_position="last")
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
        res = cat.sort_values(ascending=False, na_position="last")
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
        res = cat.sort_values(ascending=False, na_position="last")
        exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
        exp_categories = Index(["a", "b", "c", "d"])
        tm.assert_numpy_array_equal(res.__array__(), exp_val)
        tm.assert_index_equal(res.categories, exp_categories)

        cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
        res = cat.sort_values(ascending=False, na_position="first")
        exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
        exp_categories = Index(["a", "b", "c", "d"])
        tm.assert_numpy_array_equal(res.__array__(), exp_val)
        tm.assert_index_equal(res.categories, exp_categories)

View File

@ -0,0 +1,26 @@
from pandas import Categorical
import pandas._testing as tm
class SubclassedCategorical(Categorical):
    """Categorical subclass that adds nothing; exists only as a distinct type."""
class TestCategoricalSubclassing:
    """Construction paths and ``map`` must return the subclass, not the base."""

    def test_constructor(self):
        """The regular constructor returns an instance of the subclass."""
        subclassed = SubclassedCategorical(["a", "b", "c"])
        assert isinstance(subclassed, SubclassedCategorical)
        plain = Categorical(["a", "b", "c"])
        tm.assert_categorical_equal(subclassed, plain)

    def test_from_codes(self):
        """``from_codes`` called on the subclass returns the subclass."""
        codes, categories = [1, 0, 2], ["a", "b", "c"]
        subclassed = SubclassedCategorical.from_codes([1, 0, 2], ["a", "b", "c"])
        assert isinstance(subclassed, SubclassedCategorical)
        plain = Categorical.from_codes(codes, categories)
        tm.assert_categorical_equal(subclassed, plain)

    def test_map(self):
        """``map`` on a subclass instance yields a subclass instance."""
        subclassed = SubclassedCategorical(["a", "b", "c"])
        mapped = subclassed.map(lambda x: x.upper(), na_action=None)
        assert isinstance(mapped, SubclassedCategorical)
        tm.assert_categorical_equal(mapped, Categorical(["A", "B", "C"]))

View File

@ -0,0 +1,89 @@
import numpy as np
import pytest
from pandas import Categorical
import pandas._testing as tm
@pytest.fixture(params=[True, False])
def allow_fill(request):
    """Parametrized fixture supplying each boolean 'allow_fill' value for Categorical.take"""
    fill_flag = request.param
    return fill_flag
class TestTake:
# https://github.com/pandas-dev/pandas/issues/20664
def test_take_default_allow_fill(self):
cat = Categorical(["a", "b"])
with tm.assert_produces_warning(None):
result = cat.take([0, -1])
assert result.equals(cat)
def test_take_positive_no_warning(self):
cat = Categorical(["a", "b"])
with tm.assert_produces_warning(None):
cat.take([0, 0])
def test_take_bounds(self, allow_fill):
# https://github.com/pandas-dev/pandas/issues/20664
cat = Categorical(["a", "b", "a"])
if allow_fill:
msg = "indices are out-of-bounds"
else:
msg = "index 4 is out of bounds for( axis 0 with)? size 3"
with pytest.raises(IndexError, match=msg):
cat.take([4, 5], allow_fill=allow_fill)
def test_take_empty(self, allow_fill):
# https://github.com/pandas-dev/pandas/issues/20664
cat = Categorical([], categories=["a", "b"])
if allow_fill:
msg = "indices are out-of-bounds"
else:
msg = "cannot do a non-empty take from an empty axes"
with pytest.raises(IndexError, match=msg):
cat.take([0], allow_fill=allow_fill)
def test_positional_take(self, ordered):
cat = Categorical(["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered)
result = cat.take([0, 1, 2], allow_fill=False)
expected = Categorical(
["a", "a", "b"], categories=cat.categories, ordered=ordered
)
tm.assert_categorical_equal(result, expected)
def test_positional_take_unobserved(self, ordered):
cat = Categorical(["a", "b"], categories=["a", "b", "c"], ordered=ordered)
result = cat.take([1, 0], allow_fill=False)
expected = Categorical(["b", "a"], categories=cat.categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_take_allow_fill(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "a", "b"])
result = cat.take([0, -1, -1], allow_fill=True)
expected = Categorical(["a", np.nan, np.nan], categories=["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_take_fill_with_negative_one(self):
# -1 was a category
cat = Categorical([-1, 0, 1])
result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
expected = Categorical([-1, -1, 0], categories=[-1, 0, 1])
tm.assert_categorical_equal(result, expected)
def test_take_fill_value(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "b", "c"])
result = cat.take([0, 1, -1], fill_value="a", allow_fill=True)
expected = Categorical(["a", "b", "a"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
def test_take_fill_value_new_raises(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "b", "c"])
xpr = r"Cannot setitem on a Categorical with a new category \(d\)"
with pytest.raises(TypeError, match=xpr):
cat.take([0, 1, -1], fill_value="d", allow_fill=True)

View File

@ -0,0 +1,19 @@
import pytest
import pandas._testing as tm
class TestCategoricalWarnings:
    """Warning-related checks for Categorical in an interactive shell."""

    def test_tab_complete_warning(self, ip):
        """Tab-completing on a Categorical in the shell must not fail on warnings."""
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip("IPython", minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        ip.run_cell("import pandas as pd; c = pd.Categorical([])")

        # GH 31324 newer jedi version raises Deprecation warning;
        #  appears resolved 2021-02-02
        with tm.assert_produces_warning(
            None, raise_on_extra_warnings=False
        ), provisionalcompleter("ignore"):
            list(ip.Completer.completions("c.", 1))

View File

@ -0,0 +1,284 @@
import numpy as np
import pytest
from pandas._libs import iNaT
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestDatetimeArrayConstructor:
    """Validation tests for building a ``DatetimeArray``.

    Most tests call ``DatetimeArray(...)`` directly and therefore also expect
    the ``FutureWarning`` whose message matches
    "DatetimeArray.__init__ is deprecated"; the remaining ones go through
    ``DatetimeArray._from_sequence``.
    """

    def test_from_sequence_invalid_type(self):
        """A MultiIndex is rejected by ``_from_sequence`` with TypeError."""
        mi = pd.MultiIndex.from_product([np.arange(5), np.arange(5)])
        with pytest.raises(TypeError, match="Cannot create a DatetimeArray"):
            DatetimeArray._from_sequence(mi, dtype="M8[ns]")

    def test_only_1dim_accepted(self):
        """3-dimensional and 0-dimensional inputs raise ValueError."""
        arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")

        depr_msg = "DatetimeArray.__init__ is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match="Only 1-dimensional"):
                # 3-dim, we allow 2D to sneak in for ops purposes GH#29853
                DatetimeArray(arr.reshape(2, 2, 1))

        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match="Only 1-dimensional"):
                # 0-dim
                DatetimeArray(arr[[0]].squeeze())

    def test_freq_validation(self):
        """A passed ``freq`` that conflicts with the inferred one raises."""
        # GH#24623 check that invalid instances cannot be created with the
        #  public constructor
        # five values spaced one hour apart, expressed in nanoseconds
        arr = np.arange(5, dtype=np.int64) * 3600 * 10**9

        msg = (
            "Inferred frequency h from passed values does not "
            "conform to passed frequency W-SUN"
        )
        depr_msg = "DatetimeArray.__init__ is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match=msg):
                DatetimeArray(arr, freq="W")

    @pytest.mark.parametrize(
        "meth",
        [
            DatetimeArray._from_sequence,
            pd.to_datetime,
            pd.DatetimeIndex,
        ],
    )
    def test_mixing_naive_tzaware_raises(self, meth):
        """Mixed naive/aware Timestamps raise ValueError for each constructor."""
        # GH#24569
        arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")])

        msg = (
            "Cannot mix tz-aware with tz-naive values|"
            "Tz-aware datetime.datetime cannot be converted "
            "to datetime64 unless utc=True"
        )

        for obj in [arr, arr[::-1]]:
            # check that we raise regardless of whether naive is found
            #  before aware or vice-versa
            with pytest.raises(ValueError, match=msg):
                meth(obj)

    def test_from_pandas_array(self):
        """An integer pandas array converts to the equivalent hourly array."""
        arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9

        result = DatetimeArray._from_sequence(arr, dtype="M8[ns]")._with_freq("infer")

        expected = pd.date_range("1970-01-01", periods=5, freq="h")._data
        tm.assert_datetime_array_equal(result, expected)

    def test_mismatched_timezone_raises(self):
        """A dtype whose tz (or tz-awareness) differs from the data raises."""
        depr_msg = "DatetimeArray.__init__ is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            arr = DatetimeArray(
                np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"),
                dtype=DatetimeTZDtype(tz="US/Central"),
            )
        dtype = DatetimeTZDtype(tz="US/Eastern")
        msg = r"dtype=datetime64\[ns.*\] does not match data dtype datetime64\[ns.*\]"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(TypeError, match=msg):
                DatetimeArray(arr, dtype=dtype)

        # also with mismatched tzawareness
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(TypeError, match=msg):
                DatetimeArray(arr, dtype=np.dtype("M8[ns]"))
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(TypeError, match=msg):
                DatetimeArray(arr.tz_localize(None), dtype=arr.dtype)

    def test_non_array_raises(self):
        """A plain list passed to the constructor raises ValueError."""
        depr_msg = "DatetimeArray.__init__ is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match="list"):
                DatetimeArray([1, 2, 3])

    def test_bool_dtype_raises(self):
        """Boolean input is rejected by the constructor and related helpers."""
        arr = np.array([1, 2, 3], dtype="bool")

        depr_msg = "DatetimeArray.__init__ is deprecated"
        msg = "Unexpected value for 'dtype': 'bool'. Must be"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match=msg):
                DatetimeArray(arr)

        # the sequence-based entry points raise TypeError with a different message
        msg = r"dtype bool cannot be converted to datetime64\[ns\]"
        with pytest.raises(TypeError, match=msg):
            DatetimeArray._from_sequence(arr, dtype="M8[ns]")

        with pytest.raises(TypeError, match=msg):
            pd.DatetimeIndex(arr)

        with pytest.raises(TypeError, match=msg):
            pd.to_datetime(arr)

    def test_incorrect_dtype_raises(self):
        """``category``, ``m8[s]`` and ``M8[D]`` dtypes all raise ValueError."""
        depr_msg = "DatetimeArray.__init__ is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
                DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category")

        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
                DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="m8[s]")

        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
                DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="M8[D]")

    def test_mismatched_values_dtype_units(self):
        """Second-resolution values with a nanosecond dtype raise ValueError."""
        arr = np.array([1, 2, 3], dtype="M8[s]")
        dtype = np.dtype("M8[ns]")
        msg = "Values resolution does not match dtype."
        depr_msg = "DatetimeArray.__init__ is deprecated"

        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match=msg):
                DatetimeArray(arr, dtype=dtype)

        # same check with a tz-aware nanosecond dtype
        dtype2 = DatetimeTZDtype(tz="UTC", unit="ns")
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match=msg):
                DatetimeArray(arr, dtype=dtype2)

    def test_freq_infer_raises(self):
        """``freq="infer"`` is not accepted by the constructor."""
        depr_msg = "DatetimeArray.__init__ is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            with pytest.raises(ValueError, match="Frequency inference"):
                DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer")

    def test_copy(self):
        """``copy=False`` reuses the input ndarray; ``copy=True`` does not."""
        data = np.array([1, 2, 3], dtype="M8[ns]")
        arr = DatetimeArray._from_sequence(data, copy=False)
        assert arr._ndarray is data

        arr = DatetimeArray._from_sequence(data, copy=True)
        assert arr._ndarray is not data

    @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
    def test_numpy_datetime_unit(self, unit):
        """The numpy datetime unit carries over to the array and its scalars."""
        data = np.array([1, 2, 3], dtype=f"M8[{unit}]")
        arr = DatetimeArray._from_sequence(data)
        assert arr.unit == unit
        assert arr[0].unit == unit
class TestSequenceToDT64NS:
    """``DatetimeArray._from_sequence`` with tz-aware dtypes and 2D input."""

    def test_tz_dtype_mismatch_raises(self):
        """Re-wrapping tz-aware data under a different tz dtype raises."""
        central = DatetimeArray._from_sequence(
            ["2000"], dtype=DatetimeTZDtype(tz="US/Central")
        )
        utc_dtype = DatetimeTZDtype(tz="UTC")
        with pytest.raises(TypeError, match="data is already tz-aware"):
            DatetimeArray._from_sequence(central, dtype=utc_dtype)

    def test_tz_dtype_matches(self):
        """Re-wrapping with the same tz dtype round-trips unchanged."""
        dtype = DatetimeTZDtype(tz="US/Central")
        original = DatetimeArray._from_sequence(["2000"], dtype=dtype)
        rewrapped = DatetimeArray._from_sequence(original, dtype=dtype)
        tm.assert_equal(original, rewrapped)

    @pytest.mark.parametrize("order", ["F", "C"])
    def test_2d(self, order):
        """2D object input matches the flattened conversion reshaped back."""
        dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
        values = np.array(dti, dtype=object).reshape(3, 2)
        if order == "F":
            values = values.T

        result = DatetimeArray._from_sequence(values, dtype=dti.dtype)
        flat = DatetimeArray._from_sequence(values.ravel(), dtype=dti.dtype)
        tm.assert_datetime_array_equal(result, flat.reshape(values.shape))
# ----------------------------------------------------------------------------
# Arrow interaction
# Inputs mixing zero, a mid-range integer, missing markers (None, iNaT) and
# values at either end of the signed 64-bit range.
EXTREME_VALUES = [
    0,
    123456789,
    None,
    iNaT,
    2**63 - 1,
    -(2**63) + 1,
]
# Whole multiples of 10**9 - presumably chosen (per the name) so that moving
# to a coarser unit loses nothing.
FINE_TO_COARSE_SAFE = [
    123_000_000_000,
    None,
    -123_000_000_000,
]
# Small magnitudes - presumably chosen (per the name) so that moving to a
# finer unit cannot overflow.
COARSE_TO_FINE_SAFE = [
    123,
    None,
    -123,
]
@pytest.mark.parametrize(
    ("pa_unit", "pd_unit", "pa_tz", "pd_tz", "data"),
    [
        ("s", "s", "UTC", "UTC", EXTREME_VALUES),
        ("ms", "ms", "UTC", "Europe/Berlin", EXTREME_VALUES),
        ("us", "us", "US/Eastern", "UTC", EXTREME_VALUES),
        ("ns", "ns", "US/Central", "Asia/Kolkata", EXTREME_VALUES),
        ("ns", "s", "UTC", "UTC", FINE_TO_COARSE_SAFE),
        ("us", "ms", "UTC", "Europe/Berlin", FINE_TO_COARSE_SAFE),
        ("ms", "us", "US/Eastern", "UTC", COARSE_TO_FINE_SAFE),
        ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE),
    ],
)
def test_from_arrow_with_different_units_and_timezones_with(
    pa_unit, pd_unit, pa_tz, pd_tz, data
):
    """``__from_arrow__`` converts across differing units and timezones.

    The expected array is built from the same integers as UTC values in the
    Arrow unit and then cast to the target pandas dtype.
    """
    pa = pytest.importorskip("pyarrow")

    arrow_arr = pa.array(data, type=pa.timestamp(pa_unit, tz=pa_tz))
    target_dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz)

    utc_values = DatetimeArray._from_sequence(data, dtype=f"M8[{pa_unit}, UTC]")
    expected = utc_values.astype(target_dtype, copy=False)

    # a plain Array and a one-chunk ChunkedArray must give the same result
    for source in (arrow_arr, pa.chunked_array([arrow_arr])):
        result = target_dtype.__from_arrow__(source)
        tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
    ("unit", "tz"),
    [
        ("s", "UTC"),
        ("ms", "Europe/Berlin"),
        ("us", "US/Eastern"),
        ("ns", "Asia/Kolkata"),
        ("ns", "UTC"),
    ],
)
def test_from_arrow_from_empty(unit, tz):
    """An empty Arrow array converts to an empty tz-aware DatetimeArray."""
    pa = pytest.importorskip("pyarrow")

    data = []
    arrow_arr = pa.array(data)
    target_dtype = DatetimeTZDtype(unit=unit, tz=tz)

    naive = DatetimeArray._from_sequence(np.array(data, dtype=f"datetime64[{unit}]"))
    expected = naive.tz_localize(tz=tz)

    # a plain Array and a one-chunk ChunkedArray must give the same result
    for source in (arrow_arr, pa.chunked_array([arrow_arr])):
        result = target_dtype.__from_arrow__(source)
        tm.assert_extension_array_equal(result, expected)
def test_from_arrow_from_integers():
    """A plain integer Arrow array is read as UTC nanosecond timestamps."""
    pa = pytest.importorskip("pyarrow")

    data = [0, 123456789, None, 2**63 - 1, iNaT, -123456789]
    arrow_arr = pa.array(data)
    target_dtype = DatetimeTZDtype(unit="ns", tz="UTC")

    naive = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]"))
    expected = naive.tz_localize("UTC")

    # a plain Array and a one-chunk ChunkedArray must give the same result
    for source in (arrow_arr, pa.chunked_array([arrow_arr])):
        result = target_dtype.__from_arrow__(source)
        tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,44 @@
import pytest
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestAccumulator:
    """``DatetimeArray._accumulate`` for supported and unsupported operations."""

    def test_accumulators_freq(self):
        """cummin/cummax on an array carrying an inferred freq give plain results."""
        # GH#50297
        dates = ["2000-01-01", "2000-01-02", "2000-01-03"]
        arr = DatetimeArray._from_sequence(dates, dtype="M8[ns]")._with_freq("infer")

        running_min = arr._accumulate("cummin")
        expected = DatetimeArray._from_sequence(["2000-01-01"] * 3, dtype="M8[ns]")
        tm.assert_datetime_array_equal(running_min, expected)

        running_max = arr._accumulate("cummax")
        expected = DatetimeArray._from_sequence(dates, dtype="M8[ns]")
        tm.assert_datetime_array_equal(running_max, expected)

    @pytest.mark.parametrize("func", ["cumsum", "cumprod"])
    def test_accumulators_disallowed(self, func):
        """cumsum and cumprod raise TypeError on datetimes."""
        # GH#50297
        dates = ["2000-01-01", "2000-01-02"]
        arr = DatetimeArray._from_sequence(dates, dtype="M8[ns]")._with_freq("infer")
        with pytest.raises(TypeError, match=f"Accumulation {func}"):
            arr._accumulate(func)

View File

@ -0,0 +1,183 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas import NaT
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestReductions:
@pytest.fixture(params=["s", "ms", "us", "ns"])
def unit(self, request):
return request.param
@pytest.fixture
def arr1d(self, tz_naive_fixture):
"""Fixture returning DatetimeArray with parametrized timezones"""
tz = tz_naive_fixture
dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
arr = DatetimeArray._from_sequence(
[
"2000-01-03",
"2000-01-03",
"NaT",
"2000-01-02",
"2000-01-05",
"2000-01-04",
],
dtype=dtype,
)
return arr
def test_min_max(self, arr1d, unit):
arr = arr1d
arr = arr.as_unit(unit)
tz = arr.tz
result = arr.min()
expected = pd.Timestamp("2000-01-02", tz=tz).as_unit(unit)
assert result == expected
assert result.unit == expected.unit
result = arr.max()
expected = pd.Timestamp("2000-01-05", tz=tz).as_unit(unit)
assert result == expected
assert result.unit == expected.unit
result = arr.min(skipna=False)
assert result is NaT
result = arr.max(skipna=False)
assert result is NaT
@pytest.mark.parametrize("tz", [None, "US/Central"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_empty(self, skipna, tz):
dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
arr = DatetimeArray._from_sequence([], dtype=dtype)
result = arr.min(skipna=skipna)
assert result is NaT
result = arr.max(skipna=skipna)
assert result is NaT
@pytest.mark.parametrize("tz", [None, "US/Central"])
@pytest.mark.parametrize("skipna", [True, False])
def test_median_empty(self, skipna, tz):
dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
arr = DatetimeArray._from_sequence([], dtype=dtype)
result = arr.median(skipna=skipna)
assert result is NaT
arr = arr.reshape(0, 3)
result = arr.median(axis=0, skipna=skipna)
expected = type(arr)._from_sequence([NaT, NaT, NaT], dtype=arr.dtype)
tm.assert_equal(result, expected)
result = arr.median(axis=1, skipna=skipna)
expected = type(arr)._from_sequence([], dtype=arr.dtype)
tm.assert_equal(result, expected)
def test_median(self, arr1d):
arr = arr1d
result = arr.median()
assert result == arr[0]
result = arr.median(skipna=False)
assert result is NaT
result = arr.dropna().median(skipna=False)
assert result == arr[0]
result = arr.median(axis=0)
assert result == arr[0]
def test_median_axis(self, arr1d):
arr = arr1d
assert arr.median(axis=0) == arr.median()
assert arr.median(axis=0, skipna=False) is NaT
msg = r"abs\(axis\) must be less than ndim"
with pytest.raises(ValueError, match=msg):
arr.median(axis=1)
@pytest.mark.filterwarnings("ignore:All-NaN slice encountered:RuntimeWarning")
def test_median_2d(self, arr1d):
arr = arr1d.reshape(1, -1)
# axis = None
assert arr.median() == arr1d.median()
assert arr.median(skipna=False) is NaT
# axis = 0
result = arr.median(axis=0)
expected = arr1d
tm.assert_equal(result, expected)
# Since column 3 is all-NaT, we get NaT there with or without skipna
result = arr.median(axis=0, skipna=False)
expected = arr1d
tm.assert_equal(result, expected)
# axis = 1
result = arr.median(axis=1)
expected = type(arr)._from_sequence([arr1d.median()], dtype=arr.dtype)
tm.assert_equal(result, expected)
result = arr.median(axis=1, skipna=False)
expected = type(arr)._from_sequence([NaT], dtype=arr.dtype)
tm.assert_equal(result, expected)
def test_mean(self, arr1d):
arr = arr1d
# manually verified result
expected = arr[0] + 0.4 * pd.Timedelta(days=1)
result = arr.mean()
assert result == expected
result = arr.mean(skipna=False)
assert result is NaT
result = arr.dropna().mean(skipna=False)
assert result == expected
result = arr.mean(axis=0)
assert result == expected
def test_mean_2d(self):
dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
dta = dti._data.reshape(3, 2)
result = dta.mean(axis=0)
expected = dta[1]
tm.assert_datetime_array_equal(result, expected)
result = dta.mean(axis=1)
expected = dta[:, 0] + pd.Timedelta(hours=12)
tm.assert_datetime_array_equal(result, expected)
result = dta.mean(axis=None)
expected = dti.mean()
assert result == expected
@pytest.mark.parametrize("skipna", [True, False])
def test_mean_empty(self, arr1d, skipna):
arr = arr1d[:0]
assert arr.mean(skipna=skipna) is NaT
arr2d = arr.reshape(0, 3)
result = arr2d.mean(axis=0, skipna=skipna)
expected = DatetimeArray._from_sequence([NaT, NaT, NaT], dtype=arr.dtype)
tm.assert_datetime_array_equal(result, expected)
result = arr2d.mean(axis=1, skipna=skipna)
expected = arr # i.e. 1D, empty
tm.assert_datetime_array_equal(result, expected)
result = arr2d.mean(axis=None, skipna=skipna)
assert result is NaT

View File

@ -0,0 +1,48 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
@pytest.fixture(params=[Float32Dtype, Float64Dtype])
def dtype(request):
    """Parametrized fixture that instantiates and returns a float 'dtype'"""
    dtype_cls = request.param
    return dtype_cls()
@pytest.fixture
def data(dtype):
    """Fixture returning a float array with two missing entries for the given 'dtype'"""
    values = [
        *np.arange(0.1, 0.9, 0.1),
        pd.NA,
        *np.arange(1, 9.8, 0.1),
        pd.NA,
        9.9,
        10.0,
    ]
    return pd.array(values, dtype=dtype)
@pytest.fixture
def data_missing(dtype):
    """
    Fixture returning a two-element array (a NaN entry, then 0.1) built with
    the parametrized float 'dtype'.
    """
    values = [np.nan, 0.1]
    return pd.array(values, dtype=dtype)
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture selecting the 'data' or 'data_missing' float array.

    Used to test dtype conversion with and without missing values.
    """
    selected = request.param
    if selected == "data":
        return data
    return data_missing if selected == "data_missing" else None

View File

@ -0,0 +1,240 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
"opname, exp",
[
("add", [1.1, 2.2, None, None, 5.5]),
("mul", [0.1, 0.4, None, None, 2.5]),
("sub", [0.9, 1.8, None, None, 4.5]),
("truediv", [10.0, 10.0, None, None, 10.0]),
("floordiv", [9.0, 9.0, None, None, 10.0]),
("mod", [0.1, 0.2, None, None, 0.0]),
],
ids=["add", "mul", "sub", "div", "floordiv", "mod"],
)
def test_array_op(dtype, opname, exp):
a = pd.array([1.0, 2.0, None, 4.0, 5.0], dtype=dtype)
b = pd.array([0.1, 0.2, 0.3, None, 0.5], dtype=dtype)
op = getattr(operator, opname)
result = op(a, b)
expected = pd.array(exp, dtype=dtype)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(dtype, zero, negative):
# TODO pending NA/NaN discussion
# https://github.com/pandas-dev/pandas/issues/32265/
a = pd.array([0, 1, -1, None], dtype=dtype)
result = a / zero
expected = FloatingArray(
np.array([np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype),
np.array([False, False, False, True]),
)
if negative:
expected *= -1
tm.assert_extension_array_equal(result, expected)
def test_pow_scalar(dtype):
a = pd.array([-1, 0, 1, None, 2], dtype=dtype)
result = a**0
expected = pd.array([1, 1, 1, 1, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = a**1
expected = pd.array([-1, 0, 1, None, 2], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = a**pd.NA
expected = pd.array([None, None, 1, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = a**np.nan
# TODO np.nan should be converted to pd.NA / missing before operation?
expected = FloatingArray(
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype),
mask=a._mask,
)
tm.assert_extension_array_equal(result, expected)
# reversed
a = a[1:] # Can't raise integers to negative powers.
result = 0**a
expected = pd.array([1, 0, None, 0], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = 1**a
expected = pd.array([1, 1, 1, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = pd.NA**a
expected = pd.array([1, None, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = np.nan**a
expected = FloatingArray(
np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask
)
tm.assert_extension_array_equal(result, expected)
def test_pow_array(dtype):
a = pd.array([0, 0, 0, 1, 1, 1, None, None, None], dtype=dtype)
b = pd.array([0, 1, None, 0, 1, None, 0, 1, None], dtype=dtype)
result = a**b
expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_rpow_one_to_na():
    """An ndarray base of 1 raised to a missing exponent is 1; 2 stays missing."""
    # https://github.com/pandas-dev/pandas/issues/22022
    # https://github.com/pandas-dev/pandas/issues/29997
    exponent = pd.array([np.nan, np.nan], dtype="Float64")
    powered = np.array([1.0, 2.0]) ** exponent
    tm.assert_extension_array_equal(
        powered, pd.array([1.0, np.nan], dtype="Float64")
    )
@pytest.mark.parametrize("other", [0, 0.5])
def test_arith_zero_dim_ndarray(other):
    """Adding a zero-dimensional ndarray matches adding the bare scalar."""
    arr = pd.array([1, None, 2], dtype="Float64")
    zero_dim = np.array(other)
    tm.assert_equal(arr + zero_dim, arr + other)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
# invalid scalars
msg = "|".join(
[
r"can only perform ops with numeric values",
r"FloatingArray cannot perform the operation mod",
"unsupported operand type",
"not all arguments converted during string formatting",
"can't multiply sequence by non-int of type 'float'",
"ufunc 'subtract' cannot use operands with types dtype",
r"can only concatenate str \(not \"float\"\) to str",
"ufunc '.*' not supported for the input types, and the inputs could not",
"ufunc '.*' did not contain a loop with signature matching types",
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
"not implemented",
"not supported for dtype",
"Can only string multiply by an integer",
]
)
with pytest.raises(TypeError, match=msg):
ops("foo")
with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))
# invalid array-likes
with pytest.raises(TypeError, match=msg):
ops(pd.Series("foo", index=s.index))
msg = "|".join(
[
"can only perform ops with numeric values",
"cannot perform .* with this index type: DatetimeArray",
"Addition/subtraction of integers and integer-arrays "
"with DatetimeArray is no longer supported. *",
"unsupported operand type",
"not all arguments converted during string formatting",
"can't multiply sequence by non-int of type 'float'",
"ufunc 'subtract' cannot use operands with types dtype",
(
"ufunc 'add' cannot use operands with types "
rf"dtype\('{tm.ENDIAN}M8\[ns\]'\)"
),
r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)",
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
"not supported for dtype",
]
)
with pytest.raises(TypeError, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
# Various
# -----------------------------------------------------------------------------
def test_cross_type_arithmetic():
    """Float64 columns combined with numpy float64 and Float32 columns."""
    frame = pd.DataFrame(
        {
            "A": pd.array([1, 2, np.nan], dtype="Float64"),
            "B": pd.array([1, np.nan, 3], dtype="Float32"),
            "C": np.array([1, 2, 3], dtype="float64"),
        }
    )

    # masked + numpy float keeps the masked dtype
    tm.assert_series_equal(
        frame.A + frame.C, pd.Series([2, 4, np.nan], dtype="Float64")
    )

    # comparing the result gives a nullable boolean
    tm.assert_series_equal(
        (frame.A + frame.C) * 3 == 12,
        pd.Series([False, True, None], dtype="boolean"),
    )

    # Float64 + Float32 yields Float64
    tm.assert_series_equal(
        frame.A + frame.B, pd.Series([2, np.nan, np.nan], dtype="Float64")
    )
@pytest.mark.parametrize(
"source, neg_target, abs_target",
[
([1.1, 2.2, 3.3], [-1.1, -2.2, -3.3], [1.1, 2.2, 3.3]),
([1.1, 2.2, None], [-1.1, -2.2, None], [1.1, 2.2, None]),
([-1.1, 0.0, 1.1], [1.1, 0.0, -1.1], [1.1, 0.0, 1.1]),
],
)
def test_unary_float_operators(float_ea_dtype, source, neg_target, abs_target):
# GH38794
dtype = float_ea_dtype
arr = pd.array(source, dtype=dtype)
neg_result, pos_result, abs_result = -arr, +arr, abs(arr)
neg_target = pd.array(neg_target, dtype=dtype)
abs_target = pd.array(abs_target, dtype=dtype)
tm.assert_extension_array_equal(neg_result, neg_target)
tm.assert_extension_array_equal(pos_result, arr)
assert not tm.shares_memory(pos_result, arr)
tm.assert_extension_array_equal(abs_result, abs_target)
def test_bitwise(dtype):
left = pd.array([1, None, 3, 4], dtype=dtype)
right = pd.array([None, 3, 5, 4], dtype=dtype)
with pytest.raises(TypeError, match="unsupported operand type"):
left | right
with pytest.raises(TypeError, match="unsupported operand type"):
left & right
with pytest.raises(TypeError, match="unsupported operand type"):
left ^ right

View File

@ -0,0 +1,135 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_astype():
    """Casting Float64 to numpy dtypes, with and without missing values."""
    # with missing values
    with_na = pd.array([0.1, 0.2, None], dtype="Float64")

    with pytest.raises(ValueError, match="cannot convert NA to integer"):
        with_na.astype("int64")

    with pytest.raises(ValueError, match="cannot convert float NaN to bool"):
        with_na.astype("bool")

    tm.assert_numpy_array_equal(
        with_na.astype("float64"), np.array([0.1, 0.2, np.nan], dtype="float64")
    )

    # no missing values
    complete = pd.array([0.0, 1.0, 0.5], dtype="Float64")
    tm.assert_numpy_array_equal(
        complete.astype("int64"), np.array([0, 1, 0], dtype="int64")
    )
    tm.assert_numpy_array_equal(
        complete.astype("bool"), np.array([False, True, True], dtype="bool")
    )
def test_astype_to_floating_array():
    """Cast a Float64 array to masked float dtypes."""
    source = pd.array([0.0, 1.0, None], dtype="Float64")

    # same dtype, given by name or as a dtype instance -> equal to the source
    for same_dtype in ("Float64", pd.Float64Dtype()):
        tm.assert_extension_array_equal(source.astype(same_dtype), source)

    # other masked float dtype
    as_float32 = source.astype("Float32")
    tm.assert_extension_array_equal(
        as_float32, pd.array([0.0, 1.0, None], dtype="Float32")
    )
def test_astype_to_boolean_array():
    """Cast a Float64 array to the masked boolean dtype, by name and by instance."""
    source = pd.array([0.0, 1.0, None], dtype="Float64")
    expected = pd.array([False, True, None], dtype="boolean")

    for bool_dtype in ("boolean", pd.BooleanDtype()):
        tm.assert_extension_array_equal(source.astype(bool_dtype), expected)
def test_astype_to_integer_array():
    """Cast a Float64 array to Int64; the 1.5 entry is expected to become 1."""
    floats = pd.array([0.0, 1.5, None], dtype="Float64")
    as_int = floats.astype("Int64")
    tm.assert_extension_array_equal(
        as_int, pd.array([0, 1, None], dtype="Int64")
    )
def test_astype_str(using_infer_string):
a = pd.array([0.1, 0.2, None], dtype="Float64")
if using_infer_string:
expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan))
tm.assert_extension_array_equal(a.astype(str), expected)
tm.assert_extension_array_equal(a.astype("str"), expected)
else:
expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")
tm.assert_numpy_array_equal(a.astype(str), expected)
tm.assert_numpy_array_equal(a.astype("str"), expected)
def test_astype_copy():
    """Check when ``astype`` copies the data and mask and when it returns the same array."""
    arr = pd.array([0.1, 0.2, None], dtype="Float64")
    pristine = pd.array([0.1, 0.2, None], dtype="Float64")

    # copy=True -> ensure both data and mask are actual copies
    copied = arr.astype("Float64", copy=True)
    assert copied is not arr
    assert not tm.shares_memory(copied, arr)
    for new_value in (10, pd.NA):
        copied[0] = new_value
        tm.assert_extension_array_equal(arr, pristine)

    # copy=False -> same object, so writes are visible through ``arr``
    same = arr.astype("Float64", copy=False)
    assert same is arr
    assert np.shares_memory(same._data, arr._data)
    assert np.shares_memory(same._mask, arr._mask)
    same[0] = 10
    assert arr[0] == 10
    same[0] = pd.NA
    assert arr[0] is pd.NA

    # astype to different dtype -> always needs a copy -> even with copy=False
    # we need to ensure that also the mask is actually copied
    arr = pd.array([0.1, 0.2, None], dtype="Float64")
    pristine = pd.array([0.1, 0.2, None], dtype="Float64")
    converted = arr.astype("Float32", copy=False)
    assert not tm.shares_memory(converted, arr)
    for new_value in (10, pd.NA):
        converted[0] = new_value
        tm.assert_extension_array_equal(arr, pristine)
def test_astype_object(dtype):
arr = pd.array([1.0, pd.NA], dtype=dtype)
result = arr.astype(object)
expected = np.array([1.0, pd.NA], dtype=object)
tm.assert_numpy_array_equal(result, expected)
# check exact element types
assert isinstance(result[0], float)
assert result[1] is pd.NA
def test_Float64_conversion():
    """An object Series of numeric strings casts to Float64 (GH#40729)."""
    target = pd.Float64Dtype()
    strings = pd.Series(["1", "2", "3", "4"], dtype="object")

    converted = strings.astype(target)

    tm.assert_series_equal(converted, pd.Series([1.0, 2.0, 3.0, 4.0], dtype=target))

View File

@ -0,0 +1,65 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.tests.arrays.masked_shared import (
ComparisonOps,
NumericOps,
)
class TestComparisonOps(NumericOps, ComparisonOps):
@pytest.mark.parametrize("other", [True, False, pd.NA, -1.0, 0.0, 1])
def test_scalar(self, other, comparison_op, dtype):
ComparisonOps.test_scalar(self, other, comparison_op, dtype)
def test_compare_with_integerarray(self, comparison_op):
op = comparison_op
a = pd.array([0, 1, None] * 3, dtype="Int64")
b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Float64")
other = b.astype("Int64")
expected = op(a, other)
result = op(a, b)
tm.assert_extension_array_equal(result, expected)
expected = op(other, a)
result = op(b, a)
tm.assert_extension_array_equal(result, expected)
def test_equals():
    """Arrays with the same values but different float dtypes are not equal (GH-30652)."""
    # equals is generally tested in /tests/extension/base/methods, but this
    # specifically tests that two arrays of the same class but different dtype
    # do not evaluate equal
    values = [1, 2, None]
    as_float64 = pd.array(values, dtype="Float64")
    as_float32 = pd.array(values, dtype="Float32")
    assert as_float64.equals(as_float32) is False
def test_equals_nan_vs_na():
    """``equals`` distinguishes an unmasked NaN from a masked (NA) entry (GH#44382)."""
    nan_mask = np.zeros(3, dtype=bool)
    nan_data = np.array([1.0, np.nan, 3.0], dtype=np.float64)
    with_nan = FloatingArray(nan_data, nan_mask)

    # an array holding an unmasked NaN equals itself and its copies
    assert with_nan.equals(with_nan)
    tm.assert_extension_array_equal(with_nan, with_nan)
    assert with_nan.equals(with_nan.copy())
    assert with_nan.equals(FloatingArray(nan_data.copy(), nan_mask.copy()))

    na_mask = np.array([False, True, False], dtype=bool)
    na_data = np.array([1.0, 2.0, 3.0], dtype=np.float64)
    with_na = FloatingArray(na_data, na_mask)
    assert with_na.equals(with_na)
    tm.assert_extension_array_equal(with_na, with_na)

    assert not with_nan.equals(with_na)

    # with mask[1] = True, the only difference is data[1], which should
    # not matter for equals
    nan_mask[1] = True
    assert with_nan.equals(with_na)

View File

@ -0,0 +1,20 @@
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
    "to_concat_dtypes, result_dtype",
    [
        (["Float64", "Float64"], "Float64"),
        (["Float32", "Float64"], "Float64"),
        (["Float32", "Float32"], "Float32"),
    ],
)
def test_concat_series(to_concat_dtypes, result_dtype):
    """Concatenating masked-float Series gives the parametrized result dtype."""
    values = [1, 2, pd.NA]

    pieces = [pd.Series(values, dtype=piece_dtype) for piece_dtype in to_concat_dtypes]
    result = pd.concat(pieces)

    # reference: concatenate as object, then cast to the expected dtype
    object_piece = pd.Series(values, dtype=object)
    expected = pd.concat([object_piece, object_piece]).astype(result_dtype)

    tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,204 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
def test_uses_pandas_na():
    """A ``None`` input is read back as the ``pd.NA`` singleton."""
    arr = pd.array([1, None], dtype=Float64Dtype())
    missing = arr[1]
    assert missing is pd.NA
def test_floating_array_constructor():
    """``FloatingArray(values, mask)`` stores its inputs and validates their types."""
    values = np.array([1, 2, 3, 4], dtype="float64")
    mask = np.array([False, False, False, True], dtype="bool")

    built = FloatingArray(values, mask)
    tm.assert_extension_array_equal(
        built, pd.array([1, 2, 3, np.nan], dtype="Float64")
    )
    tm.assert_numpy_array_equal(built._data, values)
    tm.assert_numpy_array_equal(built._mask, mask)

    # lists and an integer ndarray are rejected
    msg = r".* should be .* numpy array. Use the 'pd.array' function instead"
    invalid_inputs = [
        (values.tolist(), mask),
        (values, mask.tolist()),
        (values.astype(int), mask),
    ]
    for bad_values, bad_mask in invalid_inputs:
        with pytest.raises(TypeError, match=msg):
            FloatingArray(bad_values, bad_mask)

    # the mask argument is required
    msg = r"__init__\(\) missing 1 required positional argument: 'mask'"
    with pytest.raises(TypeError, match=msg):
        FloatingArray(values)
def test_floating_array_disallows_float16():
    """Building a FloatingArray from float16 data raises TypeError (GH#44715)."""
    half_precision = np.array([1, 2], dtype=np.float16)
    no_missing = np.array([False, False])

    with pytest.raises(
        TypeError, match="FloatingArray does not support np.float16 dtype"
    ):
        FloatingArray(half_precision, no_missing)
def test_floating_array_disallows_Float16_dtype(request):
    """``pd.array`` rejects the dtype string "Float16" (GH#44715)."""
    # ``request`` is accepted but not used in the body
    expected_message = "data type 'Float16' not understood"
    with pytest.raises(TypeError, match=expected_message):
        pd.array([1.0, 2.0], dtype="Float16")
def test_floating_array_constructor_copy():
    """The constructor keeps the input buffers by default and copies with ``copy=True``."""
    values = np.array([1, 2, 3, 4], dtype="float64")
    mask = np.array([False, False, False, True], dtype="bool")

    # default: the very same ndarray objects are stored
    shared = FloatingArray(values, mask)
    assert shared._data is values
    assert shared._mask is mask

    # copy=True: new objects for both data and mask
    copied = FloatingArray(values, mask, copy=True)
    assert copied._data is not values
    assert copied._mask is not mask
def test_to_array():
    """``pd.array`` on a list of floats matches an explicit Float64 construction."""
    values = [0.1, 0.2, 0.3, 0.4]
    inferred = pd.array(values)
    explicit = pd.array(values, dtype="Float64")
    tm.assert_extension_array_equal(inferred, explicit)
@pytest.mark.parametrize(
    "a, b",
    [
        ([1, None], [1, pd.NA]),
        ([None], [pd.NA]),
        ([None, np.nan], [pd.NA, pd.NA]),
        ([1, np.nan], [1, pd.NA]),
        ([np.nan], [pd.NA]),
    ],
)
def test_to_array_none_is_nan(a, b):
    """``None`` and ``np.nan`` inputs give the same Float64 array as ``pd.NA`` inputs."""
    from_raw = pd.array(a, dtype="Float64")
    from_na = pd.array(b, dtype="Float64")
    tm.assert_extension_array_equal(from_raw, from_na)
def test_to_array_mixed_integer_float():
    """Mixed int/float lists are inferred as Float64, with or without ``None``."""
    cases = [
        ([1, 2.0], [1.0, 2.0]),
        ([1, None, 2.0], [1.0, None, 2.0]),
    ]
    for mixed, as_floats in cases:
        inferred = pd.array(mixed)
        tm.assert_extension_array_equal(
            inferred, pd.array(as_floats, dtype="Float64")
        )
@pytest.mark.parametrize(
    "values",
    [
        ["foo", "bar"],
        "foo",
        1,
        1.0,
        pd.date_range("20130101", periods=2),
        np.array(["foo"]),
        [[1, 2], [3, 4]],
        [np.nan, {"a": 1}],
        # GH#44514 all-NA case used to get quietly swapped out before checking ndim
        np.array([pd.NA] * 6, dtype=object).reshape(3, 2),
    ],
)
def test_to_array_error(values):
    """Inputs that cannot become a Float64 array raise TypeError or ValueError."""
    # any one of these message patterns is accepted
    accepted_messages = (
        "cannot be converted to FloatingDtype",
        "values must be a 1D list-like",
        "Cannot pass scalar",
        r"float\(\) argument must be a string or a (real )?number, not 'dict'",
        "could not convert string to float: 'foo'",
        r"could not convert string to float: np\.str_\('foo'\)",
    )
    pattern = "|".join(accepted_messages)

    with pytest.raises((TypeError, ValueError), match=pattern):
        pd.array(values, dtype="Float64")
@pytest.mark.parametrize("values", [["1", "2", None], ["1.5", "2", None]])
def test_construct_from_float_strings(values):
    """Numeric strings are parsed when constructing a Float64 array."""
    # see also test_to_integer_array_str
    first_as_float = float(values[0])
    expected = pd.array([first_as_float, 2, None], dtype="Float64")

    via_pd_array = pd.array(values, dtype="Float64")
    tm.assert_extension_array_equal(via_pd_array, expected)

    via_from_sequence = FloatingArray._from_sequence(values)
    tm.assert_extension_array_equal(via_from_sequence, expected)
def test_to_array_inferred_dtype():
    """Dtype inference: float32 ndarray -> Float32, plain float list -> Float64."""
    # if values has dtype -> respect it
    from_float32 = pd.array(np.array([1, 2], dtype="float32"))
    assert from_float32.dtype == Float32Dtype()

    # if values have no dtype -> always float64
    from_list = pd.array([1.0, 2.0])
    assert from_list.dtype == Float64Dtype()
def test_to_array_dtype_keyword():
    """An explicit ``dtype=`` decides the result dtype, even over the input's own dtype."""
    from_ints = pd.array([1, 2], dtype="Float32")
    assert from_ints.dtype == Float32Dtype()

    # if values has dtype -> override it
    float32_input = np.array([1, 2], dtype="float32")
    overridden = pd.array(float32_input, dtype="Float64")
    assert overridden.dtype == Float64Dtype()
def test_to_array_integer():
    """Integer inputs can be converted to a Float64 array."""
    from_int_list = pd.array([1, 2], dtype="Float64")
    tm.assert_extension_array_equal(
        from_int_list, pd.array([1.0, 2.0], dtype="Float64")
    )

    # for integer dtypes, the itemsize is not preserved
    # TODO can we specify "floating" in general?
    int32_input = np.array([1, 2], dtype="int32")
    from_int32 = pd.array(int32_input, dtype="Float64")
    assert from_int32.dtype == Float64Dtype()
@pytest.mark.parametrize(
    "bool_values, values, target_dtype, expected_dtype",
    [
        ([False, True], [0, 1], Float64Dtype(), Float64Dtype()),
        ([False, True], [0, 1], "Float64", Float64Dtype()),
        ([False, True, np.nan], [0, 1, np.nan], Float64Dtype(), Float64Dtype()),
    ],
)
def test_to_array_bool(bool_values, values, target_dtype, expected_dtype):
    """Boolean inputs convert to the same masked float array as their 0/1 equivalents."""
    from_bools = pd.array(bool_values, dtype=target_dtype)
    assert from_bools.dtype == expected_dtype

    from_numbers = pd.array(values, dtype=target_dtype)
    tm.assert_extension_array_equal(from_bools, from_numbers)
def test_series_from_float(data):
# construct from our dtype & string dtype
dtype = data.dtype
# from float
expected = pd.Series(data)
result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype))
tm.assert_series_equal(result, expected)
# from list
expected = pd.Series(data)
result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,12 @@
import numpy as np
import pandas as pd
def test_contains_nan():
# GH#52840
arr = pd.array(range(5)) / 0
assert np.isnan(arr._data[0])
assert not arr.isna()[0]
assert np.nan in arr

View File

@ -0,0 +1,194 @@
import numpy as np
import pytest
from pandas.compat import IS64
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("ufunc", [np.abs, np.sign])
# np.sign emits a warning with nans, <https://github.com/numpy/numpy/issues/15127>
@pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning")
def test_ufuncs_single(ufunc):
    """A unary ufunc on a Float64 array/Series matches the ufunc on the float-cast values."""
    arr = pd.array([1, 2, -3, np.nan], dtype="Float64")
    arr_expected = pd.array(ufunc(arr.astype(float)), dtype="Float64")
    tm.assert_extension_array_equal(ufunc(arr), arr_expected)

    # same check through a Series
    ser = pd.Series(arr)
    tm.assert_series_equal(ufunc(ser), pd.Series(arr_expected))
@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
    """Float-returning unary ufuncs keep the Float64 dtype on arrays and Series."""
    arr = pd.array([1.0, 0.2, 3.0, np.nan], dtype="Float64")
    with np.errstate(invalid="ignore"):
        arr_result = ufunc(arr)
    arr_expected = pd.array(ufunc(arr.astype(float)), dtype="Float64")
    tm.assert_extension_array_equal(arr_result, arr_expected)

    ser = pd.Series(arr)
    with np.errstate(invalid="ignore"):
        ser_result = ufunc(ser)
    ser_expected = pd.Series(ufunc(ser.astype(float)), dtype="Float64")
    tm.assert_series_equal(ser_result, ser_expected)
@pytest.mark.parametrize("ufunc", [np.add, np.subtract])
def test_ufuncs_binary_float(ufunc):
    """Binary ufuncs with a Float64 operand match the ufunc on the float-cast operand."""
    masked = pd.array([1, 0.2, -3, np.nan], dtype="Float64")
    ints = np.array([1, 2, 3, 4])

    # operand pairs: two FloatingArrays, FloatingArray with numpy array (both
    # orders), FloatingArray with scalar (both orders)
    operand_pairs = [
        (masked, masked),
        (masked, ints),
        (ints, masked),
        (masked, 1),
        (1, masked),
    ]

    def to_dense(operand):
        # only the masked array is swapped for its float-cast version
        return masked.astype(float) if operand is masked else operand

    for left, right in operand_pairs:
        result = ufunc(left, right)
        expected = pd.array(ufunc(to_dense(left), to_dense(right)), dtype="Float64")
        tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("values", [[0, 1], [0, None]])
def test_ufunc_reduce_raises(values):
    """``np.add.reduce`` on a Float64 array matches ``sum(skipna=False)``."""
    arr = pd.array(values, dtype="Float64")
    reduced = np.add.reduce(arr)
    tm.assert_almost_equal(reduced, arr.sum(skipna=False))
@pytest.mark.skipif(not IS64, reason="GH 36579: fail on 32-bit system")
@pytest.mark.parametrize(
    "pandasmethname, kwargs",
    [
        ("var", {"ddof": 0}),
        ("var", {"ddof": 1}),
        ("std", {"ddof": 0}),
        ("std", {"ddof": 1}),
        ("kurtosis", {}),
        ("skew", {}),
        ("sem", {}),
    ],
)
def test_stat_method(pandasmethname, kwargs):
    """A statistic on Float64 data with NaNs equals the one on the NaN-free float64 data."""
    valid = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

    masked = pd.Series(data=valid + [np.nan, np.nan], dtype="Float64")
    result = getattr(masked, pandasmethname)(**kwargs)

    plain = pd.Series(data=valid, dtype="float64")
    expected = getattr(plain, pandasmethname)(**kwargs)

    assert expected == result
def test_value_counts_na():
    """``value_counts`` counts NA when ``dropna=False`` and omits it when ``dropna=True``."""
    arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
    index = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype)
    assert index.dtype == arr.dtype

    with_na = arr.value_counts(dropna=False)
    tm.assert_series_equal(
        with_na, pd.Series([2, 1, 1], index=index, dtype="Int64", name="count")
    )

    without_na = arr.value_counts(dropna=True)
    tm.assert_series_equal(
        without_na, pd.Series([2, 1], index=index[:-1], dtype="Int64", name="count")
    )
def test_value_counts_empty():
    """``value_counts`` on an empty Float64 Series gives an empty Int64 Series with a Float64 index."""
    empty = pd.Series([], dtype="Float64")
    counts = empty.value_counts()

    expected_index = pd.Index([], dtype="Float64")
    assert expected_index.dtype == "Float64"

    tm.assert_series_equal(
        counts, pd.Series([], index=expected_index, dtype="Int64", name="count")
    )
def test_value_counts_with_normalize():
    """``value_counts(normalize=True)`` returns Float64 proportions of the non-NA values."""
    ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
    proportions = ser.value_counts(normalize=True)

    # raw counts indexed by the first two values, divided by 3
    counts = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion")
    expected = counts / 3
    assert expected.index.dtype == ser.dtype

    tm.assert_series_equal(proportions, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_floating_array_sum(skipna, min_count, dtype):
arr = pd.array([1, 2, 3, None], dtype=dtype)
result = arr.sum(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 6.0
else:
assert result is pd.NA
@pytest.mark.parametrize(
    "values, expected", [([1, 2, 3], 6.0), ([1, 2, 3, None], 6.0), ([None], 0.0)]
)
def test_floating_array_numpy_sum(values, expected):
    """``np.sum`` on a Float64 array gives the parametrized total."""
    total = np.sum(pd.array(values, dtype="Float64"))
    assert total == expected
@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"])
def test_preserve_dtypes(op):
    """Reductions on a Float64 column return ``np.float64``; groupby keeps Float64."""
    frame = pd.DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": pd.array([0.1, None, 3.0], dtype="Float64"),
        }
    )

    # op
    scalar = getattr(frame.C, op)()
    assert isinstance(scalar, np.float64)

    # groupby
    grouped = getattr(frame.groupby("A"), op)()
    expected_columns = {
        "B": np.array([1.0, 3.0]),
        "C": pd.array([0.1, 3], dtype="Float64"),
    }
    expected = pd.DataFrame(expected_columns, index=pd.Index(["a", "b"], name="A"))
    tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("method", ["min", "max"])
def test_floating_array_min_max(skipna, method, dtype):
arr = pd.array([0.0, 1.0, None], dtype=dtype)
func = getattr(arr, method)
result = func(skipna=skipna)
if skipna:
assert result == (0 if method == "min" else 1)
else:
assert result is pd.NA
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 9])
def test_floating_array_prod(skipna, min_count, dtype):
arr = pd.array([1.0, 2.0, None], dtype=dtype)
result = arr.prod(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 2
else:
assert result is pd.NA

View File

@ -0,0 +1,47 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
def test_dtypes(dtype):
# smoke tests on auto dtype construction
np.dtype(dtype.type).kind == "f"
assert dtype.name is not None
@pytest.mark.parametrize(
    "dtype, expected",
    [(Float32Dtype(), "Float32Dtype()"), (Float64Dtype(), "Float64Dtype()")],
)
def test_repr_dtype(dtype, expected):
    """``repr`` of a masked float dtype is the class name followed by ``()``."""
    rendered = repr(dtype)
    assert rendered == expected
def test_repr_array():
    """``repr`` of a short float array shows class, values and a length/dtype footer."""
    rendered = repr(pd.array([1.0, None, 3.0]))
    assert rendered == "<FloatingArray>\n[1.0, <NA>, 3.0]\nLength: 3, dtype: Float64"
def test_repr_array_long():
    """``repr`` of a 3000-element float array is truncated to a head and a tail row.

    The expected text is whitespace-sensitive: each element is right-justified
    to the width of ``<NA>`` and continuation lines start with one space. The
    previous literal had that padding collapsed and could not match.
    """
    data = pd.array([1.0, 2.0, None] * 1000)
    # written line by line so the significant padding stays visible
    expected = (
        "<FloatingArray>\n"
        "[ 1.0,  2.0, <NA>,  1.0,  2.0, <NA>,  1.0,  2.0, <NA>,  1.0,\n"
        " ...\n"
        " <NA>,  1.0,  2.0, <NA>,  1.0,  2.0, <NA>,  1.0,  2.0, <NA>]\n"
        "Length: 3000, dtype: Float64"
    )
    result = repr(data)
    assert result == expected
def test_frame_repr(data_missing):
df = pd.DataFrame({"A": data_missing})
result = repr(df)
expected = " A\n0 <NA>\n1 0.1"
assert result == expected

View File

@ -0,0 +1,132 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
con = pd.Series if box else pd.array
# default (with or without missing values) -> object dtype
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_float(box):
con = pd.Series if box else pd.array
# no missing values -> can convert to float, otherwise raises
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy(dtype="float64")
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy(dtype="float64")
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_int(box):
    """``to_numpy(dtype="int64")`` converts complete data and raises on missing values."""
    make = pd.Series if box else pd.array

    # no missing values -> can convert to int, otherwise raises
    whole = make([1.0, 2.0, 3.0], dtype="Float64")
    tm.assert_numpy_array_equal(
        whole.to_numpy(dtype="int64"), np.array([1, 2, 3], dtype="int64")
    )

    with_na = make([1.0, 2.0, None], dtype="Float64")
    with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
        with_na.to_numpy(dtype="int64")

    # automatic casting (floors the values)
    fractional = make([0.1, 0.9, 1.1], dtype="Float64")
    tm.assert_numpy_array_equal(
        fractional.to_numpy(dtype="int64"), np.array([0, 0, 1], dtype="int64")
    )
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_value(box):
    """``na_value`` replaces the missing entry for object, bool and int64 targets."""
    make = pd.Series if box else pd.array
    arr = make([0.0, 1.0, None], dtype="Float64")

    # (to_numpy keyword arguments, expected ndarray)
    cases = [
        (
            {"dtype": object, "na_value": None},
            np.array([0.0, 1.0, None], dtype="object"),
        ),
        (
            {"dtype": bool, "na_value": False},
            np.array([False, True, False], dtype="bool"),
        ),
        (
            {"dtype": "int64", "na_value": -99},
            np.array([0, 1, -99], dtype="int64"),
        ),
    ]
    for kwargs, expected in cases:
        tm.assert_numpy_array_equal(arr.to_numpy(**kwargs), expected)
def test_to_numpy_na_value_with_nan():
    """With both NaN and NA present, only the NA entry is filled with ``na_value``."""
    # array with both NaN and NA -> only fill NA with `na_value`
    data = np.array([0.0, np.nan, 0.0])
    mask = np.array([False, False, True])
    arr = FloatingArray(data, mask)

    filled = arr.to_numpy(dtype="float64", na_value=-1)

    tm.assert_numpy_array_equal(
        filled, np.array([0.0, np.nan, -1.0], dtype="float64")
    )
@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_dtype(box, dtype):
    """Complete Float64 data converts to each parametrized numpy dtype."""
    make = pd.Series if box else pd.array
    converted = make([0.0, 1.0], dtype="Float64").to_numpy(dtype=dtype)
    tm.assert_numpy_array_equal(converted, np.array([0, 1], dtype=dtype))
@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_raises(box, dtype):
    """Converting data with a missing value to an int/bool dtype raises ValueError."""
    make = pd.Series if box else pd.array
    with_na = make([0.0, 1.0, None], dtype="Float64")

    # the error message is expected to mention the requested dtype
    with pytest.raises(ValueError, match=dtype):
        with_na.to_numpy(dtype=dtype)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_string(box, dtype):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0, None], dtype="Float64")
result = arr.to_numpy(dtype="str")
expected = np.array([0.0, 1.0, pd.NA], dtype=f"{tm.ENDIAN}U32")
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_copy():
    """``to_numpy`` may return a view without missing values; ``copy=True`` never does."""
    # to_numpy can be zero-copy if no missing values
    source = pd.array([0.1, 0.2, 0.3], dtype="Float64")
    view = source.to_numpy(dtype="float64")
    view[0] = 10
    # the write shows up in the source array
    tm.assert_extension_array_equal(
        source, pd.array([10, 0.2, 0.3], dtype="Float64")
    )

    source = pd.array([0.1, 0.2, 0.3], dtype="Float64")
    copied = source.to_numpy(dtype="float64", copy=True)
    copied[0] = 10
    # the source array is left untouched
    tm.assert_extension_array_equal(
        source, pd.array([0.1, 0.2, 0.3], dtype="Float64")
    )

View File

@ -0,0 +1,68 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
@pytest.fixture(
    params=[
        Int8Dtype,
        Int16Dtype,
        Int32Dtype,
        Int64Dtype,
        UInt8Dtype,
        UInt16Dtype,
        UInt32Dtype,
        UInt64Dtype,
    ]
)
def dtype(request):
    """Parametrized fixture returning an instance of each integer dtype class."""
    dtype_class = request.param
    return dtype_class()
@pytest.fixture
def data(dtype):
    """
    Fixture returning an array of the parametrized integer 'dtype' that holds
    valid values and two missing values.

    Used to test dtype conversion with and without missing values.
    """
    # 0..7, NaN, 10..97, NaN, 99, 100
    values = [*range(8), np.nan, *range(10, 98), np.nan, 99, 100]
    return pd.array(values, dtype=dtype)
@pytest.fixture
def data_missing(dtype):
    """
    Fixture returning a two-element array of the parametrized integer 'dtype':
    one NaN followed by one valid integer.

    Used to test dtype conversion with and without missing values.
    """
    missing_then_valid = [np.nan, 1]
    return pd.array(missing_then_valid, dtype=dtype)
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture returning the 'data' or 'data_missing' integer array.

    Used to test dtype conversion with and without missing values.
    """
    # select by parameter name; an unknown name yields None, as before
    by_name = {"data": data, "data_missing": data_missing}
    return by_name.get(request.param)

View File

@ -0,0 +1,345 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core import ops
from pandas.core.arrays import FloatingArray
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
"opname, exp",
[("add", [1, 3, None, None, 9]), ("mul", [0, 2, None, None, 20])],
ids=["add", "mul"],
)
def test_add_mul(dtype, opname, exp):
a = pd.array([0, 1, None, 3, 4], dtype=dtype)
b = pd.array([1, 2, 3, None, 5], dtype=dtype)
# array / array
expected = pd.array(exp, dtype=dtype)
op = getattr(operator, opname)
result = op(a, b)
tm.assert_extension_array_equal(result, expected)
op = getattr(ops, "r" + opname)
result = op(a, b)
tm.assert_extension_array_equal(result, expected)
def test_sub(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a - b
expected = pd.array([1, 1, None, None, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_div(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a / b
expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(zero, negative):
    """Dividing an Int64 array by zero gives NaN/±inf, sign-flipped for negative zero."""
    # https://github.com/pandas-dev/pandas/issues/27398, GH#22793
    dividend = pd.array([0, 1, -1, None], dtype="Int64")
    quotient = dividend / zero

    expected_values = np.array([np.nan, np.inf, -np.inf, 1], dtype="float64")
    expected_mask = np.array([False, False, False, True])
    expected = FloatingArray(expected_values, expected_mask)
    if negative:
        expected *= -1

    tm.assert_extension_array_equal(quotient, expected)
def test_floordiv(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a // b
# Series op sets 1//0 to np.inf, which IntegerArray does not do (yet)
expected = pd.array([0, 2, None, None, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_floordiv_by_int_zero_no_mask(any_int_ea_dtype):
# GH 48223: Aligns with non-masked floordiv
# but differs from numpy
# https://github.com/pandas-dev/pandas/issues/30188#issuecomment-564452740
ser = pd.Series([0, 1], dtype=any_int_ea_dtype)
result = 1 // ser
expected = pd.Series([np.inf, 1.0], dtype="Float64")
tm.assert_series_equal(result, expected)
ser_non_nullable = ser.astype(ser.dtype.numpy_dtype)
result = 1 // ser_non_nullable
expected = expected.astype(np.float64)
tm.assert_series_equal(result, expected)
def test_mod(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a % b
expected = pd.array([0, 0, None, None, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_pow_scalar():
    """Power between an Int64 array and scalars (0, 1, ``pd.NA``, NaN), both orders."""
    base = pd.array([-1, 0, 1, None, 2], dtype="Int64")

    # array ** scalar
    tm.assert_extension_array_equal(
        base**0, pd.array([1, 1, 1, 1, 1], dtype="Int64")
    )
    tm.assert_extension_array_equal(
        base**1, pd.array([-1, 0, 1, None, 2], dtype="Int64")
    )
    tm.assert_extension_array_equal(
        base**pd.NA, pd.array([None, None, 1, None, None], dtype="Int64")
    )
    # NaN exponent: float result, only the originally missing entry is masked
    expected_nan_exponent = FloatingArray(
        np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
        np.array([False, False, False, True, False]),
    )
    tm.assert_extension_array_equal(base**np.nan, expected_nan_exponent)

    # reversed
    exponents = base[1:]  # Can't raise integers to negative powers.

    tm.assert_extension_array_equal(
        0**exponents, pd.array([1, 0, None, 0], dtype="Int64")
    )
    tm.assert_extension_array_equal(
        1**exponents, pd.array([1, 1, 1, 1], dtype="Int64")
    )
    tm.assert_extension_array_equal(
        pd.NA**exponents, pd.array([1, None, None, None], dtype="Int64")
    )
    expected_nan_base = FloatingArray(
        np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
        np.array([False, False, True, False]),
    )
    tm.assert_extension_array_equal(np.nan**exponents, expected_nan_base)
def test_pow_array():
    """Elementwise power over every pairing of 0, 1 and a missing value."""
    bases = pd.array([0, 0, 0, 1, 1, 1, None, None, None])
    exponents = pd.array([0, 1, None, 0, 1, None, 0, 1, None])

    powered = bases**exponents

    tm.assert_extension_array_equal(
        powered, pd.array([1, 0, None, 1, 1, 1, 1, None, None])
    )
def test_rpow_one_to_na():
    """A float ndarray raised to an all-missing Int64 array: 1.0 stays 1.0, 2.0 is missing."""
    # https://github.com/pandas-dev/pandas/issues/22022
    # https://github.com/pandas-dev/pandas/issues/29997
    all_missing = pd.array([np.nan, np.nan], dtype="Int64")
    bases = np.array([1.0, 2.0])

    powered = bases**all_missing

    tm.assert_extension_array_equal(
        powered, pd.array([1.0, np.nan], dtype="Float64")
    )
@pytest.mark.parametrize("other", [0, 0.5])
def test_numpy_zero_dim_ndarray(other):
    """Adding a zero-dimensional ndarray gives the same result as adding the scalar."""
    arr = pd.array([1, None, 2])
    with_zero_dim = arr + np.array(other)
    with_scalar = arr + other
    tm.assert_equal(with_zero_dim, with_scalar)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
# invalid scalars
with tm.external_error_raised(TypeError):
ops("foo")
with tm.external_error_raised(TypeError):
ops(pd.Timestamp("20180101"))
# invalid array-likes
str_ser = pd.Series("foo", index=s.index)
# with pytest.raises(TypeError, match=msg):
if all_arithmetic_operators in [
"__mul__",
"__rmul__",
]: # (data[~data.isna()] >= 0).all():
res = ops(str_ser)
expected = pd.Series(["foo" * x for x in data], index=s.index)
expected = expected.fillna(np.nan)
# TODO: doing this fillna to keep tests passing as we make
# assert_almost_equal stricter, but the expected with pd.NA seems
# more-correct than np.nan here.
tm.assert_series_equal(res, expected)
else:
with tm.external_error_raised(TypeError):
ops(str_ser)
with tm.external_error_raised(TypeError):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
# Various
# -----------------------------------------------------------------------------
# TODO test unsigned overflow
def test_arith_coerce_scalar(data, all_arithmetic_operators):
op = tm.get_op_from_name(all_arithmetic_operators)
s = pd.Series(data)
other = 0.01
result = op(s, other)
expected = op(s.astype(float), other)
expected = expected.astype("Float64")
# rmod results in NaN that wasn't NA in original nullable Series -> unmask it
if all_arithmetic_operators == "__rmod__":
mask = (s == 0).fillna(False).to_numpy(bool)
expected.array._mask[mask] = False
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("other", [1.0, np.array(1.0)])
def test_arithmetic_conversion(all_arithmetic_operators, other):
# if we have a float operand we should have a float result
# if that is equal to an integer
op = tm.get_op_from_name(all_arithmetic_operators)
s = pd.Series([1, 2, 3], dtype="Int64")
result = op(s, other)
assert result.dtype == "Float64"
def test_cross_type_arithmetic():
    """Arithmetic across Int64, UInt8 and plain integer columns of one frame."""
    frame = pd.DataFrame(
        {
            "A": pd.Series([1, 2, np.nan], dtype="Int64"),
            "B": pd.Series([1, np.nan, 3], dtype="UInt8"),
            "C": [1, 2, 3],
        }
    )

    # masked + plain integers
    masked_plus_plain = frame.A + frame.C
    tm.assert_series_equal(
        masked_plus_plain, pd.Series([2, 4, np.nan], dtype="Int64")
    )

    # comparing the scaled sum gives a masked boolean
    is_twelve = (frame.A + frame.C) * 3 == 12
    tm.assert_series_equal(
        is_twelve, pd.Series([False, True, None], dtype="boolean")
    )

    # Int64 + UInt8
    masked_plus_masked = frame.A + frame.B
    tm.assert_series_equal(
        masked_plus_masked, pd.Series([2, np.nan, np.nan], dtype="Int64")
    )
@pytest.mark.parametrize("op", ["mean"])
def test_reduce_to_float(op):
    """Float-returning reductions on Int64 give a float scalar and Float64 under groupby."""
    # some reduce ops always return float, even if the result
    # is a rounded number
    frame = pd.DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": pd.array([1, None, 3], dtype="Int64"),
        }
    )

    # op
    scalar = getattr(frame.C, op)()
    assert isinstance(scalar, float)

    # groupby
    grouped = getattr(frame.groupby("A"), op)()
    expected_columns = {
        "B": np.array([1.0, 3.0]),
        "C": pd.array([1, 3], dtype="Float64"),
    }
    expected = pd.DataFrame(expected_columns, index=pd.Index(["a", "b"], name="A"))
    tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
    "source, neg_target, abs_target",
    [
        ([1, 2, 3], [-1, -2, -3], [1, 2, 3]),
        ([1, 2, None], [-1, -2, None], [1, 2, None]),
        ([-1, 0, 1], [1, 0, -1], [1, 0, 1]),
    ],
)
def test_unary_int_operators(any_signed_int_ea_dtype, source, neg_target, abs_target):
    """Unary ``-``, ``+`` and ``abs`` on signed masked integer arrays."""
    values = pd.array(source, dtype=any_signed_int_ea_dtype)
    expected_neg = pd.array(neg_target, dtype=any_signed_int_ea_dtype)
    expected_abs = pd.array(abs_target, dtype=any_signed_int_ea_dtype)

    tm.assert_extension_array_equal(-values, expected_neg)

    # unary plus returns an equal array that does not alias the input
    positive = +values
    tm.assert_extension_array_equal(positive, values)
    assert not tm.shares_memory(positive, values)

    tm.assert_extension_array_equal(abs(values), expected_abs)
def test_values_multiplying_large_series_by_NA():
    """Multiplying ``pd.NA`` by a large float Series yields an all-NA Series."""
    # GH#33701
    length = 10001
    product = pd.NA * pd.Series(np.zeros(length))
    all_missing = pd.Series([pd.NA] * length)
    tm.assert_series_equal(product, all_missing)
def test_bitwise(dtype):
    """Bitwise ``|``, ``&`` and ``^`` propagate NA and reject Float64 operands."""
    lhs = pd.array([1, None, 3, 4], dtype=dtype)
    rhs = pd.array([None, 3, 5, 4], dtype=dtype)

    either = pd.array([None, None, 3 | 5, 4 | 4], dtype=dtype)
    tm.assert_extension_array_equal(lhs | rhs, either)

    both = pd.array([None, None, 3 & 5, 4 & 4], dtype=dtype)
    tm.assert_extension_array_equal(lhs & rhs, both)

    exclusive = pd.array([None, None, 3 ^ 5, 4 ^ 4], dtype=dtype)
    tm.assert_extension_array_equal(lhs ^ rhs, exclusive)

    # TODO: desired behavior when operating with boolean? defer?

    rhs_as_float = rhs.astype("Float64")
    with pytest.raises(TypeError, match="unsupported operand type"):
        lhs | rhs_as_float
    with pytest.raises(TypeError, match="unsupported operand type"):
        lhs & rhs_as_float
    with pytest.raises(TypeError, match="unsupported operand type"):
        lhs ^ rhs_as_float

View File

@ -0,0 +1,39 @@
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.tests.arrays.masked_shared import (
ComparisonOps,
NumericOps,
)
class TestComparisonOps(NumericOps, ComparisonOps):
    """Comparison tests for masked integer arrays, built on the shared masked suites."""

    @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1])
    def test_scalar(self, other, comparison_op, dtype):
        """Run the shared scalar-comparison test with integer-relevant scalars."""
        ComparisonOps.test_scalar(self, other, comparison_op, dtype)

    def test_compare_to_int(self, dtype, comparison_op):
        """Comparing to a plain int matches the float result with NA where missing."""
        # GH 28930
        dunder = f"__{comparison_op.__name__}__"
        masked = pd.Series([1, None, 3], dtype=dtype)
        floats = pd.Series([1, None, 3], dtype="float")

        result = getattr(masked, dunder)(2)

        expected = getattr(floats, dunder)(2).astype("boolean")
        expected[floats.isna()] = pd.NA

        tm.assert_series_equal(result, expected)
def test_equals():
    """Arrays of the same class but different dtype must not compare equal."""
    # GH-30652
    # equals is generally tested in /tests/extension/base/methods, but this
    # specifically covers the same-class / different-dtype case
    as_int64 = pd.array([1, 2, None], dtype="Int64")
    as_int32 = pd.array([1, 2, None], dtype="Int32")
    assert as_int64.equals(as_int32) is False

View File

@ -0,0 +1,69 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
    "to_concat_dtypes, result_dtype",
    [
        (["Int64", "Int64"], "Int64"),
        (["UInt64", "UInt64"], "UInt64"),
        (["Int8", "Int8"], "Int8"),
        (["Int8", "Int16"], "Int16"),
        (["UInt8", "Int8"], "Int16"),
        (["Int32", "UInt32"], "Int64"),
        (["Int64", "UInt64"], "Float64"),
        (["Int64", "boolean"], "object"),
        (["UInt8", "boolean"], "object"),
    ],
)
def test_concat_series(to_concat_dtypes, result_dtype):
    """Concatenating masked Series gives the expected common dtype in either order."""
    # we expect the same dtypes as we would get with non-masked inputs,
    # just masked where available.
    forward = [pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes]
    result = pd.concat(forward)
    expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype(
        result_dtype
    )
    tm.assert_series_equal(result, expected)

    # order doesn't matter for result
    backward = [pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes[::-1]]
    result = pd.concat(backward)
    expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype(
        result_dtype
    )
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "to_concat_dtypes, result_dtype",
    [
        (["Int64", "int64"], "Int64"),
        (["UInt64", "uint64"], "UInt64"),
        (["Int8", "int8"], "Int8"),
        (["Int8", "int16"], "Int16"),
        (["UInt8", "int8"], "Int16"),
        (["Int32", "uint32"], "Int64"),
        (["Int64", "uint64"], "Float64"),
        (["Int64", "bool"], "object"),
        (["UInt8", "bool"], "object"),
    ],
)
def test_concat_series_with_numpy(to_concat_dtypes, result_dtype):
    """Concatenating a masked Series with a NumPy-backed one, in both orders."""
    # we expect the same dtypes as we would get with non-masked inputs,
    # just masked where available.
    masked = pd.Series([0, 1, pd.NA], dtype=to_concat_dtypes[0])
    plain = pd.Series(np.array([0, 1], dtype=to_concat_dtypes[1]))

    masked_first = pd.concat([masked, plain], ignore_index=True)
    expected = pd.Series([0, 1, pd.NA, 0, 1], dtype=object).astype(result_dtype)
    tm.assert_series_equal(masked_first, expected)

    # order doesn't matter for result
    plain_first = pd.concat([plain, masked], ignore_index=True)
    expected = pd.Series([0, 1, 0, 1, pd.NA], dtype=object).astype(result_dtype)
    tm.assert_series_equal(plain_first, expected)

View File

@ -0,0 +1,245 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_integer
from pandas.core.arrays import IntegerArray
from pandas.core.arrays.integer import (
Int8Dtype,
Int32Dtype,
Int64Dtype,
)
@pytest.fixture(params=[pd.array, IntegerArray._from_sequence])
def constructor(request):
    """Yield each way of building an IntegerArray from a sequence.

    Parametrized over ``pd.array`` and ``IntegerArray._from_sequence``;
    used by the dtype-conversion tests.
    """
    build = request.param
    return build
def test_uses_pandas_na():
    """A missing element is returned as the ``pd.NA`` singleton."""
    masked = pd.array([1, None], dtype=Int64Dtype())
    missing = masked[1]
    assert missing is pd.NA
def test_from_dtype_from_float(data):
    """Build a Series from float, list and int-array input using the dtype's string name."""
    # construct from our dtype & string dtype
    dtype = data.dtype
    dtype_name = str(dtype)

    # from float
    as_float = data.to_numpy(na_value=np.nan, dtype="float")
    tm.assert_series_equal(pd.Series(as_float, dtype=dtype_name), pd.Series(data))

    # from int / list
    as_list = np.array(data).tolist()
    tm.assert_series_equal(pd.Series(as_list, dtype=dtype_name), pd.Series(data))

    # from int / array
    without_na = np.array(data.dropna()).astype(np.dtype(dtype.type))
    expected = pd.Series(data).dropna().reset_index(drop=True)
    tm.assert_series_equal(pd.Series(without_na, dtype=dtype_name), expected)
def test_conversions(data_missing):
    """Casting to object keeps ``pd.NA`` and the scalar types of the values."""
    frame = pd.DataFrame({"A": data_missing})

    # astype to object series
    as_object = frame["A"].astype("object")
    tm.assert_series_equal(
        as_object, pd.Series(np.array([pd.NA, 1], dtype=object), name="A")
    )

    # convert to object ndarray
    # we assert that we are exactly equal
    # including type conversions of scalars
    result = frame["A"].astype("object").values
    expected = np.array([pd.NA, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for got, want in zip(result, expected):
        if pd.isnull(got):
            assert pd.isnull(want)
            continue
        assert got == want
        if is_integer(got):
            assert is_integer(want)
        else:
            assert type(got) == type(want)
def test_integer_array_constructor():
    """``IntegerArray(values, mask)`` accepts ndarrays and rejects other inputs."""
    values = np.array([1, 2, 3, 4], dtype="int64")
    mask = np.array([False, False, False, True], dtype="bool")

    built = IntegerArray(values, mask)
    tm.assert_extension_array_equal(built, pd.array([1, 2, 3, np.nan], dtype="Int64"))

    # list values, list mask and float values are all rejected
    msg = r".* should be .* numpy array. Use the 'pd.array' function instead"
    invalid_arguments = [
        (values.tolist(), mask),
        (values, mask.tolist()),
        (values.astype(float), mask),
    ]
    for bad_values, bad_mask in invalid_arguments:
        with pytest.raises(TypeError, match=msg):
            IntegerArray(bad_values, bad_mask)

    msg = r"__init__\(\) missing 1 required positional argument: 'mask'"
    with pytest.raises(TypeError, match=msg):
        IntegerArray(values)
def test_integer_array_constructor_copy():
    """Without ``copy`` the inputs are reused; with ``copy=True`` they are not."""
    values = np.array([1, 2, 3, 4], dtype="int64")
    mask = np.array([False, False, False, True], dtype="bool")

    shared = IntegerArray(values, mask)
    assert shared._data is values
    assert shared._mask is mask

    copied = IntegerArray(values, mask, copy=True)
    assert copied._data is not values
    assert copied._mask is not mask
@pytest.mark.parametrize(
    "a, b",
    [
        ([1, None], [1, np.nan]),
        ([None], [np.nan]),
        ([None, np.nan], [np.nan, np.nan]),
        ([np.nan, np.nan], [np.nan, np.nan]),
    ],
)
def test_to_integer_array_none_is_nan(a, b):
    """``None`` and ``np.nan`` are interchangeable as missing markers on construction."""
    from_a = pd.array(a, dtype="Int64")
    from_b = pd.array(b, dtype="Int64")
    tm.assert_extension_array_equal(from_a, from_b)
@pytest.mark.parametrize(
    "values",
    [
        ["foo", "bar"],
        "foo",
        1,
        1.0,
        pd.date_range("20130101", periods=2),
        np.array(["foo"]),
        [[1, 2], [3, 4]],
        [np.nan, {"a": 1}],
    ],
)
def test_to_integer_array_error(values):
    """Inputs that cannot become an IntegerArray raise ValueError or TypeError."""
    # error in converting existing arrays to IntegerArrays
    acceptable_messages = (
        r"cannot be converted to IntegerDtype",
        r"invalid literal for int\(\) with base 10:",
        r"values must be a 1D list-like",
        r"Cannot pass scalar",
        r"int\(\) argument must be a string",
    )
    msg = "|".join(acceptable_messages)

    with pytest.raises((ValueError, TypeError), match=msg):
        pd.array(values, dtype="Int64")

    with pytest.raises((ValueError, TypeError), match=msg):
        IntegerArray._from_sequence(values)
def test_to_integer_array_inferred_dtype(constructor):
    """The masked dtype is inferred from the input's NumPy dtype, else Int64."""
    # if values has dtype -> respect it
    for numpy_name, masked_dtype in (("int8", Int8Dtype()), ("int32", Int32Dtype())):
        built = constructor(np.array([1, 2], dtype=numpy_name))
        assert built.dtype == masked_dtype

    # if values have no dtype -> always int64
    built = constructor([1, 2])
    assert built.dtype == Int64Dtype()
def test_to_integer_array_dtype_keyword(constructor):
    """An explicit ``dtype`` keyword wins over any dtype carried by the input."""
    from_list = constructor([1, 2], dtype="Int8")
    assert from_list.dtype == Int8Dtype()

    # if values has dtype -> override it
    int8_input = np.array([1, 2], dtype="int8")
    from_array = constructor(int8_input, dtype="Int32")
    assert from_array.dtype == Int32Dtype()
def test_to_integer_array_float():
    """Whole-number floats convert; fractional floats are refused."""
    whole = IntegerArray._from_sequence([1.0, 2.0], dtype="Int64")
    tm.assert_extension_array_equal(whole, pd.array([1, 2], dtype="Int64"))

    with pytest.raises(TypeError, match="cannot safely cast non-equivalent"):
        IntegerArray._from_sequence([1.5, 2.0], dtype="Int64")

    # for float dtypes, the itemsize is not preserved
    float32_input = np.array([1.0, 2.0], dtype="float32")
    from_float32 = IntegerArray._from_sequence(float32_input, dtype="Int64")
    assert from_float32.dtype == Int64Dtype()
def test_to_integer_array_str():
    """Integer-like strings parse; empty or decimal strings raise ValueError."""
    parsed = IntegerArray._from_sequence(["1", "2", None], dtype="Int64")
    tm.assert_extension_array_equal(parsed, pd.array([1, 2, np.nan], dtype="Int64"))

    unparseable_inputs = (["1", "2", ""], ["1.5", "2.0"])
    for strings in unparseable_inputs:
        with pytest.raises(
            ValueError, match=r"invalid literal for int\(\) with base 10: .*"
        ):
            IntegerArray._from_sequence(strings, dtype="Int64")
@pytest.mark.parametrize(
    "bool_values, int_values, target_dtype, expected_dtype",
    [
        ([False, True], [0, 1], Int64Dtype(), Int64Dtype()),
        ([False, True], [0, 1], "Int64", Int64Dtype()),
        ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()),
    ],
)
def test_to_integer_array_bool(
    constructor, bool_values, int_values, target_dtype, expected_dtype
):
    """Booleans convert to 0/1 under the requested integer dtype."""
    from_bools = constructor(bool_values, dtype=target_dtype)
    assert from_bools.dtype == expected_dtype

    from_ints = pd.array(int_values, dtype=target_dtype)
    tm.assert_extension_array_equal(from_bools, from_ints)
@pytest.mark.parametrize(
    "values, to_dtype, result_dtype",
    [
        (np.array([1], dtype="int64"), None, Int64Dtype),
        (np.array([1, np.nan]), None, Int64Dtype),
        (np.array([1, np.nan]), "int8", Int8Dtype),
    ],
)
def test_to_integer_array(values, to_dtype, result_dtype):
    """Existing NumPy arrays convert to an IntegerArray of the expected dtype."""
    # convert existing arrays to IntegerArrays
    target = result_dtype()
    converted = IntegerArray._from_sequence(values, dtype=to_dtype)
    assert converted.dtype == target

    reference = pd.array(values, dtype=target)
    tm.assert_extension_array_equal(converted, reference)
def test_integer_array_from_boolean():
    """Object-dtype booleans convert the same way as bool-dtype booleans."""
    # GH31104
    flags = [True, False]
    from_bool = pd.array(np.array(flags), dtype="Int64")
    from_object = pd.array(np.array(flags, dtype=object), dtype="Int64")
    tm.assert_extension_array_equal(from_object, from_bool)

View File

@ -0,0 +1,301 @@
import numpy as np
import pytest
from pandas.core.dtypes.generic import ABCIndex
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.integer import (
Int8Dtype,
UInt32Dtype,
)
def test_dtypes(dtype):
    """Smoke-test that the dtype's NumPy kind matches its signedness and it has a name."""
    # smoke tests on auto dtype construction
    numpy_kind = np.dtype(dtype.type).kind
    if dtype.is_signed_integer:
        assert numpy_kind == "i"
    else:
        assert numpy_kind == "u"
    assert dtype.name is not None
@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"])
def test_preserve_dtypes(op):
    """Integer-preserving reductions return ``np.int64`` / Int64, not float.

    Parameters
    ----------
    op : str
        Name of the reduction to call on the Series and on the groupby.
    """
    # for ops that enable (mean would actually work here
    # but generally it is a float return value)
    df = pd.DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": pd.array([1, None, 3], dtype="Int64"),
        }
    )

    # op
    # Every parametrized op preserves the integer dtype, so no float/int
    # fallback branch is needed.
    result = getattr(df.C, op)()
    assert isinstance(result, np.int64)

    # groupby
    result = getattr(df.groupby("A"), op)()

    expected = pd.DataFrame(
        {"B": np.array([1.0, 3.0]), "C": pd.array([1, 3], dtype="Int64")},
        index=pd.Index(["a", "b"], name="A"),
    )
    tm.assert_frame_equal(result, expected)
def test_astype_nansafe():
    """Casting an array containing NA to a NumPy integer dtype raises."""
    # see gh-22343
    with_missing = pd.array([np.nan, 1, 2], dtype="Int8")
    with pytest.raises(ValueError, match="cannot convert NA to integer"):
        with_missing.astype("uint32")
@pytest.mark.parametrize("dropna", [True, False])
def test_construct_index(all_data, dropna):
    """Building an Index from masked data keeps the masked dtype."""
    # ensure that we do not coerce to different Index dtype or non-index
    sample = all_data[:10]
    other = np.array(sample[~sample.isna()]) if dropna else sample

    result = pd.Index(pd.array(other, dtype=sample.dtype))
    expected = pd.Index(other, dtype=sample.dtype)
    assert sample.dtype == expected.dtype  # dont coerce to object

    tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("dropna", [True, False])
def test_astype_index(all_data, dropna):
    """``Index.astype`` to a masked dtype agrees with casting via object first."""
    # as an int/uint index to Index
    sample = all_data[:10]
    other = sample[~sample.isna()] if dropna else sample
    target = sample.dtype

    idx = pd.Index(np.array(other))
    assert isinstance(idx, ABCIndex)

    direct = idx.astype(target)
    via_object = idx.astype(object).astype(target)
    tm.assert_index_equal(direct, via_object)
def test_astype(all_data):
    """``Series.astype`` for masked data, with and without missing values."""
    sample = all_data[:10]
    without_na = sample[~sample.isna()]
    with_na = sample
    own_dtype = sample.dtype
    other_dtype = Int8Dtype()

    # coerce to same type - ints
    result = pd.Series(without_na).astype(own_dtype)
    tm.assert_series_equal(result, pd.Series(without_na))

    # coerce to same other - ints
    result = pd.Series(without_na).astype(other_dtype)
    tm.assert_series_equal(result, pd.Series(without_na, dtype=other_dtype))

    # coerce to same numpy_dtype - ints
    result = pd.Series(without_na).astype(own_dtype.numpy_dtype)
    expected = pd.Series(without_na._data.astype(own_dtype.numpy_dtype))
    tm.assert_series_equal(result, expected)

    # coerce to same type - mixed
    result = pd.Series(with_na).astype(own_dtype)
    tm.assert_series_equal(result, pd.Series(with_na))

    # coerce to same other - mixed
    result = pd.Series(with_na).astype(other_dtype)
    tm.assert_series_equal(result, pd.Series(with_na, dtype=other_dtype))

    # coerce to same numpy_dtype - mixed
    ser = pd.Series(with_na)
    with pytest.raises(ValueError, match="cannot convert NA to integer"):
        ser.astype(own_dtype.numpy_dtype)

    # coerce to object
    result = pd.Series(with_na).astype("object")
    expected = pd.Series(np.asarray(with_na, dtype=object))
    tm.assert_series_equal(result, expected)
def test_astype_copy():
    """``copy`` controls aliasing for same-dtype casts; other dtypes always copy."""
    source = pd.array([1, 2, 3, None], dtype="Int64")
    pristine = pd.array([1, 2, 3, None], dtype="Int64")

    # copy=True -> ensure both data and mask are actual copies
    copied = source.astype("Int64", copy=True)
    assert copied is not source
    assert not tm.shares_memory(copied, source)
    for new_value in (10, pd.NA):
        copied[0] = new_value
        tm.assert_extension_array_equal(source, pristine)

    # copy=False
    alias = source.astype("Int64", copy=False)
    assert alias is source
    assert np.shares_memory(alias._data, source._data)
    assert np.shares_memory(alias._mask, source._mask)
    alias[0] = 10
    assert source[0] == 10
    alias[0] = pd.NA
    assert source[0] is pd.NA

    # astype to different dtype -> always needs a copy -> even with copy=False
    # we need to ensure that also the mask is actually copied
    source = pd.array([1, 2, 3, None], dtype="Int64")
    pristine = pd.array([1, 2, 3, None], dtype="Int64")

    converted = source.astype("Int32", copy=False)
    assert not tm.shares_memory(converted, source)
    for new_value in (10, pd.NA):
        converted[0] = new_value
        tm.assert_extension_array_equal(source, pristine)
def test_astype_to_larger_numpy():
    """Casting a 32-bit masked array to the 64-bit NumPy dtype gives a plain ndarray."""
    for masked_name, numpy_name in (("Int32", "int64"), ("UInt32", "uint64")):
        masked = pd.array([1, 2], dtype=masked_name)
        widened = masked.astype(numpy_name)
        tm.assert_numpy_array_equal(widened, np.array([1, 2], dtype=numpy_name))
@pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"])
def test_astype_specific_casting(dtype):
    """Casting Int64 to another masked integer dtype works with and without NA."""
    for values in ([1, 2, 3], [1, 2, 3, None]):
        original = pd.Series(values, dtype="Int64")
        cast = original.astype(dtype)
        tm.assert_series_equal(cast, pd.Series(values, dtype=dtype))
def test_astype_floating():
    """Int64 -> Float64 keeps the values and the missing entry."""
    ints = pd.array([1, 2, None], dtype="Int64")
    floats = ints.astype("Float64")
    tm.assert_extension_array_equal(
        floats, pd.array([1.0, 2.0, None], dtype="Float64")
    )
def test_astype_dt64():
    """Casting to ``datetime64[ns]`` maps NA to NaT."""
    # GH#32435
    nanoseconds = pd.array([1, 2, 3, pd.NA]) * 10**9

    as_datetime = nanoseconds.astype("datetime64[ns]")

    seconds = np.array([1, 2, 3, "NaT"], dtype="M8[s]")
    tm.assert_numpy_array_equal(as_datetime, seconds.astype("M8[ns]"))
def test_construct_cast_invalid(dtype):
    """Fractional floats cannot be cast to a masked integer dtype."""
    msg = "cannot safely"
    fractional_inputs = ([1.2, 2.3, 3.7], [1.2, 2.3, 3.7, np.nan])
    for arr in fractional_inputs:
        with pytest.raises(TypeError, match=msg):
            pd.array(arr, dtype=dtype)

        with pytest.raises(TypeError, match=msg):
            pd.Series(arr).astype(dtype)
@pytest.mark.parametrize("in_series", [True, False])
def test_to_numpy_na_nan(in_series):
    """``to_numpy`` replaces NA with the given ``na_value`` for each target dtype."""
    a = pd.array([0, 1, None], dtype="Int64")
    if in_series:
        a = pd.Series(a)

    # (target dtype, fill value, expected contents)
    cases = [
        ("float64", np.nan, [0.0, 1.0, np.nan]),
        ("int64", -1, [0, 1, -1]),
        ("bool", False, [False, True, False]),
    ]
    for numpy_name, fill, contents in cases:
        result = a.to_numpy(dtype=numpy_name, na_value=fill)
        expected = np.array(contents, dtype=numpy_name)
        tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("in_series", [True, False])
@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
def test_to_numpy_dtype(dtype, in_series):
    """Without missing values ``to_numpy`` converts straight to the requested dtype."""
    masked = pd.array([0, 1], dtype="Int64")
    container = pd.Series(masked) if in_series else masked

    converted = container.to_numpy(dtype=dtype)
    tm.assert_numpy_array_equal(converted, np.array([0, 1], dtype=dtype))
@pytest.mark.parametrize("dtype", ["int64", "bool"])
def test_to_numpy_na_raises(dtype):
    """``to_numpy`` with NA present and no ``na_value`` raises, naming the dtype."""
    with_missing = pd.array([0, 1, None], dtype="Int64")
    with pytest.raises(ValueError, match=dtype):
        with_missing.to_numpy(dtype=dtype)
def test_astype_str(using_infer_string):
    """``astype(str)`` and ``astype("str")`` agree; the result type depends on infer_string."""
    a = pd.array([1, 2, None], dtype="Int64")

    if using_infer_string:
        expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan))
        check = tm.assert_extension_array_equal
    else:
        expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")
        check = tm.assert_numpy_array_equal

    for target in (str, "str"):
        check(a.astype(target), expected)
def test_astype_boolean():
    """Non-zero integers become True, zero becomes False, NA stays NA."""
    # https://github.com/pandas-dev/pandas/issues/31102
    ints = pd.array([1, 0, -1, 2, None], dtype="Int64")
    flags = ints.astype("boolean")
    tm.assert_extension_array_equal(
        flags, pd.array([True, False, True, True, None], dtype="boolean")
    )

View File

@ -0,0 +1,203 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
@pytest.mark.parametrize("ufunc", [np.abs, np.sign])
# np.sign emits a warning with nans, <https://github.com/numpy/numpy/issues/15127>
@pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning")
def test_ufuncs_single_int(ufunc):
    """Integer-preserving unary ufuncs return Int64 for arrays and Series."""
    ints = pd.array([1, 2, -3, np.nan])

    array_result = ufunc(ints)
    array_expected = pd.array(ufunc(ints.astype(float)), dtype="Int64")
    tm.assert_extension_array_equal(array_result, array_expected)

    ser = pd.Series(ints)
    series_result = ufunc(ser)
    series_expected = pd.Series(pd.array(ufunc(ints.astype(float)), dtype="Int64"))
    tm.assert_series_equal(series_result, series_expected)
@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
    """Float-producing unary ufuncs return a FloatingArray with the input's mask."""
    ints = pd.array([1, 2, -3, np.nan])

    # the negative entry is invalid for log/sqrt; silence the NumPy warning
    with np.errstate(invalid="ignore"):
        array_result = ufunc(ints)
        array_expected = FloatingArray(ufunc(ints.astype(float)), mask=ints._mask)
    tm.assert_extension_array_equal(array_result, array_expected)

    ser = pd.Series(ints)
    with np.errstate(invalid="ignore"):
        series_result = ufunc(ser)
    series_expected = pd.Series(array_expected)
    tm.assert_series_equal(series_result, series_expected)
@pytest.mark.parametrize("ufunc", [np.add, np.subtract])
def test_ufuncs_binary_int(ufunc):
    """Binary ufuncs on IntegerArrays return Int64 for array, ndarray and scalar operands."""
    a = pd.array([1, 2, -3, np.nan])
    arr = np.array([1, 2, 3, 4])

    # two IntegerArrays
    result = ufunc(a, a)
    expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Int64")
    tm.assert_extension_array_equal(result, expected)

    # IntegerArray with numpy array, then with scalar, on either side
    for other in (arr, 1):
        result = ufunc(a, other)
        expected = pd.array(ufunc(a.astype(float), other), dtype="Int64")
        tm.assert_extension_array_equal(result, expected)

        result = ufunc(other, a)
        expected = pd.array(ufunc(other, a.astype(float)), dtype="Int64")
        tm.assert_extension_array_equal(result, expected)
def test_ufunc_binary_output():
    """A two-output ufunc (``np.modf``) returns a tuple of two masked arrays."""
    a = pd.array([1, 2, np.nan])

    result = np.modf(a)
    fractional, integral = np.modf(a.to_numpy(na_value=np.nan, dtype="float"))
    expected = (pd.array(fractional), pd.array(integral))

    assert isinstance(result, tuple)
    assert len(result) == 2

    for got, want in zip(result, expected):
        tm.assert_extension_array_equal(got, want)
@pytest.mark.parametrize("values", [[0, 1], [0, None]])
def test_ufunc_reduce_raises(values):
    """``np.add.reduce`` agrees with ``sum(skipna=False)`` on the masked array."""
    masked = pd.array(values)

    reduced = np.add.reduce(masked)
    summed = masked.sum(skipna=False)
    tm.assert_almost_equal(reduced, summed)
@pytest.mark.parametrize(
    "pandasmethname, kwargs",
    [
        ("var", {"ddof": 0}),
        ("var", {"ddof": 1}),
        ("std", {"ddof": 0}),
        ("std", {"ddof": 1}),
        ("kurtosis", {}),
        ("skew", {}),
        ("sem", {}),
    ],
)
def test_stat_method(pandasmethname, kwargs):
    """Statistical reductions skip missing values: same result with or without NAs."""
    observed = [1, 2, 3, 4, 5, 6]

    with_missing = pd.Series(data=observed + [np.nan, np.nan], dtype="Int64")
    result = getattr(with_missing, pandasmethname)(**kwargs)

    complete = pd.Series(data=observed, dtype="Int64")
    expected = getattr(complete, pandasmethname)(**kwargs)

    assert expected == result
def test_value_counts_na():
    """``value_counts`` counts NA only when ``dropna=False`` and keeps an Int64 index."""
    arr = pd.array([1, 2, 1, pd.NA], dtype="Int64")

    # NA gets its own bucket
    counts_with_na = arr.value_counts(dropna=False)
    index_with_na = pd.Index([1, 2, pd.NA], dtype="Int64")
    assert index_with_na.dtype == "Int64"
    expected = pd.Series([2, 1, 1], index=index_with_na, dtype="Int64", name="count")
    tm.assert_series_equal(counts_with_na, expected)

    # NA is dropped
    counts_without_na = arr.value_counts(dropna=True)
    expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count")
    assert expected.index.dtype == arr.dtype
    tm.assert_series_equal(counts_without_na, expected)
def test_value_counts_empty():
    """``value_counts`` on an empty Int64 Series returns an empty Int64 result."""
    # https://github.com/pandas-dev/pandas/issues/33317
    empty = pd.Series([], dtype="Int64")
    counts = empty.value_counts()

    empty_index = pd.Index([], dtype=empty.dtype)
    assert empty_index.dtype == empty.dtype
    expected = pd.Series([], index=empty_index, dtype="Int64", name="count")
    tm.assert_series_equal(counts, expected)
def test_value_counts_with_normalize():
    """Normalized ``value_counts`` returns Float64 proportions over non-missing values."""
    # GH 33172
    ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
    proportions = ser.value_counts(normalize=True)

    raw_counts = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion")
    expected = raw_counts / 3
    assert expected.index.dtype == ser.dtype
    tm.assert_series_equal(proportions, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_integer_array_sum(skipna, min_count, any_int_ea_dtype):
    """``sum`` is 6 only when NA is skipped and ``min_count`` is met; otherwise NA."""
    arr = pd.array([1, 2, 3, None], dtype=any_int_ea_dtype)
    total = arr.sum(skipna=skipna, min_count=min_count)

    has_value = skipna and min_count == 0
    if has_value:
        assert total == 6
    else:
        assert total is pd.NA
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("method", ["min", "max"])
def test_integer_array_min_max(skipna, method, any_int_ea_dtype):
    """``min``/``max`` ignore NA when ``skipna`` is set and return NA otherwise."""
    arr = pd.array([0, 1, None], dtype=any_int_ea_dtype)
    extreme = getattr(arr, method)(skipna=skipna)

    if not skipna:
        assert extreme is pd.NA
        return

    expected = 0 if method == "min" else 1
    assert extreme == expected
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 9])
def test_integer_array_prod(skipna, min_count, any_int_ea_dtype):
    """``prod`` is 2 only when NA is skipped and ``min_count`` is met; otherwise NA."""
    arr = pd.array([1, 2, None], dtype=any_int_ea_dtype)
    product = arr.prod(skipna=skipna, min_count=min_count)

    has_value = skipna and min_count == 0
    if has_value:
        assert product == 2
    else:
        assert product is pd.NA
@pytest.mark.parametrize(
    "values, expected", [([1, 2, 3], 6), ([1, 2, 3, None], 6), ([None], 0)]
)
def test_integer_array_numpy_sum(values, expected):
    """``np.sum`` on an Int64 array skips missing values."""
    masked = pd.array(values, dtype="Int64")
    total = np.sum(masked)
    assert total == expected
@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
def test_dataframe_reductions(op):
    """DataFrame reductions on an Int64 column keep an integer result.

    Parameters
    ----------
    op : str
        Name of the DataFrame reduction method under test.
    """
    # https://github.com/pandas-dev/pandas/pull/32867
    # ensure the integers are not cast to float during reductions
    df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
    # Dispatch on the parametrized name; calling ``df.max()`` unconditionally
    # would leave sum/prod/min untested.
    result = getattr(df, op)()
    assert isinstance(result["a"], np.int64)
# TODO(jreback) - these need testing / are broken
# shift
# set_index (destroys type)

View File

@ -0,0 +1,19 @@
import pandas as pd
import pandas._testing as tm
def test_array_setitem_nullable_boolean_mask():
    """``Series.where`` with a nullable boolean mask inserts NA where the mask is False."""
    # GH 31446
    ser = pd.Series([1, 2], dtype="Int64")
    keep = ser > 1
    masked = ser.where(keep)
    tm.assert_series_equal(masked, pd.Series([pd.NA, 2], dtype="Int64"))
def test_array_setitem():
    """Setting through a nullable boolean mask updates the selected elements in place."""
    # GH 31446
    values = pd.Series([1, 2], dtype="Int64").array
    above_one = values > 1
    values[above_one] = 1

    tm.assert_extension_array_equal(values, pd.array([1, 1], dtype="Int64"))

View File

@ -0,0 +1,123 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
array,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "op, expected",
    [
        ["sum", np.int64(3)],
        ["prod", np.int64(2)],
        ["min", np.int64(1)],
        ["max", np.int64(2)],
        ["mean", np.float64(1.5)],
        ["median", np.float64(1.5)],
        ["var", np.float64(0.5)],
        ["std", np.float64(0.5**0.5)],
        ["skew", pd.NA],
        ["kurt", pd.NA],
        ["any", True],
        ["all", True],
    ],
)
def test_series_reductions(op, expected):
    """Each Series reduction on Int64 data returns the expected scalar."""
    ser = Series([1, 2], dtype="Int64")
    reduce = getattr(ser, op)
    tm.assert_equal(reduce(), expected)
@pytest.mark.parametrize(
"op, expected",
[
["sum", Series([3], index=["a"], dtype="Int64")],
["prod", Series([2], index=["a"], dtype="Int64")],
["min", Series([1], index=["a"], dtype="Int64")],
["max", Series([2], index=["a"], dtype="Int64")],
["mean", Series([1.5], index=["a"], dtype="Float64")],
["median", Series([1.5], index=["a"], dtype="Float64")],
["var", Series([0.5], index=["a"], dtype="Float64")],
["std", Series([0.5**0.5], index=["a"], dtype="Float64")],
["skew", Series([pd.NA], index=["a"], dtype="Float64")],
["kurt", Series([pd.NA], index=["a"], dtype="Float64")],
["any", Series([True], index=["a"], dtype="boolean")],
["all", Series([True], index=["a"], dtype="boolean")],
],
)
def test_dataframe_reductions(op, expected):
df = DataFrame({"a": array([1, 2], dtype="Int64")})
result = getattr(df, op)()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"op, expected",
[
["sum", array([1, 3], dtype="Int64")],
["prod", array([1, 3], dtype="Int64")],
["min", array([1, 3], dtype="Int64")],
["max", array([1, 3], dtype="Int64")],
["mean", array([1, 3], dtype="Float64")],
["median", array([1, 3], dtype="Float64")],
["var", array([pd.NA], dtype="Float64")],
["std", array([pd.NA], dtype="Float64")],
["skew", array([pd.NA], dtype="Float64")],
["any", array([True, True], dtype="boolean")],
["all", array([True, True], dtype="boolean")],
],
)
def test_groupby_reductions(op, expected):
df = DataFrame(
{
"A": ["a", "b", "b"],
"B": array([1, None, 3], dtype="Int64"),
}
)
result = getattr(df.groupby("A"), op)()
expected = DataFrame(expected, index=pd.Index(["a", "b"], name="A"), columns=["B"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"op, expected",
[
["sum", Series([4, 4], index=["B", "C"], dtype="Float64")],
["prod", Series([3, 3], index=["B", "C"], dtype="Float64")],
["min", Series([1, 1], index=["B", "C"], dtype="Float64")],
["max", Series([3, 3], index=["B", "C"], dtype="Float64")],
["mean", Series([2, 2], index=["B", "C"], dtype="Float64")],
["median", Series([2, 2], index=["B", "C"], dtype="Float64")],
["var", Series([2, 2], index=["B", "C"], dtype="Float64")],
["std", Series([2**0.5, 2**0.5], index=["B", "C"], dtype="Float64")],
["skew", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")],
["kurt", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")],
["any", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
],
)
def test_mixed_reductions(op, expected):
df = DataFrame(
{
"A": ["a", "b", "b"],
"B": [1, None, 3],
"C": array([1, None, 3], dtype="Int64"),
}
)
# series
result = getattr(df.C, op)()
tm.assert_equal(result, expected["C"])
# frame
if op in ["any", "all"]:
result = getattr(df, op)()
else:
result = getattr(df, op)(numeric_only=True)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,67 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
def test_dtypes(dtype):
    """Smoke-test that the dtype's NumPy kind matches its signedness and it has a name."""
    # smoke tests on auto dtype construction
    numpy_kind = np.dtype(dtype.type).kind
    if dtype.is_signed_integer:
        assert numpy_kind == "i"
    else:
        assert numpy_kind == "u"
    assert dtype.name is not None
@pytest.mark.parametrize(
    "dtype, expected",
    [
        (Int8Dtype(), "Int8Dtype()"),
        (Int16Dtype(), "Int16Dtype()"),
        (Int32Dtype(), "Int32Dtype()"),
        (Int64Dtype(), "Int64Dtype()"),
        (UInt8Dtype(), "UInt8Dtype()"),
        (UInt16Dtype(), "UInt16Dtype()"),
        (UInt32Dtype(), "UInt32Dtype()"),
        (UInt64Dtype(), "UInt64Dtype()"),
    ],
)
def test_repr_dtype(dtype, expected):
    """Each masked integer dtype reprs as a call to its own class."""
    rendered = repr(dtype)
    assert rendered == expected
def test_repr_array():
    """A short IntegerArray reprs with its class, values, length and dtype."""
    rendered = repr(pd.array([1, None, 3]))
    assert rendered == "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64"
def test_repr_array_long():
    """A long IntegerArray repr is truncated with ``...`` and its values are aligned."""
    data = pd.array([1, 2, None] * 1000)
    # Elements are right-justified to the width of "<NA>", so the padding
    # inside the expected string is significant.
    expected = (
        "<IntegerArray>\n"
        "[   1,    2, <NA>,    1,    2, <NA>,    1,    2, <NA>,    1,\n"
        " ...\n"
        " <NA>,    1,    2, <NA>,    1,    2, <NA>,    1,    2, <NA>]\n"
        "Length: 3000, dtype: Int64"
    )
    result = repr(data)
    assert result == expected
def test_frame_repr(data_missing):
    """A DataFrame holding a masked column shows ``<NA>`` and right-aligns values."""
    df = pd.DataFrame({"A": data_missing})
    result = repr(df)
    # The column is padded to the width of "<NA>"; the spacing in the
    # expected string is significant.
    expected = "      A\n0  <NA>\n1     1"
    assert result == expected

View File

@ -0,0 +1,28 @@
import pytest
from pandas import (
Categorical,
CategoricalDtype,
Index,
IntervalIndex,
)
import pandas._testing as tm
class TestAstype:
    """``astype`` tests for IntervalArray."""

    @pytest.mark.parametrize("ordered", [True, False])
    def test_astype_categorical_retains_ordered(self, ordered):
        """Casting to a categorical dtype keeps the requested ``ordered`` flag."""
        interval_index = IntervalIndex.from_breaks(range(5))
        interval_array = interval_index._data
        target = CategoricalDtype(None, ordered=ordered)

        expected_cat = Categorical(list(interval_array), ordered=ordered)
        array_result = interval_array.astype(target)
        assert array_result.ordered is ordered
        tm.assert_categorical_equal(array_result, expected_cat)

        # test IntervalIndex.astype while we're at it.
        index_result = interval_index.astype(target)
        tm.assert_index_equal(index_result, Index(expected_cat))

View File

@ -0,0 +1,13 @@
from pandas.core.arrays import IntervalArray
def test_repr():
    """Check the repr of a small integer-backed IntervalArray."""
    # GH#25022
    intervals = IntervalArray.from_tuples([(0, 1), (1, 2)])
    expected_lines = [
        "<IntervalArray>\n",
        "[(0, 1], (1, 2]]\n",
        "Length: 2, dtype: interval[int64, right]",
    ]
    assert repr(intervals) == "".join(expected_lines)

View File

@ -0,0 +1,231 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Index,
Interval,
IntervalIndex,
Timedelta,
Timestamp,
date_range,
timedelta_range,
)
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
@pytest.fixture(
    params=[
        (Index([0, 2, 4]), Index([1, 3, 5])),
        (Index([0.0, 1.0, 2.0]), Index([1.0, 2.0, 3.0])),
        (timedelta_range("0 days", periods=3), timedelta_range("1 day", periods=3)),
        (date_range("20170101", periods=3), date_range("20170102", periods=3)),
        (
            date_range("20170101", periods=3, tz="US/Eastern"),
            date_range("20170102", periods=3, tz="US/Eastern"),
        ),
    ],
    # label each case by the dtype of its left-hand index
    ids=lambda pair: str(pair[0].dtype),
)
def left_right_dtypes(request):
    """
    Fixture yielding a ``(left, right)`` pair of indexes of matching dtype,
    used to build an IntervalArray from various dtypes
    """
    return request.param
class TestAttributes:
    """Tests for attribute-style accessors of interval containers."""

    @pytest.mark.parametrize(
        "left, right",
        [
            (0, 1),
            (Timedelta("0 days"), Timedelta("1 day")),
            (Timestamp("2018-01-01"), Timestamp("2018-01-02")),
            (
                Timestamp("2018-01-01", tz="US/Eastern"),
                Timestamp("2018-01-02", tz="US/Eastern"),
            ),
        ],
    )
    @pytest.mark.parametrize("constructor", [IntervalArray, IntervalIndex])
    def test_is_empty(self, constructor, left, right, closed):
        """``is_empty`` is True only for a zero-length interval not closed on both sides."""
        # GH27219
        # zero-length interval, regular interval, missing value
        intervals = [(left, left), (left, right), np.nan]
        zero_length_is_empty = closed != "both"
        expected = np.array([zero_length_is_empty, False, False])
        container = constructor.from_tuples(intervals, closed=closed)
        tm.assert_numpy_array_equal(container.is_empty, expected)
class TestMethods:
    """Tests for IntervalArray methods (set_closed, where, shift)."""

    @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"])
    def test_set_closed(self, closed, new_closed):
        """``set_closed`` yields the same intervals closed on the requested side."""
        # GH 21670
        original = IntervalArray.from_breaks(range(10), closed=closed)
        expected = IntervalArray.from_breaks(range(10), closed=new_closed)
        tm.assert_extension_array_equal(original.set_closed(new_closed), expected)

    @pytest.mark.parametrize(
        "other",
        [
            Interval(0, 1, closed="right"),
            IntervalArray.from_breaks([1, 2, 3, 4], closed="right"),
        ],
    )
    def test_where_raises(self, other):
        """A mismatched ``closed`` raises at the array level but coerces for Series."""
        # GH#45768 The IntervalArray methods raises; the Series method coerces
        left_closed = IntervalArray.from_breaks([1, 2, 3, 4], closed="left")
        ser = pd.Series(left_closed)
        mask = np.array([True, False, True])

        match = "'value.closed' is 'right', expected 'left'."
        with pytest.raises(ValueError, match=match):
            ser.array._where(mask, other)

        coerced = ser.where(mask, other=other)
        expected = ser.astype(object).where(mask, other)
        tm.assert_series_equal(coerced, expected)

    def test_shift(self):
        """Shifting introduces an NA interval and rejects a non-interval fill value."""
        # https://github.com/pandas-dev/pandas/issues/31495, GH#22428, GH#31502
        intervals = IntervalArray.from_breaks([1, 2, 3])
        shifted = intervals.shift()
        # int -> float
        expected = IntervalArray.from_tuples([(np.nan, np.nan), (1.0, 2.0)])
        tm.assert_interval_array_equal(shifted, expected)

        msg = "can only insert Interval objects and NA into an IntervalArray"
        with pytest.raises(TypeError, match=msg):
            intervals.shift(1, fill_value=pd.NaT)

    def test_shift_datetime(self):
        """Shifting datetime-backed intervals matches ``take`` with fill positions."""
        # GH#31502, GH#31504
        intervals = IntervalArray.from_breaks(date_range("2000", periods=4))

        shifted_forward = intervals.shift(2)
        expected_forward = intervals.take([-1, -1, 0], allow_fill=True)
        tm.assert_interval_array_equal(shifted_forward, expected_forward)

        shifted_back = intervals.shift(-1)
        expected_back = intervals.take([1, 2, -1], allow_fill=True)
        tm.assert_interval_array_equal(shifted_back, expected_back)

        msg = "can only insert Interval objects and NA into an IntervalArray"
        with pytest.raises(TypeError, match=msg):
            intervals.shift(1, fill_value=np.timedelta64("NaT", "ns"))
class TestSetitem:
    """Tests for item assignment on IntervalArray."""

    def test_set_na(self, left_right_dtypes):
        """Assigning NA values is validated against the interval subtype."""
        left, right = left_right_dtypes
        left = left.copy(deep=True)
        right = right.copy(deep=True)
        result = IntervalArray.from_arrays(left, right)
        subtype_kind = result.dtype.subtype.kind

        if subtype_kind not in ["m", "M"]:
            msg = "'value' should be an interval type, got <.*NaTType'> instead."
            with pytest.raises(TypeError, match=msg):
                result[0] = pd.NaT

        if subtype_kind in ["i", "u"]:
            msg = "Cannot set float NaN to integer-backed IntervalArray"
            # GH#45484 TypeError, not ValueError, matches what we get with
            # non-NA un-holdable value.
            with pytest.raises(TypeError, match=msg):
                result[0] = np.nan
            return

        result[0] = np.nan

        expected_left = Index([left._na_value] + list(left[1:]))
        expected_right = Index([right._na_value] + list(right[1:]))
        expected = IntervalArray.from_arrays(expected_left, expected_right)

        tm.assert_extension_array_equal(result, expected)

    def test_setitem_mismatched_closed(self):
        """Setting values whose ``closed`` differs raises and leaves the array intact."""
        arr = IntervalArray.from_breaks(range(4))
        snapshot = arr.copy()
        both_closed = arr.set_closed("both")

        msg = "'value.closed' is 'both', expected 'right'"
        with pytest.raises(ValueError, match=msg):
            arr[0] = both_closed[0]
        with pytest.raises(ValueError, match=msg):
            arr[:1] = both_closed[:1]
        with pytest.raises(ValueError, match=msg):
            arr[:0] = both_closed[:0]
        with pytest.raises(ValueError, match=msg):
            arr[:] = both_closed[::-1]
        with pytest.raises(ValueError, match=msg):
            arr[:] = list(both_closed[::-1])
        with pytest.raises(ValueError, match=msg):
            arr[:] = both_closed[::-1].astype(object)
        with pytest.raises(ValueError, match=msg):
            arr[:] = both_closed[::-1].astype("category")

        # empty list should be no-op
        arr[:0] = []
        tm.assert_interval_array_equal(arr, snapshot)
class TestReductions:
    """Tests for ``min`` / ``max`` on interval data."""

    def test_min_max_invalid_axis(self, left_right_dtypes):
        """Out-of-range axes raise ValueError; a non-numeric axis raises TypeError."""
        left, right = left_right_dtypes
        left = left.copy(deep=True)
        right = right.copy(deep=True)
        arr = IntervalArray.from_arrays(left, right)

        msg = "`axis` must be fewer than the number of dimensions"
        for bad_axis in [-2, 1]:
            for reducer in (arr.min, arr.max):
                with pytest.raises(ValueError, match=msg):
                    reducer(axis=bad_axis)

        msg = "'>=' not supported between"
        for reducer in (arr.min, arr.max):
            with pytest.raises(TypeError, match=msg):
                reducer(axis="foo")

    def test_min_max(self, left_right_dtypes, index_or_series_or_array):
        """``min`` / ``max`` return the extreme Interval, or NaN when NA is not skipped."""
        # GH#44746
        left, right = left_right_dtypes
        left = left.copy(deep=True)
        right = right.copy(deep=True)
        arr = IntervalArray.from_arrays(left, right)

        # The expected results below are only valid if monotonic
        assert left.is_monotonic_increasing
        assert Index(arr).is_monotonic_increasing

        expected_min = arr[0]
        expected_max = arr[-1]

        # shuffle deterministically so the extremes are not at the ends
        positions = np.arange(len(arr))
        np.random.default_rng(2).shuffle(positions)
        arr = arr.take(positions)

        arr_na = arr.insert(2, np.nan)

        arr = index_or_series_or_array(arr)
        arr_na = index_or_series_or_array(arr_na)

        for skipna in [True, False]:
            res = arr.min(skipna=skipna)
            assert res == expected_min
            assert type(res) == type(expected_min)
            res = arr.max(skipna=skipna)
            assert res == expected_max
            assert type(res) == type(expected_max)

        res = arr_na.min(skipna=False)
        assert np.isnan(res)
        res = arr_na.max(skipna=False)
        assert np.isnan(res)

        res = arr_na.min(skipna=True)
        assert res == expected_min
        assert type(res) == type(expected_min)
        res = arr_na.max(skipna=True)
        assert res == expected_max
        assert type(res) == type(expected_max)

View File

@ -0,0 +1,160 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
def test_arrow_extension_type():
    """ArrowIntervalType equality and hashing depend on the ``closed`` side."""
    pa = pytest.importorskip("pyarrow")

    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

    left_a = ArrowIntervalType(pa.int64(), "left")
    left_b = ArrowIntervalType(pa.int64(), "left")
    right = ArrowIntervalType(pa.int64(), "right")

    assert left_a.closed == "left"
    assert left_a == left_b
    assert left_a != right
    assert hash(left_a) == hash(left_b)
    assert hash(left_a) != hash(right)
def test_arrow_array():
    """Converting an IntervalArray with ``pa.array`` gives an interval-typed struct."""
    pa = pytest.importorskip("pyarrow")

    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

    intervals = pd.interval_range(1, 5, freq=1).array

    converted = pa.array(intervals)
    assert isinstance(converted.type, ArrowIntervalType)
    assert converted.type.closed == intervals.closed
    assert converted.type.subtype == pa.int64()

    left_field = converted.storage.field("left")
    right_field = converted.storage.field("right")
    assert left_field.equals(pa.array([1, 2, 3, 4], type="int64"))
    assert right_field.equals(pa.array([2, 3, 4, 5], type="int64"))

    expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)])
    assert converted.storage.equals(expected)

    # convert to its storage type
    as_storage = pa.array(intervals, type=expected.type)
    assert as_storage.equals(expected)

    # unsupported conversions
    with pytest.raises(TypeError, match="Not supported to convert IntervalArray"):
        pa.array(intervals, type="float64")

    with pytest.raises(TypeError, match="Not supported to convert IntervalArray"):
        pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left"))
def test_arrow_array_missing():
    """A missing interval becomes null in both fields and at the struct level."""
    pa = pytest.importorskip("pyarrow")

    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

    intervals = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0])
    intervals[1] = None

    converted = pa.array(intervals)
    assert isinstance(converted.type, ArrowIntervalType)
    assert converted.type.closed == intervals.closed
    assert converted.type.subtype == pa.float64()

    # fields have missing values (not NaN)
    expected_left = pa.array([0.0, None, 2.0], type="float64")
    expected_right = pa.array([1.0, None, 3.0], type="float64")
    assert converted.storage.field("left").equals(expected_left)
    assert converted.storage.field("right").equals(expected_right)

    # structarray itself also has missing values on the array level
    rows = [
        {"left": 0.0, "right": 1.0},
        {"left": None, "right": None},
        {"left": 2.0, "right": 3.0},
    ]
    null_rows = np.array([False, True, False])
    expected = pa.StructArray.from_pandas(rows, mask=null_rows)
    assert converted.storage.equals(expected)
@pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.parametrize(
    "breaks",
    [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")],
    ids=["float", "datetime64[ns]"],
)
def test_arrow_table_roundtrip(breaks):
    """An interval column survives DataFrame -> pyarrow Table -> DataFrame."""
    pa = pytest.importorskip("pyarrow")

    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

    intervals = IntervalArray.from_breaks(breaks)
    intervals[1] = None
    df = pd.DataFrame({"a": intervals})

    table = pa.table(df)
    assert isinstance(table.field("a").type, ArrowIntervalType)
    roundtripped = table.to_pandas()
    assert isinstance(roundtripped["a"].dtype, pd.IntervalDtype)
    tm.assert_frame_equal(roundtripped, df)

    # concatenated tables produce a multi-chunk column
    doubled = pa.concat_tables([table, table])
    roundtripped = doubled.to_pandas()
    expected = pd.concat([df, df], ignore_index=True)
    tm.assert_frame_equal(roundtripped, expected)

    # GH#41040
    empty_column = pa.chunked_array([], type=table.column(0).type)
    table = pa.table([empty_column], schema=table.schema)
    roundtripped = table.to_pandas()
    tm.assert_frame_equal(roundtripped, expected[0:0])
@pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.parametrize(
    "breaks",
    [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")],
    ids=["float", "datetime64[ns]"],
)
def test_arrow_table_roundtrip_without_metadata(breaks):
    """The interval dtype is restored even after the schema metadata is removed."""
    pa = pytest.importorskip("pyarrow")

    intervals = IntervalArray.from_breaks(breaks)
    intervals[1] = None
    df = pd.DataFrame({"a": intervals})

    # remove the metadata
    table = pa.table(df).replace_schema_metadata()
    assert table.schema.metadata is None

    roundtripped = table.to_pandas()
    assert isinstance(roundtripped["a"].dtype, pd.IntervalDtype)
    tm.assert_frame_equal(roundtripped, df)
def test_from_arrow_from_raw_struct_array():
    """``IntervalDtype.__from_arrow__`` accepts a plain struct array."""
    # in case pyarrow lost the Interval extension type (eg on parquet roundtrip
    # with datetime64[ns] subtype, see GH-45881), still allow conversion
    # from arrow to IntervalArray
    pa = pytest.importorskip("pyarrow")

    struct_array = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}])
    dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither")
    breaks = np.array([0, 1, 2], dtype="int64")
    expected = IntervalArray.from_breaks(breaks, closed="neither")

    from_array = dtype.__from_arrow__(struct_array)
    tm.assert_extension_array_equal(from_array, expected)

    from_chunked = dtype.__from_arrow__(pa.chunked_array([struct_array]))
    tm.assert_extension_array_equal(from_chunked, expected)

View File

@ -0,0 +1,93 @@
"""Tests for Interval-Interval operations, such as overlaps, contains, etc."""
import numpy as np
import pytest
from pandas import (
Interval,
IntervalIndex,
Timedelta,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
@pytest.fixture(params=[IntervalArray, IntervalIndex])
def constructor(request):
    """
    Fixture parametrized over the two interval container classes,
    IntervalArray and IntervalIndex.
    """
    container_cls = request.param
    return container_cls
@pytest.fixture(
    params=[
        (Timedelta("0 days"), Timedelta("1 day")),
        (Timestamp("2018-01-01"), Timedelta("1 day")),
        (0, 1),
    ],
    # label each case by the type of its start value
    ids=lambda pair: type(pair[0]).__name__,
)
def start_shift(request):
    """
    Fixture yielding a ``(start, shift)`` pair for building intervals of
    different types: ``shift`` can be added to ``start`` to produce an endpoint.
    """
    return request.param
class TestOverlaps:
    """Tests for ``overlaps`` on IntervalArray and IntervalIndex."""

    def test_overlaps_interval(self, constructor, start_shift, closed, other_closed):
        """Overlap with a scalar Interval; adjacency counts only if both touching sides are closed."""
        start, shift = start_shift
        interval = Interval(start, start + 3 * shift, other_closed)

        # intervals: identical, nested, spanning, partial, adjacent, disjoint
        candidates = [
            (start, start + 3 * shift),
            (start + shift, start + 2 * shift),
            (start - shift, start + 4 * shift),
            (start + 2 * shift, start + 4 * shift),
            (start + 3 * shift, start + 4 * shift),
            (start + 4 * shift, start + 5 * shift),
        ]
        container = constructor.from_tuples(candidates, closed)

        touches = interval.closed_right and container.closed_left
        expected = np.array([True, True, True, True, touches, False])
        tm.assert_numpy_array_equal(container.overlaps(interval), expected)

    @pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex])
    def test_overlaps_interval_container(self, constructor, other_constructor):
        """Overlap against another interval container raises NotImplementedError."""
        # TODO: modify this test when implemented
        container = constructor.from_breaks(range(5))
        other = other_constructor.from_breaks(range(5))
        with pytest.raises(NotImplementedError, match="^$"):
            container.overlaps(other)

    def test_overlaps_na(self, constructor, start_shift):
        """NA values are marked as False"""
        start, shift = start_shift
        interval = Interval(start, start + shift)

        candidates = [
            (start, start + shift),
            np.nan,
            (start + 2 * shift, start + 3 * shift),
        ]
        container = constructor.from_tuples(candidates)

        expected = np.array([True, False, False])
        tm.assert_numpy_array_equal(container.overlaps(interval), expected)

    @pytest.mark.parametrize(
        "other",
        [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")],
        ids=lambda value: type(value).__name__,
    )
    def test_overlaps_invalid_type(self, constructor, other):
        """A non-Interval argument raises TypeError naming the offending type."""
        container = constructor.from_breaks(range(5))
        msg = f"`other` must be Interval-like, got {type(other).__name__}"
        with pytest.raises(TypeError, match=msg):
            container.overlaps(other)

View File

@ -0,0 +1,248 @@
from __future__ import annotations
from typing import Any
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
# Parallel lists: ``arrays[i]`` is a masked array and ``scalars[i]`` is a scalar
# to operate on it with. They are zipped together to parametrize the ``data``
# fixture below, so they must stay the same length.

# integer dtypes
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
scalars: list[Any] = [2] * len(arrays)

# floating dtypes
# One scalar per floating dtype rather than a hard-coded pair: zip() would
# silently drop the trailing arrays if FLOAT_EA_DTYPES ever changed length.
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
scalars += [0.2] * len(tm.FLOAT_EA_DTYPES)

# boolean
arrays += [pd.array([True, False, True, None], dtype="boolean")]
scalars += [False]
@pytest.fixture(params=zip(arrays, scalars), ids=[arr.dtype.name for arr in arrays])
def data(request):
    """Fixture yielding one parametrized ``(array, scalar)`` tuple per dtype.

    Used to test equivalence of scalars, numpy arrays with array ops, and the
    equivalence of DataFrame and Series ops.
    """
    array_and_scalar = request.param
    return array_and_scalar
def check_skip(data, op_name):
    """Skip the running test for subtraction-like ops on boolean data.

    The skip triggers when ``data`` has a ``pd.BooleanDtype`` and ``op_name``
    contains ``"sub"``; otherwise the function does nothing.
    """
    is_boolean = isinstance(data.dtype, pd.BooleanDtype)
    if is_boolean and "sub" in op_name:
        pytest.skip("subtract not implemented for boolean")
def is_bool_not_implemented(data, op_name):
    """Return True when ``op_name`` is pow/truediv/floordiv (or reflected) on boolean data.

    ``op_name`` is a dunder name such as ``"__rtruediv__"``; the surrounding
    underscores and any leading ``"r"`` are stripped before the lookup.
    """
    # match non-masked behavior
    if data.dtype.kind != "b":
        return False
    base_name = op_name.strip("_").lstrip("r")
    return base_name in [
        "pow",
        "truediv",
        "floordiv",
    ]
# Test equivalence of scalars, numpy arrays with array ops
# -----------------------------------------------------------------------------
def test_array_scalar_like_equivalence(data, all_arithmetic_operators):
    """An op with a scalar matches the same op with an array filled with that scalar."""
    data, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(data, all_arithmetic_operators)

    scalar_array = pd.array([scalar] * len(data), dtype=data.dtype)

    # try both the Python scalar and its dtype-specific scalar type
    # TODO also add len-1 array (np.array([scalar], dtype=data.dtype.numpy_dtype))
    for candidate in [scalar, data.dtype.type(scalar)]:
        if not is_bool_not_implemented(data, all_arithmetic_operators):
            result = op(data, candidate)
            expected = op(data, scalar_array)
            tm.assert_extension_array_equal(result, expected)
            continue

        msg = "operator '.*' not implemented for bool dtypes"
        with pytest.raises(NotImplementedError, match=msg):
            op(data, candidate)
        with pytest.raises(NotImplementedError, match=msg):
            op(data, scalar_array)
def test_array_NA(data, all_arithmetic_operators):
    """An op with ``pd.NA`` matches the op with an all-NA array and leaves the mask alone."""
    data, _ = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(data, all_arithmetic_operators)

    na_scalar = pd.NA
    na_array = pd.array([pd.NA] * len(data), dtype=data.dtype)

    mask_before = data._mask.copy()

    if is_bool_not_implemented(data, all_arithmetic_operators):
        msg = "operator '.*' not implemented for bool dtypes"
        with pytest.raises(NotImplementedError, match=msg):
            op(data, na_scalar)
        # GH#45421 check op doesn't alter data._mask inplace
        tm.assert_numpy_array_equal(mask_before, data._mask)
        return

    result = op(data, na_scalar)
    # GH#45421 check op doesn't alter data._mask inplace
    tm.assert_numpy_array_equal(mask_before, data._mask)

    expected = op(data, na_array)
    tm.assert_numpy_array_equal(mask_before, data._mask)

    tm.assert_extension_array_equal(result, expected)
def test_numpy_array_equivalence(data, all_arithmetic_operators):
    """An op with a numpy array matches the op with the equivalent masked array."""
    data, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(data, all_arithmetic_operators)

    as_numpy = np.array([scalar] * len(data), dtype=data.dtype.numpy_dtype)
    as_masked = pd.array(as_numpy, dtype=data.dtype)

    if is_bool_not_implemented(data, all_arithmetic_operators):
        msg = "operator '.*' not implemented for bool dtypes"
        for operand in (as_numpy, as_masked):
            with pytest.raises(NotImplementedError, match=msg):
                op(data, operand)
        return

    result = op(data, as_numpy)
    expected = op(data, as_masked)
    tm.assert_extension_array_equal(result, expected)
# Test equivalence with Series and DataFrame ops
# -----------------------------------------------------------------------------
def test_frame(data, all_arithmetic_operators):
    """A DataFrame op with a scalar matches the op applied to the underlying array."""
    data, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(data, all_arithmetic_operators)

    # DataFrame with scalar
    frame = pd.DataFrame({"A": data})

    if is_bool_not_implemented(data, all_arithmetic_operators):
        msg = "operator '.*' not implemented for bool dtypes"
        for operand in (frame, data):
            with pytest.raises(NotImplementedError, match=msg):
                op(operand, scalar)
        return

    result = op(frame, scalar)
    expected = pd.DataFrame({"A": op(data, scalar)})
    tm.assert_frame_equal(result, expected)
def test_series(data, all_arithmetic_operators):
    """A Series op matches the array op for scalar, numpy, masked and Series operands."""
    data, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(data, all_arithmetic_operators)

    ser = pd.Series(data)
    repeated = [scalar] * len(data)

    operands = [
        scalar,
        np.array(repeated, dtype=data.dtype.numpy_dtype),
        pd.array(repeated, dtype=data.dtype),
        pd.Series(repeated, dtype=data.dtype),
    ]

    for operand in operands:
        if not is_bool_not_implemented(data, all_arithmetic_operators):
            result = op(ser, operand)
            expected = pd.Series(op(data, operand))
            tm.assert_series_equal(result, expected)
            continue

        msg = "operator '.*' not implemented for bool dtypes"
        with pytest.raises(NotImplementedError, match=msg):
            op(ser, operand)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_object(data, all_arithmetic_operators):
    """The dunder op defers on a DataFrame and rejects a 2-d numpy array."""
    data, _ = data
    bound_op = getattr(data, all_arithmetic_operators)

    # 2d -> return NotImplemented
    frame_operand = pd.DataFrame({"A": data})
    assert bound_op(frame_operand) is NotImplemented

    msg = r"can only perform ops with 1-d structures"
    two_dim = np.arange(len(data)).reshape(-1, len(data))
    with pytest.raises(NotImplementedError, match=msg):
        bound_op(two_dim)
def test_error_len_mismatch(data, all_arithmetic_operators):
    """Operating with a list-like of non-matching length raises."""
    data, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)

    # one element shorter than ``data``
    short_list = [scalar] * (len(data) - 1)

    bare_name = all_arithmetic_operators.strip("_")
    if data.dtype.kind == "b" and bare_name in ["sub", "rsub"]:
        err = TypeError
        msg = (
            r"numpy boolean subtract, the `\-` operator, is not supported, use "
            r"the bitwise_xor, the `\^` operator, or the logical_xor function instead"
        )
    elif is_bool_not_implemented(data, all_arithmetic_operators):
        err = NotImplementedError
        msg = "operator '.*' not implemented for bool dtypes"
    else:
        err = ValueError
        msg = "|".join(
            [
                r"operands could not be broadcast together with shapes \(3,\) \(4,\)",
                r"operands could not be broadcast together with shapes \(4,\) \(3,\)",
            ]
        )

    for other in [short_list, np.array(short_list)]:
        with pytest.raises(err, match=msg):
            op(data, other)

        ser = pd.Series(data)
        with pytest.raises(err, match=msg):
            op(ser, other)
@pytest.mark.parametrize("op", ["__neg__", "__abs__", "__invert__"])
def test_unary_op_does_not_propagate_mask(data, op):
    """Mutating the input after a unary op must not change the op's result."""
    # https://github.com/pandas-dev/pandas/issues/39943
    data, _ = data
    ser = pd.Series(data)

    if op == "__invert__" and data.dtype.kind == "f":
        # we follow numpy in raising
        msg = "ufunc 'invert' not supported for the input types"
        # the last target checks that this is still the numpy behavior
        for target in (ser, data, data._data):
            with pytest.raises(TypeError, match=msg):
                getattr(target, op)()
        return

    result = getattr(ser, op)()
    snapshot = result.copy(deep=True)
    ser[0] = None
    tm.assert_series_equal(result, snapshot)

View File

@ -0,0 +1,210 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
# Applied to every test in this module: ignore the
# "Passing a BlockManager to DataFrame" DeprecationWarning.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

# pyarrow is required by everything below; skip the module if it is missing.
pa = pytest.importorskip("pyarrow")
from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask
# One masked array per nullable integer, floating and boolean dtype, each
# ending in a missing value.
arrays = [pd.array([1, 2, 3, None], dtype=int_dtype) for int_dtype in tm.ALL_INT_EA_DTYPES]
arrays += [
    pd.array([0.1, 0.2, 0.3, None], dtype=float_dtype)
    for float_dtype in tm.FLOAT_EA_DTYPES
]
arrays += [pd.array([True, False, True, None], dtype="boolean")]
@pytest.fixture(params=arrays, ids=[arr.dtype.name for arr in arrays])
def data(request):
    """
    Fixture yielding one masked array per dtype: integer, float and boolean
    """
    masked_array = request.param
    return masked_array
def test_arrow_array(data):
    """``pa.array`` on a masked array matches conversion from its object values."""
    converted = pa.array(data)

    object_values = data.to_numpy(object, na_value=None)
    arrow_type = pa.from_numpy_dtype(data.dtype.numpy_dtype)
    expected = pa.array(object_values, type=arrow_type)

    assert converted.equals(expected)
def test_arrow_roundtrip(data):
    """A masked column survives DataFrame -> pyarrow Table -> DataFrame."""
    df = pd.DataFrame({"a": data})

    table = pa.table(df)
    assert table.field("a").type == str(data.dtype.numpy_dtype)

    roundtripped = table.to_pandas()
    assert roundtripped["a"].dtype == data.dtype
    tm.assert_frame_equal(roundtripped, df)
def test_dataframe_from_arrow_types_mapper():
    """``to_pandas(types_mapper=...)`` applies the mapped nullable dtypes."""

    def types_mapper(arrow_type):
        # booleans -> BooleanDtype, any integer width -> Int64Dtype;
        # falls through (returns None) for every other type
        if pa.types.is_boolean(arrow_type):
            return pd.BooleanDtype()
        elif pa.types.is_integer(arrow_type):
            return pd.Int64Dtype()

    columns = {
        "bools": pa.array([True, None, False], type=pa.bool_()),
        "ints": pa.array([1, None, 2], type=pa.int64()),
        "small_ints": pa.array([-1, 0, 7], type=pa.int8()),
    }
    record_batch = pa.RecordBatch.from_arrays(
        [columns["bools"], columns["ints"], columns["small_ints"]],
        ["bools", "ints", "small_ints"],
    )
    result = record_batch.to_pandas(types_mapper=types_mapper)

    expected = pd.DataFrame(
        {
            "bools": pd.Series([True, None, False], dtype="boolean"),
            "ints": pd.Series([1, None, 2], dtype="Int64"),
            "small_ints": pd.Series([-1, 0, 7], dtype="Int64"),
        }
    )
    tm.assert_frame_equal(result, expected)
def test_arrow_load_from_zero_chunks(data):
    """A table whose column has zero chunks converts back to an empty frame."""
    # GH-41040
    empty_df = pd.DataFrame({"a": data[0:0]})

    table = pa.table(empty_df)
    assert table.field("a").type == str(data.dtype.numpy_dtype)

    # rebuild the table around a chunked array that has no chunks at all
    no_chunks = pa.chunked_array([], type=table.field("a").type)
    table = pa.table([no_chunks], schema=table.schema)

    roundtripped = table.to_pandas()
    assert roundtripped["a"].dtype == data.dtype
    tm.assert_frame_equal(roundtripped, empty_df)
def test_arrow_from_arrow_uint():
    """``UInt32Dtype.__from_arrow__`` accepts an int64-typed arrow array."""
    # https://github.com/pandas-dev/pandas/issues/31896
    # possible mismatch in types
    int64_array = pa.array([1, 2, 3, 4, None], type="int64")
    converted = pd.UInt32Dtype().__from_arrow__(int64_array)

    expected = pd.array([1, 2, 3, 4, None], dtype="UInt32")
    tm.assert_extension_array_equal(converted, expected)
def test_arrow_sliced(data):
    """A sliced arrow table converts to the matching slice of the frame."""
    # https://github.com/pandas-dev/pandas/issues/38525
    with_missing = pd.DataFrame({"a": data})
    sliced = pa.table(with_missing).slice(2, None).to_pandas()
    tm.assert_frame_equal(sliced, with_missing.iloc[2:].reset_index(drop=True))

    # no missing values
    filled = with_missing.fillna(data[0])
    sliced = pa.table(filled).slice(2, None).to_pandas()
    tm.assert_frame_equal(sliced, filled.iloc[2:].reset_index(drop=True))
@pytest.fixture
def np_dtype_to_arrays(any_real_numpy_dtype):
    """
    Fixture building, for a given numpy dtype, the tuple
    ``(np_dtype, pa_array, np_expected, mask_expected)``: the dtype, a pyarrow
    array with a trailing null, and the expected numpy values and mask
    """
    np_dtype = np.dtype(any_real_numpy_dtype)

    # None ensures the creation of a bitmask buffer.
    pa_array = pa.array([0, 1, 2, None], type=pa.from_numpy_dtype(np_dtype))

    # Since masked Arrow buffer slots are not required to contain a specific
    # value, assert only the first three values of the created np.array
    np_expected = np.array([0, 1, 2], dtype=np_dtype)
    mask_expected = np.array([True, True, True, False])
    return np_dtype, pa_array, np_expected, mask_expected
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
    """
    Test conversion from pyarrow array to numpy array.

    Modifies the pyarrow buffer to contain padding and offset, which are
    considered valid buffers by pyarrow.

    Also tests empty pyarrow arrays with non empty buffers.
    See https://github.com/pandas-dev/pandas/issues/40896
    """
    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays

    def check_conversion(arrow_array, expected_values, expected_mask):
        # convert and compare the leading (at most three) values plus the mask
        values, mask = pyarrow_array_to_numpy_and_mask(arrow_array, np_dtype)
        tm.assert_numpy_array_equal(values[:3], expected_values)
        tm.assert_numpy_array_equal(mask, expected_mask)

    check_conversion(pa_array, np_expected, mask_expected)

    mask_buffer = pa_array.buffers()[0]
    data_buffer = pa_array.buffers()[1]
    data_buffer_bytes = pa_array.buffers()[1].to_pybytes()

    # Add trailing padding to the buffer.
    padded_buffer = pa.py_buffer(data_buffer_bytes + b"\x00")
    padded_array = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer, padded_buffer],
        offset=pa_array.offset,
    )
    padded_array.validate()
    check_conversion(padded_array, np_expected, mask_expected)

    # Add offset to the buffer.
    leading_slot = b"\x00" * (pa_array.type.bit_width // 8)
    shifted_data_buffer = pa.py_buffer(leading_slot + data_buffer_bytes)
    shifted_mask_buffer = pa.py_buffer(b"\x0E")
    shifted_array = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[shifted_mask_buffer, shifted_data_buffer],
        offset=pa_array.offset + 1,
    )
    shifted_array.validate()
    check_conversion(shifted_array, np_expected, mask_expected)

    # Empty array
    np_expected_empty = np.array([], dtype=np_dtype)
    mask_expected_empty = np.array([], dtype=np.bool_)

    empty_array = pa.Array.from_buffers(
        type=pa_array.type,
        length=0,
        buffers=[mask_buffer, data_buffer],
        offset=pa_array.offset,
    )
    empty_array.validate()
    check_conversion(empty_array, np_expected_empty, mask_expected_empty)
@pytest.mark.parametrize(
    "arr", [pa.nulls(10), pa.chunked_array([pa.nulls(4), pa.nulls(6)])]
)
def test_from_arrow_null(data, arr):
    """An all-null arrow array converts to an all-NA array of the same length."""
    converted = data.dtype.__from_arrow__(arr)
    assert converted.isna().all()
    assert len(converted) == 10
def test_from_arrow_type_error(data):
    """``__from_arrow__`` raises TypeError when given a string-typed arrow array."""
    # ensure that __from_arrow__ returns a TypeError when getting a wrong
    # array type
    string_array = pa.array(data).cast("string")

    # we don't test the exact error message, only the fact that it raises
    # a TypeError is relevant
    with pytest.raises(TypeError, match=None):
        data.dtype.__from_arrow__(string_array)

View File

@ -0,0 +1,74 @@
import numpy as np
import pytest
from pandas.core.dtypes.common import is_integer_dtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import BaseMaskedArray
# One masked array per nullable integer and floating dtype, each ending in a
# missing value.
arrays = [pd.array([1, 2, 3, None], dtype=int_dtype) for int_dtype in tm.ALL_INT_EA_DTYPES]
arrays += [
    pd.array([0.141, -0.268, 5.895, None], dtype=float_dtype)
    for float_dtype in tm.FLOAT_EA_DTYPES
]
@pytest.fixture(params=arrays, ids=[arr.dtype.name for arr in arrays])
def data(request):
    """
    Fixture yielding one 'data' array per integer and floating point
    extension dtype
    """
    masked_array = request.param
    return masked_array
@pytest.fixture()
def numpy_dtype(data):
    """
    Fixture returning the numpy dtype to convert the 'data' input array to:
    ``float`` for integer data, otherwise the array's own scalar type.
    """
    # For integer dtype, the numpy conversion must be done to float
    return float if is_integer_dtype(data) else data.dtype.type
def test_round(data, numpy_dtype):
# No arguments
result = data.round()
expected = pd.array(
np.round(data.to_numpy(dtype=numpy_dtype, na_value=None)), dtype=data.dtype
)
tm.assert_extension_array_equal(result, expected)
# Decimals argument
result = data.round(decimals=2)
expected = pd.array(
np.round(data.to_numpy(dtype=numpy_dtype, na_value=None), decimals=2),
dtype=data.dtype,
)
tm.assert_extension_array_equal(result, expected)
def test_tolist(data):
result = data.tolist()
expected = list(data)
tm.assert_equal(result, expected)
def test_to_numpy():
    """``to_numpy`` on a string-dtype masked subclass puts ``pd.NA`` at masked slots."""
    # GH#56991

    class MyStringArray(BaseMaskedArray):
        # minimal masked-array subclass declaring a string dtype
        dtype = pd.StringDtype()
        _dtype_cls = pd.StringDtype
        _internal_fill_value = pd.NA

    values = np.array(["a", "b", "c"])
    mask = np.array([False, True, False])
    arr = MyStringArray(values=values, mask=mask)

    expected = np.array(["a", pd.NA, "c"])
    tm.assert_numpy_array_equal(arr.to_numpy(), expected)

View File

@ -0,0 +1,60 @@
import re
import numpy as np
import pytest
import pandas as pd
class TestSetitemValidation:
    """Setting a scalar of the wrong type into a masked array raises TypeError."""

    def _check_setitem_invalid(self, arr, invalid):
        """Assert that assigning ``invalid`` into ``arr`` raises for each indexer form."""
        msg = re.escape(f"Invalid value '{invalid!s}' for dtype '{arr.dtype}'")

        # scalar position, full slice, list indexer
        for key in (0, slice(None), [0]):
            with pytest.raises(TypeError, match=msg):
                arr[key] = invalid

        # FIXME: don't leave commented-out
        # with pytest.raises(TypeError):
        #    arr[[0]] = [invalid]

        # with pytest.raises(TypeError):
        #    arr[[0]] = np.array([invalid], dtype=object)

        # Series non-coercion, behavior subject to change
        ser = pd.Series(arr)
        with pytest.raises(TypeError, match=msg):
            ser[0] = invalid
            # TODO: so, so many other variants of this...

    # scalars that none of the boolean / integer / floating arrays accept
    _invalid_scalars = [
        1 + 2j,
        "True",
        "1",
        "1.0",
        pd.NaT,
        np.datetime64("NaT"),
        np.timedelta64("NaT"),
    ]

    @pytest.mark.parametrize(
        "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)]
    )
    def test_setitem_validation_scalar_bool(self, invalid):
        """Boolean arrays additionally reject numeric scalars."""
        bool_arr = pd.array([True, False, None], dtype="boolean")
        self._check_setitem_invalid(bool_arr, invalid)

    @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)])
    def test_setitem_validation_scalar_int(self, invalid, any_int_ea_dtype):
        """Integer arrays additionally reject booleans and non-integral floats."""
        int_arr = pd.array([1, 2, None], dtype=any_int_ea_dtype)
        self._check_setitem_invalid(int_arr, invalid)

    @pytest.mark.parametrize("invalid", _invalid_scalars + [True])
    def test_setitem_validation_scalar_float(self, invalid, float_ea_dtype):
        """Floating arrays additionally reject booleans."""
        float_arr = pd.array([1, 2, None], dtype=float_ea_dtype)
        self._check_setitem_invalid(float_arr, invalid)

View File

@ -0,0 +1,154 @@
"""
Tests shared by MaskedArray subclasses.
"""
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.base import BaseOpsUtil
class ComparisonOps(BaseOpsUtil):
    """Comparison checks reused by the masked-array test modules."""

    def _compare_other(self, data, op, other):
        """Compare ``op(data, other)`` against the unmasked result with NA filled in.

        The check is done once on the array itself and once on a Series
        wrapping it.
        """
        na_positions = data._mask

        # array
        result = pd.Series(op(data, other))
        expected = pd.Series(op(data._data, other), dtype="boolean")
        # fill the nan locations
        expected[na_positions] = pd.NA
        tm.assert_series_equal(result, expected)

        # series
        result = op(pd.Series(data), other)
        # Set nullable dtype here to avoid upcasting when setting to pd.NA below
        expected = op(pd.Series(data._data), other).astype("boolean")
        # fill the nan locations
        expected[na_positions] = pd.NA
        tm.assert_series_equal(result, expected)

    # subclass will override to parametrize 'other'
    def test_scalar(self, other, comparison_op, dtype):
        """Comparing with a scalar keeps the mask; comparing with NA is all-NA."""
        left = pd.array([1, 0, None], dtype=dtype)
        result = comparison_op(left, other)

        if other is not pd.NA:
            expected = pd.arrays.BooleanArray(
                comparison_op(left._data, other), left._mask, copy=True
            )
        else:
            expected = pd.array([None, None, None], dtype="boolean")
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        result[0] = pd.NA
        tm.assert_extension_array_equal(left, pd.array([1, 0, None], dtype=dtype))
class NumericOps:
    """Tests shared by IntegerArray and FloatingArray, not BooleanArray."""

    def test_searchsorted_nan(self, dtype):
        """Searching for NaN gives position 10 from either side."""
        # The base class casts to object dtype, for which searchsorted returns
        # 0 from the left and 10 from the right.
        arr = pd.array(range(10), dtype=dtype)
        for side in ("left", "right"):
            assert arr.searchsorted(np.nan, side=side) == 10

    def test_no_shared_mask(self, data):
        """The result of ``data + 1`` must not share memory with ``data``."""
        assert not tm.shares_memory(data + 1, data)

    def test_array(self, comparison_op, dtype):
        """Array-vs-array comparison masks wherever either operand is masked."""
        left_values = [0, 1, 2, None, None, None]
        right_values = [0, 1, None, 0, 1, None]
        left = pd.array(left_values, dtype=dtype)
        right = pd.array(right_values, dtype=dtype)

        result = comparison_op(left, right)
        expected = pd.arrays.BooleanArray(
            comparison_op(left._data, right._data), left._mask | right._mask
        )
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        result[0] = pd.NA
        tm.assert_extension_array_equal(left, pd.array(left_values, dtype=dtype))
        tm.assert_extension_array_equal(right, pd.array(right_values, dtype=dtype))

    def test_compare_with_booleanarray(self, comparison_op, dtype):
        """Comparing a BooleanArray with 0/1 numerics matches the all-boolean case."""
        left = pd.array([True, False, None] * 3, dtype="boolean")
        right = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype=dtype)
        other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean")

        tm.assert_extension_array_equal(
            comparison_op(left, right), comparison_op(left, other)
        )

        # reversed op
        tm.assert_extension_array_equal(
            comparison_op(right, left), comparison_op(other, left)
        )

    def test_compare_to_string(self, dtype):
        """Equality with a string is False for valid entries and NA for missing."""
        # GH#28930
        ser = pd.Series([1, None], dtype=dtype)
        tm.assert_series_equal(
            ser == "a", pd.Series([False, pd.NA], dtype="boolean")
        )

    def test_ufunc_with_out(self, dtype):
        """Exercise ufuncs whose output target is an ndarray or a masked array."""
        arr = pd.array([1, 2, 3], dtype=dtype)
        arr2 = pd.array([1, 2, pd.NA], dtype=dtype)

        mask = arr == arr
        mask2 = arr2 == arr2

        target = np.zeros(3, dtype=bool)
        target |= mask
        # If MaskedArray.__array_ufunc__ handled "out" appropriately,
        #  `result` should still be an ndarray.
        assert isinstance(target, np.ndarray)
        assert target.all()

        # result |= mask worked because mask could be cast losslessly to
        #  boolean ndarray.  mask2 can't, so this raises
        target = np.zeros(3, dtype=bool)
        with pytest.raises(
            ValueError, match="Specify an appropriate 'na_value' for this dtype"
        ):
            target |= mask2

        # addition
        expected = pd.array([2, 4, pd.NA], dtype=dtype)
        tm.assert_extension_array_equal(np.add(arr, arr2), expected)

        # when passing out=arr, we will modify 'arr' inplace.
        res = np.add(arr, arr2, out=arr)
        assert res is arr
        tm.assert_extension_array_equal(res, expected)
        tm.assert_extension_array_equal(arr, expected)

    def test_mul_td64_array(self, dtype):
        """Multiplying by a timedelta64 ndarray gives timedeltas with NaT for NA."""
        # GH#45622
        arr = pd.array([1, 2, pd.NA], dtype=dtype)
        td64_values = np.arange(3, dtype=np.int64).view("m8[ns]")

        expected = pd.array([pd.Timedelta(0), pd.Timedelta(2), pd.NaT])
        tm.assert_extension_array_equal(arr * td64_values, expected)

View File

@ -0,0 +1,41 @@
import numpy as np
from pandas.core.dtypes.common import is_scalar
import pandas as pd
import pandas._testing as tm
class TestSearchsorted:
    """``searchsorted`` on arrays created through ``pd.array``."""

    def test_searchsorted_string(self, string_dtype):
        """A scalar needle returns a scalar position for both sides."""
        arr = pd.array(["a", "b", "c"], dtype=string_dtype)

        for side, position in (("left", 0), ("right", 1)):
            result = arr.searchsorted("a", side=side)
            assert is_scalar(result)
            assert result == position

    def test_searchsorted_numeric_dtypes_scalar(self, any_real_numpy_dtype):
        """Scalar needle gives a scalar; a one-element list gives an intp array."""
        arr = pd.array([1, 3, 90], dtype=any_real_numpy_dtype)

        scalar_result = arr.searchsorted(30)
        assert is_scalar(scalar_result)
        assert scalar_result == 2

        tm.assert_numpy_array_equal(
            arr.searchsorted([30]), np.array([2], dtype=np.intp)
        )

    def test_searchsorted_numeric_dtypes_vector(self, any_real_numpy_dtype):
        """A list of needles yields an intp array of positions."""
        arr = pd.array([1, 3, 90], dtype=any_real_numpy_dtype)
        tm.assert_numpy_array_equal(
            arr.searchsorted([2, 30]), np.array([1, 2], dtype=np.intp)
        )

    def test_searchsorted_sorter(self, any_real_numpy_dtype):
        """An unsorted array can be searched when ``sorter`` is supplied."""
        arr = pd.array([3, 1, 2], dtype=any_real_numpy_dtype)
        order = np.argsort(arr)
        tm.assert_numpy_array_equal(
            arr.searchsorted([0, 3], sorter=order), np.array([0, 2], dtype=np.intp)
        )

View File

@ -0,0 +1,351 @@
"""
Additional tests for NumpyExtensionArray that aren't covered by
the interface tests.
"""
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import NumpyEADtype
import pandas as pd
import pandas._testing as tm
from pandas.arrays import NumpyExtensionArray
@pytest.fixture(
    params=[
        np.array(["a", "b"], dtype=object),
        np.array([0, 1], dtype=float),
        np.array([0, 1], dtype=int),
        np.array([0, 1 + 2j], dtype=complex),
        np.array([True, False], dtype=bool),
        np.array([0, 1], dtype="datetime64[ns]"),
        np.array([0, 1], dtype="timedelta64[ns]"),
    ],
)
def any_numpy_array(request):
    """
    Parametrized fixture for NumPy arrays with different dtypes.

    This excludes string and bytes. A copy of the parameter is handed out so
    the array stored in ``params`` is not the object tests receive.
    """
    template = request.param
    return template.copy()
# ----------------------------------------------------------------------------
# NumpyEADtype
@pytest.mark.parametrize(
    "dtype, expected",
    [
        ("bool", True),
        ("int", True),
        ("uint", True),
        ("float", True),
        ("complex", True),
        ("str", False),
        ("bytes", False),
        ("datetime64[ns]", False),
        ("object", False),
        ("void", False),
    ],
)
def test_is_numeric(dtype, expected):
    """``NumpyEADtype._is_numeric`` matches the expected flag for each dtype."""
    assert NumpyEADtype(dtype)._is_numeric is expected
@pytest.mark.parametrize(
    "dtype, expected",
    [
        ("bool", True),
        ("int", False),
        ("uint", False),
        ("float", False),
        ("complex", False),
        ("str", False),
        ("bytes", False),
        ("datetime64[ns]", False),
        ("object", False),
        ("void", False),
    ],
)
def test_is_boolean(dtype, expected):
    """``NumpyEADtype._is_boolean`` matches the expected flag for each dtype."""
    assert NumpyEADtype(dtype)._is_boolean is expected
def test_repr():
dtype = NumpyEADtype(np.dtype("int64"))
assert repr(dtype) == "NumpyEADtype('int64')"
def test_constructor_from_string():
result = NumpyEADtype.construct_from_string("int64")
expected = NumpyEADtype(np.dtype("int64"))
assert result == expected
def test_dtype_idempotent(any_numpy_dtype):
    """Wrapping a ``NumpyEADtype`` in ``NumpyEADtype`` again gives an equal dtype."""
    once = NumpyEADtype(any_numpy_dtype)
    twice = NumpyEADtype(once)
    assert twice == once
# ----------------------------------------------------------------------------
# Construction
def test_constructor_no_coercion():
    """Passing a plain list to the constructor raises ``ValueError``."""
    not_an_ndarray = [1, 2, 3]
    with pytest.raises(ValueError, match="NumPy array"):
        NumpyExtensionArray(not_an_ndarray)
def test_series_constructor_with_copy():
ndarray = np.array([1, 2, 3])
ser = pd.Series(NumpyExtensionArray(ndarray), copy=True)
assert ser.values is not ndarray
def test_series_constructor_with_astype():
ndarray = np.array([1, 2, 3])
result = pd.Series(NumpyExtensionArray(ndarray), dtype="float64")
expected = pd.Series([1.0, 2.0, 3.0], dtype="float64")
tm.assert_series_equal(result, expected)
def test_from_sequence_dtype():
arr = np.array([1, 2, 3], dtype="int64")
result = NumpyExtensionArray._from_sequence(arr, dtype="uint64")
expected = NumpyExtensionArray(np.array([1, 2, 3], dtype="uint64"))
tm.assert_extension_array_equal(result, expected)
def test_constructor_copy():
arr = np.array([0, 1])
result = NumpyExtensionArray(arr, copy=True)
assert not tm.shares_memory(result, arr)
def test_constructor_with_data(any_numpy_array):
    """The extension dtype reports the numpy dtype of the wrapped array."""
    wrapped = NumpyExtensionArray(any_numpy_array)
    assert wrapped.dtype.numpy_dtype == any_numpy_array.dtype
# ----------------------------------------------------------------------------
# Conversion
def test_to_numpy():
arr = NumpyExtensionArray(np.array([1, 2, 3]))
result = arr.to_numpy()
assert result is arr._ndarray
result = arr.to_numpy(copy=True)
assert result is not arr._ndarray
result = arr.to_numpy(dtype="f8")
expected = np.array([1, 2, 3], dtype="f8")
tm.assert_numpy_array_equal(result, expected)
# ----------------------------------------------------------------------------
# Setitem
def test_setitem_series():
    """Assigning through ``Series.array`` is visible in the Series."""
    ser = pd.Series([1, 2, 3])
    backing = ser.array
    backing[0] = 10

    tm.assert_series_equal(ser, pd.Series([10, 2, 3]))
def test_setitem(any_numpy_array):
    """Item assignment on the extension array mirrors the same ndarray assignment."""
    reference = any_numpy_array
    wrapped = NumpyExtensionArray(reference, copy=True)

    # perform the same element copy on both containers
    wrapped[0] = wrapped[1]
    reference[0] = reference[1]

    tm.assert_numpy_array_equal(wrapped.to_numpy(), reference)
# ----------------------------------------------------------------------------
# Reductions
def test_bad_reduce_raises():
    """``_reduce`` raises ``TypeError`` when given an unknown reduction name."""
    arr = NumpyExtensionArray(np.array([1, 2, 3], dtype="int64"))

    # Pass an actual (non-existent) reduction name and match on that name.
    # Feeding the expected error text in as the name would make the ``match``
    # succeed merely because the error message echoes whatever name it got.
    bogus_reduction = "not_a_method"
    with pytest.raises(TypeError, match=bogus_reduction):
        arr._reduce(bogus_reduction)
def test_validate_reduction_keyword_args():
    """``all(keepdims=True)`` is rejected with a ``ValueError``."""
    arr = NumpyExtensionArray(np.array([1, 2, 3]))

    with pytest.raises(
        ValueError, match="the 'keepdims' parameter is not supported .*all"
    ):
        arr.all(keepdims=True)
def test_np_max_nested_tuples():
    """``max`` over nested tuples returns the largest element itself."""
    # case where checking in ufunc.nout works while checking for tuples
    #  does not
    nested = [
        (("j", "k"), ("l", "m")),
        (("l", "m"), ("o", "p")),
        (("o", "p"), ("j", "k")),
    ]
    ser = pd.Series(nested)
    arr = ser.array

    # method form returns the identical object stored at position 2
    assert arr.max() is arr[2]
    assert ser.max() is arr[2]

    # ufunc-reduce form, first on the array and then on the Series
    reduced = np.maximum.reduce(arr)
    assert reduced == arr[2]

    reduced = np.maximum.reduce(ser)
    assert reduced == arr[2]
def test_np_reduce_2d():
raw = np.arange(12).reshape(4, 3)
arr = NumpyExtensionArray(raw)
res = np.maximum.reduce(arr, axis=0)
tm.assert_extension_array_equal(res, arr[-1])
alt = arr.max(axis=0)
tm.assert_extension_array_equal(alt, arr[-1])
# ----------------------------------------------------------------------------
# Ops
@pytest.mark.parametrize("ufunc", [np.abs, np.negative, np.positive])
def test_ufunc_unary(ufunc):
    """Unary ufuncs return a wrapped result, with and without ``out``."""
    arr = NumpyExtensionArray(np.array([-1.0, 0.0, 1.0]))
    expected = NumpyExtensionArray(ufunc(arr._ndarray))

    tm.assert_extension_array_equal(ufunc(arr), expected)

    # same thing but with the 'out' keyword
    target = NumpyExtensionArray(np.array([-9.0, -9.0, -9.0]))
    ufunc(arr, out=target)
    tm.assert_extension_array_equal(target, expected)
def test_ufunc():
arr = NumpyExtensionArray(np.array([-1.0, 0.0, 1.0]))
r1, r2 = np.divmod(arr, np.add(arr, 2))
e1, e2 = np.divmod(arr._ndarray, np.add(arr._ndarray, 2))
e1 = NumpyExtensionArray(e1)
e2 = NumpyExtensionArray(e2)
tm.assert_extension_array_equal(r1, e1)
tm.assert_extension_array_equal(r2, e2)
def test_basic_binop():
# Just a basic smoke test. The EA interface tests exercise this
# more thoroughly.
x = NumpyExtensionArray(np.array([1, 2, 3]))
result = x + x
expected = NumpyExtensionArray(np.array([2, 4, 6]))
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("dtype", [None, object])
def test_setitem_object_typecode(dtype):
    """Replacing the first string works for inferred and explicit object dtype."""
    arr = NumpyExtensionArray(np.array(["a", "b", "c"], dtype=dtype))
    arr[0] = "t"

    tm.assert_extension_array_equal(
        arr, NumpyExtensionArray(np.array(["t", "b", "c"], dtype=dtype))
    )
def test_setitem_no_coercion():
    """Setting a string into an int array raises; a float is coerced to int."""
    # https://github.com/pandas-dev/pandas/issues/28150
    arr = NumpyExtensionArray(np.array([1, 2, 3]))
    with pytest.raises(ValueError, match="int"):
        arr[0] = "a"

    # With a value that we do coerce, check that we coerce the value
    #  and not the underlying array.
    arr[0] = 2.5
    first = arr[0]
    assert isinstance(first, (int, np.integer)), type(first)
def test_setitem_preserves_views():
# GH#28150, see also extension test of the same name
arr = NumpyExtensionArray(np.array([1, 2, 3]))
view1 = arr.view()
view2 = arr[:]
view3 = np.asarray(arr)
arr[0] = 9
assert view1[0] == 9
assert view2[0] == 9
assert view3[0] == 9
arr[-1] = 2.5
view1[-1] = 5
assert arr[-1] == 5
@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
def test_quantile_empty(dtype):
    """Quantiles of an empty integer array are NaN for every requested point."""
    # we should get back np.nans, not -1s
    empty = NumpyExtensionArray(np.array([], dtype=dtype))
    quantiles = pd.Index([0.0, 0.5])

    result = empty._quantile(quantiles, interpolation="linear")

    tm.assert_extension_array_equal(
        result, NumpyExtensionArray(np.array([np.nan, np.nan]))
    )
def test_factorize_unsigned():
# don't raise when calling factorize on unsigned int NumpyExtensionArray
arr = np.array([1, 2, 3], dtype=np.uint64)
obj = NumpyExtensionArray(arr)
res_codes, res_unique = obj.factorize()
exp_codes, exp_unique = pd.factorize(arr)
tm.assert_numpy_array_equal(res_codes, exp_codes)
tm.assert_extension_array_equal(res_unique, NumpyExtensionArray(exp_unique))
# ----------------------------------------------------------------------------
# Output formatting
def test_array_repr(any_numpy_array):
    """The repr shows the class name, the values, the length and the dtype."""
    # GH#61085
    nparray = any_numpy_array
    np_dtype = nparray.dtype
    arr = NumpyExtensionArray(nparray)

    # Pick the expected rendering of the two values for the fixture's dtype.
    if np_dtype == "object":
        values = "['a', 'b']"
    elif np_dtype == "float64":
        values = "[0.0, 1.0]"
    elif str(np_dtype).startswith("int"):
        values = "[0, 1]"
    elif np_dtype == "complex128":
        values = "[0j, (1+2j)]"
    elif np_dtype == "bool":
        values = "[True, False]"
    elif np_dtype == "datetime64[ns]":
        values = "[1970-01-01T00:00:00.000000000, 1970-01-01T00:00:00.000000001]"
    elif np_dtype == "timedelta64[ns]":
        values = "[0 nanoseconds, 1 nanoseconds]"

    expected = f"<NumpyExtensionArray>\n{values}\nLength: 2, dtype: {np_dtype}"
    result = repr(arr)
    assert result == expected, f"{result} vs {expected}"

View File

@ -0,0 +1,130 @@
import pytest
from pandas.compat.pyarrow import pa_version_under10p1
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import (
PeriodArray,
period_array,
)
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
pa = pytest.importorskip("pyarrow")
def test_arrow_extension_type():
    """``ArrowPeriodType`` equality and hashing follow the frequency."""
    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    daily = ArrowPeriodType("D")
    daily_again = ArrowPeriodType("D")
    monthly = ArrowPeriodType("M")

    assert daily.freq == "D"

    # same frequency: equal and same hash
    assert daily == daily_again
    assert hash(daily) == hash(daily_again)

    # different frequency: unequal and different hash
    assert daily != monthly
    assert hash(daily) != hash(monthly)
@pytest.mark.xfail(not pa_version_under10p1, reason="Wrong behavior with pyarrow 10")
@pytest.mark.parametrize(
    "data, freq",
    [
        (pd.date_range("2017", periods=3), "D"),
        (pd.date_range("2017", periods=3, freq="YE"), "Y-DEC"),
    ],
)
def test_arrow_array(data, freq):
    """Convert a PeriodArray with ``pa.array`` and check type, storage and errors."""
    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    periods = period_array(data, freq=freq)
    storage = pa.array(periods.asi8, type="int64")

    # default conversion yields the period extension type over int64 storage
    converted = pa.array(periods)
    assert isinstance(converted.type, ArrowPeriodType)
    assert converted.type.freq == freq
    assert converted.storage.equals(storage)

    # convert to its storage type
    converted = pa.array(periods, type=pa.int64())
    assert converted.equals(storage)

    # unsupported conversions
    with pytest.raises(
        TypeError, match="Not supported to convert PeriodArray to 'double' type"
    ):
        pa.array(periods, type="float64")

    with pytest.raises(TypeError, match="different 'freq'"):
        pa.array(periods, type=ArrowPeriodType("T"))
def test_arrow_array_missing():
    """A NaT entry becomes a null in the int64 storage of the arrow array."""
    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    periods = PeriodArray([1, 2, 3], dtype="period[D]")
    periods[1] = pd.NaT

    converted = pa.array(periods)

    assert isinstance(converted.type, ArrowPeriodType)
    assert converted.type.freq == "D"
    assert converted.storage.equals(pa.array([1, None, 3], type="int64"))
def test_arrow_table_roundtrip():
    """A period column survives DataFrame -> arrow table -> DataFrame."""
    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    periods = PeriodArray([1, 2, 3], dtype="period[D]")
    periods[1] = pd.NaT
    df = pd.DataFrame({"a": periods})

    table = pa.table(df)
    assert isinstance(table.field("a").type, ArrowPeriodType)

    roundtripped = table.to_pandas()
    assert isinstance(roundtripped["a"].dtype, PeriodDtype)
    tm.assert_frame_equal(roundtripped, df)

    # concatenating two tables gives a column made of several chunks
    doubled = pa.concat_tables([table, table]).to_pandas()
    tm.assert_frame_equal(doubled, pd.concat([df, df], ignore_index=True))
def test_arrow_load_from_zero_chunks():
    """A table whose period column has zero chunks converts to an empty frame."""
    # GH-41040

    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    df = pd.DataFrame({"a": PeriodArray([], dtype="period[D]")})

    table = pa.table(df)
    assert isinstance(table.field("a").type, ArrowPeriodType)

    # rebuild the table around a chunked array that contains no chunks
    column_type = table.column(0).type
    table = pa.table([pa.chunked_array([], type=column_type)], schema=table.schema)

    result = table.to_pandas()
    assert isinstance(result["a"].dtype, PeriodDtype)
    tm.assert_frame_equal(result, df)
def test_arrow_table_roundtrip_without_metadata():
    """The period dtype is restored even after the schema metadata is dropped."""
    periods = PeriodArray([1, 2, 3], dtype="period[h]")
    periods[1] = pd.NaT
    df = pd.DataFrame({"a": periods})

    # remove the metadata
    stripped = pa.table(df).replace_schema_metadata()
    assert stripped.schema.metadata is None

    result = stripped.to_pandas()
    assert isinstance(result["a"].dtype, PeriodDtype)
    tm.assert_frame_equal(result, df)

View File

@ -0,0 +1,67 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import period_array
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
def test_astype_int(dtype):
    """Only a dtype equal to int64 is accepted; any other integer dtype raises."""
    # We choose to ignore the sign and size of integers for
    # Period/Datetime/Timedelta astype
    arr = period_array(["2000", "2001", None], freq="D")

    if np.dtype(dtype) == np.int64:
        tm.assert_numpy_array_equal(arr.astype(dtype), arr._ndarray.view("i8"))
    else:
        with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"):
            arr.astype(dtype)
def test_astype_copies():
    """``copy=False`` returns a view on the backing data; ``copy=True`` does not."""
    arr = period_array(["2000", "2001", None], freq="D")
    backing = arr._ndarray

    no_copy = arr.astype(np.int64, copy=False)
    # Add the `.base`, since we now use `.asi8` which returns a view.
    # We could maybe override it in PeriodArray to return ._ndarray directly.
    assert no_copy.base is backing

    copied = arr.astype(np.int64, copy=True)
    assert copied is not backing
    tm.assert_numpy_array_equal(copied, backing.view("i8"))
def test_astype_categorical():
    """Casting to ``category`` uses the unique periods as categories, -1 for NaT."""
    arr = period_array(["2000", "2001", "2001", None], freq="D")

    expected = pd.Categorical.from_codes(
        [0, 1, 1, -1], categories=pd.PeriodIndex(["2000", "2001"], freq="D")
    )
    tm.assert_categorical_equal(arr.astype("category"), expected)
def test_astype_period():
    """Casting daily periods to ``PeriodDtype("M")`` gives the monthly periods."""
    labels = ["2000", "2001", None]
    daily = period_array(labels, freq="D")

    tm.assert_period_array_equal(
        daily.astype(PeriodDtype("M")), period_array(labels, freq="M")
    )
@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"])
def test_astype_datetime(dtype):
    """Period -> datetime64 is allowed, Period -> timedelta64 raises TypeError."""
    arr = period_array(["2000", "2001", None], freq="D")

    if dtype != "timedelta64[ns]":
        # GH#45038 allow period->dt64 because we allow dt64->period
        expected = pd.DatetimeIndex(["2000", "2001", pd.NaT], dtype=dtype)._data
        tm.assert_datetime_array_equal(arr.astype(dtype), expected)
    else:
        # slice off the [ns] so that the regex matches.
        with pytest.raises(TypeError, match=dtype[:-4]):
            arr.astype(dtype)

View File

@ -0,0 +1,156 @@
import numpy as np
import pytest
from pandas._libs.tslibs import iNaT
from pandas._libs.tslibs.offsets import MonthEnd
from pandas._libs.tslibs.period import IncompatibleFrequency
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import (
PeriodArray,
period_array,
)
@pytest.mark.parametrize(
    "data, freq, expected",
    [
        ([pd.Period("2017", "D")], None, [17167]),
        ([pd.Period("2017", "D")], "D", [17167]),
        ([2017], "D", [17167]),
        (["2017"], "D", [17167]),
        ([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]),
        ([pd.Period("2017", "D"), None], None, [17167, iNaT]),
        (pd.Series(pd.date_range("2017", periods=3)), None, [17167, 17168, 17169]),
        (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]),
        (pd.period_range("2017", periods=4, freq="Q"), None, [188, 189, 190, 191]),
    ],
)
def test_period_array_ok(data, freq, expected):
    """``period_array`` produces the expected int64 ordinals for each input."""
    ordinals = period_array(data, freq=freq).asi8
    tm.assert_numpy_array_equal(ordinals, np.asarray(expected, dtype=np.int64))
def test_period_array_readonly_object():
    """A read-only object ndarray of Periods is accepted by the constructors."""
    # https://github.com/pandas-dev/pandas/issues/25403
    periods = period_array([pd.Period("2019-01-01")])
    readonly = np.asarray(periods, dtype="object")
    readonly.setflags(write=False)

    tm.assert_period_array_equal(period_array(readonly), periods)
    tm.assert_series_equal(pd.Series(readonly), pd.Series(periods))
    tm.assert_frame_equal(pd.DataFrame({"A": readonly}), pd.DataFrame({"A": periods}))
def test_from_datetime64_freq_changes():
    """Daily datetimes converted with ``freq="M"`` all land in the same month."""
    # https://github.com/pandas-dev/pandas/issues/23438
    daily = pd.date_range("2017", periods=3, freq="D")

    converted = PeriodArray._from_datetime64(daily, freq="M")

    tm.assert_period_array_equal(
        converted, period_array(["2017-01-01"] * 3, freq="M")
    )
@pytest.mark.parametrize("freq", ["2M", MonthEnd(2)])
def test_from_datetime64_freq_2M(freq):
    """A two-month frequency works given as a string or as an offset object."""
    stamps = np.array(
        ["2020-01-01T00:00:00", "2020-01-02T00:00:00"], dtype="datetime64[ns]"
    )

    converted = PeriodArray._from_datetime64(stamps, freq)

    tm.assert_period_array_equal(
        converted, period_array(["2020-01", "2020-01"], freq=freq)
    )
@pytest.mark.parametrize(
    "data, freq, msg",
    [
        (
            [pd.Period("2017", "D"), pd.Period("2017", "Y")],
            None,
            "Input has different freq",
        ),
        ([pd.Period("2017", "D")], "Y", "Input has different freq"),
    ],
)
def test_period_array_raises(data, freq, msg):
    """Conflicting frequencies make ``period_array`` raise IncompatibleFrequency."""
    expected_error = pytest.raises(IncompatibleFrequency, match=msg)
    with expected_error:
        period_array(data, freq)
def test_period_array_non_period_series_raies():
    """Constructing a PeriodArray from an integer Series raises ``TypeError``."""
    int_series = pd.Series([1, 2, 3])
    with pytest.raises(TypeError, match="dtype"):
        PeriodArray(int_series, dtype="period[D]")
def test_period_array_freq_mismatch():
    """Re-wrapping a daily array with a monthly dtype raises IncompatibleFrequency."""
    daily = period_array(["2000", "2001"], freq="D")

    # the monthly dtype given once as a string and once as a PeriodDtype object
    monthly_dtypes = ("period[M]", pd.PeriodDtype(pd.tseries.offsets.MonthEnd()))
    for monthly in monthly_dtypes:
        with pytest.raises(IncompatibleFrequency, match="freq"):
            PeriodArray(daily, dtype=monthly)
def test_from_sequence_disallows_i8():
    """``_from_sequence`` rejects raw ordinals, as an ndarray or as a list."""
    arr = period_array(["2000", "2001"], freq="D")
    ordinals = arr.asi8

    # the error message is expected to mention the first ordinal
    msg = str(arr[0].ordinal)
    for raw in (ordinals, list(ordinals)):
        with pytest.raises(TypeError, match=msg):
            PeriodArray._from_sequence(raw, dtype=arr.dtype)
def test_from_td64nat_sequence_raises():
    """A timedelta64 NaT is rejected by every constructor given a period dtype."""
    # GH#44507
    td_nat = pd.NaT.to_numpy("m8[ns]")
    dtype = pd.period_range("2005-01-01", periods=3, freq="D").dtype

    arr = np.array([None], dtype=object)
    arr[0] = td_nat

    msg = "Value must be Period, string, integer, or datetime"
    constructors = (
        PeriodArray._from_sequence,
        pd.PeriodIndex,
        pd.Index,
        pd.array,
        pd.Series,
        pd.DataFrame,
    )
    for construct in constructors:
        with pytest.raises(ValueError, match=msg):
            construct(arr, dtype=dtype)
def test_freq_deprecated():
# GH#52462
data = np.arange(5).astype(np.int64)
msg = "The 'freq' keyword in the PeriodArray constructor is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = PeriodArray(data, freq="M")
expected = PeriodArray(data, dtype="period[M]")
tm.assert_equal(res, expected)
def test_period_array_from_datetime64():
    """datetime64 values convert to periods with a ``MonthEnd(2)`` frequency."""
    two_months = MonthEnd(2)
    stamps = np.array(
        ["2020-01-01T00:00:00", "2020-02-02T00:00:00"], dtype="datetime64[ns]"
    )

    converted = PeriodArray._from_datetime64(stamps, freq=two_months)

    tm.assert_period_array_equal(
        converted, period_array(["2020-01-01", "2020-02-01"], freq=two_months)
    )

View File

@ -0,0 +1,42 @@
import pytest
import pandas as pd
from pandas.core.arrays import period_array
class TestReductions:
    """``min``/``max`` reductions on ``PeriodArray``."""

    def test_min_max(self):
        """NaT is skipped by default and returned when ``skipna=False``."""
        arr = period_array(
            [
                "2000-01-03",
                "2000-01-03",
                "NaT",
                "2000-01-02",
                "2000-01-05",
                "2000-01-04",
            ],
            freq="D",
        )

        assert arr.min() == pd.Period("2000-01-02", freq="D")
        assert arr.max() == pd.Period("2000-01-05", freq="D")

        assert arr.min(skipna=False) is pd.NaT
        assert arr.max(skipna=False) is pd.NaT

    @pytest.mark.parametrize("skipna", [True, False])
    def test_min_max_empty(self, skipna):
        """Both reductions of an empty array return NaT regardless of ``skipna``."""
        empty = period_array([], freq="D")

        assert empty.min(skipna=skipna) is pd.NaT
        assert empty.max(skipna=skipna) is pd.NaT

View File

@ -0,0 +1,253 @@
import string
import numpy as np
import pytest
import pandas as pd
from pandas import SparseDtype
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestSeriesAccessor:
    """Tests for the ``Series.sparse`` accessor."""

    def test_to_dense(self):
        """``to_dense`` returns an ordinary Series with the same values."""
        sparse_ser = pd.Series([0, 1, 0, 10], dtype="Sparse[int64]")
        tm.assert_series_equal(sparse_ser.sparse.to_dense(), pd.Series([0, 1, 0, 10]))

    @pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"])
    def test_get_attributes(self, attr):
        """Accessor attributes are compared with those of the underlying array."""
        arr = SparseArray([0, 1])
        accessor = pd.Series(arr).sparse

        result = getattr(accessor, attr)
        expected = getattr(arr, attr)
        assert result == expected

    def test_from_coo(self):
        """Build a sparse Series from a scipy COO matrix."""
        scipy_sparse = pytest.importorskip("scipy.sparse")

        row = [0, 3, 1, 0]
        col = [0, 3, 1, 2]
        data = [4, 5, 7, 9]

        sp_array = scipy_sparse.coo_matrix((data, (row, col)))
        result = pd.Series.sparse.from_coo(sp_array)

        expected_index = pd.MultiIndex.from_arrays(
            [
                np.array([0, 0, 1, 3], dtype=np.int32),
                np.array([0, 2, 1, 3], dtype=np.int32),
            ],
        )
        expected = pd.Series([4, 9, 7, 5], index=expected_index, dtype="Sparse[int]")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "sort_labels, expected_rows, expected_cols, expected_values_pos",
        [
            (
                False,
                [("b", 2), ("a", 2), ("b", 1), ("a", 1)],
                [("z", 1), ("z", 2), ("x", 2), ("z", 0)],
                {1: (1, 0), 3: (3, 3)},
            ),
            (
                True,
                [("a", 1), ("a", 2), ("b", 1), ("b", 2)],
                [("x", 2), ("z", 0), ("z", 1), ("z", 2)],
                {1: (1, 2), 3: (0, 1)},
            ),
        ],
    )
    def test_to_coo(
        self, sort_labels, expected_rows, expected_cols, expected_values_pos
    ):
        """Convert a MultiIndexed sparse Series to COO, with and without sorting."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        values = SparseArray([0, np.nan, 1, 0, None, 3], fill_value=0)
        index = pd.MultiIndex.from_tuples(
            [
                ("b", 2, "z", 1),
                ("a", 2, "z", 2),
                ("a", 2, "z", 1),
                ("a", 2, "x", 2),
                ("b", 1, "z", 1),
                ("a", 1, "z", 0),
            ]
        )
        ss = pd.Series(values, index=index)

        # dense 4x4 matrix holding each expected value at its (row, col) position
        expected_A = np.zeros((4, 4))
        for value, position in expected_values_pos.items():
            expected_A[position[0], position[1]] = value

        A, rows, cols = ss.sparse.to_coo(
            row_levels=(0, 1), column_levels=(2, 3), sort_labels=sort_labels
        )
        assert isinstance(A, sp_sparse.coo_matrix)
        tm.assert_numpy_array_equal(A.toarray(), expected_A)
        assert rows == expected_rows
        assert cols == expected_cols

    def test_non_sparse_raises(self):
        """Using the accessor on a non-sparse Series raises ``AttributeError``."""
        dense_ser = pd.Series([1, 2, 3])
        with pytest.raises(AttributeError, match=".sparse"):
            dense_ser.sparse.density
class TestFrameAccessor:
    """Tests for the ``DataFrame.sparse`` accessor (plus some Series COO cases)."""

    def test_accessor_raises(self):
        """A frame without sparse columns has no usable ``.sparse`` accessor."""
        dense_df = pd.DataFrame({"A": [0, 1]})
        with pytest.raises(AttributeError, match="sparse"):
            dense_df.sparse

    @pytest.mark.parametrize("format", ["csc", "csr", "coo"])
    @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
    @pytest.mark.parametrize("dtype", ["float64", "int64"])
    def test_from_spmatrix(self, format, labels, dtype):
        """An identity matrix in each scipy format becomes an equal sparse frame."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        # fill value is the zero of the requested dtype, as a Python scalar
        sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item())

        mat = sp_sparse.eye(10, format=format, dtype=dtype)
        result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels)

        dense = pd.DataFrame(np.eye(10, dtype=dtype), index=labels, columns=labels)
        tm.assert_frame_equal(result, dense.astype(sp_dtype))

    @pytest.mark.parametrize("format", ["csc", "csr", "coo"])
    def test_from_spmatrix_including_explicit_zero(self, format):
        """A stored zero in the scipy matrix still yields the expected frame."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        mat = sp_sparse.random(10, 2, density=0.5, format=format)
        mat.data[0] = 0
        result = pd.DataFrame.sparse.from_spmatrix(mat)

        expected = pd.DataFrame(mat.todense()).astype(SparseDtype("float64", 0.0))
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "columns",
        [["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]],
    )
    def test_from_spmatrix_columns(self, columns):
        """Column labels may be plain, a MultiIndex, or duplicated."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        mat = sp_sparse.random(10, 2, density=0.5)
        result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)

        expected = pd.DataFrame(mat.toarray(), columns=columns).astype(
            SparseDtype("float64", 0.0)
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
    )
    def test_to_coo(self, colnames):
        """``to_coo`` matches a COO matrix built from the dense frame values."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        first, second = colnames[0], colnames[1]
        df = pd.DataFrame(
            {first: [0, 1, 0], second: [1, 0, 0]}, dtype="Sparse[int64, 0]"
        )

        result = df.sparse.to_coo()
        expected = sp_sparse.coo_matrix(np.asarray(df))
        assert (result != expected).nnz == 0

    @pytest.mark.parametrize("fill_value", [1, np.nan])
    def test_to_coo_nonzero_fill_val_raises(self, fill_value):
        """A fill value other than 0 makes ``to_coo`` raise ``ValueError``."""
        pytest.importorskip("scipy")

        column_a = SparseArray(
            [fill_value, fill_value, fill_value, 2], fill_value=fill_value
        )
        column_b = SparseArray(
            [fill_value, 2, fill_value, fill_value], fill_value=fill_value
        )
        df = pd.DataFrame({"A": column_a, "B": column_b})

        with pytest.raises(ValueError, match="fill value must be 0"):
            df.sparse.to_coo()

    def test_to_coo_midx_categorical(self):
        """``Series.sparse.to_coo`` works with categorical MultiIndex levels."""
        # GH#50996
        sp_sparse = pytest.importorskip("scipy.sparse")

        midx = pd.MultiIndex.from_arrays(
            [
                pd.CategoricalIndex(list("ab"), name="x"),
                pd.CategoricalIndex([0, 1], name="y"),
            ]
        )
        ser = pd.Series(1, index=midx, dtype="Sparse[int]")

        # to_coo returns a tuple; only its first element (the matrix) is compared
        result = ser.sparse.to_coo(row_levels=["x"], column_levels=["y"])[0]
        expected = sp_sparse.coo_matrix(
            (np.array([1, 1]), (np.array([0, 1]), np.array([0, 1]))), shape=(2, 2)
        )
        assert (result != expected).nnz == 0

    def test_to_dense(self):
        """``to_dense`` converts every sparse column, whatever its fill value."""
        row_labels = ["b", "a"]
        sparse_df = pd.DataFrame(
            {
                "A": SparseArray([1, 0], dtype=SparseDtype("int64", 0)),
                "B": SparseArray([1, 0], dtype=SparseDtype("int64", 1)),
                "C": SparseArray([1.0, 0.0], dtype=SparseDtype("float64", 0.0)),
            },
            index=row_labels,
        )

        expected = pd.DataFrame(
            {"A": [1, 0], "B": [1, 0], "C": [1.0, 0.0]}, index=row_labels
        )
        tm.assert_frame_equal(sparse_df.sparse.to_dense(), expected)

    def test_density(self):
        """The frame-level density is 0.75 for this two-column example."""
        df = pd.DataFrame(
            {
                "A": SparseArray([1, 0, 2, 1], fill_value=0),
                "B": SparseArray([0, 1, 1, 1], fill_value=0),
            }
        )
        assert df.sparse.density == 0.75

    @pytest.mark.parametrize("dtype", ["int64", "float64"])
    @pytest.mark.parametrize("dense_index", [True, False])
    def test_series_from_coo(self, dtype, dense_index):
        """``Series.sparse.from_coo`` on an identity matrix, optionally dense-indexed."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        A = sp_sparse.eye(3, format="coo", dtype=dtype)
        result = pd.Series.sparse.from_coo(A, dense_index=dense_index)

        index = pd.MultiIndex.from_tuples(
            [
                np.array([0, 0], dtype=np.int32),
                np.array([1, 1], dtype=np.int32),
                np.array([2, 2], dtype=np.int32),
            ],
        )
        expected = pd.Series(SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index)
        if dense_index:
            full_index = pd.MultiIndex.from_product(index.levels)
            expected = expected.reindex(full_index)

        tm.assert_series_equal(result, expected)

    def test_series_from_coo_incorrect_format_raises(self):
        """Passing a CSR matrix to ``from_coo`` raises ``TypeError``."""
        # gh-26554
        sp_sparse = pytest.importorskip("scipy.sparse")

        csr = sp_sparse.csr_matrix(np.array([[0, 1], [0, 0]]))
        with pytest.raises(
            TypeError, match="Expected coo_matrix. Got csr_matrix instead."
        ):
            pd.Series.sparse.from_coo(csr)

    def test_with_column_named_sparse(self):
        """A column called "sparse" does not shadow the ``.sparse`` accessor."""
        # https://github.com/pandas-dev/pandas/issues/30758
        df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])})
        accessor_cls = pd.core.arrays.sparse.accessor.SparseFrameAccessor
        assert isinstance(df.sparse, accessor_cls)

View File

@ -0,0 +1,514 @@
import operator
import numpy as np
import pytest
import pandas as pd
from pandas import SparseDtype
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
@pytest.fixture(params=["integer", "block"])
def kind(request):
    """Parametrized value ("integer" or "block") for SparseArray's ``kind`` kwarg."""
    index_kind = request.param
    return index_kind
@pytest.fixture(params=[True, False])
def mix(request):
    """
    Parametrized boolean flag.

    True means the test operates op(sparse, dense); False means
    op(sparse, sparse).
    """
    use_dense_operand = request.param
    return use_dense_operand
class TestSparseArrayArithmetics:
    """Compare SparseArray operators with the same operations on dense data."""

    def _assert(self, a, b):
        # Upstream note: "We have to use tm.assert_sp_array_equal. See GH #45126".
        # As written, the two operands are compared as numpy arrays.
        tm.assert_numpy_array_equal(a, b)

    def _check_numeric_ops(self, a, b, a_dense, b_dense, mix: bool, op):
        """
        Check that arithmetic behavior matches non-Sparse Series arithmetic.

        ``op(a, b_dense)`` is evaluated when ``mix`` is true, ``op(a, b)``
        otherwise; the densified result is compared with the Series result.
        """
        if isinstance(a_dense, np.ndarray):
            expected = op(pd.Series(a_dense), b_dense).values
        elif isinstance(b_dense, np.ndarray):
            expected = op(a_dense, pd.Series(b_dense)).values
        else:
            raise NotImplementedError

        right = b_dense if mix else b
        with np.errstate(invalid="ignore", divide="ignore"):
            result = op(a, right).to_dense()

        self._assert(result, expected)

    def _check_bool_result(self, res):
        """Assert ``res`` is a boolean SparseArray with a ``bool`` fill value."""
        assert isinstance(res, SparseArray)
        res_dtype = res.dtype
        assert isinstance(res_dtype, SparseDtype)
        assert res_dtype.subtype == np.bool_
        assert isinstance(res.fill_value, bool)

    def _check_comparison_ops(self, a, b, a_dense, b_dense):
        """Run ==, !=, >=, <=, >, < for sparse-vs-sparse, then sparse-vs-dense."""
        comparisons = (
            operator.eq,
            operator.ne,
            operator.ge,
            operator.le,
            operator.gt,
            operator.lt,
        )
        with np.errstate(invalid="ignore"):
            # Unfortunately, wrapping the computation of each expected
            # value with np.errstate() individually is too tedious.
            #
            # first pass: sparse & sparse, second pass: sparse & dense
            for right in (b, b_dense):
                for compare in comparisons:
                    self._check_bool_result(compare(a, right))
                    self._assert(
                        compare(a, right).to_dense(), compare(a_dense, b_dense)
                    )

    def _check_logical_ops(self, a, b, a_dense, b_dense):
        """Run & and | for sparse-vs-sparse, then sparse-vs-dense."""
        for right in (b, b_dense):
            for logical in (operator.and_, operator.or_):
                self._check_bool_result(logical(a, right))
                self._assert(logical(a, right).to_dense(), logical(a_dense, b_dense))

    @pytest.mark.parametrize("scalar", [0, 1, 3])
    @pytest.mark.parametrize("fill_value", [None, 0, 2])
    def test_float_scalar(
        self, kind, mix, all_arithmetic_functions, fill_value, scalar, request
    ):
        dense = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        sparse = SparseArray(dense, kind=kind, fill_value=fill_value)
        self._check_numeric_ops(
            sparse, scalar, dense, scalar, mix, all_arithmetic_functions
        )

    def test_float_scalar_comparison(self, kind):
        dense = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        for fill_value in (None, 0, 2):
            sparse = SparseArray(dense, kind=kind, fill_value=fill_value)
            for scalar in (1, 0, 3):
                self._check_comparison_ops(sparse, scalar, dense, scalar)

    def test_float_same_index_without_nans(self, kind, mix, all_arithmetic_functions):
        # when sp_index are the same
        lvals = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
        rvals = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])

        left = SparseArray(lvals, kind=kind, fill_value=0)
        right = SparseArray(rvals, kind=kind, fill_value=0)
        self._check_numeric_ops(
            left, right, lvals, rvals, mix, all_arithmetic_functions
        )

    def test_float_same_index_with_nans(
        self, kind, mix, all_arithmetic_functions, request
    ):
        # when sp_index are the same
        lvals = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvals = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])

        left = SparseArray(lvals, kind=kind)
        right = SparseArray(rvals, kind=kind)
        self._check_numeric_ops(
            left, right, lvals, rvals, mix, all_arithmetic_functions
        )

    def test_float_same_index_comparison(self, kind):
        # when sp_index are the same
        lvals = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvals = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
        left = SparseArray(lvals, kind=kind)
        right = SparseArray(rvals, kind=kind)
        self._check_comparison_ops(left, right, lvals, rvals)

        lvals = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
        rvals = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
        left = SparseArray(lvals, kind=kind, fill_value=0)
        right = SparseArray(rvals, kind=kind, fill_value=0)
        self._check_comparison_ops(left, right, lvals, rvals)

    def test_float_array(self, kind, mix, all_arithmetic_functions):
        op = all_arithmetic_functions
        lvals = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvals = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])

        left = SparseArray(lvals, kind=kind)
        right = SparseArray(rvals, kind=kind)
        self._check_numeric_ops(left, right, lvals, rvals, mix, op)
        self._check_numeric_ops(left, right * 0, lvals, rvals * 0, mix, op)

        # (left fill_value, right fill_value); None is the constructor default
        for left_fill, right_fill in [(0, None), (0, 0), (1, 2)]:
            left = SparseArray(lvals, kind=kind, fill_value=left_fill)
            right = SparseArray(rvals, kind=kind, fill_value=right_fill)
            self._check_numeric_ops(left, right, lvals, rvals, mix, op)

    def test_float_array_different_kind(self, mix, all_arithmetic_functions):
        op = all_arithmetic_functions
        lvals = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvals = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])

        left = SparseArray(lvals, kind="integer")
        right = SparseArray(rvals, kind="block")
        self._check_numeric_ops(left, right, lvals, rvals, mix, op)
        self._check_numeric_ops(left, right * 0, lvals, rvals * 0, mix, op)

        # (left fill_value, right fill_value); None is the constructor default
        for left_fill, right_fill in [(0, None), (0, 0), (1, 2)]:
            left = SparseArray(lvals, kind="integer", fill_value=left_fill)
            right = SparseArray(rvals, kind="block", fill_value=right_fill)
            self._check_numeric_ops(left, right, lvals, rvals, mix, op)

    def test_float_array_comparison(self, kind):
        lvals = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvals = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])

        left = SparseArray(lvals, kind=kind)
        right = SparseArray(rvals, kind=kind)
        self._check_comparison_ops(left, right, lvals, rvals)
        self._check_comparison_ops(left, right * 0, lvals, rvals * 0)

        # (left fill_value, right fill_value); None is the constructor default
        for left_fill, right_fill in [(0, None), (0, 0), (1, 2)]:
            left = SparseArray(lvals, kind=kind, fill_value=left_fill)
            right = SparseArray(rvals, kind=kind, fill_value=right_fill)
            self._check_comparison_ops(left, right, lvals, rvals)

    def test_int_array(self, kind, mix, all_arithmetic_functions):
        op = all_arithmetic_functions
        # have to specify dtype explicitly until fixing GH 667
        dtype = np.int64
        lvals = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
        rvals = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)

        left = SparseArray(lvals, dtype=dtype, kind=kind)
        assert left.dtype == SparseDtype(dtype)
        right = SparseArray(rvals, dtype=dtype, kind=kind)
        assert right.dtype == SparseDtype(dtype)
        self._check_numeric_ops(left, right, lvals, rvals, mix, op)
        self._check_numeric_ops(left, right * 0, lvals, rvals * 0, mix, op)

        # (left fill, right fill, expected left dtype, expected right dtype)
        cases = [
            (0, None, SparseDtype(dtype), SparseDtype(dtype)),
            (0, 0, SparseDtype(dtype), SparseDtype(dtype)),
            (
                1,
                2,
                SparseDtype(dtype, fill_value=1),
                SparseDtype(dtype, fill_value=2),
            ),
        ]
        for left_fill, right_fill, left_dtype, right_dtype in cases:
            left = SparseArray(lvals, fill_value=left_fill, dtype=dtype, kind=kind)
            assert left.dtype == left_dtype
            right = SparseArray(rvals, fill_value=right_fill, dtype=dtype, kind=kind)
            assert right.dtype == right_dtype
            self._check_numeric_ops(left, right, lvals, rvals, mix, op)

    def test_int_array_comparison(self, kind):
        dtype = "int64"
        # int32 NI ATM
        lvals = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
        rvals = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)

        left = SparseArray(lvals, dtype=dtype, kind=kind)
        right = SparseArray(rvals, dtype=dtype, kind=kind)
        self._check_comparison_ops(left, right, lvals, rvals)
        self._check_comparison_ops(left, right * 0, lvals, rvals * 0)

        # (left fill_value, right fill_value); None is the constructor default
        for left_fill, right_fill in [(0, None), (0, 0), (1, 2)]:
            left = SparseArray(lvals, dtype=dtype, kind=kind, fill_value=left_fill)
            right = SparseArray(rvals, dtype=dtype, kind=kind, fill_value=right_fill)
            self._check_comparison_ops(left, right, lvals, rvals)

    @pytest.mark.parametrize("fill_value", [True, False, np.nan])
    def test_bool_same_index(self, kind, fill_value):
        # GH 14000
        # when sp_index are the same
        lvals = np.array([True, False, True, True], dtype=np.bool_)
        rvals = np.array([True, False, True, True], dtype=np.bool_)
        left = SparseArray(lvals, kind=kind, dtype=np.bool_, fill_value=fill_value)
        right = SparseArray(rvals, kind=kind, dtype=np.bool_, fill_value=fill_value)
        self._check_logical_ops(left, right, lvals, rvals)

    @pytest.mark.parametrize("fill_value", [True, False, np.nan])
    def test_bool_array_logical(self, kind, fill_value):
        # GH 14000
        # when sp_index are the same
        lvals = np.array([True, False, True, False, True, True], dtype=np.bool_)
        rvals = np.array([True, False, False, True, False, True], dtype=np.bool_)
        left = SparseArray(lvals, kind=kind, dtype=np.bool_, fill_value=fill_value)
        right = SparseArray(rvals, kind=kind, dtype=np.bool_, fill_value=fill_value)
        self._check_logical_ops(left, right, lvals, rvals)

    def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request):
        op = all_arithmetic_functions
        rdtype = "int64"
        lvals = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvals = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)

        left = SparseArray(lvals, kind=kind)
        right = SparseArray(rvals, kind=kind)
        assert right.dtype == SparseDtype(rdtype)
        self._check_numeric_ops(left, right, lvals, rvals, mix, op)
        self._check_numeric_ops(left, right * 0, lvals, rvals * 0, mix, op)

        # (left fill, right fill, expected dtype of the right operand)
        cases = [
            (0, None, SparseDtype(rdtype)),
            (0, 0, SparseDtype(rdtype)),
            (1, 2, SparseDtype(rdtype, fill_value=2)),
        ]
        for left_fill, right_fill, right_dtype in cases:
            left = SparseArray(lvals, kind=kind, fill_value=left_fill)
            right = SparseArray(rvals, kind=kind, fill_value=right_fill)
            assert right.dtype == right_dtype
            self._check_numeric_ops(left, right, lvals, rvals, mix, op)

    def test_mixed_array_comparison(self, kind):
        rdtype = "int64"
        # int32 NI ATM
        lvals = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvals = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)

        left = SparseArray(lvals, kind=kind)
        right = SparseArray(rvals, kind=kind)
        assert right.dtype == SparseDtype(rdtype)
        self._check_comparison_ops(left, right, lvals, rvals)
        self._check_comparison_ops(left, right * 0, lvals, rvals * 0)

        # (left fill, right fill, expected dtype of the right operand)
        cases = [
            (0, None, SparseDtype(rdtype)),
            (0, 0, SparseDtype(rdtype)),
            (1, 2, SparseDtype(rdtype, fill_value=2)),
        ]
        for left_fill, right_fill, right_dtype in cases:
            left = SparseArray(lvals, kind=kind, fill_value=left_fill)
            right = SparseArray(rvals, kind=kind, fill_value=right_fill)
            assert right.dtype == right_dtype
            self._check_comparison_ops(left, right, lvals, rvals)

    def test_xor(self):
        lhs = SparseArray([True, True, False, False])
        rhs = SparseArray([True, False, True, False])

        expected_index = pd.core.arrays.sparse.IntIndex(
            4, np.array([0, 1, 2], dtype="int32")
        )
        expected = SparseArray([False, True, True], sparse_index=expected_index)

        tm.assert_sp_array_equal(lhs ^ rhs, expected)
@pytest.mark.parametrize("op", [operator.eq, operator.add])
def test_with_list(op):
    """A plain list operand gives the same result as a SparseArray operand."""
    sparse = SparseArray([0, 1], fill_value=0)
    from_list = op(sparse, [0, 1])
    from_sparse = op(sparse, SparseArray([0, 1]))
    tm.assert_sp_array_equal(from_list, from_sparse)
def test_with_dataframe():
    """``SparseArray.__add__`` returns NotImplemented for a DataFrame operand."""
    # GH#27910
    sparse = SparseArray([0, 1], fill_value=0)
    frame = pd.DataFrame([[1, 2], [3, 4]])
    assert sparse.__add__(frame) is NotImplemented
def test_with_zerodim_ndarray():
    """Multiplying by a zero-dim ndarray matches multiplying by the scalar."""
    # GH#27910
    sparse = SparseArray([0, 1], fill_value=0)
    by_zerodim = sparse * np.array(2)
    by_scalar = sparse * 2
    tm.assert_sp_array_equal(by_zerodim, by_scalar)
@pytest.mark.parametrize("ufunc", [np.abs, np.exp])
@pytest.mark.parametrize(
    "arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])]
)
def test_ufuncs(ufunc, arr):
    """A unary ufunc is applied to both the values and the fill value."""
    result = ufunc(arr)

    transformed_fill = ufunc(arr.fill_value)
    transformed_dense = ufunc(np.asarray(arr))
    expected = SparseArray(transformed_dense, fill_value=transformed_fill)

    tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize(
    "a, b",
    [
        # The fill_value=1 case used to be listed four times verbatim; the
        # duplicates exercised nothing new and have been collapsed.
        (SparseArray([0, 0, 0]), np.array([0, 1, 2])),
        (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
    ],
)
@pytest.mark.parametrize("ufunc", [np.add, np.greater])
def test_binary_ufuncs(ufunc, a, b):
    """
    A binary ufunc on (SparseArray, ndarray) returns a SparseArray whose
    dense values match the ufunc applied to the dense operands.
    """
    # can't say anything about fill value here.
    result = ufunc(a, b)
    expected = ufunc(np.asarray(a), np.asarray(b))
    assert isinstance(result, SparseArray)
    tm.assert_numpy_array_equal(np.asarray(result), expected)
def test_ndarray_inplace():
    """In-place ``ndarray += SparseArray`` leaves an ndarray holding the sum."""
    addend = SparseArray([0, 2, 0, 0])
    target = np.array([0, 1, 2, 3])
    target += addend
    tm.assert_numpy_array_equal(target, np.array([0, 3, 2, 3]))
def test_sparray_inplace():
    """In-place ``SparseArray += ndarray`` yields a SparseArray holding the sum."""
    target = SparseArray([0, 2, 0, 0])
    addend = np.array([0, 1, 2, 3])
    target += addend
    tm.assert_sp_array_equal(target, SparseArray([0, 3, 2, 3], fill_value=0))
@pytest.mark.parametrize("cons", [list, np.array, SparseArray])
def test_mismatched_length_cmp_op(cons):
    """``&`` between operands of length 2 and 3 raises ValueError."""
    shorter = SparseArray([True, True])
    longer = cons([True, True, True])
    expected_message = "operands have mismatched length"
    with pytest.raises(ValueError, match=expected_message):
        shorter & longer
@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"])
@pytest.mark.parametrize("fill_value", [np.nan, 3])
def test_binary_operators(op, fill_value):
    """Binary operators agree across sparse/sparse, sparse/dense and sparse/scalar."""
    op = getattr(operator, op)

    left_data = np.random.default_rng(2).standard_normal(20)
    right_data = np.random.default_rng(2).standard_normal(20)
    # Punch fill-valued holes at different strides into each operand.
    left_data[::2] = fill_value
    right_data[::3] = fill_value

    first = SparseArray(left_data, fill_value=fill_value)
    second = SparseArray(right_data, fill_value=fill_value)

    with np.errstate(all="ignore"):
        sparse_sparse = op(first, second)
        dense_reference = SparseArray(
            op(first.to_dense(), second.to_dense()), fill_value=first.fill_value
        )
        assert isinstance(sparse_sparse, SparseArray)
        tm.assert_almost_equal(sparse_sparse.to_dense(), dense_reference.to_dense())

        sparse_dense = op(first, second.to_dense())
        assert isinstance(sparse_dense, SparseArray)
        tm.assert_sp_array_equal(sparse_sparse, sparse_dense)

        dense_sparse = op(first.to_dense(), second)
        assert isinstance(dense_sparse, SparseArray)
        tm.assert_sp_array_equal(sparse_sparse, dense_sparse)

        sparse_scalar = op(first, 4)
        assert isinstance(sparse_scalar, SparseArray)

        # Ignore this if the actual op raises (e.g. pow).
        try:
            scalar_expected = op(first.to_dense(), 4)
            fill_expected = op(first.fill_value, 4)
        except ValueError:
            pass
        else:
            tm.assert_almost_equal(sparse_scalar.fill_value, fill_expected)
            tm.assert_almost_equal(sparse_scalar.to_dense(), scalar_expected)

View File

@ -0,0 +1,511 @@
import re
import numpy as np
import pytest
from pandas._libs.sparse import IntIndex
from pandas.compat.numpy import np_version_gt2
import pandas as pd
from pandas import (
SparseDtype,
isna,
)
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
@pytest.fixture
def arr_data():
    """Fixture returning a float ndarray that mixes NaN with the values 1-6."""
    nan = np.nan
    return np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6])
@pytest.fixture
def arr(arr_data):
    """Fixture returning a SparseArray built from the 'arr_data' fixture."""
    sparse = SparseArray(arr_data)
    return sparse
@pytest.fixture
def zarr():
    """Fixture returning an integer SparseArray constructed with 'fill_value=0'."""
    entries = [0, 0, 1, 2, 3, 0, 4, 5, 0, 6]
    return SparseArray(entries, fill_value=0)
class TestSparseArray:
    """Tests for SparseArray fill values, copying, pickling, fillna and nonzero."""

    @pytest.mark.parametrize("fill_value", [0, None, np.nan])
    def test_shift_fill_value(self, fill_value):
        # GH #24128
        sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0)
        shifted = sparse.shift(1, fill_value=fill_value)

        # A missing fill is expected to show up as the result dtype's NA value.
        leading = shifted.dtype.na_value if isna(fill_value) else fill_value
        expected = SparseArray(np.array([leading, 1, 0, 0, 3]), fill_value=8.0)
        tm.assert_sp_array_equal(shifted, expected)

    def test_set_fill_value(self):
        deprecation = (
            "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
        )

        sparse = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan)
        sparse.fill_value = 2
        assert sparse.fill_value == 2

        sparse = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
        sparse.fill_value = 2
        assert sparse.fill_value == 2

        # A float fill on an int64 array is expected to warn but still apply.
        with tm.assert_produces_warning(FutureWarning, match=deprecation):
            sparse.fill_value = 3.1
        assert sparse.fill_value == 3.1

        sparse.fill_value = np.nan
        assert np.isnan(sparse.fill_value)

        sparse = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
        sparse.fill_value = True
        assert sparse.fill_value is True

        with tm.assert_produces_warning(FutureWarning, match=deprecation):
            sparse.fill_value = 0

        sparse.fill_value = np.nan
        assert np.isnan(sparse.fill_value)

    @pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)])
    def test_set_fill_invalid_non_scalar(self, val):
        sparse = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
        expected_message = "fill_value must be a scalar"

        with pytest.raises(ValueError, match=expected_message):
            sparse.fill_value = val

    def test_copy(self, arr):
        duplicate = arr.copy()
        # The values are a new object while the index object is shared.
        assert duplicate.sp_values is not arr.sp_values
        assert duplicate.sp_index is arr.sp_index

    def test_values_asarray(self, arr_data, arr):
        dense = arr.to_dense()
        tm.assert_almost_equal(dense, arr_data)

    @pytest.mark.parametrize(
        "data,shape,dtype",
        [
            ([0, 0, 0, 0, 0], (5,), None),
            ([], (0,), None),
            ([0], (1,), None),
            (["A", "A", np.nan, "B"], (4,), object),
        ],
    )
    def test_shape(self, data, shape, dtype):
        # GH 21126
        sparse = SparseArray(data, dtype=dtype)
        assert sparse.shape == shape

    @pytest.mark.parametrize(
        "vals",
        [
            [np.nan, np.nan, np.nan, np.nan, np.nan],
            [1, np.nan, np.nan, 3, np.nan],
            [1, np.nan, 0, 3, 0],
        ],
    )
    @pytest.mark.parametrize("fill_value", [None, 0])
    def test_dense_repr(self, vals, fill_value):
        vals = np.array(vals)
        sparse = SparseArray(vals, fill_value=fill_value)
        tm.assert_numpy_array_equal(sparse.to_dense(), vals)

    @pytest.mark.parametrize("fix", ["arr", "zarr"])
    def test_pickle(self, fix, request):
        original = request.getfixturevalue(fix)
        restored = tm.round_trip_pickle(original)
        tm.assert_sp_array_equal(restored, original)

    def test_generator_warnings(self):
        sparse = SparseArray([1, 2, 3])
        # Plain iteration must not emit any warning.
        with tm.assert_produces_warning(None):
            for _ in sparse:
                pass

    def test_where_retain_fill_value(self):
        # GH#45691 don't lose fill_value on _where
        sparse = SparseArray([np.nan, 1.0], fill_value=0)
        mask = np.array([True, False])
        expected = SparseArray([1, 1.0], fill_value=0)

        from_array = sparse._where(~mask, 1)
        tm.assert_sp_array_equal(from_array, expected)

        series = pd.Series(sparse)
        from_series = series.where(~mask, 1)
        tm.assert_series_equal(from_series, pd.Series(expected))

    def test_fillna(self):
        nan = np.nan
        # (input data, data expected after fillna(-1))
        float_cases = [
            ([1, nan, nan, 3, nan], [1, -1, -1, 3, -1]),
            ([1, nan, 0, 3, 0], [1, -1, 0, 3, 0]),
            ([nan, nan, nan, nan], [-1, -1, -1, -1]),
        ]
        # (fill_value given to the constructor, fill_value expected afterwards);
        # None is the constructor default
        fill_cases = [(None, -1), (0, 0)]
        for data, filled in float_cases:
            for fill_value, expected_fill in fill_cases:
                sparse = SparseArray(data, fill_value=fill_value)
                result = sparse.fillna(-1)
                expected = SparseArray(
                    filled, fill_value=expected_fill, dtype=np.float64
                )
                tm.assert_sp_array_equal(result, expected)

        # float dtype's fill_value is np.nan, replaced by -1
        sparse = SparseArray([0.0, 0.0, 0.0, 0.0])
        result = sparse.fillna(-1)
        expected = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1)
        tm.assert_sp_array_equal(result, expected)

        # int dtype shouldn't have missing. No changes.
        sparse = SparseArray([0, 0, 0, 0])
        assert sparse.dtype == SparseDtype(np.int64)
        assert sparse.fill_value == 0
        result = sparse.fillna(-1)
        tm.assert_sp_array_equal(result, sparse)

        sparse = SparseArray([0, 0, 0, 0], fill_value=0)
        assert sparse.dtype == SparseDtype(np.int64)
        assert sparse.fill_value == 0
        result = sparse.fillna(-1)
        expected = SparseArray([0, 0, 0, 0], fill_value=0)
        tm.assert_sp_array_equal(result, expected)

        # fill_value can be nan if there is no missing hole.
        # only fill_value will be changed
        sparse = SparseArray([0, 0, 0, 0], fill_value=np.nan)
        assert sparse.dtype == SparseDtype(np.int64, fill_value=np.nan)
        assert np.isnan(sparse.fill_value)
        result = sparse.fillna(-1)
        expected = SparseArray([0, 0, 0, 0], fill_value=-1)
        tm.assert_sp_array_equal(result, expected)

    def test_fillna_overlap(self):
        data = [1, np.nan, np.nan, 3, np.nan]

        sparse = SparseArray(data)
        # filling with existing value doesn't replace existing value with
        # fill_value, i.e. existing 3 remains in sp_values
        result = sparse.fillna(3)
        expected_dense = np.array([1, 3, 3, 3, 3], dtype=np.float64)
        tm.assert_numpy_array_equal(result.to_dense(), expected_dense)

        sparse = SparseArray(data, fill_value=0)
        result = sparse.fillna(3)
        expected = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64)
        tm.assert_sp_array_equal(result, expected)

    def test_nonzero(self):
        # Tests regression #21172.
        expected = np.array([2, 5, 9], dtype=np.int32)
        nan = float("nan")
        inputs = (
            [nan, nan, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0],
            [0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0],
        )
        for data in inputs:
            (positions,) = SparseArray(data).nonzero()
            tm.assert_numpy_array_equal(expected, positions)
class TestSparseArrayAnalytics:
    """Tests for cumsum, ufuncs and size/density properties of SparseArray."""

    @pytest.mark.parametrize(
        "data,expected",
        [
            (
                np.array([1, 2, 3, 4, 5], dtype=float),  # non-null data
                SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0])),
            ),
            (
                np.array([1, 2, np.nan, 4, 5], dtype=float),  # null data
                SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])),
            ),
        ],
    )
    @pytest.mark.parametrize("numpy", [True, False])
    def test_cumsum(self, data, expected, numpy):
        if numpy:
            cumsum = np.cumsum
        else:

            def cumsum(sparse):
                return sparse.cumsum()

        # None is the constructor's default fill_value.
        for fill_value in (None, np.nan, 2):
            out = cumsum(SparseArray(data, fill_value=fill_value))
            tm.assert_sp_array_equal(out, expected)

        if numpy:  # numpy compatibility checks.
            msg = "the 'dtype' parameter is not supported"
            with pytest.raises(ValueError, match=msg):
                np.cumsum(SparseArray(data), dtype=np.int64)

            msg = "the 'out' parameter is not supported"
            with pytest.raises(ValueError, match=msg):
                np.cumsum(SparseArray(data), out=out)
        else:
            axis = 1  # SparseArray currently 1-D, so only axis = 0 is valid.
            msg = re.escape(f"axis(={axis}) out of bounds")
            with pytest.raises(ValueError, match=msg):
                SparseArray(data).cumsum(axis=axis)

    def test_ufunc(self):
        # GH 13853 make sure ufunc is applied to fill_value
        absolute_functions = (abs, np.abs)

        sparse = SparseArray([1, np.nan, 2, np.nan, -2])
        expected = SparseArray([1, np.nan, 2, np.nan, 2])
        for absolute in absolute_functions:
            tm.assert_sp_array_equal(absolute(sparse), expected)

        sparse = SparseArray([1, -1, 2, -2], fill_value=1)
        expected = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1)
        for absolute in absolute_functions:
            tm.assert_sp_array_equal(absolute(sparse), expected)

        sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
        expected = SparseArray([1, 1, 2, 2], fill_value=1)
        for absolute in absolute_functions:
            tm.assert_sp_array_equal(absolute(sparse), expected)

        sparse = SparseArray([1, np.nan, 2, np.nan, -2])
        expected = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))
        tm.assert_sp_array_equal(np.sin(sparse), expected)

        sparse = SparseArray([1, -1, 2, -2], fill_value=1)
        expected = SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1))
        tm.assert_sp_array_equal(np.sin(sparse), expected)

        sparse = SparseArray([1, -1, 0, -2], fill_value=0)
        expected = SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0))
        tm.assert_sp_array_equal(np.sin(sparse), expected)

    def test_ufunc_args(self):
        # GH 13853 make sure ufunc is applied to fill_value, including its arg
        cases = [
            (
                SparseArray([1, np.nan, 2, np.nan, -2]),
                SparseArray([2, np.nan, 3, np.nan, -1]),
            ),
            (
                SparseArray([1, -1, 2, -2], fill_value=1),
                SparseArray([2, 0, 3, -1], fill_value=2),
            ),
            (
                SparseArray([1, -1, 0, -2], fill_value=0),
                SparseArray([2, 0, 1, -1], fill_value=1),
            ),
        ]
        for sparse, expected in cases:
            tm.assert_sp_array_equal(np.add(sparse, 1), expected)

    @pytest.mark.parametrize("fill_value", [0.0, np.nan])
    def test_modf(self, fill_value):
        # https://github.com/pandas-dev/pandas/issues/26946
        sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value)

        sparse_parts = np.modf(sparse)
        dense_parts = np.modf(np.asarray(sparse))

        # np.modf returns a pair (fractional part, integral part).
        assert len(sparse_parts) == 2 and len(dense_parts) == 2
        for sparse_part, dense_part in zip(sparse_parts, dense_parts):
            tm.assert_sp_array_equal(
                sparse_part, SparseArray(dense_part, fill_value=fill_value)
            )

    def test_nbytes_integer(self):
        sparse = SparseArray([1, 0, 0, 0, 2], kind="integer")
        # (2 * 8) + 2 * 4
        assert sparse.nbytes == 24

    def test_nbytes_block(self):
        sparse = SparseArray([1, 2, 0, 0, 0], kind="block")
        # (2 * 8) + 4 + 4
        # sp_values, blocs, blengths
        assert sparse.nbytes == 24

    def test_asarray_datetime64(self):
        # Only checks that the conversion does not raise.
        sparse = SparseArray(pd.to_datetime(["2012", None, None, "2013"]))
        np.asarray(sparse)

    def test_density(self):
        sparse = SparseArray([0, 1])
        assert sparse.density == 0.5

    def test_npoints(self):
        sparse = SparseArray([0, 1])
        assert sparse.npoints == 1
def test_setting_fill_value_fillna_still_works():
    """``isna`` still flags the NaN entry after fill_value is switched to NaN."""
    # This is why letting users update fill_value / dtype is bad
    # astype has the same problem.
    sparse = SparseArray([1.0, np.nan, 1.0], fill_value=0.0)
    sparse.fill_value = np.nan

    # Can't do direct comparison, since the sp_index will be different
    # So let's convert to ndarray and check there.
    na_mask = np.asarray(sparse.isna())
    tm.assert_numpy_array_equal(na_mask, np.array([False, True, False]))
def test_setting_fill_value_updates():
    """Changing fill_value keeps the stored values and index as they were."""
    sparse = SparseArray([0.0, np.nan], fill_value=0)
    sparse.fill_value = np.nan

    # use private constructor to get the index right
    # otherwise both nans would be un-stored.
    stored_values = np.array([np.nan])
    stored_index = IntIndex(2, [1])
    expected = SparseArray._simple_new(
        sparse_array=stored_values,
        sparse_index=stored_index,
        dtype=SparseDtype(float, np.nan),
    )
    tm.assert_sp_array_equal(sparse, expected)
@pytest.mark.parametrize(
    "arr,fill_value,loc",
    [
        ([None, 1, 2], None, 0),
        ([0, None, 2], None, 1),
        ([0, 1, None], None, 2),
        ([0, 1, 1, None, None], None, 3),
        ([1, 1, 1, 2], None, -1),
        ([], None, -1),
        ([None, 1, 0, 0, None, 2], None, 0),
        ([None, 1, 0, 0, None, 2], 1, 1),
        ([None, 1, 0, 0, None, 2], 2, 5),
        ([None, 1, 0, 0, None, 2], 3, -1),
        ([None, 0, 0, 1, 2, 1], 0, 1),
        ([None, 0, 0, 1, 2, 1], 1, 3),
    ],
)
def test_first_fill_value_loc(arr, fill_value, loc):
    """``_first_fill_value_loc`` returns the expected position (-1 in the no-match cases)."""
    sparse = SparseArray(arr, fill_value=fill_value)
    assert sparse._first_fill_value_loc() == loc
@pytest.mark.parametrize(
    "arr",
    [
        [1, 2, np.nan, np.nan],
        [1, np.nan, 2, np.nan],
        [1, 2, np.nan],
        [np.nan, 1, 0, 0, np.nan, 2],
        [np.nan, 0, 0, 1, 2, 1],
    ],
)
@pytest.mark.parametrize("fill_value", [np.nan, 0, 1])
def test_unique_na_fill(arr, fill_value):
    """``SparseArray.unique`` matches ``Series.unique`` on the same data."""
    sparse_unique = SparseArray(arr, fill_value=fill_value).unique()
    series_unique = pd.Series(arr).unique()

    assert isinstance(sparse_unique, SparseArray)
    tm.assert_numpy_array_equal(np.asarray(sparse_unique), series_unique)
def test_unique_all_sparse():
    """``unique`` on an array of two zeros returns a single zero."""
    # https://github.com/pandas-dev/pandas/issues/23168
    all_zero = SparseArray([0, 0])
    tm.assert_sp_array_equal(all_zero.unique(), SparseArray([0]))
def test_map():
    """``SparseArray.map`` accepts a dict, a Series and a callable mapper."""
    arr = SparseArray([0, 1, 2])
    expected = SparseArray([10, 11, 12], fill_value=10)

    # dict
    result = arr.map({0: 10, 1: 11, 2: 12})
    tm.assert_sp_array_equal(result, expected)

    # series
    result = arr.map(pd.Series({0: 10, 1: 11, 2: 12}))
    tm.assert_sp_array_equal(result, expected)

    # function
    # This section previously repeated the Series mapper, so a callable
    # was never actually exercised.
    result = arr.map(lambda x: x + 10)
    tm.assert_sp_array_equal(result, expected)
def test_map_missing():
    """A key absent from the dict mapper is expected to map to a missing value."""
    sparse = SparseArray([0, 1, 2])
    mapped = sparse.map({0: 10, 1: 11})
    tm.assert_sp_array_equal(mapped, SparseArray([10, 11, None], fill_value=10))
@pytest.mark.parametrize("fill_value", [np.nan, 1])
def test_dropna(fill_value):
    """``dropna`` removes the NaN entry from the array and from a frame holding it."""
    # GH-28287
    sparse = SparseArray([np.nan, 1], fill_value=fill_value)
    remaining = SparseArray([1.0], fill_value=fill_value)
    tm.assert_sp_array_equal(sparse.dropna(), remaining)

    frame = pd.DataFrame({"a": [0, 1], "b": sparse})
    expected_frame = pd.DataFrame({"a": [1], "b": remaining}, index=pd.Index([1]))
    tm.assert_equal(frame.dropna(), expected_frame)
def test_drop_duplicates_fill_value():
    """``drop_duplicates`` on an all-zero sparse frame leaves a single row."""
    # GH 11726

    def to_sparse(column):
        return SparseArray(column, fill_value=0)

    frame = pd.DataFrame(np.zeros((5, 5))).apply(to_sparse)
    deduplicated = frame.drop_duplicates()

    expected = pd.DataFrame(
        {label: SparseArray([0.0], fill_value=0) for label in range(5)}
    )
    tm.assert_frame_equal(deduplicated, expected)
def test_zero_sparse_column():
    """Row selection on a frame with an all-zero sparse column keeps the zeros."""
    # GH 27781
    all_zero = pd.DataFrame({"A": SparseArray([0, 0, 0]), "B": [1, 2, 3]})
    one_nonzero = pd.DataFrame({"A": SparseArray([0, 1, 0]), "B": [1, 2, 3]})

    selected = all_zero.loc[all_zero["B"] != 2]

    # Dropping the row with B == 2 removes the only non-zero entry of "A".
    tm.assert_frame_equal(selected, one_nonzero.loc[one_nonzero["B"] != 2])

    explicit = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2])
    tm.assert_frame_equal(selected, explicit)
def test_array_interface(arr_data, arr):
    """Check ``np.asarray`` / ``np.array`` conversion and its copy semantics."""
    # https://github.com/pandas-dev/pandas/pull/60046
    dense = np.asarray(arr)
    tm.assert_numpy_array_equal(dense, arr_data)

    # it always gives a copy by default
    first = np.asarray(arr)
    second = np.asarray(arr)
    assert not np.may_share_memory(first, second)

    # or with explicit copy=True
    first = np.array(arr, copy=True)
    second = np.array(arr, copy=True)
    assert not np.may_share_memory(first, second)

    if not np_version_gt2:
        # copy=False semantics are only supported in NumPy>=2.
        return

    msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        np.array(arr, copy=False)

    # except when there are actually no sparse filled values
    no_fill = SparseArray(np.array([1, 2, 3]))
    view_a = np.array(no_fill, copy=False)
    view_b = np.array(no_fill, copy=False)
    assert np.may_share_memory(view_a, view_b)

View File

@ -0,0 +1,133 @@
import numpy as np
import pytest
from pandas._libs.sparse import IntIndex
from pandas import (
SparseDtype,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestAstype:
    """Tests for ``SparseArray.astype``."""

    def test_astype(self):
        """Cast a float sparse array to float32, float64 and int64 sparse dtypes."""
        # float -> float
        source = SparseArray([None, None, 0, 2])
        as_float32 = source.astype("Sparse[float32]")
        tm.assert_sp_array_equal(
            as_float32, SparseArray([None, None, 0, 2], dtype=np.dtype("float32"))
        )

        float_dtype = SparseDtype("float64", fill_value=0)
        as_float64 = source.astype(float_dtype)
        expected = SparseArray._simple_new(
            np.array([0.0, 2.0], dtype=float_dtype.subtype),
            IntIndex(4, [2, 3]),
            float_dtype,
        )
        tm.assert_sp_array_equal(as_float64, expected)

        int_dtype = SparseDtype("int64", 0)
        as_int64 = source.astype(int_dtype)
        expected = SparseArray._simple_new(
            np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), int_dtype
        )
        tm.assert_sp_array_equal(as_int64, expected)

        # A NaN stored among the sparse values cannot become an integer.
        with_nan = SparseArray([0, np.nan, 0, 1], fill_value=0)
        with pytest.raises(ValueError, match="NA"):
            with_nan.astype("Sparse[i8]")

    def test_astype_bool(self):
        """Cast to plain ``bool`` and to a sparse bool dtype."""
        ints = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
        dense_bool = ints.astype(bool)
        tm.assert_numpy_array_equal(dense_bool, np.array([1, 0, 0, 1], dtype=bool))

        # update fill value
        sparse_bool = ints.astype(SparseDtype(bool, False))
        expected = SparseArray(
            [True, False, False, True], dtype=SparseDtype(bool, False)
        )
        tm.assert_sp_array_equal(sparse_bool, expected)

    def test_astype_all(self, any_real_numpy_dtype):
        """Casting to a real numpy dtype matches casting the dense values."""
        dense = np.array([1, 2, 3])
        sparse = SparseArray(dense, fill_value=1)
        converted = sparse.astype(np.dtype(any_real_numpy_dtype))
        tm.assert_numpy_array_equal(converted, dense.astype(any_real_numpy_dtype))

    @pytest.mark.parametrize(
        "arr, dtype, expected",
        [
            (
                SparseArray([0, 1]),
                "float",
                SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)),
            ),
            (SparseArray([0, 1]), bool, SparseArray([False, True])),
            (
                SparseArray([0, 1], fill_value=1),
                bool,
                SparseArray([False, True], dtype=SparseDtype(bool, True)),
            ),
            pytest.param(
                SparseArray([0, 1]),
                "datetime64[ns]",
                SparseArray(
                    np.array([0, 1], dtype="datetime64[ns]"),
                    dtype=SparseDtype("datetime64[ns]", Timestamp("1970")),
                ),
            ),
            (
                SparseArray([0, 1, 10]),
                np.str_,
                SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")),
            ),
            (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])),
            (
                SparseArray([0, 1, 0]),
                object,
                SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)),
            ),
        ],
    )
    def test_astype_more(self, arr, dtype, expected):
        """Cast via ``SparseDtype.update_dtype`` for several target dtypes."""
        target = arr.dtype.update_dtype(dtype)
        tm.assert_sp_array_equal(arr.astype(target), expected)

    def test_astype_nan_raises(self):
        """Casting an array containing NaN to ``int`` raises ``ValueError``."""
        with_nan = SparseArray([1.0, np.nan])
        with pytest.raises(ValueError, match="Cannot convert non-finite"):
            with_nan.astype(int)

    def test_astype_copy_false(self):
        """``copy=False`` still yields correctly converted values."""
        # GH#34456 bug caused by using .view instead of .astype in astype_nansafe
        ints = SparseArray([1, 2, 3])
        converted = ints.astype(SparseDtype(float, 0), copy=False)
        tm.assert_sp_array_equal(
            converted, SparseArray([1.0, 2.0, 3.0], fill_value=0.0)
        )

    def test_astype_dt64_to_int64(self):
        """datetime64 -> int64 matches what the dense values give."""
        # GH#49631 match non-sparse behavior
        values = np.array(["NaT", "2016-01-02", "2016-01-03"], dtype="M8[ns]")
        as_ints = values.astype("int64")

        sparse = SparseArray(values)
        tm.assert_numpy_array_equal(sparse.astype("int64"), as_ints)

        # we should also be able to cast to equivalent Sparse[int64]
        dtype_int64 = SparseDtype("int64", np.iinfo(np.int64).min)
        sparse_ints = sparse.astype(dtype_int64)
        tm.assert_numpy_array_equal(sparse_ints.to_numpy(), as_ints)

        # GH#50087 we should match the non-sparse behavior regardless of
        # if we have a fill_value other than NaT
        non_nat_fill = SparseDtype("datetime64[ns]", values[1])
        sparse_non_nat = SparseArray(values, dtype=non_nat_fill)
        tm.assert_numpy_array_equal(sparse_non_nat.astype("int64"), as_ints)

View File

@ -0,0 +1,62 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestSparseArrayConcat:
    """Tests for ``SparseArray._concat_same_type``."""

    @pytest.mark.parametrize("kind", ["integer", "block"])
    def test_basic(self, kind):
        """Concatenating two arrays of one kind keeps the values and the kind."""
        left = SparseArray([1, 0, 0, 2], kind=kind)
        right = SparseArray([1, 0, 2, 2], kind=kind)
        combined = SparseArray._concat_same_type([left, right])

        # Only the stored values are checked: no assertion is made about the
        # sparse index itself because sparse blocks are not merged across the
        # arrays being concatenated.
        stored = np.array([1, 2, 1, 2, 2], dtype="int64")
        tm.assert_numpy_array_equal(combined.sp_values, stored)
        assert combined.kind == kind

    @pytest.mark.parametrize("kind", ["integer", "block"])
    def test_uses_first_kind(self, kind):
        """With mixed kinds the result takes the kind of the first array."""
        other = "block" if kind != "block" else "integer"
        left = SparseArray([1, 0, 0, 2], kind=kind)
        right = SparseArray([1, 0, 2, 2], kind=other)
        combined = SparseArray._concat_same_type([left, right])

        stored = np.array([1, 2, 1, 2, 2], dtype="int64")
        tm.assert_numpy_array_equal(combined.sp_values, stored)
        assert combined.kind == kind
@pytest.mark.parametrize(
    "other, expected_dtype",
    [
        # compatible dtype -> preserve sparse
        (pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)),
        # (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)),
        # incompatible dtype -> Sparse[common dtype]
        (pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)),
        # incompatible dtype -> Sparse[object] dtype
        (pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)),
        # categorical with compatible categories -> dtype of the categories
        (pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")),
        (pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")),
        # categorical with incompatible categories -> object dtype
        (pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)),
    ],
)
def test_concat_with_non_sparse(other, expected_dtype):
    """``pd.concat`` of a sparse Series with a non-sparse one, in both orders."""
    # https://github.com/pandas-dev/pandas/issues/34336
    s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0))

    # sparse first, then non-sparse first
    for first, second in ((s_sparse, other), (other, s_sparse)):
        combined = pd.concat([first, second], ignore_index=True)
        wanted = pd.Series(list(first) + list(second)).astype(expected_dtype)
        tm.assert_series_equal(combined, wanted)

View File

@ -0,0 +1,285 @@
import numpy as np
import pytest
from pandas._libs.sparse import IntIndex
import pandas as pd
from pandas import (
SparseDtype,
isna,
)
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestConstructors:
def test_constructor_dtype(self):
arr = SparseArray([np.nan, 1, 2, np.nan])
assert arr.dtype == SparseDtype(np.float64, np.nan)
assert arr.dtype.subtype == np.float64
assert np.isnan(arr.fill_value)
arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0)
assert arr.dtype == SparseDtype(np.float64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], dtype=np.float64)
assert arr.dtype == SparseDtype(np.float64, np.nan)
assert np.isnan(arr.fill_value)
arr = SparseArray([0, 1, 2, 4], dtype=np.int64)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], dtype=None)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
def test_constructor_dtype_str(self):
result = SparseArray([1, 2, 3], dtype="int")
expected = SparseArray([1, 2, 3], dtype=int)
tm.assert_sp_array_equal(result, expected)
def test_constructor_sparse_dtype(self):
result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1))
expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64)
tm.assert_sp_array_equal(result, expected)
assert result.sp_values.dtype == np.dtype("int64")
def test_constructor_sparse_dtype_str(self):
result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]")
expected = SparseArray([1, 0, 0, 1], dtype=np.int32)
tm.assert_sp_array_equal(result, expected)
assert result.sp_values.dtype == np.dtype("int32")
def test_constructor_object_dtype(self):
# GH#11856
arr = SparseArray(["A", "A", np.nan, "B"], dtype=object)
assert arr.dtype == SparseDtype(object)
assert np.isnan(arr.fill_value)
arr = SparseArray(["A", "A", np.nan, "B"], dtype=object, fill_value="A")
assert arr.dtype == SparseDtype(object, "A")
assert arr.fill_value == "A"
def test_constructor_object_dtype_bool_fill(self):
# GH#17574
data = [False, 0, 100.0, 0.0]
arr = SparseArray(data, dtype=object, fill_value=False)
assert arr.dtype == SparseDtype(object, False)
assert arr.fill_value is False
arr_expected = np.array(data, dtype=object)
it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected))
assert np.fromiter(it, dtype=np.bool_).all()
@pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int])
def test_constructor_na_dtype(self, dtype):
with pytest.raises(ValueError, match="Cannot convert"):
SparseArray([0, 1, np.nan], dtype=dtype)
def test_constructor_warns_when_losing_timezone(self):
# GH#32501 warn when losing timezone information
dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]"))
with tm.assert_produces_warning(UserWarning):
result = SparseArray(dti)
tm.assert_sp_array_equal(result, expected)
with tm.assert_produces_warning(UserWarning):
result = SparseArray(pd.Series(dti))
tm.assert_sp_array_equal(result, expected)
def test_constructor_spindex_dtype(self):
arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]))
# TODO: actionable?
# XXX: Behavior change: specifying SparseIndex no longer changes the
# fill_value
expected = SparseArray([0, 1, 2, 0], kind="integer")
tm.assert_sp_array_equal(arr, expected)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(
data=[1, 2, 3],
sparse_index=IntIndex(4, [1, 2, 3]),
dtype=np.int64,
fill_value=0,
)
exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(
data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64
)
exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(
data=[1, 2, 3],
sparse_index=IntIndex(4, [1, 2, 3]),
dtype=None,
fill_value=0,
)
exp = SparseArray([0, 1, 2, 3], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
@pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])])
def test_constructor_spindex_dtype_scalar(self, sparse_index):
# scalar input
msg = "Constructing SparseArray with scalar data is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None)
exp = SparseArray([1], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
with tm.assert_produces_warning(FutureWarning, match=msg):
arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None)
exp = SparseArray([1], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
def test_constructor_spindex_dtype_scalar_broadcasts(self):
arr = SparseArray(
data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None
)
exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
@pytest.mark.parametrize(
"data, fill_value",
[
(np.array([1, 2]), 0),
(np.array([1.0, 2.0]), np.nan),
([True, False], False),
([pd.Timestamp("2017-01-01")], pd.NaT),
],
)
def test_constructor_inferred_fill_value(self, data, fill_value):
result = SparseArray(data).fill_value
if isna(fill_value):
assert isna(result)
else:
assert result == fill_value
@pytest.mark.parametrize("format", ["coo", "csc", "csr"])
@pytest.mark.parametrize("size", [0, 10])
def test_from_spmatrix(self, size, format):
sp_sparse = pytest.importorskip("scipy.sparse")
mat = sp_sparse.random(size, 1, density=0.5, format=format)
result = SparseArray.from_spmatrix(mat)
result = np.asarray(result)
expected = mat.toarray().ravel()
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("format", ["coo", "csc", "csr"])
def test_from_spmatrix_including_explicit_zero(self, format):
sp_sparse = pytest.importorskip("scipy.sparse")
mat = sp_sparse.random(10, 1, density=0.5, format=format)
mat.data[0] = 0
result = SparseArray.from_spmatrix(mat)
result = np.asarray(result)
expected = mat.toarray().ravel()
tm.assert_numpy_array_equal(result, expected)
def test_from_spmatrix_raises(self):
sp_sparse = pytest.importorskip("scipy.sparse")
mat = sp_sparse.eye(5, 4, format="csc")
with pytest.raises(ValueError, match="not '4'"):
SparseArray.from_spmatrix(mat)
def test_constructor_from_too_large_array(self):
with pytest.raises(TypeError, match="expected dimension <= 1 data"):
SparseArray(np.arange(10).reshape((2, 5)))
def test_constructor_from_sparse(self):
zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)
res = SparseArray(zarr)
assert res.fill_value == 0
tm.assert_almost_equal(res.sp_values, zarr.sp_values)
def test_constructor_copy(self):
arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])
arr = SparseArray(arr_data)
cp = SparseArray(arr, copy=True)
cp.sp_values[:3] = 0
assert not (arr.sp_values[:3] == 0).any()
not_copy = SparseArray(arr)
not_copy.sp_values[:3] = 0
assert (arr.sp_values[:3] == 0).all()
def test_constructor_bool(self):
# GH#10648
data = np.array([False, False, True, True, False, False])
arr = SparseArray(data, fill_value=False, dtype=bool)
assert arr.dtype == SparseDtype(bool)
tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True]))
# Behavior change: np.asarray densifies.
# tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr))
tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32))
dense = arr.to_dense()
assert dense.dtype == bool
tm.assert_numpy_array_equal(dense, data)
def test_constructor_bool_fill_value(self):
arr = SparseArray([True, False, True], dtype=None)
assert arr.dtype == SparseDtype(np.bool_)
assert not arr.fill_value
arr = SparseArray([True, False, True], dtype=np.bool_)
assert arr.dtype == SparseDtype(np.bool_)
assert not arr.fill_value
arr = SparseArray([True, False, True], dtype=np.bool_, fill_value=True)
assert arr.dtype == SparseDtype(np.bool_, True)
assert arr.fill_value
def test_constructor_float32(self):
# GH#10648
data = np.array([1.0, np.nan, 3], dtype=np.float32)
arr = SparseArray(data, dtype=np.float32)
assert arr.dtype == SparseDtype(np.float32)
tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32))
# Behavior change: np.asarray densifies.
# tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr))
tm.assert_numpy_array_equal(
arr.sp_index.indices, np.array([0, 2], dtype=np.int32)
)
dense = arr.to_dense()
assert dense.dtype == np.float32
tm.assert_numpy_array_equal(dense, data)

View File

@ -0,0 +1,224 @@
import re
import warnings
import numpy as np
import pytest
import pandas as pd
from pandas import SparseDtype
@pytest.mark.parametrize(
    "dtype, fill_value",
    [
        ("int", 0),
        ("float", np.nan),
        ("bool", False),
        ("object", np.nan),
        ("datetime64[ns]", np.datetime64("NaT", "ns")),
        ("timedelta64[ns]", np.timedelta64("NaT", "ns")),
    ],
)
def test_inferred_dtype(dtype, fill_value):
    """``SparseDtype(dtype)`` picks the expected default fill value."""
    inferred = SparseDtype(dtype).fill_value

    if not pd.isna(fill_value):
        assert inferred == fill_value
        return

    # For missing-value fills the exact NA type matters (NaN vs. NaT flavours).
    assert pd.isna(inferred)
    assert type(inferred) == type(fill_value)
def test_from_sparse_dtype():
    """Wrapping an existing ``SparseDtype`` keeps its fill value."""
    rewrapped = SparseDtype(SparseDtype("float", 0))
    assert rewrapped.fill_value == 0
def test_from_sparse_dtype_fill_value():
    """An explicit ``fill_value`` overrides the wrapped dtype's fill value."""
    original = SparseDtype("int", 1)
    overridden = SparseDtype(original, fill_value=2)
    assert overridden == SparseDtype("int", 2)
@pytest.mark.parametrize(
    "dtype, fill_value",
    [
        ("int", None),
        ("float", None),
        ("bool", None),
        ("object", None),
        ("datetime64[ns]", None),
        ("timedelta64[ns]", None),
        ("int", np.nan),
        ("float", 0),
    ],
)
def test_equal(dtype, fill_value):
    """Two dtypes built from the same arguments compare equal both ways."""
    first = SparseDtype(dtype, fill_value)
    second = SparseDtype(dtype, fill_value)
    assert first == second
    assert second == first
def test_nans_equal():
    """``float("nan")`` and ``np.nan`` fill values give equal dtypes."""
    builtin_nan = SparseDtype(float, float("nan"))
    numpy_nan = SparseDtype(float, np.nan)
    assert builtin_nan == numpy_nan
    assert numpy_nan == builtin_nan
# Pairs of dtypes that must compare unequal (consumed by ``test_not_equal``).
# They are built while the fill_value deprecation warning is silenced.
with warnings.catch_warnings():
    msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
    warnings.filterwarnings(action="ignore", message=msg, category=FutureWarning)

    tups = [
        (SparseDtype("float64"), SparseDtype("float32")),
        (SparseDtype("float64"), SparseDtype("float64", 0)),
        (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
        (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
        (SparseDtype("float64"), np.dtype("float64")),
    ]
@pytest.mark.parametrize(
    "a, b",
    tups,
)
def test_not_equal(a, b):
    """Each pair in ``tups`` compares unequal."""
    assert a != b
def test_construct_from_string_raises():
    """An unparseable string raises ``TypeError`` naming the bad input."""
    expected_msg = "Cannot construct a 'SparseDtype' from 'not a dtype'"
    with pytest.raises(TypeError, match=expected_msg):
        SparseDtype.construct_from_string("not a dtype")
@pytest.mark.parametrize(
    "dtype, expected",
    [
        (SparseDtype(int), True),
        (SparseDtype(float), True),
        (SparseDtype(bool), True),
        (SparseDtype(object), False),
        (SparseDtype(str), False),
    ],
)
def test_is_numeric(dtype, expected):
    """``_is_numeric`` is exactly ``True`` or ``False`` as listed per subtype."""
    is_numeric = dtype._is_numeric
    assert is_numeric is expected
def test_str_uses_object():
    """``SparseDtype(str)`` stores ``object`` as its subtype."""
    subtype = SparseDtype(str).subtype
    assert subtype == np.dtype("object")
@pytest.mark.parametrize(
    "string, expected",
    [
        ("Sparse[float64]", SparseDtype(np.dtype("float64"))),
        ("Sparse[float32]", SparseDtype(np.dtype("float32"))),
        ("Sparse[int]", SparseDtype(np.dtype("int"))),
        ("Sparse[str]", SparseDtype(np.dtype("str"))),
        ("Sparse[datetime64[ns]]", SparseDtype(np.dtype("datetime64[ns]"))),
        ("Sparse", SparseDtype(np.dtype("float"), np.nan)),
    ],
)
def test_construct_from_string(string, expected):
    """``construct_from_string`` parses ``"Sparse[...]"`` spellings."""
    parsed = SparseDtype.construct_from_string(string)
    assert parsed == expected
@pytest.mark.parametrize(
    "a, b, expected",
    [
        (SparseDtype(float, 0.0), SparseDtype(np.dtype("float"), 0.0), True),
        (SparseDtype(int, 0), SparseDtype(int, 0), True),
        (SparseDtype(float, float("nan")), SparseDtype(float, np.nan), True),
        (SparseDtype(float, 0), SparseDtype(float, np.nan), False),
        (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False),
    ],
)
def test_hash_equal(a, b, expected):
    """Equality and hash equality both match ``expected`` for each pair."""
    assert (a == b) is expected
    assert (hash(a) == hash(b)) is expected
@pytest.mark.parametrize(
    "string, expected",
    [
        ("Sparse[int]", "int"),
        ("Sparse[int, 0]", "int"),
        ("Sparse[int64]", "int64"),
        ("Sparse[int64, 0]", "int64"),
        ("Sparse[datetime64[ns], 0]", "datetime64[ns]"),
    ],
)
def test_parse_subtype(string, expected):
    """``_parse_subtype`` returns the subtype name as its first element."""
    parsed = SparseDtype._parse_subtype(string)
    subtype, _ = parsed
    assert subtype == expected
@pytest.mark.parametrize(
    "string", ["Sparse[int, 1]", "Sparse[float, 0.0]", "Sparse[bool, True]"]
)
def test_construct_from_string_fill_value_raises(string):
    """Each listed string (carrying a fill value) raises ``TypeError``."""
    expected_msg = "fill_value in the string is not"
    with pytest.raises(TypeError, match=expected_msg):
        SparseDtype.construct_from_string(string)
@pytest.mark.parametrize(
    "original, dtype, expected",
    [
        (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
        (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
        (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")),
        (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
    ],
)
def test_update_dtype(original, dtype, expected):
    """``update_dtype`` converts both the subtype and the fill value."""
    updated = original.update_dtype(dtype)
    assert updated == expected
@pytest.mark.parametrize(
    "original, dtype, expected_error_msg",
    [
        (
            SparseDtype(float, np.nan),
            int,
            re.escape("Cannot convert non-finite values (NA or inf) to integer"),
        ),
        (
            SparseDtype(str, "abc"),
            int,
            r"invalid literal for int\(\) with base 10: ('abc'|np\.str_\('abc'\))",
        ),
    ],
)
def test_update_dtype_raises(original, dtype, expected_error_msg):
    """``update_dtype`` raises ``ValueError`` when the fill value cannot convert."""
    raises_value_error = pytest.raises(ValueError, match=expected_error_msg)
    with raises_value_error:
        original.update_dtype(dtype)
def test_repr():
    """``str(SparseDtype)`` shows the subtype and the repr of the fill value."""
    # GH-34352
    int_dtype = SparseDtype("int64", fill_value=0)
    assert str(int_dtype) == "Sparse[int64, 0]"

    object_dtype = SparseDtype(object, fill_value="0")
    assert str(object_dtype) == "Sparse[object, '0']"
def test_sparse_dtype_subtype_must_be_numpy_dtype():
# GH#53160
msg = "SparseDtype subtype must be a numpy dtype"
with pytest.raises(TypeError, match=msg):
SparseDtype("category", fill_value="c")

View File

@ -0,0 +1,302 @@
import numpy as np
import pytest
import pandas as pd
from pandas import SparseDtype
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
@pytest.fixture
def arr_data():
    """Fixture returning a length-10 float array with four NaN entries."""
    values = [np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]
    return np.array(values)
@pytest.fixture
def arr(arr_data):
    """Fixture returning ``arr_data`` wrapped in a ``SparseArray``."""
    sparse = SparseArray(arr_data)
    return sparse
class TestGetitem:
    """Tests for ``SparseArray.__getitem__`` with scalars, slices and masks."""

    def test_getitem(self, arr):
        """Scalar lookups agree with the dense array, forwards and backwards."""
        dense = arr.to_dense()
        for pos, element in enumerate(arr):
            tm.assert_almost_equal(element, dense[pos])
            tm.assert_almost_equal(arr[-pos], dense[-pos])

    def test_getitem_arraylike_mask(self, arr):
        """A plain list of booleans works as a mask."""
        arr = SparseArray([0, 1, 2])
        picked = arr[[True, False, True]]
        tm.assert_sp_array_equal(picked, SparseArray([0, 2]))

    @pytest.mark.parametrize(
        "slc",
        [
            np.s_[:],
            np.s_[1:10],
            np.s_[1:100],
            np.s_[10:1],
            np.s_[:-3],
            np.s_[-5:-4],
            np.s_[:-12],
            np.s_[-12:],
            np.s_[2:],
            np.s_[2::3],
            np.s_[::2],
            np.s_[::-1],
            np.s_[::-2],
            np.s_[1:6:2],
            np.s_[:-6:-2],
        ],
    )
    @pytest.mark.parametrize(
        "as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []]
    )
    def test_getslice(self, slc, as_dense):
        """Slicing the sparse array equals sparsifying the sliced dense data."""
        as_dense = np.array(as_dense)
        sliced = SparseArray(as_dense)[slc]
        tm.assert_sp_array_equal(sliced, SparseArray(as_dense[slc]))

    def test_getslice_tuple(self):
        """A one-element tuple of slices works; two indices raise like numpy."""
        dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0])
        key = (slice(4, None),)

        sparse = SparseArray(dense)
        tm.assert_sp_array_equal(sparse[key], SparseArray(dense[4:]))

        sparse = SparseArray(dense, fill_value=0)
        tm.assert_sp_array_equal(sparse[key], SparseArray(dense[4:], fill_value=0))

        msg = "too many indices for array"
        with pytest.raises(IndexError, match=msg):
            sparse[4:, :]

        with pytest.raises(IndexError, match=msg):
            # check numpy compat
            dense[4:, :]

    def test_boolean_slice_empty(self):
        """An all-False mask keeps the dtype."""
        arr = SparseArray([0, 1, 2])
        empty = arr[[False, False, False]]
        assert empty.dtype == arr.dtype

    def test_getitem_bool_sparse_array(self, arr):
        """Boolean ``SparseArray`` masks, including an inverted one and one with NaN."""
        # GH 23122
        mask = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True)
        tm.assert_sp_array_equal(arr[mask], SparseArray([np.nan, 2, np.nan, 5, 6]))

        mask = ~mask
        tm.assert_sp_array_equal(arr[mask], SparseArray([np.nan, 1, 3, 4, np.nan]))

        mask = SparseArray(
            [False, True, np.nan] * 3, dtype=np.bool_, fill_value=np.nan
        )
        tm.assert_sp_array_equal(arr[mask], SparseArray([np.nan, 3, 5]))

    def test_getitem_bool_sparse_array_as_comparison(self):
        """The result of a comparison can be used directly as a mask."""
        # GH 45110
        arr = SparseArray([1, 2, 3, 4, np.nan, np.nan], fill_value=np.nan)
        above_two = arr[arr > 2]
        tm.assert_sp_array_equal(
            above_two, SparseArray([3.0, 4.0], fill_value=np.nan)
        )

    def test_get_item(self, arr):
        """Scalar access for NaN-filled and zero-filled arrays, plus bounds errors."""
        zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)

        assert np.isnan(arr[1])
        assert arr[2] == 1
        assert arr[7] == 5

        assert zarr[0] == 0
        assert zarr[2] == 1
        assert zarr[7] == 5

        errmsg = "must be an integer between -10 and 10"
        for out_of_range in (11, -11):
            with pytest.raises(IndexError, match=errmsg):
                arr[out_of_range]

        assert arr[-1] == arr[len(arr) - 1]
class TestSetitem:
    """Tests for item assignment on ``SparseArray``."""

    def test_set_item(self, arr_data):
        """Scalar and slice assignment both raise ``TypeError``."""
        arr = SparseArray(arr_data).copy()

        with pytest.raises(TypeError, match="assignment via setitem"):
            arr[5] = 3

        with pytest.raises(TypeError, match="assignment via setitem"):
            arr[1:5] = 2
class TestTake:
    """Tests for ``SparseArray.take``."""

    def test_take_scalar_raises(self, arr):
        """A scalar ``indices`` argument raises ``ValueError``."""
        msg = "'indices' must be an array, not a scalar '2'."
        with pytest.raises(ValueError, match=msg):
            arr.take(2)

    def test_take(self, arr_data, arr):
        """``take`` agrees with ``np.take`` on the dense data."""
        for indices in ([2, 3], [0, 1, 2]):
            wanted = SparseArray(np.take(arr_data, indices))
            tm.assert_sp_array_equal(arr.take(indices), wanted)

    def test_take_all_empty(self):
        """Taking from an array that stores only fill values."""
        sparse = pd.array([0, 0], dtype=SparseDtype("int64"))
        taken = sparse.take([0, 1], allow_fill=True, fill_value=np.nan)
        tm.assert_sp_array_equal(sparse, taken)

    def test_take_different_fill_value(self):
        """The ``fill_value`` passed to ``take`` does not replace the dtype's."""
        # Take with a different fill value shouldn't overwrite the original
        sparse = pd.array([0.0], dtype=SparseDtype("float64", fill_value=0.0))
        taken = sparse.take([0, -1], allow_fill=True, fill_value=np.nan)
        wanted = pd.array([0, np.nan], dtype=sparse.dtype)
        tm.assert_sp_array_equal(wanted, taken)

    def test_take_fill_value(self):
        """``take`` on a zero-filled array agrees with ``np.take``."""
        data = np.array([1, np.nan, 0, 3, 0])
        sparse = SparseArray(data, fill_value=0)

        for indices in ([0], [1, 3, 4]):
            wanted = SparseArray(np.take(data, indices), fill_value=0)
            tm.assert_sp_array_equal(sparse.take(indices), wanted)

    def test_take_negative(self, arr_data, arr):
        """Negative indices count from the end, as in ``np.take``."""
        for indices in ([-1], [-4, -3, -2]):
            wanted = SparseArray(np.take(arr_data, indices))
            tm.assert_sp_array_equal(arr.take(indices), wanted)

    def test_bad_take(self, arr):
        """An index past the end raises ``IndexError``."""
        with pytest.raises(IndexError, match="bounds"):
            arr.take([11])

    def test_take_filling(self):
        """``allow_fill`` semantics for a NaN-filled array."""
        # similar tests as GH 12631
        sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4])
        indexer = np.array([1, 0, -1])

        taken = sparse.take(indexer)
        tm.assert_sp_array_equal(taken, SparseArray([np.nan, np.nan, 4]))

        # TODO: actionable?
        # XXX: test change: fill_value=True -> allow_fill=True
        taken = sparse.take(indexer, allow_fill=True)
        tm.assert_sp_array_equal(taken, SparseArray([np.nan, np.nan, np.nan]))

        # allow_fill=False
        taken = sparse.take(indexer, allow_fill=False, fill_value=True)
        tm.assert_sp_array_equal(taken, SparseArray([np.nan, np.nan, 4]))

        # With allow_fill=True only -1 is accepted among negative indices.
        msg = "Invalid value in 'indices'"
        with pytest.raises(ValueError, match=msg):
            sparse.take(np.array([1, 0, -2]), allow_fill=True)

        with pytest.raises(ValueError, match=msg):
            sparse.take(np.array([1, 0, -5]), allow_fill=True)

        msg = "out of bounds value in 'indices'"
        with pytest.raises(IndexError, match=msg):
            sparse.take(np.array([1, -6]))

        with pytest.raises(IndexError, match=msg):
            sparse.take(np.array([1, 5]))

        with pytest.raises(IndexError, match=msg):
            sparse.take(np.array([1, 5]), allow_fill=True)

    def test_take_filling_fill_value(self):
        """``allow_fill`` semantics for a zero-filled array."""
        # same tests as GH#12631
        sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0)
        indexer = np.array([1, 0, -1])

        taken = sparse.take(indexer)
        tm.assert_sp_array_equal(taken, SparseArray([0, np.nan, 4], fill_value=0))

        # fill_value
        taken = sparse.take(indexer, allow_fill=True)
        # TODO: actionable?
        # XXX: behavior change.
        # the old way of filling self.fill_value doesn't follow EA rules.
        # It's supposed to be self.dtype.na_value (nan in this case)
        tm.assert_sp_array_equal(
            taken, SparseArray([0, np.nan, np.nan], fill_value=0)
        )

        # allow_fill=False
        taken = sparse.take(indexer, allow_fill=False, fill_value=True)
        tm.assert_sp_array_equal(taken, SparseArray([0, np.nan, 4], fill_value=0))

        msg = "Invalid value in 'indices'."
        with pytest.raises(ValueError, match=msg):
            sparse.take(np.array([1, 0, -2]), allow_fill=True)

        with pytest.raises(ValueError, match=msg):
            sparse.take(np.array([1, 0, -5]), allow_fill=True)

        msg = "out of bounds value in 'indices'"
        with pytest.raises(IndexError, match=msg):
            sparse.take(np.array([1, -6]))

        with pytest.raises(IndexError, match=msg):
            sparse.take(np.array([1, 5]))

        with pytest.raises(IndexError, match=msg):
            sparse.take(np.array([1, 5]), fill_value=True)

    @pytest.mark.parametrize("kind", ["block", "integer"])
    def test_take_filling_all_nan(self, kind):
        """``take`` on an all-NaN array, for both index kinds."""
        sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan], kind=kind)
        indexer = np.array([1, 0, -1])

        taken = sparse.take(indexer)
        tm.assert_sp_array_equal(
            taken, SparseArray([np.nan, np.nan, np.nan], kind=kind)
        )

        taken = sparse.take(indexer, fill_value=True)
        tm.assert_sp_array_equal(
            taken, SparseArray([np.nan, np.nan, np.nan], kind=kind)
        )

        msg = "out of bounds value in 'indices'"
        with pytest.raises(IndexError, match=msg):
            sparse.take(np.array([1, -6]))

        with pytest.raises(IndexError, match=msg):
            sparse.take(np.array([1, 5]))

        with pytest.raises(IndexError, match=msg):
            sparse.take(np.array([1, 5]), fill_value=True)
class TestWhere:
    """Tests for ``SparseArray._where`` and ``Series.where`` on sparse data."""

    def test_where_retain_fill_value(self):
        """The fill value survives ``_where`` and ``Series.where``."""
        # GH#45691 don't lose fill_value on _where
        arr = SparseArray([np.nan, 1.0], fill_value=0)
        keep = ~np.array([True, False])
        wanted = SparseArray([1, 1.0], fill_value=0)

        tm.assert_sp_array_equal(arr._where(keep, 1), wanted)

        replaced = pd.Series(arr).where(keep, 1)
        tm.assert_series_equal(replaced, pd.Series(wanted))

View File

@ -0,0 +1,551 @@
import operator
import numpy as np
import pytest
import pandas._libs.sparse as splib
import pandas.util._test_decorators as td
from pandas import Series
import pandas._testing as tm
from pandas.core.arrays.sparse import (
BlockIndex,
IntIndex,
make_sparse_index,
)
@pytest.fixture
def test_length():
    """Fixture returning the index length (20) shared by the tests that request it."""
    length = 20
    return length
@pytest.fixture(
    params=[
        [
            [0, 7, 15],
            [3, 5, 5],
            [2, 9, 14],
            [2, 3, 5],
            [2, 9, 15],
            [1, 3, 4],
        ],
        [
            [0, 5],
            [4, 4],
            [1],
            [4],
            [1],
            [3],
        ],
        [
            [0],
            [10],
            [0, 5],
            [3, 7],
            [0, 5],
            [3, 5],
        ],
        [
            [10],
            [5],
            [0, 12],
            [5, 3],
            [12],
            [3],
        ],
        [
            [0, 10],
            [4, 6],
            [5, 17],
            [4, 2],
            [],
            [],
        ],
        [
            [0],
            [5],
            [],
            [],
            [],
            [],
        ],
    ],
    ids=[
        "plain_case",
        "delete_blocks",
        "split_blocks",
        "skip_block",
        "no_intersect",
        "one_empty",
    ],
)
def cases(request):
    """Fixture yielding one six-list scenario per parametrized id.

    Each scenario is handed back unchanged; consumers unpack it as
    ``xloc, xlen, yloc, ylen, eloc, elen``.
    """
    scenario = request.param
    return scenario
class TestSparseIndexUnion:
    """Tests for ``make_union`` on ``BlockIndex`` and ``IntIndex``."""

    @pytest.mark.parametrize(
        "xloc, xlen, yloc, ylen, eloc, elen",
        [
            [[0], [5], [5], [4], [0], [9]],
            [[0, 10], [5, 5], [2, 17], [5, 2], [0, 10, 17], [7, 5, 2]],
            [[1], [5], [3], [5], [1], [7]],
            [[2, 10], [4, 4], [4], [8], [2], [12]],
            [[0, 5], [3, 5], [0], [7], [0], [10]],
            [[2, 10], [4, 4], [4, 13], [8, 4], [2], [15]],
            [[2], [15], [4, 9, 14], [3, 2, 2], [2], [15]],
            [[0, 10], [3, 3], [5, 15], [2, 2], [0, 5, 10, 15], [3, 2, 3, 2]],
        ],
    )
    def test_index_make_union(self, xloc, xlen, yloc, ylen, eloc, elen, test_length):
        """Block and integer indexes produce the same union."""
        # Parametrized layouts, in order (x and y are the inputs, r the union):
        #   1. y starts exactly where x ends        -> one merged block
        #   2. y overlaps the first x block and adds a separate trailing block
        #   3. x and y overlap partially            -> one merged block
        #   4. y bridges the gap between two x blocks
        #   5. y covers the first x block and reaches into the second
        #   6. y bridges both x blocks and extends past the second
        #   7. x already covers every y block       -> r equals x
        #   8. x and y blocks interleave without touching
        xindex = BlockIndex(test_length, xloc, xlen)
        yindex = BlockIndex(test_length, yloc, ylen)

        block_union = xindex.make_union(yindex)
        assert isinstance(block_union, BlockIndex)
        tm.assert_numpy_array_equal(
            block_union.blocs, np.array(eloc, dtype=np.int32)
        )
        tm.assert_numpy_array_equal(
            block_union.blengths, np.array(elen, dtype=np.int32)
        )

        # Same union computed through the integer-index representation.
        int_union = xindex.to_int_index().make_union(yindex.to_int_index())
        assert isinstance(int_union, IntIndex)
        tm.assert_numpy_array_equal(
            int_union.indices, block_union.to_int_index().indices
        )

    def test_int_index_make_union(self):
        """Union of ``IntIndex`` pairs, plus the length-mismatch error."""
        # (left indices, right indices, expected union), all of length 5
        scenarios = [
            ([0, 3, 4], [0, 2], [0, 2, 3, 4]),
            ([], [0, 2], [0, 2]),
            ([], [], []),
            ([0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]),
        ]
        for left, right, union in scenarios:
            a = IntIndex(5, np.array(left, dtype=np.int32))
            b = IntIndex(5, np.array(right, dtype=np.int32))
            wanted = IntIndex(5, np.array(union, np.int32))
            assert a.make_union(b).equals(wanted)

        a = IntIndex(5, np.array([0, 1], dtype=np.int32))
        b = IntIndex(4, np.array([0, 1], dtype=np.int32))

        msg = "Indices must reference same underlying length"
        with pytest.raises(ValueError, match=msg):
            a.make_union(b)
class TestSparseIndexIntersect:
@td.skip_if_windows
def test_intersect(self, cases, test_length):
xloc, xlen, yloc, ylen, eloc, elen = cases
xindex = BlockIndex(test_length, xloc, xlen)
yindex = BlockIndex(test_length, yloc, ylen)
expected = BlockIndex(test_length, eloc, elen)
longer_index = BlockIndex(test_length + 1, yloc, ylen)
result = xindex.intersect(yindex)
assert result.equals(expected)
result = xindex.to_int_index().intersect(yindex.to_int_index())
assert result.equals(expected.to_int_index())
msg = "Indices must reference same underlying length"
with pytest.raises(Exception, match=msg):
xindex.intersect(longer_index)
with pytest.raises(Exception, match=msg):
xindex.to_int_index().intersect(longer_index.to_int_index())
def test_intersect_empty(self):
xindex = IntIndex(4, np.array([], dtype=np.int32))
yindex = IntIndex(4, np.array([2, 3], dtype=np.int32))
assert xindex.intersect(yindex).equals(xindex)
assert yindex.intersect(xindex).equals(xindex)
xindex = xindex.to_block_index()
yindex = yindex.to_block_index()
assert xindex.intersect(yindex).equals(xindex)
assert yindex.intersect(xindex).equals(xindex)
@pytest.mark.parametrize(
"case",
[
# Argument 2 to "IntIndex" has incompatible type "ndarray[Any,
# dtype[signedinteger[_32Bit]]]"; expected "Sequence[int]"
IntIndex(5, np.array([1, 2], dtype=np.int32)), # type: ignore[arg-type]
IntIndex(5, np.array([0, 2, 4], dtype=np.int32)), # type: ignore[arg-type]
IntIndex(0, np.array([], dtype=np.int32)), # type: ignore[arg-type]
IntIndex(5, np.array([], dtype=np.int32)), # type: ignore[arg-type]
],
)
def test_intersect_identical(self, case):
assert case.intersect(case).equals(case)
case = case.to_block_index()
assert case.intersect(case).equals(case)
class TestSparseIndexCommon:
def test_int_internal(self):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
idx = make_sparse_index(
4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
)
assert isinstance(idx, IntIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
def test_block_internal(self):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 3
tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))
@pytest.mark.parametrize("kind", ["integer", "block"])
def test_lookup(self, kind):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
assert idx.lookup(-1) == -1
assert idx.lookup(0) == -1
assert idx.lookup(1) == -1
assert idx.lookup(2) == 0
assert idx.lookup(3) == 1
assert idx.lookup(4) == -1
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
for i in range(-1, 5):
assert idx.lookup(i) == -1
idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
assert idx.lookup(-1) == -1
assert idx.lookup(0) == 0
assert idx.lookup(1) == 1
assert idx.lookup(2) == 2
assert idx.lookup(3) == 3
assert idx.lookup(4) == -1
idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
assert idx.lookup(-1) == -1
assert idx.lookup(0) == 0
assert idx.lookup(1) == -1
assert idx.lookup(2) == 1
assert idx.lookup(3) == 2
assert idx.lookup(4) == -1
@pytest.mark.parametrize("kind", ["integer", "block"])
def test_lookup_array(self, kind):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
exp = np.array([-1, -1, 0], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
exp = np.array([-1, 0, -1, 1], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32))
exp = np.array([-1, -1, -1, -1], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
exp = np.array([-1, 0, 2], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
exp = np.array([-1, 2, 1, 3], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32))
exp = np.array([1, -1, 2, 0], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
res = idx.lookup_array(np.array([1, 4, 2, 5], dtype=np.int32))
exp = np.array([-1, -1, 1, -1], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
@pytest.mark.parametrize(
"idx, expected",
[
[0, -1],
[5, 0],
[7, 2],
[8, -1],
[9, -1],
[10, -1],
[11, -1],
[12, 3],
[17, 8],
[18, -1],
],
)
def test_lookup_basics(self, idx, expected):
bindex = BlockIndex(20, [5, 12], [3, 6])
assert bindex.lookup(idx) == expected
iindex = bindex.to_int_index()
assert iindex.lookup(idx) == expected
class TestBlockIndex:
def test_block_internal(self):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 3
tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))
@pytest.mark.parametrize("i", [5, 10, 100, 101])
def test_make_block_boundary(self, i):
idx = make_sparse_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block")
exp = np.arange(0, i, 2, dtype=np.int32)
tm.assert_numpy_array_equal(idx.blocs, exp)
tm.assert_numpy_array_equal(idx.blengths, np.ones(len(exp), dtype=np.int32))
def test_equals(self):
index = BlockIndex(10, [0, 4], [2, 5])
assert index.equals(index)
assert not index.equals(BlockIndex(10, [0, 4], [2, 6]))
def test_check_integrity(self):
locs = []
lengths = []
# 0-length OK
BlockIndex(0, locs, lengths)
# also OK even though empty
BlockIndex(1, locs, lengths)
msg = "Block 0 extends beyond end"
with pytest.raises(ValueError, match=msg):
BlockIndex(10, [5], [10])
msg = "Block 0 overlaps"
with pytest.raises(ValueError, match=msg):
BlockIndex(10, [2, 5], [5, 3])
def test_to_int_index(self):
locs = [0, 10]
lengths = [4, 6]
exp_inds = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15]
block = BlockIndex(20, locs, lengths)
dense = block.to_int_index()
tm.assert_numpy_array_equal(dense.indices, np.array(exp_inds, dtype=np.int32))
def test_to_block_index(self):
index = BlockIndex(10, [0, 5], [4, 5])
assert index.to_block_index() is index
class TestIntIndex:
def test_check_integrity(self):
# Too many indices than specified in self.length
msg = "Too many indices"
with pytest.raises(ValueError, match=msg):
IntIndex(length=1, indices=[1, 2, 3])
# No index can be negative.
msg = "No index can be less than zero"
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, -2, 3])
# No index can be negative.
msg = "No index can be less than zero"
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, -2, 3])
# All indices must be less than the length.
msg = "All indices must be less than the length"
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, 2, 5])
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, 2, 6])
# Indices must be strictly ascending.
msg = "Indices must be strictly increasing"
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, 3, 2])
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, 3, 3])
def test_int_internal(self):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
idx = make_sparse_index(
4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
)
assert isinstance(idx, IntIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
def test_equals(self):
index = IntIndex(10, [0, 1, 2, 3, 4])
assert index.equals(index)
assert not index.equals(IntIndex(10, [0, 1, 2, 3]))
def test_to_block_index(self, cases, test_length):
xloc, xlen, yloc, ylen, _, _ = cases
xindex = BlockIndex(test_length, xloc, xlen)
yindex = BlockIndex(test_length, yloc, ylen)
# see if survive the round trip
xbindex = xindex.to_int_index().to_block_index()
ybindex = yindex.to_int_index().to_block_index()
assert isinstance(xbindex, BlockIndex)
assert xbindex.equals(xindex)
assert ybindex.equals(yindex)
def test_to_int_index(self):
index = IntIndex(10, [2, 3, 4, 5, 6])
assert index.to_int_index() is index
class TestSparseOperators:
@pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"])
def test_op(self, opname, cases, test_length):
xloc, xlen, yloc, ylen, _, _ = cases
sparse_op = getattr(splib, f"sparse_{opname}_float64")
python_op = getattr(operator, opname)
xindex = BlockIndex(test_length, xloc, xlen)
yindex = BlockIndex(test_length, yloc, ylen)
xdindex = xindex.to_int_index()
ydindex = yindex.to_int_index()
x = np.arange(xindex.npoints) * 10.0 + 1
y = np.arange(yindex.npoints) * 100.0 + 1
xfill = 0
yfill = 2
result_block_vals, rb_index, bfill = sparse_op(
x, xindex, xfill, y, yindex, yfill
)
result_int_vals, ri_index, ifill = sparse_op(
x, xdindex, xfill, y, ydindex, yfill
)
assert rb_index.to_int_index().equals(ri_index)
tm.assert_numpy_array_equal(result_block_vals, result_int_vals)
assert bfill == ifill
# check versus Series...
xseries = Series(x, xdindex.indices)
xseries = xseries.reindex(np.arange(test_length)).fillna(xfill)
yseries = Series(y, ydindex.indices)
yseries = yseries.reindex(np.arange(test_length)).fillna(yfill)
series_result = python_op(xseries, yseries)
series_result = series_result.reindex(ri_index.indices)
tm.assert_numpy_array_equal(result_block_vals, series_result.values)
tm.assert_numpy_array_equal(result_int_vals, series_result.values)

View File

@ -0,0 +1,306 @@
import numpy as np
import pytest
from pandas import (
NaT,
SparseDtype,
Timestamp,
isna,
)
from pandas.core.arrays.sparse import SparseArray
class TestReductions:
    """``all``/``any``/``sum``/``mean`` on ``SparseArray``, direct and via NumPy."""

    @pytest.mark.parametrize(
        "data,pos,neg",
        [
            ([True, True, True], True, False),
            ([1, 2, 1], 1, 0),
            ([1.0, 2.0, 1.0], 1.0, 0.0),
        ],
    )
    def test_all(self, data, pos, neg):
        """``all`` is truthy until one element is replaced by a falsy value."""
        # GH#17570
        # Copy first: the parametrized list object is owned by pytest and must
        # not be modified, otherwise a re-run starts from corrupted input.
        data = list(data)

        out = SparseArray(data).all()
        assert out

        out = SparseArray(data, fill_value=pos).all()
        assert out

        data[1] = neg
        out = SparseArray(data).all()
        assert not out

        out = SparseArray(data, fill_value=pos).all()
        assert not out

    @pytest.mark.parametrize(
        "data,pos,neg",
        [
            ([True, True, True], True, False),
            ([1, 2, 1], 1, 0),
            ([1.0, 2.0, 1.0], 1.0, 0.0),
        ],
    )
    def test_numpy_all(self, data, pos, neg):
        """``np.all`` dispatches to the array and rejects the ``out`` argument."""
        # GH#17570
        data = list(data)  # do not mutate the shared parametrized list

        out = np.all(SparseArray(data))
        assert out

        out = np.all(SparseArray(data, fill_value=pos))
        assert out

        data[1] = neg
        out = np.all(SparseArray(data))
        assert not out

        out = np.all(SparseArray(data, fill_value=pos))
        assert not out

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.all(SparseArray(data), out=np.array([]))

    @pytest.mark.parametrize(
        "data,pos,neg",
        [
            ([False, True, False], True, False),
            ([0, 2, 0], 2, 0),
            ([0.0, 2.0, 0.0], 2.0, 0.0),
        ],
    )
    def test_any(self, data, pos, neg):
        """``any`` is truthy until the only truthy element is replaced."""
        # GH#17570
        data = list(data)  # do not mutate the shared parametrized list

        out = SparseArray(data).any()
        assert out

        out = SparseArray(data, fill_value=pos).any()
        assert out

        data[1] = neg
        out = SparseArray(data).any()
        assert not out

        out = SparseArray(data, fill_value=pos).any()
        assert not out

    @pytest.mark.parametrize(
        "data,pos,neg",
        [
            ([False, True, False], True, False),
            ([0, 2, 0], 2, 0),
            ([0.0, 2.0, 0.0], 2.0, 0.0),
        ],
    )
    def test_numpy_any(self, data, pos, neg):
        """``np.any`` dispatches to the array and rejects the ``out`` argument."""
        # GH#17570
        data = list(data)  # do not mutate the shared parametrized list

        out = np.any(SparseArray(data))
        assert out

        out = np.any(SparseArray(data, fill_value=pos))
        assert out

        data[1] = neg
        out = np.any(SparseArray(data))
        assert not out

        out = np.any(SparseArray(data, fill_value=pos))
        assert not out

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.any(SparseArray(data), out=out)

    def test_sum(self):
        """``sum`` skips NaN whatever the fill value is."""
        data = np.arange(10).astype(float)
        out = SparseArray(data).sum()
        assert out == 45.0

        # Dropping the 5 (now NaN) leaves 45 - 5 = 40.
        data[5] = np.nan
        out = SparseArray(data, fill_value=2).sum()
        assert out == 40.0

        out = SparseArray(data, fill_value=np.nan).sum()
        assert out == 40.0

    @pytest.mark.parametrize(
        "arr",
        [np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])],
    )
    @pytest.mark.parametrize("fill_value", [0, 1, np.nan])
    @pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)])
    def test_sum_min_count(self, arr, fill_value, min_count, expected):
        """``min_count`` above the number of non-NaN values (3) gives NaN."""
        # GH#25777
        sparray = SparseArray(arr, fill_value=fill_value)
        result = sparray.sum(min_count=min_count)
        if np.isnan(expected):
            assert np.isnan(result)
        else:
            assert result == expected

    def test_bool_sum_min_count(self):
        """Boolean sum counts the five ``True`` values; ``min_count=11`` gives NA."""
        spar_bool = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True)
        res = spar_bool.sum(min_count=1)
        assert res == 5
        # Only 10 elements exist, so requiring 11 cannot be satisfied.
        res = spar_bool.sum(min_count=11)
        assert isna(res)

    def test_numpy_sum(self):
        """``np.sum`` matches ``.sum()`` and rejects ``dtype`` and ``out``."""
        data = np.arange(10).astype(float)
        out = np.sum(SparseArray(data))
        assert out == 45.0

        data[5] = np.nan
        out = np.sum(SparseArray(data, fill_value=2))
        assert out == 40.0

        out = np.sum(SparseArray(data, fill_value=np.nan))
        assert out == 40.0

        msg = "the 'dtype' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.sum(SparseArray(data), dtype=np.int64)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.sum(SparseArray(data), out=out)

    def test_mean(self):
        """``mean`` averages over the non-NaN values only."""
        data = np.arange(10).astype(float)
        out = SparseArray(data).mean()
        assert out == 4.5

        # Nine remaining values summing to 40.
        data[5] = np.nan
        out = SparseArray(data).mean()
        assert out == 40.0 / 9

    def test_numpy_mean(self):
        """``np.mean`` matches ``.mean()`` and rejects ``dtype`` and ``out``."""
        data = np.arange(10).astype(float)
        out = np.mean(SparseArray(data))
        assert out == 4.5

        data[5] = np.nan
        out = np.mean(SparseArray(data))
        assert out == 40.0 / 9

        msg = "the 'dtype' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.mean(SparseArray(data), dtype=np.int64)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.mean(SparseArray(data), out=out)
class TestMinMax:
    """``min``/``max`` on ``SparseArray`` for NaN and non-NaN fill values."""

    @pytest.mark.parametrize(
        "raw_data,max_expected,min_expected",
        [
            (np.arange(5.0), [4], [0]),
            (-np.arange(5.0), [0], [-4]),
            (np.array([0, 1, 2, np.nan, 4]), [4], [0]),
            (np.array([np.nan] * 5), [np.nan], [np.nan]),
            (np.array([]), [np.nan], [np.nan]),
        ],
    )
    def test_nan_fill_value(self, raw_data, max_expected, min_expected):
        """Default (NaN) fill: NaN is skipped unless ``skipna=False``."""
        arr = SparseArray(raw_data)

        biggest, smallest = arr.max(), arr.min()
        assert biggest in max_expected
        assert smallest in min_expected

        biggest = arr.max(skipna=False)
        smallest = arr.min(skipna=False)
        if not np.isnan(raw_data).any():
            assert biggest in max_expected
            assert smallest in min_expected
        else:
            # Any NaN in the data poisons the non-skipping reduction.
            assert np.isnan(biggest)
            assert np.isnan(smallest)

    @pytest.mark.parametrize(
        "fill_value,max_expected,min_expected",
        [
            (100, 100, 0),
            (-100, 1, -100),
        ],
    )
    def test_fill_value(self, fill_value, max_expected, min_expected):
        """The fill value itself takes part in the reduction."""
        sparse_dtype = SparseDtype("int", fill_value)
        arr = SparseArray(np.array([fill_value, 0, 1]), dtype=sparse_dtype)

        assert arr.max() == max_expected
        assert arr.min() == min_expected

    def test_only_fill_value(self):
        """With no stored values at all, every reduction returns the fill value."""
        fv = 100
        arr = SparseArray(np.array([fv, fv, fv]), dtype=SparseDtype("int", fv))
        assert len(arr._valid_sp_values) == 0

        for extra in ({}, {"skipna": False}):
            assert arr.max(**extra) == fv
            assert arr.min(**extra) == fv

    @pytest.mark.parametrize("func", ["min", "max"])
    @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])])
    @pytest.mark.parametrize(
        "dtype,expected",
        [
            (SparseDtype(np.float64, np.nan), np.nan),
            (SparseDtype(np.float64, 5.0), np.nan),
            (SparseDtype("datetime64[ns]", NaT), NaT),
            (SparseDtype("datetime64[ns]", Timestamp("2018-05-05")), NaT),
        ],
    )
    def test_na_value_if_no_valid_values(self, func, data, dtype, expected):
        """Reducing data without any valid value yields the dtype's NA."""
        reducer = getattr(SparseArray(data, dtype=dtype), func)
        result = reducer()
        if expected is not NaT:
            assert np.isnan(result)
        else:
            # TODO: pin down whether we wrap datetime64("NaT")
            assert result is NaT or np.isnat(result)
class TestArgmaxArgmin:
    """``argmax``/``argmin`` on ``SparseArray``."""

    @pytest.mark.parametrize(
        "arr,argmax_expected,argmin_expected",
        [
            (SparseArray([1, 2, 0, 1, 2]), 1, 2),
            (SparseArray([-1, -2, 0, -1, -2]), 2, 1),
            (SparseArray([np.nan, 1, 0, 0, np.nan, -1]), 1, 5),
            (SparseArray([np.nan, 1, 0, 0, np.nan, 2]), 5, 2),
            # Same data, every choice of fill value: the answer must not move.
            *[
                (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=fv), 5, 2)
                for fv in (-1, 0, 1, 2, 3)
            ],
            *[
                (SparseArray([0] * 10 + [-1], fill_value=fv), 0, 10)
                for fv in (0, -1, 1)
            ],
            (SparseArray([-1] + [0] * 10, fill_value=0), 1, 0),
            (SparseArray([1] + [0] * 10, fill_value=0), 0, 1),
            (SparseArray([-1] + [0] * 10, fill_value=-1), 1, 0),
            (SparseArray([1] + [0] * 10, fill_value=1), 0, 1),
        ],
    )
    def test_argmax_argmin(self, arr, argmax_expected, argmin_expected):
        """Positions of the extreme values match the expected ones."""
        found = (arr.argmax(), arr.argmin())
        assert found[0] == argmax_expected
        assert found[1] == argmin_expected

    @pytest.mark.parametrize(
        "arr,method",
        [(SparseArray([]), "argmax"), (SparseArray([]), "argmin")],
    )
    def test_empty_array(self, arr, method):
        """An empty array has no extreme position, so the call raises."""
        msg = f"attempt to get {method} of an empty sequence"
        reducer = arr.argmax if method == "argmax" else arr.argmin
        with pytest.raises(ValueError, match=msg):
            reducer()

View File

@ -0,0 +1,79 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import SparseArray
@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
@pytest.mark.parametrize("fill_value", [0, np.nan])
@pytest.mark.parametrize("op", [operator.pos, operator.neg])
def test_unary_op(op, fill_value):
    """Unary ``+``/``-`` apply to both the stored values and the fill value."""
    dense = np.array([0, 1, np.nan, 2])
    actual = op(SparseArray(dense, fill_value=fill_value))
    wanted = SparseArray(op(dense), fill_value=op(fill_value))
    tm.assert_sp_array_equal(actual, wanted)
@pytest.mark.parametrize("fill_value", [True, False])
def test_invert(fill_value):
    """``~`` flips values and fill value for arrays, Series and DataFrames."""
    dense = np.array([True, False, False, True])
    sparse = SparseArray(dense, fill_value=fill_value)
    flipped = SparseArray(~dense, fill_value=not fill_value)

    tm.assert_sp_array_equal(~sparse, flipped)

    # The same must hold once the array is wrapped in a Series ...
    flipped_ser = pd.Series(flipped)
    tm.assert_series_equal(~pd.Series(sparse), flipped_ser)

    # ... and in a DataFrame column.
    flipped_frame = pd.DataFrame({"A": flipped_ser})
    tm.assert_frame_equal(~pd.DataFrame({"A": sparse}), flipped_frame)
class TestUnaryMethods:
    """Unary ``-``, ``abs`` and ``~`` on ``SparseArray`` with explicit dtypes."""

    @pytest.mark.filterwarnings(
        "ignore:invalid value encountered in cast:RuntimeWarning"
    )
    def test_neg_operator(self):
        """Negation flips the sign of stored values and of the fill value."""
        source = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
        wanted = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8)
        tm.assert_sp_array_equal(wanted, -source)

        source = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
        wanted = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8)
        tm.assert_sp_array_equal(wanted, -source)

    @pytest.mark.filterwarnings(
        "ignore:invalid value encountered in cast:RuntimeWarning"
    )
    def test_abs_operator(self):
        """``abs`` is applied to stored values and to the fill value."""
        source = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
        wanted = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
        tm.assert_sp_array_equal(wanted, abs(source))

        source = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
        wanted = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8)
        tm.assert_sp_array_equal(wanted, abs(source))

    def test_invert_operator(self):
        """``~`` is logical for booleans and bitwise for integers."""
        flags = [False, True, False, True]
        source = SparseArray(flags, fill_value=False, dtype=np.bool_)
        wanted = SparseArray(np.invert(flags), fill_value=True, dtype=np.bool_)
        tm.assert_sp_array_equal(wanted, ~source)

        source = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
        inverted = ~source
        wanted = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)
        tm.assert_sp_array_equal(wanted, inverted)

View File

@ -0,0 +1,73 @@
import numpy as np
import pytest
from pandas.compat import HAS_PYARROW
from pandas.core.dtypes.cast import find_common_type
import pandas as pd
import pandas._testing as tm
from pandas.util.version import Version
@pytest.mark.parametrize(
"to_concat_dtypes, result_dtype",
[
# same types
([("pyarrow", pd.NA), ("pyarrow", pd.NA)], ("pyarrow", pd.NA)),
([("pyarrow", np.nan), ("pyarrow", np.nan)], ("pyarrow", np.nan)),
([("python", pd.NA), ("python", pd.NA)], ("python", pd.NA)),
([("python", np.nan), ("python", np.nan)], ("python", np.nan)),
# pyarrow preference
([("pyarrow", pd.NA), ("python", pd.NA)], ("pyarrow", pd.NA)),
# NA preference
([("python", pd.NA), ("python", np.nan)], ("python", pd.NA)),
],
)
def test_concat_series(request, to_concat_dtypes, result_dtype):
if any(storage == "pyarrow" for storage, _ in to_concat_dtypes) and not HAS_PYARROW:
pytest.skip("Could not import 'pyarrow'")
ser_list = [
pd.Series(["a", "b", None], dtype=pd.StringDtype(storage, na_value))
for storage, na_value in to_concat_dtypes
]
result = pd.concat(ser_list, ignore_index=True)
expected = pd.Series(
["a", "b", None, "a", "b", None], dtype=pd.StringDtype(*result_dtype)
)
tm.assert_series_equal(result, expected)
# order doesn't matter for result
result = pd.concat(ser_list[::1], ignore_index=True)
tm.assert_series_equal(result, expected)
def test_concat_with_object(string_dtype_arguments):
# _get_common_dtype cannot inspect values, so object dtype with strings still
# results in object dtype
result = pd.concat(
[
pd.Series(["a", "b", None], dtype=pd.StringDtype(*string_dtype_arguments)),
pd.Series(["a", "b", None], dtype=object),
]
)
assert result.dtype == np.dtype("object")
def test_concat_with_numpy(string_dtype_arguments):
# common type with a numpy string dtype always preserves the pandas string dtype
dtype = pd.StringDtype(*string_dtype_arguments)
assert find_common_type([dtype, np.dtype("U")]) == dtype
assert find_common_type([np.dtype("U"), dtype]) == dtype
assert find_common_type([dtype, np.dtype("U10")]) == dtype
assert find_common_type([np.dtype("U10"), dtype]) == dtype
# with any other numpy dtype -> object
assert find_common_type([dtype, np.dtype("S")]) == np.dtype("object")
assert find_common_type([dtype, np.dtype("int64")]) == np.dtype("object")
if Version(np.__version__) >= Version("2"):
assert find_common_type([dtype, np.dtypes.StringDType()]) == dtype
assert find_common_type([np.dtypes.StringDType(), dtype]) == dtype

View File

@ -0,0 +1,854 @@
"""
This module tests the functionality of StringArray and ArrowStringArray.
Tests for the str accessors are in pandas/tests/strings/test_string_array.py
"""
import operator
import numpy as np
import pytest
from pandas._config import using_string_dtype
from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import (
pa_version_under12p0,
pa_version_under19p0,
)
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_dtype_equal
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.string_ import StringArrayNumpySemantics
from pandas.core.arrays.string_arrow import (
ArrowStringArray,
ArrowStringArrayNumpySemantics,
)
@pytest.fixture
def dtype(string_dtype_arguments):
    """Fixture giving StringDtype from parametrized storage and na_value arguments"""
    # ``string_dtype_arguments`` is unpacked as a ``(storage, na_value)`` pair.
    storage, na_value = string_dtype_arguments
    return pd.StringDtype(storage=storage, na_value=na_value)
@pytest.fixture
def dtype2(string_dtype_arguments2):
    """Fixture giving a second StringDtype built from ``string_dtype_arguments2``"""
    # ``string_dtype_arguments2`` is unpacked as a ``(storage, na_value)`` pair.
    storage, na_value = string_dtype_arguments2
    return pd.StringDtype(storage=storage, na_value=na_value)
@pytest.fixture
def cls(dtype):
    """Fixture giving array type from parametrized 'dtype'"""
    # Returns the array class itself (not an instance), via the dtype.
    return dtype.construct_array_type()
def string_dtype_highest_priority(dtype1, dtype2):
if HAS_PYARROW:
DTYPE_HIERARCHY = [
pd.StringDtype("python", na_value=np.nan),
pd.StringDtype("pyarrow", na_value=np.nan),
pd.StringDtype("python", na_value=pd.NA),
pd.StringDtype("pyarrow", na_value=pd.NA),
]
else:
DTYPE_HIERARCHY = [
pd.StringDtype("python", na_value=np.nan),
pd.StringDtype("python", na_value=pd.NA),
]
h1 = DTYPE_HIERARCHY.index(dtype1)
h2 = DTYPE_HIERARCHY.index(dtype2)
return DTYPE_HIERARCHY[max(h1, h2)]
def test_dtype_constructor():
    """The ``"pyarrow_numpy"`` alias warns and maps to NaN-backed pyarrow."""
    pytest.importorskip("pyarrow")

    with tm.assert_produces_warning(FutureWarning):
        via_alias = pd.StringDtype("pyarrow_numpy")

    modern = pd.StringDtype("pyarrow", na_value=np.nan)
    assert via_alias == modern
def test_dtype_equality():
    """String dtypes compare equal only when storage and NA value both match."""
    pytest.importorskip("pyarrow")

    python_na = pd.StringDtype("python")
    arrow_na = pd.StringDtype("pyarrow")
    arrow_nan = pd.StringDtype("pyarrow", na_value=np.nan)

    # Each one equals an explicitly spelled-out equivalent ...
    assert python_na == pd.StringDtype("python", na_value=pd.NA)
    assert arrow_na == pd.StringDtype("pyarrow", na_value=pd.NA)
    assert arrow_nan == pd.StringDtype("pyarrow", na_value=np.nan)
    # ... including a NaN object that is not the ``np.nan`` singleton.
    assert arrow_nan == pd.StringDtype("pyarrow", na_value=float("nan"))

    # ... and differs from the other two, whichever side it is compared from.
    distinct = (python_na, arrow_na, arrow_nan)
    for i, left in enumerate(distinct):
        for j, right in enumerate(distinct):
            if i != j:
                assert left != right
def test_repr(dtype):
df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)})
if dtype.na_value is np.nan:
expected = " A\n0 a\n1 NaN\n2 b"
else:
expected = " A\n0 a\n1 <NA>\n2 b"
assert repr(df) == expected
if dtype.na_value is np.nan:
expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str"
else:
expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string"
assert repr(df.A) == expected
if dtype.storage == "pyarrow" and dtype.na_value is pd.NA:
arr_name = "ArrowStringArray"
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
arr_name = "ArrowStringArrayNumpySemantics"
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
elif dtype.storage == "python" and dtype.na_value is np.nan:
arr_name = "StringArrayNumpySemantics"
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
else:
arr_name = "StringArray"
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
assert repr(df.A.array) == expected
def test_none_to_nan(cls, dtype):
a = cls._from_sequence(["a", None, "b"], dtype=dtype)
assert a[1] is not None
assert a[1] is a.dtype.na_value
def test_setitem_validates(cls, dtype):
arr = cls._from_sequence(["a", "b"], dtype=dtype)
msg = "Invalid value '10' for dtype 'str"
with pytest.raises(TypeError, match=msg):
arr[0] = 10
msg = "Invalid value for dtype 'str"
with pytest.raises(TypeError, match=msg):
arr[:] = np.array([1, 2])
def test_setitem_with_scalar_string(dtype):
# is_float_dtype considers some strings, like 'd', to be floats
# which can cause issues.
arr = pd.array(["a", "c"], dtype=dtype)
arr[0] = "d"
expected = pd.array(["d", "c"], dtype=dtype)
tm.assert_extension_array_equal(arr, expected)
def test_setitem_with_array_with_missing(dtype):
# ensure that when setting with an array of values, we don't mutate the
# array `value` in __setitem__(self, key, value)
arr = pd.array(["a", "b", "c"], dtype=dtype)
value = np.array(["A", None])
value_orig = value.copy()
arr[[0, 1]] = value
expected = pd.array(["A", pd.NA, "c"], dtype=dtype)
tm.assert_extension_array_equal(arr, expected)
tm.assert_numpy_array_equal(value, value_orig)
def test_astype_roundtrip(dtype):
ser = pd.Series(pd.date_range("2000", periods=12))
ser[0] = None
casted = ser.astype(dtype)
assert is_dtype_equal(casted.dtype, dtype)
result = casted.astype("datetime64[ns]")
tm.assert_series_equal(result, ser)
# GH#38509 same thing for timedelta64
ser2 = ser - ser.iloc[-1]
casted2 = ser2.astype(dtype)
assert is_dtype_equal(casted2.dtype, dtype)
result2 = casted2.astype(ser2.dtype)
tm.assert_series_equal(result2, ser2)
def test_add(dtype):
a = pd.Series(["a", "b", "c", None, None], dtype=dtype)
b = pd.Series(["x", "y", None, "z", None], dtype=dtype)
result = a + b
expected = pd.Series(["ax", "by", None, None, None], dtype=dtype)
tm.assert_series_equal(result, expected)
result = a.add(b)
tm.assert_series_equal(result, expected)
result = a.radd(b)
expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype)
tm.assert_series_equal(result, expected)
result = a.add(b, fill_value="-")
expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype)
tm.assert_series_equal(result, expected)
def test_add_2d(dtype, request):
if dtype.storage == "pyarrow":
reason = "Failed: DID NOT RAISE <class 'ValueError'>"
mark = pytest.mark.xfail(raises=None, reason=reason)
request.applymarker(mark)
a = pd.array(["a", "b", "c"], dtype=dtype)
b = np.array([["a", "b", "c"]], dtype=object)
with pytest.raises(ValueError, match="3 != 1"):
a + b
s = pd.Series(a)
with pytest.raises(ValueError, match="3 != 1"):
s + b
def test_add_sequence(dtype):
    """A plain list can be added on either side of a string array."""
    arr = pd.array(["a", "b", None, None], dtype=dtype)
    seq = ["x", None, "y", None]

    tm.assert_extension_array_equal(
        arr + seq, pd.array(["ax", None, None, None], dtype=dtype)
    )
    # list on the left-hand side
    tm.assert_extension_array_equal(
        seq + arr, pd.array(["xa", None, None, None], dtype=dtype)
    )
def test_mul(dtype):
    """Multiplying by an integer repeats each string; missing stays missing."""
    arr = pd.array(["a", "b", None], dtype=dtype)
    doubled = pd.array(["aa", "bb", None], dtype=dtype)

    # the integer may sit on either side of the operator
    for product in (arr * 2, 2 * arr):
        tm.assert_extension_array_equal(product, doubled)
@pytest.mark.xfail(reason="GH-28527")
def test_add_strings(dtype):
    """String array + object DataFrame in both orders (xfail, GH-28527)."""
    arr = pd.array(["a", "b", "c", "d"], dtype=dtype)
    frame = pd.DataFrame([["t", "y", "v", "w"]], dtype=object)

    # the array itself must decline the operation
    assert arr.__add__(frame) is NotImplemented

    tm.assert_frame_equal(
        arr + frame, pd.DataFrame([["at", "by", "cv", "dw"]]).astype(dtype)
    )
    tm.assert_frame_equal(
        frame + arr, pd.DataFrame([["ta", "yb", "vc", "wd"]]).astype(dtype)
    )
@pytest.mark.xfail(reason="GH-28527")
def test_add_frame(dtype):
    """String array + DataFrame with missing values (xfail, GH-28527)."""
    arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype)
    frame = pd.DataFrame([["x", np.nan, "y", np.nan]])

    # the array itself must decline the operation
    assert arr.__add__(frame) is NotImplemented

    tm.assert_frame_equal(
        arr + frame, pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype)
    )
    tm.assert_frame_equal(
        frame + arr, pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype)
    )
def test_comparison_methods_scalar(comparison_op, dtype):
    """Compare a string array holding a missing value against the scalar "a"."""
    method = f"__{comparison_op.__name__}__"
    arr = pd.array(["a", None, "c"], dtype=dtype)
    scalar = "a"
    result = getattr(arr, method)(scalar)

    if dtype.na_value is not np.nan:
        # pd.NA-flavoured dtype: the result is a nullable boolean array
        bool_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
        elementwise = np.array(
            [getattr(item, method)(scalar) for item in arr], dtype=object
        )
        expected = pd.array(elementwise, dtype=bool_dtype)
        tm.assert_extension_array_equal(result, expected)
        return

    # NaN-flavoured dtype: plain bool ndarray; the missing slot is True only
    # for `!=` and False for every other operator
    elementwise = np.array([getattr(item, method)(scalar) for item in arr])
    elementwise[1] = comparison_op == operator.ne
    tm.assert_numpy_array_equal(result, elementwise.astype(np.bool_))
def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
    """Compare a string array against the scalar ``pd.NA``."""
    op_name = f"__{comparison_op.__name__}__"
    a = pd.array(["a", None, "c"], dtype=dtype)
    result = getattr(a, op_name)(pd.NA)

    if dtype.na_value is np.nan:
        # NaN-flavoured dtype returns a plain bool ndarray: all True for `!=`,
        # all False for every other operator
        if operator.ne == comparison_op:
            expected = np.array([True, True, True])
        else:
            expected = np.array([False, False, False])
        tm.assert_numpy_array_equal(result, expected)
    else:
        # pd.NA-flavoured dtype: every element of the result is missing
        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
        expected = pd.array([None, None, None], dtype=expected_dtype)
        tm.assert_extension_array_equal(result, expected)
def test_comparison_methods_scalar_not_string(comparison_op, dtype):
    """Compare a string array against the integer 42."""
    method = f"__{comparison_op.__name__}__"
    arr = pd.array(["a", None, "c"], dtype=dtype)
    scalar = 42

    if method not in ["__eq__", "__ne__"]:
        # ordering comparisons against a non-string must be rejected
        with pytest.raises(TypeError, match="Invalid comparison|not supported between"):
            getattr(arr, method)(scalar)
        return

    result = getattr(arr, method)(scalar)

    if dtype.na_value is np.nan:
        # NaN-flavoured dtype: plain bool ndarray, no missing slot
        by_op = {
            "__eq__": [False, False, False],
            "__ne__": [True, True, True],
        }
        tm.assert_numpy_array_equal(result, np.array(by_op[method]))
    else:
        # pd.NA-flavoured dtype: the missing element stays missing
        by_op = {"__eq__": [False, None, False], "__ne__": [True, None, True]}
        bool_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
        expected = pd.array(by_op[method], dtype=bool_dtype)
        tm.assert_extension_array_equal(result, expected)
def test_comparison_methods_array(comparison_op, dtype, dtype2):
    """Compare two string arrays that may use different string dtypes."""
    method = f"__{comparison_op.__name__}__"
    left = pd.array(["a", None, "c"], dtype=dtype)
    right = pd.array([None, None, "c"], dtype=dtype2)

    result = comparison_op(left, right)
    # swapping the operands must give the same result for this data
    tm.assert_equal(result, comparison_op(right, left))

    if dtype.na_value is np.nan and dtype2.na_value is np.nan:
        # both NaN-flavoured: plain bool ndarray
        if operator.ne == comparison_op:
            expected = np.array([True, True, False])
        else:
            expected = np.array([False, False, False])
            # last position is the only one where both sides hold a value
            expected[-1] = getattr(right[-1], method)(left[-1])
        tm.assert_numpy_array_equal(result, expected)
        return

    # at least one pd.NA-flavoured operand: nullable boolean result whose
    # backing follows the higher-priority string dtype
    winner = string_dtype_highest_priority(dtype, dtype2)
    bool_dtype = "boolean" if winner.storage == "python" else "bool[pyarrow]"

    expected = np.full(len(left), fill_value=None, dtype="object")
    expected[-1] = getattr(right[-1], method)(left[-1])
    expected = pd.array(expected, dtype=bool_dtype)
    tm.assert_extension_array_equal(result, expected)
@td.skip_if_no("pyarrow")
def test_comparison_methods_array_arrow_extension(comparison_op, dtype2):
    """Compare a ``pd.ArrowDtype(pa.string())`` array with other string arrays."""
    import pyarrow as pa

    method = f"__{comparison_op.__name__}__"
    left = pd.array(["a", None, "c"], dtype=pd.ArrowDtype(pa.string()))
    right = pd.array([None, None, "c"], dtype=dtype2)

    result = comparison_op(left, right)
    # swapping the operands must give the same result for this data
    tm.assert_equal(result, comparison_op(right, left))

    # only the last position has a value on both sides
    expected = pd.array([None, None, True], dtype="bool[pyarrow]")
    expected[-1] = getattr(right[-1], method)(left[-1])
    tm.assert_extension_array_equal(result, expected)
def test_comparison_methods_list(comparison_op, dtype):
    """Compare a string array against a plain Python list."""
    method = f"__{comparison_op.__name__}__"
    arr = pd.array(["a", None, "c"], dtype=dtype)
    seq = [None, None, "c"]

    result = comparison_op(arr, seq)
    # swapping the operands must give the same result for this data
    tm.assert_equal(result, comparison_op(seq, arr))

    if dtype.na_value is np.nan:
        # NaN-flavoured dtype: plain bool ndarray
        if operator.ne == comparison_op:
            expected = np.array([True, True, False])
        else:
            expected = np.array([False, False, False])
            # last position is the only one where both sides hold a value
            expected[-1] = getattr(seq[-1], method)(arr[-1])
        tm.assert_numpy_array_equal(result, expected)
        return

    # pd.NA-flavoured dtype: nullable boolean result
    bool_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
    expected = np.full(len(arr), fill_value=None, dtype="object")
    expected[-1] = getattr(seq[-1], method)(arr[-1])
    expected = pd.array(expected, dtype=bool_dtype)
    tm.assert_extension_array_equal(result, expected)
def test_constructor_raises(cls):
    """Array constructors reject input that is neither strings nor a valid NA."""
    if cls is pd.arrays.StringArray:
        msg = "StringArray requires a sequence of strings or pandas.NA"
    elif cls is StringArrayNumpySemantics:
        msg = "StringArrayNumpySemantics requires a sequence of strings or NaN"
    else:
        msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"

    accepts_nan_and_none = (
        cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics
    )

    def assert_rejected(values):
        with pytest.raises(ValueError, match=msg):
            cls(values)

    assert_rejected(np.array(["a", "b"], dtype="S1"))
    assert_rejected(np.array([]))

    for missing in (np.nan, None):
        values = np.array(["a", missing], dtype=object)
        if accepts_nan_and_none:
            # GH#45057 np.nan and None do NOT raise, as they are considered
            # valid NAs for string dtype
            cls(values)
        else:
            assert_rejected(values)

    # NaT-like values are rejected by every class
    for nat in (pd.NaT, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")):
        assert_rejected(np.array(["a", nat], dtype=object))
@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
def test_constructor_nan_like(na):
    """``StringArray`` built with any NaN-like equals one built with ``pd.NA``."""
    expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
    result = pd.arrays.StringArray(np.array(["a", na], dtype="object"))
    tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("copy", [True, False])
def test_from_sequence_no_mutate(copy, cls, dtype):
    """``_from_sequence`` must leave its ndarray input unchanged."""
    with_nan = np.array(["a", np.nan], dtype=object)
    untouched = with_nan.copy()
    with_na = np.array(["a", pd.NA], dtype=object)

    result = cls._from_sequence(with_nan, dtype=dtype, copy=copy)

    # build the reference array the way each class expects its input
    if cls is StringArrayNumpySemantics:
        expected = cls(with_nan)
    elif cls in (ArrowStringArray, ArrowStringArrayNumpySemantics):
        import pyarrow as pa

        expected = cls(pa.array(with_na, type=pa.string(), from_pandas=True))
    else:
        expected = cls(with_na)

    tm.assert_extension_array_equal(result, expected)
    tm.assert_numpy_array_equal(with_nan, untouched)
def test_astype_int(dtype):
    """Casting to int64 works without missing values and raises with one."""
    complete = pd.array(["1", "2", "3"], dtype=dtype)
    tm.assert_numpy_array_equal(
        complete.astype("int64"), np.array([1, 2, 3], dtype="int64")
    )

    with_missing = pd.array(["1", pd.NA, "3"], dtype=dtype)
    # the error raised depends on which missing-value sentinel the dtype uses
    if dtype.na_value is np.nan:
        err, msg = ValueError, "cannot convert float NaN to integer"
    else:
        err = TypeError
        msg = (
            r"int\(\) argument must be a string, a bytes-like "
            r"object or a( real)? number"
        )
    with pytest.raises(err, match=msg):
        with_missing.astype("int64")
def test_astype_nullable_int(dtype):
    """Casting to nullable ``Int64`` keeps the missing value."""
    strings = pd.array(["1", pd.NA, "3"], dtype=dtype)
    tm.assert_extension_array_equal(
        strings.astype("Int64"), pd.array([1, pd.NA, 3], dtype="Int64")
    )
def test_astype_float(dtype, any_float_dtype):
    """Casting a string Series to a float dtype maps missing to NaN."""
    # Don't compare arrays (37974)
    strings = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype)
    tm.assert_series_equal(
        strings.astype(any_float_dtype),
        pd.Series([1.1, np.nan, 3.3], dtype=any_float_dtype),
    )
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce(skipna, dtype):
    """``sum`` over a string Series without missing values concatenates them."""
    letters = pd.Series(["a", "b", "c"], dtype=dtype)
    assert letters.sum(skipna=skipna) == "abc"
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_missing(skipna, dtype):
    """``sum`` skips missing values with ``skipna`` and is missing without it."""
    letters = pd.Series([None, "a", None, "b", "c", None], dtype=dtype)
    total = letters.sum(skipna=skipna)

    if not skipna:
        assert pd.isna(total)
        return
    assert total == "abc"
@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max(method, skipna, dtype):
    """``min`` / ``max`` on a string Series that ends with a missing value."""
    ser = pd.Series(["a", "b", "c", None], dtype=dtype)
    result = getattr(ser, method)(skipna=skipna)

    if not skipna:
        # without skipping, the reduction is the dtype's own missing sentinel
        assert result is ser.dtype.na_value
        return

    expected = "a" if method == "min" else "c"
    assert result == expected
@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize("box", [pd.Series, pd.array])
def test_min_max_numpy(method, box, dtype, request):
    """``np.min`` / ``np.max`` on boxed string data that contains a missing value."""
    if dtype.storage == "pyarrow" and box is pd.array:
        # This guard already pins ``box`` to ``pd.array``, so there is exactly
        # one xfail reason to report for this combination.
        reason = "'<=' not supported between instances of 'str' and 'NoneType'"
        mark = pytest.mark.xfail(raises=TypeError, reason=reason)
        request.applymarker(mark)

    arr = box(["a", "b", "c", None], dtype=dtype)
    result = getattr(np, method)(arr)
    expected = "a" if method == "min" else "c"
    assert result == expected
def test_fillna_args(dtype):
    """``fillna`` accepts ``str`` and ``np.str_`` values and rejects an int."""
    # GH 37987
    arr = pd.array(["a", pd.NA], dtype=dtype)

    for fill in ("b", np.str_("b")):
        tm.assert_extension_array_equal(
            arr.fillna(value=fill), pd.array(["a", "b"], dtype=dtype)
        )

    with pytest.raises(TypeError, match="Invalid value '1' for dtype 'str"):
        arr.fillna(value=1)
def test_arrow_array(dtype):
    """``pa.array`` on a string array yields the expected pyarrow array."""
    # protocol added in 0.15.0
    pa = pytest.importorskip("pyarrow")
    import pyarrow.compute as pc

    data = pd.array(["a", "b", "c"], dtype=dtype)
    converted = pa.array(data)

    expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
    if dtype.storage == "python":
        # python storage is expected as plain (not large) string
        expected = pc.cast(expected, pa.string())
    elif dtype.storage == "pyarrow" and pa_version_under12p0:
        expected = pa.chunked_array(expected)

    assert converted.equals(expected)
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
    """Round-trip a string column pandas -> pyarrow Table -> pandas."""
    # roundtrip possible from arrow 1.0.0
    pa = pytest.importorskip("pyarrow")

    df = pd.DataFrame({"a": pd.array(["a", "b", None], dtype=dtype)})
    table = pa.table(df)

    # python storage is exported as string, anything else as large_string
    arrow_type = "string" if dtype.storage == "python" else "large_string"
    assert table.field("a").type == arrow_type

    with pd.option_context("string_storage", string_storage):
        result = table.to_pandas()

    if dtype.na_value is np.nan and not using_infer_string:
        assert result["a"].dtype == "object"
        return

    assert isinstance(result["a"].dtype, pd.StringDtype)
    expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
    if using_infer_string:
        expected.columns = expected.columns.astype(
            pd.StringDtype(string_storage, na_value=np.nan)
        )
    tm.assert_frame_equal(result, expected)
    # the missing entry must be exactly the result dtype's own na_value
    assert result.loc[2, "a"] is result["a"].dtype.na_value
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_from_string(using_infer_string):
    """Convert a pyarrow string table that carries no pandas metadata."""
    # not roundtrip, but starting with pyarrow table without pandas metadata
    pa = pytest.importorskip("pyarrow")
    table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())})

    result = table.to_pandas()

    infers_str = using_infer_string and not pa_version_under19p0
    expected = pd.DataFrame(
        {"a": ["a", "b", None]}, dtype="str" if infers_str else "object"
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
    """Convert a pyarrow table whose string column has no chunks at all."""
    # GH-41040
    pa = pytest.importorskip("pyarrow")

    data = pd.array([], dtype=dtype)
    df = pd.DataFrame({"a": data})
    table = pa.table(df)
    if dtype.storage == "python":
        assert table.field("a").type == "string"
    else:
        assert table.field("a").type == "large_string"
    # Instantiate the same table with no chunks at all
    table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
    with pd.option_context("string_storage", string_storage):
        result = table.to_pandas()

    # Use the injected fixture for both infer-string checks in this test,
    # matching the identical check in test_arrow_roundtrip.
    if dtype.na_value is np.nan and not using_infer_string:
        assert result["a"].dtype == "object"
    else:
        assert isinstance(result["a"].dtype, pd.StringDtype)
        expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
        if using_infer_string:
            expected.columns = expected.columns.astype(
                pd.StringDtype(string_storage, na_value=np.nan)
            )
        tm.assert_frame_equal(result, expected)
def test_value_counts_na(dtype):
    """``value_counts`` reports the missing value only with ``dropna=False``."""
    # the dtype of the counts follows the string dtype's flavour and storage
    if dtype.na_value is np.nan:
        count_dtype = "int64"
    else:
        count_dtype = "int64[pyarrow]" if dtype.storage == "pyarrow" else "Int64"

    arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)

    tm.assert_series_equal(
        arr.value_counts(dropna=False),
        pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype=count_dtype, name="count"),
    )
    tm.assert_series_equal(
        arr.value_counts(dropna=True),
        pd.Series([2, 1], index=arr[:2], dtype=count_dtype, name="count"),
    )
def test_value_counts_with_normalize(dtype):
    """``value_counts(normalize=True)`` returns proportions of non-missing values."""
    # the dtype of the proportions follows the string dtype's flavour and storage
    if dtype.na_value is np.nan:
        prop_dtype = np.float64
    else:
        prop_dtype = "double[pyarrow]" if dtype.storage == "pyarrow" else "Float64"

    ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
    result = ser.value_counts(normalize=True)

    counts = pd.Series([2, 1], index=ser[:2], dtype=prop_dtype, name="proportion")
    tm.assert_series_equal(result, counts / 3)
@pytest.mark.parametrize(
    "values, expected",
    [
        (["a", "b", "c"], np.array([False, False, False])),
        (["a", "b", None], np.array([False, False, True])),
    ],
)
def test_use_inf_as_na(values, expected, dtype):
    """Check ``isna`` while the deprecated ``mode.use_inf_as_na`` option is on."""
    # https://github.com/pandas-dev/pandas/issues/33655
    arr = pd.array(values, dtype=dtype)
    expect_warning = tm.assert_produces_warning(
        FutureWarning, match="use_inf_as_na option is deprecated"
    )

    # enter the warning check first so that setting the option happens inside it
    with expect_warning, pd.option_context("mode.use_inf_as_na", True):
        tm.assert_numpy_array_equal(arr.isna(), expected)

        mask_series = pd.Series(expected)
        tm.assert_series_equal(pd.Series(arr).isna(), mask_series)

        tm.assert_frame_equal(pd.DataFrame(arr).isna(), pd.DataFrame(mask_series))
def test_value_counts_sort_false(dtype):
    """``value_counts(sort=False)`` keeps first-appearance order."""
    # the dtype of the counts follows the string dtype's flavour and storage
    if dtype.na_value is np.nan:
        count_dtype = "int64"
    else:
        count_dtype = "int64[pyarrow]" if dtype.storage == "pyarrow" else "Int64"

    ser = pd.Series(["a", "b", "c", "b"], dtype=dtype)
    tm.assert_series_equal(
        ser.value_counts(sort=False),
        pd.Series([1, 2, 1], index=ser[:3], dtype=count_dtype, name="count"),
    )
def test_memory_usage(dtype):
    """Deep memory usage exceeds shallow usage, which is at least ``nbytes``."""
    # GH 33963
    if dtype.storage == "pyarrow":
        pytest.skip(f"not applicable for {dtype.storage}")

    ser = pd.Series(["a", "b", "c"], dtype=dtype)
    shallow = ser.memory_usage()
    deep = ser.memory_usage(deep=True)
    assert 0 < ser.nbytes <= shallow < deep
@pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64])
def test_astype_from_float_dtype(float_dtype, dtype):
    """Casting the float 0.1 to the string dtype gives "0.1" at any precision."""
    # https://github.com/pandas-dev/pandas/issues/36451
    floats = pd.Series([0.1], dtype=float_dtype)
    tm.assert_series_equal(floats.astype(dtype), pd.Series(["0.1"], dtype=dtype))
def test_to_numpy_returns_pdna_default(dtype):
    """``np.array`` on a string array keeps the dtype's own missing sentinel."""
    strings = pd.array(["a", pd.NA, "b"], dtype=dtype)
    tm.assert_numpy_array_equal(
        np.array(strings), np.array(["a", dtype.na_value, "b"], dtype=object)
    )
def test_to_numpy_na_value(dtype, nulls_fixture):
    """``to_numpy(na_value=...)`` substitutes the requested missing sentinel."""
    strings = pd.array(["a", pd.NA, "b"], dtype=dtype)
    tm.assert_numpy_array_equal(
        strings.to_numpy(na_value=nulls_fixture),
        np.array(["a", nulls_fixture, "b"], dtype=object),
    )
def test_isin(dtype, fixed_now_ts):
    """``Series.isin`` with strings, a missing value, nothing, and a timestamp."""
    s = pd.Series(["a", "b", None], dtype=dtype)

    # (candidate values, expected membership mask)
    cases = [
        (["a", "c"], [True, False, False]),
        (["a", pd.NA], [True, False, True]),
        ([], [False, False, False]),
        (["a", fixed_now_ts], [True, False, False]),
        ([fixed_now_ts], [False, False, False]),
    ]
    for candidates, mask in cases:
        tm.assert_series_equal(s.isin(candidates), pd.Series(mask))
def test_isin_string_array(dtype, dtype2):
    """``Series.isin`` accepts a string array of another string dtype."""
    s = pd.Series(["a", "b", None], dtype=dtype)

    # (candidate values, expected membership mask)
    for candidates, mask in (
        (["a", "c"], [True, False, False]),
        (["a", None], [True, False, True]),
    ):
        result = s.isin(pd.array(candidates, dtype=dtype2))
        tm.assert_series_equal(result, pd.Series(mask))
def test_isin_arrow_string_array(dtype):
    """``Series.isin`` accepts a ``pd.ArrowDtype(pa.string())`` array."""
    pa = pytest.importorskip("pyarrow")
    s = pd.Series(["a", "b", None], dtype=dtype)

    # (candidate values, expected membership mask)
    for candidates, mask in (
        (["a", "c"], [True, False, False]),
        (["a", None], [True, False, True]),
    ):
        result = s.isin(pd.array(candidates, dtype=pd.ArrowDtype(pa.string())))
        tm.assert_series_equal(result, pd.Series(mask))
def test_setitem_scalar_with_mask_validation(dtype):
    """Boolean-mask assignment validates the scalar being set."""
    # https://github.com/pandas-dev/pandas/issues/47628
    mask = np.array([False, True, False])

    # setting None with a boolean mask (through _putmask) should still store
    # the dtype's own missing sentinel in the underlying array
    ser = pd.Series(["a", "b", "c"], dtype=dtype)
    ser[mask] = None
    assert ser.array[1] is ser.dtype.na_value

    # a non-string scalar must raise instead
    ser = pd.Series(["a", "b", "c"], dtype=dtype)
    with pytest.raises(TypeError, match="Invalid value '1' for dtype 'str"):
        ser[mask] = 1
def test_from_numpy_str(dtype):
    """``pd.array`` gives the same result from a ``np.str_`` ndarray as a list."""
    letters = ["a", "b", "c"]
    from_ndarray = pd.array(np.array(letters, dtype=np.str_), dtype=dtype)
    tm.assert_extension_array_equal(from_ndarray, pd.array(letters, dtype=dtype))
def test_tolist(dtype):
    """``tolist`` returns the original Python strings."""
    letters = ["a", "b", "c"]
    tm.assert_equal(pd.array(letters, dtype=dtype).tolist(), letters)

View File

@ -0,0 +1,282 @@
import pickle
import re
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.string_ import (
StringArray,
StringDtype,
)
from pandas.core.arrays.string_arrow import (
ArrowStringArray,
ArrowStringArrayNumpySemantics,
)
def test_eq_all_na():
    """Comparing an all-missing pyarrow string array with itself is all missing."""
    pytest.importorskip("pyarrow")
    all_na = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow"))
    tm.assert_extension_array_equal(
        all_na == all_na, pd.array([pd.NA, pd.NA], dtype="boolean[pyarrow]")
    )
def test_config(string_storage, using_infer_string):
    """The ``string_storage`` option selects the storage of ``StringDtype()``."""
    # with the default string_storage setting
    # always "python" at the moment
    assert StringDtype().storage == "python"

    with pd.option_context("string_storage", string_storage):
        assert StringDtype().storage == string_storage
        inferred = pd.array(["a", "b"])
        assert inferred.dtype.storage == string_storage

    # pd.array(..) by default always returns the NA-variant
    na_dtype = StringDtype(string_storage, na_value=pd.NA)
    array_cls = na_dtype.construct_array_type()
    tm.assert_equal(inferred, array_cls._from_sequence(["a", "b"], dtype=na_dtype))
def test_config_bad_storage_raises():
    """Setting ``mode.string_storage`` to an unknown value raises."""
    expected_message = re.escape("Value must be one of python|pyarrow")
    with pytest.raises(ValueError, match=expected_message):
        pd.options.mode.string_storage = "foo"
@pytest.mark.parametrize("chunked", [True, False])
@pytest.mark.parametrize("array_lib", ["numpy", "pyarrow"])
def test_constructor_not_string_type_raises(array_lib, chunked):
    """``ArrowStringArray`` rejects integer data built with numpy or pyarrow."""
    pa = pytest.importorskip("pyarrow")

    # swap the parametrized library name for the module itself
    array_lib = pa if array_lib == "pyarrow" else np

    arr = array_lib.array([1, 2, 3])
    if chunked:
        if array_lib is np:
            pytest.skip("chunked not applicable to numpy array")
        arr = pa.chunked_array(arr)

    # Both messages are literal text; escape them so regex metacharacters
    # ("." and the parentheses) are matched verbatim.
    if array_lib is np:
        msg = re.escape(
            "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
        )
    else:
        msg = re.escape(
            "ArrowStringArray requires a PyArrow (chunked) array of large_string type"
        )
    with pytest.raises(ValueError, match=msg):
        ArrowStringArray(arr)
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_not_string_type_value_dictionary_raises(chunked):
    """``ArrowStringArray`` rejects a dictionary array with integer values."""
    pa = pytest.importorskip("pyarrow")

    int_dictionary = pa.array([1, 2, 3], pa.dictionary(pa.int32(), pa.int32()))
    if chunked:
        int_dictionary = pa.chunked_array(int_dictionary)

    expected_message = re.escape(
        "ArrowStringArray requires a PyArrow (chunked) array of large_string type"
    )
    with pytest.raises(ValueError, match=expected_message):
        ArrowStringArray(int_dictionary)
@pytest.mark.parametrize("string_type", ["string", "large_string"])
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_valid_string_type_value_dictionary(string_type, chunked):
    """``ArrowStringArray`` accepts a dictionary array with string values."""
    pa = pytest.importorskip("pyarrow")

    value_type = getattr(pa, string_type)()
    encoded = pa.array(["1", "2", "3"], value_type).dictionary_encode()
    if chunked:
        encoded = pa.chunked_array(encoded)

    result = ArrowStringArray(encoded)
    # the dictionary type gets converted to a dense large string array
    assert pa.types.is_large_string(result._pa_array.type)
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_valid_string_view(chunked):
    """``ArrowStringArray`` accepts a pyarrow ``string_view`` array."""
    # requires pyarrow>=18 for casting string_view to string
    pa = pytest.importorskip("pyarrow", minversion="18")

    view = pa.array(["1", "2", "3"], pa.string_view())
    if chunked:
        view = pa.chunked_array(view)

    result = ArrowStringArray(view)
    # the string_view input must end up stored as large_string
    assert pa.types.is_large_string(result._pa_array.type)
def test_constructor_from_list():
    """A Series built from a list keeps the requested pyarrow ``StringDtype``."""
    # GH#27673
    pytest.importorskip("pyarrow")
    result_dtype = pd.Series(["E"], dtype=StringDtype(storage="pyarrow")).dtype
    assert isinstance(result_dtype, StringDtype)
    assert result_dtype.storage == "pyarrow"
def test_from_sequence_wrong_dtype_raises(using_infer_string):
    """Check which dtype specifications each array class's ``_from_sequence`` accepts.

    Calls outside ``pytest.raises`` are expected to succeed; calls inside it
    are expected to raise ``AssertionError``. The first half exercises
    ``ArrowStringArray``, the second half ``StringArray``. Several
    ``StringDtype()``-based checks only run when ``using_infer_string`` is
    false.
    """
    pytest.importorskip("pyarrow")

    # --- ArrowStringArray -------------------------------------------------
    # the bare "string" alias is accepted under either string_storage setting
    with pd.option_context("string_storage", "python"):
        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")

    with pd.option_context("string_storage", "pyarrow"):
        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")

    # an explicit python-storage alias is rejected ...
    with pytest.raises(AssertionError, match=None):
        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]")

    # ... while the explicit pyarrow-storage alias is accepted
    ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")

    # a StringDtype() instance created while the option is "python" is rejected
    if not using_infer_string:
        with pytest.raises(AssertionError, match=None):
            with pd.option_context("string_storage", "python"):
                ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

    with pd.option_context("string_storage", "pyarrow"):
        ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

    if not using_infer_string:
        with pytest.raises(AssertionError, match=None):
            ArrowStringArray._from_sequence(
                ["a", None, "c"], dtype=StringDtype("python")
            )

    ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))

    # --- StringArray --------------------------------------------------------
    # the bare "string" alias is accepted under either string_storage setting
    with pd.option_context("string_storage", "python"):
        StringArray._from_sequence(["a", None, "c"], dtype="string")

    with pd.option_context("string_storage", "pyarrow"):
        StringArray._from_sequence(["a", None, "c"], dtype="string")

    # the explicit python-storage alias is accepted ...
    StringArray._from_sequence(["a", None, "c"], dtype="string[python]")

    # ... while an explicit pyarrow-storage alias is rejected
    with pytest.raises(AssertionError, match=None):
        StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")

    if not using_infer_string:
        with pd.option_context("string_storage", "python"):
            StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

    # a StringDtype() instance created while the option is "pyarrow" is rejected
    if not using_infer_string:
        with pytest.raises(AssertionError, match=None):
            with pd.option_context("string_storage", "pyarrow"):
                StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

    StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))

    with pytest.raises(AssertionError, match=None):
        StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))
@td.skip_if_installed("pyarrow")
def test_pyarrow_not_installed_raises():
    """Every pyarrow-backed entry point raises ImportError without pyarrow."""
    expected_message = re.escape("pyarrow>=10.0.1 is required for PyArrow backed")

    # each callable touches a different pyarrow-backed constructor
    entry_points = (
        lambda: StringDtype(storage="pyarrow"),
        lambda: ArrowStringArray([]),
        lambda: ArrowStringArrayNumpySemantics([]),
        lambda: ArrowStringArray._from_sequence(["a", None, "b"]),
    )
    for construct in entry_points:
        with pytest.raises(ImportError, match=expected_message):
            construct()
@pytest.mark.parametrize("multiple_chunks", [False, True])
@pytest.mark.parametrize(
    "key, value, expected",
    [
        (-1, "XX", ["a", "b", "c", "d", "XX"]),
        (1, "XX", ["a", "XX", "c", "d", "e"]),
        (1, None, ["a", None, "c", "d", "e"]),
        (1, pd.NA, ["a", None, "c", "d", "e"]),
        ([1, 3], "XX", ["a", "XX", "c", "XX", "e"]),
        ([1, 3], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
        ([1, 3], ["XX", None], ["a", "XX", "c", None, "e"]),
        ([1, 3], ["XX", pd.NA], ["a", "XX", "c", None, "e"]),
        ([0, -1], ["XX", "YY"], ["XX", "b", "c", "d", "YY"]),
        ([-1, 0], ["XX", "YY"], ["YY", "b", "c", "d", "XX"]),
        (slice(3, None), "XX", ["a", "b", "c", "XX", "XX"]),
        (slice(2, 4), ["XX", "YY"], ["a", "b", "XX", "YY", "e"]),
        (slice(3, 1, -1), ["XX", "YY"], ["a", "b", "YY", "XX", "e"]),
        (slice(None), "XX", ["XX", "XX", "XX", "XX", "XX"]),
        ([False, True, False, True, False], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
    ],
)
def test_setitem(multiple_chunks, key, value, expected):
    """``ArrowStringArray.__setitem__`` with scalar, list, slice and mask keys."""
    pa = pytest.importorskip("pyarrow")

    def build(values):
        # wrap the values, optionally split into two chunks after element 3
        pa_values = pa.array(values)
        if multiple_chunks:
            pa_values = pa.chunked_array([pa_values[:3], pa_values[3:]])
        return ArrowStringArray(pa_values)

    result = build(list("abcde"))
    expected = build(expected)

    result[key] = value
    tm.assert_equal(result, expected)
def test_setitem_invalid_indexer_raises():
    """Out-of-bounds or wrong-length keys and values are rejected on assignment."""
    pa = pytest.importorskip("pyarrow")
    arr = ArrowStringArray(pa.array(list("abcde")))

    # positions outside the 5 elements, and a boolean mask of the wrong length
    for bad_key in (5, -6, [0, 5], [0, -6], [True, True, False]):
        with pytest.raises(IndexError, match=None):
            arr[bad_key] = "foo"

    # three values for two positions
    with pytest.raises(ValueError, match=None):
        arr[[0, 1]] = ["foo", "bar", "baz"]
@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
def test_pickle_roundtrip(na_value):
    """A pyarrow string Series and a slice of it both survive pickling."""
    # GH 42600
    pytest.importorskip("pyarrow")
    dtype = StringDtype("pyarrow", na_value=na_value)
    whole = pd.Series(range(10), dtype=dtype)
    head = whole.head(2)

    whole_bytes = pickle.dumps(whole)
    head_bytes = pickle.dumps(head)
    # the two-element slice must pickle to fewer bytes than the full Series
    assert len(whole_bytes) > len(head_bytes)

    tm.assert_series_equal(pickle.loads(whole_bytes), whole)
    tm.assert_series_equal(pickle.loads(head_bytes), head)
def test_string_dtype_error_message():
    """An unknown storage name is rejected with a message listing valid ones."""
    # GH#55051
    pytest.importorskip("pyarrow")
    # The message is literal text; escape it so the trailing "." is matched
    # as a period rather than as "any character".
    msg = re.escape("Storage must be 'python' or 'pyarrow'.")
    with pytest.raises(ValueError, match=msg):
        StringDtype("bla")

View File

@ -0,0 +1,519 @@
import datetime
import decimal
import re
import numpy as np
import pytest
import pytz
from pandas._config import using_string_dtype
import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import register_extension_dtype
from pandas.arrays import (
BooleanArray,
DatetimeArray,
FloatingArray,
IntegerArray,
IntervalArray,
SparseArray,
TimedeltaArray,
)
from pandas.core.arrays import (
NumpyExtensionArray,
period_array,
)
from pandas.tests.extension.decimal import (
DecimalArray,
DecimalDtype,
to_decimal,
)
# Hour and minute resolutions for both datetime64 ("M8") and timedelta64 ("m8").
@pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]", "m8[m]"])
def test_dt64_array(dtype_unit):
    """``pd.array`` warns for datetime64/timedelta64 units it does not support."""
    # PR 53817
    dtype_var = np.dtype(dtype_unit)
    msg = (
        r"datetime64 and timedelta64 dtype resolutions other than "
        r"'s', 'ms', 'us', and 'ns' are deprecated. "
        r"In future releases passing unsupported resolutions will "
        r"raise an exception."
    )
    # the message is literal text, so escape it before regex matching
    with tm.assert_produces_warning(FutureWarning, match=re.escape(msg)):
        pd.array([], dtype=dtype_var)
@pytest.mark.parametrize(
    "data, dtype, expected",
    [
        # Basic NumPy defaults.
        ([], None, FloatingArray._from_sequence([], dtype="Float64")),
        ([1, 2], None, IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, 2], object, NumpyExtensionArray(np.array([1, 2], dtype=object))),
        (
            [1, 2],
            np.dtype("float32"),
            NumpyExtensionArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
        ),
        (
            np.array([], dtype=object),
            None,
            NumpyExtensionArray(np.array([], dtype=object)),
        ),
        (
            np.array([1, 2], dtype="int64"),
            None,
            IntegerArray._from_sequence([1, 2], dtype="Int64"),
        ),
        (
            np.array([1.0, 2.0], dtype="float64"),
            None,
            FloatingArray._from_sequence([1.0, 2.0], dtype="Float64"),
        ),
        # String alias passes through to NumPy
        ([1, 2], "float32", NumpyExtensionArray(np.array([1, 2], dtype="float32"))),
        ([1, 2], "int64", NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # GH#44715 FloatingArray does not support float16, so fall
        #  back to NumpyExtensionArray
        (
            np.array([1, 2], dtype=np.float16),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.float16)),
        ),
        # idempotency with e.g. pd.array(pd.array([1, 2], dtype="int64"))
        (
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
        ),
        # Period alias
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            "Period[D]",
            period_array(["2000", "2001"], freq="D"),
        ),
        # Period dtype
        (
            [pd.Period("2000", "D")],
            pd.PeriodDtype("D"),
            period_array(["2000"], freq="D"),
        ),
        # Datetime (naive)
        (
            [1, 2],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            [1, 2],
            np.dtype("datetime64[s]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[s]"), dtype="M8[s]"
            ),
        ),
        (
            np.array([1, 2], dtype="datetime64[ns]"),
            None,
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            None,
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            ["2000", "2001"],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        # Datetime (tz-aware)
        (
            ["2000", "2001"],
            pd.DatetimeTZDtype(tz="CET"),
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        # Timedelta
        (
            ["1h", "2h"],
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[s]"),
            np.dtype("timedelta64[s]"),
            TimedeltaArray._from_sequence(
                np.array([1, 2], dtype="m8[s]"), dtype="m8[s]"
            ),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            None,
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            None,
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            np.dtype("m8[s]"),
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        # Category
        (["a", "b"], "category", pd.Categorical(["a", "b"])),
        (
            ["a", "b"],
            pd.CategoricalDtype(None, ordered=True),
            pd.Categorical(["a", "b"], ordered=True),
        ),
        # Interval
        (
            [pd.Interval(1, 2), pd.Interval(3, 4)],
            "interval",
            IntervalArray.from_tuples([(1, 2), (3, 4)]),
        ),
        # Sparse
        ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
        # IntegerNA
        ([1, None], "Int16", pd.array([1, None], dtype="Int16")),
        (
            pd.Series([1, 2]),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int64)),
        ),
        # String
        (
            ["a", None],
            "string",
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            # the expected value for the "str" alias depends on whether the
            # using_string_dtype() option is active
            ["a", None],
            "str",
            pd.StringDtype(na_value=np.nan)
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan))
            if using_string_dtype()
            else NumpyExtensionArray(np.array(["a", "None"])),
        ),
        (
            ["a", None],
            pd.StringDtype(),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype(na_value=np.nan),
            pd.StringDtype(na_value=np.nan)
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
        ),
        (
            # numpy array with string dtype
            np.array(["a", "b"], dtype=str),
            pd.StringDtype(),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        (
            # numpy array with string dtype
            np.array(["a", "b"], dtype=str),
            pd.StringDtype(na_value=np.nan),
            pd.StringDtype(na_value=np.nan)
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
        ),
        # Boolean
        (
            [True, None],
            "boolean",
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        (
            [True, None],
            pd.BooleanDtype(),
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        # Index
        (pd.Index([1, 2]), None, NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # Series[EA] returns the EA
        (
            pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
            None,
            pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
        ),
        # "3rd party" EAs work
        ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
        # pass an ExtensionArray, but a different dtype
        (
            period_array(["2000", "2001"], freq="D"),
            "category",
            pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
        ),
    ],
)
def test_array(data, dtype, expected):
    """
    Check that ``pd.array(data, dtype=dtype)`` equals ``expected``.

    Each parametrized case supplies the input data, the dtype argument
    (``None`` leaves the dtype unspecified) and the array the call should
    produce.
    """
    result = pd.array(data, dtype=dtype)
    tm.assert_equal(result, expected)
def test_array_copy():
    """
    Check the memory-sharing behaviour of the ``copy`` argument of pd.array.

    With the default and with ``copy=True`` the result must not share memory
    with the input ndarray; with ``copy=False`` it must.
    """
    source = np.array([1, 2])
    np_dtype = source.dtype

    # default is to copy
    by_default = pd.array(source, dtype=np_dtype)
    assert not tm.shares_memory(source, by_default)

    # copy=True
    copied = pd.array(source, dtype=np_dtype, copy=True)
    assert not tm.shares_memory(source, copied)

    # copy=False
    viewed = pd.array(source, dtype=np_dtype, copy=False)
    assert tm.shares_memory(source, viewed)
# pytz timezone object for "CET"; used as ``tzinfo`` and as the dtype ``tz``
# in the datetimetz inference cases below
cet = pytz.timezone("CET")


@pytest.mark.parametrize(
    "data, expected",
    [
        # period
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            period_array(["2000", "2001"], freq="D"),
        ),
        # interval
        ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
        # datetime
        (
            [pd.Timestamp("2000"), pd.Timestamp("2001")],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            np.array([1, 2], dtype="M8[ns]"),
            DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")),
        ),
        (
            np.array([1, 2], dtype="M8[us]"),
            DatetimeArray._simple_new(
                np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
            ),
        ),
        # datetimetz
        (
            [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns")
            ),
        ),
        (
            [
                datetime.datetime(2000, 1, 1, tzinfo=cet),
                datetime.datetime(2001, 1, 1, tzinfo=cet),
            ],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns")
            ),
        ),
        # timedelta
        (
            [pd.Timedelta("1h"), pd.Timedelta("2h")],
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[ns]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")),
        ),
        (
            np.array([1, 2], dtype="m8[us]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")),
        ),
        # integer
        ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")),
        ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
        ([1, np.nan], IntegerArray._from_sequence([1, np.nan], dtype="Int64")),
        # float
        ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")),
        ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        # integer-like float
        ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        # mixed-integer-float
        ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        (
            [1, np.nan, 2.0],
            FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"),
        ),
        # string
        (
            ["a", "b"],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            # numpy array with string dtype
            np.array(["a", "b"], dtype=str),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        # Boolean
        ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
        ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
    ],
)
def test_array_inference(data, expected):
    """
    Check that ``pd.array(data)``, called without a dtype, equals ``expected``.

    Each parametrized case pairs input data with the array the call should
    produce when no dtype is given.
    """
    result = pd.array(data)
    tm.assert_equal(result, expected)
@pytest.mark.parametrize(
    "data",
    [
        # mix of frequencies
        [pd.Period("2000", "D"), pd.Period("2001", "Y")],
        # mix of closed
        [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
        # Mix of timezones
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
        # Mix of tz-aware and tz-naive
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
        np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
    ],
)
def test_array_inference_fails(data):
    """
    For these mixed inputs ``pd.array(data)`` must equal a
    NumpyExtensionArray wrapping the data as an object-dtype ndarray.
    """
    inferred = pd.array(data)
    object_backed = NumpyExtensionArray(np.array(data, dtype=object))
    tm.assert_extension_array_equal(inferred, object_backed)
@pytest.mark.parametrize("data", [np.array(0)])
def test_nd_raises(data):
    """pd.array raises ValueError for a zero-dimensional ndarray input."""
    expected_msg = "NumpyExtensionArray must be 1-dimensional"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(data, dtype="int64")
def test_scalar_raises():
    """pd.array raises ValueError when passed the scalar 1."""
    expected_msg = "Cannot pass scalar '1'"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(1)
def test_dataframe_raises():
    """pd.array raises TypeError when passed a DataFrame."""
    # GH#51167 don't accidentally cast to StringArray by doing inference on columns
    frame = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
    with pytest.raises(TypeError, match="Cannot pass DataFrame to 'pandas.array'"):
        pd.array(frame)
def test_bounds_check():
    """pd.array raises TypeError when a negative value is requested as UInt16."""
    # GH21796
    expected_msg = r"cannot safely cast non-equivalent int(32|64) to uint16"
    with pytest.raises(TypeError, match=expected_msg):
        pd.array([-1, 2, 3], dtype="UInt16")
# ---------------------------------------------------------------------------
# A couple dummy classes to ensure that Series and Indexes are unboxed before
# getting to the EA classes.
@register_extension_dtype
class DecimalDtype2(DecimalDtype):
    # Registered under the name "decimal2"; its array type is DecimalArray2.
    name = "decimal2"

    @classmethod
    def construct_array_type(cls):
        """
        Give the array class paired with this dtype.

        Returns
        -------
        type
            The DecimalArray2 class.
        """
        array_type = DecimalArray2
        return array_type
class DecimalArray2(DecimalArray):
    # DecimalArray variant whose _from_sequence refuses Series/Index input.

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """
        Build the array via the parent implementation.

        Raises
        ------
        TypeError
            If ``scalars`` is a pd.Series or a pd.Index.
        """
        is_boxed = isinstance(scalars, pd.Series) or isinstance(scalars, pd.Index)
        if is_boxed:
            raise TypeError("scalars should not be of type pd.Series or pd.Index")
        return super()._from_sequence(scalars, dtype=dtype, copy=copy)
def test_array_unboxes(index_or_series):
    """
    pd.array with dtype "decimal2" succeeds on a boxed (Index/Series) input
    even though DecimalArray2._from_sequence rejects such input directly.
    """
    boxed = index_or_series([decimal.Decimal("1"), decimal.Decimal("2")])
    decimal2 = DecimalDtype2()

    # make sure it works
    reject_msg = "scalars should not be of type pd.Series or pd.Index"
    with pytest.raises(TypeError, match=reject_msg):
        DecimalArray2._from_sequence(boxed, dtype=decimal2)

    unboxed_result = pd.array(boxed, dtype="decimal2")
    from_values = DecimalArray2._from_sequence(boxed.values, dtype=decimal2)
    tm.assert_equal(unboxed_result, from_values)
def test_array_to_numpy_na():
    """
    Converting a python-backed string array holding pd.NA to a bool ndarray
    with ``na_value=True`` yields an all-True result.
    """
    # GH#40638
    strings = pd.array([pd.NA, 1], dtype="string[python]")
    as_bool = strings.to_numpy(na_value=True, dtype=bool)
    tm.assert_numpy_array_equal(as_bool, np.array([True, True]))

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,840 @@
"""
Tests for DatetimeArray
"""
from __future__ import annotations
from datetime import timedelta
import operator
try:
from zoneinfo import ZoneInfo
except ImportError:
# Cannot assign to a type
ZoneInfo = None # type: ignore[misc, assignment]
import numpy as np
import pytest
from pandas._libs.tslibs import tz_compare
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
)
class TestNonNano:
    # Tests for DatetimeArray objects whose unit is "s", "ms" or "us".

    @pytest.fixture(params=["s", "ms", "us"])
    def unit(self, request):
        """Fixture returning parametrized time units"""
        return request.param

    @pytest.fixture
    def dtype(self, unit, tz_naive_fixture):
        """
        Fixture returning a numpy datetime64 dtype when the timezone is None,
        otherwise a DatetimeTZDtype, in both cases with the given unit.
        """
        tz = tz_naive_fixture
        if tz is None:
            return np.dtype(f"datetime64[{unit}]")
        else:
            return DatetimeTZDtype(unit=unit, tz=tz)

    @pytest.fixture
    def dta_dti(self, unit, dtype):
        """
        Fixture returning a (DatetimeArray, DatetimeIndex) pair built from the
        same 55 daily timestamps; the array uses ``dtype``.
        """
        tz = getattr(dtype, "tz", None)

        dti = pd.date_range("2016-01-01", periods=55, freq="D", tz=tz)
        if tz is None:
            arr = np.asarray(dti).astype(f"M8[{unit}]")
        else:
            # convert to UTC and drop the tz before casting to the target unit
            arr = np.asarray(dti.tz_convert("UTC").tz_localize(None)).astype(
                f"M8[{unit}]"
            )

        dta = DatetimeArray._simple_new(arr, dtype=dtype)
        return dta, dti

    @pytest.fixture
    def dta(self, dta_dti):
        """Fixture returning only the DatetimeArray half of ``dta_dti``."""
        dta, dti = dta_dti
        return dta

    def test_non_nano(self, unit, dtype):
        """An array built with ``_simple_new`` keeps its dtype, unit and tz."""
        arr = np.arange(5, dtype=np.int64).view(f"M8[{unit}]")
        dta = DatetimeArray._simple_new(arr, dtype=dtype)

        assert dta.dtype == dtype
        assert dta[0].unit == unit
        assert tz_compare(dta.tz, dta[0].tz)
        assert (dta[0] == dta[:1]).all()

    @pytest.mark.parametrize(
        "field", DatetimeArray._field_ops + DatetimeArray._bool_ops
    )
    def test_fields(self, unit, field, dtype, dta_dti):
        """Each field/bool attribute matches the DatetimeIndex's array."""
        dta, dti = dta_dti

        assert (dti == dta).all()

        res = getattr(dta, field)
        expected = getattr(dti._data, field)
        tm.assert_numpy_array_equal(res, expected)

    def test_normalize(self, unit):
        """``normalize`` matches the normalized index cast to the same unit."""
        dti = pd.date_range("2016-01-01 06:00:00", periods=55, freq="D")
        arr = np.asarray(dti).astype(f"M8[{unit}]")

        dta = DatetimeArray._simple_new(arr, dtype=arr.dtype)

        assert not dta.is_normalized

        # TODO: simplify once we can just .astype to other unit
        exp = np.asarray(dti.normalize()).astype(f"M8[{unit}]")
        expected = DatetimeArray._simple_new(exp, dtype=exp.dtype)

        res = dta.normalize()
        tm.assert_extension_array_equal(res, expected)

    def test_simple_new_requires_match(self, unit):
        """``_simple_new`` raises AssertionError on a unit/dtype mismatch."""
        arr = np.arange(5, dtype=np.int64).view(f"M8[{unit}]")
        dtype = DatetimeTZDtype(unit, "UTC")

        dta = DatetimeArray._simple_new(arr, dtype=dtype)
        assert dta.dtype == dtype

        wrong = DatetimeTZDtype("ns", "UTC")
        # No message is checked here; ``match=None`` says so explicitly,
        # whereas an empty pattern would silently match any message.
        with pytest.raises(AssertionError, match=None):
            DatetimeArray._simple_new(arr, dtype=wrong)

    def test_std_non_nano(self, unit):
        """``std`` keeps the array's resolution."""
        dti = pd.date_range("2016-01-01", periods=55, freq="D")
        arr = np.asarray(dti).astype(f"M8[{unit}]")

        dta = DatetimeArray._simple_new(arr, dtype=arr.dtype)

        # we should match the nano-reso std, but floored to our reso.
        res = dta.std()
        assert res._creso == dta._creso
        assert res == dti.std().floor(unit)

    @pytest.mark.filterwarnings("ignore:Converting to PeriodArray.*:UserWarning")
    def test_to_period(self, dta_dti):
        """``to_period("D")`` matches the result from the index's array."""
        dta, dti = dta_dti
        result = dta.to_period("D")
        expected = dti._data.to_period("D")

        tm.assert_extension_array_equal(result, expected)

    def test_iter(self, dta):
        """Iteration yields Timestamps equal to positional indexing."""
        res = next(iter(dta))
        expected = dta[0]

        assert type(res) is pd.Timestamp
        assert res._value == expected._value
        assert res._creso == expected._creso
        assert res == expected

    def test_astype_object(self, dta):
        """``astype(object)`` elements keep the array's resolution and values."""
        result = dta.astype(object)
        assert all(x._creso == dta._creso for x in result)
        assert all(x == y for x, y in zip(result, dta))

    def test_to_pydatetime(self, dta_dti):
        """``to_pydatetime`` matches the DatetimeIndex result."""
        dta, dti = dta_dti

        result = dta.to_pydatetime()
        expected = dti.to_pydatetime()
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("meth", ["time", "timetz", "date"])
    def test_time_date(self, dta_dti, meth):
        """The ``time``/``timetz``/``date`` attributes match the index's."""
        dta, dti = dta_dti

        result = getattr(dta, meth)
        expected = getattr(dti, meth)
        tm.assert_numpy_array_equal(result, expected)

    def test_format_native_types(self, unit, dtype, dta_dti):
        # In this case we should get the same formatted values with our nano
        #  version dti._data as we do with the non-nano dta
        dta, dti = dta_dti

        res = dta._format_native_types()
        exp = dti._data._format_native_types()
        tm.assert_numpy_array_equal(res, exp)

    def test_repr(self, dta_dti, unit):
        """The repr equals the index array's repr with "[ns" swapped for the unit."""
        dta, dti = dta_dti

        assert repr(dta) == repr(dti._data).replace("[ns", f"[{unit}")

    # TODO: tests with td64
    def test_compare_mismatched_resolutions(self, comparison_op):
        # comparison that numpy gets wrong bc of silent overflows
        op = comparison_op

        iinfo = np.iinfo(np.int64)
        vals = np.array([iinfo.min, iinfo.min + 1, iinfo.max], dtype=np.int64)

        # Construct so that arr2[1] < arr[1] < arr[2] < arr2[2]
        arr = np.array(vals).view("M8[ns]")
        arr2 = arr.view("M8[s]")

        left = DatetimeArray._simple_new(arr, dtype=arr.dtype)
        right = DatetimeArray._simple_new(arr2, dtype=arr2.dtype)

        if comparison_op is operator.eq:
            expected = np.array([False, False, False])
        elif comparison_op is operator.ne:
            expected = np.array([True, True, True])
        elif comparison_op in [operator.lt, operator.le]:
            expected = np.array([False, False, True])
        else:
            expected = np.array([False, True, False])

        result = op(left, right)
        tm.assert_numpy_array_equal(result, expected)

        result = op(left[1], right)
        tm.assert_numpy_array_equal(result, expected)

        if op not in [operator.eq, operator.ne]:
            # check that numpy still gets this wrong; if it is fixed we may be
            #  able to remove compare_mismatched_resolutions
            np_res = op(left._ndarray, right._ndarray)
            tm.assert_numpy_array_equal(np_res[1:], ~expected[1:])

    def test_add_mismatched_reso_doesnt_downcast(self):
        # https://github.com/pandas-dev/pandas/pull/48748#issuecomment-1260181008
        td = pd.Timedelta(microseconds=1)
        dti = pd.date_range("2016-01-01", periods=3) - td
        dta = dti._data.as_unit("us")

        res = dta + td.as_unit("us")
        # even though the result is an even number of days
        #  (so we _could_ downcast to unit="s"), we do not.
        assert res.unit == "us"

    @pytest.mark.parametrize(
        "scalar",
        [
            timedelta(hours=2),
            pd.Timedelta(hours=2),
            np.timedelta64(2, "h"),
            np.timedelta64(2 * 3600 * 1000, "ms"),
            pd.offsets.Minute(120),
            pd.offsets.Hour(2),
        ],
    )
    def test_add_timedeltalike_scalar_mismatched_reso(self, dta_dti, scalar):
        """Adding/subtracting a timedelta-like scalar yields the finer unit."""
        dta, dti = dta_dti

        td = pd.Timedelta(scalar)
        exp_unit = tm.get_finest_unit(dta.unit, td.unit)

        expected = (dti + td)._data.as_unit(exp_unit)
        result = dta + scalar
        tm.assert_extension_array_equal(result, expected)

        result = scalar + dta
        tm.assert_extension_array_equal(result, expected)

        expected = (dti - td)._data.as_unit(exp_unit)
        result = dta - scalar
        tm.assert_extension_array_equal(result, expected)

    def test_sub_datetimelike_scalar_mismatch(self):
        """Subtracting a "s"-unit Timestamp from a "us" array gives m8[us]."""
        dti = pd.date_range("2016-01-01", periods=3)
        dta = dti._data.as_unit("us")

        ts = dta[0].as_unit("s")

        result = dta - ts
        expected = (dti - dti[0])._data.as_unit("us")
        assert result.dtype == "m8[us]"
        tm.assert_extension_array_equal(result, expected)

    def test_sub_datetime64_reso_mismatch(self):
        """Subtracting "s" and "ms" arrays gives an "ms" TimedeltaArray."""
        dti = pd.date_range("2016-01-01", periods=3)
        left = dti._data.as_unit("s")
        right = left.as_unit("ms")

        result = left - right
        exp_values = np.array([0, 0, 0], dtype="m8[ms]")
        expected = TimedeltaArray._simple_new(
            exp_values,
            dtype=exp_values.dtype,
        )
        tm.assert_extension_array_equal(result, expected)
        result2 = right - left
        tm.assert_extension_array_equal(result2, expected)
class TestDatetimeArrayComparisons:
    # TODO: merge this into tests/arithmetic/test_datetime64 once it is
    #  sufficiently robust

    def test_cmp_dt64_arraylike_tznaive(self, comparison_op):
        """
        Compare a tz-naive DatetimeArray with itself and with equal-valued
        array-likes, with the array on either side of the operator.
        """
        # arbitrary tz-naive DatetimeIndex
        index = pd.date_range("2016-01-1", freq="MS", periods=9, tz=None)
        values = index._data
        assert values.freq == index.freq
        assert values.tz == index.tz

        # for "ne", "gt" and "lt" the comparisons should be all-False,
        # for every other operator all-True
        all_false = comparison_op.__name__ in ["ne", "gt", "lt"]
        expected = np.full(len(values), not all_false, dtype=bool)

        tm.assert_numpy_array_equal(comparison_op(values, values), expected)

        equal_valued = [
            index,
            np.array(index),
            list(index),
            tuple(index),
            index.astype(object),
        ]
        for other in equal_valued:
            tm.assert_numpy_array_equal(comparison_op(values, other), expected)
            tm.assert_numpy_array_equal(comparison_op(other, values), expected)
class TestDatetimeArray:
def test_astype_ns_to_ms_near_bounds(self):
    """
    Casting an M8[ns] array holding a 1677 timestamp to M8[ms] matches an
    array constructed directly with the M8[ms] dtype.
    """
    # GH#55979
    stamp = pd.Timestamp("1677-09-21 00:12:43.145225")
    stamp_ms = stamp.as_unit("ms")

    nano_arr = DatetimeArray._from_sequence([stamp], dtype="M8[ns]")
    assert (nano_arr.view("i8") == stamp.as_unit("ns").value).all()

    converted = nano_arr.astype("M8[ms]")
    assert converted[0] == stamp_ms

    built_as_ms = DatetimeArray._from_sequence([stamp], dtype="M8[ms]")
    assert (built_as_ms.view("i8") == stamp_ms._value).all()
    tm.assert_datetime_array_equal(converted, built_as_ms)
def test_astype_non_nano_tznaive(self):
    """
    ``astype("M8[s]")`` gives an M8[s] dtype for both a tz-naive
    DatetimeIndex and its underlying array; the latter stays a DatetimeArray.
    """
    index = pd.date_range("2016-01-01", periods=3)
    assert index.astype("M8[s]").dtype == "M8[s]"

    converted = index._data.astype("M8[s]")
    assert converted.dtype == "M8[s]"
    assert isinstance(converted, pd.core.arrays.DatetimeArray)  # used to be ndarray
def test_astype_non_nano_tzaware(self):
    """
    ``astype`` to a second-resolution tz-aware dtype works for both a
    DatetimeIndex and its array, and ``copy=False`` between two
    second-resolution tz-aware dtypes shares memory with the source.
    """
    dti = pd.date_range("2016-01-01", periods=3, tz="UTC")

    res = dti.astype("M8[s, US/Pacific]")
    assert res.dtype == "M8[s, US/Pacific]"

    dta = dti._data
    res = dta.astype("M8[s, US/Pacific]")
    assert res.dtype == "M8[s, US/Pacific]"

    # from non-nano to non-nano, preserving reso
    res2 = res.astype("M8[s, UTC]")
    assert res2.dtype == "M8[s, UTC]"
    assert not tm.shares_memory(res2, res)

    res3 = res.astype("M8[s, UTC]", copy=False)
    # check the dtype of the copy=False result itself, not of res2 again
    assert res3.dtype == "M8[s, UTC]"
    assert tm.shares_memory(res3, res)
def test_astype_to_same(self):
    """``astype`` to an equal dtype with ``copy=False`` returns the same object."""
    central_arr = DatetimeArray._from_sequence(
        ["2000"], dtype=DatetimeTZDtype(tz="US/Central")
    )
    same_dtype = DatetimeTZDtype(tz="US/Central")
    assert central_arr.astype(same_dtype, copy=False) is central_arr
@pytest.mark.parametrize("dtype", ["datetime64[ns]", "datetime64[ns, UTC]"])
@pytest.mark.parametrize(
    "other", ["datetime64[ns]", "datetime64[ns, UTC]", "datetime64[ns, CET]"]
)
def test_astype_copies(self, dtype, other):
    """
    ``Series.astype`` between datetime dtypes returns data that can be
    overwritten without changing the source; casts that switch between
    tz-naive and tz-aware raise TypeError instead.
    """
    # https://github.com/pandas-dev/pandas/pull/32490
    ser = pd.Series([1, 2], dtype=dtype)
    snapshot = ser.copy()

    source_is_naive = dtype == "datetime64[ns]"
    target_is_naive = other == "datetime64[ns]"

    if source_is_naive != target_is_naive:
        # deprecated in favor of tz_localize
        if source_is_naive:
            msg = "Use obj.tz_localize instead or series.dt.tz_localize instead"
        else:
            msg = "from timezone-aware dtype to timezone-naive dtype"
        with pytest.raises(TypeError, match=msg):
            ser.astype(other)
        return

    converted = ser.astype(other)
    converted[:] = pd.NaT
    tm.assert_series_equal(ser, snapshot)
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
def test_astype_int(self, dtype):
    """
    ``astype`` to int64 returns the i8 view of the backing ndarray; any
    other integer dtype raises TypeError.
    """
    stamps = [pd.Timestamp("2000"), pd.Timestamp("2001")]
    arr = DatetimeArray._from_sequence(stamps, dtype="M8[ns]")

    if np.dtype(dtype) == np.int64:
        converted = arr.astype(dtype)
        tm.assert_numpy_array_equal(converted, arr._ndarray.view("i8"))
    else:
        with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"):
            arr.astype(dtype)
def test_astype_to_sparse_dt64(self):
    """Casting to "Sparse[datetime64[ns]]" keeps the values and sets that dtype."""
    # GH#50082
    dense = pd.date_range("2016-01-01", periods=4)._data

    sparse = dense.astype("Sparse[datetime64[ns]]")

    assert sparse.dtype == "Sparse[datetime64[ns]]"
    assert (sparse == dense).all()
def test_tz_setter_raises(self):
    """Assigning to ``tz`` raises an AttributeError mentioning tz_localize."""
    central = DatetimeTZDtype(tz="US/Central")
    arr = DatetimeArray._from_sequence(["2000"], dtype=central)
    with pytest.raises(AttributeError, match="tz_localize"):
        arr.tz = "UTC"
def test_setitem_str_impute_tz(self, tz_naive_fixture):
    """
    Setting tz-naive strings gives the same result as setting the
    corresponding Timestamp localized to the array's timezone.
    """
    # Like for getitem, if we are passed a naive-like string, we impute
    #  our own timezone.
    tz = tz_naive_fixture

    raw = np.array([1, 2, 3], dtype="M8[ns]")
    if tz is None:
        arr_dtype = raw.dtype
    else:
        arr_dtype = DatetimeTZDtype(tz=tz)
    arr = DatetimeArray._from_sequence(raw, dtype=arr_dtype)
    expected = arr.copy()

    localized = pd.Timestamp("2020-09-08 16:50").tz_localize(tz)
    naive_string = str(localized.tz_localize(None))

    # Setting a scalar tznaive string
    expected[0] = localized
    arr[0] = naive_string
    tm.assert_equal(arr, expected)

    # Setting a listlike of tznaive strings
    expected[1] = localized
    arr[:2] = [naive_string, naive_string]
    tm.assert_equal(arr, expected)
def test_setitem_different_tz_raises(self):
    """
    Setting a tz-naive Timestamp into a tz-aware array raises TypeError;
    setting a Timestamp with a different timezone is accepted.
    """
    # pre-2.0 we required exact tz match, in 2.0 we require only
    #  tzawareness-match
    raw = np.array([1, 2, 3], dtype="M8[ns]")
    central = DatetimeTZDtype(tz="US/Central")
    arr = DatetimeArray._from_sequence(raw, copy=False, dtype=central)

    with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"):
        arr[0] = pd.Timestamp("2000")

    eastern_stamp = pd.Timestamp("2000", tz="US/Eastern")
    arr[0] = eastern_stamp
    assert arr[0] == eastern_stamp.tz_convert("US/Central")
def test_setitem_clears_freq(self):
    """After an element is assigned, the array's ``freq`` is None."""
    daily = pd.date_range("2000", periods=2, freq="D", tz="US/Central")
    arr = daily._data
    arr[0] = pd.Timestamp("2000", tz="US/Central")
    assert arr.freq is None
@pytest.mark.parametrize(
    "obj",
    [
        pd.Timestamp("2021-01-01"),
        pd.Timestamp("2021-01-01").to_datetime64(),
        pd.Timestamp("2021-01-01").to_pydatetime(),
    ],
)
def test_setitem_objects(self, obj):
    """The assigned object compares equal to the element read back."""
    # make sure we accept datetime64 and datetime in addition to Timestamp
    values = pd.date_range("2000", periods=2, freq="D")._data
    values[0] = obj
    assert values[0] == obj
def test_repeat_preserves_tz(self):
    """``repeat([1, 1])`` equals an array rebuilt from the i8 values and dtype."""
    values = pd.date_range("2000", periods=2, freq="D", tz="US/Central")._data

    once_each = values.repeat([1, 1])

    # preserves tz and values, but not freq
    rebuilt = DatetimeArray._from_sequence(values.asi8, dtype=values.dtype)
    tm.assert_equal(once_each, rebuilt)
def test_value_counts_preserves_tz(self):
    """
    ``value_counts`` returns a tz-aware index equal to the source index,
    and with ``dropna=False`` also counts NaT.
    """
    index = pd.date_range("2000", periods=2, freq="D", tz="US/Central")
    repeated = index._data.repeat([4, 3])

    counts = repeated.value_counts()
    # Note: not tm.assert_index_equal, since `freq`s do not match
    assert counts.index.equals(index)

    repeated[-2] = pd.NaT
    counts_with_na = repeated.value_counts(dropna=False)
    expected = pd.Series(
        [4, 2, 1], index=[index[0], index[1], pd.NaT], name="count"
    )
    tm.assert_series_equal(counts_with_na, expected)
@pytest.mark.parametrize("method", ["pad", "backfill"])
def test_fillna_preserves_tz(self, method):
    """
    ``_pad_or_backfill`` fills a NaT from its neighbour, keeps the
    US/Central dtype and leaves the inputs unmodified.
    """
    index = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central")
    with_gap = DatetimeArray._from_sequence(index, copy=True)
    with_gap[2] = pd.NaT

    # "pad" takes the previous value, "backfill" the next one
    if method == "pad":
        filler = index[1]
    else:
        filler = index[3]
    expected = DatetimeArray._from_sequence(
        [index[0], index[1], filler, index[3], index[4]],
        dtype=DatetimeTZDtype(tz="US/Central"),
    )

    filled = with_gap._pad_or_backfill(method=method)
    tm.assert_extension_array_equal(filled, expected)

    # assert that arr and dti were not modified in-place
    assert with_gap[2] is pd.NaT
    assert index[2] == pd.Timestamp("2000-01-03", tz="US/Central")
def test_fillna_2d(self):
    """
    ``_pad_or_backfill`` on a 3x2 tz-aware array.

    Checks both fill methods, repeats them on an F-ordered copy of the
    backing ndarray, and checks DataFrame.ffill / DataFrame.bfill against
    the same expected arrays.
    """
    dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
    dta = dti._data.reshape(3, 2).copy()
    dta[0, 1] = pd.NaT
    dta[1, 0] = pd.NaT

    res1 = dta._pad_or_backfill(method="pad")
    expected1 = dta.copy()
    # [0, 1] is left as NaT in the expected result
    expected1[1, 0] = dta[0, 0]
    tm.assert_extension_array_equal(res1, expected1)

    res2 = dta._pad_or_backfill(method="backfill")
    expected2 = dta.copy()
    expected2[1, 0] = dta[2, 0]
    expected2[0, 1] = dta[1, 1]
    tm.assert_extension_array_equal(res2, expected2)

    # with different ordering for underlying ndarray; behavior should
    #  be unchanged
    dta2 = dta._from_backing_data(dta._ndarray.copy(order="F"))
    assert dta2._ndarray.flags["F_CONTIGUOUS"]
    assert not dta2._ndarray.flags["C_CONTIGUOUS"]
    tm.assert_extension_array_equal(dta, dta2)

    res3 = dta2._pad_or_backfill(method="pad")
    tm.assert_extension_array_equal(res3, expected1)

    res4 = dta2._pad_or_backfill(method="backfill")
    tm.assert_extension_array_equal(res4, expected2)

    # test the DataFrame method while we're here
    df = pd.DataFrame(dta)
    res = df.ffill()
    expected = pd.DataFrame(expected1)
    tm.assert_frame_equal(res, expected)

    res = df.bfill()
    expected = pd.DataFrame(expected2)
    tm.assert_frame_equal(res, expected)
def test_array_interface_tz(self):
    """
    ``np.asarray`` on a tz-aware array gives object-dtype Timestamps by
    default and with ``dtype=object``, and M8[ns] values with
    ``dtype="M8[ns]"``.
    """
    tz = "US/Central"
    values = pd.date_range("2017", periods=2, tz=tz)._data

    as_objects = np.array(
        [
            pd.Timestamp("2017-01-01T00:00:00", tz=tz),
            pd.Timestamp("2017-01-02T00:00:00", tz=tz),
        ],
        dtype=object,
    )
    tm.assert_numpy_array_equal(np.asarray(values), as_objects)
    tm.assert_numpy_array_equal(np.asarray(values, dtype=object), as_objects)

    as_m8 = np.array(["2017-01-01T06:00:00", "2017-01-02T06:00:00"], dtype="M8[ns]")
    tm.assert_numpy_array_equal(np.asarray(values, dtype="M8[ns]"), as_m8)
def test_array_interface(self):
    """
    ``np.asarray`` on a tz-naive array gives datetime64[ns] values by
    default and object-dtype Timestamps with ``dtype=object``.
    """
    values = pd.date_range("2017", periods=2)._data

    as_m8 = np.array(
        ["2017-01-01T00:00:00", "2017-01-02T00:00:00"], dtype="datetime64[ns]"
    )
    tm.assert_numpy_array_equal(np.asarray(values), as_m8)

    as_objects = np.array(
        [pd.Timestamp("2017-01-01T00:00:00"), pd.Timestamp("2017-01-02T00:00:00")],
        dtype=object,
    )
    tm.assert_numpy_array_equal(np.asarray(values, dtype=object), as_objects)
@pytest.mark.parametrize("index", [True, False])
def test_searchsorted_different_tz(self, index):
    """
    ``searchsorted`` gives the same positions whether the needle is in the
    array's own timezone or converted to UTC.
    """
    day_nanos = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
    haystack = pd.DatetimeIndex(day_nanos, freq="D")._data.tz_localize("Asia/Tokyo")
    if index:
        haystack = pd.Index(haystack)

    # scalar needle
    scalar = haystack[2]
    assert haystack.searchsorted(scalar.tz_convert("UTC")) == haystack.searchsorted(
        scalar
    )

    # array-like needle
    window = haystack[2:6]
    expected = haystack.searchsorted(window)
    result = haystack.searchsorted(window.tz_convert("UTC"))
    tm.assert_equal(result, expected)
@pytest.mark.parametrize("index", [True, False])
def test_searchsorted_tzawareness_compat(self, index):
    """
    ``searchsorted`` raises TypeError when haystack and needle differ in
    tz-awareness, for scalar and array-like needles in both directions.
    """
    day_nanos = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
    naive = pd.DatetimeIndex(day_nanos, freq="D")._data
    if index:
        naive = pd.Index(naive)

    aware = naive.tz_localize("Asia/Tokyo")

    msg = "Cannot compare tz-naive and tz-aware datetime-like objects"
    mismatched_pairs = [
        (naive, aware[0]),
        (naive, aware),
        (aware, naive[0]),
        (aware, naive),
    ]
    for haystack, needle in mismatched_pairs:
        with pytest.raises(TypeError, match=msg):
            haystack.searchsorted(needle)
@pytest.mark.parametrize(
    "other",
    [
        1,
        np.int64(1),
        1.0,
        np.timedelta64("NaT"),
        pd.Timedelta(days=2),
        "invalid",
        np.arange(10, dtype="i8") * 24 * 3600 * 10**9,
        np.arange(10).view("timedelta64[ns]") * 24 * 3600 * 10**9,
        pd.Timestamp("2021-01-01").to_period("D"),
    ],
)
@pytest.mark.parametrize("index", [True, False])
def test_searchsorted_invalid_types(self, other, index):
    """``searchsorted`` raises TypeError for each of the parametrized needles."""
    day_nanos = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
    haystack = pd.DatetimeIndex(day_nanos, freq="D")._data
    if index:
        haystack = pd.Index(haystack)

    # either of these messages is accepted
    accepted_messages = (
        "searchsorted requires compatible dtype or scalar",
        "value should be a 'Timestamp', 'NaT', or array of those. Got",
    )
    msg = "|".join(accepted_messages)
    with pytest.raises(TypeError, match=msg):
        haystack.searchsorted(other)
def test_shift_fill_value(self):
    """
    ``shift(1, fill_value=...)`` with the last element as the fill value
    equals the backing data rolled by one, for tz-naive and UTC arrays.
    """
    naive = pd.date_range("2016-01-01", periods=3)._data
    rolled = DatetimeArray._from_sequence(np.roll(naive._ndarray, 1))

    last = naive[-1]
    for fill_value in [last, last.to_pydatetime(), last.to_datetime64()]:
        shifted = naive.shift(1, fill_value=fill_value)
        tm.assert_datetime_array_equal(shifted, rolled)

    aware = naive.tz_localize("UTC")
    rolled_aware = rolled.tz_localize("UTC")
    last_aware = aware[-1]
    for fill_value in [last_aware, last_aware.to_pydatetime()]:
        shifted = aware.shift(1, fill_value=fill_value)
        tm.assert_datetime_array_equal(shifted, rolled_aware)
def test_shift_value_tzawareness_mismatch(self):
    """
    ``shift`` raises TypeError when the fill value's tz-awareness differs
    from the array's, in both directions.
    """
    naive = pd.date_range("2016-01-01", periods=3)._data

    # tz-aware fill values into a tz-naive array
    aware_fill = naive[-1].tz_localize("UTC")
    for invalid in [aware_fill, aware_fill.to_pydatetime()]:
        with pytest.raises(TypeError, match="Cannot compare"):
            naive.shift(1, fill_value=invalid)

    # tz-naive fill values into a tz-aware array
    aware = naive.tz_localize("UTC")
    naive_fill = aware[-1].tz_localize(None)
    for invalid in [naive_fill, naive_fill.to_pydatetime(), naive_fill.to_datetime64()]:
        with pytest.raises(TypeError, match="Cannot compare"):
            aware.shift(1, fill_value=invalid)
def test_shift_requires_tzmatch(self):
    """
    Shifting a UTC array with a US/Pacific fill value gives the same result
    as shifting with that value converted to UTC.
    """
    # pre-2.0 we required exact tz match, in 2.0 we require just
    #  matching tzawareness
    utc_values = pd.date_range("2016-01-01", periods=3, tz="UTC")._data

    pacific_fill = pd.Timestamp("2020-10-18 18:44", tz="US/Pacific")

    shifted = utc_values.shift(1, fill_value=pacific_fill)
    shifted_utc_fill = utc_values.shift(1, fill_value=pacific_fill.tz_convert("UTC"))
    tm.assert_equal(shifted, shifted_utc_fill)
def test_tz_localize_t2d(self):
dti = pd.date_range("1994-05-12", periods=12, tz="US/Pacific")
dta = dti._data.reshape(3, 4)
result = dta.tz_localize(None)
expected = dta.ravel().tz_localize(None).reshape(dta.shape)
tm.assert_datetime_array_equal(result, expected)
roundtrip = expected.tz_localize("US/Pacific")
tm.assert_datetime_array_equal(roundtrip, dta)
# Time zones used to parametrize test_iter_zoneinfo_fold; a ZoneInfo object is
# added only when ZoneInfo is available and "US/Eastern" can be constructed.
easts = ["US/Eastern", "dateutil/US/Eastern"]
if ZoneInfo is not None:
    try:
        tz = ZoneInfo("US/Eastern")
    except KeyError:
        # no tzdata
        pass
    else:
        # Argument 1 to "append" of "list" has incompatible type "ZoneInfo";
        # expected "str"
        easts.append(tz)  # type: ignore[arg-type]

@pytest.mark.parametrize("tz", easts)
def test_iter_zoneinfo_fold(self, tz):
    """Indexing, iteration and astype(object) must agree on the UTC offset."""
    # GH#49684
    epoch_seconds = np.array(
        [1320552000, 1320555600, 1320559200, 1320562800], dtype=np.int64
    )
    utc_vals = epoch_seconds * 1_000_000_000
    as_utc = DatetimeArray._from_sequence(utc_vals).tz_localize("UTC")
    dta = as_utc.tz_convert(tz)

    left = dta[2]
    right = list(dta)[2]
    assert str(left) == str(right)
    # previously there was a bug where with non-pytz right would be
    #   Timestamp('2011-11-06 01:00:00-0400', tz='US/Eastern')
    # while left would be
    #   Timestamp('2011-11-06 01:00:00-0500', tz='US/Eastern')
    # The .value's would match (so they would compare as equal),
    #  but the folds would not
    assert left.utcoffset() == right.utcoffset()

    # The same bug in ints_to_pydatetime affected .astype, so we test
    #  that here.
    right2 = dta.astype(object)[2]
    assert str(left) == str(right2)
    assert left.utcoffset() == right2.utcoffset()
@pytest.mark.parametrize(
    "freq, freq_depr",
    [
        ("2ME", "2M"),
        ("2SME", "2SM"),
        ("2SME", "2sm"),
        ("2QE", "2Q"),
        ("2QE-SEP", "2Q-SEP"),
        ("1YE", "1Y"),
        ("2YE-MAR", "2Y-MAR"),
        ("1YE", "1A"),
        ("2YE-MAR", "2A-MAR"),
        ("2ME", "2m"),
        ("2QE-SEP", "2q-sep"),
        ("2YE-MAR", "2a-mar"),
        ("2YE", "2y"),
    ],
)
def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr):
    """Deprecated M/Q/Y/A aliases warn and behave like their replacements.

    Parameters
    ----------
    freq : str
        The current frequency string, used to build the expected index.
    freq_depr : str
        The deprecated spelling that must emit a FutureWarning.
    """
    # GH#9586, GH#54275
    # The fragments are parenthesised so they form ONE string. As two bare
    # statements the second f-string was a no-op expression and only the
    # first fragment was ever matched against the warning.
    depr_msg = (
        f"'{freq_depr[1:]}' is deprecated and will be removed "
        f"in a future version, please use '{freq[1:]}' instead."
    )

    expected = pd.date_range("1/1/2000", periods=4, freq=freq)
    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
        result = pd.date_range("1/1/2000", periods=4, freq=freq_depr)
    tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"])
def test_date_range_uppercase_frequency_deprecated(self, freq_depr):
    """Deprecated upper/mixed-case aliases warn and match the lowercase form.

    Parameters
    ----------
    freq_depr : str
        The deprecated spelling; its lowercase form builds the expected index.
    """
    # GH#9586, GH#54939
    # Only this prefix has ever been matched: the continuation used to be a
    # separate no-op f-string statement and has been removed as dead code.
    # NOTE(review): its wording ("future version. Please use '<lower>'
    # instead.") differs from the ", please use" wording in the sibling
    # tests -- confirm the exact text pandas emits before asserting on it.
    depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a "

    expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.lower())
    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
        result = pd.date_range("1/1/2000", periods=4, freq=freq_depr)
    tm.assert_index_equal(result, expected)
@pytest.mark.parametrize(
    "freq_depr",
    [
        "2ye-mar",
        "2ys",
        "2qe",
        "2qs-feb",
        "2bqs",
        "2sms",
        "2bms",
        "2cbme",
        "2me",
        "2w",
    ],
)
def test_date_range_lowercase_frequency_deprecated(self, freq_depr):
    """Deprecated lowercase aliases warn and match the uppercase form.

    Parameters
    ----------
    freq_depr : str
        The deprecated spelling; its uppercase form builds the expected index.
    """
    # GH#9586, GH#54939
    # Parenthesised so both fragments form ONE string; previously the second
    # f-string was a stand-alone no-op statement and was never matched.
    depr_msg = (
        f"'{freq_depr[1:]}' is deprecated and will be removed in a "
        f"future version, please use '{freq_depr.upper()[1:]}' instead."
    )

    expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.upper())
    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
        result = pd.date_range("1/1/2000", periods=4, freq=freq_depr)
    tm.assert_index_equal(result, expected)
def test_factorize_sort_without_freq():
    """factorize(sort=True) raises NotImplementedError on both array types."""
    msg = r"call pd.factorize\(obj, sort=True\) instead"
    dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]")
    # Do TimedeltaArray while we're here
    tda = dta - dta[0]

    for arr in (dta, tda):
        with pytest.raises(NotImplementedError, match=msg):
            arr.factorize(sort=True)

View File

@ -0,0 +1,75 @@
"""
Tests for subclasses of NDArrayBackedExtensionArray
"""
import numpy as np
from pandas import (
CategoricalIndex,
date_range,
)
from pandas.core.arrays import (
Categorical,
DatetimeArray,
NumpyExtensionArray,
TimedeltaArray,
)
class TestEmpty:
def test_empty_categorical(self):
ci = CategoricalIndex(["a", "b", "c"], ordered=True)
dtype = ci.dtype
# case with int8 codes
shape = (4,)
result = Categorical._empty(shape, dtype=dtype)
assert isinstance(result, Categorical)
assert result.shape == shape
assert result._ndarray.dtype == np.int8
# case where repr would segfault if we didn't override base implementation
result = Categorical._empty((4096,), dtype=dtype)
assert isinstance(result, Categorical)
assert result.shape == (4096,)
assert result._ndarray.dtype == np.int8
repr(result)
# case with int16 codes
ci = CategoricalIndex(list(range(512)) * 4, ordered=False)
dtype = ci.dtype
result = Categorical._empty(shape, dtype=dtype)
assert isinstance(result, Categorical)
assert result.shape == shape
assert result._ndarray.dtype == np.int16
def test_empty_dt64tz(self):
dti = date_range("2016-01-01", periods=2, tz="Asia/Tokyo")
dtype = dti.dtype
shape = (0,)
result = DatetimeArray._empty(shape, dtype=dtype)
assert result.dtype == dtype
assert isinstance(result, DatetimeArray)
assert result.shape == shape
def test_empty_dt64(self):
shape = (3, 9)
result = DatetimeArray._empty(shape, dtype="datetime64[ns]")
assert isinstance(result, DatetimeArray)
assert result.shape == shape
def test_empty_td64(self):
shape = (3, 9)
result = TimedeltaArray._empty(shape, dtype="m8[ns]")
assert isinstance(result, TimedeltaArray)
assert result.shape == shape
def test_empty_pandas_array(self):
arr = NumpyExtensionArray(np.array([1, 2]))
dtype = arr.dtype
shape = (3, 9)
result = NumpyExtensionArray._empty(shape, dtype=dtype)
assert isinstance(result, NumpyExtensionArray)
assert result.dtype == dtype
assert result.shape == shape

View File

@ -0,0 +1,184 @@
import numpy as np
import pytest
from pandas._libs.tslibs import iNaT
from pandas._libs.tslibs.period import IncompatibleFrequency
from pandas.core.dtypes.base import _registry as registry
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import PeriodArray
# ----------------------------------------------------------------------------
# Dtype
def test_registered():
    """PeriodDtype is in the dtype registry and is found by its string name."""
    assert PeriodDtype in registry.dtypes
    assert registry.find("Period[D]") == PeriodDtype("D")
# ----------------------------------------------------------------------------
# period_array
def test_asi8():
result = PeriodArray._from_sequence(["2000", "2001", None], dtype="period[D]").asi8
expected = np.array([10957, 11323, iNaT])
tm.assert_numpy_array_equal(result, expected)
def test_take_raises():
    """take rejects fill values of the wrong frequency or the wrong type."""
    parr = PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]")
    indexer = [0, -1]

    weekly = pd.Period("2000", freq="W")
    with pytest.raises(IncompatibleFrequency, match="freq"):
        parr.take(indexer, allow_fill=True, fill_value=weekly)

    msg = "value should be a 'Period' or 'NaT'. Got 'str' instead"
    with pytest.raises(TypeError, match=msg):
        parr.take(indexer, allow_fill=True, fill_value="foo")
def test_fillna_raises():
    """fillna with an array of a different length raises ValueError."""
    parr = PeriodArray._from_sequence(["2000", "2001", "2002"], dtype="period[D]")
    too_short = parr[:2]
    with pytest.raises(ValueError, match="Length"):
        parr.fillna(too_short)
def test_fillna_copies():
arr = PeriodArray._from_sequence(["2000", "2001", "2002"], dtype="period[D]")
result = arr.fillna(pd.Period("2000", "D"))
assert result is not arr
# ----------------------------------------------------------------------------
# setitem
@pytest.mark.parametrize(
    "key, value, expected",
    [
        ([0], pd.Period("2000", "D"), [10957, 1, 2]),
        ([0], None, [iNaT, 1, 2]),
        ([0], np.nan, [iNaT, 1, 2]),
        ([0, 1, 2], pd.Period("2000", "D"), [10957] * 3),
        (
            [0, 1, 2],
            [pd.Period("2000", "D"), pd.Period("2001", "D"), pd.Period("2002", "D")],
            [10957, 11323, 11688],
        ),
    ],
)
def test_setitem(key, value, expected):
    """Setting ``key`` to ``value`` yields the ordinals listed in ``expected``."""
    target = PeriodArray(np.arange(3), dtype="period[D]")
    target[key] = value

    wanted = PeriodArray(expected, dtype="period[D]")
    tm.assert_period_array_equal(target, wanted)
def test_setitem_raises_incompatible_freq():
    """Assigning a scalar or array with another freq raises IncompatibleFrequency."""
    daily = PeriodArray(np.arange(3), dtype="period[D]")

    # scalar with a different frequency
    with pytest.raises(IncompatibleFrequency, match="freq"):
        daily[0] = pd.Period("2000", freq="Y")

    # array with a different frequency
    yearly = PeriodArray._from_sequence(["2000", "2001"], dtype="period[Y]")
    with pytest.raises(IncompatibleFrequency, match="freq"):
        daily[[0, 1]] = yearly
def test_setitem_raises_length():
    """Assigning one value to a two-element key raises ValueError."""
    daily = PeriodArray(np.arange(3), dtype="period[D]")
    single_value = [pd.Period("2000", freq="D")]
    with pytest.raises(ValueError, match="length"):
        daily[[0, 1]] = single_value
def test_setitem_raises_type():
    """Assigning a plain integer raises TypeError."""
    daily = PeriodArray(np.arange(3), dtype="period[D]")
    with pytest.raises(TypeError, match="int"):
        daily[0] = 1
# ----------------------------------------------------------------------------
# Ops
def test_sub_period():
    """Subtracting a Period of another freq raises IncompatibleFrequency."""
    daily = PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]")
    monthly = pd.Period("2000", freq="M")
    with pytest.raises(IncompatibleFrequency, match="freq"):
        daily - monthly
def test_sub_period_overflow():
    """Subtraction that overflows int64 raises OverflowError in both orders."""
    # GH#47538
    pi = pd.date_range("1677-09-22", periods=2, freq="D").to_period("ns")
    per = pd.Period._from_ordinal(10**14, pi.freq)
    overflow_msg = "Overflow in int64 addition"

    with pytest.raises(OverflowError, match=overflow_msg):
        pi - per

    with pytest.raises(OverflowError, match=overflow_msg):
        per - pi
# ----------------------------------------------------------------------------
# Methods
@pytest.mark.parametrize(
    "other",
    [
        pd.Period("2000", freq="h"),
        PeriodArray._from_sequence(["2000", "2001", "2000"], dtype="period[h]"),
    ],
)
def test_where_different_freq_raises(other):
    """The array-level _where raises on a freq mismatch; Series.where coerces."""
    # GH#45768 The PeriodArray method raises, the Series method coerces
    daily = PeriodArray._from_sequence(["2000", "2001", "2002"], dtype="period[D]")
    ser = pd.Series(daily)
    mask = np.array([True, False, True])

    with pytest.raises(IncompatibleFrequency, match="freq"):
        ser.array._where(mask, other)

    coerced = ser.where(mask, other)
    as_object = ser.astype(object).where(mask, other)
    tm.assert_series_equal(coerced, as_object)
# ----------------------------------------------------------------------------
# Printing
def test_repr_small():
arr = PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]")
result = str(arr)
expected = (
"<PeriodArray>\n['2000-01-01', '2001-01-01']\nLength: 2, dtype: period[D]"
)
assert result == expected
def test_repr_large():
arr = PeriodArray._from_sequence(["2000", "2001"] * 500, dtype="period[D]")
result = str(arr)
expected = (
"<PeriodArray>\n"
"['2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', "
"'2000-01-01',\n"
" '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', "
"'2001-01-01',\n"
" ...\n"
" '2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', "
"'2000-01-01',\n"
" '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', "
"'2001-01-01']\n"
"Length: 1000, dtype: period[D]"
)
assert result == expected

View File

@ -0,0 +1,313 @@
from datetime import timedelta
import numpy as np
import pytest
import pandas as pd
from pandas import Timedelta
import pandas._testing as tm
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
)
class TestNonNano:
    """TimedeltaArray tests run with "s", "ms" and "us" resolutions."""

    @pytest.fixture(params=["s", "ms", "us"])
    def unit(self, request):
        return request.param

    @pytest.fixture
    def tda(self, unit):
        values = np.arange(5, dtype=np.int64).view(f"m8[{unit}]")
        return TimedeltaArray._simple_new(values, dtype=values.dtype)

    def test_non_nano(self, unit):
        values = np.arange(5, dtype=np.int64).view(f"m8[{unit}]")
        tda = TimedeltaArray._simple_new(values, dtype=values.dtype)

        assert tda.dtype == values.dtype
        assert tda[0].unit == unit

    def test_as_unit_raises(self, tda):
        # GH#50616
        for obj in (tda, pd.Index(tda)):
            with pytest.raises(ValueError, match="Supported units"):
                obj.as_unit("D")

    @pytest.mark.parametrize("field", TimedeltaArray._field_ops)
    def test_fields(self, tda, field):
        nano_values = tda._ndarray.astype("m8[ns]")
        tda_nano = TimedeltaArray._simple_new(nano_values, dtype=nano_values.dtype)

        expected = getattr(tda_nano, field)
        result = getattr(tda, field)
        tm.assert_numpy_array_equal(result, expected)

    def test_to_pytimedelta(self, tda):
        nano_values = tda._ndarray.astype("m8[ns]")
        tda_nano = TimedeltaArray._simple_new(nano_values, dtype=nano_values.dtype)

        expected = tda_nano.to_pytimedelta()
        result = tda.to_pytimedelta()
        tm.assert_numpy_array_equal(result, expected)

    def test_total_seconds(self, unit, tda):
        nano_values = tda._ndarray.astype("m8[ns]")
        tda_nano = TimedeltaArray._simple_new(nano_values, dtype=nano_values.dtype)

        expected = tda_nano.total_seconds()
        result = tda.total_seconds()
        tm.assert_numpy_array_equal(result, expected)

    def test_timedelta_array_total_seconds(self):
        # GH34290
        two_min = Timedelta("2 min")
        result = pd.array([Timedelta("2 min")]).total_seconds()[0]
        assert result == two_min.total_seconds()

    def test_total_seconds_nanoseconds(self):
        # issue #48521
        start_time = pd.Series(["2145-11-02 06:00:00"]).astype("datetime64[ns]")
        end_time = pd.Series(["2145-11-02 07:06:00"]).astype("datetime64[ns]")

        expected = (end_time - start_time).values / np.timedelta64(1, "s")
        result = (end_time - start_time).dt.total_seconds().values
        assert result == expected

    @pytest.mark.parametrize(
        "nat", [np.datetime64("NaT", "ns"), np.datetime64("NaT", "us")]
    )
    def test_add_nat_datetimelike_scalar(self, nat, tda):
        # both operand orders must give an all-NaT DatetimeArray
        for result in (tda + nat, nat + tda):
            assert isinstance(result, DatetimeArray)
            assert result._creso == tda._creso
            assert result.isna().all()

    def test_add_pdnat(self, tda):
        # both operand orders must give an all-NaT TimedeltaArray
        for result in (tda + pd.NaT, pd.NaT + tda):
            assert isinstance(result, TimedeltaArray)
            assert result._creso == tda._creso
            assert result.isna().all()

    # TODO: 2022-07-11 this is the only test that gets to DTA.tz_convert
    #  or tz_localize with non-nano; implement tests specific to that.
    def test_add_datetimelike_scalar(self, tda, tz_naive_fixture):
        ts = pd.Timestamp("2016-01-01", tz=tz_naive_fixture).as_unit("ns")

        expected = tda.as_unit("ns") + ts
        for res in (tda + ts, ts + tda):
            tm.assert_extension_array_equal(res, expected)

        ts = ts + Timedelta(1)  # case where we can't cast losslessly

        exp_values = tda._ndarray + ts.asm8
        naive = DatetimeArray._simple_new(exp_values, dtype=exp_values.dtype)
        expected = naive.tz_localize("UTC").tz_convert(ts.tz)

        for result in (tda + ts, ts + tda):
            tm.assert_extension_array_equal(result, expected)

    def test_mul_scalar(self, tda):
        factor = 2
        result = tda * factor
        expected = TimedeltaArray._simple_new(tda._ndarray * factor, dtype=tda.dtype)
        tm.assert_extension_array_equal(result, expected)
        assert result._creso == tda._creso

    def test_mul_listlike(self, tda):
        factors = np.arange(len(tda))
        result = tda * factors
        expected = TimedeltaArray._simple_new(tda._ndarray * factors, dtype=tda.dtype)
        tm.assert_extension_array_equal(result, expected)
        assert result._creso == tda._creso

    def test_mul_listlike_object(self, tda):
        factors = np.arange(len(tda))
        result = tda * factors.astype(object)
        expected = TimedeltaArray._simple_new(tda._ndarray * factors, dtype=tda.dtype)
        tm.assert_extension_array_equal(result, expected)
        assert result._creso == tda._creso

    def test_div_numeric_scalar(self, tda):
        divisor = 2
        result = tda / divisor
        expected = TimedeltaArray._simple_new(tda._ndarray / divisor, dtype=tda.dtype)
        tm.assert_extension_array_equal(result, expected)
        assert result._creso == tda._creso

    def test_div_td_scalar(self, tda):
        one_second = timedelta(seconds=1)
        result = tda / one_second
        expected = tda._ndarray / np.timedelta64(1, "s")
        tm.assert_numpy_array_equal(result, expected)

    def test_div_numeric_array(self, tda):
        divisors = np.arange(len(tda))
        result = tda / divisors
        expected = TimedeltaArray._simple_new(tda._ndarray / divisors, dtype=tda.dtype)
        tm.assert_extension_array_equal(result, expected)
        assert result._creso == tda._creso

    def test_div_td_array(self, tda):
        divisors = tda._ndarray + tda._ndarray[-1]
        result = tda / divisors
        expected = tda._ndarray / divisors
        tm.assert_numpy_array_equal(result, expected)

    def test_add_timedeltaarraylike(self, tda):
        tda_nano = tda.astype("m8[ns]")

        # addition in either order
        doubled = tda_nano * 2
        for res in (tda_nano + tda, tda + tda_nano):
            tm.assert_extension_array_equal(res, doubled)

        # subtraction in either order
        zeros = tda_nano * 0
        for res in (tda - tda_nano, tda_nano - tda):
            tm.assert_extension_array_equal(res, zeros)
class TestTimedeltaArray:
    """astype, setitem and searchsorted checks for TimedeltaArray."""

    @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
    def test_astype_int(self, dtype):
        arr = TimedeltaArray._from_sequence(
            [Timedelta("1h"), Timedelta("2h")], dtype="m8[ns]"
        )

        if np.dtype(dtype) == np.int64:
            # int64 is the only integer dtype that is accepted
            result = arr.astype(dtype)
            expected = arr._ndarray.view("i8")
            tm.assert_numpy_array_equal(result, expected)
        else:
            with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"):
                arr.astype(dtype)

    def test_setitem_clears_freq(self):
        tda = pd.timedelta_range("1h", periods=2, freq="h")._data
        tda[0] = Timedelta("1h")
        assert tda.freq is None

    @pytest.mark.parametrize(
        "obj",
        [
            Timedelta(seconds=1),
            Timedelta(seconds=1).to_timedelta64(),
            Timedelta(seconds=1).to_pytimedelta(),
        ],
    )
    def test_setitem_objects(self, obj):
        # make sure we accept timedelta64 and timedelta in addition to Timedelta
        tda = pd.timedelta_range("2 Days", periods=4, freq="h")._data
        tda[0] = obj
        assert tda[0] == Timedelta(seconds=1)

    @pytest.mark.parametrize(
        "other",
        [
            1,
            np.int64(1),
            1.0,
            np.datetime64("NaT"),
            pd.Timestamp("2021-01-01"),
            "invalid",
            np.arange(10, dtype="i8") * 24 * 3600 * 10**9,
            (np.arange(10) * 24 * 3600 * 10**9).view("datetime64[ns]"),
            pd.Timestamp("2021-01-01").to_period("D"),
        ],
    )
    @pytest.mark.parametrize("index", [True, False])
    def test_searchsorted_invalid_types(self, other, index):
        day_values = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
        arr = pd.TimedeltaIndex(day_values, freq="D")._data
        if index:
            arr = pd.Index(arr)

        # either message is accepted
        patterns = [
            "searchsorted requires compatible dtype or scalar",
            "value should be a 'Timedelta', 'NaT', or array of those. Got",
        ]
        msg = "|".join(patterns)
        with pytest.raises(TypeError, match=msg):
            arr.searchsorted(other)
class TestUnaryOps:
    """abs, unary plus and unary minus on TimedeltaArray, operator and ufunc."""

    def test_abs(self):
        raw = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
        tda = TimedeltaArray._from_sequence(raw)

        raw_expected = np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
        expected = TimedeltaArray._from_sequence(raw_expected)

        tm.assert_timedelta_array_equal(abs(tda), expected)
        tm.assert_timedelta_array_equal(np.abs(tda), expected)

    def test_pos(self):
        raw = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
        tda = TimedeltaArray._from_sequence(raw)

        # the result must be equal but must not share memory with the input
        via_operator = +tda
        tm.assert_timedelta_array_equal(via_operator, tda)
        assert not tm.shares_memory(via_operator, tda)

        via_ufunc = np.positive(tda)
        tm.assert_timedelta_array_equal(via_ufunc, tda)
        assert not tm.shares_memory(via_ufunc, tda)

    def test_neg(self):
        raw = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
        tda = TimedeltaArray._from_sequence(raw)

        raw_expected = np.array([3600 * 10**9, "NaT", -7200 * 10**9], dtype="m8[ns]")
        expected = TimedeltaArray._from_sequence(raw_expected)

        tm.assert_timedelta_array_equal(-tda, expected)
        tm.assert_timedelta_array_equal(np.negative(tda), expected)

    def test_neg_freq(self):
        tdi = pd.timedelta_range("2 Days", periods=4, freq="h")
        tda = tdi._data
        expected = -tdi._data

        tm.assert_timedelta_array_equal(-tda, expected)
        tm.assert_timedelta_array_equal(np.negative(tda), expected)

View File

@ -0,0 +1,103 @@
import numpy as np
import pytest
import pandas._testing as tm
from pandas.core.arrays import TimedeltaArray
class TestTimedeltaArrayConstructor:
    """Validation done by TimedeltaArray.__init__ and _from_sequence."""

    def test_only_1dim_accepted(self):
        # GH#25282
        values = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]")
        depr_msg = "TimedeltaArray.__init__ is deprecated"

        with tm.assert_produces_warning(
            FutureWarning, match=depr_msg
        ), pytest.raises(ValueError, match="Only 1-dimensional"):
            # 3-dim, we allow 2D to sneak in for ops purposes GH#29853
            TimedeltaArray(values.reshape(2, 2, 1))

        with tm.assert_produces_warning(
            FutureWarning, match=depr_msg
        ), pytest.raises(ValueError, match="Only 1-dimensional"):
            # 0-dim
            TimedeltaArray(values[[0]].squeeze())

    def test_freq_validation(self):
        # ensure that the public constructor cannot create an invalid instance
        values = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10**9
        depr_msg = "TimedeltaArray.__init__ is deprecated"
        msg = (
            "Inferred frequency None from passed values does not "
            "conform to passed frequency D"
        )

        with tm.assert_produces_warning(
            FutureWarning, match=depr_msg
        ), pytest.raises(ValueError, match=msg):
            TimedeltaArray(values.view("timedelta64[ns]"), freq="D")

    def test_non_array_raises(self):
        depr_msg = "TimedeltaArray.__init__ is deprecated"
        with tm.assert_produces_warning(
            FutureWarning, match=depr_msg
        ), pytest.raises(ValueError, match="list"):
            TimedeltaArray([1, 2, 3])

    def test_other_type_raises(self):
        bool_values = np.array([1, 2, 3], dtype="bool")
        msg = r"dtype bool cannot be converted to timedelta64\[ns\]"
        with pytest.raises(TypeError, match=msg):
            TimedeltaArray._from_sequence(bool_values)

    def test_incorrect_dtype_raises(self):
        # (expected message, dtype argument) pairs, in the original order
        cases = [
            ("dtype 'category' is invalid, should be np.timedelta64 dtype", "category"),
            (
                "dtype 'int64' is invalid, should be np.timedelta64 dtype",
                np.dtype("int64"),
            ),
            (
                r"dtype 'datetime64\[ns\]' is invalid, should be np.timedelta64 dtype",
                np.dtype("M8[ns]"),
            ),
            (
                r"dtype 'datetime64\[us, UTC\]' is invalid, should be np.timedelta64 dtype",
                "M8[us, UTC]",
            ),
            (
                "Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'",
                np.dtype("m8[Y]"),
            ),
        ]
        for msg, bad_dtype in cases:
            with pytest.raises(ValueError, match=msg):
                TimedeltaArray._from_sequence(
                    np.array([1, 2, 3], dtype="i8"), dtype=bad_dtype
                )

    def test_mismatched_values_dtype_units(self):
        second_values = np.array([1, 2, 3], dtype="m8[s]")
        nano_dtype = np.dtype("m8[ns]")
        msg = r"Values resolution does not match dtype"
        depr_msg = "TimedeltaArray.__init__ is deprecated"

        with tm.assert_produces_warning(
            FutureWarning, match=depr_msg
        ), pytest.raises(ValueError, match=msg):
            TimedeltaArray(second_values, dtype=nano_dtype)

    def test_copy(self):
        data = np.array([1, 2, 3], dtype="m8[ns]")

        not_copied = TimedeltaArray._from_sequence(data, copy=False)
        assert not_copied._ndarray is data

        copied = TimedeltaArray._from_sequence(data, copy=True)
        assert copied._ndarray is not data
        assert copied._ndarray.base is not data

    def test_from_sequence_dtype(self):
        msg = "dtype 'object' is invalid, should be np.timedelta64 dtype"
        with pytest.raises(ValueError, match=msg):
            TimedeltaArray._from_sequence([], dtype=object)

View File

@ -0,0 +1,20 @@
import pytest
import pandas._testing as tm
from pandas.core.arrays import TimedeltaArray
class TestAccumulator:
    """Checks of TimedeltaArray._accumulate."""

    def test_accumulators_disallowed(self):
        # GH#50297
        tda = TimedeltaArray._from_sequence(["1D", "2D"], dtype="m8[ns]")
        with pytest.raises(TypeError, match="cumprod not supported"):
            tda._accumulate("cumprod")

    def test_cumsum(self, unit):
        # GH#50297
        dtype = f"m8[{unit}]"
        tda = TimedeltaArray._from_sequence(["1D", "2D"], dtype=dtype)
        running_total = TimedeltaArray._from_sequence(["1D", "3D"], dtype=dtype)

        result = tda._accumulate("cumsum")
        tm.assert_timedelta_array_equal(result, running_total)

Some files were not shown because too many files have changed in this diff Show More