done

2025-09-07 22:09:54 +02:00
parent e1b817252c
commit 2fc0d000b6
7796 changed files with 2159515 additions and 933 deletions
--- a/lib/python3.11/site-packages/pandas/tests/arrays/masked/__init__.py
+++ b/lib/python3.11/site-packages/pandas/tests/arrays/masked/__init__.py
--- a/lib/python3.11/site-packages/pandas/tests/arrays/masked/test_arithmetic.py
+++ b/lib/python3.11/site-packages/pandas/tests/arrays/masked/test_arithmetic.py
@ -0,0 +1,248 @@
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+# integer dtypes
+arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
+scalars: list[Any] = [2] * len(arrays)
+# floating dtypes
+arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
+scalars += [0.2, 0.2]
+# boolean
+arrays += [pd.array([True, False, True, None], dtype="boolean")]
+scalars += [False]
+
+
+@pytest.fixture(params=zip(arrays, scalars), ids=[a.dtype.name for a in arrays])
+def data(request):
+    """Fixture returning parametrized (array, scalar) tuple.
+
+    Used to test equivalence of scalars, numpy arrays with array ops, and the
+    equivalence of DataFrame and Series ops.
+    """
+    return request.param
+
+
+def check_skip(data, op_name):
+    if isinstance(data.dtype, pd.BooleanDtype) and "sub" in op_name:
+        pytest.skip("subtract not implemented for boolean")
+
+
+def is_bool_not_implemented(data, op_name):
+    # match non-masked behavior
+    return data.dtype.kind == "b" and op_name.strip("_").lstrip("r") in [
+        "pow",
+        "truediv",
+        "floordiv",
+    ]
+
+
+# Test equivalence of scalars, numpy arrays with array ops
+# -----------------------------------------------------------------------------
+
+
+def test_array_scalar_like_equivalence(data, all_arithmetic_operators):
+    data, scalar = data
+    op = tm.get_op_from_name(all_arithmetic_operators)
+    check_skip(data, all_arithmetic_operators)
+
+    scalar_array = pd.array([scalar] * len(data), dtype=data.dtype)
+
+    # TODO also add len-1 array (np.array([scalar], dtype=data.dtype.numpy_dtype))
+    for scalar in [scalar, data.dtype.type(scalar)]:
+        if is_bool_not_implemented(data, all_arithmetic_operators):
+            msg = "operator '.*' not implemented for bool dtypes"
+            with pytest.raises(NotImplementedError, match=msg):
+                op(data, scalar)
+            with pytest.raises(NotImplementedError, match=msg):
+                op(data, scalar_array)
+        else:
+            result = op(data, scalar)
+            expected = op(data, scalar_array)
+            tm.assert_extension_array_equal(result, expected)
+
+
+def test_array_NA(data, all_arithmetic_operators):
+    data, _ = data
+    op = tm.get_op_from_name(all_arithmetic_operators)
+    check_skip(data, all_arithmetic_operators)
+
+    scalar = pd.NA
+    scalar_array = pd.array([pd.NA] * len(data), dtype=data.dtype)
+
+    mask = data._mask.copy()
+
+    if is_bool_not_implemented(data, all_arithmetic_operators):
+        msg = "operator '.*' not implemented for bool dtypes"
+        with pytest.raises(NotImplementedError, match=msg):
+            op(data, scalar)
+        # GH#45421 check op doesn't alter data._mask inplace
+        tm.assert_numpy_array_equal(mask, data._mask)
+        return
+
+    result = op(data, scalar)
+    # GH#45421 check op doesn't alter data._mask inplace
+    tm.assert_numpy_array_equal(mask, data._mask)
+
+    expected = op(data, scalar_array)
+    tm.assert_numpy_array_equal(mask, data._mask)
+
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_numpy_array_equivalence(data, all_arithmetic_operators):
+    data, scalar = data
+    op = tm.get_op_from_name(all_arithmetic_operators)
+    check_skip(data, all_arithmetic_operators)
+
+    numpy_array = np.array([scalar] * len(data), dtype=data.dtype.numpy_dtype)
+    pd_array = pd.array(numpy_array, dtype=data.dtype)
+
+    if is_bool_not_implemented(data, all_arithmetic_operators):
+        msg = "operator '.*' not implemented for bool dtypes"
+        with pytest.raises(NotImplementedError, match=msg):
+            op(data, numpy_array)
+        with pytest.raises(NotImplementedError, match=msg):
+            op(data, pd_array)
+        return
+
+    result = op(data, numpy_array)
+    expected = op(data, pd_array)
+    tm.assert_extension_array_equal(result, expected)
+
+
+# Test equivalence with Series and DataFrame ops
+# -----------------------------------------------------------------------------
+
+
+def test_frame(data, all_arithmetic_operators):
+    data, scalar = data
+    op = tm.get_op_from_name(all_arithmetic_operators)
+    check_skip(data, all_arithmetic_operators)
+
+    # DataFrame with scalar
+    df = pd.DataFrame({"A": data})
+
+    if is_bool_not_implemented(data, all_arithmetic_operators):
+        msg = "operator '.*' not implemented for bool dtypes"
+        with pytest.raises(NotImplementedError, match=msg):
+            op(df, scalar)
+        with pytest.raises(NotImplementedError, match=msg):
+            op(data, scalar)
+        return
+
+    result = op(df, scalar)
+    expected = pd.DataFrame({"A": op(data, scalar)})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_series(data, all_arithmetic_operators):
+    data, scalar = data
+    op = tm.get_op_from_name(all_arithmetic_operators)
+    check_skip(data, all_arithmetic_operators)
+
+    ser = pd.Series(data)
+
+    others = [
+        scalar,
+        np.array([scalar] * len(data), dtype=data.dtype.numpy_dtype),
+        pd.array([scalar] * len(data), dtype=data.dtype),
+        pd.Series([scalar] * len(data), dtype=data.dtype),
+    ]
+
+    for other in others:
+        if is_bool_not_implemented(data, all_arithmetic_operators):
+            msg = "operator '.*' not implemented for bool dtypes"
+            with pytest.raises(NotImplementedError, match=msg):
+                op(ser, other)
+
+        else:
+            result = op(ser, other)
+            expected = pd.Series(op(data, other))
+            tm.assert_series_equal(result, expected)
+
+
+# Test generic characteristics / errors
+# -----------------------------------------------------------------------------
+
+
+def test_error_invalid_object(data, all_arithmetic_operators):
+    data, _ = data
+
+    op = all_arithmetic_operators
+    opa = getattr(data, op)
+
+    # 2d -> return NotImplemented
+    result = opa(pd.DataFrame({"A": data}))
+    assert result is NotImplemented
+
+    msg = r"can only perform ops with 1-d structures"
+    with pytest.raises(NotImplementedError, match=msg):
+        opa(np.arange(len(data)).reshape(-1, len(data)))
+
+
+def test_error_len_mismatch(data, all_arithmetic_operators):
+    # operating with a list-like with non-matching length raises
+    data, scalar = data
+    op = tm.get_op_from_name(all_arithmetic_operators)
+
+    other = [scalar] * (len(data) - 1)
+
+    err = ValueError
+    msg = "|".join(
+        [
+            r"operands could not be broadcast together with shapes \(3,\) \(4,\)",
+            r"operands could not be broadcast together with shapes \(4,\) \(3,\)",
+        ]
+    )
+    if data.dtype.kind == "b" and all_arithmetic_operators.strip("_") in [
+        "sub",
+        "rsub",
+    ]:
+        err = TypeError
+        msg = (
+            r"numpy boolean subtract, the `\-` operator, is not supported, use "
+            r"the bitwise_xor, the `\^` operator, or the logical_xor function instead"
+        )
+    elif is_bool_not_implemented(data, all_arithmetic_operators):
+        msg = "operator '.*' not implemented for bool dtypes"
+        err = NotImplementedError
+
+    for other in [other, np.array(other)]:
+        with pytest.raises(err, match=msg):
+            op(data, other)
+
+        s = pd.Series(data)
+        with pytest.raises(err, match=msg):
+            op(s, other)
+
+
+@pytest.mark.parametrize("op", ["__neg__", "__abs__", "__invert__"])
+def test_unary_op_does_not_propagate_mask(data, op):
+    # https://github.com/pandas-dev/pandas/issues/39943
+    data, _ = data
+    ser = pd.Series(data)
+
+    if op == "__invert__" and data.dtype.kind == "f":
+        # we follow numpy in raising
+        msg = "ufunc 'invert' not supported for the input types"
+        with pytest.raises(TypeError, match=msg):
+            getattr(ser, op)()
+        with pytest.raises(TypeError, match=msg):
+            getattr(data, op)()
+        with pytest.raises(TypeError, match=msg):
+            # Check that this is still the numpy behavior
+            getattr(data._data, op)()
+
+        return
+
+    result = getattr(ser, op)()
+    expected = result.copy(deep=True)
+    ser[0] = None
+    tm.assert_series_equal(result, expected)
--- a/lib/python3.11/site-packages/pandas/tests/arrays/masked/test_arrow_compat.py
+++ b/lib/python3.11/site-packages/pandas/tests/arrays/masked/test_arrow_compat.py
@ -0,0 +1,210 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+# Module-wide mark: ignore the "Passing a BlockManager to DataFrame"
+# DeprecationWarning in every test of this module.
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
+
+
+# Skip the whole module when pyarrow cannot be imported.
+pa = pytest.importorskip("pyarrow")
+
+# Deliberately imported after the importorskip above — presumably because this
+# helper module needs pyarrow itself (TODO confirm).
+from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask
+
+# Masked arrays (integer, floating, boolean), each ending in a missing value;
+# they parametrize the ``data`` fixture.
+arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
+arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
+arrays += [pd.array([True, False, True, None], dtype="boolean")]
+
+
+@pytest.fixture(params=arrays, ids=[a.dtype.name for a in arrays])
+def data(request):
+    """
+    Fixture returning parametrized array from given dtype, including integer,
+    float and boolean
+    """
+    return request.param
+
+
+def test_arrow_array(data):
+    arr = pa.array(data)
+    expected = pa.array(
+        data.to_numpy(object, na_value=None),
+        type=pa.from_numpy_dtype(data.dtype.numpy_dtype),
+    )
+    assert arr.equals(expected)
+
+
+def test_arrow_roundtrip(data):
+    df = pd.DataFrame({"a": data})
+    table = pa.table(df)
+    assert table.field("a").type == str(data.dtype.numpy_dtype)
+
+    result = table.to_pandas()
+    assert result["a"].dtype == data.dtype
+    tm.assert_frame_equal(result, df)
+
+
+def test_dataframe_from_arrow_types_mapper():
+    def types_mapper(arrow_type):
+        if pa.types.is_boolean(arrow_type):
+            return pd.BooleanDtype()
+        elif pa.types.is_integer(arrow_type):
+            return pd.Int64Dtype()
+
+    bools_array = pa.array([True, None, False], type=pa.bool_())
+    ints_array = pa.array([1, None, 2], type=pa.int64())
+    small_ints_array = pa.array([-1, 0, 7], type=pa.int8())
+    record_batch = pa.RecordBatch.from_arrays(
+        [bools_array, ints_array, small_ints_array], ["bools", "ints", "small_ints"]
+    )
+    result = record_batch.to_pandas(types_mapper=types_mapper)
+    bools = pd.Series([True, None, False], dtype="boolean")
+    ints = pd.Series([1, None, 2], dtype="Int64")
+    small_ints = pd.Series([-1, 0, 7], dtype="Int64")
+    expected = pd.DataFrame({"bools": bools, "ints": ints, "small_ints": small_ints})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_arrow_load_from_zero_chunks(data):
+    # GH-41040
+
+    df = pd.DataFrame({"a": data[0:0]})
+    table = pa.table(df)
+    assert table.field("a").type == str(data.dtype.numpy_dtype)
+    table = pa.table(
+        [pa.chunked_array([], type=table.field("a").type)], schema=table.schema
+    )
+    result = table.to_pandas()
+    assert result["a"].dtype == data.dtype
+    tm.assert_frame_equal(result, df)
+
+
+def test_arrow_from_arrow_uint():
+    # https://github.com/pandas-dev/pandas/issues/31896
+    # possible mismatch in types
+
+    dtype = pd.UInt32Dtype()
+    result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
+    expected = pd.array([1, 2, 3, 4, None], dtype="UInt32")
+
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_arrow_sliced(data):
+    # https://github.com/pandas-dev/pandas/issues/38525
+
+    df = pd.DataFrame({"a": data})
+    table = pa.table(df)
+    result = table.slice(2, None).to_pandas()
+    expected = df.iloc[2:].reset_index(drop=True)
+    tm.assert_frame_equal(result, expected)
+
+    # no missing values
+    df2 = df.fillna(data[0])
+    table = pa.table(df2)
+    result = table.slice(2, None).to_pandas()
+    expected = df2.iloc[2:].reset_index(drop=True)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.fixture
+def np_dtype_to_arrays(any_real_numpy_dtype):
+    """
+    Fixture returning actual and expected dtype, pandas and numpy arrays and
+    mask from a given numpy dtype
+    """
+    np_dtype = np.dtype(any_real_numpy_dtype)
+    pa_type = pa.from_numpy_dtype(np_dtype)
+
+    # None ensures the creation of a bitmask buffer.
+    pa_array = pa.array([0, 1, 2, None], type=pa_type)
+    # Since masked Arrow buffer slots are not required to contain a specific
+    # value, assert only the first three values of the created np.array
+    np_expected = np.array([0, 1, 2], dtype=np_dtype)
+    mask_expected = np.array([True, True, True, False])
+    return np_dtype, pa_array, np_expected, mask_expected
+
+
+def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
+    """
+    Test conversion from pyarrow array to numpy array.
+
+    Modifies the pyarrow buffer to contain padding and offset, which are
+    considered valid buffers by pyarrow.
+
+    Also tests empty pyarrow arrays with non empty buffers.
+    See https://github.com/pandas-dev/pandas/issues/40896
+    """
+    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected)
+    tm.assert_numpy_array_equal(mask, mask_expected)
+
+    mask_buffer = pa_array.buffers()[0]
+    data_buffer = pa_array.buffers()[1]
+    data_buffer_bytes = pa_array.buffers()[1].to_pybytes()
+
+    # Add trailing padding to the buffer.
+    data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00")
+    pa_array_trail = pa.Array.from_buffers(
+        type=pa_array.type,
+        length=len(pa_array),
+        buffers=[mask_buffer, data_buffer_trail],
+        offset=pa_array.offset,
+    )
+    pa_array_trail.validate()
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected)
+    tm.assert_numpy_array_equal(mask, mask_expected)
+
+    # Add offset to the buffer.
+    offset = b"\x00" * (pa_array.type.bit_width // 8)
+    data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes)
+    mask_buffer_offset = pa.py_buffer(b"\x0E")
+    pa_array_offset = pa.Array.from_buffers(
+        type=pa_array.type,
+        length=len(pa_array),
+        buffers=[mask_buffer_offset, data_buffer_offset],
+        offset=pa_array.offset + 1,
+    )
+    pa_array_offset.validate()
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected)
+    tm.assert_numpy_array_equal(mask, mask_expected)
+
+    # Empty array
+    np_expected_empty = np.array([], dtype=np_dtype)
+    mask_expected_empty = np.array([], dtype=np.bool_)
+
+    pa_array_offset = pa.Array.from_buffers(
+        type=pa_array.type,
+        length=0,
+        buffers=[mask_buffer, data_buffer],
+        offset=pa_array.offset,
+    )
+    pa_array_offset.validate()
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected_empty)
+    tm.assert_numpy_array_equal(mask, mask_expected_empty)
+
+
+@pytest.mark.parametrize(
+    "arr", [pa.nulls(10), pa.chunked_array([pa.nulls(4), pa.nulls(6)])]
+)
+def test_from_arrow_null(data, arr):
+    res = data.dtype.__from_arrow__(arr)
+    assert res.isna().all()
+    assert len(res) == 10
+
+
+def test_from_arrow_type_error(data):
+    # ensure that __from_arrow__ returns a TypeError when getting a wrong
+    # array type
+
+    arr = pa.array(data).cast("string")
+    with pytest.raises(TypeError, match=None):
+        # we don't test the exact error message, only the fact that it raises
+        # a TypeError is relevant
+        data.dtype.__from_arrow__(arr)
--- a/lib/python3.11/site-packages/pandas/tests/arrays/masked/test_function.py
+++ b/lib/python3.11/site-packages/pandas/tests/arrays/masked/test_function.py
@ -0,0 +1,74 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_integer_dtype
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.arrays import BaseMaskedArray
+
+arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
+arrays += [
+    pd.array([0.141, -0.268, 5.895, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES
+]
+
+
+@pytest.fixture(params=arrays, ids=[a.dtype.name for a in arrays])
+def data(request):
+    """
+    Fixture returning parametrized 'data' array with different integer and
+    floating point types
+    """
+    return request.param
+
+
+@pytest.fixture()
+def numpy_dtype(data):
+    """
+    Fixture returning numpy dtype from 'data' input array.
+    """
+    # For integer dtype, the numpy conversion must be done to float
+    if is_integer_dtype(data):
+        numpy_dtype = float
+    else:
+        numpy_dtype = data.dtype.type
+    return numpy_dtype
+
+
+def test_round(data, numpy_dtype):
+    # No arguments
+    result = data.round()
+    expected = pd.array(
+        np.round(data.to_numpy(dtype=numpy_dtype, na_value=None)), dtype=data.dtype
+    )
+    tm.assert_extension_array_equal(result, expected)
+
+    # Decimals argument
+    result = data.round(decimals=2)
+    expected = pd.array(
+        np.round(data.to_numpy(dtype=numpy_dtype, na_value=None), decimals=2),
+        dtype=data.dtype,
+    )
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_tolist(data):
+    result = data.tolist()
+    expected = list(data)
+    tm.assert_equal(result, expected)
+
+
+def test_to_numpy():
+    # GH#56991
+
+    class MyStringArray(BaseMaskedArray):
+        dtype = pd.StringDtype()
+        _dtype_cls = pd.StringDtype
+        _internal_fill_value = pd.NA
+
+    arr = MyStringArray(
+        values=np.array(["a", "b", "c"]), mask=np.array([False, True, False])
+    )
+    result = arr.to_numpy()
+    expected = np.array(["a", pd.NA, "c"])
+    tm.assert_numpy_array_equal(result, expected)
--- a/lib/python3.11/site-packages/pandas/tests/arrays/masked/test_indexing.py
+++ b/lib/python3.11/site-packages/pandas/tests/arrays/masked/test_indexing.py
@ -0,0 +1,60 @@
+import re
+
+import numpy as np
+import pytest
+
+import pandas as pd
+
+
+class TestSetitemValidation:
+    def _check_setitem_invalid(self, arr, invalid):
+        msg = f"Invalid value '{invalid!s}' for dtype '{arr.dtype}'"
+        msg = re.escape(msg)
+        with pytest.raises(TypeError, match=msg):
+            arr[0] = invalid
+
+        with pytest.raises(TypeError, match=msg):
+            arr[:] = invalid
+
+        with pytest.raises(TypeError, match=msg):
+            arr[[0]] = invalid
+
+        # FIXME: don't leave commented-out
+        # with pytest.raises(TypeError):
+        #    arr[[0]] = [invalid]
+
+        # with pytest.raises(TypeError):
+        #    arr[[0]] = np.array([invalid], dtype=object)
+
+        # Series non-coercion, behavior subject to change
+        ser = pd.Series(arr)
+        with pytest.raises(TypeError, match=msg):
+            ser[0] = invalid
+            # TODO: so, so many other variants of this...
+
+    _invalid_scalars = [
+        1 + 2j,
+        "True",
+        "1",
+        "1.0",
+        pd.NaT,
+        np.datetime64("NaT"),
+        np.timedelta64("NaT"),
+    ]
+
+    @pytest.mark.parametrize(
+        "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)]
+    )
+    def test_setitem_validation_scalar_bool(self, invalid):
+        arr = pd.array([True, False, None], dtype="boolean")
+        self._check_setitem_invalid(arr, invalid)
+
+    @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)])
+    def test_setitem_validation_scalar_int(self, invalid, any_int_ea_dtype):
+        arr = pd.array([1, 2, None], dtype=any_int_ea_dtype)
+        self._check_setitem_invalid(arr, invalid)
+
+    @pytest.mark.parametrize("invalid", _invalid_scalars + [True])
+    def test_setitem_validation_scalar_float(self, invalid, float_ea_dtype):
+        arr = pd.array([1, 2, None], dtype=float_ea_dtype)
+        self._check_setitem_invalid(arr, invalid)