done
This commit is contained in:
		| @ -0,0 +1,73 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.compat import HAS_PYARROW | ||||
|  | ||||
| from pandas.core.dtypes.cast import find_common_type | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
| from pandas.util.version import Version | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "to_concat_dtypes, result_dtype", | ||||
|     [ | ||||
|         # same types | ||||
|         ([("pyarrow", pd.NA), ("pyarrow", pd.NA)], ("pyarrow", pd.NA)), | ||||
|         ([("pyarrow", np.nan), ("pyarrow", np.nan)], ("pyarrow", np.nan)), | ||||
|         ([("python", pd.NA), ("python", pd.NA)], ("python", pd.NA)), | ||||
|         ([("python", np.nan), ("python", np.nan)], ("python", np.nan)), | ||||
|         # pyarrow preference | ||||
|         ([("pyarrow", pd.NA), ("python", pd.NA)], ("pyarrow", pd.NA)), | ||||
|         # NA preference | ||||
|         ([("python", pd.NA), ("python", np.nan)], ("python", pd.NA)), | ||||
|     ], | ||||
| ) | ||||
| def test_concat_series(request, to_concat_dtypes, result_dtype): | ||||
|     if any(storage == "pyarrow" for storage, _ in to_concat_dtypes) and not HAS_PYARROW: | ||||
|         pytest.skip("Could not import 'pyarrow'") | ||||
|  | ||||
|     ser_list = [ | ||||
|         pd.Series(["a", "b", None], dtype=pd.StringDtype(storage, na_value)) | ||||
|         for storage, na_value in to_concat_dtypes | ||||
|     ] | ||||
|  | ||||
|     result = pd.concat(ser_list, ignore_index=True) | ||||
|     expected = pd.Series( | ||||
|         ["a", "b", None, "a", "b", None], dtype=pd.StringDtype(*result_dtype) | ||||
|     ) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     # order doesn't matter for result | ||||
|     result = pd.concat(ser_list[::1], ignore_index=True) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_concat_with_object(string_dtype_arguments): | ||||
|     # _get_common_dtype cannot inspect values, so object dtype with strings still | ||||
|     # results in object dtype | ||||
|     result = pd.concat( | ||||
|         [ | ||||
|             pd.Series(["a", "b", None], dtype=pd.StringDtype(*string_dtype_arguments)), | ||||
|             pd.Series(["a", "b", None], dtype=object), | ||||
|         ] | ||||
|     ) | ||||
|     assert result.dtype == np.dtype("object") | ||||
|  | ||||
|  | ||||
def test_concat_with_numpy(string_dtype_arguments):
    """find_common_type with numpy string dtypes preserves the pandas string
    dtype; any other numpy dtype degrades to object."""
    dtype = pd.StringDtype(*string_dtype_arguments)

    # numpy unicode dtypes (sized or unsized) never win over the pandas
    # string dtype, regardless of argument order
    for np_dtype in (np.dtype("U"), np.dtype("U10")):
        assert find_common_type([dtype, np_dtype]) == dtype
        assert find_common_type([np_dtype, dtype]) == dtype

    # with any other numpy dtype -> object
    assert find_common_type([dtype, np.dtype("S")]) == np.dtype("object")
    assert find_common_type([dtype, np.dtype("int64")]) == np.dtype("object")

    # numpy's native variable-width StringDType (numpy >= 2) behaves like "U"
    if Version(np.__version__) >= Version("2"):
        assert find_common_type([dtype, np.dtypes.StringDType()]) == dtype
        assert find_common_type([np.dtypes.StringDType(), dtype]) == dtype
| @ -0,0 +1,854 @@ | ||||
| """ | ||||
| This module tests the functionality of StringArray and ArrowStringArray. | ||||
| Tests for the str accessors are in pandas/tests/strings/test_string_array.py | ||||
| """ | ||||
| import operator | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas._config import using_string_dtype | ||||
|  | ||||
| from pandas.compat import HAS_PYARROW | ||||
| from pandas.compat.pyarrow import ( | ||||
|     pa_version_under12p0, | ||||
|     pa_version_under19p0, | ||||
| ) | ||||
| import pandas.util._test_decorators as td | ||||
|  | ||||
| from pandas.core.dtypes.common import is_dtype_equal | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
| from pandas.core.arrays.string_ import StringArrayNumpySemantics | ||||
| from pandas.core.arrays.string_arrow import ( | ||||
|     ArrowStringArray, | ||||
|     ArrowStringArrayNumpySemantics, | ||||
| ) | ||||
|  | ||||
|  | ||||
@pytest.fixture
def dtype(string_dtype_arguments):
    """StringDtype built from the parametrized (storage, na_value) pair."""
    return pd.StringDtype(*string_dtype_arguments)
|  | ||||
|  | ||||
@pytest.fixture
def dtype2(string_dtype_arguments2):
    """Second StringDtype fixture, parametrized independently of 'dtype'."""
    return pd.StringDtype(*string_dtype_arguments2)
|  | ||||
|  | ||||
@pytest.fixture
def cls(dtype):
    """Concrete array class (StringArray / ArrowStringArray variant) for 'dtype'."""
    return dtype.construct_array_type()
|  | ||||
|  | ||||
| def string_dtype_highest_priority(dtype1, dtype2): | ||||
|     if HAS_PYARROW: | ||||
|         DTYPE_HIERARCHY = [ | ||||
|             pd.StringDtype("python", na_value=np.nan), | ||||
|             pd.StringDtype("pyarrow", na_value=np.nan), | ||||
|             pd.StringDtype("python", na_value=pd.NA), | ||||
|             pd.StringDtype("pyarrow", na_value=pd.NA), | ||||
|         ] | ||||
|     else: | ||||
|         DTYPE_HIERARCHY = [ | ||||
|             pd.StringDtype("python", na_value=np.nan), | ||||
|             pd.StringDtype("python", na_value=pd.NA), | ||||
|         ] | ||||
|  | ||||
|     h1 = DTYPE_HIERARCHY.index(dtype1) | ||||
|     h2 = DTYPE_HIERARCHY.index(dtype2) | ||||
|     return DTYPE_HIERARCHY[max(h1, h2)] | ||||
|  | ||||
|  | ||||
def test_dtype_constructor():
    """The legacy 'pyarrow_numpy' storage name warns and maps to pyarrow/nan."""
    pytest.importorskip("pyarrow")

    with tm.assert_produces_warning(FutureWarning):
        legacy = pd.StringDtype("pyarrow_numpy")
    assert legacy == pd.StringDtype("pyarrow", na_value=np.nan)
|  | ||||
|  | ||||
def test_dtype_equality():
    """StringDtype equality depends on both the storage and the na_value."""
    pytest.importorskip("pyarrow")

    py_na = pd.StringDtype("python")
    pa_na = pd.StringDtype("pyarrow")
    pa_nan = pd.StringDtype("pyarrow", na_value=np.nan)

    # each dtype equals a freshly constructed equivalent ...
    assert py_na == pd.StringDtype("python", na_value=pd.NA)
    assert pa_na == pd.StringDtype("pyarrow", na_value=pd.NA)
    assert pa_nan == pd.StringDtype("pyarrow", na_value=np.nan)
    # ... any NaN object is treated the same ...
    assert pa_nan == pd.StringDtype("pyarrow", na_value=float("nan"))
    # ... and the three dtypes are pairwise distinct, in both directions
    assert py_na != pa_na
    assert py_na != pa_nan
    assert pa_na != py_na
    assert pa_na != pa_nan
    assert pa_nan != py_na
    assert pa_nan != pa_na
|  | ||||
|  | ||||
def test_repr(dtype):
    """DataFrame/Series/array reprs for every storage + na_value combination.

    NaN-semantics dtypes render missing values as 'NaN'/'nan' and report
    dtype 'str'; NA-semantics dtypes render '<NA>' and report dtype 'string'.
    """
    df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)})
    # DataFrame repr: only the missing-value rendering differs
    if dtype.na_value is np.nan:
        expected = "     A\n0    a\n1  NaN\n2    b"
    else:
        expected = "      A\n0     a\n1  <NA>\n2     b"
    assert repr(df) == expected

    # Series repr additionally shows the dtype name ('str' vs 'string')
    if dtype.na_value is np.nan:
        expected = "0      a\n1    NaN\n2      b\nName: A, dtype: str"
    else:
        expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string"
    assert repr(df.A) == expected

    # array repr also shows the concrete array class name
    if dtype.storage == "pyarrow" and dtype.na_value is pd.NA:
        arr_name = "ArrowStringArray"
        expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
    elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
        arr_name = "ArrowStringArrayNumpySemantics"
        expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
    elif dtype.storage == "python" and dtype.na_value is np.nan:
        arr_name = "StringArrayNumpySemantics"
        expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
    else:
        arr_name = "StringArray"
        expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
    assert repr(df.A.array) == expected
|  | ||||
|  | ||||
| def test_none_to_nan(cls, dtype): | ||||
|     a = cls._from_sequence(["a", None, "b"], dtype=dtype) | ||||
|     assert a[1] is not None | ||||
|     assert a[1] is a.dtype.na_value | ||||
|  | ||||
|  | ||||
def test_setitem_validates(cls, dtype):
    """Setting non-string values into a string array raises TypeError."""
    arr = cls._from_sequence(["a", "b"], dtype=dtype)

    # scalar assignment of a non-string
    with pytest.raises(TypeError, match="Invalid value '10' for dtype 'str"):
        arr[0] = 10

    # array assignment of non-strings
    with pytest.raises(TypeError, match="Invalid value for dtype 'str"):
        arr[:] = np.array([1, 2])
|  | ||||
|  | ||||
| def test_setitem_with_scalar_string(dtype): | ||||
|     # is_float_dtype considers some strings, like 'd', to be floats | ||||
|     # which can cause issues. | ||||
|     arr = pd.array(["a", "c"], dtype=dtype) | ||||
|     arr[0] = "d" | ||||
|     expected = pd.array(["d", "c"], dtype=dtype) | ||||
|     tm.assert_extension_array_equal(arr, expected) | ||||
|  | ||||
|  | ||||
| def test_setitem_with_array_with_missing(dtype): | ||||
|     # ensure that when setting with an array of values, we don't mutate the | ||||
|     # array `value` in __setitem__(self, key, value) | ||||
|     arr = pd.array(["a", "b", "c"], dtype=dtype) | ||||
|     value = np.array(["A", None]) | ||||
|     value_orig = value.copy() | ||||
|     arr[[0, 1]] = value | ||||
|  | ||||
|     expected = pd.array(["A", pd.NA, "c"], dtype=dtype) | ||||
|     tm.assert_extension_array_equal(arr, expected) | ||||
|     tm.assert_numpy_array_equal(value, value_orig) | ||||
|  | ||||
|  | ||||
| def test_astype_roundtrip(dtype): | ||||
|     ser = pd.Series(pd.date_range("2000", periods=12)) | ||||
|     ser[0] = None | ||||
|  | ||||
|     casted = ser.astype(dtype) | ||||
|     assert is_dtype_equal(casted.dtype, dtype) | ||||
|  | ||||
|     result = casted.astype("datetime64[ns]") | ||||
|     tm.assert_series_equal(result, ser) | ||||
|  | ||||
|     # GH#38509 same thing for timedelta64 | ||||
|     ser2 = ser - ser.iloc[-1] | ||||
|     casted2 = ser2.astype(dtype) | ||||
|     assert is_dtype_equal(casted2.dtype, dtype) | ||||
|  | ||||
|     result2 = casted2.astype(ser2.dtype) | ||||
|     tm.assert_series_equal(result2, ser2) | ||||
|  | ||||
|  | ||||
| def test_add(dtype): | ||||
|     a = pd.Series(["a", "b", "c", None, None], dtype=dtype) | ||||
|     b = pd.Series(["x", "y", None, "z", None], dtype=dtype) | ||||
|  | ||||
|     result = a + b | ||||
|     expected = pd.Series(["ax", "by", None, None, None], dtype=dtype) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = a.add(b) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = a.radd(b) | ||||
|     expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = a.add(b, fill_value="-") | ||||
|     expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_add_2d(dtype, request): | ||||
|     if dtype.storage == "pyarrow": | ||||
|         reason = "Failed: DID NOT RAISE <class 'ValueError'>" | ||||
|         mark = pytest.mark.xfail(raises=None, reason=reason) | ||||
|         request.applymarker(mark) | ||||
|  | ||||
|     a = pd.array(["a", "b", "c"], dtype=dtype) | ||||
|     b = np.array([["a", "b", "c"]], dtype=object) | ||||
|     with pytest.raises(ValueError, match="3 != 1"): | ||||
|         a + b | ||||
|  | ||||
|     s = pd.Series(a) | ||||
|     with pytest.raises(ValueError, match="3 != 1"): | ||||
|         s + b | ||||
|  | ||||
|  | ||||
| def test_add_sequence(dtype): | ||||
|     a = pd.array(["a", "b", None, None], dtype=dtype) | ||||
|     other = ["x", None, "y", None] | ||||
|  | ||||
|     result = a + other | ||||
|     expected = pd.array(["ax", None, None, None], dtype=dtype) | ||||
|     tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|     result = other + a | ||||
|     expected = pd.array(["xa", None, None, None], dtype=dtype) | ||||
|     tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_mul(dtype): | ||||
|     a = pd.array(["a", "b", None], dtype=dtype) | ||||
|     result = a * 2 | ||||
|     expected = pd.array(["aa", "bb", None], dtype=dtype) | ||||
|     tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|     result = 2 * a | ||||
|     tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
@pytest.mark.xfail(reason="GH-28527")
def test_add_strings(dtype):
    """Array + DataFrame of strings should defer to the DataFrame (xfail)."""
    arr = pd.array(["a", "b", "c", "d"], dtype=dtype)
    df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object)
    assert arr.__add__(df) is NotImplemented

    tm.assert_frame_equal(
        arr + df, pd.DataFrame([["at", "by", "cv", "dw"]]).astype(dtype)
    )
    tm.assert_frame_equal(
        df + arr, pd.DataFrame([["ta", "yb", "vc", "wd"]]).astype(dtype)
    )
|  | ||||
|  | ||||
@pytest.mark.xfail(reason="GH-28527")
def test_add_frame(dtype):
    """Array + DataFrame with missing values should defer to the DataFrame (xfail)."""
    arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype)
    df = pd.DataFrame([["x", np.nan, "y", np.nan]])

    assert arr.__add__(df) is NotImplemented

    tm.assert_frame_equal(
        arr + df, pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype)
    )
    tm.assert_frame_equal(
        df + arr, pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype)
    )
|  | ||||
|  | ||||
| def test_comparison_methods_scalar(comparison_op, dtype): | ||||
|     op_name = f"__{comparison_op.__name__}__" | ||||
|     a = pd.array(["a", None, "c"], dtype=dtype) | ||||
|     other = "a" | ||||
|     result = getattr(a, op_name)(other) | ||||
|     if dtype.na_value is np.nan: | ||||
|         expected = np.array([getattr(item, op_name)(other) for item in a]) | ||||
|         if comparison_op == operator.ne: | ||||
|             expected[1] = True | ||||
|         else: | ||||
|             expected[1] = False | ||||
|         tm.assert_numpy_array_equal(result, expected.astype(np.bool_)) | ||||
|     else: | ||||
|         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" | ||||
|         expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) | ||||
|         expected = pd.array(expected, dtype=expected_dtype) | ||||
|         tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_comparison_methods_scalar_pd_na(comparison_op, dtype): | ||||
|     op_name = f"__{comparison_op.__name__}__" | ||||
|     a = pd.array(["a", None, "c"], dtype=dtype) | ||||
|     result = getattr(a, op_name)(pd.NA) | ||||
|  | ||||
|     if dtype.na_value is np.nan: | ||||
|         if operator.ne == comparison_op: | ||||
|             expected = np.array([True, True, True]) | ||||
|         else: | ||||
|             expected = np.array([False, False, False]) | ||||
|         tm.assert_numpy_array_equal(result, expected) | ||||
|     else: | ||||
|         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" | ||||
|         expected = pd.array([None, None, None], dtype=expected_dtype) | ||||
|         tm.assert_extension_array_equal(result, expected) | ||||
|         tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_comparison_methods_scalar_not_string(comparison_op, dtype): | ||||
|     op_name = f"__{comparison_op.__name__}__" | ||||
|  | ||||
|     a = pd.array(["a", None, "c"], dtype=dtype) | ||||
|     other = 42 | ||||
|  | ||||
|     if op_name not in ["__eq__", "__ne__"]: | ||||
|         with pytest.raises(TypeError, match="Invalid comparison|not supported between"): | ||||
|             getattr(a, op_name)(other) | ||||
|  | ||||
|         return | ||||
|  | ||||
|     result = getattr(a, op_name)(other) | ||||
|  | ||||
|     if dtype.na_value is np.nan: | ||||
|         expected_data = { | ||||
|             "__eq__": [False, False, False], | ||||
|             "__ne__": [True, True, True], | ||||
|         }[op_name] | ||||
|         expected = np.array(expected_data) | ||||
|         tm.assert_numpy_array_equal(result, expected) | ||||
|     else: | ||||
|         expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ | ||||
|             op_name | ||||
|         ] | ||||
|         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" | ||||
|         expected = pd.array(expected_data, dtype=expected_dtype) | ||||
|         tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_comparison_methods_array(comparison_op, dtype, dtype2):
    """Compare two string arrays of (possibly different) string dtypes.

    When both sides use NaN-semantics the result is a plain numpy bool array
    with missing slots treated as unequal; otherwise the result is a masked
    boolean array whose storage follows the higher-priority dtype.
    """
    op_name = f"__{comparison_op.__name__}__"

    a = pd.array(["a", None, "c"], dtype=dtype)
    other = pd.array([None, None, "c"], dtype=dtype2)
    result = comparison_op(a, other)

    # ensure operation is commutative
    result2 = comparison_op(other, a)
    tm.assert_equal(result, result2)

    if dtype.na_value is np.nan and dtype2.na_value is np.nan:
        # numpy-bool result: missing compares as "not equal"
        if operator.ne == comparison_op:
            expected = np.array([True, True, False])
        else:
            expected = np.array([False, False, False])
            # only the last pair has two real values to compare
            expected[-1] = getattr(other[-1], op_name)(a[-1])
        tm.assert_numpy_array_equal(result, expected)

    else:
        # masked result: the winning dtype's storage decides the bool dtype
        max_dtype = string_dtype_highest_priority(dtype, dtype2)
        if max_dtype.storage == "python":
            expected_dtype = "boolean"
        else:
            expected_dtype = "bool[pyarrow]"

        # NA everywhere except the last, fully-valued pair
        expected = np.full(len(a), fill_value=None, dtype="object")
        expected[-1] = getattr(other[-1], op_name)(a[-1])
        expected = pd.array(expected, dtype=expected_dtype)
        tm.assert_extension_array_equal(result, expected)
|  | ||||
|  | ||||
@td.skip_if_no("pyarrow")
def test_comparison_methods_array_arrow_extension(comparison_op, dtype2):
    """pd.ArrowDtype(pa.string()) compared against any string array gives a
    bool[pyarrow] result with NA propagation, in both operand orders."""
    import pyarrow as pa

    op_name = f"__{comparison_op.__name__}__"
    arrow_dtype = pd.ArrowDtype(pa.string())
    left = pd.array(["a", None, "c"], dtype=arrow_dtype)
    right = pd.array([None, None, "c"], dtype=dtype2)

    result = comparison_op(left, right)
    # ensure operation is commutative
    tm.assert_equal(result, comparison_op(right, left))

    # NA everywhere except the last, fully-valued pair
    expected = pd.array([None, None, True], dtype="bool[pyarrow]")
    expected[-1] = getattr(right[-1], op_name)(left[-1])
    tm.assert_extension_array_equal(result, expected)
|  | ||||
|  | ||||
| def test_comparison_methods_list(comparison_op, dtype): | ||||
|     op_name = f"__{comparison_op.__name__}__" | ||||
|  | ||||
|     a = pd.array(["a", None, "c"], dtype=dtype) | ||||
|     other = [None, None, "c"] | ||||
|     result = comparison_op(a, other) | ||||
|  | ||||
|     # ensure operation is commutative | ||||
|     result2 = comparison_op(other, a) | ||||
|     tm.assert_equal(result, result2) | ||||
|  | ||||
|     if dtype.na_value is np.nan: | ||||
|         if operator.ne == comparison_op: | ||||
|             expected = np.array([True, True, False]) | ||||
|         else: | ||||
|             expected = np.array([False, False, False]) | ||||
|             expected[-1] = getattr(other[-1], op_name)(a[-1]) | ||||
|         tm.assert_numpy_array_equal(result, expected) | ||||
|  | ||||
|     else: | ||||
|         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" | ||||
|         expected = np.full(len(a), fill_value=None, dtype="object") | ||||
|         expected[-1] = getattr(other[-1], op_name)(a[-1]) | ||||
|         expected = pd.array(expected, dtype=expected_dtype) | ||||
|         tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_constructor_raises(cls):
    """The raw constructors only accept object-dtype ndarrays of strings plus
    the NA scalars each class considers valid (NaN/None for the numpy-backed
    classes); everything else raises ValueError."""
    if cls is pd.arrays.StringArray:
        msg = "StringArray requires a sequence of strings or pandas.NA"
    elif cls is StringArrayNumpySemantics:
        msg = "StringArrayNumpySemantics requires a sequence of strings or NaN"
    else:
        msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"

    # wrong ndarray dtypes are rejected
    for bad in (np.array(["a", "b"], dtype="S1"), np.array([])):
        with pytest.raises(ValueError, match=msg):
            cls(bad)

    if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics:
        # GH#45057 np.nan and None do NOT raise, as they are considered valid
        #  NAs for string dtype
        cls(np.array(["a", np.nan], dtype=object))
        cls(np.array(["a", None], dtype=object))
    else:
        with pytest.raises(ValueError, match=msg):
            cls(np.array(["a", np.nan], dtype=object))
        with pytest.raises(ValueError, match=msg):
            cls(np.array(["a", None], dtype=object))

    # other NA sentinels are never valid
    for na in (pd.NaT, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")):
        with pytest.raises(ValueError, match=msg):
            cls(np.array(["a", na], dtype=object))
|  | ||||
|  | ||||
@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
def test_constructor_nan_like(na):
    """Any recognized NA scalar is normalized to pd.NA on construction."""
    expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
    result = pd.arrays.StringArray(np.array(["a", na], dtype="object"))
    tm.assert_extension_array_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("copy", [True, False])
def test_from_sequence_no_mutate(copy, cls, dtype):
    """_from_sequence must not modify its input array, with or without copy."""
    source = np.array(["a", np.nan], dtype=object)
    source_before = source.copy()
    na_arr = np.array(["a", pd.NA], dtype=object)

    result = cls._from_sequence(source, dtype=dtype, copy=copy)

    if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics):
        import pyarrow as pa

        expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True))
    elif cls is StringArrayNumpySemantics:
        # NaN-semantics keeps np.nan as-is
        expected = cls(source)
    else:
        expected = cls(na_arr)

    tm.assert_extension_array_equal(result, expected)
    # the input array is untouched
    tm.assert_numpy_array_equal(source, source_before)
|  | ||||
|  | ||||
| def test_astype_int(dtype): | ||||
|     arr = pd.array(["1", "2", "3"], dtype=dtype) | ||||
|     result = arr.astype("int64") | ||||
|     expected = np.array([1, 2, 3], dtype="int64") | ||||
|     tm.assert_numpy_array_equal(result, expected) | ||||
|  | ||||
|     arr = pd.array(["1", pd.NA, "3"], dtype=dtype) | ||||
|     if dtype.na_value is np.nan: | ||||
|         err = ValueError | ||||
|         msg = "cannot convert float NaN to integer" | ||||
|     else: | ||||
|         err = TypeError | ||||
|         msg = ( | ||||
|             r"int\(\) argument must be a string, a bytes-like " | ||||
|             r"object or a( real)? number" | ||||
|         ) | ||||
|     with pytest.raises(err, match=msg): | ||||
|         arr.astype("int64") | ||||
|  | ||||
|  | ||||
| def test_astype_nullable_int(dtype): | ||||
|     arr = pd.array(["1", pd.NA, "3"], dtype=dtype) | ||||
|  | ||||
|     result = arr.astype("Int64") | ||||
|     expected = pd.array([1, pd.NA, 3], dtype="Int64") | ||||
|     tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_astype_float(dtype, any_float_dtype): | ||||
|     # Don't compare arrays (37974) | ||||
|     ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype) | ||||
|     result = ser.astype(any_float_dtype) | ||||
|     expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_dtype) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("skipna", [True, False]) | ||||
| def test_reduce(skipna, dtype): | ||||
|     arr = pd.Series(["a", "b", "c"], dtype=dtype) | ||||
|     result = arr.sum(skipna=skipna) | ||||
|     assert result == "abc" | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("skipna", [True, False]) | ||||
| def test_reduce_missing(skipna, dtype): | ||||
|     arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) | ||||
|     result = arr.sum(skipna=skipna) | ||||
|     if skipna: | ||||
|         assert result == "abc" | ||||
|     else: | ||||
|         assert pd.isna(result) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["min", "max"]) | ||||
| @pytest.mark.parametrize("skipna", [True, False]) | ||||
| def test_min_max(method, skipna, dtype): | ||||
|     arr = pd.Series(["a", "b", "c", None], dtype=dtype) | ||||
|     result = getattr(arr, method)(skipna=skipna) | ||||
|     if skipna: | ||||
|         expected = "a" if method == "min" else "c" | ||||
|         assert result == expected | ||||
|     else: | ||||
|         assert result is arr.dtype.na_value | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["min", "max"]) | ||||
| @pytest.mark.parametrize("box", [pd.Series, pd.array]) | ||||
| def test_min_max_numpy(method, box, dtype, request): | ||||
|     if dtype.storage == "pyarrow" and box is pd.array: | ||||
|         if box is pd.array: | ||||
|             reason = "'<=' not supported between instances of 'str' and 'NoneType'" | ||||
|         else: | ||||
|             reason = "'ArrowStringArray' object has no attribute 'max'" | ||||
|         mark = pytest.mark.xfail(raises=TypeError, reason=reason) | ||||
|         request.applymarker(mark) | ||||
|  | ||||
|     arr = box(["a", "b", "c", None], dtype=dtype) | ||||
|     result = getattr(np, method)(arr) | ||||
|     expected = "a" if method == "min" else "c" | ||||
|     assert result == expected | ||||
|  | ||||
|  | ||||
def test_fillna_args(dtype):
    """fillna accepts str and np.str_ values; non-strings raise (GH 37987)."""
    arr = pd.array(["a", pd.NA], dtype=dtype)
    filled = pd.array(["a", "b"], dtype=dtype)

    tm.assert_extension_array_equal(arr.fillna(value="b"), filled)
    tm.assert_extension_array_equal(arr.fillna(value=np.str_("b")), filled)

    # non-string fill values are rejected
    with pytest.raises(TypeError, match="Invalid value '1' for dtype 'str"):
        arr.fillna(value=1)
|  | ||||
|  | ||||
def test_arrow_array(dtype):
    """__arrow_array__ (protocol added in 0.15.0) exports to a pyarrow array
    of the storage-appropriate string type."""
    pa = pytest.importorskip("pyarrow")
    import pyarrow.compute as pc

    data = pd.array(["a", "b", "c"], dtype=dtype)
    result = pa.array(data)

    expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
    if dtype.storage == "pyarrow" and pa_version_under12p0:
        # older pyarrow returns a chunked array here
        expected = pa.chunked_array(expected)
    if dtype.storage == "python":
        # python storage exports as (small) string, not large_string
        expected = pc.cast(expected, pa.string())
    assert result.equals(expected)
|  | ||||
|  | ||||
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
    """Roundtrip DataFrame -> pyarrow table -> DataFrame.

    The arrow column type depends on the storage (string vs large_string);
    converting back honors the active "string_storage" option, and the
    missing value comes back as the dtype's own sentinel.
    """
    # roundtrip possible from arrow 1.0.0
    pa = pytest.importorskip("pyarrow")

    data = pd.array(["a", "b", None], dtype=dtype)
    df = pd.DataFrame({"a": data})
    table = pa.table(df)
    # python storage exports as string, pyarrow storage as large_string
    if dtype.storage == "python":
        assert table.field("a").type == "string"
    else:
        assert table.field("a").type == "large_string"
    with pd.option_context("string_storage", string_storage):
        result = table.to_pandas()
    if dtype.na_value is np.nan and not using_infer_string:
        # without string inference, NaN-variant data comes back as object
        assert result["a"].dtype == "object"
    else:
        assert isinstance(result["a"].dtype, pd.StringDtype)
        expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
        if using_infer_string:
            # column labels are inferred as (NaN-variant) strings too
            expected.columns = expected.columns.astype(
                pd.StringDtype(string_storage, na_value=np.nan)
            )
        tm.assert_frame_equal(result, expected)
        # ensure the missing value is represented by NA and not np.nan or None
        assert result.loc[2, "a"] is result["a"].dtype.na_value
|  | ||||
|  | ||||
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_from_string(using_infer_string):
    """Conversion of a pyarrow table WITHOUT pandas metadata: string columns
    become 'str' only when inference is on and pyarrow is recent enough."""
    pa = pytest.importorskip("pyarrow")
    table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())})

    result = table.to_pandas()

    target = "str" if using_infer_string and not pa_version_under19p0 else "object"
    expected = pd.DataFrame({"a": ["a", "b", None]}, dtype=target)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
    """Conversion of a pyarrow table whose column has zero chunks (GH-41040)."""
    pa = pytest.importorskip("pyarrow")

    data = pd.array([], dtype=dtype)
    df = pd.DataFrame({"a": data})
    table = pa.table(df)
    # python storage exports as string, pyarrow storage as large_string
    if dtype.storage == "python":
        assert table.field("a").type == "string"
    else:
        assert table.field("a").type == "large_string"
    # Instantiate the same table with no chunks at all
    table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
    with pd.option_context("string_storage", string_storage):
        result = table.to_pandas()

    if dtype.na_value is np.nan and not using_string_dtype():
        # without the future string option, NaN-variant data becomes object
        assert result["a"].dtype == "object"
    else:
        assert isinstance(result["a"].dtype, pd.StringDtype)
        expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
        if using_infer_string:
            # column labels are inferred as (NaN-variant) strings too
            expected.columns = expected.columns.astype(
                pd.StringDtype(string_storage, na_value=np.nan)
            )
        tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_value_counts_na(dtype): | ||||
|     if dtype.na_value is np.nan: | ||||
|         exp_dtype = "int64" | ||||
|     elif dtype.storage == "pyarrow": | ||||
|         exp_dtype = "int64[pyarrow]" | ||||
|     else: | ||||
|         exp_dtype = "Int64" | ||||
|     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) | ||||
|     result = arr.value_counts(dropna=False) | ||||
|     expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype=exp_dtype, name="count") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = arr.value_counts(dropna=True) | ||||
|     expected = pd.Series([2, 1], index=arr[:2], dtype=exp_dtype, name="count") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_value_counts_with_normalize(dtype): | ||||
|     if dtype.na_value is np.nan: | ||||
|         exp_dtype = np.float64 | ||||
|     elif dtype.storage == "pyarrow": | ||||
|         exp_dtype = "double[pyarrow]" | ||||
|     else: | ||||
|         exp_dtype = "Float64" | ||||
|     ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) | ||||
|     result = ser.value_counts(normalize=True) | ||||
|     expected = pd.Series([2, 1], index=ser[:2], dtype=exp_dtype, name="proportion") / 3 | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "values, expected",
    [
        (["a", "b", "c"], np.array([False, False, False])),
        (["a", "b", None], np.array([False, False, True])),
    ],
)
def test_use_inf_as_na(values, expected, dtype):
    """isna() is unaffected by the deprecated use_inf_as_na option.

    See https://github.com/pandas-dev/pandas/issues/33655.
    """
    arr = pd.array(values, dtype=dtype)
    msg = "use_inf_as_na option is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        with pd.option_context("mode.use_inf_as_na", True):
            # array level
            tm.assert_numpy_array_equal(arr.isna(), expected)
            # Series level
            tm.assert_series_equal(pd.Series(arr).isna(), pd.Series(expected))
            # DataFrame level
            tm.assert_frame_equal(pd.DataFrame(arr).isna(), pd.DataFrame(expected))
|  | ||||
|  | ||||
| def test_value_counts_sort_false(dtype): | ||||
|     if dtype.na_value is np.nan: | ||||
|         exp_dtype = "int64" | ||||
|     elif dtype.storage == "pyarrow": | ||||
|         exp_dtype = "int64[pyarrow]" | ||||
|     else: | ||||
|         exp_dtype = "Int64" | ||||
|     ser = pd.Series(["a", "b", "c", "b"], dtype=dtype) | ||||
|     result = ser.value_counts(sort=False) | ||||
|     expected = pd.Series([1, 2, 1], index=ser[:3], dtype=exp_dtype, name="count") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_memory_usage(dtype): | ||||
|     # GH 33963 | ||||
|  | ||||
|     if dtype.storage == "pyarrow": | ||||
|         pytest.skip(f"not applicable for {dtype.storage}") | ||||
|  | ||||
|     series = pd.Series(["a", "b", "c"], dtype=dtype) | ||||
|  | ||||
|     assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64])
def test_astype_from_float_dtype(float_dtype, dtype):
    """Casting a float Series to a string dtype stringifies the values.

    See https://github.com/pandas-dev/pandas/issues/36451.
    """
    result = pd.Series([0.1], dtype=float_dtype).astype(dtype)
    tm.assert_series_equal(result, pd.Series(["0.1"], dtype=dtype))
|  | ||||
|  | ||||
| def test_to_numpy_returns_pdna_default(dtype): | ||||
|     arr = pd.array(["a", pd.NA, "b"], dtype=dtype) | ||||
|     result = np.array(arr) | ||||
|     expected = np.array(["a", dtype.na_value, "b"], dtype=object) | ||||
|     tm.assert_numpy_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_to_numpy_na_value(dtype, nulls_fixture): | ||||
|     na_value = nulls_fixture | ||||
|     arr = pd.array(["a", pd.NA, "b"], dtype=dtype) | ||||
|     result = arr.to_numpy(na_value=na_value) | ||||
|     expected = np.array(["a", na_value, "b"], dtype=object) | ||||
|     tm.assert_numpy_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_isin(dtype, fixed_now_ts): | ||||
|     s = pd.Series(["a", "b", None], dtype=dtype) | ||||
|  | ||||
|     result = s.isin(["a", "c"]) | ||||
|     expected = pd.Series([True, False, False]) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = s.isin(["a", pd.NA]) | ||||
|     expected = pd.Series([True, False, True]) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = s.isin([]) | ||||
|     expected = pd.Series([False, False, False]) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = s.isin(["a", fixed_now_ts]) | ||||
|     expected = pd.Series([True, False, False]) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = s.isin([fixed_now_ts]) | ||||
|     expected = pd.Series([False, False, False]) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_isin_string_array(dtype, dtype2): | ||||
|     s = pd.Series(["a", "b", None], dtype=dtype) | ||||
|  | ||||
|     result = s.isin(pd.array(["a", "c"], dtype=dtype2)) | ||||
|     expected = pd.Series([True, False, False]) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = s.isin(pd.array(["a", None], dtype=dtype2)) | ||||
|     expected = pd.Series([True, False, True]) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_isin_arrow_string_array(dtype):
    """isin works when the haystack is an ArrowDtype string array."""
    pa = pytest.importorskip("pyarrow")
    ser = pd.Series(["a", "b", None], dtype=dtype)
    arrow_string = pd.ArrowDtype(pa.string())

    tm.assert_series_equal(
        ser.isin(pd.array(["a", "c"], dtype=arrow_string)),
        pd.Series([True, False, False]),
    )
    # None in the haystack matches the missing value
    tm.assert_series_equal(
        ser.isin(pd.array(["a", None], dtype=arrow_string)),
        pd.Series([True, False, True]),
    )
|  | ||||
|  | ||||
def test_setitem_scalar_with_mask_validation(dtype):
    """Masked setitem stores NA for None and validates non-string scalars.

    Setting None through a boolean mask (via _putmask) must produce the
    dtype's missing-value sentinel in the underlying array (GH#47628), while
    non-string scalars are rejected with a TypeError.
    """
    mask = np.array([False, True, False])

    ser = pd.Series(["a", "b", "c"], dtype=dtype)
    ser[mask] = None
    assert ser.array[1] is ser.dtype.na_value

    # a non-string scalar must raise instead of being coerced
    ser = pd.Series(["a", "b", "c"], dtype=dtype)
    msg = "Invalid value '1' for dtype 'str"
    with pytest.raises(TypeError, match=msg):
        ser[mask] = 1
|  | ||||
|  | ||||
| def test_from_numpy_str(dtype): | ||||
|     vals = ["a", "b", "c"] | ||||
|     arr = np.array(vals, dtype=np.str_) | ||||
|     result = pd.array(arr, dtype=dtype) | ||||
|     expected = pd.array(vals, dtype=dtype) | ||||
|     tm.assert_extension_array_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_tolist(dtype): | ||||
|     vals = ["a", "b", "c"] | ||||
|     arr = pd.array(vals, dtype=dtype) | ||||
|     result = arr.tolist() | ||||
|     expected = vals | ||||
|     tm.assert_equal(result, expected) | ||||
| @ -0,0 +1,282 @@ | ||||
| import pickle | ||||
| import re | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas.util._test_decorators as td | ||||
|  | ||||
| import pandas as pd | ||||
| import pandas._testing as tm | ||||
| from pandas.core.arrays.string_ import ( | ||||
|     StringArray, | ||||
|     StringDtype, | ||||
| ) | ||||
| from pandas.core.arrays.string_arrow import ( | ||||
|     ArrowStringArray, | ||||
|     ArrowStringArrayNumpySemantics, | ||||
| ) | ||||
|  | ||||
|  | ||||
def test_eq_all_na():
    """Elementwise equality of all-NA pyarrow-backed arrays propagates NA."""
    pytest.importorskip("pyarrow")
    arr = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow"))
    result = arr == arr
    expected = pd.array([pd.NA, pd.NA], dtype="boolean[pyarrow]")
    tm.assert_extension_array_equal(result, expected)
|  | ||||
|  | ||||
def test_config(string_storage, using_infer_string):
    """The string_storage option controls the default StringDtype storage."""
    # with the default string_storage setting
    # always "python" at the moment
    assert StringDtype().storage == "python"

    with pd.option_context("string_storage", string_storage):
        assert StringDtype().storage == string_storage
        result = pd.array(["a", "b"])
        assert result.dtype.storage == string_storage

    # pd.array(..) by default always returns the NA-variant
    # NOTE: `result` was created inside the option context above, so its
    # storage stays `string_storage` even after the option is restored.
    dtype = StringDtype(string_storage, na_value=pd.NA)
    expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
    tm.assert_equal(result, expected)
|  | ||||
|  | ||||
def test_config_bad_storage_raises():
    """Assigning an unknown storage backend to the option is rejected."""
    expected_msg = re.escape("Value must be one of python|pyarrow")
    with pytest.raises(ValueError, match=expected_msg):
        pd.options.mode.string_storage = "foo"
|  | ||||
|  | ||||
@pytest.mark.parametrize("chunked", [True, False])
@pytest.mark.parametrize("array_lib", ["numpy", "pyarrow"])
def test_constructor_not_string_type_raises(array_lib, chunked):
    """ArrowStringArray rejects non-string input from numpy or pyarrow."""
    pa = pytest.importorskip("pyarrow")

    lib = pa if array_lib == "pyarrow" else np
    arr = lib.array([1, 2, 3])
    if chunked:
        if lib is np:
            pytest.skip("chunked not applicable to numpy array")
        arr = pa.chunked_array(arr)

    # numpy input fails the generic type check; pyarrow input fails the
    # stricter large_string requirement
    if lib is np:
        msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
    else:
        msg = re.escape(
            "ArrowStringArray requires a PyArrow (chunked) array of large_string type"
        )
    with pytest.raises(ValueError, match=msg):
        ArrowStringArray(arr)
|  | ||||
|  | ||||
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_not_string_type_value_dictionary_raises(chunked):
    """A dictionary array with non-string values is rejected as well."""
    pa = pytest.importorskip("pyarrow")

    arr = pa.array([1, 2, 3], pa.dictionary(pa.int32(), pa.int32()))
    if chunked:
        arr = pa.chunked_array(arr)

    expected_msg = re.escape(
        "ArrowStringArray requires a PyArrow (chunked) array of large_string type"
    )
    with pytest.raises(ValueError, match=expected_msg):
        ArrowStringArray(arr)
|  | ||||
|  | ||||
@pytest.mark.parametrize("string_type", ["string", "large_string"])
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_valid_string_type_value_dictionary(string_type, chunked):
    """Dictionary-encoded string arrays are accepted and densified."""
    pa = pytest.importorskip("pyarrow")

    encoded = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode()
    if chunked:
        encoded = pa.chunked_array(encoded)

    result = ArrowStringArray(encoded)
    # dictionary input is converted to a dense large_string array
    assert pa.types.is_large_string(result._pa_array.type)
|  | ||||
|  | ||||
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_valid_string_view(chunked):
    """string_view arrays are accepted and cast to large_string."""
    # casting string_view to string requires pyarrow>=18
    pa = pytest.importorskip("pyarrow", minversion="18")

    view = pa.array(["1", "2", "3"], pa.string_view())
    if chunked:
        view = pa.chunked_array(view)

    result = ArrowStringArray(view)
    # string_view input is converted to a dense large_string array
    assert pa.types.is_large_string(result._pa_array.type)
|  | ||||
|  | ||||
def test_constructor_from_list():
    """A Series built from a list adopts the requested pyarrow storage.

    See GH#27673.
    """
    pytest.importorskip("pyarrow")
    ser = pd.Series(["E"], dtype=StringDtype(storage="pyarrow"))
    assert isinstance(ser.dtype, StringDtype)
    assert ser.dtype.storage == "pyarrow"
|  | ||||
|  | ||||
def test_from_sequence_wrong_dtype_raises(using_infer_string):
    """_from_sequence rejects dtypes that do not match the array class.

    ArrowStringArray requires a pyarrow-storage dtype and StringArray a
    python-storage one; a mismatch fails with an AssertionError.  The plain
    "string" alias is accepted by both classes under either option value,
    while StringDtype() resolves through the string_storage option except
    under infer-string mode (hence the ``using_infer_string`` guards).
    """
    pytest.importorskip("pyarrow")
    # ArrowStringArray: plain "string" is accepted under either option value
    with pd.option_context("string_storage", "python"):
        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")

    with pd.option_context("string_storage", "pyarrow"):
        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")

    # explicit python storage never matches ArrowStringArray
    with pytest.raises(AssertionError, match=None):
        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]")

    ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")

    # StringDtype() follows the option outside infer-string mode
    if not using_infer_string:
        with pytest.raises(AssertionError, match=None):
            with pd.option_context("string_storage", "python"):
                ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

    with pd.option_context("string_storage", "pyarrow"):
        ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

    if not using_infer_string:
        with pytest.raises(AssertionError, match=None):
            ArrowStringArray._from_sequence(
                ["a", None, "c"], dtype=StringDtype("python")
            )

    ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))

    # StringArray: the same matrix, mirrored for python storage
    with pd.option_context("string_storage", "python"):
        StringArray._from_sequence(["a", None, "c"], dtype="string")

    with pd.option_context("string_storage", "pyarrow"):
        StringArray._from_sequence(["a", None, "c"], dtype="string")

    StringArray._from_sequence(["a", None, "c"], dtype="string[python]")

    # explicit pyarrow storage never matches StringArray
    with pytest.raises(AssertionError, match=None):
        StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")

    if not using_infer_string:
        with pd.option_context("string_storage", "python"):
            StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

    if not using_infer_string:
        with pytest.raises(AssertionError, match=None):
            with pd.option_context("string_storage", "pyarrow"):
                StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

    StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))

    with pytest.raises(AssertionError, match=None):
        StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))
|  | ||||
|  | ||||
@td.skip_if_installed("pyarrow")
def test_pyarrow_not_installed_raises():
    """Every pyarrow-backed entry point raises a helpful ImportError."""
    msg = re.escape("pyarrow>=10.0.1 is required for PyArrow backed")

    pyarrow_entry_points = [
        lambda: StringDtype(storage="pyarrow"),
        lambda: ArrowStringArray([]),
        lambda: ArrowStringArrayNumpySemantics([]),
        lambda: ArrowStringArray._from_sequence(["a", None, "b"]),
    ]
    for construct in pyarrow_entry_points:
        with pytest.raises(ImportError, match=msg):
            construct()
|  | ||||
|  | ||||
@pytest.mark.parametrize("multiple_chunks", [False, True])
@pytest.mark.parametrize(
    "key, value, expected",
    [
        (-1, "XX", ["a", "b", "c", "d", "XX"]),
        (1, "XX", ["a", "XX", "c", "d", "e"]),
        (1, None, ["a", None, "c", "d", "e"]),
        (1, pd.NA, ["a", None, "c", "d", "e"]),
        ([1, 3], "XX", ["a", "XX", "c", "XX", "e"]),
        ([1, 3], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
        ([1, 3], ["XX", None], ["a", "XX", "c", None, "e"]),
        ([1, 3], ["XX", pd.NA], ["a", "XX", "c", None, "e"]),
        ([0, -1], ["XX", "YY"], ["XX", "b", "c", "d", "YY"]),
        ([-1, 0], ["XX", "YY"], ["YY", "b", "c", "d", "XX"]),
        (slice(3, None), "XX", ["a", "b", "c", "XX", "XX"]),
        (slice(2, 4), ["XX", "YY"], ["a", "b", "XX", "YY", "e"]),
        (slice(3, 1, -1), ["XX", "YY"], ["a", "b", "YY", "XX", "e"]),
        (slice(None), "XX", ["XX", "XX", "XX", "XX", "XX"]),
        ([False, True, False, True, False], ["XX", "YY"], ["a", "XX", "c", "YY", "e"]),
    ],
)
def test_setitem(multiple_chunks, key, value, expected):
    """__setitem__ supports scalar, list, slice and mask keys on 1+ chunks."""
    pa = pytest.importorskip("pyarrow")

    initial = pa.array(list("abcde"))
    target = pa.array(expected)
    if multiple_chunks:
        # split both arrays so chunk boundaries are exercised too
        initial = pa.chunked_array([initial[:3], initial[3:]])
        target = pa.chunked_array([target[:3], target[3:]])

    result = ArrowStringArray(initial)
    result[key] = value
    tm.assert_equal(result, ArrowStringArray(target))
|  | ||||
|  | ||||
def test_setitem_invalid_indexer_raises():
    """Out-of-bounds or mis-sized indexers raise rather than silently write."""
    pa = pytest.importorskip("pyarrow")

    arr = ArrowStringArray(pa.array(list("abcde")))

    # positions outside [-5, 4] and a wrong-length boolean mask: IndexError
    for bad_key in (5, -6, [0, 5], [0, -6], [True, True, False]):
        with pytest.raises(IndexError, match=None):
            arr[bad_key] = "foo"

    # a value list whose length does not match the indexer: ValueError
    with pytest.raises(ValueError, match=None):
        arr[[0, 1]] = ["foo", "bar", "baz"]
|  | ||||
|  | ||||
@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
def test_pickle_roundtrip(na_value):
    """Pickling round-trips full and sliced pyarrow-backed series (GH 42600)."""
    pytest.importorskip("pyarrow")
    dtype = StringDtype("pyarrow", na_value=na_value)
    full = pd.Series(range(10), dtype=dtype)
    sliced = full.head(2)

    full_pickled = pickle.dumps(full)
    sliced_pickled = pickle.dumps(sliced)
    # slicing before pickling must not drag the full buffer along
    assert len(full_pickled) > len(sliced_pickled)

    tm.assert_series_equal(pickle.loads(full_pickled), full)
    tm.assert_series_equal(pickle.loads(sliced_pickled), sliced)
|  | ||||
|  | ||||
def test_string_dtype_error_message():
    """An unknown storage name produces a clear ValueError (GH#55051)."""
    pytest.importorskip("pyarrow")
    with pytest.raises(ValueError, match="Storage must be 'python' or 'pyarrow'."):
        StringDtype("bla")
		Reference in New Issue
	
	Block a user