done
This commit is contained in:
		| @ -0,0 +1,23 @@ | ||||
| import numpy as np | ||||
|  | ||||
| import pandas as pd | ||||
|  | ||||
|  | ||||
def is_object_or_nan_string_dtype(dtype):
    """
    Check whether a string-like dtype follows NaN semantics.

    Parameters
    ----------
    dtype : np.dtype or object exposing ``na_value``
        The dtype to inspect.

    Returns
    -------
    bool
        True if ``dtype`` is the NumPy object dtype, or if its ``na_value``
        attribute is ``np.nan`` (the NaN-variant of the StringDtype).
        False otherwise, including for dtypes without an ``na_value``
        attribute.
    """
    if isinstance(dtype, np.dtype) and dtype == "object":
        return True
    # Non-object NumPy dtypes have no ``na_value`` attribute; report them as
    # "not NaN semantics" instead of failing with AttributeError.
    return getattr(dtype, "na_value", None) is np.nan
|  | ||||
|  | ||||
def _convert_na_value(ser, expected):
    """
    Fill the missing values of ``expected`` to match the dtype of ``ser``.

    ``expected`` is returned untouched when ``ser`` has object dtype.
    Otherwise its missing values are filled with ``np.nan`` when
    ``ser.dtype.na_value`` is ``np.nan``, and with ``pd.NA`` in every
    other case.
    """
    source_dtype = ser.dtype
    if source_dtype == object:
        return expected

    # GH#18463: dtypes whose na_value is not NaN get pd.NA
    fill_value = np.nan if source_dtype.na_value is np.nan else pd.NA
    return expected.fillna(fill_value)
							
								
								
									
										132
									
								
								lib/python3.11/site-packages/pandas/tests/strings/conftest.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										132
									
								
								lib/python3.11/site-packages/pandas/tests/strings/conftest.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,132 @@ | ||||
| import pytest | ||||
|  | ||||
| from pandas import Series | ||||
| from pandas.core.strings.accessor import StringMethods | ||||
|  | ||||
# Each entry is ``(method_name, positional_args, keyword_args)``: sample
# arguments with which the named ``.str`` method can be called.
_any_string_method = [
    ("cat", (), {"sep": ","}),
    ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}),
    ("center", (10,), {}),
    ("contains", ("a",), {}),
    ("count", ("a",), {}),
    ("decode", ("UTF-8",), {}),
    ("encode", ("UTF-8",), {}),
    ("endswith", ("a",), {}),
    ("endswith", ((),), {}),
    ("endswith", (("a",),), {}),
    ("endswith", (("a", "b"),), {}),
    ("endswith", (("a", "MISSING"),), {}),
    ("endswith", ("a",), {"na": True}),
    ("endswith", ("a",), {"na": False}),
    ("extract", ("([a-z]*)",), {"expand": False}),
    ("extract", ("([a-z]*)",), {"expand": True}),
    ("extractall", ("([a-z]*)",), {}),
    ("find", ("a",), {}),
    ("findall", ("a",), {}),
    ("get", (0,), {}),
    # because "index" (and "rindex") fail intentionally
    # if the string is not found, search only for empty string
    ("index", ("",), {}),
    ("join", (",",), {}),
    ("ljust", (10,), {}),
    ("match", ("a",), {}),
    ("fullmatch", ("a",), {}),
    ("normalize", ("NFC",), {}),
    ("pad", (10,), {}),
    ("partition", (" ",), {"expand": False}),
    ("partition", (" ",), {"expand": True}),
    ("repeat", (3,), {}),
    ("replace", ("a", "z"), {}),
    ("rfind", ("a",), {}),
    ("rindex", ("",), {}),
    ("rjust", (10,), {}),
    ("rpartition", (" ",), {"expand": False}),
    ("rpartition", (" ",), {"expand": True}),
    ("slice", (0, 1), {}),
    ("slice_replace", (0, 1, "z"), {}),
    ("split", (" ",), {"expand": False}),
    ("split", (" ",), {"expand": True}),
    ("startswith", ("a",), {}),
    ("startswith", (("a",),), {}),
    ("startswith", (("a", "b"),), {}),
    ("startswith", (("a", "MISSING"),), {}),
    ("startswith", ((),), {}),
    ("startswith", ("a",), {"na": True}),
    ("startswith", ("a",), {"na": False}),
    ("removeprefix", ("a",), {}),
    ("removesuffix", ("a",), {}),
    # translating unicode points of "a" to "d"
    ("translate", ({97: 100},), {}),
    ("wrap", (2,), {}),
    ("zfill", (10,), {}),
] + [
    # methods without positional arguments: pair each name with an empty
    # tuple and a fresh empty dict, so no kwargs dict is shared between
    # entries and the list of names can grow without a length cap
    (method_name, (), {})
    for method_name in [
        "capitalize",
        "cat",
        "get_dummies",
        "isalnum",
        "isalpha",
        "isdecimal",
        "isdigit",
        "islower",
        "isnumeric",
        "isspace",
        "istitle",
        "isupper",
        "len",
        "lower",
        "lstrip",
        "partition",
        "rpartition",
        "rsplit",
        "rstrip",
        "slice",
        "slice_replace",
        "split",
        "strip",
        "swapcase",
        "title",
        "upper",
        "casefold",
    ]
]
# use method name as fixture-id
ids = tuple(entry[0] for entry in _any_string_method)

# test that the above list captures all methods of StringMethods:
# every public attribute of the accessor must appear among the ids
missing_methods = {
    attr for attr in dir(StringMethods) if not attr.startswith("_")
}.difference(ids)
assert not missing_methods
|  | ||||
|  | ||||
@pytest.fixture(params=_any_string_method, ids=ids)
def any_string_method(request):
    """
    Fixture for all public methods of `StringMethods`

    The fixture is parametrized over the entries of `_any_string_method`
    (with `ids` as test ids) and returns the current entry unchanged: a
    tuple of the method name and sample arguments necessary to call the
    method.

    Returns
    -------
    method_name : str
        The name of the method in `StringMethods`
    args : tuple
        Sample values for the positional arguments
    kwargs : dict
        Sample values for the keyword arguments

    Examples
    --------
    >>> def test_something(any_string_method):
    ...     s = Series(['a', 'b', np.nan, 'd'])
    ...
    ...     method_name, args, kwargs = any_string_method
    ...     method = getattr(s.str, method_name)
    ...     # will not raise
    ...     method(*args, **kwargs)
    """
    # the (method_name, args, kwargs) entry selected for this test run
    return request.param
							
								
								
									
										205
									
								
								lib/python3.11/site-packages/pandas/tests/strings/test_api.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										205
									
								
								lib/python3.11/site-packages/pandas/tests/strings/test_api.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,205 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     CategoricalDtype, | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     _testing as tm, | ||||
|     option_context, | ||||
| ) | ||||
| from pandas.core.strings.accessor import StringMethods | ||||
|  | ||||
# subset of the full set from pandas/conftest.py;
# every entry pairs an inferred-dtype label with sample values
_any_allowed_skipna_inferred_dtype = [
    ("string", ["a", np.nan, "c"]),
    ("bytes", [b"a", np.nan, b"c"]),
    ("empty", [np.nan, np.nan, np.nan]),
    ("empty", []),
    ("mixed-integer", ["a", np.nan, 2]),
]
# use inferred type as id
ids = tuple(label for label, _values in _any_allowed_skipna_inferred_dtype)
|  | ||||
|  | ||||
@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
def any_allowed_skipna_inferred_dtype(request):
    """
    Fixture for (inferred) dtypes allowed in StringMethods.__init__

    The fixture is parametrized over `_any_allowed_skipna_inferred_dtype`,
    whose (inferred) types are:
    * 'string'
    * 'bytes'
    * 'empty' (once as all-NaN values, once as an empty list)
    * 'mixed-integer'

    Returns
    -------
    inferred_dtype : str
        The string for the inferred dtype from _libs.lib.infer_dtype
    values : np.ndarray
        An array of object dtype that will be inferred to have
        `inferred_dtype`

    Examples
    --------
    >>> from pandas._libs import lib
    >>>
    >>> def test_something(any_allowed_skipna_inferred_dtype):
    ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
    ...     # will pass
    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
    ...
    ...     # constructor for .str-accessor will also pass
    ...     Series(values).str
    """
    inferred_dtype, values = request.param
    # convert the sample list to an ndarray; object dtype to avoid casting
    values = np.array(values, dtype=object)  # object dtype to avoid casting

    # correctness of inference tested in tests/dtypes/test_inference.py
    return inferred_dtype, values
|  | ||||
|  | ||||
def test_api(any_string_dtype):
    # GH 6106, GH 9322
    # the accessor is StringMethods both on the class and on an instance
    assert Series.str is StringMethods
    ser = Series([""], dtype=any_string_dtype)
    assert isinstance(ser.str, StringMethods)
|  | ||||
|  | ||||
def test_api_mi_raises():
    # GH 23679
    # accessing .str on a MultiIndex raises, so hasattr reports False
    expected_msg = "Can only use .str accessor with Index, not MultiIndex"
    midx = MultiIndex.from_arrays([["a", "b", "c"]])
    with pytest.raises(AttributeError, match=expected_msg):
        midx.str
    assert not hasattr(midx, "str")
|  | ||||
|  | ||||
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
    # one instance of parametrized fixture
    inferred_dtype, values = any_skipna_inferred_dtype

    # explicit dtype to avoid casting
    obj = index_or_series(values, dtype=dtype)

    # inferred dtypes for which the .str accessor can be constructed
    types_passing_constructor = (
        "string",
        "unicode",
        "empty",
        "bytes",
        "mixed",
        "mixed-integer",
    )
    if inferred_dtype not in types_passing_constructor:
        # GH 9184, GH 23011, GH 23163
        msg = "Can only use .str accessor with string values.*"
        with pytest.raises(AttributeError, match=msg):
            obj.str
        assert not hasattr(obj, "str")
    else:
        # GH 6106
        assert isinstance(obj.str, StringMethods)
|  | ||||
|  | ||||
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_method(
    index_or_series,
    dtype,
    any_allowed_skipna_inferred_dtype,
    any_string_method,
    request,
    using_infer_string,
):
    # this test does not check correctness of the different methods,
    # just that the methods work on the specified (inferred) dtypes,
    # and raise on all others
    box = index_or_series

    # one instance of each parametrized fixture
    inferred_dtype, values = any_allowed_skipna_inferred_dtype
    method_name, args, kwargs = any_string_method

    # Known failures: `reason` stays None unless one of the combinations
    # below applies; `raises` is only assigned together with `reason`.
    reason = None
    if box is Index and values.size == 0:
        # zero-length values boxed in an Index
        if method_name in ["partition", "rpartition"] and kwargs.get("expand", True):
            raises = TypeError
            reason = "Method cannot deal with empty Index"
        elif method_name == "split" and kwargs.get("expand", None):
            raises = TypeError
            reason = "Split fails on empty Series when expand=True"
        elif method_name == "get_dummies":
            raises = ValueError
            reason = "Need to fortify get_dummies corner cases"

    elif (
        box is Index
        and inferred_dtype == "empty"
        and dtype == object
        and method_name == "get_dummies"
    ):
        # non-zero-length "empty" values (the zero-length case was
        # handled by the branch above)
        raises = ValueError
        reason = "Need to fortify get_dummies corner cases"

    if reason is not None:
        # mark only this parametrization as an expected failure
        mark = pytest.mark.xfail(raises=raises, reason=reason)
        request.applymarker(mark)

    t = box(values, dtype=dtype)  # explicit dtype to avoid casting
    method = getattr(t.str, method_name)

    # which inferred dtypes the method is expected to accept
    if using_infer_string and dtype == "category":
        string_allowed = method_name not in ["decode"]
    else:
        string_allowed = True
    bytes_allowed = method_name in ["decode", "get", "len", "slice"]
    # as of v0.23.4, all methods except 'cat' are very lenient with the
    # allowed data types, just returning NaN for entries that error.
    # This could be changed with an 'errors'-kwarg to the `str`-accessor,
    # see discussion in GH 13877
    mixed_allowed = method_name not in ["cat"]

    # multiplying a list by a bool keeps it (True) or empties it (False)
    allowed_types = (
        ["empty"]
        + ["string", "unicode"] * string_allowed
        + ["bytes"] * bytes_allowed
        + ["mixed", "mixed-integer"] * mixed_allowed
    )

    if inferred_dtype in allowed_types:
        # xref GH 23555, GH 23556
        with option_context("future.no_silent_downcasting", True):
            method(*args, **kwargs)  # works!
    else:
        # GH 23011, GH 23163
        # `msg` is used as a regex by pytest.raises: either alternative
        # around the "|" may match the raised TypeError
        msg = (
            f"Cannot use .str.{method_name} with values of "
            f"inferred dtype {repr(inferred_dtype)}."
            "|a bytes-like object is required, not 'str'"
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
|  | ||||
|  | ||||
def test_api_for_categorical(any_string_method, any_string_dtype):
    # https://github.com/pandas-dev/pandas/issues/10661
    method_name, args, kwargs = any_string_method

    base = Series(list("aabb"), dtype=any_string_dtype)
    s = base + " " + base

    # categorical version of the same data, with object-dtype categories
    cat = s.astype("category")
    object_categories = cat.dtype.categories.astype("object")
    cat = cat.astype(CategoricalDtype(object_categories))
    assert isinstance(cat.str, StringMethods)

    # the accessor on the categorical must agree with the object version
    result = getattr(cat.str, method_name)(*args, **kwargs)
    expected = getattr(s.astype("object").str, method_name)(*args, **kwargs)

    if isinstance(result, DataFrame):
        tm.assert_frame_equal(result, expected)
        return
    if isinstance(result, Series):
        tm.assert_series_equal(result, expected)
        return
    # str.cat(others=None) returns string, for example
    assert result == expected
| @ -0,0 +1,423 @@ | ||||
| from datetime import datetime | ||||
| import operator | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     Series, | ||||
|     _testing as tm, | ||||
| ) | ||||
|  | ||||
|  | ||||
def test_title(any_string_dtype):
    # title-casing keeps missing values in place
    raw = ["FOO", "BAR", np.nan, "Blah", "blurg"]
    titled = ["Foo", "Bar", np.nan, "Blah", "Blurg"]

    result = Series(raw, dtype=any_string_dtype).str.title()
    tm.assert_series_equal(result, Series(titled, dtype=any_string_dtype))
|  | ||||
|  | ||||
def test_title_mixed_object():
    # In a mixed object Series only the str elements are title-cased;
    # the expected values hold NaN for the non-string elements and keep
    # None as None.
    s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0])
    result = s.str.title()
    expected = Series(
        ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan],
        dtype=object,
    )
    # use the Series comparison, like the other mixed-object tests in
    # this module, instead of the looser assert_almost_equal
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_lower_upper(any_string_dtype):
    original = Series(["om", np.nan, "nom", "nom"], dtype=any_string_dtype)

    uppercased = original.str.upper()
    tm.assert_series_equal(
        uppercased, Series(["OM", np.nan, "NOM", "NOM"], dtype=any_string_dtype)
    )

    # lowering the upper-cased result gives back the input
    tm.assert_series_equal(uppercased.str.lower(), original)
|  | ||||
|  | ||||
def test_lower_upper_mixed_object():
    s = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])

    # expected values per accessor method, checked in the original order
    cases = [
        ("upper", ["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan]),
        ("lower", ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]),
    ]
    for method_name, values in cases:
        result = getattr(s.str, method_name)()
        tm.assert_series_equal(result, Series(values, dtype=object))
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "data, expected",
    [
        (
            ["FOO", "BAR", np.nan, "Blah", "blurg"],
            ["Foo", "Bar", np.nan, "Blah", "Blurg"],
        ),
        (["a", "b", "c"], ["A", "B", "C"]),
        (["a b", "a bc. de"], ["A b", "A bc. de"]),
    ],
)
def test_capitalize(data, expected, any_string_dtype):
    # compare str.capitalize on each input with its expected values
    capitalized = Series(data, dtype=any_string_dtype).str.capitalize()
    tm.assert_series_equal(capitalized, Series(expected, dtype=any_string_dtype))
|  | ||||
|  | ||||
def test_capitalize_mixed_object():
    # mixed object input: expected values hold NaN for non-strings
    mixed = ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]
    capitalized = ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan]

    result = Series(mixed).str.capitalize()
    tm.assert_series_equal(result, Series(capitalized, dtype=object))
|  | ||||
|  | ||||
def test_swapcase(any_string_dtype):
    # swapcase flips the case of every letter and keeps missing values
    raw = ["FOO", "BAR", np.nan, "Blah", "blurg"]
    swapped = ["foo", "bar", np.nan, "bLAH", "BLURG"]

    result = Series(raw, dtype=any_string_dtype).str.swapcase()
    tm.assert_series_equal(result, Series(swapped, dtype=any_string_dtype))
|  | ||||
|  | ||||
def test_swapcase_mixed_object():
    # mixed object input: expected values hold NaN for non-strings
    mixed = ["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0]
    swapped = ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan]

    result = Series(mixed).str.swapcase()
    tm.assert_series_equal(result, Series(swapped, dtype=object))
|  | ||||
|  | ||||
def test_casefold():
    # GH25405
    # the expected values spell "ß" as "ss" after casefolding
    folded = Series(["ss", np.nan, "case", "ssd"])
    result = Series(["ß", np.nan, "case", "ßd"]).str.casefold()

    tm.assert_series_equal(result, folded)
|  | ||||
|  | ||||
def test_casemethods(any_string_dtype):
    values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"]
    s = Series(values, dtype=any_string_dtype)

    # every accessor method must agree with the builtin str method
    for name in ("lower", "upper", "title", "capitalize", "swapcase"):
        from_accessor = getattr(s.str, name)().tolist()
        from_builtin = [getattr(v, name)() for v in values]
        assert from_accessor == from_builtin
|  | ||||
|  | ||||
def test_pad(any_string_dtype):
    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)

    # expected padding to width 5 for each side, in the original order
    padded_by_side = {
        "left": ["    a", "    b", np.nan, "    c", np.nan, "eeeeee"],
        "right": ["a    ", "b    ", np.nan, "c    ", np.nan, "eeeeee"],
        "both": ["  a  ", "  b  ", np.nan, "  c  ", np.nan, "eeeeee"],
    }
    for side, values in padded_by_side.items():
        result = s.str.pad(5, side=side)
        expected = Series(values, dtype=any_string_dtype)
        tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_pad_mixed_object():
    s = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0])

    # expected padding to width 5 for each side; NaN for non-strings
    padded_by_side = {
        "left": ["    a", np.nan, "    b", np.nan, np.nan, "   ee", None, np.nan, np.nan],
        "right": ["a    ", np.nan, "b    ", np.nan, np.nan, "ee   ", None, np.nan, np.nan],
        "both": ["  a  ", np.nan, "  b  ", np.nan, np.nan, "  ee ", None, np.nan, np.nan],
    }
    for side, values in padded_by_side.items():
        result = s.str.pad(5, side=side)
        expected = Series(values, dtype=object)
        tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_pad_fillchar(any_string_dtype):
    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)

    # expected padding with "X" to width 5 for each side
    padded_by_side = {
        "left": ["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"],
        "right": ["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"],
        "both": ["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"],
    }
    for side, values in padded_by_side.items():
        result = s.str.pad(5, side=side, fillchar="X")
        expected = Series(values, dtype=any_string_dtype)
        tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_pad_fillchar_bad_arg_raises(any_string_dtype):
    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)

    # a two-character string and an integer are both rejected as fillchar
    bad_fillchars = [
        ("XY", "fillchar must be a character, not str"),
        (5, "fillchar must be a character, not int"),
    ]
    for fillchar, msg in bad_fillchars:
        with pytest.raises(TypeError, match=msg):
            s.str.pad(5, fillchar=fillchar)
|  | ||||
|  | ||||
@pytest.mark.parametrize("method_name", ["center", "ljust", "rjust", "zfill", "pad"])
def test_pad_width_bad_arg_raises(method_name, any_string_dtype):
    # see gh-13598
    # a string passed as width is rejected by every padding method
    s = Series(["1", "22", "a", "bb"], dtype=any_string_dtype)

    msg = "width must be of integer type, not str"
    with pytest.raises(TypeError, match=msg):
        getattr(s.str, method_name)("f")
|  | ||||
|  | ||||
def test_center_ljust_rjust(any_string_dtype):
    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)

    # expected result of justifying to width 5 with each method
    justified = {
        "center": ["  a  ", "  b  ", np.nan, "  c  ", np.nan, "eeeeee"],
        "ljust": ["a    ", "b    ", np.nan, "c    ", np.nan, "eeeeee"],
        "rjust": ["    a", "    b", np.nan, "    c", np.nan, "eeeeee"],
    }
    for method_name, values in justified.items():
        result = getattr(s.str, method_name)(5)
        expected = Series(values, dtype=any_string_dtype)
        tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_center_ljust_rjust_mixed_object():
    s = Series(["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0])

    # expected result of justifying to width 5; NaN for non-strings
    nan = np.nan
    justified = {
        "center": ["  a  ", nan, "  b  ", nan, nan, "  c  ", " eee ", None, nan, nan],
        "ljust": ["a    ", nan, "b    ", nan, nan, "c    ", "eee  ", None, nan, nan],
        "rjust": ["    a", nan, "    b", nan, nan, "    c", "  eee", None, nan, nan],
    }
    for method_name, values in justified.items():
        result = getattr(s.str, method_name)(5)
        expected = Series(values, dtype=object)
        tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_center_ljust_rjust_fillchar(any_string_dtype):
    # GH#54533, GH#54792
    s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)

    justified = {
        "center": ["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"],
        "ljust": ["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"],
        "rjust": ["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"],
    }
    for method_name, values in justified.items():
        result = getattr(s.str, method_name)(5, fillchar="X")
        tm.assert_series_equal(result, Series(values, dtype=any_string_dtype))

        # the result must also agree with the builtin str method
        builtin = np.array(
            [getattr(v, method_name)(5, "X") for v in np.array(s)], dtype=np.object_
        )
        tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), builtin)
|  | ||||
|  | ||||
def test_center_ljust_rjust_fillchar_bad_arg_raises(any_string_dtype):
    s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)

    # If fillchar is not a character, normal str raises TypeError
    # 'aaa'.ljust(5, 'XY')
    # TypeError: must be char, not str
    template = "fillchar must be a character, not {dtype}"

    # first the two-character string, then the integer, for each method
    for fillchar, type_name in (("XY", "str"), (1, "int")):
        for method_name in ("center", "ljust", "rjust"):
            with pytest.raises(TypeError, match=template.format(dtype=type_name)):
                getattr(s.str, method_name)(5, fillchar=fillchar)
|  | ||||
|  | ||||
def test_zfill(any_string_dtype):
    s = Series(["1", "22", "aaa", "333", "45678"], dtype=any_string_dtype)

    # expected zero-filled values per width
    filled_by_width = [
        (5, ["00001", "00022", "00aaa", "00333", "45678"]),
        (3, ["001", "022", "aaa", "333", "45678"]),
    ]
    for width, values in filled_by_width:
        result = s.str.zfill(width)
        tm.assert_series_equal(result, Series(values, dtype=any_string_dtype))

        # the result must also agree with the builtin str.zfill
        builtin = np.array([v.zfill(width) for v in np.array(s)], dtype=np.object_)
        tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), builtin)

    # missing values stay missing
    with_nan = Series(["1", np.nan, "aaa", np.nan, "45678"], dtype=any_string_dtype)
    result = with_nan.str.zfill(5)
    expected = Series(
        ["00001", np.nan, "00aaa", np.nan, "45678"], dtype=any_string_dtype
    )
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_wrap(any_string_dtype):
    # test values are: two words less than width, two words equal to width,
    # two words greater than width, one word less than width, one word
    # equal to width, one word greater than width, multiple tokens with
    # trailing whitespace equal to width
    # each pair is (input, expected result of wrapping at width 12)
    cases = [
        ("hello world", "hello world"),
        ("hello world!", "hello world!"),
        ("hello world!!", "hello\nworld!!"),
        ("abcdefabcde", "abcdefabcde"),
        ("abcdefabcdef", "abcdefabcdef"),
        ("abcdefabcdefa", "abcdefabcdef\na"),
        ("ab ab ab ab ", "ab ab ab ab"),
        ("ab ab ab ab a", "ab ab ab ab\na"),
        ("\t", ""),
    ]
    inputs = [text for text, _ in cases]
    wrapped = [text for _, text in cases]

    s = Series(inputs, dtype=any_string_dtype)
    expected = Series(wrapped, dtype=any_string_dtype)

    result = s.str.wrap(12, break_long_words=True)
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_wrap_unicode(any_string_dtype):
    """
    Wrap at width 6 a value padded with plain spaces, a missing value and a
    value starting with non-ASCII characters.
    """
    non_ascii = "\xac\u20ac\U00008000"
    ser = Series(
        ["  pre  ", np.nan, non_ascii + " abadcafe"], dtype=any_string_dtype
    )
    result = ser.str.wrap(6)

    # the missing value is expected to stay missing
    expected = Series(
        ["  pre", np.nan, non_ascii + " ab\nadcafe"], dtype=any_string_dtype
    )
    tm.assert_series_equal(result, expected)
							
								
								
									
										427
									
								
								lib/python3.11/site-packages/pandas/tests/strings/test_cat.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										427
									
								
								lib/python3.11/site-packages/pandas/tests/strings/test_cat.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,427 @@ | ||||
| import re | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas.util._test_decorators as td | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     _testing as tm, | ||||
|     concat, | ||||
|     option_context, | ||||
| ) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize("other", [None, Series, Index])
def test_str_cat_name(index_or_series, other):
    """
    GH 21053: the result of ``str.cat`` carries the caller's name, whether
    ``others`` is a plain list, a Series or an Index.
    """
    values = ["a", "b"]
    # ``other`` is a constructor to wrap the values in, or None for a bare list
    others = other(values) if other else values
    caller = index_or_series(values, name="name")
    result = caller.str.cat(others, sep=",")
    assert result.name == "name"
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
def test_str_cat(index_or_series, infer_string):
    """
    Exercise ``str.cat`` called on a Series/Index: joining the caller's own
    elements, concatenating with an ndarray or a list, and rejecting
    list-likes whose length differs from the caller's.
    """
    with option_context("future.infer_string", infer_string):
        caller = index_or_series(["a", "a", "b", "b", "c", np.nan])

        # without ``others`` the caller's elements are joined into one string;
        # the missing value only shows up when ``na_rep`` is given
        single_array_cases = [
            ({}, "aabbc"),
            ({"na_rep": "-"}, "aabbc-"),
            ({"sep": "_", "na_rep": "NA"}, "a_a_b_b_c_NA"),
        ]
        for kwargs, joined in single_array_cases:
            assert caller.str.cat(**kwargs) == joined

        others = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object)
        expected = index_or_series(["aa", "a-", "bb", "bd", "cfoo", "--"])

        # Series/Index with an array, then with the equivalent list
        for rhs in (others, list(others)):
            result = caller.str.cat(rhs, na_rep="-")
            tm.assert_equal(result, expected)

        # errors for incorrect lengths
        rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
        too_short = Series(["1", "2", "3"])
        for rhs in (too_short.values, list(too_short)):
            with pytest.raises(ValueError, match=rgx):
                caller.str.cat(rhs)
|  | ||||
|  | ||||
def test_str_cat_raises_intuitive_error(index_or_series):
    """
    GH 11334: a bare string passed positionally as ``others`` raises a
    ValueError whose message asks whether ``sep`` was intended.
    """
    caller = index_or_series(["a", "b", "c", "d"])
    message = "Did you mean to supply a `sep` keyword?"
    for accidental_sep in ("|", "    "):
        with pytest.raises(ValueError, match=message):
            caller.str.cat(accidental_sep)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("sep", ["", None])
@pytest.mark.parametrize("dtype_target", ["object", "category"])
@pytest.mark.parametrize("dtype_caller", ["object", "category"])
def test_str_cat_categorical(
    index_or_series, dtype_caller, dtype_target, sep, infer_string
):
    """
    Concatenate object/categorical callers with object/categorical targets,
    passing the target as raw values, as a Series sharing the caller's index,
    and as a Series with a different index.
    """
    box = index_or_series

    with option_context("future.infer_string", infer_string):
        caller = Index(["a", "a", "b", "a"], dtype=dtype_caller)
        if box != Index:
            caller = Series(caller, index=caller, dtype=caller.dtype)
        target = Index(["b", "a", "b", "c"], dtype=dtype_target)

        # an object caller pins the expected dtype to object; otherwise the
        # dtype is left to inference
        expected = Index(
            ["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None
        )
        if box != Index:
            expected = Series(
                expected, index=Index(caller, dtype=dtype_caller), dtype=expected.dtype
            )

        # Series/Index with unaligned Index -> target.values
        result = caller.str.cat(target.values, sep=sep)
        tm.assert_equal(result, expected)

        # Series/Index with Series having matching Index
        target = Series(target.values, index=Index(caller, dtype=dtype_caller))
        result = caller.str.cat(target, sep=sep)
        tm.assert_equal(result, expected)

        # Series/Index with Series.values
        result = caller.str.cat(target.values, sep=sep)
        tm.assert_equal(result, expected)

        # Series/Index with Series having different Index
        target = Series(target.values, index=target.values)
        expected = Index(
            ["aa", "aa", "bb", "bb", "aa"],
            dtype=object if dtype_caller == "object" else None,
        )
        if dtype_caller == "object":
            label_dtype = object
        else:
            label_dtype = caller.dtype.categories.dtype
        if box != Index:
            # labels of the expected Series are the first character of each value
            expected = Series(
                expected,
                index=Index(expected.str[:1], dtype=label_dtype),
                dtype=expected.dtype,
            )

        result = caller.str.cat(target, sep=sep)
        tm.assert_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "data",
    [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]],
    ids=["integers", "floats", "mixed"],
)
# without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
@pytest.mark.parametrize(
    "box",
    [Series, Index, list, lambda x: np.array(x, dtype=object)],
    ids=["Series", "Index", "list", "np.array"],
)
def test_str_cat_wrong_dtype_raises(box, data):
    """
    GH 22722: ``str.cat`` raises TypeError when ``others`` holds integers,
    floats or a mix of numbers and strings, whatever container they come in.
    """
    caller = Series(["a", "b", "c"])
    others = box(data)

    msg = "Concatenation requires list-likes containing only strings.*"
    # need to use outer and na_rep, as otherwise Index would not raise
    with pytest.raises(TypeError, match=msg):
        caller.str.cat(others, join="outer", na_rep="-")
|  | ||||
|  | ||||
def test_str_cat_mixed_inputs(index_or_series):
    """
    Concatenate a Series/Index with DataFrames, 2-d arrays and lists mixing
    Series and arrays, with matching and with differing indexes, then check
    the errors raised for wrong lengths and for unsupported ``others``.
    """
    box = index_or_series
    caller = Index(["a", "b", "c", "d"])
    if box != Index:
        caller = Series(caller, index=caller)

    def as_expected(obj):
        # Index callers compare against ``obj`` directly; Series callers
        # against its values labelled with the caller's values
        return obj if box == Index else Series(obj.values, index=caller.values)

    upper = Series(["A", "B", "C", "D"], index=caller.values)
    frame = concat([upper, Series(caller, index=caller)], axis=1)

    expected = as_expected(Index(["aAa", "bBb", "cCc", "dDd"]))

    aligned_others = [
        frame,  # DataFrame
        frame.values,  # two-dimensional ndarray
        [upper, caller],  # list of Series
        [upper, caller.values],  # mixed list of Series/array
    ]
    for others in aligned_others:
        result = caller.str.cat(others)
        tm.assert_equal(result, expected)

    # give the indexed Series different labels
    upper.index = ["b", "c", "d", "a"]
    expected = as_expected(box(["aDa", "bAb", "cBc", "dCd"]))
    # list of Series, then mixed list; different indexes
    for others in ([upper, caller], [upper, caller.values]):
        result = caller.str.cat(others)
        tm.assert_equal(result, expected)

    # Series/Index with DataFrame; different indexes
    frame.index = ["b", "c", "d", "a"]
    expected = as_expected(box(["aDd", "bAa", "cBb", "dCc"]))
    result = caller.str.cat(frame)
    tm.assert_equal(result, expected)

    # errors for incorrect lengths
    length_rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    short = Series(["1", "2", "3"])
    short_frame = concat([short, short], axis=1)
    wrong_length = [
        short_frame.values,  # two-dimensional ndarray
        [short.values, caller.values],  # list of list-likes
        [short.values, caller],  # mixed list of Series/list-like
    ]
    for others in wrong_length:
        with pytest.raises(ValueError, match=length_rgx):
            caller.str.cat(others)

    # errors for incorrect arguments in list-like
    type_rgx = "others must be Series, Index, DataFrame,.*"
    # make sure None/NaN do not crash checks in _get_series_list
    with_na = Series(["a", np.nan, "c", None])
    wrong_type = [
        [with_na, "u"],  # mix of string and Series
        [with_na, frame],  # DataFrame in list
        [with_na, frame.values],  # 2-dim ndarray in list
        [with_na, [with_na, frame]],  # nested lists
        set(with_na),  # forbidden input type: set (GH 23009)
        [with_na, set(with_na)],  # forbidden input type: set in list (GH 23009)
        1,  # other forbidden input type, e.g. int
        iter([upper.values, list(caller)]),  # nested list-likes
    ]
    for others in wrong_type:
        with pytest.raises(TypeError, match=type_rgx):
            caller.str.cat(others)
|  | ||||
|  | ||||
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_indexed(index_or_series, join):
    """
    ``str.cat`` with ``join`` must match concatenating manually aligned inputs.

    See https://github.com/pandas-dev/pandas/issues/18657
    """
    box = index_or_series

    s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"])
    t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"])
    sa, ta = s.align(t, join=join)
    # result after manual alignment of inputs
    expected = sa.str.cat(ta, na_rep="-")

    if box == Index:
        # the caller becomes an Index of the same values, so the expected
        # result is an Index as well (``sa`` is not needed past this point)
        s = Index(s)
        expected = Index(expected)

    result = s.str.cat(t, join=join, na_rep="-")
    tm.assert_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_mixed_inputs(join):
    """
    Check ``str.cat`` alignment for every ``join`` mode when ``others`` is a
    list of Series, a DataFrame, or a list mixing an indexed Series with an
    unindexed array; also check the errors for nested lists and bad lengths.
    """
    caller = Series(["a", "b", "c", "d"])
    indexed = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
    frame = concat([indexed, indexed], axis=1)

    expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"])
    expected = expected_outer.loc[caller.index.join(indexed.index, how=join)]

    # list of Series, then DataFrame
    for others in ([indexed, indexed], frame):
        result = caller.str.cat(others, join=join, na_rep="-")
        tm.assert_series_equal(result, expected)

    # mixed list of indexed/unindexed
    unindexed = np.array(["A", "B", "C", "D"])
    expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
    # joint index of rhs [indexed, unindexed]; the unindexed array will be
    # forced to have the index of the caller
    if join == "inner":
        rhs_idx = indexed.index.intersection(caller.index)
    elif join == "outer":
        rhs_idx = indexed.index.union(caller.index)
    else:
        rhs_idx = indexed.index.append(caller.index.difference(indexed.index))

    expected = expected_outer.loc[caller.index.join(rhs_idx, how=join)]
    result = caller.str.cat([indexed, unindexed], join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    # nested lists are forbidden
    with pytest.raises(TypeError, match="others must be Series,.*"):
        caller.str.cat([indexed, list(unindexed)], join=join)

    # errors for incorrect lengths
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    too_short = Series(["1", "2", "3"]).values

    # unindexed object of wrong length, alone and then inside a list
    for others in (too_short, [indexed, too_short]):
        with pytest.raises(ValueError, match=rgx):
            caller.str.cat(others, join=join)
|  | ||||
|  | ||||
def test_str_cat_all_na(index_or_series, index_or_series2):
    """
    GH 24044: ``str.cat`` works when the target, or the caller, consists
    entirely of missing values.
    """
    box = index_or_series
    other = index_or_series2
    all_na = [np.nan] * 4

    caller = Index(["a", "b", "c", "d"])
    if box != Index:
        caller = Series(caller, index=caller)
    target = other(all_na, dtype=object)
    # a Series target gets the caller as its index so that the two align
    if other != Index:
        target = Series(target, index=caller)

    # all-NA target
    if box == Series:
        expected = Series(all_na, index=caller.index, dtype=caller.dtype)
    else:  # box == Index
        # TODO: String option, this should return string dtype
        expected = Index(all_na, dtype=object)
    result = caller.str.cat(target, join="left")
    tm.assert_equal(result, expected)

    # all-NA caller (only for Series)
    if other == Series:
        expected = Series(all_na, dtype=object, index=target.index)
        result = target.str.cat(caller, join="left")
        tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_str_cat_special_cases():
    """
    Cover two less common ``str.cat`` inputs: an iterator yielding a Series
    and an ndarray, and a right join against Series with disjoint indexes.
    """
    caller = Series(["a", "b", "c", "d"])
    indexed = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])

    # iterator of elements with different types
    mixed_iter = iter([indexed, caller.values])
    result = caller.str.cat(mixed_iter, join="outer", na_rep="-")
    expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"])
    tm.assert_series_equal(result, expected)

    # right-align with different indexes in others
    pieces = [indexed.loc[[0]], indexed.loc[[3]]]
    result = caller.str.cat(pieces, join="right", na_rep="-")
    expected = Series(["aa-", "d-d"], index=[0, 3])
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_cat_on_filtered_index():
    """
    ``str.cat`` between columns of a row-filtered frame pairs values by
    label, for a single ``others`` Series and for a list of them.
    """
    year_month = MultiIndex.from_product(
        [[2011, 2012], [1, 2, 3]], names=["year", "month"]
    )
    frame = DataFrame(index=year_month).reset_index()
    # filtering leaves gaps in the default integer index
    filtered = frame[frame.month > 1]

    years = filtered.year.astype("str")
    months = filtered.month.astype("str")

    joined_once = years.str.cat(months, sep=" ")
    assert joined_once.loc[1] == "2011 2"

    joined_twice = years.str.cat([months, months], sep=" ")
    assert joined_twice.loc[1] == "2011 2 2"
|  | ||||
|  | ||||
@pytest.mark.parametrize("klass", [tuple, list, np.array, Series, Index])
def test_cat_different_classes(klass):
    """
    ``str.cat`` gives the same Series whichever container holds ``others``.

    See https://github.com/pandas-dev/pandas/issues/33425
    """
    caller = Series(["a", "b", "c"])
    others = klass(["x", "y", "z"])
    result = caller.str.cat(others)
    tm.assert_series_equal(result, Series(["ax", "by", "cz"]))
|  | ||||
|  | ||||
def test_cat_on_series_dot_str():
    """
    GH 28277: passing the ``.str`` accessor itself as ``others`` raises a
    TypeError listing the accepted input types.
    """
    ser = Series(["AbC", "de", "FGHI", "j", "kLLLm"])

    # escaped because the message contains regex metacharacters
    expected_message = re.escape(
        "others must be Series, Index, DataFrame, np.ndarray "
        "or list-like (either containing only strings or "
        "containing only objects of type Series/Index/"
        "np.ndarray[1-dim])"
    )
    with pytest.raises(TypeError, match=expected_message):
        ser.str.cat(others=ser.str)
| @ -0,0 +1,724 @@ | ||||
| from datetime import datetime | ||||
| import re | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.core.dtypes.dtypes import ArrowDtype | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     _testing as tm, | ||||
| ) | ||||
|  | ||||
|  | ||||
def test_extract_expand_kwarg_wrong_type_raises(any_string_dtype):
    """``str.extract`` raises ValueError when ``expand`` is None."""
    # TODO: should this raise TypeError
    ser = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
    msg = "expand must be True or False"
    with pytest.raises(ValueError, match=msg):
        ser.str.extract(".*(BAD[_]+).*(BAD)", expand=None)
|  | ||||
|  | ||||
def test_extract_expand_kwarg(any_string_dtype):
    """
    One capture group yields a single-column frame both by default and with
    ``expand=True``; two groups yield a frame even with ``expand=False``.
    """
    ser = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)

    one_group = DataFrame(["BAD__", np.nan, np.nan], dtype=any_string_dtype)
    # omitting ``expand`` and passing expand=True must agree
    for kwargs in ({}, {"expand": True}):
        result = ser.str.extract(".*(BAD[_]+).*", **kwargs)
        tm.assert_frame_equal(result, one_group)

    two_groups = DataFrame(
        [["BAD__", "BAD"], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
    )
    result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
    tm.assert_frame_equal(result, two_groups)
|  | ||||
|  | ||||
def test_extract_expand_False_mixed_object():
    """
    ``str.extract(expand=False)`` on a Series mixing strings with booleans,
    datetimes, numbers and missing values: only the matching strings yield
    groups, as a frame for two groups and as a Series for one.
    """
    mixed = Series(
        ["aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0]
    )
    matched = ["BAD_", "BAD"]
    empty_row = [np.nan, np.nan]

    # two groups
    result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
    expected = DataFrame(
        [matched, empty_row, matched] + [empty_row] * 6, dtype=object
    )
    tm.assert_frame_equal(result, expected)

    # single group
    result = mixed.str.extract(".*(BAD[_]+).*BAD", expand=False)
    expected = Series(
        ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan],
        dtype=object,
    )
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_extract_expand_index_raises():
    """
    GH9980: with ``expand=False`` an Index only supports a single regex
    group, since several groups would expand to a frame.
    """
    labels = Index(["A1", "A2", "A3", "A4", "B5"])
    two_group_pattern = "([AB])([123])"
    msg = "only one regex group is supported with Index"
    with pytest.raises(ValueError, match=msg):
        labels.str.extract(two_group_pattern, expand=False)
|  | ||||
|  | ||||
def test_extract_expand_no_capture_groups_raises(index_or_series, any_string_dtype):
    """``str.extract(expand=False)`` raises when the pattern captures nothing."""
    s_or_idx = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
    msg = "pattern contains no capture groups"

    patterns_without_captures = [
        "[ABC][123]",  # no groups
        "(?:[AB]).*",  # only non-capturing groups
    ]
    for pattern in patterns_without_captures:
        with pytest.raises(ValueError, match=msg):
            s_or_idx.str.extract(pattern, expand=False)
|  | ||||
|  | ||||
def test_extract_expand_single_capture_group(index_or_series, any_string_dtype):
    """A single named group names the Series/Index returned with ``expand=False``."""
    box = index_or_series
    s_or_idx = box(["A1", "A2"], dtype=any_string_dtype)
    result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=False)

    expected = box(["A", "A"], name="uno", dtype=any_string_dtype)
    # pick the comparison helper matching the container under test
    assert_same = tm.assert_series_equal if box == Series else tm.assert_index_equal
    assert_same(result, expected)
|  | ||||
|  | ||||
def test_extract_expand_capture_groups(any_string_dtype):
    """
    ``str.extract(expand=False)`` on a Series: one capture group returns a
    Series, two or more a DataFrame; named groups supply the name/columns,
    and non-capturing groups do not count as groups.
    """
    dtype = any_string_dtype
    ser = Series(["A1", "B2", "C3"], dtype=dtype)

    # expected data shared by several of the checks below
    letters = ["A", "B", np.nan]
    letter_digit = [["A", "1"], ["B", "2"], [np.nan, np.nan]]

    # one group, no matches
    result = ser.str.extract("(_)", expand=False)
    expected = Series([np.nan, np.nan, np.nan], dtype=dtype)
    tm.assert_series_equal(result, expected)

    # two groups, no matches
    result = ser.str.extract("(_)(_)", expand=False)
    expected = DataFrame(
        [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=dtype
    )
    tm.assert_frame_equal(result, expected)

    # one group, some matches
    result = ser.str.extract("([AB])[123]", expand=False)
    expected = Series(letters, dtype=dtype)
    tm.assert_series_equal(result, expected)

    # two groups, some matches
    result = ser.str.extract("([AB])([123])", expand=False)
    expected = DataFrame(letter_digit, dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # one named group
    result = ser.str.extract("(?P<letter>[AB])", expand=False)
    expected = Series(letters, name="letter", dtype=dtype)
    tm.assert_series_equal(result, expected)

    # two named groups
    result = ser.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False)
    expected = DataFrame(letter_digit, columns=["letter", "number"], dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # mix named and unnamed groups
    result = ser.str.extract("([AB])(?P<number>[123])", expand=False)
    expected = DataFrame(letter_digit, columns=[0, "number"], dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # one normal group, one non-capturing group
    result = ser.str.extract("([AB])(?:[123])", expand=False)
    expected = Series(letters, dtype=dtype)
    tm.assert_series_equal(result, expected)

    # two normal groups, one non-capturing group
    ser = Series(["A11", "B22", "C33"], dtype=dtype)
    result = ser.str.extract("([AB])([123])(?:[123])", expand=False)
    expected = DataFrame(letter_digit, dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # one optional group followed by one normal group
    ser = Series(["A1", "B2", "3"], dtype=dtype)
    result = ser.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=False)
    expected = DataFrame(
        [["A", "1"], ["B", "2"], [np.nan, "3"]],
        columns=["letter", "number"],
        dtype=dtype,
    )
    tm.assert_frame_equal(result, expected)

    # one normal group followed by one optional group
    ser = Series(["A1", "B2", "C"], dtype=dtype)
    result = ser.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=False)
    expected = DataFrame(
        [["A", "1"], ["B", "2"], ["C", np.nan]],
        columns=["letter", "number"],
        dtype=dtype,
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_extract_expand_capture_groups_index(index, any_string_dtype):
    """
    The caller's index is carried over to the result of
    ``str.extract(expand=False)``.

    See https://github.com/pandas-dev/pandas/issues/6348
    """
    data = ["A1", "B2", "C"]
    needed = len(data)

    if len(index) == 0:
        pytest.skip("Test requires len(index) > 0")
    # grow short indexes by repetition until they cover the data, then trim
    while len(index) < needed:
        index = index.repeat(2)
    index = index[:needed]

    ser = Series(data, index=index, dtype=any_string_dtype)

    # single group -> Series
    result = ser.str.extract(r"(\d)", expand=False)
    expected = Series(["1", "2", np.nan], index=index, dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)

    # two named groups -> DataFrame
    result = ser.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=False)
    expected = DataFrame(
        [["A", "1"], ["B", "2"], ["C", np.nan]],
        columns=["letter", "number"],
        index=index,
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_extract_single_series_name_is_preserved(any_string_dtype):
    """With one named group and ``expand=False`` the result is named after the group."""
    ser = Series(["a3", "b3", "c2"], name="bob", dtype=any_string_dtype)
    expected = Series(["a", "b", "c"], name="sue", dtype=any_string_dtype)
    result = ser.str.extract(r"(?P<sue>[a-z])", expand=False)
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_extract_expand_True(any_string_dtype):
    """Two capture groups with ``expand=True`` give one frame column per group."""
    # Contains tests like those in test_match and some others.
    ser = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
    no_match = [np.nan, np.nan]
    expected = DataFrame(
        [["BAD__", "BAD"], no_match, no_match], dtype=any_string_dtype
    )

    result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_extract_expand_True_mixed_object():
    """
    ``str.extract(expand=True)`` on a Series mixing strings with booleans,
    datetimes, numbers and missing values: only matching strings give groups.
    """
    matched = ["BAD_", "BAD"]
    empty_row = [np.nan, np.nan]
    values = [
        "aBAD_BAD",
        np.nan,
        "BAD_b_BAD",
        True,
        datetime.today(),
        "foo",
        None,
        1,
        2.0,
    ]
    mixed = Series(values)

    result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
    # rows 0 and 2 are the only strings matching the pattern
    expected = DataFrame(
        [matched, empty_row, matched] + [empty_row] * 6, dtype=object
    )
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
def test_extract_expand_True_single_capture_group_raises(
    index_or_series, any_string_dtype
):
    """
    ``str.extract(expand=True)`` raises for patterns without capture groups,
    for both Series and Index.
    """
    s_or_idx = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
    msg = "pattern contains no capture groups"

    patterns_without_captures = [
        "[ABC][123]",  # no groups
        "(?:[AB]).*",  # only non-capturing groups
    ]
    for pattern in patterns_without_captures:
        with pytest.raises(ValueError, match=msg):
            s_or_idx.str.extract(pattern, expand=True)
|  | ||||
|  | ||||
def test_extract_expand_True_single_capture_group(index_or_series, any_string_dtype):
    """A single named group with ``expand=True`` gives a frame whose column is the group name."""
    s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
    expected = DataFrame({"uno": ["A", "A"]}, dtype=any_string_dtype)
    result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("name", [None, "series_name"])
def test_extract_series(name, any_string_dtype):
    """
    ``str.extract(expand=True)`` returns the same frames whether or not the
    Series has a name; columns come from the group names, or positions for
    unnamed groups.
    """
    dtype = any_string_dtype
    ser = Series(["A1", "B2", "C3"], name=name, dtype=dtype)

    # expected data shared by several of the checks below
    letters = ["A", "B", np.nan]
    letter_digit = [["A", "1"], ["B", "2"], [np.nan, np.nan]]

    # one group, no matches
    result = ser.str.extract("(_)", expand=True)
    expected = DataFrame([np.nan, np.nan, np.nan], dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # two groups, no matches
    result = ser.str.extract("(_)(_)", expand=True)
    expected = DataFrame(
        [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=dtype
    )
    tm.assert_frame_equal(result, expected)

    # one group, some matches
    result = ser.str.extract("([AB])[123]", expand=True)
    expected = DataFrame(letters, dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # two groups, some matches
    result = ser.str.extract("([AB])([123])", expand=True)
    expected = DataFrame(letter_digit, dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # one named group
    result = ser.str.extract("(?P<letter>[AB])", expand=True)
    expected = DataFrame({"letter": ["A", "B", np.nan]}, dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # two named groups
    result = ser.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=True)
    expected = DataFrame(letter_digit, columns=["letter", "number"], dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # mix named and unnamed groups
    result = ser.str.extract("([AB])(?P<number>[123])", expand=True)
    expected = DataFrame(letter_digit, columns=[0, "number"], dtype=dtype)
    tm.assert_frame_equal(result, expected)

    # one normal group, one non-capturing group
    result = ser.str.extract("([AB])(?:[123])", expand=True)
    expected = DataFrame(letters, dtype=dtype)
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_extract_optional_groups(any_string_dtype): | ||||
|     # two normal groups, one non-capturing group | ||||
|     s = Series(["A11", "B22", "C33"], dtype=any_string_dtype) | ||||
|     result = s.str.extract("([AB])([123])(?:[123])", expand=True) | ||||
|     expected = DataFrame( | ||||
|         [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # one optional group followed by one normal group | ||||
|     s = Series(["A1", "B2", "3"], dtype=any_string_dtype) | ||||
|     result = s.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=True) | ||||
|     expected = DataFrame( | ||||
|         [["A", "1"], ["B", "2"], [np.nan, "3"]], | ||||
|         columns=["letter", "number"], | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # one normal group followed by one optional group | ||||
|     s = Series(["A1", "B2", "C"], dtype=any_string_dtype) | ||||
|     result = s.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=True) | ||||
|     expected = DataFrame( | ||||
|         [["A", "1"], ["B", "2"], ["C", np.nan]], | ||||
|         columns=["letter", "number"], | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_extract_dataframe_capture_groups_index(index, any_string_dtype): | ||||
|     # GH6348 | ||||
|     # not passing index to the extractor | ||||
|  | ||||
|     data = ["A1", "B2", "C"] | ||||
|  | ||||
|     if len(index) < len(data): | ||||
|         pytest.skip(f"Index needs more than {len(data)} values") | ||||
|  | ||||
|     index = index[: len(data)] | ||||
|     s = Series(data, index=index, dtype=any_string_dtype) | ||||
|  | ||||
|     result = s.str.extract(r"(\d)", expand=True) | ||||
|     expected = DataFrame(["1", "2", np.nan], index=index, dtype=any_string_dtype) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     result = s.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=True) | ||||
|     expected = DataFrame( | ||||
|         [["A", "1"], ["B", "2"], ["C", np.nan]], | ||||
|         columns=["letter", "number"], | ||||
|         index=index, | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_extract_single_group_returns_frame(any_string_dtype): | ||||
|     # GH11386 extract should always return DataFrame, even when | ||||
|     # there is only one group. Prior to v0.18.0, extract returned | ||||
|     # Series when there was only one group in the regex. | ||||
|     s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype) | ||||
|     result = s.str.extract(r"(?P<letter>[a-z])", expand=True) | ||||
|     expected = DataFrame({"letter": ["a", "b", "c"]}, dtype=any_string_dtype) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_extractall(any_string_dtype): | ||||
|     data = [ | ||||
|         "dave@google.com", | ||||
|         "tdhock5@gmail.com", | ||||
|         "maudelaperriere@gmail.com", | ||||
|         "rob@gmail.com some text steve@gmail.com", | ||||
|         "a@b.com some text c@d.com and e@f.com", | ||||
|         np.nan, | ||||
|         "", | ||||
|     ] | ||||
|     expected_tuples = [ | ||||
|         ("dave", "google", "com"), | ||||
|         ("tdhock5", "gmail", "com"), | ||||
|         ("maudelaperriere", "gmail", "com"), | ||||
|         ("rob", "gmail", "com"), | ||||
|         ("steve", "gmail", "com"), | ||||
|         ("a", "b", "com"), | ||||
|         ("c", "d", "com"), | ||||
|         ("e", "f", "com"), | ||||
|     ] | ||||
|     pat = r""" | ||||
|     (?P<user>[a-z0-9]+) | ||||
|     @ | ||||
|     (?P<domain>[a-z]+) | ||||
|     \. | ||||
|     (?P<tld>[a-z]{2,4}) | ||||
|     """ | ||||
|     expected_columns = ["user", "domain", "tld"] | ||||
|     s = Series(data, dtype=any_string_dtype) | ||||
|     # extractall should return a DataFrame with one row for each match, indexed by the | ||||
|     # subject from which the match came. | ||||
|     expected_index = MultiIndex.from_tuples( | ||||
|         [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)], | ||||
|         names=(None, "match"), | ||||
|     ) | ||||
|     expected = DataFrame( | ||||
|         expected_tuples, expected_index, expected_columns, dtype=any_string_dtype | ||||
|     ) | ||||
|     result = s.str.extractall(pat, flags=re.VERBOSE) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # The index of the input Series should be used to construct the index of the output | ||||
|     # DataFrame: | ||||
|     mi = MultiIndex.from_tuples( | ||||
|         [ | ||||
|             ("single", "Dave"), | ||||
|             ("single", "Toby"), | ||||
|             ("single", "Maude"), | ||||
|             ("multiple", "robAndSteve"), | ||||
|             ("multiple", "abcdef"), | ||||
|             ("none", "missing"), | ||||
|             ("none", "empty"), | ||||
|         ] | ||||
|     ) | ||||
|     s = Series(data, index=mi, dtype=any_string_dtype) | ||||
|     expected_index = MultiIndex.from_tuples( | ||||
|         [ | ||||
|             ("single", "Dave", 0), | ||||
|             ("single", "Toby", 0), | ||||
|             ("single", "Maude", 0), | ||||
|             ("multiple", "robAndSteve", 0), | ||||
|             ("multiple", "robAndSteve", 1), | ||||
|             ("multiple", "abcdef", 0), | ||||
|             ("multiple", "abcdef", 1), | ||||
|             ("multiple", "abcdef", 2), | ||||
|         ], | ||||
|         names=(None, None, "match"), | ||||
|     ) | ||||
|     expected = DataFrame( | ||||
|         expected_tuples, expected_index, expected_columns, dtype=any_string_dtype | ||||
|     ) | ||||
|     result = s.str.extractall(pat, flags=re.VERBOSE) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # MultiIndexed subject with names. | ||||
|     s = Series(data, index=mi, dtype=any_string_dtype) | ||||
|     s.index.names = ("matches", "description") | ||||
|     expected_index.names = ("matches", "description", "match") | ||||
|     expected = DataFrame( | ||||
|         expected_tuples, expected_index, expected_columns, dtype=any_string_dtype | ||||
|     ) | ||||
|     result = s.str.extractall(pat, flags=re.VERBOSE) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "pat,expected_names",
    [
        # optional groups.
        ("(?P<letter>[AB])?(?P<number>[123])", ["letter", "number"]),
        # only one of two groups has a name.
        ("([AB])?(?P<number>[123])", [0, "number"]),
    ],
)
def test_extractall_column_names(pat, expected_names, any_string_dtype):
    """Named groups become column labels; unnamed groups get their position."""
    ser = Series(["", "A1", "32"], dtype=any_string_dtype)
    match_index = MultiIndex.from_tuples(
        [(1, 0), (2, 0), (2, 1)], names=(None, "match")
    )
    expected = DataFrame(
        [("A", "1"), (np.nan, "3"), (np.nan, "2")],
        index=match_index,
        columns=expected_names,
        dtype=any_string_dtype,
    )

    result = ser.str.extractall(pat)

    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_extractall_single_group(any_string_dtype): | ||||
|     s = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype) | ||||
|     expected_index = MultiIndex.from_tuples( | ||||
|         [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") | ||||
|     ) | ||||
|  | ||||
|     # extractall(one named group) returns DataFrame with one named column. | ||||
|     result = s.str.extractall(r"(?P<letter>[a-z])") | ||||
|     expected = DataFrame( | ||||
|         {"letter": ["a", "b", "d", "c"]}, index=expected_index, dtype=any_string_dtype | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # extractall(one un-named group) returns DataFrame with one un-named column. | ||||
|     result = s.str.extractall(r"([a-z])") | ||||
|     expected = DataFrame( | ||||
|         ["a", "b", "d", "c"], index=expected_index, dtype=any_string_dtype | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_extractall_single_group_with_quantifier(any_string_dtype): | ||||
|     # GH#13382 | ||||
|     # extractall(one un-named group with quantifier) returns DataFrame with one un-named | ||||
|     # column. | ||||
|     s = Series(["ab3", "abc3", "d4cd2"], name="series_name", dtype=any_string_dtype) | ||||
|     result = s.str.extractall(r"([a-z]+)") | ||||
|     expected = DataFrame( | ||||
|         ["ab", "abc", "d", "cd"], | ||||
|         index=MultiIndex.from_tuples( | ||||
|             [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") | ||||
|         ), | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "data, names",
    [
        ([], (None,)),
        ([], ("i1",)),
        ([], (None, "i2")),
        ([], ("i1", "i2")),
        (["a3", "b3", "d4c2"], (None,)),
        # single named level; previously a duplicate of the ("i1", "i2") case
        (["a3", "b3", "d4c2"], ("i1",)),
        (["a3", "b3", "d4c2"], (None, "i2")),
        (["a3", "b3", "d4c2"], ("i1", "i2")),
    ],
)
def test_extractall_no_matches(data, names, any_string_dtype):
    """GH19075: ``extractall`` with no matches returns a valid, empty MultiIndex.

    Parameters
    ----------
    data : list of str
        Subject strings; none of them contains the letter "z" the patterns
        look for.
    names : tuple
        Names of the subject's index levels. One name builds a flat Index,
        several build a MultiIndex.
    any_string_dtype : str or dtype
        Dtype used for both the subject and the expected frames.
    """
    n = len(data)
    if len(names) == 1:
        index = Index(range(n), name=names[0])
    else:
        # one entry per index level, so the tuples always match ``names``
        tuples = [(i,) * len(names) for i in range(n)]
        index = MultiIndex.from_tuples(tuples, names=names)
    s = Series(data, name="series_name", index=index, dtype=any_string_dtype)
    # the result carries the subject's levels plus the trailing "match" level
    expected_index = MultiIndex.from_tuples([], names=(names + ("match",)))

    cases = [
        # one un-named group.
        ("(z)", [0]),
        # two un-named groups.
        ("(z)(z)", [0, 1]),
        # one named group.
        ("(?P<first>z)", ["first"]),
        # two named groups.
        ("(?P<first>z)(?P<second>z)", ["first", "second"]),
        # one named, one un-named.
        ("(z)(?P<second>z)", [0, "second"]),
    ]
    for pat, columns in cases:
        result = s.str.extractall(pat)
        expected = DataFrame(
            columns=columns, index=expected_index, dtype=any_string_dtype
        )
        tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_extractall_stringindex(any_string_dtype): | ||||
|     s = Series(["a1a2", "b1", "c1"], name="xxx", dtype=any_string_dtype) | ||||
|     result = s.str.extractall(r"[ab](?P<digit>\d)") | ||||
|     expected = DataFrame( | ||||
|         {"digit": ["1", "2", "1"]}, | ||||
|         index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], names=[None, "match"]), | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # index should return the same result as the default index without name thus | ||||
|     # index.name doesn't affect to the result | ||||
|     if any_string_dtype == "object": | ||||
|         for idx in [ | ||||
|             Index(["a1a2", "b1", "c1"], dtype=object), | ||||
|             Index(["a1a2", "b1", "c1"], name="xxx", dtype=object), | ||||
|         ]: | ||||
|             result = idx.str.extractall(r"[ab](?P<digit>\d)") | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     s = Series( | ||||
|         ["a1a2", "b1", "c1"], | ||||
|         name="s_name", | ||||
|         index=Index(["XX", "yy", "zz"], name="idx_name"), | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     result = s.str.extractall(r"[ab](?P<digit>\d)") | ||||
|     expected = DataFrame( | ||||
|         {"digit": ["1", "2", "1"]}, | ||||
|         index=MultiIndex.from_tuples( | ||||
|             [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"] | ||||
|         ), | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_extractall_no_capture_groups_raises(any_string_dtype):
    """A pattern without capture groups is rejected with ``ValueError``.

    extractall returns a DataFrame with one column per capture group, so a
    regex that has none does not make sense.
    """
    ser = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype)
    pattern_without_groups = r"[a-z]"

    with pytest.raises(ValueError, match="no capture groups"):
        ser.str.extractall(pattern_without_groups)
|  | ||||
|  | ||||
def test_extract_index_one_two_groups():
    """``Index.str.extract(expand=True)`` returns a DataFrame for any group count.

    Prior to v0.18.0, index.str.extract(regex with one group) returned Index,
    and with more than one group extract raised an error (GH9980).
    """
    ser = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name")
    idx = ser.index

    # one un-named group
    result = idx.str.extract(r"([A-Z])", expand=True)
    expected = DataFrame(["A", "B", "D"])
    tm.assert_frame_equal(result, expected)

    # two named groups
    result = idx.str.extract(r"(?P<letter>[A-Z])(?P<digit>[0-9])", expand=True)
    rows = [("A", "3"), ("B", "3"), ("D", "4")]
    expected = DataFrame(rows, columns=["letter", "digit"])
    tm.assert_frame_equal(result, expected)
|  | ||||
|  | ||||
| def test_extractall_same_as_extract(any_string_dtype): | ||||
|     s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype) | ||||
|  | ||||
|     pattern_two_noname = r"([a-z])([0-9])" | ||||
|     extract_two_noname = s.str.extract(pattern_two_noname, expand=True) | ||||
|     has_multi_index = s.str.extractall(pattern_two_noname) | ||||
|     no_multi_index = has_multi_index.xs(0, level="match") | ||||
|     tm.assert_frame_equal(extract_two_noname, no_multi_index) | ||||
|  | ||||
|     pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])" | ||||
|     extract_two_named = s.str.extract(pattern_two_named, expand=True) | ||||
|     has_multi_index = s.str.extractall(pattern_two_named) | ||||
|     no_multi_index = has_multi_index.xs(0, level="match") | ||||
|     tm.assert_frame_equal(extract_two_named, no_multi_index) | ||||
|  | ||||
|     pattern_one_named = r"(?P<group_name>[a-z])" | ||||
|     extract_one_named = s.str.extract(pattern_one_named, expand=True) | ||||
|     has_multi_index = s.str.extractall(pattern_one_named) | ||||
|     no_multi_index = has_multi_index.xs(0, level="match") | ||||
|     tm.assert_frame_equal(extract_one_named, no_multi_index) | ||||
|  | ||||
|     pattern_one_noname = r"([a-z])" | ||||
|     extract_one_noname = s.str.extract(pattern_one_noname, expand=True) | ||||
|     has_multi_index = s.str.extractall(pattern_one_noname) | ||||
|     no_multi_index = has_multi_index.xs(0, level="match") | ||||
|     tm.assert_frame_equal(extract_one_noname, no_multi_index) | ||||
|  | ||||
|  | ||||
| def test_extractall_same_as_extract_subject_index(any_string_dtype): | ||||
|     # same as above tests, but s has an MultiIndex. | ||||
|     mi = MultiIndex.from_tuples( | ||||
|         [("A", "first"), ("B", "second"), ("C", "third")], | ||||
|         names=("capital", "ordinal"), | ||||
|     ) | ||||
|     s = Series(["a3", "b3", "c2"], index=mi, name="series_name", dtype=any_string_dtype) | ||||
|  | ||||
|     pattern_two_noname = r"([a-z])([0-9])" | ||||
|     extract_two_noname = s.str.extract(pattern_two_noname, expand=True) | ||||
|     has_match_index = s.str.extractall(pattern_two_noname) | ||||
|     no_match_index = has_match_index.xs(0, level="match") | ||||
|     tm.assert_frame_equal(extract_two_noname, no_match_index) | ||||
|  | ||||
|     pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])" | ||||
|     extract_two_named = s.str.extract(pattern_two_named, expand=True) | ||||
|     has_match_index = s.str.extractall(pattern_two_named) | ||||
|     no_match_index = has_match_index.xs(0, level="match") | ||||
|     tm.assert_frame_equal(extract_two_named, no_match_index) | ||||
|  | ||||
|     pattern_one_named = r"(?P<group_name>[a-z])" | ||||
|     extract_one_named = s.str.extract(pattern_one_named, expand=True) | ||||
|     has_match_index = s.str.extractall(pattern_one_named) | ||||
|     no_match_index = has_match_index.xs(0, level="match") | ||||
|     tm.assert_frame_equal(extract_one_named, no_match_index) | ||||
|  | ||||
|     pattern_one_noname = r"([a-z])" | ||||
|     extract_one_noname = s.str.extract(pattern_one_noname, expand=True) | ||||
|     has_match_index = s.str.extractall(pattern_one_noname) | ||||
|     no_match_index = has_match_index.xs(0, level="match") | ||||
|     tm.assert_frame_equal(extract_one_noname, no_match_index) | ||||
|  | ||||
|  | ||||
def test_extractall_preserves_dtype():
    """The subject's pyarrow string dtype is kept on the ``extractall`` column.

    Skipped when pyarrow cannot be imported.
    """
    pa = pytest.importorskip("pyarrow")

    ser = Series(["abc", "ab"], dtype=ArrowDtype(pa.string()))
    result = ser.str.extractall("(ab)")

    assert result.dtypes[0] == "string[pyarrow]"
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -0,0 +1,53 @@ | ||||
| import numpy as np | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     _testing as tm, | ||||
| ) | ||||
|  | ||||
|  | ||||
| def test_get_dummies(any_string_dtype): | ||||
|     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) | ||||
|     result = s.str.get_dummies("|") | ||||
|     expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc")) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     s = Series(["a;b", "a", 7], dtype=any_string_dtype) | ||||
|     result = s.str.get_dummies(";") | ||||
|     expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab")) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_get_dummies_index():
    """GH9980, GH8028: ``Index.str.get_dummies`` returns a MultiIndex."""
    idx = Index(["a|b", "a|c", "b|c"])
    indicator_rows = [(1, 1, 0), (1, 0, 1), (0, 1, 1)]
    expected = MultiIndex.from_tuples(indicator_rows, names=("a", "b", "c"))

    result = idx.str.get_dummies("|")

    tm.assert_index_equal(result, expected)
|  | ||||
|  | ||||
| def test_get_dummies_with_name_dummy(any_string_dtype): | ||||
|     # GH 12180 | ||||
|     # Dummies named 'name' should work as expected | ||||
|     s = Series(["a", "b,name", "b"], dtype=any_string_dtype) | ||||
|     result = s.str.get_dummies(",") | ||||
|     expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_get_dummies_with_name_dummy_index():
    """GH 12180: a dummy called 'name' also works when the subject is an Index."""
    idx = Index(["a|b", "name|c", "b|name"])
    indicator_rows = [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)]
    expected = MultiIndex.from_tuples(indicator_rows, names=("a", "b", "c", "name"))

    result = idx.str.get_dummies("|")

    tm.assert_index_equal(result, expected)
| @ -0,0 +1,734 @@ | ||||
| from datetime import datetime | ||||
| import re | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
|     _testing as tm, | ||||
| ) | ||||
| from pandas.tests.strings import ( | ||||
|     _convert_na_value, | ||||
|     is_object_or_nan_string_dtype, | ||||
| ) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split(any_string_dtype, method):
    """Splitting on one character gives lists and keeps the missing entry."""
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
    splitter = getattr(ser.str, method)

    expected = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    # use the missing-value marker that matches the subject's dtype
    expected = _convert_na_value(ser, expected)

    tm.assert_series_equal(splitter("_"), expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_more_than_one_char(any_string_dtype, method):
    """A separator longer than one character splits the same way."""
    ser = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
    splitter = getattr(ser.str, method)

    expected = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    expected = _convert_na_value(ser, expected)

    # default ``expand``
    tm.assert_series_equal(splitter("__"), expected)
    # explicit ``expand=False`` gives the same Series
    tm.assert_series_equal(splitter("__", expand=False), expected)
|  | ||||
|  | ||||
def test_split_more_regex_split(any_string_dtype):
    """A character-class pattern is treated as a regex by ``split``."""
    ser = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)

    expected = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    expected = _convert_na_value(ser, expected)

    tm.assert_series_equal(ser.str.split("[,_]"), expected)
|  | ||||
|  | ||||
| def test_split_regex(any_string_dtype): | ||||
|     # GH 43563 | ||||
|     # explicit regex = True split | ||||
|     values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype) | ||||
|     result = values.str.split(r"\.jpg", regex=True) | ||||
|     exp = Series([["xxxjpgzzz", ""]]) | ||||
|     tm.assert_series_equal(result, exp) | ||||
|  | ||||
|  | ||||
def test_split_regex_explicit(any_string_dtype):
    """How ``split`` interprets ``pat`` for the different ``regex`` settings."""
    compiled = re.compile(r".jpg")
    ser = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)

    def check(expected_parts, *args, **kwargs):
        result = ser.str.split(*args, **kwargs)
        tm.assert_series_equal(result, Series([expected_parts]))

    # compiled regex
    check(["xx", "zzz", ""], compiled)

    # explicit regex = False split
    check(["xxxjpgzzz.jpg"], r"\.jpg", regex=False)

    # non explicit regex split, pattern length == 1
    check(["xxxjpgzzz", "jpg"], r".")

    # non explicit regex split, pattern length != 1
    check(["xx", "zzz", ""], r".jpg")

    # regex=False with pattern compiled regex raises error
    msg = "Cannot use a compiled regex as replacement pattern with regex=False"
    with pytest.raises(ValueError, match=msg):
        ser.str.split(compiled, regex=False)
|  | ||||
|  | ||||
@pytest.mark.parametrize("expand", [None, False])
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_object_mixed(expand, method):
    """In a mixed object Series only the strings are split.

    The other entries come back as NaN, except ``None`` which stays ``None``.
    """
    nan = np.nan
    mixed = Series(["a_b_c", nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
    expected = Series(
        [["a", "b", "c"], nan, ["d", "e", "f"], nan, nan, None, nan, nan]
    )

    result = getattr(mixed.str, method)("_", expand=expand)

    assert isinstance(result, Series)
    tm.assert_almost_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("method", ["split", "rsplit"])
@pytest.mark.parametrize("n", [None, 0])
def test_split_n(any_string_dtype, method, n):
    """``n=None`` and ``n=0`` both split on every separator."""
    ser = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype)
    splitter = getattr(ser.str, method)

    expected = Series([["a", "b"], pd.NA, ["b", "c"]])
    result = splitter(" ", n=n)
    expected = _convert_na_value(ser, expected)

    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_rsplit(any_string_dtype):
    """``rsplit`` does not support regex, so the pattern is taken literally."""
    ser = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)

    # no subject contains the literal "[,_]", so nothing is split
    expected = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
    expected = _convert_na_value(ser, expected)

    tm.assert_series_equal(ser.str.rsplit("[,_]"), expected)
|  | ||||
|  | ||||
def test_rsplit_max_number(any_string_dtype):
    """With ``n=1`` the single split is taken from the right-hand end."""
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)

    expected = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
    expected = _convert_na_value(ser, expected)

    tm.assert_series_equal(ser.str.rsplit("_", n=1), expected)
|  | ||||
|  | ||||
| def test_split_blank_string(any_string_dtype): | ||||
|     # expand blank split GH 20067 | ||||
|     values = Series([""], name="test", dtype=any_string_dtype) | ||||
|     result = values.str.split(expand=True) | ||||
|     exp = DataFrame([[]], dtype=any_string_dtype)  # NOTE: this is NOT an empty df | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|  | ||||
| def test_split_blank_string_with_non_empty(any_string_dtype): | ||||
|     values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype) | ||||
|     result = values.str.split(expand=True) | ||||
|     exp = DataFrame( | ||||
|         [ | ||||
|             ["a", "b", "c"], | ||||
|             ["a", "b", None], | ||||
|             [None, None, None], | ||||
|             [None, None, None], | ||||
|         ], | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_noargs(any_string_dtype, method):
    """#1859: calling without arguments splits on runs of whitespace."""
    ser = Series(["Wes McKinney", "Travis  Oliphant"], dtype=any_string_dtype)

    result = getattr(ser.str, method)()

    # the second subject has two spaces between the words
    assert result[1] == ["Travis", "Oliphant"]
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "data, pat",
    [
        (["bd asdf jfg", "kjasdflqw asdfnfk"], None),
        (["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"),
        (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"),
    ],
)
@pytest.mark.parametrize("n", [-1, 0])
def test_split_maxsplit(data, pat, any_string_dtype, n):
    """``n=-1`` and ``n=0`` give the same result as not passing ``n``.

    re.split uses 0 and str.split uses -1 for an unlimited number of splits.
    """
    ser = Series(data, dtype=any_string_dtype)

    with_n = ser.str.split(pat=pat, n=n)
    without_n = ser.str.split(pat=pat)

    tm.assert_series_equal(with_n, without_n)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "data, pat, expected",
    [
        (
            ["split once", "split once too!"],
            None,
            Series({0: ["split", "once"], 1: ["split", "once too!"]}),
        ),
        (
            ["split_once", "split_once_too!"],
            "_",
            Series({0: ["split", "once"], 1: ["split", "once_too!"]}),
        ),
    ],
)
def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype):
    """``n=1`` splits once, with or without an explicit separator."""
    ser = Series(data, dtype=any_string_dtype)

    split_once = ser.str.split(pat=pat, n=1)

    tm.assert_series_equal(expected, split_once, check_index_type=False)
|  | ||||
|  | ||||
| def test_split_to_dataframe_no_splits(any_string_dtype): | ||||
|     s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) | ||||
|     result = s.str.split("_", expand=True) | ||||
|     exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)}) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|  | ||||
| def test_split_to_dataframe(any_string_dtype): | ||||
|     s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype) | ||||
|     result = s.str.split("_", expand=True) | ||||
|     exp = DataFrame( | ||||
|         {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}, | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|  | ||||
| def test_split_to_dataframe_unequal_splits(any_string_dtype): | ||||
|     s = Series( | ||||
|         ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype | ||||
|     ) | ||||
|     result = s.str.split("_", expand=True) | ||||
|     exp = DataFrame( | ||||
|         { | ||||
|             0: ["some", "one"], | ||||
|             1: ["unequal", "of"], | ||||
|             2: ["splits", "these"], | ||||
|             3: [None, "things"], | ||||
|             4: [None, "is"], | ||||
|             5: [None, "not"], | ||||
|         }, | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|  | ||||
| def test_split_to_dataframe_with_index(any_string_dtype): | ||||
|     s = Series( | ||||
|         ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype | ||||
|     ) | ||||
|     result = s.str.split("_", expand=True) | ||||
|     exp = DataFrame( | ||||
|         {0: ["some", "with"], 1: ["splits", "index"]}, | ||||
|         index=["preserve", "me"], | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|     with pytest.raises(ValueError, match="expand must be"): | ||||
|         s.str.split("_", expand="not_a_boolean") | ||||
|  | ||||
|  | ||||
def test_split_to_multiindex_expand_no_splits():
    """Expanding an Index with nothing to split returns an equal, flat Index."""
    # https://github.com/pandas-dev/pandas/issues/23677
    idx = Index(["nosplit", "alsonosplit", np.nan])

    result = idx.str.split("_", expand=True)

    assert result.nlevels == 1
    tm.assert_index_equal(result, idx)
|  | ||||
|  | ||||
def test_split_to_multiindex_expand():
    """Expanding an Index gives a MultiIndex; missing entries span all levels."""
    idx = Index(["some_equal_splits", "with_no_nans", np.nan, None])
    rows = [
        ("some", "equal", "splits"),
        ("with", "no", "nans"),
        [np.nan] * 3,
        [None] * 3,
    ]
    expected = MultiIndex.from_tuples(rows)

    result = idx.str.split("_", expand=True)

    assert result.nlevels == 3
    tm.assert_index_equal(result, expected)
|  | ||||
|  | ||||
def test_split_to_multiindex_expand_unequal_splits():
    """Shorter Index entries are padded with NaN up to the widest split."""
    idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None])
    width = 6
    expected = MultiIndex.from_tuples(
        [
            ("some", "unequal", "splits") + (np.nan,) * 3,
            ("one", "of", "these", "things", "is", "not"),
            (np.nan,) * width,
            (None,) * width,
        ]
    )

    result = idx.str.split("_", expand=True)

    assert result.nlevels == width
    tm.assert_index_equal(result, expected)

    # a non-boolean ``expand`` is rejected
    with pytest.raises(ValueError, match="expand must be"):
        idx.str.split("_", expand="not_a_boolean")
|  | ||||
|  | ||||
| def test_rsplit_to_dataframe_expand_no_splits(any_string_dtype): | ||||
|     s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) | ||||
|     result = s.str.rsplit("_", expand=True) | ||||
|     exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|  | ||||
| def test_rsplit_to_dataframe_expand(any_string_dtype): | ||||
|     s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype) | ||||
|     result = s.str.rsplit("_", expand=True) | ||||
|     exp = DataFrame( | ||||
|         {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}, | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|     result = s.str.rsplit("_", expand=True, n=2) | ||||
|     exp = DataFrame( | ||||
|         {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}, | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|     result = s.str.rsplit("_", expand=True, n=1) | ||||
|     exp = DataFrame( | ||||
|         {0: ["some_equal", "with_no"], 1: ["splits", "nans"]}, dtype=any_string_dtype | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|  | ||||
| def test_rsplit_to_dataframe_expand_with_index(any_string_dtype): | ||||
|     s = Series( | ||||
|         ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype | ||||
|     ) | ||||
|     result = s.str.rsplit("_", expand=True) | ||||
|     exp = DataFrame( | ||||
|         {0: ["some", "with"], 1: ["splits", "index"]}, | ||||
|         index=["preserve", "me"], | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, exp) | ||||
|  | ||||
|  | ||||
def test_rsplit_to_multiindex_expand_no_split():
    """``rsplit(expand=True)`` on an Index with nothing to split stays flat."""
    idx = Index(["nosplit", "alsonosplit"])

    result = idx.str.rsplit("_", expand=True)

    assert result.nlevels == 1
    tm.assert_index_equal(result, idx)
|  | ||||
|  | ||||
def test_rsplit_to_multiindex_expand():
    """``rsplit(expand=True)`` on an Index yields one level per piece."""
    idx = Index(["some_equal_splits", "with_no_nans"])
    rows = [("some", "equal", "splits"), ("with", "no", "nans")]
    expected = MultiIndex.from_tuples(rows)

    result = idx.str.rsplit("_", expand=True)

    assert result.nlevels == 3
    tm.assert_index_equal(result, expected)
|  | ||||
|  | ||||
def test_rsplit_to_multiindex_expand_n():
    """With ``n=1`` only the right-most separator splits, giving two levels."""
    idx = Index(["some_equal_splits", "with_no_nans"])
    rows = [("some_equal", "splits"), ("with_no", "nans")]
    expected = MultiIndex.from_tuples(rows)

    result = idx.str.rsplit("_", expand=True, n=1)

    assert result.nlevels == 2
    tm.assert_index_equal(result, expected)
|  | ||||
|  | ||||
def test_split_nan_expand(any_string_dtype):
    """A missing input expands into a full row of missing values."""
    # gh-18450
    ser = Series(["foo,bar,baz", np.nan], dtype=any_string_dtype)
    expected = DataFrame(
        [["foo", "bar", "baz"], [np.nan, np.nan, np.nan]], dtype=any_string_dtype
    )

    result = ser.str.split(",", expand=True)
    tm.assert_frame_equal(result, expected)

    # tm.assert_frame_equal does not differentiate between the missing-value
    # sentinels, so check that the row holds np.nan/pd.NA and not None
    # TODO see GH 18463
    na_row = result.iloc[1]
    if is_object_or_nan_string_dtype(any_string_dtype):
        assert all(np.isnan(value) for value in na_row)
    else:
        assert all(value is pd.NA for value in na_row)
|  | ||||
|  | ||||
| def test_split_with_name_series(any_string_dtype): | ||||
|     # GH 12617 | ||||
|  | ||||
|     # should preserve name | ||||
|     s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype) | ||||
|     res = s.str.split(",") | ||||
|     exp = Series([["a", "b"], ["c", "d"]], name="xxx") | ||||
|     tm.assert_series_equal(res, exp) | ||||
|  | ||||
|     res = s.str.split(",", expand=True) | ||||
|     exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype) | ||||
|     tm.assert_frame_equal(res, exp) | ||||
|  | ||||
|  | ||||
def test_split_with_name_index():
    """Splitting a named Index, with and without ``expand``."""
    # GH 12617
    idx = Index(["a,b", "c,d"], name="xxx")

    # without expand: a flat Index of lists that keeps the name
    flat = idx.str.split(",")
    expected_flat = Index([["a", "b"], ["c", "d"]], name="xxx")
    assert flat.nlevels == 1
    tm.assert_index_equal(flat, expected_flat)

    # with expand: a two-level MultiIndex
    expanded = idx.str.split(",", expand=True)
    expected_multi = MultiIndex.from_tuples([("a", "b"), ("c", "d")])
    assert expanded.nlevels == 2
    tm.assert_index_equal(expanded, expected_multi)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, exp",
    [
        [
            "partition",
            [
                ("a", "__", "b__c"),
                ("c", "__", "d__e"),
                np.nan,
                ("f", "__", "g__h"),
                None,
            ],
        ],
        [
            "rpartition",
            [
                ("a__b", "__", "c"),
                ("c__d", "__", "e"),
                np.nan,
                ("f__g", "__", "h"),
                None,
            ],
        ],
    ],
)
def test_partition_series_more_than_one_char(method, exp, any_string_dtype):
    """Partition on a separator that is longer than one character."""
    # https://github.com/pandas-dev/pandas/issues/23558
    ser = Series(
        ["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype
    )
    partition = getattr(ser.str, method)

    result = partition("__", expand=False)

    # align the missing-value sentinel of the expectation with the input dtype
    expected = _convert_na_value(ser, Series(exp))
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, exp",
    [
        [
            "partition",
            [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None],
        ],
        [
            "rpartition",
            [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None],
        ],
    ],
)
def test_partition_series_none(any_string_dtype, method, exp):
    """Partition without passing a separator argument."""
    # https://github.com/pandas-dev/pandas/issues/23558
    ser = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype)
    partition = getattr(ser.str, method)

    result = partition(expand=False)

    # align the missing-value sentinel of the expectation with the input dtype
    expected = _convert_na_value(ser, Series(exp))
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, exp",
    [
        [
            "partition",
            [("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None],
        ],
        [
            "rpartition",
            [("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None],
        ],
    ],
)
def test_partition_series_not_split(any_string_dtype, method, exp):
    """Partition on a separator that does not occur in any value."""
    # https://github.com/pandas-dev/pandas/issues/23558
    ser = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype)
    partition = getattr(ser.str, method)

    result = partition("_", expand=False)

    # align the missing-value sentinel of the expectation with the input dtype
    expected = _convert_na_value(ser, Series(exp))
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, exp",
    [
        [
            "partition",
            [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")],
        ],
        [
            "rpartition",
            [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")],
        ],
    ],
)
def test_partition_series_unicode(any_string_dtype, method, exp):
    """Partition plain ``str`` values on a single-character separator."""
    # https://github.com/pandas-dev/pandas/issues/23558
    # unicode
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
    partition = getattr(ser.str, method)

    result = partition("_", expand=False)

    # align the missing-value sentinel of the expectation with the input dtype
    expected = _convert_na_value(ser, Series(exp))
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["partition", "rpartition"]) | ||||
| def test_partition_series_stdlib(any_string_dtype, method): | ||||
|     # https://github.com/pandas-dev/pandas/issues/23558 | ||||
|     # compare to standard lib | ||||
|     s = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"], dtype=any_string_dtype) | ||||
|     result = getattr(s.str, method)("_", expand=False).tolist() | ||||
|     assert result == [getattr(v, method)("_") for v in s] | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, expand, exp, exp_levels",
    [
        [
            "partition",
            False,
            np.array(
                [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None],
                dtype=object,
            ),
            1,
        ],
        [
            "rpartition",
            False,
            np.array(
                [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None],
                dtype=object,
            ),
            1,
        ],
    ],
)
def test_partition_index(method, expand, exp, exp_levels):
    """Partitioning an Index gives the expected values and number of levels."""
    # https://github.com/pandas-dev/pandas/issues/23558
    values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None])
    expected = Index(exp)

    result = getattr(values.str, method)("_", expand=expand)

    tm.assert_index_equal(result, expected)
    assert result.nlevels == exp_levels
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "method, exp", | ||||
|     [ | ||||
|         [ | ||||
|             "partition", | ||||
|             { | ||||
|                 0: ["a", "c", np.nan, "f", None], | ||||
|                 1: ["_", "_", np.nan, "_", None], | ||||
|                 2: ["b_c", "d_e", np.nan, "g_h", None], | ||||
|             }, | ||||
|         ], | ||||
|         [ | ||||
|             "rpartition", | ||||
|             { | ||||
|                 0: ["a_b", "c_d", np.nan, "f_g", None], | ||||
|                 1: ["_", "_", np.nan, "_", None], | ||||
|                 2: ["c", "e", np.nan, "h", None], | ||||
|             }, | ||||
|         ], | ||||
|     ], | ||||
| ) | ||||
| def test_partition_to_dataframe(any_string_dtype, method, exp): | ||||
|     # https://github.com/pandas-dev/pandas/issues/23558 | ||||
|  | ||||
|     s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype) | ||||
|     result = getattr(s.str, method)("_") | ||||
|     expected = DataFrame( | ||||
|         exp, | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "method, exp", | ||||
|     [ | ||||
|         [ | ||||
|             "partition", | ||||
|             { | ||||
|                 0: ["a", "c", np.nan, "f", None], | ||||
|                 1: ["_", "_", np.nan, "_", None], | ||||
|                 2: ["b_c", "d_e", np.nan, "g_h", None], | ||||
|             }, | ||||
|         ], | ||||
|         [ | ||||
|             "rpartition", | ||||
|             { | ||||
|                 0: ["a_b", "c_d", np.nan, "f_g", None], | ||||
|                 1: ["_", "_", np.nan, "_", None], | ||||
|                 2: ["c", "e", np.nan, "h", None], | ||||
|             }, | ||||
|         ], | ||||
|     ], | ||||
| ) | ||||
| def test_partition_to_dataframe_from_series(any_string_dtype, method, exp): | ||||
|     # https://github.com/pandas-dev/pandas/issues/23558 | ||||
|     s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype) | ||||
|     result = getattr(s.str, method)("_", expand=True) | ||||
|     expected = DataFrame( | ||||
|         exp, | ||||
|         dtype=any_string_dtype, | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_partition_with_name(any_string_dtype): | ||||
|     # GH 12617 | ||||
|  | ||||
|     s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype) | ||||
|     result = s.str.partition(",") | ||||
|     expected = DataFrame( | ||||
|         {0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}, dtype=any_string_dtype | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_partition_with_name_expand(any_string_dtype): | ||||
|     # GH 12617 | ||||
|     # should preserve name | ||||
|     s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype) | ||||
|     result = s.str.partition(",", expand=False) | ||||
|     expected = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_partition_index_with_name():
    """Partitioning a named Index yields a three-level MultiIndex."""
    idx = Index(["a,b", "c,d"], name="xxx")
    rows = [("a", ",", "b"), ("c", ",", "d")]
    expected = MultiIndex.from_tuples(rows)

    result = idx.str.partition(",")

    assert result.nlevels == 3
    tm.assert_index_equal(result, expected)
|  | ||||
|  | ||||
def test_partition_index_with_name_expand_false():
    """With ``expand=False`` the flat Index result should preserve the name."""
    idx = Index(["a,b", "c,d"], name="xxx")
    rows = np.array([("a", ",", "b"), ("c", ",", "d")])
    expected = Index(rows, name="xxx")

    result = idx.str.partition(",", expand=False)

    assert result.nlevels == 1
    tm.assert_index_equal(result, expected)
|  | ||||
|  | ||||
| @pytest.mark.parametrize("method", ["partition", "rpartition"]) | ||||
| def test_partition_sep_kwarg(any_string_dtype, method): | ||||
|     # GH 22676; depr kwarg "pat" in favor of "sep" | ||||
|     s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) | ||||
|  | ||||
|     expected = getattr(s.str, method)(sep="_") | ||||
|     result = getattr(s.str, method)("_") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_get():
    """``str.get`` picks one element out of each split list; NaN stays NaN."""
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
    pieces = ser.str.split("_")

    result = pieces.str.get(1)

    expected = Series(["b", "d", np.nan, "g"], dtype=object)
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_get_mixed_object():
    """In a mixed object Series, non-string entries come back as missing."""
    mixed = ["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]
    ser = Series(mixed)

    result = ser.str.split("_").str.get(1)

    expected = Series(
        ["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object
    )
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize("idx", [2, -3])
def test_get_bounds(idx):
    """A position outside a row's split list gives NaN for that row."""
    ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"])
    pieces = ser.str.split("_")

    result = pieces.str.get(idx)

    expected = Series(["3", "8", np.nan], dtype=object)
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "idx, exp", [[2, [3, 3, np.nan, "b"]], [-1, [3, 3, np.nan, np.nan]]]
)
def test_get_complex(idx, exp):
    """``str.get`` on tuple, list, set and dict elements."""
    # GH 20671, getting value not in dict raising `KeyError`
    containers = [(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}]
    ser = Series(containers)

    result = ser.str.get(idx)

    tm.assert_series_equal(result, Series(exp))
|  | ||||
|  | ||||
@pytest.mark.parametrize("to_type", [tuple, list, np.array])
def test_get_complex_nested(to_type):
    """``str.get`` on a nested container returns the inner container, or NaN."""
    nested = to_type([to_type([1, 2])])
    ser = Series([nested])

    # position 0 holds the inner container
    first = ser.str.get(0)
    tm.assert_series_equal(first, Series([to_type([1, 2])]))

    # position 1 does not exist
    missing = ser.str.get(1)
    tm.assert_series_equal(missing, Series([np.nan]))
|  | ||||
|  | ||||
| def test_get_strings(any_string_dtype): | ||||
|     ser = Series(["a", "ab", np.nan, "abc"], dtype=any_string_dtype) | ||||
|     result = ser.str.get(2) | ||||
|     expected = Series([np.nan, np.nan, np.nan, "c"], dtype=any_string_dtype) | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,111 @@ | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas._libs import lib | ||||
|  | ||||
| from pandas import ( | ||||
|     NA, | ||||
|     DataFrame, | ||||
|     Series, | ||||
|     _testing as tm, | ||||
|     option_context, | ||||
| ) | ||||
|  | ||||
|  | ||||
def test_string_array(nullable_string_dtype, any_string_method):
    """
    Compare each ``.str`` method on a nullable string Series with the same
    call on an object-dtype Series built from the same data.

    ``any_string_method`` unpacks to ``(method_name, args, kwargs)``. The
    result dtypes are first checked against what the nullable dtype should
    produce, then one side is cast so the values can be compared.
    """
    method_name, args, kwargs = any_string_method

    data = ["a", "bb", np.nan, "ccc"]
    a = Series(data, dtype=object)
    b = Series(data, dtype=nullable_string_dtype)

    if method_name == "decode":
        # decode is expected to raise on the nullable string Series, so there
        # is nothing further to compare
        with pytest.raises(TypeError, match="a bytes-like object is required"):
            getattr(b.str, method_name)(*args, **kwargs)
        return

    expected = getattr(a.str, method_name)(*args, **kwargs)
    result = getattr(b.str, method_name)(*args, **kwargs)

    if isinstance(expected, Series):
        # object result holding only strings once NA is dropped: the nullable
        # result must keep the string dtype; cast it to object to compare
        if expected.dtype == "object" and lib.is_string_array(
            expected.dropna().values,
        ):
            assert result.dtype == nullable_string_dtype
            result = result.astype(object)

        # object result holding booleans (NA skipped): the nullable result
        # must be "boolean"; cast the expectation instead
        elif expected.dtype == "object" and lib.is_bool_array(
            expected.values, skipna=True
        ):
            assert result.dtype == "boolean"
            expected = expected.astype("boolean")

        # plain bool result: the nullable result must be "boolean"
        elif expected.dtype == "bool":
            assert result.dtype == "boolean"
            result = result.astype("bool")

        # float result containing NaN: the nullable result must be "Int64"
        elif expected.dtype == "float" and expected.isna().any():
            assert result.dtype == "Int64"
            result = result.astype("float")

        if expected.dtype == object:
            # GH#18463
            # use NA as the missing-value marker in the expectation
            expected[expected.isna()] = NA

    elif isinstance(expected, DataFrame):
        # every object column of the expectation must come back with the
        # nullable string dtype; cast those columns to object to compare
        columns = expected.select_dtypes(include="object").columns
        assert all(result[columns].dtypes == nullable_string_dtype)
        result[columns] = result[columns].astype(object)
        with option_context("future.no_silent_downcasting", True):
            expected[columns] = expected[columns].fillna(NA)  # GH#18463

    tm.assert_equal(result, expected)
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "method,expected", | ||||
|     [ | ||||
|         ("count", [2, None]), | ||||
|         ("find", [0, None]), | ||||
|         ("index", [0, None]), | ||||
|         ("rindex", [2, None]), | ||||
|     ], | ||||
| ) | ||||
| def test_string_array_numeric_integer_array(nullable_string_dtype, method, expected): | ||||
|     s = Series(["aba", None], dtype=nullable_string_dtype) | ||||
|     result = getattr(s.str, method)("a") | ||||
|     expected = Series(expected, dtype="Int64") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "method,expected", | ||||
|     [ | ||||
|         ("isdigit", [False, None, True]), | ||||
|         ("isalpha", [True, None, False]), | ||||
|         ("isalnum", [True, None, True]), | ||||
|         ("isnumeric", [False, None, True]), | ||||
|     ], | ||||
| ) | ||||
| def test_string_array_boolean_array(nullable_string_dtype, method, expected): | ||||
|     s = Series(["a", None, "1"], dtype=nullable_string_dtype) | ||||
|     result = getattr(s.str, method)() | ||||
|     expected = Series(expected, dtype="boolean") | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_string_array_extract(nullable_string_dtype): | ||||
|     # https://github.com/pandas-dev/pandas/issues/30969 | ||||
|     # Only expand=False & multiple groups was failing | ||||
|  | ||||
|     a = Series(["a1", "b2", "cc"], dtype=nullable_string_dtype) | ||||
|     b = Series(["a1", "b2", "cc"], dtype="object") | ||||
|     pat = r"(\w)(\d)" | ||||
|  | ||||
|     result = a.str.extract(pat, expand=False) | ||||
|     expected = b.str.extract(pat, expand=False) | ||||
|     expected = expected.fillna(NA)  # GH#18463 | ||||
|     assert all(result.dtypes == nullable_string_dtype) | ||||
|  | ||||
|     result = result.astype(object) | ||||
|     tm.assert_equal(result, expected) | ||||
| @ -0,0 +1,802 @@ | ||||
| from datetime import ( | ||||
|     datetime, | ||||
|     timedelta, | ||||
| ) | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.core.strings.accessor import StringMethods | ||||
| from pandas.tests.strings import is_object_or_nan_string_dtype | ||||
|  | ||||
|  | ||||
@pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])])
def test_startswith_endswith_non_str_patterns(pattern):
    """``startswith``/``endswith`` raise TypeError for a non-str, non-tuple pattern."""
    # GH3485
    ser = Series(["foo", "bar"])
    msg = f"expected a string or tuple, not {type(pattern).__name__}"

    for method_name in ("startswith", "endswith"):
        with pytest.raises(TypeError, match=msg):
            getattr(ser.str, method_name)(pattern)
|  | ||||
|  | ||||
| def test_iter_raises(): | ||||
|     # GH 54173 | ||||
|     ser = Series(["foo", "bar"]) | ||||
|     with pytest.raises(TypeError, match="'StringMethods' object is not iterable"): | ||||
|         iter(ser.str) | ||||
|  | ||||
|  | ||||
| # test integer/float dtypes (inferred by constructor) and mixed | ||||
|  | ||||
|  | ||||
def test_count(any_string_dtype):
    """``str.count`` counts regex matches per element; NaN stays missing."""
    ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype)

    # object / NaN-variant string dtypes give float64, the others give Int64
    if is_object_or_nan_string_dtype(any_string_dtype):
        expected_dtype = np.float64
    else:
        expected_dtype = "Int64"
    expected = Series([1, 2, np.nan, 4], dtype=expected_dtype)

    result = ser.str.count("f[o]+")

    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_count_mixed_object():
    """In a mixed object Series, non-string entries count as NaN."""
    mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
    ser = Series(mixed, dtype=object)

    result = ser.str.count("a")

    expected = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan])
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
| def test_repeat(any_string_dtype): | ||||
|     ser = Series(["a", "b", np.nan, "c", np.nan, "d"], dtype=any_string_dtype) | ||||
|  | ||||
|     result = ser.str.repeat(3) | ||||
|     expected = Series( | ||||
|         ["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"], dtype=any_string_dtype | ||||
|     ) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|     result = ser.str.repeat([1, 2, 3, 4, 5, 6]) | ||||
|     expected = Series( | ||||
|         ["a", "bb", np.nan, "cccc", np.nan, "dddddd"], dtype=any_string_dtype | ||||
|     ) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_repeat_mixed_object():
    """In a mixed object Series, only the strings are repeated."""
    mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
    ser = Series(mixed)

    result = ser.str.repeat(3)

    expected = Series(
        ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan],
        dtype=object,
    )
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
| @pytest.mark.parametrize("arg, repeat", [[None, 4], ["b", None]]) | ||||
| def test_repeat_with_null(any_string_dtype, arg, repeat): | ||||
|     # GH: 31632 | ||||
|     ser = Series(["a", arg], dtype=any_string_dtype) | ||||
|     result = ser.str.repeat([3, repeat]) | ||||
|     expected = Series(["aaa", None], dtype=any_string_dtype) | ||||
|     tm.assert_series_equal(result, expected) | ||||
|  | ||||
|  | ||||
def test_empty_str_methods(any_string_dtype):
    """
    Call the ``.str`` methods on an empty string Series and check that each
    returns an empty result of the expected type and dtype.
    """
    empty = Series(dtype=any_string_dtype)
    empty_str = empty
    empty_inferred_str = Series(dtype="str")
    empty_object = Series(dtype=object)
    empty_bytes = Series(dtype=object)
    empty_df = DataFrame()
    # integer/boolean results use the NumPy dtypes for object and NaN-variant
    # string dtypes, and the nullable extension dtypes otherwise
    nan_semantics = is_object_or_nan_string_dtype(any_string_dtype)
    empty_int = Series(dtype="int64" if nan_semantics else "Int64")
    empty_bool = Series(dtype=bool if nan_semantics else "boolean")

    strs = empty.str
    check_series = tm.assert_series_equal
    check_frame = tm.assert_frame_equal

    # methods whose result keeps the string dtype of the input
    table = str.maketrans("a", "b")
    string_results = [
        strs.cat(empty),
        strs.title(),
        strs.lower(),
        strs.upper(),
        strs.replace("a", "b"),
        strs.repeat(3),
        strs.extract("()", expand=False),
        strs.join(""),
        strs.pad(42),
        strs.center(42),
        strs.slice(stop=1),
        strs.slice(step=1),
        strs.strip(),
        strs.lstrip(),
        strs.rstrip(),
        strs.wrap(42),
        strs.get(0),
        strs.capitalize(),
        strs.swapcase(),
        strs.normalize("NFC"),
        strs.translate(table),
    ]
    for res in string_results:
        check_series(empty_str, res)
    # cat without ``others`` collapses to a single string
    assert "" == strs.cat()

    # methods returning integers
    int_results = [strs.count("a"), strs.len(), strs.find("a"), strs.rfind("a")]
    for res in int_results:
        check_series(empty_int, res)

    # methods returning booleans
    bool_results = [
        strs.contains("a"),
        strs.startswith("a"),
        strs.endswith("a"),
        strs.match("^a"),
    ]
    # ismethods should always return boolean (GH 29624)
    is_methods = (
        "isalnum",
        "isalpha",
        "isdigit",
        "isspace",
        "islower",
        "isupper",
        "istitle",
        "isnumeric",
        "isdecimal",
    )
    bool_results.extend(getattr(strs, name)() for name in is_methods)
    for res in bool_results:
        check_series(empty_bool, res)

    # methods returning object-dtype Series
    object_results = [
        strs.findall("a"),
        strs.split("a"),
        strs.rsplit("a"),
        strs.partition("a", expand=False),
        strs.rpartition("a", expand=False),
    ]
    for res in object_results:
        check_series(empty_object, res)

    # GH7241
    # (extract) on empty series
    one_group = DataFrame(columns=[0], dtype=any_string_dtype)
    two_groups = DataFrame(columns=[0, 1], dtype=any_string_dtype)
    check_frame(one_group, strs.extract("()", expand=True))
    check_frame(two_groups, strs.extract("()()", expand=True))
    check_frame(two_groups, strs.extract("()()", expand=False))

    # other frame-returning methods
    check_frame(empty_df.set_axis([], axis=1), strs.get_dummies())
    check_frame(empty_df, strs.partition("a"))
    check_frame(empty_df, strs.rpartition("a"))

    # bytes round trip
    check_series(empty_inferred_str, empty_bytes.str.decode("ascii"))
    check_series(empty_bytes, strs.encode("ascii"))
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, expected",
    [
        ("isalnum", [True, True, True, True, True, False, True, True, False, False]),
        ("isalpha", [True, True, True, False, False, False, True, False, False, False]),
        (
            "isdigit",
            [False, False, False, True, False, False, False, True, False, False],
        ),
        (
            "isnumeric",
            [False, False, False, True, False, False, False, True, False, False],
        ),
        (
            "isspace",
            [False, False, False, False, False, False, False, False, False, True],
        ),
        (
            "islower",
            [False, True, False, False, False, False, False, False, False, False],
        ),
        (
            "isupper",
            [True, False, False, False, True, False, True, False, False, False],
        ),
        (
            "istitle",
            [True, False, True, False, True, False, False, False, False, False],
        ),
    ],
)
def test_ismethods(method, expected, any_string_dtype):
    """
    Check a ``Series.str.is*`` predicate in three ways: against the
    parametrized expectation, against the same ``str`` method applied
    element by element, and again after missing values are inserted.
    """
    ser = Series(
        ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", "  "], dtype=any_string_dtype
    )
    expected_dtype = (
        "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
    )
    expected = Series(expected, dtype=expected_dtype)
    result = getattr(ser.str, method)()
    tm.assert_series_equal(result, expected)

    # compare with standard library
    expected_stdlib = [getattr(item, method)() for item in ser]
    assert list(result) == expected_stdlib

    # with missing value
    ser.iloc[[1, 2, 3, 4]] = np.nan
    result = getattr(ser.str, method)()
    if ser.dtype == "object":
        # bool cannot hold NaN, so the expectation is widened to object first
        expected = expected.astype(object)
        expected.iloc[[1, 2, 3, 4]] = np.nan
    elif ser.dtype == "str":
        # NaN propagates as False
        expected.iloc[[1, 2, 3, 4]] = False
    else:
        # nullable dtypes propagate NaN
        expected.iloc[[1, 2, 3, 4]] = np.nan
    # The expectation above used to be built and then discarded; assert it so
    # the missing-value behaviour is actually verified.
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, expected",
    [
        ("isnumeric", [False, True, True, False, True, True, False]),
        ("isdecimal", [False, True, False, False, False, True, False]),
    ],
)
def test_isnumeric_unicode(method, expected, any_string_dtype):
    """
    Check ``isnumeric``/``isdecimal`` on non-ASCII numeric characters.

    The non-ASCII inputs are written as escapes so that the exact code points
    survive any tool that normalizes the file's text:

    - 0x00bc: VULGAR FRACTION ONE QUARTER
    - 0x2605: BLACK STAR (not a number)
    - 0x1378: ETHIOPIC NUMBER SEVENTY
    - 0xFF13: FULLWIDTH DIGIT THREE
    """
    ser = Series(
        ["A", "3", "\u00bc", "\u2605", "\u1378", "\uff13", "four"],
        dtype=any_string_dtype,
    )
    expected_dtype = (
        "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
    )
    expected = Series(expected, dtype=expected_dtype)
    result = getattr(ser.str, method)()
    tm.assert_series_equal(result, expected)

    # compare with standard library
    expected = [getattr(item, method)() for item in ser]
    assert list(result) == expected
|  | ||||
|  | ||||
@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
@pytest.mark.parametrize(
    "method, expected",
    [
        ("isnumeric", [False, np.nan, True, False, np.nan, True, False]),
        ("isdecimal", [False, np.nan, False, False, np.nan, True, False]),
    ],
)
def test_isnumeric_unicode_missing(method, expected, any_string_dtype):
    """
    Check ``isnumeric``/``isdecimal`` on non-ASCII input containing missing
    values.
    """
    # Escapes keep the exact code points intact: U+00BC (one quarter),
    # U+2605 (black star) and U+FF13 (fullwidth digit three).
    values = ["A", np.nan, "\u00bc", "\u2605", np.nan, "\uff13", "four"]
    ser = Series(values, dtype=any_string_dtype)
    if any_string_dtype == "str":
        # NaN propagates as False
        expected = Series(expected, dtype=object).fillna(False).astype(bool)
    else:
        expected_dtype = (
            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
        )
        expected = Series(expected, dtype=expected_dtype)
    result = getattr(ser.str, method)()
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_spilt_join_roundtrip(any_string_dtype):
    # split on "_" and join with "_" again; the expectation is the input
    # cast to object dtype
    original = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
    pieces = original.str.split("_")
    rejoined = pieces.str.join("_")
    tm.assert_series_equal(rejoined, original.astype(object))
|  | ||||
|  | ||||
def test_spilt_join_roundtrip_mixed_object():
    # only the string entries are expected to round-trip; every non-string
    # entry is expected to come back as a missing value
    mixed = Series(
        ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0]
    )
    expected = Series(
        ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan],
        dtype=object,
    )
    rejoined = mixed.str.split("_").str.join("_")
    tm.assert_series_equal(rejoined, expected)
|  | ||||
|  | ||||
def test_len(any_string_dtype):
    # inputs include a missing value, a trailing newline and a non-ASCII char
    values = ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"]
    ser = Series(values, dtype=any_string_dtype)

    if is_object_or_nan_string_dtype(any_string_dtype):
        expected_dtype = "float64"
    else:
        expected_dtype = "Int64"
    expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype)

    tm.assert_series_equal(ser.str.len(), expected)
|  | ||||
|  | ||||
def test_len_mixed():
    # lengths are expected for the string entries only; everything else is NaN
    mixed = Series(
        ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0]
    )
    expected = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan])
    lengths = mixed.str.len()
    tm.assert_series_equal(lengths, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method,sub,start,end,expected",
    [
        ("index", "EF", None, None, [4, 3, 1, 0]),
        ("rindex", "EF", None, None, [4, 5, 7, 4]),
        ("index", "EF", 3, None, [4, 3, 7, 4]),
        ("rindex", "EF", 3, None, [4, 5, 7, 4]),
        ("index", "E", 4, 8, [4, 5, 7, 4]),
        ("rindex", "E", 0, 5, [4, 3, 1, 4]),
    ],
)
def test_index(method, sub, start, end, index_or_series, any_string_dtype, expected):
    # str.index / str.rindex must match both the parametrized positions and
    # the positions reported by the built-in str methods
    strings = ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]
    obj = index_or_series(strings, dtype=any_string_dtype)

    if is_object_or_nan_string_dtype(any_string_dtype):
        expected_dtype = np.int64
    else:
        expected_dtype = "Int64"
    expected_obj = index_or_series(expected, dtype=expected_dtype)

    result = getattr(obj.str, method)(sub, start, end)

    assert_equal = (
        tm.assert_series_equal if index_or_series is Series else tm.assert_index_equal
    )
    assert_equal(result, expected_obj)

    # compare with standard library
    from_stdlib = [getattr(item, method)(sub, start, end) for item in obj]
    assert list(result) == from_stdlib
|  | ||||
|  | ||||
def test_index_not_found_raises(index_or_series, any_string_dtype):
    # "DE" does not occur in every element, so str.index must raise
    strings = ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]
    container = index_or_series(strings, dtype=any_string_dtype)
    with pytest.raises(ValueError, match="substring not found"):
        container.str.index("DE")
|  | ||||
|  | ||||
@pytest.mark.parametrize("method", ["index", "rindex"])
def test_index_wrong_type_raises(index_or_series, any_string_dtype, method):
    # an integer "substring" must be rejected even when the container is empty
    empty = index_or_series([], dtype=any_string_dtype)
    accessor_method = getattr(empty.str, method)

    with pytest.raises(TypeError, match="expected a string object, not int"):
        accessor_method(0)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, exp",
    [
        ["index", [1, 1, 0]],
        ["rindex", [3, 1, 2]],
    ],
)
def test_index_missing(any_string_dtype, method, exp):
    # the trailing missing value must stay missing in the result
    ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype)

    if is_object_or_nan_string_dtype(any_string_dtype):
        expected_dtype = np.float64
    else:
        expected_dtype = "Int64"
    expected = Series([*exp, np.nan], dtype=expected_dtype)

    positions = getattr(ser.str, method)("b")
    tm.assert_series_equal(positions, expected)
|  | ||||
|  | ||||
def test_pipe_failures(any_string_dtype):
    # #2119
    # "|" must be usable as a literal separator / pattern
    piped = Series(["A|B|C"], dtype=any_string_dtype)

    tm.assert_series_equal(
        piped.str.split("|"),
        Series([["A", "B", "C"]], dtype=object),
    )

    tm.assert_series_equal(
        piped.str.replace("|", " ", regex=False),
        Series(["A B C"], dtype=any_string_dtype),
    )
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "start, stop, step, expected",
    [
        (2, 5, None, ["foo", "bar", np.nan, "baz"]),
        (0, 3, -1, ["", "", np.nan, ""]),
        (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]),
        (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]),
        (3, 10, 2, ["oto", "ato", np.nan, "aqx"]),
        (3, 0, -1, ["ofa", "aba", np.nan, "aba"]),
    ],
)
def test_slice(start, stop, step, expected, any_string_dtype):
    # the result is expected to keep the input dtype
    source = Series(
        ["aafootwo", "aabartwo", np.nan, "aabazqux"], dtype=any_string_dtype
    )
    expected_ser = Series(expected, dtype=any_string_dtype)
    sliced = source.str.slice(start, stop, step)
    tm.assert_series_equal(sliced, expected_ser)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "start, stop, step, expected",
    [
        (2, 5, None, ["foo", np.nan, "bar", np.nan, np.nan, None, np.nan, np.nan]),
        (4, 1, -1, ["oof", np.nan, "rab", np.nan, np.nan, None, np.nan, np.nan]),
    ],
)
def test_slice_mixed_object(start, stop, step, expected):
    # only the two string entries are expected to produce a slice
    mixed = Series(
        ["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]
    )
    sliced = mixed.str.slice(start, stop, step)
    tm.assert_series_equal(sliced, Series(expected, dtype=object))
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "start,stop,repl,expected",
    [
        (2, 3, None, ["shrt", "a it longer", "evnlongerthanthat", "", np.nan]),
        (2, 3, "z", ["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]),
        (2, 2, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]),
        (2, 1, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]),
        (-1, None, "z", ["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]),
        (None, -2, "z", ["zrt", "zer", "zat", "z", np.nan]),
        (6, 8, "z", ["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]),
        (-10, 3, "z", ["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]),
    ],
)
def test_slice_replace(start, stop, repl, expected, any_string_dtype):
    # inputs cover short, long, empty and missing strings
    values = ["short", "a bit longer", "evenlongerthanthat", "", np.nan]
    source = Series(values, dtype=any_string_dtype)
    expected_ser = Series(expected, dtype=any_string_dtype)
    replaced = source.str.slice_replace(start, stop, repl)
    tm.assert_series_equal(replaced, expected_ser)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, exp",
    [
        ["strip", ["aa", "bb", np.nan, "cc"]],
        ["lstrip", ["aa   ", "bb \n", np.nan, "cc  "]],
        ["rstrip", ["  aa", " bb", np.nan, "cc"]],
    ],
)
def test_strip_lstrip_rstrip(any_string_dtype, method, exp):
    # called without arguments, i.e. using the default characters to strip
    padded = Series(["  aa   ", " bb \n", np.nan, "cc  "], dtype=any_string_dtype)
    strip_func = getattr(padded.str, method)

    stripped = strip_func()
    tm.assert_series_equal(stripped, Series(exp, dtype=any_string_dtype))
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, exp",
    [
        ["strip", ["aa", np.nan, "bb"]],
        ["lstrip", ["aa  ", np.nan, "bb \t\n"]],
        ["rstrip", ["  aa", np.nan, " bb"]],
    ],
)
def test_strip_lstrip_rstrip_mixed_object(method, exp):
    # ``exp`` covers the first three entries; the remaining non-string
    # entries are expected to be missing
    mixed = Series(
        ["  aa  ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]
    )
    expected = Series([*exp, np.nan, np.nan, None, np.nan, np.nan], dtype=object)

    stripped = getattr(mixed.str, method)()
    tm.assert_series_equal(stripped, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "method, exp",
    [
        ["strip", ["ABC", " BNSD", "LDFJH "]],
        ["lstrip", ["ABCxx", " BNSD", "LDFJH xx"]],
        ["rstrip", ["xxABC", "xx BNSD", "LDFJH "]],
    ],
)
def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp):
    # strip an explicit character ("x") instead of the default
    source = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype)
    strip_func = getattr(source.str, method)

    stripped = strip_func("x")
    tm.assert_series_equal(stripped, Series(exp, dtype=any_string_dtype))
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "prefix, expected", [("a", ["b", " b c", "bc"]), ("ab", ["", "a b c", "bc"])]
)
def test_removeprefix(any_string_dtype, prefix, expected):
    # elements that do not start with the prefix must be left untouched
    source = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    trimmed = source.str.removeprefix(prefix)
    tm.assert_series_equal(trimmed, Series(expected, dtype=any_string_dtype))
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "suffix, expected", [("c", ["ab", "a b ", "b"]), ("bc", ["ab", "a b c", ""])]
)
def test_removesuffix(any_string_dtype, suffix, expected):
    # elements that do not end with the suffix must be left untouched
    source = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    trimmed = source.str.removesuffix(suffix)
    tm.assert_series_equal(trimmed, Series(expected, dtype=any_string_dtype))
|  | ||||
|  | ||||
def test_string_slice_get_syntax(any_string_dtype):
    # ``ser.str[...]`` must agree with the explicit ``get`` / ``slice`` calls
    values = [
        "YYY",
        "B",
        "C",
        "YYYYYYbYYY",
        "BYYYcYYY",
        np.nan,
        "CYYYBYYY",
        "dog",
        "cYYYt",
    ]
    ser = Series(values, dtype=any_string_dtype)

    # integer key <-> get
    tm.assert_series_equal(ser.str[0], ser.str.get(0))

    # slice with a stop only
    tm.assert_series_equal(ser.str[:3], ser.str.slice(stop=3))

    # slice with a start and a negative step
    tm.assert_series_equal(ser.str[2::-1], ser.str.slice(start=2, step=-1))
|  | ||||
|  | ||||
def test_string_slice_out_of_bounds_nested():
    # the one-element tuple has no position 1, so NaN is expected there
    tuples = Series([(1, 2), (1,), (3, 4, 5)])
    second_items = tuples.str[1]
    tm.assert_series_equal(second_items, Series([2, np.nan, 4]))
|  | ||||
|  | ||||
def test_string_slice_out_of_bounds(any_string_dtype):
    # "b" has no character at position 1, so a missing value is expected
    words = Series(["foo", "b", "ba"], dtype=any_string_dtype)
    second_chars = words.str[1]
    tm.assert_series_equal(
        second_chars, Series(["o", np.nan, "a"], dtype=any_string_dtype)
    )
|  | ||||
|  | ||||
def test_encode_decode(any_string_dtype):
    # encode to UTF-8 bytes, then decode back; the expectation uses "str" dtype
    text = Series(["a", "b", "a\xe4"], dtype=any_string_dtype)
    encoded = text.str.encode("utf-8")
    decoded = encoded.str.decode("utf-8")
    tm.assert_series_equal(decoded, Series(["a", "b", "a\xe4"], dtype="str"))
|  | ||||
|  | ||||
def test_encode_errors_kwarg(any_string_dtype):
    # "\x9d" has no cp1252 mapping: encoding raises by default and is
    # compared against bytes.encode(..., "ignore") when errors="ignore"
    text = Series(["a", "b", "a\x9d"], dtype=any_string_dtype)

    msg = (
        r"'charmap' codec can't encode character '\\x9d' in position 1: "
        "character maps to <undefined>"
    )
    with pytest.raises(UnicodeEncodeError, match=msg):
        text.str.encode("cp1252")

    expected = text.map(lambda item: item.encode("cp1252", "ignore"))
    encoded = text.str.encode("cp1252", "ignore")
    tm.assert_series_equal(encoded, expected)
|  | ||||
|  | ||||
def test_decode_errors_kwarg():
    # byte 0x9d has no cp1252 mapping: decoding raises by default and is
    # compared against bytes.decode(..., "ignore") when errors="ignore"
    raw = Series([b"a", b"b", b"a\x9d"])

    msg = (
        "'charmap' codec can't decode byte 0x9d in position 1: "
        "character maps to <undefined>"
    )
    with pytest.raises(UnicodeDecodeError, match=msg):
        raw.str.decode("cp1252")

    expected = raw.map(lambda item: item.decode("cp1252", "ignore")).astype("str")
    decoded = raw.str.decode("cp1252", "ignore")
    tm.assert_series_equal(decoded, expected)
|  | ||||
|  | ||||
def test_decode_string_dtype(string_dtype):
    # https://github.com/pandas-dev/pandas/pull/60940
    # the requested dtype must be the dtype of the decoded result
    raw = Series([b"a", b"b"])
    decoded = raw.str.decode("utf-8", dtype=string_dtype)
    tm.assert_series_equal(decoded, Series(["a", "b"], dtype=string_dtype))
|  | ||||
|  | ||||
def test_decode_object_dtype(object_dtype):
    # https://github.com/pandas-dev/pandas/pull/60940
    # the second element is the literal backslash sequence, given as raw bytes
    raw = Series([b"a", rb"\ud800"])
    decoded = raw.str.decode("utf-8", dtype=object_dtype)
    tm.assert_series_equal(decoded, Series(["a", r"\ud800"], dtype=object_dtype))
|  | ||||
|  | ||||
def test_decode_bad_dtype():
    # https://github.com/pandas-dev/pandas/pull/60940
    # a non-string, non-object target dtype must be rejected
    raw = Series([b"a", b"b"])
    with pytest.raises(
        ValueError, match="dtype must be string or object, got dtype='int64'"
    ):
        raw.str.decode("utf-8", dtype="int64")
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "form, expected",
    [
        # NFKC folds fullwidth Latin/digits and halfwidth katakana
        ("NFKC", ["ABC", "ABC", "123", np.nan, "\u30a2\u30a4\u30a8"]),
        # NFC applies no compatibility folding, so the input is unchanged
        (
            "NFC",
            [
                "ABC",
                "\uff21\uff22\uff23",
                "\uff11\uff12\uff13",
                np.nan,
                "\uff71\uff72\uff74",
            ],
        ),
    ],
)
def test_normalize(form, expected, any_string_dtype):
    """
    Check ``Series.str.normalize`` for a compatibility form (NFKC) and a
    canonical form (NFC).

    The input mixes plain ASCII with fullwidth Latin letters (U+FF21..U+FF23),
    fullwidth digits (U+FF11..U+FF13) and halfwidth katakana (U+FF71, U+FF72,
    U+FF74). The characters are written as escapes so the exact code points
    cannot be silently folded to their ASCII/fullwidth-katakana look-alikes,
    which would make both parametrized cases identical.
    """
    ser = Series(
        ["ABC", "\uff21\uff22\uff23", "\uff11\uff12\uff13", np.nan, "\uff71\uff72\uff74"],
        index=["a", "b", "c", "d", "e"],
        dtype=any_string_dtype,
    )
    expected = Series(expected, index=["a", "b", "c", "d", "e"], dtype=any_string_dtype)
    result = ser.str.normalize(form)
    tm.assert_series_equal(result, expected)
|  | ||||
|  | ||||
def test_normalize_bad_arg_raises(any_string_dtype):
    """Check that an unknown normalization form raises ``ValueError``."""
    # Same fixture as test_normalize: ASCII, fullwidth Latin letters,
    # fullwidth digits, a missing value and halfwidth katakana. Escapes keep
    # the exact code points intact.
    ser = Series(
        ["ABC", "\uff21\uff22\uff23", "\uff11\uff12\uff13", np.nan, "\uff71\uff72\uff74"],
        index=["a", "b", "c", "d", "e"],
        dtype=any_string_dtype,
    )
    with pytest.raises(ValueError, match="invalid normalization form"):
        ser.str.normalize("xxx")
|  | ||||
|  | ||||
def test_normalize_index():
    """Check ``Index.str.normalize("NFKC")`` on fullwidth/halfwidth input."""
    # Fullwidth Latin letters, fullwidth digits and halfwidth katakana,
    # written as escapes so the input genuinely differs from the expected
    # (NFKC-folded) output.
    idx = Index(["\uff21\uff22\uff23", "\uff11\uff12\uff13", "\uff71\uff72\uff74"])
    expected = Index(["ABC", "123", "\u30a2\u30a4\u30a8"])
    result = idx.str.normalize("NFKC")
    tm.assert_index_equal(result, expected)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "values,inferred_type",
    [
        (["a", "b"], "string"),
        (["a", "b", 1], "mixed-integer"),
        (["a", "b", 1.3], "mixed"),
        (["a", "b", 1.3, 1], "mixed-integer"),
        (["aa", datetime(2011, 1, 1)], "mixed"),
    ],
)
def test_index_str_accessor_visibility(values, inferred_type, index_or_series):
    # containers holding at least one string must expose the .str accessor
    container = index_or_series(values)

    # inferred_type is only checked for Index
    if index_or_series is Index:
        assert container.inferred_type == inferred_type

    accessor = container.str
    assert isinstance(accessor, StringMethods)
|  | ||||
|  | ||||
@pytest.mark.parametrize(
    "values,inferred_type",
    [
        ([1, np.nan], "floating"),
        ([datetime(2011, 1, 1)], "datetime64"),
        ([timedelta(1)], "timedelta64"),
    ],
)
def test_index_str_accessor_non_string_values_raises(
    values, inferred_type, index_or_series
):
    # containers without string values must refuse the .str accessor
    container = index_or_series(values)

    # inferred_type is only checked for Index
    if index_or_series is Index:
        assert container.inferred_type == inferred_type

    with pytest.raises(
        AttributeError, match="Can only use .str accessor with string values"
    ):
        container.str
|  | ||||
|  | ||||
def test_index_str_accessor_multiindex_raises():
    # MultiIndex has mixed dtype, but is not allowed to use the accessor
    midx = MultiIndex.from_tuples([("a", "b"), ("a", "b")])
    assert midx.inferred_type == "mixed"

    with pytest.raises(
        AttributeError, match="Can only use .str accessor with Index, not MultiIndex"
    ):
        midx.str
|  | ||||
|  | ||||
def test_str_accessor_no_new_attributes(any_string_dtype):
    # https://github.com/pandas-dev/pandas/issues/10673
    # assigning an unknown attribute on the accessor must raise
    letters = Series(list("aabbcde"), dtype=any_string_dtype)
    expected_msg = "You cannot add any new attribute"
    with pytest.raises(AttributeError, match=expected_msg):
        letters.str.xlabel = "a"
|  | ||||
|  | ||||
def test_cat_on_bytes_raises():
    # both operands are object-dtype Series holding single-byte values
    left_bytes = np.array(list("abc"), "S1").astype(object)
    right_bytes = np.array(list("def"), "S1").astype(object)
    lhs = Series(left_bytes)
    rhs = Series(right_bytes)

    with pytest.raises(
        TypeError, match="Cannot use .str.cat with values of inferred dtype 'bytes'"
    ):
        lhs.str.cat(rhs)
|  | ||||
|  | ||||
def test_str_accessor_in_apply_func():
    # https://github.com/pandas-dev/pandas/issues/38979
    # the .str accessor is used on each row inside DataFrame.apply
    frame = DataFrame(zip("abc", "def"))

    def upper_joined(row):
        return "/".join(row.str.upper())

    joined = frame.apply(upper_joined, axis=1)
    tm.assert_series_equal(joined, Series(["A/D", "B/E", "C/F"]))
|  | ||||
|  | ||||
def test_zfill():
    # https://github.com/pandas-dev/pandas/issues/20868
    # mixed input: the integer 10 and NaN are both expected to come back as NaN
    mixed = Series(["-1", "1", "1000", 10, np.nan])
    padded_mixed = mixed.str.zfill(3)
    tm.assert_series_equal(
        padded_mixed,
        Series(["-01", "001", "1000", np.nan, np.nan], dtype=object),
    )

    # signed strings: zeros are expected after the sign
    signed = Series(["-2", "+5"])
    padded_signed = signed.str.zfill(5)
    tm.assert_series_equal(padded_signed, Series(["-0002", "+0005"]))
|  | ||||
|  | ||||
def test_zfill_with_non_integer_argument():
    # a string width must be rejected with a message naming the offending type
    signed = Series(["-2", "+5"])
    bad_width = "a"
    expected_msg = f"width must be of integer type, not {type(bad_width).__name__}"
    with pytest.raises(TypeError, match=expected_msg):
        signed.str.zfill(bad_width)
|  | ||||
|  | ||||
def test_zfill_with_leading_sign():
    # zeros are expected between the sign and the rest, even for non-digits
    signed = Series(["-cat", "-1", "+dog"])
    padded = signed.str.zfill(5)
    tm.assert_series_equal(padded, Series(["-0cat", "-0001", "+0dog"]))
|  | ||||
|  | ||||
def test_get_with_dict_label():
    # GH47911
    # str.get with a label on a Series of dicts; the third dict has no
    # "name" key, so None is expected there
    records = Series(
        [
            {"name": "Hello", "value": "World"},
            {"name": "Goodbye", "value": "Planet"},
            {"value": "Sea"},
        ]
    )

    names = records.str.get("name")
    tm.assert_series_equal(names, Series(["Hello", "Goodbye", None], dtype=object))

    values = records.str.get("value")
    tm.assert_series_equal(values, Series(["World", "Planet", "Sea"], dtype=object))
|  | ||||
|  | ||||
def test_series_str_decode():
    # GH 22613
    # ``encoding`` and ``errors`` are passed by keyword
    raw = Series([b"x", b"y"])
    decoded = raw.str.decode(encoding="UTF-8", errors="strict")
    tm.assert_series_equal(decoded, Series(["x", "y"], dtype="str"))
|  | ||||
|  | ||||
def test_reversed_logical_ops(any_string_dtype):
    # GH#60234
    # bool Series on the left, string Series on the right: every dtype except
    # object is expected to emit a DeprecationWarning; the result is compared
    # with the same operation on the right operand cast to bool
    if any_string_dtype == object:
        warn = None
    else:
        warn = DeprecationWarning
    msg = "operations between boolean dtype and"

    left = Series([True, False, False, True])
    right = Series(["", "", "b", "c"], dtype=any_string_dtype)

    # or
    with tm.assert_produces_warning(warn, match=msg):
        or_result = left | right
    tm.assert_series_equal(or_result, left | right.astype(bool))

    # and
    with tm.assert_produces_warning(warn, match=msg):
        and_result = left & right
    tm.assert_series_equal(and_result, left & right.astype(bool))

    # xor
    with tm.assert_produces_warning(warn, match=msg):
        xor_result = left ^ right
    tm.assert_series_equal(xor_result, left ^ right.astype(bool))
		Reference in New Issue
	
	Block a user