done
This commit is contained in:
427
lib/python3.11/site-packages/pandas/tests/strings/test_cat.py
Normal file
427
lib/python3.11/site-packages/pandas/tests/strings/test_cat.py
Normal file
@ -0,0 +1,427 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
_testing as tm,
|
||||
concat,
|
||||
option_context,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("other", [None, Series, Index])
|
||||
def test_str_cat_name(index_or_series, other):
|
||||
# GH 21053
|
||||
box = index_or_series
|
||||
values = ["a", "b"]
|
||||
if other:
|
||||
other = other(values)
|
||||
else:
|
||||
other = values
|
||||
result = box(values, name="name").str.cat(other, sep=",")
|
||||
assert result.name == "name"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
|
||||
)
|
||||
def test_str_cat(index_or_series, infer_string):
|
||||
with option_context("future.infer_string", infer_string):
|
||||
box = index_or_series
|
||||
# test_cat above tests "str_cat" from ndarray;
|
||||
# here testing "str.cat" from Series/Index to ndarray/list
|
||||
s = box(["a", "a", "b", "b", "c", np.nan])
|
||||
|
||||
# single array
|
||||
result = s.str.cat()
|
||||
expected = "aabbc"
|
||||
assert result == expected
|
||||
|
||||
result = s.str.cat(na_rep="-")
|
||||
expected = "aabbc-"
|
||||
assert result == expected
|
||||
|
||||
result = s.str.cat(sep="_", na_rep="NA")
|
||||
expected = "a_a_b_b_c_NA"
|
||||
assert result == expected
|
||||
|
||||
t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object)
|
||||
expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"])
|
||||
|
||||
# Series/Index with array
|
||||
result = s.str.cat(t, na_rep="-")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with list
|
||||
result = s.str.cat(list(t), na_rep="-")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# errors for incorrect lengths
|
||||
rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
|
||||
z = Series(["1", "2", "3"])
|
||||
|
||||
with pytest.raises(ValueError, match=rgx):
|
||||
s.str.cat(z.values)
|
||||
|
||||
with pytest.raises(ValueError, match=rgx):
|
||||
s.str.cat(list(z))
|
||||
|
||||
|
||||
def test_str_cat_raises_intuitive_error(index_or_series):
|
||||
# GH 11334
|
||||
box = index_or_series
|
||||
s = box(["a", "b", "c", "d"])
|
||||
message = "Did you mean to supply a `sep` keyword?"
|
||||
with pytest.raises(ValueError, match=message):
|
||||
s.str.cat("|")
|
||||
with pytest.raises(ValueError, match=message):
|
||||
s.str.cat(" ")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
|
||||
)
|
||||
@pytest.mark.parametrize("sep", ["", None])
|
||||
@pytest.mark.parametrize("dtype_target", ["object", "category"])
|
||||
@pytest.mark.parametrize("dtype_caller", ["object", "category"])
|
||||
def test_str_cat_categorical(
|
||||
index_or_series, dtype_caller, dtype_target, sep, infer_string
|
||||
):
|
||||
box = index_or_series
|
||||
|
||||
with option_context("future.infer_string", infer_string):
|
||||
s = Index(["a", "a", "b", "a"], dtype=dtype_caller)
|
||||
s = s if box == Index else Series(s, index=s, dtype=s.dtype)
|
||||
t = Index(["b", "a", "b", "c"], dtype=dtype_target)
|
||||
|
||||
expected = Index(
|
||||
["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None
|
||||
)
|
||||
expected = (
|
||||
expected
|
||||
if box == Index
|
||||
else Series(
|
||||
expected, index=Index(s, dtype=dtype_caller), dtype=expected.dtype
|
||||
)
|
||||
)
|
||||
|
||||
# Series/Index with unaligned Index -> t.values
|
||||
result = s.str.cat(t.values, sep=sep)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with Series having matching Index
|
||||
t = Series(t.values, index=Index(s, dtype=dtype_caller))
|
||||
result = s.str.cat(t, sep=sep)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with Series.values
|
||||
result = s.str.cat(t.values, sep=sep)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with Series having different Index
|
||||
t = Series(t.values, index=t.values)
|
||||
expected = Index(
|
||||
["aa", "aa", "bb", "bb", "aa"],
|
||||
dtype=object if dtype_caller == "object" else None,
|
||||
)
|
||||
dtype = object if dtype_caller == "object" else s.dtype.categories.dtype
|
||||
expected = (
|
||||
expected
|
||||
if box == Index
|
||||
else Series(
|
||||
expected,
|
||||
index=Index(expected.str[:1], dtype=dtype),
|
||||
dtype=expected.dtype,
|
||||
)
|
||||
)
|
||||
|
||||
result = s.str.cat(t, sep=sep)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]],
|
||||
ids=["integers", "floats", "mixed"],
|
||||
)
|
||||
# without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
|
||||
@pytest.mark.parametrize(
|
||||
"box",
|
||||
[Series, Index, list, lambda x: np.array(x, dtype=object)],
|
||||
ids=["Series", "Index", "list", "np.array"],
|
||||
)
|
||||
def test_str_cat_wrong_dtype_raises(box, data):
|
||||
# GH 22722
|
||||
s = Series(["a", "b", "c"])
|
||||
t = box(data)
|
||||
|
||||
msg = "Concatenation requires list-likes containing only strings.*"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
# need to use outer and na_rep, as otherwise Index would not raise
|
||||
s.str.cat(t, join="outer", na_rep="-")
|
||||
|
||||
|
||||
def test_str_cat_mixed_inputs(index_or_series):
|
||||
box = index_or_series
|
||||
s = Index(["a", "b", "c", "d"])
|
||||
s = s if box == Index else Series(s, index=s)
|
||||
|
||||
t = Series(["A", "B", "C", "D"], index=s.values)
|
||||
d = concat([t, Series(s, index=s)], axis=1)
|
||||
|
||||
expected = Index(["aAa", "bBb", "cCc", "dDd"])
|
||||
expected = expected if box == Index else Series(expected.values, index=s.values)
|
||||
|
||||
# Series/Index with DataFrame
|
||||
result = s.str.cat(d)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with two-dimensional ndarray
|
||||
result = s.str.cat(d.values)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with list of Series
|
||||
result = s.str.cat([t, s])
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with mixed list of Series/array
|
||||
result = s.str.cat([t, s.values])
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with list of Series; different indexes
|
||||
t.index = ["b", "c", "d", "a"]
|
||||
expected = box(["aDa", "bAb", "cBc", "dCd"])
|
||||
expected = expected if box == Index else Series(expected.values, index=s.values)
|
||||
result = s.str.cat([t, s])
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with mixed list; different index
|
||||
result = s.str.cat([t, s.values])
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# Series/Index with DataFrame; different indexes
|
||||
d.index = ["b", "c", "d", "a"]
|
||||
expected = box(["aDd", "bAa", "cBb", "dCc"])
|
||||
expected = expected if box == Index else Series(expected.values, index=s.values)
|
||||
result = s.str.cat(d)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# errors for incorrect lengths
|
||||
rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
|
||||
z = Series(["1", "2", "3"])
|
||||
e = concat([z, z], axis=1)
|
||||
|
||||
# two-dimensional ndarray
|
||||
with pytest.raises(ValueError, match=rgx):
|
||||
s.str.cat(e.values)
|
||||
|
||||
# list of list-likes
|
||||
with pytest.raises(ValueError, match=rgx):
|
||||
s.str.cat([z.values, s.values])
|
||||
|
||||
# mixed list of Series/list-like
|
||||
with pytest.raises(ValueError, match=rgx):
|
||||
s.str.cat([z.values, s])
|
||||
|
||||
# errors for incorrect arguments in list-like
|
||||
rgx = "others must be Series, Index, DataFrame,.*"
|
||||
# make sure None/NaN do not crash checks in _get_series_list
|
||||
u = Series(["a", np.nan, "c", None])
|
||||
|
||||
# mix of string and Series
|
||||
with pytest.raises(TypeError, match=rgx):
|
||||
s.str.cat([u, "u"])
|
||||
|
||||
# DataFrame in list
|
||||
with pytest.raises(TypeError, match=rgx):
|
||||
s.str.cat([u, d])
|
||||
|
||||
# 2-dim ndarray in list
|
||||
with pytest.raises(TypeError, match=rgx):
|
||||
s.str.cat([u, d.values])
|
||||
|
||||
# nested lists
|
||||
with pytest.raises(TypeError, match=rgx):
|
||||
s.str.cat([u, [u, d]])
|
||||
|
||||
# forbidden input type: set
|
||||
# GH 23009
|
||||
with pytest.raises(TypeError, match=rgx):
|
||||
s.str.cat(set(u))
|
||||
|
||||
# forbidden input type: set in list
|
||||
# GH 23009
|
||||
with pytest.raises(TypeError, match=rgx):
|
||||
s.str.cat([u, set(u)])
|
||||
|
||||
# other forbidden input type, e.g. int
|
||||
with pytest.raises(TypeError, match=rgx):
|
||||
s.str.cat(1)
|
||||
|
||||
# nested list-likes
|
||||
with pytest.raises(TypeError, match=rgx):
|
||||
s.str.cat(iter([t.values, list(s)]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
|
||||
def test_str_cat_align_indexed(index_or_series, join):
|
||||
# https://github.com/pandas-dev/pandas/issues/18657
|
||||
box = index_or_series
|
||||
|
||||
s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"])
|
||||
t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"])
|
||||
sa, ta = s.align(t, join=join)
|
||||
# result after manual alignment of inputs
|
||||
expected = sa.str.cat(ta, na_rep="-")
|
||||
|
||||
if box == Index:
|
||||
s = Index(s)
|
||||
sa = Index(sa)
|
||||
expected = Index(expected)
|
||||
|
||||
result = s.str.cat(t, join=join, na_rep="-")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
|
||||
def test_str_cat_align_mixed_inputs(join):
|
||||
s = Series(["a", "b", "c", "d"])
|
||||
t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
|
||||
d = concat([t, t], axis=1)
|
||||
|
||||
expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"])
|
||||
expected = expected_outer.loc[s.index.join(t.index, how=join)]
|
||||
|
||||
# list of Series
|
||||
result = s.str.cat([t, t], join=join, na_rep="-")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# DataFrame
|
||||
result = s.str.cat(d, join=join, na_rep="-")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# mixed list of indexed/unindexed
|
||||
u = np.array(["A", "B", "C", "D"])
|
||||
expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
|
||||
# joint index of rhs [t, u]; u will be forced have index of s
|
||||
rhs_idx = (
|
||||
t.index.intersection(s.index)
|
||||
if join == "inner"
|
||||
else t.index.union(s.index)
|
||||
if join == "outer"
|
||||
else t.index.append(s.index.difference(t.index))
|
||||
)
|
||||
|
||||
expected = expected_outer.loc[s.index.join(rhs_idx, how=join)]
|
||||
result = s.str.cat([t, u], join=join, na_rep="-")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
with pytest.raises(TypeError, match="others must be Series,.*"):
|
||||
# nested lists are forbidden
|
||||
s.str.cat([t, list(u)], join=join)
|
||||
|
||||
# errors for incorrect lengths
|
||||
rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
|
||||
z = Series(["1", "2", "3"]).values
|
||||
|
||||
# unindexed object of wrong length
|
||||
with pytest.raises(ValueError, match=rgx):
|
||||
s.str.cat(z, join=join)
|
||||
|
||||
# unindexed object of wrong length in list
|
||||
with pytest.raises(ValueError, match=rgx):
|
||||
s.str.cat([t, z], join=join)
|
||||
|
||||
|
||||
def test_str_cat_all_na(index_or_series, index_or_series2):
|
||||
# GH 24044
|
||||
box = index_or_series
|
||||
other = index_or_series2
|
||||
|
||||
# check that all NaNs in caller / target work
|
||||
s = Index(["a", "b", "c", "d"])
|
||||
s = s if box == Index else Series(s, index=s)
|
||||
t = other([np.nan] * 4, dtype=object)
|
||||
# add index of s for alignment
|
||||
t = t if other == Index else Series(t, index=s)
|
||||
|
||||
# all-NA target
|
||||
if box == Series:
|
||||
expected = Series([np.nan] * 4, index=s.index, dtype=s.dtype)
|
||||
else: # box == Index
|
||||
# TODO: Strimg option, this should return string dtype
|
||||
expected = Index([np.nan] * 4, dtype=object)
|
||||
result = s.str.cat(t, join="left")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# all-NA caller (only for Series)
|
||||
if other == Series:
|
||||
expected = Series([np.nan] * 4, dtype=object, index=t.index)
|
||||
result = t.str.cat(s, join="left")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_str_cat_special_cases():
|
||||
s = Series(["a", "b", "c", "d"])
|
||||
t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
|
||||
|
||||
# iterator of elements with different types
|
||||
expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"])
|
||||
result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# right-align with different indexes in others
|
||||
expected = Series(["aa-", "d-d"], index=[0, 3])
|
||||
result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cat_on_filtered_index():
|
||||
df = DataFrame(
|
||||
index=MultiIndex.from_product(
|
||||
[[2011, 2012], [1, 2, 3]], names=["year", "month"]
|
||||
)
|
||||
)
|
||||
|
||||
df = df.reset_index()
|
||||
df = df[df.month > 1]
|
||||
|
||||
str_year = df.year.astype("str")
|
||||
str_month = df.month.astype("str")
|
||||
str_both = str_year.str.cat(str_month, sep=" ")
|
||||
|
||||
assert str_both.loc[1] == "2011 2"
|
||||
|
||||
str_multiple = str_year.str.cat([str_month, str_month], sep=" ")
|
||||
|
||||
assert str_multiple.loc[1] == "2011 2 2"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("klass", [tuple, list, np.array, Series, Index])
|
||||
def test_cat_different_classes(klass):
|
||||
# https://github.com/pandas-dev/pandas/issues/33425
|
||||
s = Series(["a", "b", "c"])
|
||||
result = s.str.cat(klass(["x", "y", "z"]))
|
||||
expected = Series(["ax", "by", "cz"])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cat_on_series_dot_str():
|
||||
# GH 28277
|
||||
ps = Series(["AbC", "de", "FGHI", "j", "kLLLm"])
|
||||
|
||||
message = re.escape(
|
||||
"others must be Series, Index, DataFrame, np.ndarray "
|
||||
"or list-like (either containing only strings or "
|
||||
"containing only objects of type Series/Index/"
|
||||
"np.ndarray[1-dim])"
|
||||
)
|
||||
with pytest.raises(TypeError, match=message):
|
||||
ps.str.cat(others=ps.str)
|
Reference in New Issue
Block a user