@@ -0,0 +1,616 @@
from datetime import (
    datetime,
    timezone,
)

import numpy as np
import pytest

from pandas._libs.tslibs import iNaT
from pandas.compat import (
    is_ci_environment,
    is_platform_windows,
)
from pandas.compat.numpy import np_version_lt1p23

import pandas as pd
import pandas._testing as tm
from pandas.core.interchange.column import PandasColumn
from pandas.core.interchange.dataframe_protocol import (
    ColumnNullType,
    DtypeKind,
)
from pandas.core.interchange.from_dataframe import from_dataframe
from pandas.core.interchange.utils import ArrowCTypes


@pytest.fixture
def data_categorical():
    return {
        "ordered": pd.Categorical(list("testdata") * 30, ordered=True),
        "unordered": pd.Categorical(list("testdata") * 30, ordered=False),
    }


@pytest.fixture
def string_data():
    return {
        "separator data": [
            "abC|DeF,Hik",
            "234,3245.67",
            "gSaf,qWer|Gre",
            "asd3,4sad|",
            np.nan,
        ]
    }


@pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)])
def test_categorical_dtype(data, data_categorical):
    df = pd.DataFrame({"A": (data_categorical[data[0]])})

    col = df.__dataframe__().get_column_by_name("A")
    assert col.dtype[0] == DtypeKind.CATEGORICAL
    assert col.null_count == 0
    assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
    assert col.num_chunks() == 1
    desc_cat = col.describe_categorical
    assert desc_cat["is_ordered"] == data[1]
    assert desc_cat["is_dictionary"] is True
    assert isinstance(desc_cat["categories"], PandasColumn)
    tm.assert_series_equal(
        desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"])
    )

    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


def test_categorical_pyarrow():
    # GH 49889
    pa = pytest.importorskip("pyarrow", "11.0.0")

    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
    table = pa.table({"weekday": pa.array(arr).dictionary_encode()})
    exchange_df = table.__dataframe__()
    result = from_dataframe(exchange_df)
    weekday = pd.Categorical(
        arr, categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    )
    expected = pd.DataFrame({"weekday": weekday})
    tm.assert_frame_equal(result, expected)


def test_empty_categorical_pyarrow():
    # https://github.com/pandas-dev/pandas/issues/53077
    pa = pytest.importorskip("pyarrow", "11.0.0")

    arr = [None]
    table = pa.table({"arr": pa.array(arr, "float64").dictionary_encode()})
    exchange_df = table.__dataframe__()
    result = pd.api.interchange.from_dataframe(exchange_df)
    expected = pd.DataFrame({"arr": pd.Categorical([np.nan])})
    tm.assert_frame_equal(result, expected)


def test_large_string_pyarrow():
    # GH 52795
    pa = pytest.importorskip("pyarrow", "11.0.0")

    arr = ["Mon", "Tue"]
    table = pa.table({"weekday": pa.array(arr, "large_string")})
    exchange_df = table.__dataframe__()
    result = from_dataframe(exchange_df)
    expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
    tm.assert_frame_equal(result, expected)

    # check round-trip
    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)


@pytest.mark.parametrize(
    ("offset", "length", "expected_values"),
    [
        (0, None, [3.3, float("nan"), 2.1]),
        (1, None, [float("nan"), 2.1]),
        (2, None, [2.1]),
        (0, 2, [3.3, float("nan")]),
        (0, 1, [3.3]),
        (1, 1, [float("nan")]),
    ],
)
def test_bitmasks_pyarrow(offset, length, expected_values):
    # GH 52795
    pa = pytest.importorskip("pyarrow", "11.0.0")

    arr = [3.3, None, 2.1]
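    # Slicing gives the exported column a non-zero offset, so the consumer has
    # to honor both the offset and the validity bitmask.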
    table = pa.table({"arr": arr}).slice(offset, length)
    exchange_df = table.__dataframe__()
    result = from_dataframe(exchange_df)
    expected = pd.DataFrame({"arr": expected_values})
    tm.assert_frame_equal(result, expected)

    # check round-trip
    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)


@pytest.mark.parametrize(
    "data",
    [
        lambda: np.random.default_rng(2).integers(-100, 100),
        lambda: np.random.default_rng(2).integers(1, 100),
        lambda: np.random.default_rng(2).random(),
        lambda: np.random.default_rng(2).choice([True, False]),
        lambda: datetime(
            year=np.random.default_rng(2).integers(1900, 2100),
            month=np.random.default_rng(2).integers(1, 12),
            day=np.random.default_rng(2).integers(1, 20),
        ),
    ],
)
def test_dataframe(data):
    NCOLS, NROWS = 10, 20
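    # The label expression rotates the names (col6..col10, then col1..col5), so
    # column_names() must preserve insertion order rather than sorted order.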
    data = {
        f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [data() for _ in range(NROWS)]
        for i in range(NCOLS)
    }
    df = pd.DataFrame(data)

    df2 = df.__dataframe__()

    assert df2.num_columns() == NCOLS
    assert df2.num_rows() == NROWS

    assert list(df2.column_names()) == list(data.keys())

    indices = (0, 2)
    names = tuple(list(data.keys())[idx] for idx in indices)

    result = from_dataframe(df2.select_columns(indices))
    expected = from_dataframe(df2.select_columns_by_name(names))
    tm.assert_frame_equal(result, expected)

    assert isinstance(result.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
    assert isinstance(expected.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)


def test_missing_from_masked():
    df = pd.DataFrame(
        {
            "x": np.array([1.0, 2.0, 3.0, 4.0, 0.0]),
            "y": np.array([1.5, 2.5, 3.5, 4.5, 0]),
            "z": np.array([1.0, 0.0, 1.0, 1.0, 1.0]),
        }
    )

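    # Inject a random number of nulls into random rows of each column, then
    # check that the interchange column reports exactly that null count.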
    rng = np.random.default_rng(2)
    dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns}
    for col, num_nulls in dict_null.items():
        null_idx = df.index[
            rng.choice(np.arange(len(df)), size=num_nulls, replace=False)
        ]
        df.loc[null_idx, col] = None

    df2 = df.__dataframe__()

    assert df2.get_column_by_name("x").null_count == dict_null["x"]
    assert df2.get_column_by_name("y").null_count == dict_null["y"]
    assert df2.get_column_by_name("z").null_count == dict_null["z"]


@pytest.mark.parametrize(
    "data",
    [
        {"x": [1.5, 2.5, 3.5], "y": [9.2, 10.5, 11.8]},
        {"x": [1, 2, 0], "y": [9.2, 10.5, 11.8]},
        {
            "x": np.array([True, True, False]),
            "y": np.array([1, 2, 0]),
            "z": np.array([9.2, 10.5, 11.8]),
        },
    ],
)
def test_mixed_data(data):
    df = pd.DataFrame(data)
    df2 = df.__dataframe__()

    for col_name in df.columns:
        assert df2.get_column_by_name(col_name).null_count == 0


def test_mixed_missing():
    df = pd.DataFrame(
        {
            "x": np.array([True, None, False, None, True]),
            "y": np.array([None, 2, None, 1, 2]),
            "z": np.array([9.2, 10.5, None, 11.8, None]),
        }
    )

    df2 = df.__dataframe__()

    for col_name in df.columns:
        assert df2.get_column_by_name(col_name).null_count == 2


def test_string(string_data):
    test_str_data = string_data["separator data"] + [""]
    df = pd.DataFrame({"A": test_str_data})
    col = df.__dataframe__().get_column_by_name("A")

    assert col.size() == 6
    assert col.null_count == 1
    assert col.dtype[0] == DtypeKind.STRING
    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)

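    # The same invariants should hold on a sliced view, which drops the first
    # row but keeps the NaN entry.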
    df_sliced = df[1:]
    col = df_sliced.__dataframe__().get_column_by_name("A")
    assert col.size() == 5
    assert col.null_count == 1
    assert col.dtype[0] == DtypeKind.STRING
    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)


def test_nonstring_object():
    df = pd.DataFrame({"A": ["a", 10, 1.0, ()]})
    col = df.__dataframe__().get_column_by_name("A")
    with pytest.raises(NotImplementedError, match="not supported yet"):
        col.dtype


def test_datetime():
    df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]})
    col = df.__dataframe__().get_column_by_name("A")

    assert col.size() == 2
    assert col.null_count == 1
    assert col.dtype[0] == DtypeKind.DATETIME
    assert col.describe_null == (ColumnNullType.USE_SENTINEL, iNaT)

    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


@pytest.mark.skipif(np_version_lt1p23, reason="NumPy >= 1.23 required")
def test_categorical_to_numpy_dlpack():
    # https://github.com/pandas-dev/pandas/issues/48393
    df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])})
    col = df.__dataframe__().get_column_by_name("A")
    result = np.from_dlpack(col.get_buffers()["data"][0])
    expected = np.array([0, 1, 0], dtype="int8")
    tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize("data", [{}, {"a": []}])
def test_empty_pyarrow(data):
    # GH 53155
    pytest.importorskip("pyarrow", "11.0.0")
    from pyarrow.interchange import from_dataframe as pa_from_dataframe

    expected = pd.DataFrame(data)
    arrow_df = pa_from_dataframe(expected)
    result = from_dataframe(arrow_df)
    tm.assert_frame_equal(result, expected, check_column_type=False)


def test_multi_chunk_pyarrow() -> None:
    pa = pytest.importorskip("pyarrow", "11.0.0")
    n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
    names = ["n_legs"]
    table = pa.table([n_legs], names=names)
    with pytest.raises(
        RuntimeError,
        match="Cannot do zero copy conversion into multi-column DataFrame block",
    ):
        pd.api.interchange.from_dataframe(table, allow_copy=False)


def test_multi_chunk_column() -> None:
    pytest.importorskip("pyarrow", "11.0.0")
    ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]")
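    # Concatenating the Series with itself produces a pyarrow-backed column
    # whose ChunkedArray holds two chunks, so a zero-copy export is impossible.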
    df = pd.concat([ser, ser], ignore_index=True).to_frame("a")
    df_orig = df.copy()
    with pytest.raises(
        RuntimeError, match="Found multi-chunk pyarrow array, but `allow_copy` is False"
    ):
        pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False))
    result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True))
    # Interchange protocol defaults to creating numpy-backed columns, so currently this
    # is 'float64'.
    expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64")
    tm.assert_frame_equal(result, expected)

    # Check that the rechunking we did didn't modify the original DataFrame.
    tm.assert_frame_equal(df, df_orig)
    assert len(df["a"].array._pa_array.chunks) == 2
    assert len(df_orig["a"].array._pa_array.chunks) == 2


def test_timestamp_ns_pyarrow():
    # GH 56712
    pytest.importorskip("pyarrow", "11.0.0")
    timestamp_args = {
        "year": 2000,
        "month": 1,
        "day": 1,
        "hour": 1,
        "minute": 1,
        "second": 1,
    }
    df = pd.Series(
        [datetime(**timestamp_args)],
        dtype="timestamp[ns][pyarrow]",
        name="col0",
    ).to_frame()

    dfi = df.__dataframe__()
    result = pd.api.interchange.from_dataframe(dfi)["col0"].item()

    expected = pd.Timestamp(**timestamp_args)
    assert result == expected


@pytest.mark.parametrize("tz", ["UTC", "US/Pacific"])
@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
def test_datetimetzdtype(tz, unit):
    # GH 54239
    tz_data = (
        pd.date_range("2018-01-01", periods=5, freq="D").tz_localize(tz).as_unit(unit)
    )
    df = pd.DataFrame({"ts_tz": tz_data})
    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


def test_interchange_from_non_pandas_tz_aware(request):
    # GH 54239, 54287
    pa = pytest.importorskip("pyarrow", "11.0.0")
    import pyarrow.compute as pc

    if is_platform_windows() and is_ci_environment():
        mark = pytest.mark.xfail(
            raises=pa.ArrowInvalid,
            reason=(
                "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
                "on CI to path to the tzdata for pyarrow."
            ),
        )
        request.applymarker(mark)

    arr = pa.array([datetime(2020, 1, 1), None, datetime(2020, 1, 2)])
    arr = pc.assume_timezone(arr, "Asia/Kathmandu")
    table = pa.table({"arr": arr})
    exchange_df = table.__dataframe__()
    result = from_dataframe(exchange_df)

    expected = pd.DataFrame(
        ["2020-01-01 00:00:00+05:45", "NaT", "2020-01-02 00:00:00+05:45"],
        columns=["arr"],
        dtype="datetime64[us, Asia/Kathmandu]",
    )
    tm.assert_frame_equal(expected, result)


def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
    # https://github.com/pandas-dev/pandas/issues/54781
    df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()
    interchange = df.__dataframe__()
    column = interchange.get_column_by_name("a")
    buffers = column.get_buffers()
    buffers_data = buffers["data"]
    buffer_dtype = buffers_data[1]
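    # Swap in a buffer dtype describing the physical uint8 bytes instead of the
    # logical string column, mimicking producers that report buffer-level dtypes.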
    buffer_dtype = (
        DtypeKind.UINT,
        8,
        ArrowCTypes.UINT8,
        buffer_dtype[3],
    )
    buffers["data"] = (buffers_data[0], buffer_dtype)
    column.get_buffers = lambda: buffers
    interchange.get_column_by_name = lambda _: column
    monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
    pd.api.interchange.from_dataframe(df)


def test_empty_string_column():
    # https://github.com/pandas-dev/pandas/issues/56703
    df = pd.DataFrame({"a": []}, dtype=str)
    df2 = df.__dataframe__()
    result = pd.api.interchange.from_dataframe(df2)
    tm.assert_frame_equal(df, result)


def test_large_string():
    # GH#56702
    pytest.importorskip("pyarrow")
    df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
    result = pd.api.interchange.from_dataframe(df.__dataframe__())
    expected = pd.DataFrame({"a": ["x"]}, dtype="str")
    tm.assert_frame_equal(result, expected)


def test_non_str_names():
    # https://github.com/pandas-dev/pandas/issues/56701
    df = pd.Series([1, 2, 3], name=0).to_frame()
    names = df.__dataframe__().column_names()
    assert names == ["0"]


def test_non_str_names_w_duplicates():
    # https://github.com/pandas-dev/pandas/issues/56701
    df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
    dfi = df.__dataframe__()
    with pytest.raises(
        TypeError,
        match=(
            "Expected a Series, got a DataFrame. This likely happened because you "
            "called __dataframe__ on a DataFrame which, after converting column "
            r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
            r"dtype='(str|object)'\). Please rename these columns before using the "
            "interchange protocol."
        ),
    ):
        pd.api.interchange.from_dataframe(dfi, allow_copy=False)


@pytest.mark.parametrize(
    ("data", "dtype", "expected_dtype"),
    [
        ([1, 2, None], "Int64", "int64"),
        ([1, 2, None], "Int64[pyarrow]", "int64"),
        ([1, 2, None], "Int8", "int8"),
        ([1, 2, None], "Int8[pyarrow]", "int8"),
        (
            [1, 2, None],
            "UInt64",
            "uint64",
        ),
        (
            [1, 2, None],
            "UInt64[pyarrow]",
            "uint64",
        ),
        ([1.0, 2.25, None], "Float32", "float32"),
        ([1.0, 2.25, None], "Float32[pyarrow]", "float32"),
        ([True, False, None], "boolean", "bool"),
        ([True, False, None], "boolean[pyarrow]", "bool"),
        (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"),
        (["much ado", "about", None], "string[pyarrow]", "large_string"),
        (
            [datetime(2020, 1, 1), datetime(2020, 1, 2), None],
            "timestamp[ns][pyarrow]",
            "timestamp[ns]",
        ),
        (
            [datetime(2020, 1, 1), datetime(2020, 1, 2), None],
            "timestamp[us][pyarrow]",
            "timestamp[us]",
        ),
        (
            [
                datetime(2020, 1, 1, tzinfo=timezone.utc),
                datetime(2020, 1, 2, tzinfo=timezone.utc),
                None,
            ],
            "timestamp[us, Asia/Kathmandu][pyarrow]",
            "timestamp[us, tz=Asia/Kathmandu]",
        ),
    ],
)
def test_pandas_nullable_with_missing_values(
    data: list, dtype: str, expected_dtype: str
) -> None:
    # https://github.com/pandas-dev/pandas/issues/57643
    # https://github.com/pandas-dev/pandas/issues/57664
    pa = pytest.importorskip("pyarrow", "11.0.0")
    import pyarrow.interchange as pai

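    # pyarrow DataType equality can be checked against plain type-name strings,
    # but the tz-qualified timestamp repr apparently has no such alias, so
    # construct that pyarrow type explicitly.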
    if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]":
        expected_dtype = pa.timestamp("us", "Asia/Kathmandu")

    df = pd.DataFrame({"a": data}, dtype=dtype)
    result = pai.from_dataframe(df.__dataframe__())["a"]
    assert result.type == expected_dtype
    assert result[0].as_py() == data[0]
    assert result[1].as_py() == data[1]
    assert result[2].as_py() is None


@pytest.mark.parametrize(
    ("data", "dtype", "expected_dtype"),
    [
        ([1, 2, 3], "Int64", "int64"),
        ([1, 2, 3], "Int64[pyarrow]", "int64"),
        ([1, 2, 3], "Int8", "int8"),
        ([1, 2, 3], "Int8[pyarrow]", "int8"),
        (
            [1, 2, 3],
            "UInt64",
            "uint64",
        ),
        (
            [1, 2, 3],
            "UInt64[pyarrow]",
            "uint64",
        ),
        ([1.0, 2.25, 5.0], "Float32", "float32"),
        ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"),
        ([True, False, False], "boolean", "bool"),
        ([True, False, False], "boolean[pyarrow]", "bool"),
        (
            ["much ado", "about", "nothing"],
            pd.StringDtype(na_value=np.nan),
            "large_string",
        ),
        (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"),
        (
            [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
            "timestamp[ns][pyarrow]",
            "timestamp[ns]",
        ),
        (
            [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
            "timestamp[us][pyarrow]",
            "timestamp[us]",
        ),
        (
            [
                datetime(2020, 1, 1, tzinfo=timezone.utc),
                datetime(2020, 1, 2, tzinfo=timezone.utc),
                datetime(2020, 1, 3, tzinfo=timezone.utc),
            ],
            "timestamp[us, Asia/Kathmandu][pyarrow]",
            "timestamp[us, tz=Asia/Kathmandu]",
        ),
    ],
)
def test_pandas_nullable_without_missing_values(
    data: list, dtype: str, expected_dtype: str
) -> None:
    # https://github.com/pandas-dev/pandas/issues/57643
    pa = pytest.importorskip("pyarrow", "11.0.0")
    import pyarrow.interchange as pai

    if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]":
        expected_dtype = pa.timestamp("us", "Asia/Kathmandu")

    df = pd.DataFrame({"a": data}, dtype=dtype)
    result = pai.from_dataframe(df.__dataframe__())["a"]
    assert result.type == expected_dtype
    assert result[0].as_py() == data[0]
    assert result[1].as_py() == data[1]
    assert result[2].as_py() == data[2]


def test_string_validity_buffer() -> None:
    # https://github.com/pandas-dev/pandas/issues/57761
    pytest.importorskip("pyarrow", "11.0.0")
    df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
    result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
    assert result is None


def test_string_validity_buffer_no_missing() -> None:
    # https://github.com/pandas-dev/pandas/issues/57762
    pytest.importorskip("pyarrow", "11.0.0")
    df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]")
    validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
    assert validity is not None
    result = validity[1]
    expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=")
    assert result == expected


def test_empty_dataframe():
    # https://github.com/pandas-dev/pandas/issues/56700
    df = pd.DataFrame({"a": []}, dtype="int8")
    dfi = df.__dataframe__()
    result = pd.api.interchange.from_dataframe(dfi, allow_copy=False)
    expected = pd.DataFrame({"a": []}, dtype="int8")
    tm.assert_frame_equal(result, expected)


def test_from_dataframe_list_dtype():
    pa = pytest.importorskip("pyarrow", "14.0.0")
    data = {"a": [[1, 2], [4, 5, 6]]}
    tbl = pa.table(data)
    result = from_dataframe(tbl)
    expected = pd.DataFrame(data)
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,175 @@
| """ | ||||
| A verbatim copy (vendored) of the spec tests. | ||||
| Taken from https://github.com/data-apis/dataframe-api | ||||
| """ | ||||
| import ctypes | ||||
| import math | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def df_from_dict(): | ||||
|     def maker(dct, is_categorical=False): | ||||
|         df = pd.DataFrame(dct) | ||||
|         return df.astype("category") if is_categorical else df | ||||
|  | ||||
|     return maker | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "test_data", | ||||
|     [ | ||||
|         {"a": ["foo", "bar"], "b": ["baz", "qux"]}, | ||||
|         {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, | ||||
|         {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, | ||||
|     ], | ||||
|     ids=["str_data", "float_data", "int_data"], | ||||
| ) | ||||
| def test_only_one_dtype(test_data, df_from_dict): | ||||
|     columns = list(test_data.keys()) | ||||
|     df = df_from_dict(test_data) | ||||
|     dfX = df.__dataframe__() | ||||
|  | ||||
|     column_size = len(test_data[columns[0]]) | ||||
|     for column in columns: | ||||
|         null_count = dfX.get_column_by_name(column).null_count | ||||
|         assert null_count == 0 | ||||
|         assert isinstance(null_count, int) | ||||
|         assert dfX.get_column_by_name(column).size() == column_size | ||||
|         assert dfX.get_column_by_name(column).offset == 0 | ||||
|  | ||||
|  | ||||
| def test_mixed_dtypes(df_from_dict): | ||||
|     df = df_from_dict( | ||||
|         { | ||||
|             "a": [1, 2, 3],  # dtype kind INT = 0 | ||||
|             "b": [3, 4, 5],  # dtype kind INT = 0 | ||||
|             "c": [1.5, 2.5, 3.5],  # dtype kind FLOAT = 2 | ||||
|             "d": [9, 10, 11],  # dtype kind INT = 0 | ||||
|             "e": [True, False, True],  # dtype kind BOOLEAN = 20 | ||||
|             "f": ["a", "", "c"],  # dtype kind STRING = 21 | ||||
|         } | ||||
|     ) | ||||
|     dfX = df.__dataframe__() | ||||
|     # for meanings of dtype[0] see the spec; we cannot import the spec here as this | ||||
|     # file is expected to be vendored *anywhere*; | ||||
|     # values for dtype[0] are explained above | ||||
|     columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} | ||||
|  | ||||
|     for column, kind in columns.items(): | ||||
|         colX = dfX.get_column_by_name(column) | ||||
|         assert colX.null_count == 0 | ||||
|         assert isinstance(colX.null_count, int) | ||||
|         assert colX.size() == 3 | ||||
|         assert colX.offset == 0 | ||||
|  | ||||
|         assert colX.dtype[0] == kind | ||||
|  | ||||
|     assert dfX.get_column_by_name("c").dtype[1] == 64 | ||||
|  | ||||
|  | ||||
| def test_na_float(df_from_dict): | ||||
|     df = df_from_dict({"a": [1.0, math.nan, 2.0]}) | ||||
|     dfX = df.__dataframe__() | ||||
|     colX = dfX.get_column_by_name("a") | ||||
|     assert colX.null_count == 1 | ||||
|     assert isinstance(colX.null_count, int) | ||||
|  | ||||
|  | ||||
| def test_noncategorical(df_from_dict): | ||||
|     df = df_from_dict({"a": [1, 2, 3]}) | ||||
|     dfX = df.__dataframe__() | ||||
|     colX = dfX.get_column_by_name("a") | ||||
|     with pytest.raises(TypeError, match=".*categorical.*"): | ||||
|         colX.describe_categorical | ||||
|  | ||||
|  | ||||
| def test_categorical(df_from_dict): | ||||
|     df = df_from_dict( | ||||
|         {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]}, | ||||
|         is_categorical=True, | ||||
|     ) | ||||
|  | ||||
|     colX = df.__dataframe__().get_column_by_name("weekday") | ||||
|     categorical = colX.describe_categorical | ||||
|     assert isinstance(categorical["is_ordered"], bool) | ||||
|     assert isinstance(categorical["is_dictionary"], bool) | ||||
|  | ||||
|  | ||||
| def test_dataframe(df_from_dict): | ||||
|     df = df_from_dict( | ||||
|         {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]} | ||||
|     ) | ||||
|     dfX = df.__dataframe__() | ||||
|  | ||||
|     assert dfX.num_columns() == 3 | ||||
|     assert dfX.num_rows() == 3 | ||||
|     assert dfX.num_chunks() == 1 | ||||
|     assert list(dfX.column_names()) == ["x", "y", "z"] | ||||
|     assert list(dfX.select_columns((0, 2)).column_names()) == list( | ||||
|         dfX.select_columns_by_name(("x", "z")).column_names() | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) | ||||
| def test_df_get_chunks(size, n_chunks, df_from_dict): | ||||
|     df = df_from_dict({"x": list(range(size))}) | ||||
|     dfX = df.__dataframe__() | ||||
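    # (10, 3) and (12, 5) do not divide evenly, so chunk sizes may differ;
    # only the chunk count and the total row count are guaranteed.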
    chunks = list(dfX.get_chunks(n_chunks))
    assert len(chunks) == n_chunks
    assert sum(chunk.num_rows() for chunk in chunks) == size


@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
def test_column_get_chunks(size, n_chunks, df_from_dict):
    df = df_from_dict({"x": list(range(size))})
    dfX = df.__dataframe__()
    chunks = list(dfX.get_column(0).get_chunks(n_chunks))
    assert len(chunks) == n_chunks
    assert sum(chunk.size() for chunk in chunks) == size


def test_get_columns(df_from_dict):
    df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]})
    dfX = df.__dataframe__()
    for colX in dfX.get_columns():
        assert colX.size() == 2
        assert colX.num_chunks() == 1
    # for meanings of dtype[0] see the spec; we cannot import the spec here as this
    # file is expected to be vendored *anywhere*
    assert dfX.get_column(0).dtype[0] == 0  # INT
    assert dfX.get_column(1).dtype[0] == 2  # FLOAT


def test_buffer(df_from_dict):
    arr = [0, 1, -1]
    df = df_from_dict({"a": arr})
    dfX = df.__dataframe__()
    colX = dfX.get_column(0)
    bufX = colX.get_buffers()

    dataBuf, dataDtype = bufX["data"]

    assert dataBuf.bufsize > 0
    assert dataBuf.ptr != 0
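    # __dlpack_device__ returns a (device_type, device_id) pair; a device type
    # of 1 means CPU in the DLPack spec, so the buffer memory is readable below.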
    device, _ = dataBuf.__dlpack_device__()

    # for meanings of dtype[0] see the spec; we cannot import the spec here as this
    # file is expected to be vendored *anywhere*
    assert dataDtype[0] == 0  # INT

    if device == 1:  # CPU-only as we're going to directly read memory here
        bitwidth = dataDtype[1]
        ctype = {
            8: ctypes.c_int8,
            16: ctypes.c_int16,
            32: ctypes.c_int32,
            64: ctypes.c_int64,
        }[bitwidth]

        for idx, truth in enumerate(arr):
            val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value
            assert val == truth, f"Buffer at index {idx} mismatch"
@@ -0,0 +1,89 @@
import numpy as np
import pytest

import pandas as pd
from pandas.core.interchange.utils import dtype_to_arrow_c_fmt

# TODO: use ArrowSchema to get the reference C-string.
# At the time of writing, there is no way to access an ArrowSchema holding a type
# format string from Python. The only way to access it is to export the structure
# to a C pointer; see the DataType._export_to_c() method defined in
# https://github.com/apache/arrow/blob/master/python/pyarrow/types.pxi

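# The expected C-format strings below are defined by the Arrow C data interface:
# https://arrow.apache.org/docs/format/CDataInterface.html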

@pytest.mark.parametrize(
    "pandas_dtype, c_string",
    [
        (np.dtype("bool"), "b"),
        (np.dtype("int8"), "c"),
        (np.dtype("uint8"), "C"),
        (np.dtype("int16"), "s"),
        (np.dtype("uint16"), "S"),
        (np.dtype("int32"), "i"),
        (np.dtype("uint32"), "I"),
        (np.dtype("int64"), "l"),
        (np.dtype("uint64"), "L"),
        (np.dtype("float16"), "e"),
        (np.dtype("float32"), "f"),
        (np.dtype("float64"), "g"),
        (pd.Series(["a"]).dtype, "u"),
        (
            pd.Series([0]).astype("datetime64[ns]").dtype,
            "tsn:",
        ),
        (pd.CategoricalDtype(["a"]), "l"),
        (np.dtype("O"), "u"),
    ],
)
def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string):  # PR01
    """Test ``dtype_to_arrow_c_fmt`` utility function."""
    assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string


@pytest.mark.parametrize(
    "pa_dtype, args_kwargs, c_string",
    [
        ["null", {}, "n"],
        ["bool_", {}, "b"],
        ["uint8", {}, "C"],
        ["uint16", {}, "S"],
        ["uint32", {}, "I"],
        ["uint64", {}, "L"],
        ["int8", {}, "c"],
|         ["int16", {}, "S"], | ||||
|         ["int32", {}, "i"], | ||||
|         ["int64", {}, "l"], | ||||
|         ["float16", {}, "e"], | ||||
|         ["float32", {}, "f"], | ||||
|         ["float64", {}, "g"], | ||||
|         ["string", {}, "u"], | ||||
|         ["binary", {}, "z"], | ||||
|         ["time32", ("s",), "tts"], | ||||
|         ["time32", ("ms",), "ttm"], | ||||
|         ["time64", ("us",), "ttu"], | ||||
|         ["time64", ("ns",), "ttn"], | ||||
|         ["date32", {}, "tdD"], | ||||
|         ["date64", {}, "tdm"], | ||||
|         ["timestamp", {"unit": "s"}, "tss:"], | ||||
|         ["timestamp", {"unit": "ms"}, "tsm:"], | ||||
|         ["timestamp", {"unit": "us"}, "tsu:"], | ||||
|         ["timestamp", {"unit": "ns"}, "tsn:"], | ||||
|         ["timestamp", {"unit": "ns", "tz": "UTC"}, "tsn:UTC"], | ||||
|         ["duration", ("s",), "tDs"], | ||||
|         ["duration", ("ms",), "tDm"], | ||||
|         ["duration", ("us",), "tDu"], | ||||
|         ["duration", ("ns",), "tDn"], | ||||
|         ["decimal128", {"precision": 4, "scale": 2}, "d:4,2"], | ||||
|     ], | ||||
| ) | ||||
| def test_dtype_to_arrow_c_fmt_arrowdtype(pa_dtype, args_kwargs, c_string): | ||||
|     # GH 52323 | ||||
|     pa = pytest.importorskip("pyarrow") | ||||
|     if not args_kwargs: | ||||
|         pa_type = getattr(pa, pa_dtype)() | ||||
|     elif isinstance(args_kwargs, tuple): | ||||
|         pa_type = getattr(pa, pa_dtype)(*args_kwargs) | ||||
|     else: | ||||
|         pa_type = getattr(pa, pa_dtype)(**args_kwargs) | ||||
|     arrow_type = pd.ArrowDtype(pa_type) | ||||
|     assert dtype_to_arrow_c_fmt(arrow_type) == c_string | ||||