2025-09-07 22:09:54 +02:00
parent e1b817252c
commit 2fc0d000b6
7796 changed files with 2159515 additions and 933 deletions


@@ -0,0 +1,136 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
from pandas.core.interchange.dataframe_protocol import (
Buffer,
DlpackDeviceType,
)
if TYPE_CHECKING:
import numpy as np
import pyarrow as pa
class PandasBuffer(Buffer):
"""
Data in the buffer is guaranteed to be contiguous in memory.
"""
def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None:
"""
Handle only regular columns (= numpy arrays) for now.
"""
if x.strides[0] and not x.strides == (x.dtype.itemsize,):
# The protocol does not support strided buffers, so a copy is
# necessary. If that's not allowed, we need to raise an exception.
if allow_copy:
x = x.copy()
else:
raise RuntimeError(
"Exports cannot be zero-copy in the case "
"of a non-contiguous buffer"
)
# Store the numpy array in which the data resides as a private
# attribute, so we can use it to retrieve the public attributes
self._x = x
@property
def bufsize(self) -> int:
"""
Buffer size in bytes.
"""
return self._x.size * self._x.dtype.itemsize
@property
def ptr(self) -> int:
"""
Pointer to start of the buffer as an integer.
"""
return self._x.__array_interface__["data"][0]
def __dlpack__(self) -> Any:
"""
Represent this structure as DLPack interface.
"""
return self._x.__dlpack__()
def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
"""
Device type and device ID for where the data in the buffer resides.
"""
return (DlpackDeviceType.CPU, None)
def __repr__(self) -> str:
return (
"PandasBuffer("
+ str(
{
"bufsize": self.bufsize,
"ptr": self.ptr,
"device": self.__dlpack_device__()[0].name,
}
)
+ ")"
)
class PandasBufferPyarrow(Buffer):
"""
Data in the buffer is guaranteed to be contiguous in memory.
"""
def __init__(
self,
buffer: pa.Buffer,
*,
length: int,
) -> None:
"""
Handle pyarrow chunked arrays.
"""
self._buffer = buffer
self._length = length
@property
def bufsize(self) -> int:
"""
Buffer size in bytes.
"""
return self._buffer.size
@property
def ptr(self) -> int:
"""
Pointer to start of the buffer as an integer.
"""
return self._buffer.address
def __dlpack__(self) -> Any:
"""
Represent this structure as DLPack interface.
"""
raise NotImplementedError()
def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
"""
Device type and device ID for where the data in the buffer resides.
"""
return (DlpackDeviceType.CPU, None)
def __repr__(self) -> str:
return (
"PandasBuffer[pyarrow]("
+ str(
{
"bufsize": self.bufsize,
"ptr": self.ptr,
"device": "CPU",
}
)
+ ")"
)
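
For orientation, a minimal usage sketch of the NumPy-backed buffer above, assuming a contiguous int64 array; the internal import path is the one used elsewhere in this commit, and the variable names are illustrative:

import numpy as np
from pandas.core.interchange.buffer import PandasBuffer

arr = np.arange(5, dtype="int64")            # contiguous, so no copy is required
buf = PandasBuffer(arr, allow_copy=False)
print(buf.bufsize)                           # 40 == 5 elements * 8 bytes
print(buf.ptr == arr.__array_interface__["data"][0])   # True
print(buf.__dlpack_device__())               # (<DlpackDeviceType.CPU: 1>, None)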


@@ -0,0 +1,461 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._libs.lib import infer_dtype
from pandas._libs.tslibs import iNaT
from pandas.errors import NoBufferPresent
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.dtypes import BaseMaskedDtype
import pandas as pd
from pandas import (
ArrowDtype,
DatetimeTZDtype,
)
from pandas.api.types import is_string_dtype
from pandas.core.interchange.buffer import (
PandasBuffer,
PandasBufferPyarrow,
)
from pandas.core.interchange.dataframe_protocol import (
Column,
ColumnBuffers,
ColumnNullType,
DtypeKind,
)
from pandas.core.interchange.utils import (
ArrowCTypes,
Endianness,
dtype_to_arrow_c_fmt,
)
if TYPE_CHECKING:
from pandas.core.interchange.dataframe_protocol import Buffer
_NP_KINDS = {
"i": DtypeKind.INT,
"u": DtypeKind.UINT,
"f": DtypeKind.FLOAT,
"b": DtypeKind.BOOL,
"U": DtypeKind.STRING,
"M": DtypeKind.DATETIME,
"m": DtypeKind.DATETIME,
}
_NULL_DESCRIPTION = {
DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None),
DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT),
DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None),
DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None),
DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None),
# Null values for categoricals are stored as `-1` sentinel values
# in the category data (e.g., `col.values.codes` is an int8 np.ndarray)
DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1),
# follow Arrow in using 1 as valid value and 0 for missing/null value
DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0),
}
_NO_VALIDITY_BUFFER = {
ColumnNullType.NON_NULLABLE: "This column is non-nullable",
ColumnNullType.USE_NAN: "This column uses NaN as null",
ColumnNullType.USE_SENTINEL: "This column uses a sentinel value",
}
class PandasColumn(Column):
"""
A column object, with only the methods and properties required by the
interchange protocol defined.
A column can contain one or more chunks. Each chunk can contain up to three
buffers - a data buffer, a mask buffer (depending on null representation),
and an offsets buffer (if variable-size binary; e.g., variable-length
strings).
Note: this Column object can only be produced by ``__dataframe__``, so
doesn't need its own version or ``__column__`` protocol.
"""
def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
"""
Note: doesn't deal with extension arrays yet, just assume a regular
Series/ndarray for now.
"""
if isinstance(column, pd.DataFrame):
raise TypeError(
"Expected a Series, got a DataFrame. This likely happened "
"because you called __dataframe__ on a DataFrame which, "
"after converting column names to string, resulted in duplicated "
f"names: {column.columns}. Please rename these columns before "
"using the interchange protocol."
)
if not isinstance(column, pd.Series):
raise NotImplementedError(f"Columns of type {type(column)} not handled yet")
# Store the column as a private attribute
self._col = column
self._allow_copy = allow_copy
def size(self) -> int:
"""
Size of the column, in elements.
"""
return self._col.size
@property
def offset(self) -> int:
"""
Offset of first element. Always zero.
"""
# TODO: chunks are implemented now, probably this should return something
return 0
@cache_readonly
def dtype(self) -> tuple[DtypeKind, int, str, str]:
dtype = self._col.dtype
if isinstance(dtype, pd.CategoricalDtype):
codes = self._col.values.codes
(
_,
bitwidth,
c_arrow_dtype_f_str,
_,
) = self._dtype_from_pandasdtype(codes.dtype)
return (
DtypeKind.CATEGORICAL,
bitwidth,
c_arrow_dtype_f_str,
Endianness.NATIVE,
)
elif is_string_dtype(dtype):
if infer_dtype(self._col) in ("string", "empty"):
return (
DtypeKind.STRING,
8,
dtype_to_arrow_c_fmt(dtype),
Endianness.NATIVE,
)
raise NotImplementedError("Non-string object dtypes are not supported yet")
else:
return self._dtype_from_pandasdtype(dtype)
def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
"""
See `self.dtype` for details.
"""
# Note: 'c' (complex) not handled yet (not in array spec v1).
# 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
# datetime and timedelta both map to datetime (is timedelta handled?)
kind = _NP_KINDS.get(dtype.kind, None)
if kind is None:
# Not a NumPy dtype. Check if it's a categorical maybe
raise ValueError(f"Data type {dtype} not supported by interchange protocol")
if isinstance(dtype, ArrowDtype):
byteorder = dtype.numpy_dtype.byteorder
elif isinstance(dtype, DatetimeTZDtype):
byteorder = dtype.base.byteorder # type: ignore[union-attr]
elif isinstance(dtype, BaseMaskedDtype):
byteorder = dtype.numpy_dtype.byteorder
else:
byteorder = dtype.byteorder
if dtype == "bool[pyarrow]":
# return early to avoid the `* 8` below, as this is a bitmask
# rather than a bytemask
return (
kind,
dtype.itemsize, # pyright: ignore[reportGeneralTypeIssues]
ArrowCTypes.BOOL,
byteorder,
)
return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder
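# Worked example (illustrative): a plain np.dtype("int64") column yields
# (DtypeKind.INT, 64, ArrowCTypes.INT64, "=") -- kind INT, 64 bits wide, Arrow
# format string "l", native byte order. The "bool[pyarrow]" case returns early
# so its item size is not multiplied by 8, because its data buffer is a bitmask.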
@property
def describe_categorical(self):
"""
If the dtype is categorical, there are two options:
- There are only values in the data buffer.
- There is a separate non-categorical Column encoding for categorical values.
Raises TypeError if the dtype is not categorical
Content of returned dict:
- "is_ordered" : bool, whether the ordering of dictionary indices is
semantically meaningful.
- "is_dictionary" : bool, whether a dictionary-style mapping of
categorical values to other objects exists
- "categories" : Column representing the (implicit) mapping of indices to
category values (e.g. an array of cat1, cat2, ...).
None if not a dictionary-style categorical.
"""
if not self.dtype[0] == DtypeKind.CATEGORICAL:
raise TypeError(
"describe_categorical only works on a column with categorical dtype!"
)
return {
"is_ordered": self._col.cat.ordered,
"is_dictionary": True,
"categories": PandasColumn(pd.Series(self._col.cat.categories)),
}
@property
def describe_null(self):
if isinstance(self._col.dtype, BaseMaskedDtype):
column_null_dtype = ColumnNullType.USE_BYTEMASK
null_value = 1
return column_null_dtype, null_value
if isinstance(self._col.dtype, ArrowDtype):
# We already rechunk (if necessary / allowed) upon initialization, so this
# is already single-chunk by the time we get here.
if self._col.array._pa_array.chunks[0].buffers()[0] is None: # type: ignore[attr-defined]
return ColumnNullType.NON_NULLABLE, None
return ColumnNullType.USE_BITMASK, 0
kind = self.dtype[0]
try:
null, value = _NULL_DESCRIPTION[kind]
except KeyError:
raise NotImplementedError(f"Data type {kind} not yet supported")
return null, value
@cache_readonly
def null_count(self) -> int:
"""
Number of null elements. Should always be known.
"""
return self._col.isna().sum().item()
@property
def metadata(self) -> dict[str, pd.Index]:
"""
Store specific metadata of the column.
"""
return {"pandas.index": self._col.index}
def num_chunks(self) -> int:
"""
Return the number of chunks the column consists of.
"""
return 1
def get_chunks(self, n_chunks: int | None = None):
"""
Return an iterator yielding the chunks.
See `DataFrame.get_chunks` for details on ``n_chunks``.
"""
if n_chunks and n_chunks > 1:
size = len(self._col)
step = size // n_chunks
if size % n_chunks != 0:
step += 1
for start in range(0, step * n_chunks, step):
yield PandasColumn(
self._col.iloc[start : start + step], self._allow_copy
)
else:
yield self
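# Worked example (illustrative): for a column of 10 elements and n_chunks=3,
# step = 10 // 3 == 3, then step += 1 because 10 % 3 != 0, so step == 4 and the
# loop yields chunks covering rows 0:4, 4:8 and 8:12 (the last slice is
# truncated by .iloc to the 2 remaining rows).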
def get_buffers(self) -> ColumnBuffers:
"""
Return a dictionary containing the underlying buffers.
The returned dictionary has the following contents:
- "data": a two-element tuple whose first element is a buffer
containing the data and whose second element is the data
buffer's associated dtype.
- "validity": a two-element tuple whose first element is a buffer
containing mask values indicating missing data and
whose second element is the mask value buffer's
associated dtype. None if the null representation is
not a bit or byte mask.
- "offsets": a two-element tuple whose first element is a buffer
containing the offset values for variable-size binary
data (e.g., variable-length strings) and whose second
element is the offsets buffer's associated dtype. None
if the data buffer does not have an associated offsets
buffer.
"""
buffers: ColumnBuffers = {
"data": self._get_data_buffer(),
"validity": None,
"offsets": None,
}
try:
buffers["validity"] = self._get_validity_buffer()
except NoBufferPresent:
pass
try:
buffers["offsets"] = self._get_offsets_buffer()
except NoBufferPresent:
pass
return buffers
def _get_data_buffer(
self,
) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]:
"""
Return the buffer containing the data and the buffer's associated dtype.
"""
buffer: Buffer
if self.dtype[0] in (
DtypeKind.INT,
DtypeKind.UINT,
DtypeKind.FLOAT,
DtypeKind.BOOL,
DtypeKind.DATETIME,
):
# self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make
# it longer than 4 characters
dtype = self.dtype
if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4:
np_arr = self._col.dt.tz_convert(None).to_numpy()
else:
arr = self._col.array
if isinstance(self._col.dtype, BaseMaskedDtype):
np_arr = arr._data # type: ignore[attr-defined]
elif isinstance(self._col.dtype, ArrowDtype):
# We already rechunk (if necessary / allowed) upon initialization,
# so this is already single-chunk by the time we get here.
arr = arr._pa_array.chunks[0] # type: ignore[attr-defined]
buffer = PandasBufferPyarrow(
arr.buffers()[1], # type: ignore[attr-defined]
length=len(arr),
)
return buffer, dtype
else:
np_arr = arr._ndarray # type: ignore[attr-defined]
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
elif self.dtype[0] == DtypeKind.CATEGORICAL:
codes = self._col.values._codes
buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
dtype = self._dtype_from_pandasdtype(codes.dtype)
elif self.dtype[0] == DtypeKind.STRING:
# Marshal the strings from a NumPy object array into a byte array
buf = self._col.to_numpy()
b = bytearray()
# TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
for obj in buf:
if isinstance(obj, str):
b.extend(obj.encode(encoding="utf-8"))
# Convert the byte array to a Pandas "buffer" using
# a NumPy array as the backing store
buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))
# Define the dtype for the returned buffer
# TODO: this will need correcting
# https://github.com/pandas-dev/pandas/issues/54781
dtype = self.dtype
else:
raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
return buffer, dtype
def _get_validity_buffer(self) -> tuple[Buffer, Any] | None:
"""
Return the buffer containing the mask values indicating missing data and
the buffer's associated dtype.
Raises NoBufferPresent if null representation is not a bit or byte mask.
"""
null, invalid = self.describe_null
buffer: Buffer
if isinstance(self._col.dtype, ArrowDtype):
# We already rechunk (if necessary / allowed) upon initialization, so this
# is already single-chunk by the time we get here.
arr = self._col.array._pa_array.chunks[0] # type: ignore[attr-defined]
dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE)
if arr.buffers()[0] is None:
return None
buffer = PandasBufferPyarrow(
arr.buffers()[0],
length=len(arr),
)
return buffer, dtype
if isinstance(self._col.dtype, BaseMaskedDtype):
mask = self._col.array._mask # type: ignore[attr-defined]
buffer = PandasBuffer(mask)
dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
return buffer, dtype
if self.dtype[0] == DtypeKind.STRING:
# For now, use byte array as the mask.
# TODO: maybe store as bit array to save space?..
buf = self._col.to_numpy()
# Determine the encoding for valid values
valid = invalid == 0
invalid = not valid
mask = np.zeros(shape=(len(buf),), dtype=np.bool_)
for i, obj in enumerate(buf):
mask[i] = valid if isinstance(obj, str) else invalid
# Convert the mask array to a Pandas "buffer" using
# a NumPy array as the backing store
buffer = PandasBuffer(mask)
# Define the dtype of the returned buffer
dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
return buffer, dtype
try:
msg = f"{_NO_VALIDITY_BUFFER[null]} so does not have a separate mask"
except KeyError:
# TODO: implement for other bit/byte masks?
raise NotImplementedError("See self.describe_null")
raise NoBufferPresent(msg)
def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
"""
Return the buffer containing the offset values for variable-size binary
data (e.g., variable-length strings) and the buffer's associated dtype.
Raises NoBufferPresent if the data buffer does not have an associated
offsets buffer.
"""
if self.dtype[0] == DtypeKind.STRING:
# For each string, we need to manually determine the next offset
values = self._col.to_numpy()
ptr = 0
offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
for i, v in enumerate(values):
# For missing values (in this case, `np.nan` values)
# we don't increment the pointer
if isinstance(v, str):
b = v.encode(encoding="utf-8")
ptr += len(b)
offsets[i + 1] = ptr
# Convert the offsets to a Pandas "buffer" using
# the NumPy array as the backing store
buffer = PandasBuffer(offsets)
# Assemble the buffer dtype info
dtype = (
DtypeKind.INT,
64,
ArrowCTypes.INT64,
Endianness.NATIVE,
) # note: currently only support native endianness
else:
raise NoBufferPresent(
"This column has a fixed-length dtype so "
"it does not have an offsets buffer"
)
return buffer, dtype
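
A minimal sketch of reaching this class through the public ``__dataframe__`` entry point and inspecting one column; the names below are illustrative:

import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0]})
col = df.__dataframe__().get_column_by_name("a")   # -> PandasColumn
print(col.dtype)           # (<DtypeKind.FLOAT: 2>, 64, 'g', '=')
print(col.describe_null)   # (<ColumnNullType.USE_NAN: 1>, None) for float columns
data_buf, data_dtype = col.get_buffers()["data"]
print(data_buf.bufsize)    # 24 == 3 elements * 8 bytes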


@@ -0,0 +1,113 @@
from __future__ import annotations
from collections import abc
from typing import TYPE_CHECKING
from pandas.core.interchange.column import PandasColumn
from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
from pandas.core.interchange.utils import maybe_rechunk
if TYPE_CHECKING:
from collections.abc import (
Iterable,
Sequence,
)
from pandas import (
DataFrame,
Index,
)
class PandasDataFrameXchg(DataFrameXchg):
"""
A data frame class, with only the methods required by the interchange
protocol defined.
Instances of this (private) class are returned from
``pd.DataFrame.__dataframe__`` as objects with the methods and
attributes defined on this class.
"""
def __init__(self, df: DataFrame, allow_copy: bool = True) -> None:
"""
Constructor - an instance of this (private) class is returned from
`pd.DataFrame.__dataframe__`.
"""
self._df = df.rename(columns=str, copy=False)
self._allow_copy = allow_copy
for i, _col in enumerate(self._df.columns):
rechunked = maybe_rechunk(self._df.iloc[:, i], allow_copy=allow_copy)
if rechunked is not None:
self._df.isetitem(i, rechunked)
def __dataframe__(
self, nan_as_null: bool = False, allow_copy: bool = True
) -> PandasDataFrameXchg:
# `nan_as_null` can be removed here once it's removed from
# DataFrame.__dataframe__
return PandasDataFrameXchg(self._df, allow_copy)
@property
def metadata(self) -> dict[str, Index]:
# `index` isn't a regular column, and the protocol doesn't support row
# labels - so we export it as Pandas-specific metadata here.
return {"pandas.index": self._df.index}
def num_columns(self) -> int:
return len(self._df.columns)
def num_rows(self) -> int:
return len(self._df)
def num_chunks(self) -> int:
return 1
def column_names(self) -> Index:
return self._df.columns
def get_column(self, i: int) -> PandasColumn:
return PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy)
def get_column_by_name(self, name: str) -> PandasColumn:
return PandasColumn(self._df[name], allow_copy=self._allow_copy)
def get_columns(self) -> list[PandasColumn]:
return [
PandasColumn(self._df[name], allow_copy=self._allow_copy)
for name in self._df.columns
]
def select_columns(self, indices: Sequence[int]) -> PandasDataFrameXchg:
if not isinstance(indices, abc.Sequence):
raise ValueError("`indices` is not a sequence")
if not isinstance(indices, list):
indices = list(indices)
return PandasDataFrameXchg(
self._df.iloc[:, indices], allow_copy=self._allow_copy
)
def select_columns_by_name(self, names: list[str]) -> PandasDataFrameXchg: # type: ignore[override]
if not isinstance(names, abc.Sequence):
raise ValueError("`names` is not a sequence")
if not isinstance(names, list):
names = list(names)
return PandasDataFrameXchg(self._df.loc[:, names], allow_copy=self._allow_copy)
def get_chunks(self, n_chunks: int | None = None) -> Iterable[PandasDataFrameXchg]:
"""
Return an iterator yielding the chunks.
"""
if n_chunks and n_chunks > 1:
size = len(self._df)
step = size // n_chunks
if size % n_chunks != 0:
step += 1
for start in range(0, step * n_chunks, step):
yield PandasDataFrameXchg(
self._df.iloc[start : start + step, :],
allow_copy=self._allow_copy,
)
else:
yield self
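
A short sketch of this wrapper's chunking behaviour, assuming a plain NumPy-backed frame:

import pandas as pd

df = pd.DataFrame({"a": range(10), "b": range(10)})
xchg = df.__dataframe__()                # -> PandasDataFrameXchg
print(xchg.num_rows(), xchg.num_columns(), xchg.num_chunks())   # 10 2 1
# Asking for more chunks than are stored splits the frame row-wise, with the
# same step arithmetic as PandasColumn.get_chunks (sizes 4, 4 and 2 here):
print([chunk.num_rows() for chunk in xchg.get_chunks(3)])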


@@ -0,0 +1,465 @@
"""
A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api
"""
from __future__ import annotations
from abc import (
ABC,
abstractmethod,
)
import enum
from typing import (
TYPE_CHECKING,
Any,
TypedDict,
)
if TYPE_CHECKING:
from collections.abc import (
Iterable,
Sequence,
)
class DlpackDeviceType(enum.IntEnum):
"""Integer enum for device type codes matching DLPack."""
CPU = 1
CUDA = 2
CPU_PINNED = 3
OPENCL = 4
VULKAN = 7
METAL = 8
VPI = 9
ROCM = 10
class DtypeKind(enum.IntEnum):
"""
Integer enum for data types.
Attributes
----------
INT : int
Matches to signed integer data type.
UINT : int
Matches to unsigned integer data type.
FLOAT : int
Matches to floating point data type.
BOOL : int
Matches to boolean data type.
STRING : int
Matches to string data type (UTF-8 encoded).
DATETIME : int
Matches to datetime data type.
CATEGORICAL : int
Matches to categorical data type.
"""
INT = 0
UINT = 1
FLOAT = 2
BOOL = 20
STRING = 21 # UTF-8
DATETIME = 22
CATEGORICAL = 23
class ColumnNullType(enum.IntEnum):
"""
Integer enum for null type representation.
Attributes
----------
NON_NULLABLE : int
Non-nullable column.
USE_NAN : int
Use explicit float NaN value.
USE_SENTINEL : int
Sentinel value besides NaN/NaT.
USE_BITMASK : int
The bit is set/unset representing a null on a certain position.
USE_BYTEMASK : int
The byte is set/unset representing a null on a certain position.
"""
NON_NULLABLE = 0
USE_NAN = 1
USE_SENTINEL = 2
USE_BITMASK = 3
USE_BYTEMASK = 4
class ColumnBuffers(TypedDict):
# first element is a buffer containing the column data;
# second element is the data buffer's associated dtype
data: tuple[Buffer, Any]
# first element is a buffer containing mask values indicating missing data;
# second element is the mask value buffer's associated dtype.
# None if the null representation is not a bit or byte mask
validity: tuple[Buffer, Any] | None
# first element is a buffer containing the offset values for
# variable-size binary data (e.g., variable-length strings);
# second element is the offsets buffer's associated dtype.
# None if the data buffer does not have an associated offsets buffer
offsets: tuple[Buffer, Any] | None
class CategoricalDescription(TypedDict):
# whether the ordering of dictionary indices is semantically meaningful
is_ordered: bool
# whether a dictionary-style mapping of categorical values to other objects exists
is_dictionary: bool
# Python-level only (e.g. ``{int: str}``).
# None if not a dictionary-style categorical.
categories: Column | None
class Buffer(ABC):
"""
Data in the buffer is guaranteed to be contiguous in memory.
Note that there is no dtype attribute present, a buffer can be thought of
as simply a block of memory. However, if the column that the buffer is
attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
implemented, then that dtype information will be contained in the return
value from ``__dlpack__``.
This distinction is useful to support both (a) data exchange via DLPack on a
buffer and (b) dtypes like variable-length strings which do not have a
fixed number of bytes per element.
"""
@property
@abstractmethod
def bufsize(self) -> int:
"""
Buffer size in bytes.
"""
@property
@abstractmethod
def ptr(self) -> int:
"""
Pointer to start of the buffer as an integer.
"""
@abstractmethod
def __dlpack__(self):
"""
Produce DLPack capsule (see array API standard).
Raises:
- TypeError : if the buffer contains unsupported dtypes.
- NotImplementedError : if DLPack support is not implemented
Useful to have to connect to array libraries. Support optional because
it's not completely trivial to implement for a Python-only library.
"""
raise NotImplementedError("__dlpack__")
@abstractmethod
def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
"""
Device type and device ID for where the data in the buffer resides.
Uses device type codes matching DLPack.
Note: must be implemented even if ``__dlpack__`` is not.
"""
class Column(ABC):
"""
A column object, with only the methods and properties required by the
interchange protocol defined.
A column can contain one or more chunks. Each chunk can contain up to three
buffers - a data buffer, a mask buffer (depending on null representation),
and an offsets buffer (if variable-size binary; e.g., variable-length
strings).
TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
Instead, it seems to use "children" for both columns with a bit mask,
and for nested dtypes. Unclear whether this is elegant or confusing.
This design requires checking the null representation explicitly.
The Arrow design requires checking:
1. the ARROW_FLAG_NULLABLE (for sentinel values)
2. if a column has two children, combined with one of those children
having a null dtype.
Making the mask concept explicit seems useful. One null dtype would
not be enough to cover both bit and byte masks, so that would mean
even more checking if we did it the Arrow way.
TBD: there's also the "chunk" concept here, which is implicit in Arrow as
multiple buffers per array (= column here). Semantically it may make
sense to have both: chunks were meant for example for lazy evaluation
of data which doesn't fit in memory, while multiple buffers per column
could also come from doing a selection operation on a single
contiguous buffer.
Given these concepts, one would expect chunks to be all of the same
size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
while multiple buffers could have data-dependent lengths. Not an issue
in pandas if one column is backed by a single NumPy array, but in
Arrow it seems possible.
Are multiple chunks *and* multiple buffers per column necessary for
the purposes of this interchange protocol, or must producers either
reuse the chunk concept for this or copy the data?
Note: this Column object can only be produced by ``__dataframe__``, so
doesn't need its own version or ``__column__`` protocol.
"""
@abstractmethod
def size(self) -> int:
"""
Size of the column, in elements.
Corresponds to DataFrame.num_rows() if column is a single chunk;
equal to size of this current chunk otherwise.
"""
@property
@abstractmethod
def offset(self) -> int:
"""
Offset of first element.
May be > 0 if using chunks; for example for a column with N chunks of
equal size M (only the last chunk may be shorter),
``offset = n * M``, ``n = 0 .. N-1``.
"""
@property
@abstractmethod
def dtype(self) -> tuple[DtypeKind, int, str, str]:
"""
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
Bit-width : the number of bits as an integer
Format string : data type description format string in Apache Arrow C
Data Interface format.
Endianness : currently only native endianness (``=``) is supported
Notes:
- Kind specifiers are aligned with DLPack where possible (hence the
jump to 20, leaving enough room for future extension)
- Masks must be specified as boolean with either bit width 1 (for bit
masks) or 8 (for byte masks).
- Dtype width in bits was preferred over bytes
- Endianness isn't too useful, but included now in case in the future
we need to support non-native endianness
- Went with Apache Arrow format strings over NumPy format strings
because they're more complete from a dataframe perspective
- Format strings are mostly useful for datetime specification, and
for categoricals.
- For categoricals, the format string describes the type of the
categorical in the data buffer. In case of a separate encoding of
the categorical (e.g. an integer to string mapping), this can
be derived from ``self.describe_categorical``.
- Data types not included: complex, Arrow-style null, binary, decimal,
and nested (list, struct, map, union) dtypes.
"""
@property
@abstractmethod
def describe_categorical(self) -> CategoricalDescription:
"""
If the dtype is categorical, there are two options:
- There are only values in the data buffer.
- There is a separate non-categorical Column encoding for categorical values.
Raises TypeError if the dtype is not categorical
Returns the dictionary with description on how to interpret the data buffer:
- "is_ordered" : bool, whether the ordering of dictionary indices is
semantically meaningful.
- "is_dictionary" : bool, whether a mapping of
categorical values to other objects exists
- "categories" : Column representing the (implicit) mapping of indices to
category values (e.g. an array of cat1, cat2, ...).
None if not a dictionary-style categorical.
TBD: are there any other in-memory representations that are needed?
"""
@property
@abstractmethod
def describe_null(self) -> tuple[ColumnNullType, Any]:
"""
Return the missing value (or "null") representation the column dtype
uses, as a tuple ``(kind, value)``.
Value : if kind is "sentinel value", the actual value. If kind is a bit
mask or a byte mask, the value (0 or 1) indicating a missing value. None
otherwise.
"""
@property
@abstractmethod
def null_count(self) -> int | None:
"""
Number of null elements, if known.
Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
"""
@property
@abstractmethod
def metadata(self) -> dict[str, Any]:
"""
The metadata for the column. See `DataFrame.metadata` for more details.
"""
@abstractmethod
def num_chunks(self) -> int:
"""
Return the number of chunks the column consists of.
"""
@abstractmethod
def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
"""
Return an iterator yielding the chunks.
See `DataFrame.get_chunks` for details on ``n_chunks``.
"""
@abstractmethod
def get_buffers(self) -> ColumnBuffers:
"""
Return a dictionary containing the underlying buffers.
The returned dictionary has the following contents:
- "data": a two-element tuple whose first element is a buffer
containing the data and whose second element is the data
buffer's associated dtype.
- "validity": a two-element tuple whose first element is a buffer
containing mask values indicating missing data and
whose second element is the mask value buffer's
associated dtype. None if the null representation is
not a bit or byte mask.
- "offsets": a two-element tuple whose first element is a buffer
containing the offset values for variable-size binary
data (e.g., variable-length strings) and whose second
element is the offsets buffer's associated dtype. None
if the data buffer does not have an associated offsets
buffer.
"""
# def get_children(self) -> Iterable[Column]:
# """
# Children columns underneath the column, each object in this iterator
# must adhere to the column specification.
# """
# pass
class DataFrame(ABC):
"""
A data frame class, with only the methods required by the interchange
protocol defined.
A "data frame" represents an ordered collection of named columns.
A column's "name" must be a unique string.
Columns may be accessed by name or by position.
This could be a public data frame class, or an object with the methods and
attributes defined on this DataFrame class could be returned from the
``__dataframe__`` method of a public data frame class in a library adhering
to the dataframe interchange protocol specification.
"""
version = 0 # version of the protocol
@abstractmethod
def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
"""Construct a new interchange object, potentially changing the parameters."""
@property
@abstractmethod
def metadata(self) -> dict[str, Any]:
"""
The metadata for the data frame, as a dictionary with string keys. The
contents of `metadata` may be anything; they are meant for a library
to store information that it needs to, e.g., to roundtrip losslessly or
for two implementations to share data that is not (yet) part of the
interchange protocol specification. To avoid collisions with other
entries, please name the keys with the name of the library
followed by a period and the desired name, e.g., ``pandas.indexcol``.
"""
@abstractmethod
def num_columns(self) -> int:
"""
Return the number of columns in the DataFrame.
"""
@abstractmethod
def num_rows(self) -> int | None:
# TODO: not happy with Optional, but need to flag it may be expensive
# why include it if it may be None - what do we expect consumers
# to do here?
"""
Return the number of rows in the DataFrame, if available.
"""
@abstractmethod
def num_chunks(self) -> int:
"""
Return the number of chunks the DataFrame consists of.
"""
@abstractmethod
def column_names(self) -> Iterable[str]:
"""
Return an iterator yielding the column names.
"""
@abstractmethod
def get_column(self, i: int) -> Column:
"""
Return the column at the indicated position.
"""
@abstractmethod
def get_column_by_name(self, name: str) -> Column:
"""
Return the column whose name is the indicated name.
"""
@abstractmethod
def get_columns(self) -> Iterable[Column]:
"""
Return an iterator yielding the columns.
"""
@abstractmethod
def select_columns(self, indices: Sequence[int]) -> DataFrame:
"""
Create a new DataFrame by selecting a subset of columns by index.
"""
@abstractmethod
def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
"""
Create a new DataFrame by selecting a subset of columns by name.
"""
@abstractmethod
def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
"""
Return an iterator yielding the chunks.
By default (None), yields the chunks that the data is stored as by the
producer. If given, ``n_chunks`` must be a multiple of
``self.num_chunks()``, meaning the producer must subdivide each chunk
before yielding it.
"""


@@ -0,0 +1,557 @@
from __future__ import annotations
import ctypes
import re
from typing import Any
import numpy as np
from pandas._config import using_string_dtype
from pandas.compat._optional import import_optional_dependency
from pandas.errors import SettingWithCopyError
import pandas as pd
from pandas.core.interchange.dataframe_protocol import (
Buffer,
Column,
ColumnNullType,
DataFrame as DataFrameXchg,
DtypeKind,
)
from pandas.core.interchange.utils import (
ArrowCTypes,
Endianness,
)
_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
DtypeKind.BOOL: {1: bool, 8: bool},
}
def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
"""
Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.
.. note::
For new development, we highly recommend using the Arrow C Data Interface
alongside the Arrow PyCapsule Interface instead of the interchange protocol.
From pandas 2.3 onwards, `from_dataframe` uses the PyCapsule Interface,
only falling back to the interchange protocol if that fails.
.. warning::
Due to severe implementation issues, we recommend only considering using the
interchange protocol in the following cases:
- converting to pandas: for pandas >= 2.0.3
- converting from pandas: for pandas >= 3.0.0
Parameters
----------
df : DataFrameXchg
Object supporting the interchange protocol, i.e. `__dataframe__` method.
allow_copy : bool, default: True
Whether to allow copying the memory to perform the conversion
(if false then zero-copy approach is requested).
Returns
-------
pd.DataFrame
Examples
--------
>>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> interchange_object = df_not_necessarily_pandas.__dataframe__()
>>> interchange_object.column_names()
Index(['A', 'B'], dtype='object')
>>> df_pandas = (pd.api.interchange.from_dataframe
... (interchange_object.select_columns_by_name(['A'])))
>>> df_pandas
A
0 1
1 2
These methods (``column_names``, ``select_columns_by_name``) should work
for any dataframe library which implements the interchange protocol.
"""
if isinstance(df, pd.DataFrame):
return df
if hasattr(df, "__arrow_c_stream__"):
try:
pa = import_optional_dependency("pyarrow", min_version="14.0.0")
except ImportError:
# fallback to _from_dataframe
pass
else:
try:
return pa.table(df).to_pandas(zero_copy_only=not allow_copy)
except pa.ArrowInvalid as e:
raise RuntimeError(e) from e
if not hasattr(df, "__dataframe__"):
raise ValueError("`df` does not support __dataframe__")
return _from_dataframe(
df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
)
def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True):
"""
Build a ``pd.DataFrame`` from the DataFrame interchange object.
Parameters
----------
df : DataFrameXchg
Object supporting the interchange protocol, i.e. `__dataframe__` method.
allow_copy : bool, default: True
Whether to allow copying the memory to perform the conversion
(if false then zero-copy approach is requested).
Returns
-------
pd.DataFrame
"""
pandas_dfs = []
for chunk in df.get_chunks():
pandas_df = protocol_df_chunk_to_pandas(chunk)
pandas_dfs.append(pandas_df)
if not allow_copy and len(pandas_dfs) > 1:
raise RuntimeError(
"To join chunks a copy is required which is forbidden by allow_copy=False"
)
if not pandas_dfs:
pandas_df = protocol_df_chunk_to_pandas(df)
elif len(pandas_dfs) == 1:
pandas_df = pandas_dfs[0]
else:
pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False)
index_obj = df.metadata.get("pandas.index", None)
if index_obj is not None:
pandas_df.index = index_obj
return pandas_df
def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
"""
Convert interchange protocol chunk to ``pd.DataFrame``.
Parameters
----------
df : DataFrameXchg
Returns
-------
pd.DataFrame
"""
columns: dict[str, Any] = {}
buffers = [] # hold on to buffers, keeps memory alive
for name in df.column_names():
if not isinstance(name, str):
raise ValueError(f"Column {name} is not a string")
if name in columns:
raise ValueError(f"Column {name} is not unique")
col = df.get_column_by_name(name)
dtype = col.dtype[0]
if dtype in (
DtypeKind.INT,
DtypeKind.UINT,
DtypeKind.FLOAT,
DtypeKind.BOOL,
):
columns[name], buf = primitive_column_to_ndarray(col)
elif dtype == DtypeKind.CATEGORICAL:
columns[name], buf = categorical_column_to_series(col)
elif dtype == DtypeKind.STRING:
columns[name], buf = string_column_to_ndarray(col)
elif dtype == DtypeKind.DATETIME:
columns[name], buf = datetime_column_to_ndarray(col)
else:
raise NotImplementedError(f"Data type {dtype} not handled yet")
buffers.append(buf)
pandas_df = pd.DataFrame(columns)
pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
return pandas_df
def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
"""
Convert a column holding one of the primitive dtypes to a NumPy array.
A primitive type is one of: int, uint, float, bool.
Parameters
----------
col : Column
Returns
-------
tuple
Tuple of np.ndarray holding the data and the memory owner object
that keeps the memory alive.
"""
buffers = col.get_buffers()
data_buff, data_dtype = buffers["data"]
data = buffer_to_ndarray(
data_buff, data_dtype, offset=col.offset, length=col.size()
)
data = set_nulls(data, col, buffers["validity"])
return data, buffers
def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
"""
Convert a column holding categorical data to a pandas Series.
Parameters
----------
col : Column
Returns
-------
tuple
Tuple of pd.Series holding the data and the memory owner object
that keeps the memory alive.
"""
categorical = col.describe_categorical
if not categorical["is_dictionary"]:
raise NotImplementedError("Non-dictionary categoricals not supported yet")
cat_column = categorical["categories"]
if hasattr(cat_column, "_col"):
# Item "Column" of "Optional[Column]" has no attribute "_col"
# Item "None" of "Optional[Column]" has no attribute "_col"
categories = np.array(cat_column._col) # type: ignore[union-attr]
else:
raise NotImplementedError(
"Interchanging categorical columns isn't supported yet, and our "
"fallback of using the `col._col` attribute (a ndarray) failed."
)
buffers = col.get_buffers()
codes_buff, codes_dtype = buffers["data"]
codes = buffer_to_ndarray(
codes_buff, codes_dtype, offset=col.offset, length=col.size()
)
# Use modulo in order to not get an ``IndexError`` for
# out-of-bounds sentinel values in `codes`
if len(categories) > 0:
values = categories[codes % len(categories)]
else:
values = codes
cat = pd.Categorical(
values, categories=categories, ordered=categorical["is_ordered"]
)
data = pd.Series(cat)
data = set_nulls(data, col, buffers["validity"])
return data, buffers
def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
"""
Convert a column holding string data to a NumPy array.
Parameters
----------
col : Column
Returns
-------
tuple
Tuple of np.ndarray holding the data and the memory owner object
that keeps the memory alive.
"""
null_kind, sentinel_val = col.describe_null
if null_kind not in (
ColumnNullType.NON_NULLABLE,
ColumnNullType.USE_BITMASK,
ColumnNullType.USE_BYTEMASK,
):
raise NotImplementedError(
f"{null_kind} null kind is not yet supported for string columns."
)
buffers = col.get_buffers()
assert buffers["offsets"], "String buffers must contain offsets"
# Retrieve the data buffer containing the UTF-8 code units
data_buff, _ = buffers["data"]
# We're going to reinterpret the buffer as uint8, so make sure we can do it safely
assert col.dtype[2] in (
ArrowCTypes.STRING,
ArrowCTypes.LARGE_STRING,
) # format_str == utf-8
# Convert the buffers to NumPy arrays. In order to go from STRING to
# an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
data_dtype = (
DtypeKind.UINT,
8,
ArrowCTypes.UINT8,
Endianness.NATIVE,
)
# Specify zero offset as we don't want to chunk the string data
data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
# Retrieve the offsets buffer containing the index offsets demarcating
# the beginning and the ending of each string
offset_buff, offset_dtype = buffers["offsets"]
# The offsets buffer contains start/stop positions of strings in the data buffer;
# it has one more element than the column, so pass `col.size() + 1`
# here as the offsets buffer length
offsets = buffer_to_ndarray(
offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
)
null_pos = None
if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
validity = buffers["validity"]
if validity is not None:
valid_buff, valid_dtype = validity
null_pos = buffer_to_ndarray(
valid_buff, valid_dtype, offset=col.offset, length=col.size()
)
if sentinel_val == 0:
null_pos = ~null_pos
# Assemble the strings from the code units
str_list: list[None | float | str] = [None] * col.size()
for i in range(col.size()):
# Check for missing values
if null_pos is not None and null_pos[i]:
str_list[i] = np.nan
continue
# Extract a range of code units
units = data[offsets[i] : offsets[i + 1]]
# Convert the list of code units to bytes
str_bytes = bytes(units)
# Create the string
string = str_bytes.decode(encoding="utf-8")
# Add to our list of strings
str_list[i] = string
if using_string_dtype():
res = pd.Series(str_list, dtype="str")
else:
res = np.asarray(str_list, dtype="object") # type: ignore[assignment]
return res, buffers # type: ignore[return-value]
def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:
"""Parse datetime `format_str` to interpret the `data`."""
# timestamp 'ts{unit}:tz'
timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
if timestamp_meta:
unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
if unit != "s":
# the format string describes only a first letter of the unit, so
# add one extra letter to convert the unit to numpy-style:
# 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
unit += "s"
data = data.astype(f"datetime64[{unit}]")
if tz != "":
data = pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(tz)
return data
# date 'td{Days/Ms}'
date_meta = re.match(r"td([Dm])", format_str)
if date_meta:
unit = date_meta.group(1)
if unit == "D":
# NumPy doesn't support DAY unit, so converting days to seconds
# (converting to uint64 to avoid overflow)
data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
elif unit == "m":
data = data.astype("datetime64[ms]")
else:
raise NotImplementedError(f"Date unit is not supported: {unit}")
return data
raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any]:
"""
Convert a column holding DateTime data to a NumPy array.
Parameters
----------
col : Column
Returns
-------
tuple
Tuple of np.ndarray holding the data and the memory owner object
that keeps the memory alive.
"""
buffers = col.get_buffers()
_, col_bit_width, format_str, _ = col.dtype
dbuf, _ = buffers["data"]
# Read the buffer as integers giving the number of units elapsed since the epoch (1970-01-01)
data = buffer_to_ndarray(
dbuf,
(
DtypeKind.INT,
col_bit_width,
getattr(ArrowCTypes, f"INT{col_bit_width}"),
Endianness.NATIVE,
),
offset=col.offset,
length=col.size(),
)
data = parse_datetime_format_str(format_str, data) # type: ignore[assignment]
data = set_nulls(data, col, buffers["validity"])
return data, buffers
def buffer_to_ndarray(
buffer: Buffer,
dtype: tuple[DtypeKind, int, str, str],
*,
length: int,
offset: int = 0,
) -> np.ndarray:
"""
Build a NumPy array from the passed buffer.
Parameters
----------
buffer : Buffer
Buffer to build a NumPy array from.
dtype : tuple
Data type of the buffer conforming protocol dtypes format.
offset : int, default: 0
Number of elements to offset from the start of the buffer.
length : int
Number of elements to read from the buffer; for a bit-mask buffer this is
the number of bits.
Returns
-------
np.ndarray
Notes
-----
The returned array doesn't own the memory. The caller of this function is
responsible for keeping the memory owner object alive as long as
the returned NumPy array is being used.
"""
kind, bit_width, _, _ = dtype
column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
if column_dtype is None:
raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")
# TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
# and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
# it since https://github.com/numpy/numpy/pull/19083
ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
if bit_width == 1:
assert length is not None, "`length` must be specified for a bit-mask buffer."
pa = import_optional_dependency("pyarrow")
arr = pa.BooleanArray.from_buffers(
pa.bool_(),
length,
[None, pa.foreign_buffer(buffer.ptr, length)],
offset=offset,
)
return np.asarray(arr)
else:
data_pointer = ctypes.cast(
buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
)
if length > 0:
return np.ctypeslib.as_array(data_pointer, shape=(length,))
return np.array([], dtype=ctypes_type)
def set_nulls(
data: np.ndarray | pd.Series,
col: Column,
validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
allow_modify_inplace: bool = True,
):
"""
Set null values for the data according to the column null kind.
Parameters
----------
data : np.ndarray or pd.Series
Data to set nulls in.
col : Column
Column object that describes the `data`.
validity : tuple(Buffer, dtype) or None
The "validity" entry of ``col.get_buffers()``. We do not call ``col.get_buffers()``
here so as not to take ownership of the memory of the buffer objects.
allow_modify_inplace : bool, default: True
Whether to modify the `data` inplace when zero-copy is possible (True) or always
modify a copy of the `data` (False).
Returns
-------
np.ndarray or pd.Series
Data with the nulls being set.
"""
if validity is None:
return data
null_kind, sentinel_val = col.describe_null
null_pos = None
if null_kind == ColumnNullType.USE_SENTINEL:
null_pos = pd.Series(data) == sentinel_val
elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
assert validity, "Expected to have a validity buffer for the mask"
valid_buff, valid_dtype = validity
null_pos = buffer_to_ndarray(
valid_buff, valid_dtype, offset=col.offset, length=col.size()
)
if sentinel_val == 0:
null_pos = ~null_pos
elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
pass
else:
raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")
if null_pos is not None and np.any(null_pos):
if not allow_modify_inplace:
data = data.copy()
try:
data[null_pos] = None
except TypeError:
# TypeError happens if the `data` dtype appears to be non-nullable
# in numpy notation (bool, int, uint). If this happens,
# cast the `data` to nullable float dtype.
data = data.astype(float)
data[null_pos] = None
except SettingWithCopyError:
# `SettingWithCopyError` may happen for datetime-like with missing values.
data = data.copy()
data[null_pos] = None
return data
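
A round-trip sketch through the public entry point; any library exposing ``__dataframe__`` (or the Arrow PyCapsule interface) could stand in for the pandas frame here:

import pandas as pd
from pandas.api.interchange import from_dataframe

source = pd.DataFrame({"x": [1, 2, 3], "y": [0.5, None, 2.5]})
# A pandas DataFrame passed directly is returned unchanged; an interchange
# object is rebuilt column by column via _from_dataframe above.
result = from_dataframe(source.__dataframe__(), allow_copy=True)
print(result)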


@@ -0,0 +1,183 @@
"""
Utility functions and objects for implementing the interchange API.
"""
from __future__ import annotations
import typing
import numpy as np
from pandas._libs import lib
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
DatetimeTZDtype,
)
import pandas as pd
if typing.TYPE_CHECKING:
from pandas._typing import DtypeObj
# Maps str(pyarrow.DataType) to the Arrow C type format string.
# Currently there is no pyarrow API for this.
PYARROW_CTYPES = {
"null": "n",
"bool": "b",
"uint8": "C",
"uint16": "S",
"uint32": "I",
"uint64": "L",
"int8": "c",
"int16": "S",
"int32": "i",
"int64": "l",
"halffloat": "e", # float16
"float": "f", # float32
"double": "g", # float64
"string": "u",
"large_string": "U",
"binary": "z",
"time32[s]": "tts",
"time32[ms]": "ttm",
"time64[us]": "ttu",
"time64[ns]": "ttn",
"date32[day]": "tdD",
"date64[ms]": "tdm",
"timestamp[s]": "tss:",
"timestamp[ms]": "tsm:",
"timestamp[us]": "tsu:",
"timestamp[ns]": "tsn:",
"duration[s]": "tDs",
"duration[ms]": "tDm",
"duration[us]": "tDu",
"duration[ns]": "tDn",
}
class ArrowCTypes:
"""
Enum for Apache Arrow C type format strings.
The Arrow C data interface:
https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
"""
NULL = "n"
BOOL = "b"
INT8 = "c"
UINT8 = "C"
INT16 = "s"
UINT16 = "S"
INT32 = "i"
UINT32 = "I"
INT64 = "l"
UINT64 = "L"
FLOAT16 = "e"
FLOAT32 = "f"
FLOAT64 = "g"
STRING = "u" # utf-8
LARGE_STRING = "U" # utf-8
DATE32 = "tdD"
DATE64 = "tdm"
# Resolution:
# - seconds -> 's'
# - milliseconds -> 'm'
# - microseconds -> 'u'
# - nanoseconds -> 'n'
TIMESTAMP = "ts{resolution}:{tz}"
TIME = "tt{resolution}"
class Endianness:
"""Enum indicating the byte-order of a data-type."""
LITTLE = "<"
BIG = ">"
NATIVE = "="
NA = "|"
def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
"""
Represent pandas `dtype` as a format string in Apache Arrow C notation.
Parameters
----------
dtype : np.dtype
Datatype of pandas DataFrame to represent.
Returns
-------
str
Format string in Apache Arrow C notation of the given `dtype`.
"""
if isinstance(dtype, CategoricalDtype):
return ArrowCTypes.INT64
elif dtype == np.dtype("O"):
return ArrowCTypes.STRING
elif isinstance(dtype, ArrowDtype):
import pyarrow as pa
pa_type = dtype.pyarrow_dtype
if pa.types.is_decimal(pa_type):
return f"d:{pa_type.precision},{pa_type.scale}"
elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
return f"ts{pa_type.unit[0]}:{pa_type.tz}"
format_str = PYARROW_CTYPES.get(str(pa_type), None)
if format_str is not None:
return format_str
format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
if format_str is not None:
return format_str
if isinstance(dtype, pd.StringDtype):
# TODO(infer_string) this should be LARGE_STRING for pyarrow storage,
# but current tests don't cover this distinction
return ArrowCTypes.STRING
elif lib.is_np_dtype(dtype, "M"):
# Selecting the first char of resolution string:
# dtype.str -> '<M8[ns]' -> 'n'
resolution = np.datetime_data(dtype)[0][0]
return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")
elif isinstance(dtype, DatetimeTZDtype):
return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)
elif isinstance(dtype, pd.BooleanDtype):
return ArrowCTypes.BOOL
raise NotImplementedError(
f"Conversion of {dtype} to Arrow C format string is not implemented."
)
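# Worked examples (illustrative):
#   dtype_to_arrow_c_fmt(np.dtype("int64"))                -> "l"
#   dtype_to_arrow_c_fmt(np.dtype("datetime64[ns]"))       -> "tsn:"
#   dtype_to_arrow_c_fmt(pd.DatetimeTZDtype("ns", "UTC"))  -> "tsn:UTC"
#   dtype_to_arrow_c_fmt(pd.CategoricalDtype())            -> "l" (hard-coded to int64)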
def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None:
"""
Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary.
- Returns `None` if the input series is not backed by a multi-chunk pyarrow array
(and so doesn't need rechunking)
- Returns a single-chunk-backed Series if the input is backed by a multi-chunk
pyarrow array and `allow_copy` is `True`.
- Raises a `RuntimeError` if `allow_copy` is `False` and the input is
backed by a multi-chunk pyarrow array.
"""
if not isinstance(series.dtype, pd.ArrowDtype):
return None
chunked_array = series.array._pa_array # type: ignore[attr-defined]
if len(chunked_array.chunks) == 1:
return None
if not allow_copy:
raise RuntimeError(
"Found multi-chunk pyarrow array, but `allow_copy` is False. "
"Please rechunk the array before calling this function, or set "
"`allow_copy=True`."
)
arr = chunked_array.combine_chunks()
return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index)
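
A small sketch of ``maybe_rechunk``, which the interchange dataframe wrapper calls per column on construction; it only acts on multi-chunk, ArrowDtype-backed Series (assumes pyarrow is installed):

import pandas as pd
from pandas.core.interchange.utils import maybe_rechunk

# Concatenating two pyarrow-backed Series produces a multi-chunk ChunkedArray,
# which the protocol cannot expose as a single contiguous column:
ser = pd.concat(
    [pd.Series([1, 2], dtype="int64[pyarrow]"), pd.Series([3], dtype="int64[pyarrow]")],
    ignore_index=True,
)
rechunked = maybe_rechunk(ser, allow_copy=True)     # copies into a single chunk
print(rechunked is not None)                        # True
print(maybe_rechunk(pd.Series([1.0, 2.0]), allow_copy=True))  # None: not ArrowDtype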