lib/python3.11/site-packages/pandas/core/interchange/buffer.py (new file, 136 lines added)
@@ -0,0 +1,136 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
)

from pandas.core.interchange.dataframe_protocol import (
    Buffer,
    DlpackDeviceType,
)

if TYPE_CHECKING:
    import numpy as np
    import pyarrow as pa


class PandasBuffer(Buffer):
    """
    Data in the buffer is guaranteed to be contiguous in memory.
    """

    def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None:
        """
        Handle only regular columns (= numpy arrays) for now.
        """
        if x.strides[0] and not x.strides == (x.dtype.itemsize,):
            # The protocol does not support strided buffers, so a copy is
            # necessary. If that's not allowed, we need to raise an exception.
            if allow_copy:
                x = x.copy()
            else:
                raise RuntimeError(
                    "Exports cannot be zero-copy in the case "
                    "of a non-contiguous buffer"
                )

        # Store the numpy array in which the data resides as a private
        # attribute, so we can use it to retrieve the public attributes
        self._x = x

    @property
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        return self._x.size * self._x.dtype.itemsize

    @property
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """
        return self._x.__array_interface__["data"][0]

    def __dlpack__(self) -> Any:
        """
        Represent this structure as DLPack interface.
        """
        return self._x.__dlpack__()

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        """
        return (DlpackDeviceType.CPU, None)

    def __repr__(self) -> str:
        return (
            "PandasBuffer("
            + str(
                {
                    "bufsize": self.bufsize,
                    "ptr": self.ptr,
                    "device": self.__dlpack_device__()[0].name,
                }
            )
            + ")"
        )


class PandasBufferPyarrow(Buffer):
    """
    Data in the buffer is guaranteed to be contiguous in memory.
    """

    def __init__(
        self,
        buffer: pa.Buffer,
        *,
        length: int,
    ) -> None:
        """
        Handle pyarrow chunked arrays.
        """
        self._buffer = buffer
        self._length = length

    @property
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        return self._buffer.size

    @property
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """
        return self._buffer.address

    def __dlpack__(self) -> Any:
        """
        Represent this structure as DLPack interface.
        """
        raise NotImplementedError()

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        """
        return (DlpackDeviceType.CPU, None)

    def __repr__(self) -> str:
        return (
            "PandasBuffer[pyarrow]("
            + str(
                {
                    "bufsize": self.bufsize,
                    "ptr": self.ptr,
                    "device": "CPU",
                }
            )
            + ")"
        )
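Editor's note (illustrative sketch, not part of the vendored file above): a consumer of a PandasBuffer only sees the raw pointer and size, so reading the bytes back is typically done through ctypes or NumPy while keeping the buffer object alive. The small Series used here is an assumption for demonstration.

# Illustrative sketch, assuming a small int64 Series; not part of the vendored file.
import ctypes
import numpy as np
import pandas as pd
from pandas.core.interchange.buffer import PandasBuffer

buf = PandasBuffer(np.asarray(pd.Series([1, 2, 3], dtype="int64")))
# Reinterpret the raw memory as bytes; the caller must keep `buf` alive while reading.
raw = (ctypes.c_char * buf.bufsize).from_address(buf.ptr)
print(buf.bufsize)        # 24 bytes: 3 elements * 8 bytes each
print(bytes(raw)[:8])     # raw bytes of the first element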
lib/python3.11/site-packages/pandas/core/interchange/column.py (new file, 461 lines added)
@@ -0,0 +1,461 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
)

import numpy as np

from pandas._libs.lib import infer_dtype
from pandas._libs.tslibs import iNaT
from pandas.errors import NoBufferPresent
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.dtypes import BaseMaskedDtype

import pandas as pd
from pandas import (
    ArrowDtype,
    DatetimeTZDtype,
)
from pandas.api.types import is_string_dtype
from pandas.core.interchange.buffer import (
    PandasBuffer,
    PandasBufferPyarrow,
)
from pandas.core.interchange.dataframe_protocol import (
    Column,
    ColumnBuffers,
    ColumnNullType,
    DtypeKind,
)
from pandas.core.interchange.utils import (
    ArrowCTypes,
    Endianness,
    dtype_to_arrow_c_fmt,
)

if TYPE_CHECKING:
    from pandas.core.interchange.dataframe_protocol import Buffer

_NP_KINDS = {
    "i": DtypeKind.INT,
    "u": DtypeKind.UINT,
    "f": DtypeKind.FLOAT,
    "b": DtypeKind.BOOL,
    "U": DtypeKind.STRING,
    "M": DtypeKind.DATETIME,
    "m": DtypeKind.DATETIME,
}

_NULL_DESCRIPTION = {
    DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None),
    DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT),
    DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None),
    DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None),
    DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None),
    # Null values for categoricals are stored as `-1` sentinel values
    # in the category date (e.g., `col.values.codes` is int8 np.ndarray)
    DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1),
    # follow Arrow in using 1 as valid value and 0 for missing/null value
    DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0),
}

_NO_VALIDITY_BUFFER = {
    ColumnNullType.NON_NULLABLE: "This column is non-nullable",
    ColumnNullType.USE_NAN: "This column uses NaN as null",
    ColumnNullType.USE_SENTINEL: "This column uses a sentinel value",
}


class PandasColumn(Column):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.
    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).
    Note: this Column object can only be produced by ``__dataframe__``, so
    doesn't need its own version or ``__column__`` protocol.
    """

    def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
        """
        Note: doesn't deal with extension arrays yet, just assume a regular
        Series/ndarray for now.
        """
        if isinstance(column, pd.DataFrame):
            raise TypeError(
                "Expected a Series, got a DataFrame. This likely happened "
                "because you called __dataframe__ on a DataFrame which, "
                "after converting column names to string, resulted in duplicated "
                f"names: {column.columns}. Please rename these columns before "
                "using the interchange protocol."
            )
        if not isinstance(column, pd.Series):
            raise NotImplementedError(f"Columns of type {type(column)} not handled yet")

        # Store the column as a private attribute
        self._col = column
        self._allow_copy = allow_copy

    def size(self) -> int:
        """
        Size of the column, in elements.
        """
        return self._col.size

    @property
    def offset(self) -> int:
        """
        Offset of first element. Always zero.
        """
        # TODO: chunks are implemented now, probably this should return something
        return 0

    @cache_readonly
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        dtype = self._col.dtype

        if isinstance(dtype, pd.CategoricalDtype):
            codes = self._col.values.codes
            (
                _,
                bitwidth,
                c_arrow_dtype_f_str,
                _,
            ) = self._dtype_from_pandasdtype(codes.dtype)
            return (
                DtypeKind.CATEGORICAL,
                bitwidth,
                c_arrow_dtype_f_str,
                Endianness.NATIVE,
            )
        elif is_string_dtype(dtype):
            if infer_dtype(self._col) in ("string", "empty"):
                return (
                    DtypeKind.STRING,
                    8,
                    dtype_to_arrow_c_fmt(dtype),
                    Endianness.NATIVE,
                )
            raise NotImplementedError("Non-string object dtypes are not supported yet")
        else:
            return self._dtype_from_pandasdtype(dtype)

    def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
        """
        See `self.dtype` for details.
        """
        # Note: 'c' (complex) not handled yet (not in array spec v1).
        # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
        # datetime and timedelta both map to datetime (is timedelta handled?)

        kind = _NP_KINDS.get(dtype.kind, None)
        if kind is None:
            # Not a NumPy dtype. Check if it's a categorical maybe
            raise ValueError(f"Data type {dtype} not supported by interchange protocol")
        if isinstance(dtype, ArrowDtype):
            byteorder = dtype.numpy_dtype.byteorder
        elif isinstance(dtype, DatetimeTZDtype):
            byteorder = dtype.base.byteorder  # type: ignore[union-attr]
        elif isinstance(dtype, BaseMaskedDtype):
            byteorder = dtype.numpy_dtype.byteorder
        else:
            byteorder = dtype.byteorder

        if dtype == "bool[pyarrow]":
            # return early to avoid the `* 8` below, as this is a bitmask
            # rather than a bytemask
            return (
                kind,
                dtype.itemsize,  # pyright: ignore[reportGeneralTypeIssues]
                ArrowCTypes.BOOL,
                byteorder,
            )

        return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder

    @property
    def describe_categorical(self):
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Content of returned dict:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
              semantically meaningful.
            - "is_dictionary" : bool, whether a dictionary-style mapping of
              categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
              category values (e.g. an array of cat1, cat2, ...).
              None if not a dictionary-style categorical.
        """
        if not self.dtype[0] == DtypeKind.CATEGORICAL:
            raise TypeError(
                "describe_categorical only works on a column with categorical dtype!"
            )

        return {
            "is_ordered": self._col.cat.ordered,
            "is_dictionary": True,
            "categories": PandasColumn(pd.Series(self._col.cat.categories)),
        }

    @property
    def describe_null(self):
        if isinstance(self._col.dtype, BaseMaskedDtype):
            column_null_dtype = ColumnNullType.USE_BYTEMASK
            null_value = 1
            return column_null_dtype, null_value
        if isinstance(self._col.dtype, ArrowDtype):
            # We already rechunk (if necessary / allowed) upon initialization, so this
            # is already single-chunk by the time we get here.
            if self._col.array._pa_array.chunks[0].buffers()[0] is None:  # type: ignore[attr-defined]
                return ColumnNullType.NON_NULLABLE, None
            return ColumnNullType.USE_BITMASK, 0
        kind = self.dtype[0]
        try:
            null, value = _NULL_DESCRIPTION[kind]
        except KeyError:
            raise NotImplementedError(f"Data type {kind} not yet supported")

        return null, value

    @cache_readonly
    def null_count(self) -> int:
        """
        Number of null elements. Should always be known.
        """
        return self._col.isna().sum().item()

    @property
    def metadata(self) -> dict[str, pd.Index]:
        """
        Store specific metadata of the column.
        """
        return {"pandas.index": self._col.index}

    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        return 1

    def get_chunks(self, n_chunks: int | None = None):
        """
        Return an iterator yielding the chunks.
        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """
        if n_chunks and n_chunks > 1:
            size = len(self._col)
            step = size // n_chunks
            if size % n_chunks != 0:
                step += 1
            for start in range(0, step * n_chunks, step):
                yield PandasColumn(
                    self._col.iloc[start : start + step], self._allow_copy
                )
        else:
            yield self

    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.
        The returned dictionary has the following contents:
            - "data": a two-element tuple whose first element is a buffer
              containing the data and whose second element is the data
              buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
              containing mask values indicating missing data and
              whose second element is the mask value buffer's
              associated dtype. None if the null representation is
              not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
              containing the offset values for variable-size binary
              data (e.g., variable-length strings) and whose second
              element is the offsets buffer's associated dtype. None
              if the data buffer does not have an associated offsets
              buffer.
        """
        buffers: ColumnBuffers = {
            "data": self._get_data_buffer(),
            "validity": None,
            "offsets": None,
        }

        try:
            buffers["validity"] = self._get_validity_buffer()
        except NoBufferPresent:
            pass

        try:
            buffers["offsets"] = self._get_offsets_buffer()
        except NoBufferPresent:
            pass

        return buffers

    def _get_data_buffer(
        self,
    ) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]:
        """
        Return the buffer containing the data and the buffer's associated dtype.
        """
        buffer: Buffer
        if self.dtype[0] in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
            DtypeKind.DATETIME,
        ):
            # self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make
            # it longer than 4 characters
            dtype = self.dtype
            if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4:
                np_arr = self._col.dt.tz_convert(None).to_numpy()
            else:
                arr = self._col.array
                if isinstance(self._col.dtype, BaseMaskedDtype):
                    np_arr = arr._data  # type: ignore[attr-defined]
                elif isinstance(self._col.dtype, ArrowDtype):
                    # We already rechunk (if necessary / allowed) upon initialization,
                    # so this is already single-chunk by the time we get here.
                    arr = arr._pa_array.chunks[0]  # type: ignore[attr-defined]
                    buffer = PandasBufferPyarrow(
                        arr.buffers()[1],  # type: ignore[attr-defined]
                        length=len(arr),
                    )
                    return buffer, dtype
                else:
                    np_arr = arr._ndarray  # type: ignore[attr-defined]
            buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
        elif self.dtype[0] == DtypeKind.CATEGORICAL:
            codes = self._col.values._codes
            buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
            dtype = self._dtype_from_pandasdtype(codes.dtype)
        elif self.dtype[0] == DtypeKind.STRING:
            # Marshal the strings from a NumPy object array into a byte array
            buf = self._col.to_numpy()
            b = bytearray()

            # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
            for obj in buf:
                if isinstance(obj, str):
                    b.extend(obj.encode(encoding="utf-8"))

            # Convert the byte array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))

            # Define the dtype for the returned buffer
            # TODO: this will need correcting
            # https://github.com/pandas-dev/pandas/issues/54781
            dtype = self.dtype
        else:
            raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")

        return buffer, dtype

    def _get_validity_buffer(self) -> tuple[Buffer, Any] | None:
        """
        Return the buffer containing the mask values indicating missing data and
        the buffer's associated dtype.
        Raises NoBufferPresent if null representation is not a bit or byte mask.
        """
        null, invalid = self.describe_null
        buffer: Buffer
        if isinstance(self._col.dtype, ArrowDtype):
            # We already rechunk (if necessary / allowed) upon initialization, so this
            # is already single-chunk by the time we get here.
            arr = self._col.array._pa_array.chunks[0]  # type: ignore[attr-defined]
            dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE)
            if arr.buffers()[0] is None:
                return None
            buffer = PandasBufferPyarrow(
                arr.buffers()[0],
                length=len(arr),
            )
            return buffer, dtype

        if isinstance(self._col.dtype, BaseMaskedDtype):
            mask = self._col.array._mask  # type: ignore[attr-defined]
            buffer = PandasBuffer(mask)
            dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
            return buffer, dtype

        if self.dtype[0] == DtypeKind.STRING:
            # For now, use byte array as the mask.
            # TODO: maybe store as bit array to save space?..
            buf = self._col.to_numpy()

            # Determine the encoding for valid values
            valid = invalid == 0
            invalid = not valid

            mask = np.zeros(shape=(len(buf),), dtype=np.bool_)
            for i, obj in enumerate(buf):
                mask[i] = valid if isinstance(obj, str) else invalid

            # Convert the mask array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(mask)

            # Define the dtype of the returned buffer
            dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)

            return buffer, dtype

        try:
            msg = f"{_NO_VALIDITY_BUFFER[null]} so does not have a separate mask"
        except KeyError:
            # TODO: implement for other bit/byte masks?
            raise NotImplementedError("See self.describe_null")

        raise NoBufferPresent(msg)

    def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
        """
        Return the buffer containing the offset values for variable-size binary
        data (e.g., variable-length strings) and the buffer's associated dtype.
        Raises NoBufferPresent if the data buffer does not have an associated
        offsets buffer.
        """
        if self.dtype[0] == DtypeKind.STRING:
            # For each string, we need to manually determine the next offset
            values = self._col.to_numpy()
            ptr = 0
            offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
            for i, v in enumerate(values):
                # For missing values (in this case, `np.nan` values)
                # we don't increment the pointer
                if isinstance(v, str):
                    b = v.encode(encoding="utf-8")
                    ptr += len(b)

                offsets[i + 1] = ptr

            # Convert the offsets to a Pandas "buffer" using
            # the NumPy array as the backing store
            buffer = PandasBuffer(offsets)

            # Assemble the buffer dtype info
            dtype = (
                DtypeKind.INT,
                64,
                ArrowCTypes.INT64,
                Endianness.NATIVE,
            )  # note: currently only support native endianness
        else:
            raise NoBufferPresent(
                "This column has a fixed-length dtype so "
                "it does not have an offsets buffer"
            )

        return buffer, dtype
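Editor's note (illustrative sketch, not part of the vendored file above): for orientation, a PandasColumn built from a plain float64 Series reports only a data buffer, since floats signal missingness with NaN rather than a separate mask; the exact printed values below are an assumption based on the code above.

# Illustrative sketch, assuming a small float64 Series; not part of the vendored file.
import pandas as pd
from pandas.core.interchange.column import PandasColumn

col = PandasColumn(pd.Series([1.0, None, 3.0]))
bufs = col.get_buffers()
print(col.dtype)          # roughly (DtypeKind.FLOAT, 64, "g", "=")
print(col.describe_null)  # roughly (ColumnNullType.USE_NAN, None)
print(bufs["validity"], bufs["offsets"])  # None, None: no mask or offsets buffer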
@@ -0,0 +1,113 @@
from __future__ import annotations

from collections import abc
from typing import TYPE_CHECKING

from pandas.core.interchange.column import PandasColumn
from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
from pandas.core.interchange.utils import maybe_rechunk

if TYPE_CHECKING:
    from collections.abc import (
        Iterable,
        Sequence,
    )

    from pandas import (
        DataFrame,
        Index,
    )


class PandasDataFrameXchg(DataFrameXchg):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.
    Instances of this (private) class are returned from
    ``pd.DataFrame.__dataframe__`` as objects with the methods and
    attributes defined on this class.
    """

    def __init__(self, df: DataFrame, allow_copy: bool = True) -> None:
        """
        Constructor - an instance of this (private) class is returned from
        `pd.DataFrame.__dataframe__`.
        """
        self._df = df.rename(columns=str, copy=False)
        self._allow_copy = allow_copy
        for i, _col in enumerate(self._df.columns):
            rechunked = maybe_rechunk(self._df.iloc[:, i], allow_copy=allow_copy)
            if rechunked is not None:
                self._df.isetitem(i, rechunked)

    def __dataframe__(
        self, nan_as_null: bool = False, allow_copy: bool = True
    ) -> PandasDataFrameXchg:
        # `nan_as_null` can be removed here once it's removed from
        # Dataframe.__dataframe__
        return PandasDataFrameXchg(self._df, allow_copy)

    @property
    def metadata(self) -> dict[str, Index]:
        # `index` isn't a regular column, and the protocol doesn't support row
        # labels - so we export it as Pandas-specific metadata here.
        return {"pandas.index": self._df.index}

    def num_columns(self) -> int:
        return len(self._df.columns)

    def num_rows(self) -> int:
        return len(self._df)

    def num_chunks(self) -> int:
        return 1

    def column_names(self) -> Index:
        return self._df.columns

    def get_column(self, i: int) -> PandasColumn:
        return PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy)

    def get_column_by_name(self, name: str) -> PandasColumn:
        return PandasColumn(self._df[name], allow_copy=self._allow_copy)

    def get_columns(self) -> list[PandasColumn]:
        return [
            PandasColumn(self._df[name], allow_copy=self._allow_copy)
            for name in self._df.columns
        ]

    def select_columns(self, indices: Sequence[int]) -> PandasDataFrameXchg:
        if not isinstance(indices, abc.Sequence):
            raise ValueError("`indices` is not a sequence")
        if not isinstance(indices, list):
            indices = list(indices)

        return PandasDataFrameXchg(
            self._df.iloc[:, indices], allow_copy=self._allow_copy
        )

    def select_columns_by_name(self, names: list[str]) -> PandasDataFrameXchg:  # type: ignore[override]
        if not isinstance(names, abc.Sequence):
            raise ValueError("`names` is not a sequence")
        if not isinstance(names, list):
            names = list(names)

        return PandasDataFrameXchg(self._df.loc[:, names], allow_copy=self._allow_copy)

    def get_chunks(self, n_chunks: int | None = None) -> Iterable[PandasDataFrameXchg]:
        """
        Return an iterator yielding the chunks.
        """
        if n_chunks and n_chunks > 1:
            size = len(self._df)
            step = size // n_chunks
            if size % n_chunks != 0:
                step += 1
            for start in range(0, step * n_chunks, step):
                yield PandasDataFrameXchg(
                    self._df.iloc[start : start + step, :],
                    allow_copy=self._allow_copy,
                )
        else:
            yield self
@@ -0,0 +1,465 @@
"""
A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api
"""

from __future__ import annotations

from abc import (
    ABC,
    abstractmethod,
)
import enum
from typing import (
    TYPE_CHECKING,
    Any,
    TypedDict,
)

if TYPE_CHECKING:
    from collections.abc import (
        Iterable,
        Sequence,
    )


class DlpackDeviceType(enum.IntEnum):
    """Integer enum for device type codes matching DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10


class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23


class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN/NaT.
    USE_BITMASK : int
        The bit is set/unset representing a null on a certain position.
    USE_BYTEMASK : int
        The byte is set/unset representing a null on a certain position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4


class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: tuple[Buffer, Any]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: tuple[Buffer, Any] | None

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: tuple[Buffer, Any] | None


class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects exists
    is_dictionary: bool
    # Python-level only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Column | None


class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present, a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both data exchange via DLPack on a
    buffer and (b) dtypes like variable-length strings which do not have a
    fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:

            - TypeError : if the buffer contains unsupported dtypes.
            - NotImplementedError : if DLPack support is not implemented

        Useful to have to connect to array libraries. Support optional because
        it's not completely trivial to implement for a Python-only library.
        """
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """


class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : current only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leave enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for bit
              masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the future
              we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary, decimal,
              and nested (list, struct, map, union) dtypes.
        """

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Returns the dictionary with description on how to interpret the data buffer:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
              semantically meaningful.
            - "is_dictionary" : bool, whether a mapping of
              categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
              category values (e.g. an array of cat1, cat2, ...).
              None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

            - "data": a two-element tuple whose first element is a buffer
              containing the data and whose second element is the data
              buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
              containing mask values indicating missing data and
              whose second element is the mask value buffer's
              associated dtype. None if the null representation is
              not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
              containing the offset values for variable-size binary
              data (e.g., variable-length strings) and whose second
              element is the offsets buffer's associated dtype. None
              if the data buffer does not have an associated offsets
              buffer.
        """

    # def get_children(self) -> Iterable[Column]:
    #     """
    #     Children columns underneath the column, each object in this iterator
    #     must adhere to the column specification.
    #     """
    #     pass


class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything, they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. For avoiding collisions with other
        entries, please add name the keys with the name of the library
        followed by a period and the desired name, e.g, ``pandas.indexcol``.
        """

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """
@ -0,0 +1,557 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import ctypes
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._config import using_string_dtype
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import SettingWithCopyError
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.interchange.dataframe_protocol import (
|
||||
Buffer,
|
||||
Column,
|
||||
ColumnNullType,
|
||||
DataFrame as DataFrameXchg,
|
||||
DtypeKind,
|
||||
)
|
||||
from pandas.core.interchange.utils import (
|
||||
ArrowCTypes,
|
||||
Endianness,
|
||||
)
|
||||
|
||||
_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
|
||||
DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
|
||||
DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
|
||||
DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
|
||||
DtypeKind.BOOL: {1: bool, 8: bool},
|
||||
}
|
||||
|
||||
|
||||
def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
|
||||
"""
|
||||
Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.
|
||||
|
||||
.. note::
|
||||
|
||||
For new development, we highly recommend using the Arrow C Data Interface
|
||||
alongside the Arrow PyCapsule Interface instead of the interchange protocol.
|
||||
From pandas 2.3 onwards, `from_dataframe` uses the PyCapsule Interface,
|
||||
only falling back to the interchange protocol if that fails.
|
||||
|
||||
.. warning::
|
||||
|
||||
Due to severe implementation issues, we recommend only considering using the
|
||||
interchange protocol in the following cases:
|
||||
|
||||
- converting to pandas: for pandas >= 2.0.3
|
||||
- converting from pandas: for pandas >= 3.0.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrameXchg
|
||||
Object supporting the interchange protocol, i.e. `__dataframe__` method.
|
||||
allow_copy : bool, default: True
|
||||
Whether to allow copying the memory to perform the conversion
|
||||
(if false then zero-copy approach is requested).
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
|
||||
>>> interchange_object = df_not_necessarily_pandas.__dataframe__()
|
||||
>>> interchange_object.column_names()
|
||||
Index(['A', 'B'], dtype='object')
|
||||
>>> df_pandas = (pd.api.interchange.from_dataframe
|
||||
... (interchange_object.select_columns_by_name(['A'])))
|
||||
>>> df_pandas
|
||||
A
|
||||
0 1
|
||||
1 2
|
||||
|
||||
These methods (``column_names``, ``select_columns_by_name``) should work
|
||||
for any dataframe library which implements the interchange protocol.
|
||||
"""
|
||||
if isinstance(df, pd.DataFrame):
|
||||
return df
|
||||
|
||||
if hasattr(df, "__arrow_c_stream__"):
|
||||
try:
|
||||
pa = import_optional_dependency("pyarrow", min_version="14.0.0")
|
||||
except ImportError:
|
||||
# fallback to _from_dataframe
|
||||
pass
|
||||
else:
|
||||
try:
|
||||
return pa.table(df).to_pandas(zero_copy_only=not allow_copy)
|
||||
except pa.ArrowInvalid as e:
|
||||
raise RuntimeError(e) from e
|
||||
|
||||
if not hasattr(df, "__dataframe__"):
|
||||
raise ValueError("`df` does not support __dataframe__")
|
||||
|
||||
return _from_dataframe(
|
||||
df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
|
||||
)
|
||||
|
||||
|
||||
def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True):
|
||||
"""
|
||||
Build a ``pd.DataFrame`` from the DataFrame interchange object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrameXchg
|
||||
Object supporting the interchange protocol, i.e. `__dataframe__` method.
|
||||
allow_copy : bool, default: True
|
||||
Whether to allow copying the memory to perform the conversion
|
||||
(if false then zero-copy approach is requested).
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame
|
||||
"""
|
||||
pandas_dfs = []
|
||||
for chunk in df.get_chunks():
|
||||
pandas_df = protocol_df_chunk_to_pandas(chunk)
|
||||
pandas_dfs.append(pandas_df)
|
||||
|
||||
if not allow_copy and len(pandas_dfs) > 1:
|
||||
raise RuntimeError(
|
||||
"To join chunks a copy is required which is forbidden by allow_copy=False"
|
||||
)
|
||||
if not pandas_dfs:
|
||||
pandas_df = protocol_df_chunk_to_pandas(df)
|
||||
elif len(pandas_dfs) == 1:
|
||||
pandas_df = pandas_dfs[0]
|
||||
else:
|
||||
pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False)
|
||||
|
||||
index_obj = df.metadata.get("pandas.index", None)
|
||||
if index_obj is not None:
|
||||
pandas_df.index = index_obj
|
||||
|
||||
return pandas_df
|
||||
|
||||
|
||||
def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
|
||||
"""
|
||||
Convert interchange protocol chunk to ``pd.DataFrame``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrameXchg
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame
|
||||
"""
|
||||
columns: dict[str, Any] = {}
|
||||
buffers = [] # hold on to buffers, keeps memory alive
|
||||
for name in df.column_names():
|
||||
if not isinstance(name, str):
|
||||
raise ValueError(f"Column {name} is not a string")
|
||||
if name in columns:
|
||||
raise ValueError(f"Column {name} is not unique")
|
||||
col = df.get_column_by_name(name)
|
||||
dtype = col.dtype[0]
|
||||
if dtype in (
|
||||
DtypeKind.INT,
|
||||
DtypeKind.UINT,
|
||||
DtypeKind.FLOAT,
|
||||
DtypeKind.BOOL,
|
||||
):
|
||||
columns[name], buf = primitive_column_to_ndarray(col)
|
||||
elif dtype == DtypeKind.CATEGORICAL:
|
||||
columns[name], buf = categorical_column_to_series(col)
|
||||
elif dtype == DtypeKind.STRING:
|
||||
columns[name], buf = string_column_to_ndarray(col)
|
||||
elif dtype == DtypeKind.DATETIME:
|
||||
columns[name], buf = datetime_column_to_ndarray(col)
|
||||
else:
|
||||
raise NotImplementedError(f"Data type {dtype} not handled yet")
|
||||
|
||||
buffers.append(buf)
|
||||
|
||||
pandas_df = pd.DataFrame(columns)
|
||||
pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
|
||||
return pandas_df
|
||||
|
||||
|
||||
def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
|
||||
"""
|
||||
Convert a column holding one of the primitive dtypes to a NumPy array.
|
||||
|
||||
A primitive type is one of: int, uint, float, bool.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
col : Column
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple
|
||||
Tuple of np.ndarray holding the data and the memory owner object
|
||||
that keeps the memory alive.
|
||||
"""
|
||||
buffers = col.get_buffers()
|
||||
|
||||
data_buff, data_dtype = buffers["data"]
|
||||
data = buffer_to_ndarray(
|
||||
data_buff, data_dtype, offset=col.offset, length=col.size()
|
||||
)
|
||||
|
||||
data = set_nulls(data, col, buffers["validity"])
|
||||
return data, buffers
|
||||
|
||||
|
||||
def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
|
||||
"""
|
||||
Convert a column holding categorical data to a pandas Series.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
col : Column
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple
|
||||
Tuple of pd.Series holding the data and the memory owner object
|
||||
that keeps the memory alive.
|
||||
"""
|
||||
categorical = col.describe_categorical
|
||||
|
||||
if not categorical["is_dictionary"]:
|
||||
raise NotImplementedError("Non-dictionary categoricals not supported yet")
|
||||
|
||||
cat_column = categorical["categories"]
|
||||
if hasattr(cat_column, "_col"):
|
||||
# Item "Column" of "Optional[Column]" has no attribute "_col"
|
||||
# Item "None" of "Optional[Column]" has no attribute "_col"
|
||||
categories = np.array(cat_column._col) # type: ignore[union-attr]
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"Interchanging categorical columns isn't supported yet, and our "
|
||||
"fallback of using the `col._col` attribute (a ndarray) failed."
|
||||
)
|
||||
buffers = col.get_buffers()
|
||||
|
||||
codes_buff, codes_dtype = buffers["data"]
|
||||
codes = buffer_to_ndarray(
|
||||
codes_buff, codes_dtype, offset=col.offset, length=col.size()
|
||||
)
|
||||
|
||||
# Doing module in order to not get ``IndexError`` for
|
||||
# out-of-bounds sentinel values in `codes`
|
||||
if len(categories) > 0:
|
||||
values = categories[codes % len(categories)]
|
||||
else:
|
||||
values = codes
|
||||
|
||||
cat = pd.Categorical(
|
||||
values, categories=categories, ordered=categorical["is_ordered"]
|
||||
)
|
||||
data = pd.Series(cat)
|
||||
|
||||
data = set_nulls(data, col, buffers["validity"])
|
||||
return data, buffers
|
||||
|
||||
|
||||
def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
|
||||
"""
|
||||
Convert a column holding string data to a NumPy array.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
col : Column
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple
|
||||
Tuple of np.ndarray holding the data and the memory owner object
|
||||
that keeps the memory alive.
|
||||
"""
|
||||
null_kind, sentinel_val = col.describe_null
|
||||
|
||||
if null_kind not in (
|
||||
ColumnNullType.NON_NULLABLE,
|
||||
ColumnNullType.USE_BITMASK,
|
||||
ColumnNullType.USE_BYTEMASK,
|
||||
):
|
||||
raise NotImplementedError(
|
||||
f"{null_kind} null kind is not yet supported for string columns."
|
||||
)
|
||||
|
||||
buffers = col.get_buffers()
|
||||
|
||||
assert buffers["offsets"], "String buffers must contain offsets"
|
||||
# Retrieve the data buffer containing the UTF-8 code units
|
||||
data_buff, _ = buffers["data"]
|
||||
# We're going to reinterpret the buffer as uint8, so make sure we can do it safely
|
||||
assert col.dtype[2] in (
|
||||
ArrowCTypes.STRING,
|
||||
ArrowCTypes.LARGE_STRING,
|
||||
) # format_str == utf-8
|
||||
# Convert the buffers to NumPy arrays. In order to go from STRING to
|
||||
# an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
|
||||
data_dtype = (
|
||||
DtypeKind.UINT,
|
||||
8,
|
||||
ArrowCTypes.UINT8,
|
||||
Endianness.NATIVE,
|
||||
)
|
||||
# Specify zero offset as we don't want to chunk the string data
|
||||
data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
|
||||
|
||||
# Retrieve the offsets buffer containing the index offsets demarcating
|
||||
# the beginning and the ending of each string
|
||||
offset_buff, offset_dtype = buffers["offsets"]
|
||||
# Offsets buffer contains start-stop positions of strings in the data buffer,
|
||||
# meaning that it has more elements than in the data buffer, do `col.size() + 1`
|
||||
# here to pass a proper offsets buffer size
|
||||
offsets = buffer_to_ndarray(
|
||||
offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
|
||||
)
|
||||
|
||||
null_pos = None
|
||||
if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
|
||||
validity = buffers["validity"]
|
||||
if validity is not None:
|
||||
valid_buff, valid_dtype = validity
|
||||
null_pos = buffer_to_ndarray(
|
||||
valid_buff, valid_dtype, offset=col.offset, length=col.size()
|
||||
)
|
||||
if sentinel_val == 0:
|
||||
null_pos = ~null_pos
|
||||
|
||||
# Assemble the strings from the code units
|
||||
str_list: list[None | float | str] = [None] * col.size()
|
||||
for i in range(col.size()):
|
||||
# Check for missing values
|
||||
if null_pos is not None and null_pos[i]:
|
||||
str_list[i] = np.nan
|
||||
continue
|
||||
|
||||
# Extract a range of code units
|
||||
units = data[offsets[i] : offsets[i + 1]]
|
||||
|
||||
# Convert the list of code units to bytes
|
||||
str_bytes = bytes(units)
|
||||
|
||||
# Create the string
|
||||
string = str_bytes.decode(encoding="utf-8")
|
||||
|
||||
# Add to our list of strings
|
||||
str_list[i] = string
|
||||
|
||||
if using_string_dtype():
|
||||
res = pd.Series(str_list, dtype="str")
|
||||
else:
|
||||
res = np.asarray(str_list, dtype="object") # type: ignore[assignment]
|
||||
|
||||
return res, buffers # type: ignore[return-value]
|
||||
|
||||
|
||||
def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:
|
||||
"""Parse datetime `format_str` to interpret the `data`."""
|
||||
# timestamp 'ts{unit}:tz'
|
||||
timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
|
||||
if timestamp_meta:
|
||||
unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
|
||||
if unit != "s":
|
||||
# the format string describes only a first letter of the unit, so
|
||||
# add one extra letter to convert the unit to numpy-style:
|
||||
# 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
|
||||
unit += "s"
|
||||
data = data.astype(f"datetime64[{unit}]")
|
||||
if tz != "":
|
||||
data = pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(tz)
|
||||
return data
|
||||
|
||||
# date 'td{Days/Ms}'
|
||||
date_meta = re.match(r"td([Dm])", format_str)
|
||||
if date_meta:
|
||||
unit = date_meta.group(1)
|
||||
if unit == "D":
|
||||
# NumPy doesn't support DAY unit, so converting days to seconds
|
||||
# (converting to uint64 to avoid overflow)
|
||||
data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
|
||||
elif unit == "m":
|
||||
data = data.astype("datetime64[ms]")
|
||||
else:
|
||||
raise NotImplementedError(f"Date unit is not supported: {unit}")
|
||||
return data
|
||||
|
||||
raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
|
||||
|
||||
|
||||
def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any]:
|
||||
"""
|
||||
Convert a column holding DateTime data to a NumPy array.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
col : Column
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple
|
||||
Tuple of np.ndarray holding the data and the memory owner object
|
||||
that keeps the memory alive.
|
||||
"""
|
||||
buffers = col.get_buffers()
|
||||
|
||||
_, col_bit_width, format_str, _ = col.dtype
|
||||
dbuf, _ = buffers["data"]
|
||||
# Consider dtype being `uint` to get number of units passed since the 01.01.1970
|
||||
|
||||
data = buffer_to_ndarray(
|
||||
dbuf,
|
||||
(
|
||||
DtypeKind.INT,
|
||||
col_bit_width,
|
||||
getattr(ArrowCTypes, f"INT{col_bit_width}"),
|
||||
Endianness.NATIVE,
|
||||
),
|
||||
offset=col.offset,
|
||||
length=col.size(),
|
||||
)
|
||||
|
||||
data = parse_datetime_format_str(format_str, data) # type: ignore[assignment]
|
||||
data = set_nulls(data, col, buffers["validity"])
|
||||
return data, buffers
|
||||
|
||||
|
||||
def buffer_to_ndarray(
    buffer: Buffer,
    dtype: tuple[DtypeKind, int, str, str],
    *,
    length: int,
    offset: int = 0,
) -> np.ndarray:
    """
    Build a NumPy array from the passed buffer.

    Parameters
    ----------
    buffer : Buffer
        Buffer to build a NumPy array from.
    dtype : tuple
        Data type of the buffer conforming to the protocol dtypes format.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    length : int, optional
        If the buffer is a bit-mask, specifies a number of bits to read
        from the buffer. Has no effect otherwise.

    Returns
    -------
    np.ndarray

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function is
    responsible for keeping the memory owner object alive as long as
    the returned NumPy array is being used.
    """
    kind, bit_width, _, _ = dtype

    column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
    if column_dtype is None:
        raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")

    # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
    # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
    # it since https://github.com/numpy/numpy/pull/19083
    ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)

    if bit_width == 1:
        assert length is not None, "`length` must be specified for a bit-mask buffer."
        pa = import_optional_dependency("pyarrow")
        arr = pa.BooleanArray.from_buffers(
            pa.bool_(),
            length,
            [None, pa.foreign_buffer(buffer.ptr, length)],
            offset=offset,
        )
        return np.asarray(arr)
    else:
        data_pointer = ctypes.cast(
            buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
        )
        if length > 0:
            return np.ctypeslib.as_array(data_pointer, shape=(length,))
        return np.array([], dtype=ctypes_type)

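# Illustrative sketch (hypothetical helper): rebuilding an int64 ndarray from a raw
# protocol buffer with `buffer_to_ndarray`. `PandasBuffer` comes from
# pandas.core.interchange.buffer; the dtype tuple uses names defined in this module.
def _demo_buffer_to_ndarray() -> None:
    import numpy as np

    from pandas.core.interchange.buffer import PandasBuffer

    original = np.arange(4, dtype=np.int64)
    buf = PandasBuffer(original)
    rebuilt = buffer_to_ndarray(
        buf,
        (DtypeKind.INT, 64, ArrowCTypes.INT64, Endianness.NATIVE),
        offset=0,
        length=len(original),
    )
    # `rebuilt` does not own its memory; `original`/`buf` must outlive it.
    assert (rebuilt == original).all()
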
def set_nulls(
    data: np.ndarray | pd.Series,
    col: Column,
    validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
    allow_modify_inplace: bool = True,
):
    """
    Set null values for the data according to the column null kind.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        Data to set nulls in.
    col : Column
        Column object that describes the `data`.
    validity : tuple(Buffer, dtype) or None
        The ``"validity"`` entry of ``col.get_buffers()``. We do not call
        ``col.get_buffers()`` here so as not to take ownership of the buffer
        objects' memory.
    allow_modify_inplace : bool, default: True
        Whether to modify the `data` in place when zero-copy is possible (True) or
        always modify a copy of the `data` (False).

    Returns
    -------
    np.ndarray or pd.Series
        Data with the nulls being set.
    """
    if validity is None:
        return data
    null_kind, sentinel_val = col.describe_null
    null_pos = None

    if null_kind == ColumnNullType.USE_SENTINEL:
        null_pos = pd.Series(data) == sentinel_val
    elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert validity, "Expected to have a validity buffer for the mask"
        valid_buff, valid_dtype = validity
        null_pos = buffer_to_ndarray(
            valid_buff, valid_dtype, offset=col.offset, length=col.size()
        )
        if sentinel_val == 0:
            null_pos = ~null_pos
    elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
        pass
    else:
        raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")

    if null_pos is not None and np.any(null_pos):
        if not allow_modify_inplace:
            data = data.copy()
        try:
            data[null_pos] = None
        except TypeError:
            # TypeError happens if the `data` dtype appears to be non-nullable
            # in numpy notation (bool, int, uint). If this happens,
            # cast the `data` to a nullable float dtype.
            data = data.astype(float)
            data[null_pos] = None
        except SettingWithCopyError:
            # `SettingWithCopyError` may happen for datetime-like with missing values.
            data = data.copy()
            data[null_pos] = None

    return data

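# Illustrative sketch (hypothetical helper): how `set_nulls` applies a byte-mask.
# A nullable Int64 column exposes a validity mask through the protocol; since a
# plain int64 ndarray cannot hold None, the data is cast to float and the missing
# slot becomes NaN. Assumes the masked-array column reports USE_BYTEMASK nulls.
def _demo_set_nulls_bytemask() -> None:
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"x": pd.array([10, None, 30], dtype="Int64")})
    col = df.__dataframe__().get_column_by_name("x")
    buffers = col.get_buffers()

    raw = np.asarray(df["x"].fillna(0), dtype=np.int64)  # data before nulls are applied
    out = set_nulls(raw, col, buffers["validity"])
    assert out[0] == 10 and np.isnan(out[1])
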
183
lib/python3.11/site-packages/pandas/core/interchange/utils.py
Normal file
@ -0,0 +1,183 @@
"""
|
||||
Utility functions and objects for implementing the interchange API.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import typing
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import lib
|
||||
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
ArrowDtype,
|
||||
CategoricalDtype,
|
||||
DatetimeTZDtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from pandas._typing import DtypeObj
|
||||
|
||||
|
||||
# Maps str(pyarrow.DataType) = C type format string
|
||||
# Currently, no pyarrow API for this
|
||||
PYARROW_CTYPES = {
|
||||
"null": "n",
|
||||
"bool": "b",
|
||||
"uint8": "C",
|
||||
"uint16": "S",
|
||||
"uint32": "I",
|
||||
"uint64": "L",
|
||||
"int8": "c",
|
||||
"int16": "S",
|
||||
"int32": "i",
|
||||
"int64": "l",
|
||||
"halffloat": "e", # float16
|
||||
"float": "f", # float32
|
||||
"double": "g", # float64
|
||||
"string": "u",
|
||||
"large_string": "U",
|
||||
"binary": "z",
|
||||
"time32[s]": "tts",
|
||||
"time32[ms]": "ttm",
|
||||
"time64[us]": "ttu",
|
||||
"time64[ns]": "ttn",
|
||||
"date32[day]": "tdD",
|
||||
"date64[ms]": "tdm",
|
||||
"timestamp[s]": "tss:",
|
||||
"timestamp[ms]": "tsm:",
|
||||
"timestamp[us]": "tsu:",
|
||||
"timestamp[ns]": "tsn:",
|
||||
"duration[s]": "tDs",
|
||||
"duration[ms]": "tDm",
|
||||
"duration[us]": "tDu",
|
||||
"duration[ns]": "tDn",
|
||||
}
|
||||
|
||||
|
||||
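# Illustrative sketch (hypothetical helper, requires pyarrow): the table above is
# keyed by `str(pyarrow_type)`, so lookups look like this.
def _demo_pyarrow_ctypes_lookup() -> None:
    import pyarrow as pa

    assert PYARROW_CTYPES[str(pa.int64())] == "l"
    assert PYARROW_CTYPES[str(pa.date32())] == "tdD"
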
class ArrowCTypes:
    """
    Enum for Apache Arrow C type format strings.

    The Arrow C data interface:
    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
    """

    NULL = "n"
    BOOL = "b"
    INT8 = "c"
    UINT8 = "C"
    INT16 = "s"
    UINT16 = "S"
    INT32 = "i"
    UINT32 = "I"
    INT64 = "l"
    UINT64 = "L"
    FLOAT16 = "e"
    FLOAT32 = "f"
    FLOAT64 = "g"
    STRING = "u"  # utf-8
    LARGE_STRING = "U"  # utf-8
    DATE32 = "tdD"
    DATE64 = "tdm"
    # Resolution:
    #   - seconds -> 's'
    #   - milliseconds -> 'm'
    #   - microseconds -> 'u'
    #   - nanoseconds -> 'n'
    TIMESTAMP = "ts{resolution}:{tz}"
    TIME = "tt{resolution}"

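# Illustrative sketch (hypothetical helper): TIMESTAMP and TIME are `str.format`
# templates filled in with a one-letter resolution and, for timestamps, an
# optional timezone.
def _demo_arrow_ctypes_templates() -> None:
    assert ArrowCTypes.TIMESTAMP.format(resolution="n", tz="") == "tsn:"
    assert ArrowCTypes.TIMESTAMP.format(resolution="s", tz="UTC") == "tss:UTC"
    assert ArrowCTypes.TIME.format(resolution="u") == "ttu"
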
class Endianness:
    """Enum indicating the byte-order of a data-type."""

    LITTLE = "<"
    BIG = ">"
    NATIVE = "="
    NA = "|"

def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
    """
    Represent pandas `dtype` as a format string in Apache Arrow C notation.

    Parameters
    ----------
    dtype : DtypeObj
        Datatype of a pandas DataFrame column to represent.

    Returns
    -------
    str
        Format string in Apache Arrow C notation of the given `dtype`.
    """
    if isinstance(dtype, CategoricalDtype):
        return ArrowCTypes.INT64
    elif dtype == np.dtype("O"):
        return ArrowCTypes.STRING
    elif isinstance(dtype, ArrowDtype):
        import pyarrow as pa

        pa_type = dtype.pyarrow_dtype
        if pa.types.is_decimal(pa_type):
            return f"d:{pa_type.precision},{pa_type.scale}"
        elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
            return f"ts{pa_type.unit[0]}:{pa_type.tz}"
        format_str = PYARROW_CTYPES.get(str(pa_type), None)
        if format_str is not None:
            return format_str

    format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
    if format_str is not None:
        return format_str

    if isinstance(dtype, pd.StringDtype):
        # TODO(infer_string) this should be LARGE_STRING for pyarrow storage,
        # but current tests don't cover this distinction
        return ArrowCTypes.STRING

    elif lib.is_np_dtype(dtype, "M"):
        # Selecting the first char of the resolution string:
        # dtype.str -> '<M8[ns]' -> 'n'
        resolution = np.datetime_data(dtype)[0][0]
        return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")

    elif isinstance(dtype, DatetimeTZDtype):
        return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)

    elif isinstance(dtype, pd.BooleanDtype):
        return ArrowCTypes.BOOL

    raise NotImplementedError(
        f"Conversion of {dtype} to Arrow C format string is not implemented."
    )

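# Illustrative sketch (hypothetical helper): a few representative mappings
# produced by `dtype_to_arrow_c_fmt`.
def _demo_dtype_to_arrow_c_fmt() -> None:
    assert dtype_to_arrow_c_fmt(np.dtype("int64")) == ArrowCTypes.INT64  # 'l'
    assert dtype_to_arrow_c_fmt(np.dtype("datetime64[ns]")) == "tsn:"
    assert dtype_to_arrow_c_fmt(DatetimeTZDtype(unit="ms", tz="UTC")) == "tsm:UTC"
    assert dtype_to_arrow_c_fmt(pd.BooleanDtype()) == ArrowCTypes.BOOL  # 'b'
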
def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None:
    """
    Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary.

    - Returns `None` if the input series is not backed by a multi-chunk pyarrow array
      (and so doesn't need rechunking).
    - Returns a Series backed by a single-chunk pyarrow array if the input is backed
      by a multi-chunk pyarrow array and `allow_copy` is `True`.
    - Raises a `RuntimeError` if `allow_copy` is `False` and the input is backed
      by a multi-chunk pyarrow array.
    """
    if not isinstance(series.dtype, pd.ArrowDtype):
        return None
    chunked_array = series.array._pa_array  # type: ignore[attr-defined]
    if len(chunked_array.chunks) == 1:
        return None
    if not allow_copy:
        raise RuntimeError(
            "Found multi-chunk pyarrow array, but `allow_copy` is False. "
            "Please rechunk the array before calling this function, or set "
            "`allow_copy=True`."
        )
    arr = chunked_array.combine_chunks()
    return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index)

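# Illustrative sketch (hypothetical helper, requires pyarrow): concatenating two
# ArrowDtype-backed Series yields a multi-chunk array, which `maybe_rechunk`
# collapses into a single chunk when `allow_copy=True`.
def _demo_maybe_rechunk() -> None:
    ser = pd.concat(
        [
            pd.Series([1, 2], dtype="int64[pyarrow]"),
            pd.Series([3], dtype="int64[pyarrow]"),
        ],
        ignore_index=True,
    )
    assert len(ser.array._pa_array.chunks) == 2

    out = maybe_rechunk(ser, allow_copy=True)
    assert out is not None and len(out.array._pa_array.chunks) == 1

    # Non-Arrow-backed Series need no rechunking and are returned as None.
    assert maybe_rechunk(pd.Series([1.0, 2.0]), allow_copy=True) is None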