done
1175
lib/python3.11/site-packages/narwhals/_pandas_like/dataframe.py
Normal file
File diff suppressed because it is too large
345
lib/python3.11/site-packages/narwhals/_pandas_like/expr.py
Normal file
@@ -0,0 +1,345 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from narwhals._compliant import EagerExpr
from narwhals._expression_parsing import evaluate_output_names_and_aliases
from narwhals._pandas_like.group_by import PandasLikeGroupBy
from narwhals._pandas_like.series import PandasLikeSeries
from narwhals._utils import generate_temporary_column_name

if TYPE_CHECKING:
    from collections.abc import Sequence

    from typing_extensions import Self

    from narwhals._compliant.typing import AliasNames, EvalNames, EvalSeries, ScalarKwargs
    from narwhals._expression_parsing import ExprMetadata
    from narwhals._pandas_like.dataframe import PandasLikeDataFrame
    from narwhals._pandas_like.namespace import PandasLikeNamespace
    from narwhals._utils import Implementation, Version, _LimitedContext
    from narwhals.typing import PythonLiteral

WINDOW_FUNCTIONS_TO_PANDAS_EQUIVALENT = {
    "cum_sum": "cumsum",
    "cum_min": "cummin",
    "cum_max": "cummax",
    "cum_prod": "cumprod",
    # Pandas cumcount starts counting from 0 while Polars starts from 1
    # Pandas cumcount counts nulls while Polars does not
    # So, instead of using "cumcount" we use "cumsum" on notna() to get the same result
    "cum_count": "cumsum",
    "rolling_sum": "sum",
    "rolling_mean": "mean",
    "rolling_std": "std",
    "rolling_var": "var",
    "shift": "shift",
    "rank": "rank",
    "diff": "diff",
    "fill_null": "fillna",
    "quantile": "quantile",
    "ewm_mean": "mean",
}
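As a side note on the `cum_count` remapping: a minimal sketch in plain pandas (hypothetical data) of the `cumsum`-over-`notna()` equivalence the comment above describes:

```python
import pandas as pd

s = pd.Series([1.0, None, 3.0])
# Polars-style cum_count: nulls are skipped and counting starts at 1.
print(s.notna().cumsum().tolist())  # [1, 1, 2]
# pandas' groupby cumcount would instead count rows from 0, nulls included,
# hence the remapping in the table above.
```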

def window_kwargs_to_pandas_equivalent(
    function_name: str, kwargs: ScalarKwargs
) -> dict[str, PythonLiteral]:
    if function_name == "shift":
        assert "n" in kwargs  # noqa: S101
        pandas_kwargs: dict[str, PythonLiteral] = {"periods": kwargs["n"]}
    elif function_name == "rank":
        assert "method" in kwargs  # noqa: S101
        assert "descending" in kwargs  # noqa: S101
        _method = kwargs["method"]
        pandas_kwargs = {
            "method": "first" if _method == "ordinal" else _method,
            "ascending": not kwargs["descending"],
            "na_option": "keep",
            "pct": False,
        }
    elif function_name.startswith("cum_"):  # Cumulative operation
        pandas_kwargs = {"skipna": True}
    elif function_name.startswith("rolling_"):  # Rolling operation
        assert "min_samples" in kwargs  # noqa: S101
        assert "window_size" in kwargs  # noqa: S101
        assert "center" in kwargs  # noqa: S101
        pandas_kwargs = {
            "min_periods": kwargs["min_samples"],
            "window": kwargs["window_size"],
            "center": kwargs["center"],
        }
    elif function_name in {"std", "var"}:
        assert "ddof" in kwargs  # noqa: S101
        pandas_kwargs = {"ddof": kwargs["ddof"]}
    elif function_name == "fill_null":
        assert "strategy" in kwargs  # noqa: S101
        assert "limit" in kwargs  # noqa: S101
        pandas_kwargs = {"strategy": kwargs["strategy"], "limit": kwargs["limit"]}
    elif function_name == "quantile":
        assert "quantile" in kwargs  # noqa: S101
        assert "interpolation" in kwargs  # noqa: S101
        pandas_kwargs = {
            "q": kwargs["quantile"],
            "interpolation": kwargs["interpolation"],
        }
    elif function_name.startswith("ewm_"):
        assert "com" in kwargs  # noqa: S101
        assert "span" in kwargs  # noqa: S101
        assert "half_life" in kwargs  # noqa: S101
        assert "alpha" in kwargs  # noqa: S101
        assert "adjust" in kwargs  # noqa: S101
        assert "min_samples" in kwargs  # noqa: S101
        assert "ignore_nulls" in kwargs  # noqa: S101

        pandas_kwargs = {
            "com": kwargs["com"],
            "span": kwargs["span"],
            "halflife": kwargs["half_life"],
            "alpha": kwargs["alpha"],
            "adjust": kwargs["adjust"],
            "min_periods": kwargs["min_samples"],
            "ignore_na": kwargs["ignore_nulls"],
        }
    else:  # sum, len, ...
        pandas_kwargs = {}
    return pandas_kwargs
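To make the mapping concrete, here is what the function returns for two hypothetical inputs (values chosen purely for illustration):

```python
window_kwargs_to_pandas_equivalent("rank", {"method": "ordinal", "descending": True})
# -> {"method": "first", "ascending": False, "na_option": "keep", "pct": False}

window_kwargs_to_pandas_equivalent(
    "rolling_mean", {"min_samples": 1, "window_size": 3, "center": False}
)
# -> {"min_periods": 1, "window": 3, "center": False}
```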

class PandasLikeExpr(EagerExpr["PandasLikeDataFrame", PandasLikeSeries]):
    def __init__(
        self,
        call: EvalSeries[PandasLikeDataFrame, PandasLikeSeries],
        *,
        depth: int,
        function_name: str,
        evaluate_output_names: EvalNames[PandasLikeDataFrame],
        alias_output_names: AliasNames | None,
        implementation: Implementation,
        version: Version,
        scalar_kwargs: ScalarKwargs | None = None,
    ) -> None:
        self._call = call
        self._depth = depth
        self._function_name = function_name
        self._evaluate_output_names = evaluate_output_names
        self._alias_output_names = alias_output_names
        self._implementation = implementation
        self._version = version
        self._scalar_kwargs = scalar_kwargs or {}
        self._metadata: ExprMetadata | None = None

    def __narwhals_namespace__(self) -> PandasLikeNamespace:
        from narwhals._pandas_like.namespace import PandasLikeNamespace

        return PandasLikeNamespace(self._implementation, version=self._version)

    @classmethod
    def from_column_names(
        cls: type[Self],
        evaluate_column_names: EvalNames[PandasLikeDataFrame],
        /,
        *,
        context: _LimitedContext,
        function_name: str = "",
    ) -> Self:
        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            try:
                return [
                    PandasLikeSeries(
                        df._native_frame[column_name],
                        implementation=df._implementation,
                        version=df._version,
                    )
                    for column_name in evaluate_column_names(df)
                ]
            except KeyError as e:
                if error := df._check_columns_exist(evaluate_column_names(df)):
                    raise error from e
                raise

        return cls(
            func,
            depth=0,
            function_name=function_name,
            evaluate_output_names=evaluate_column_names,
            alias_output_names=None,
            implementation=context._implementation,
            version=context._version,
        )

    @classmethod
    def from_column_indices(cls, *column_indices: int, context: _LimitedContext) -> Self:
        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            native = df.native
            return [
                PandasLikeSeries.from_native(native.iloc[:, i], context=df)
                for i in column_indices
            ]

        return cls(
            func,
            depth=0,
            function_name="nth",
            evaluate_output_names=cls._eval_names_indices(column_indices),
            alias_output_names=None,
            implementation=context._implementation,
            version=context._version,
        )

    def ewm_mean(
        self,
        *,
        com: float | None,
        span: float | None,
        half_life: float | None,
        alpha: float | None,
        adjust: bool,
        min_samples: int,
        ignore_nulls: bool,
    ) -> Self:
        return self._reuse_series(
            "ewm_mean",
            scalar_kwargs={
                "com": com,
                "span": span,
                "half_life": half_life,
                "alpha": alpha,
                "adjust": adjust,
                "min_samples": min_samples,
                "ignore_nulls": ignore_nulls,
            },
        )

    def over(  # noqa: C901, PLR0915
        self, partition_by: Sequence[str], order_by: Sequence[str]
    ) -> Self:
        if not partition_by:
            # e.g. `nw.col('a').cum_sum().order_by(key)`
            # We can always easily support this as it doesn't require grouping.
            assert order_by  # noqa: S101

            def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]:
                token = generate_temporary_column_name(8, df.columns)
                df = df.with_row_index(token, order_by=None).sort(
                    *order_by, descending=False, nulls_last=False
                )
                results = self(df.drop([token], strict=True))
                sorting_indices = df.get_column(token)
                for s in results:
                    s._scatter_in_place(sorting_indices, s)
                return results
        elif not self._is_elementary():
            msg = (
                "Only elementary expressions are supported for `.over` in pandas-like backends.\n\n"
                "Please see: "
                "https://narwhals-dev.github.io/narwhals/concepts/improve_group_by_operation/"
            )
            raise NotImplementedError(msg)
        else:
            function_name = PandasLikeGroupBy._leaf_name(self)
            pandas_function_name = WINDOW_FUNCTIONS_TO_PANDAS_EQUIVALENT.get(
                function_name, PandasLikeGroupBy._REMAP_AGGS.get(function_name)
            )
            if pandas_function_name is None:
                msg = (
                    f"Unsupported function: {function_name} in `over` context.\n\n"
                    f"Supported functions are {', '.join(WINDOW_FUNCTIONS_TO_PANDAS_EQUIVALENT)}\n"
                    f"and {', '.join(PandasLikeGroupBy._REMAP_AGGS)}."
                )
                raise NotImplementedError(msg)
            pandas_kwargs = window_kwargs_to_pandas_equivalent(
                function_name, self._scalar_kwargs
            )

            def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]:  # noqa: C901, PLR0912, PLR0914, PLR0915
                output_names, aliases = evaluate_output_names_and_aliases(self, df, [])
                if function_name == "cum_count":
                    plx = self.__narwhals_namespace__()
                    df = df.with_columns(~plx.col(*output_names).is_null())

                if function_name.startswith("cum_"):
                    assert "reverse" in self._scalar_kwargs  # noqa: S101
                    reverse = self._scalar_kwargs["reverse"]
                else:
                    assert "reverse" not in self._scalar_kwargs  # noqa: S101
                    reverse = False

                if order_by:
                    columns = list(set(partition_by).union(output_names).union(order_by))
                    token = generate_temporary_column_name(8, columns)
                    df = (
                        df.simple_select(*columns)
                        .with_row_index(token, order_by=None)
                        .sort(*order_by, descending=reverse, nulls_last=reverse)
                    )
                    sorting_indices = df.get_column(token)
                elif reverse:
                    columns = list(set(partition_by).union(output_names))
                    df = df.simple_select(*columns)._gather_slice(slice(None, None, -1))
                grouped = df._native_frame.groupby(partition_by)
                if function_name.startswith("rolling"):
                    rolling = grouped[list(output_names)].rolling(**pandas_kwargs)
                    assert pandas_function_name is not None  # help mypy  # noqa: S101
                    if pandas_function_name in {"std", "var"}:
                        assert "ddof" in self._scalar_kwargs  # noqa: S101
                        res_native = getattr(rolling, pandas_function_name)(
                            ddof=self._scalar_kwargs["ddof"]
                        )
                    else:
                        res_native = getattr(rolling, pandas_function_name)()
                elif function_name.startswith("ewm"):
                    if self._implementation.is_pandas() and (
                        self._implementation._backend_version()
                    ) < (1, 2):  # pragma: no cover
                        msg = (
                            "Exponentially weighted calculation is not available in over "
                            f"context for pandas versions older than 1.2.0, found {self._implementation._backend_version()}."
                        )
                        raise NotImplementedError(msg)
                    ewm = grouped[list(output_names)].ewm(**pandas_kwargs)
                    assert pandas_function_name is not None  # help mypy  # noqa: S101
                    res_native = getattr(ewm, pandas_function_name)()
                elif function_name == "fill_null":
                    assert "strategy" in self._scalar_kwargs  # noqa: S101
                    assert "limit" in self._scalar_kwargs  # noqa: S101
                    df_grouped = grouped[list(output_names)]
                    if self._scalar_kwargs["strategy"] == "forward":
                        res_native = df_grouped.ffill(limit=self._scalar_kwargs["limit"])
                    elif self._scalar_kwargs["strategy"] == "backward":
                        res_native = df_grouped.bfill(limit=self._scalar_kwargs["limit"])
                    else:  # pragma: no cover
                        # This is deprecated in pandas. Indeed, `nw.col('a').fill_null(3).over('b')`
                        # does not seem very useful, and DuckDB doesn't support it either.
                        msg = "`fill_null` with `over` without `strategy` specified is not supported."
                        raise NotImplementedError(msg)
                elif function_name == "len":
                    if len(output_names) != 1:  # pragma: no cover
                        msg = "Safety check failed, please report a bug."
                        raise AssertionError(msg)
                    res_native = grouped.transform("size").to_frame(aliases[0])
                else:
                    res_native = grouped[list(output_names)].transform(
                        pandas_function_name, **pandas_kwargs
                    )
                result_frame = df._with_native(res_native).rename(
                    dict(zip(output_names, aliases))
                )
                results = [result_frame.get_column(name) for name in aliases]
                if order_by:
                    for s in results:
                        s._scatter_in_place(sorting_indices, s)
                    return results
                if reverse:
                    return [s._gather_slice(slice(None, None, -1)) for s in results]
                return results

        return self.__class__(
            func,
            depth=self._depth + 1,
            function_name=self._function_name + "->over",
            evaluate_output_names=self._evaluate_output_names,
            alias_output_names=self._alias_output_names,
            implementation=self._implementation,
            version=self._version,
        )
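In plain pandas terms, the partitioned branch of `over` boils down to a grouped `transform`; a minimal sketch with hypothetical data:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]})
# Roughly what `nw.col("a").cum_sum().over("b")` dispatches to:
out = df.groupby("b")[["a"]].transform("cumsum")
print(out["a"].tolist())  # [1, 3, 3, 7] - cumulative sums restart per group
```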
365
lib/python3.11/site-packages/narwhals/_pandas_like/group_by.py
Normal file
@@ -0,0 +1,365 @@
from __future__ import annotations

import warnings
from functools import lru_cache
from itertools import chain
from operator import methodcaller
from typing import TYPE_CHECKING, Any, ClassVar, Literal

from narwhals._compliant import EagerGroupBy
from narwhals._exceptions import issue_warning
from narwhals._expression_parsing import evaluate_output_names_and_aliases
from narwhals._utils import zip_strict
from narwhals.dependencies import is_pandas_like_dataframe

if TYPE_CHECKING:
    from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence

    import pandas as pd
    from pandas.api.typing import DataFrameGroupBy as _NativeGroupBy
    from typing_extensions import TypeAlias, Unpack

    from narwhals._compliant.typing import NarwhalsAggregation, ScalarKwargs
    from narwhals._pandas_like.dataframe import PandasLikeDataFrame
    from narwhals._pandas_like.expr import PandasLikeExpr

    NativeGroupBy: TypeAlias = "_NativeGroupBy[tuple[str, ...], Literal[True]]"

NativeApply: TypeAlias = "Callable[[pd.DataFrame], pd.Series[Any]]"
InefficientNativeAggregation: TypeAlias = Literal["cov", "skew"]
NativeAggregation: TypeAlias = Literal[
    "any",
    "all",
    "count",
    "first",
    "idxmax",
    "idxmin",
    "last",
    "max",
    "mean",
    "median",
    "min",
    "mode",
    "nunique",
    "prod",
    "quantile",
    "sem",
    "size",
    "std",
    "sum",
    "var",
    InefficientNativeAggregation,
]
"""https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#built-in-aggregation-methods"""

_NativeAgg: TypeAlias = "Callable[[Any], pd.DataFrame | pd.Series[Any]]"
"""Equivalent to a partial method call on `DataFrameGroupBy`."""


NonStrHashable: TypeAlias = Any
"""Because `pandas` allows *"names"* like that 😭"""


@lru_cache(maxsize=32)
def _native_agg(name: NativeAggregation, /, **kwds: Unpack[ScalarKwargs]) -> _NativeAgg:
    if name == "nunique":
        return methodcaller(name, dropna=False)
    if not kwds or kwds.get("ddof") == 1:
        return methodcaller(name)
    return methodcaller(name, **kwds)
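For reference, `methodcaller` is what makes the cached result a "partial method call on `DataFrameGroupBy`": calling it with a groupby object invokes the named aggregation. A small sketch with hypothetical data:

```python
import pandas as pd
from operator import methodcaller

df = pd.DataFrame({"key": ["a", "a", "b"], "v": [1.0, 3.0, 4.0]})
agg = methodcaller("std", ddof=0)   # what _native_agg("std", ddof=0) returns
print(agg(df.groupby("key")["v"]))  # same as df.groupby("key")["v"].std(ddof=0)
```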

class AggExpr:
    """Wrapper storing the intermediate state per-`PandasLikeExpr`.

    There's a lot of edge cases to handle, so aim to evaluate as little
    as possible - and store anything that's needed twice.

    Warning:
        While a `PandasLikeExpr` can be reused - this wrapper is valid **only**
        in a single `.agg(...)` operation.
    """

    expr: PandasLikeExpr
    output_names: Sequence[str]
    aliases: Sequence[str]

    def __init__(self, expr: PandasLikeExpr) -> None:
        self.expr = expr
        self.output_names = ()
        self.aliases = ()
        self._leaf_name: NarwhalsAggregation | Any = ""

    def with_expand_names(self, group_by: PandasLikeGroupBy, /) -> AggExpr:
        """**Mutating operation**.

        Stores the results of `evaluate_output_names_and_aliases`.
        """
        df = group_by.compliant
        exclude = group_by.exclude
        self.output_names, self.aliases = evaluate_output_names_and_aliases(
            self.expr, df, exclude
        )
        return self

    def _getitem_aggs(
        self, group_by: PandasLikeGroupBy, /
    ) -> pd.DataFrame | pd.Series[Any]:
        """Evaluate the wrapped expression as a group_by operation."""
        result: pd.DataFrame | pd.Series[Any]
        names = self.output_names
        if self.is_len() and self.is_top_level_function():
            result = group_by._grouped.size()
        elif self.is_len():
            result_single = group_by._grouped.size()
            ns = group_by.compliant.__narwhals_namespace__()
            result = ns._concat_horizontal(
                [ns.from_native(result_single).alias(name).native for name in names]
            )
        elif self.is_mode():
            compliant = group_by.compliant
            if (keep := self.kwargs.get("keep")) != "any":  # pragma: no cover
                msg = (
                    f"`Expr.mode(keep='{keep}')` is not implemented in group by context for "
                    f"backend {compliant._implementation}\n\n"
                    "Hint: Use `nw.col(...).mode(keep='any')` instead."
                )
                raise NotImplementedError(msg)

            cols = list(names)
            native = compliant.native
            keys, kwargs = group_by._keys, group_by._kwargs

            # Implementation based on the following suggestion:
            # https://github.com/pandas-dev/pandas/issues/19254#issuecomment-778661578
            ns = compliant.__narwhals_namespace__()
            result = ns._concat_horizontal(
                [
                    native.groupby([*keys, col], **kwargs)
                    .size()
                    .sort_values(ascending=False)
                    .reset_index(col)
                    .groupby(keys, **kwargs)[col]
                    .head(1)
                    .sort_index()
                    for col in cols
                ]
            )
        else:
            select = names[0] if len(names) == 1 else list(names)
            result = self.native_agg()(group_by._grouped[select])
        if is_pandas_like_dataframe(result):
            result.columns = list(self.aliases)
        else:
            result.name = self.aliases[0]
        return result
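The pandas-issue suggestion linked above is easier to follow in isolation; a minimal sketch of the per-group mode trick on plain pandas (hypothetical data):

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"], "v": [1, 1, 2, 3, 3]})
mode_per_group = (
    df.groupby(["key", "v"])
    .size()                        # count each (key, v) pair
    .sort_values(ascending=False)  # most frequent pairs first
    .reset_index("v")
    .groupby("key")["v"]
    .head(1)                       # keep one value per key
    .sort_index()
)
print(mode_per_group.tolist())  # [1, 3]
```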

    def is_len(self) -> bool:
        return self.leaf_name == "len"

    def is_mode(self) -> bool:
        return self.leaf_name == "mode"

    def is_top_level_function(self) -> bool:
        # e.g. `nw.len()`.
        return self.expr._depth == 0

    @property
    def kwargs(self) -> ScalarKwargs:
        return self.expr._scalar_kwargs

    @property
    def leaf_name(self) -> NarwhalsAggregation | Any:
        if name := self._leaf_name:
            return name
        self._leaf_name = PandasLikeGroupBy._leaf_name(self.expr)
        return self._leaf_name

    def native_agg(self) -> _NativeAgg:
        """Return a partial `DataFrameGroupBy` method, missing only `self`."""
        return _native_agg(
            PandasLikeGroupBy._remap_expr_name(self.leaf_name), **self.kwargs
        )


class PandasLikeGroupBy(
    EagerGroupBy["PandasLikeDataFrame", "PandasLikeExpr", NativeAggregation]
):
    _REMAP_AGGS: ClassVar[Mapping[NarwhalsAggregation, NativeAggregation]] = {
        "sum": "sum",
        "mean": "mean",
        "median": "median",
        "max": "max",
        "min": "min",
        "mode": "mode",
        "std": "std",
        "var": "var",
        "len": "size",
        "n_unique": "nunique",
        "count": "count",
        "quantile": "quantile",
        "all": "all",
        "any": "any",
    }
    _original_columns: tuple[str, ...]
    """Column names *prior* to any aliasing in `ParseKeysGroupBy`."""

    _keys: list[str]
    """Stores the **aliased** version of group keys from `ParseKeysGroupBy`."""

    _output_key_names: list[str]
    """Stores the **original** version of group keys."""

    _kwargs: Mapping[str, bool]
    """Stores keyword arguments for `DataFrame.groupby` other than `by`."""

    @property
    def exclude(self) -> tuple[str, ...]:
        """Group keys to ignore when expanding multi-output aggregations."""
        return self._exclude

    def __init__(
        self,
        df: PandasLikeDataFrame,
        keys: Sequence[PandasLikeExpr] | Sequence[str],
        /,
        *,
        drop_null_keys: bool,
    ) -> None:
        self._original_columns = tuple(df.columns)
        self._drop_null_keys = drop_null_keys
        self._compliant_frame, self._keys, self._output_key_names = self._parse_keys(
            df, keys
        )
        self._exclude: tuple[str, ...] = (*self._keys, *self._output_key_names)
        # Drop index to avoid potential collisions:
        # https://github.com/narwhals-dev/narwhals/issues/1907.
        native = self.compliant.native
        if set(native.index.names).intersection(self.compliant.columns):
            native = native.reset_index(drop=True)

        self._kwargs = {
            "sort": False,
            "as_index": True,
            "dropna": drop_null_keys,
            "observed": True,
        }
        self._grouped: NativeGroupBy = native.groupby(self._keys.copy(), **self._kwargs)

    def agg(self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:
        all_aggs_are_simple = True
        agg_exprs: list[AggExpr] = []
        for expr in exprs:
            agg_exprs.append(AggExpr(expr).with_expand_names(self))
            if not self._is_simple(expr):
                all_aggs_are_simple = False

        if all_aggs_are_simple:
            result: pd.DataFrame
            if agg_exprs:
                ns = self.compliant.__narwhals_namespace__()
                result = ns._concat_horizontal(self._getitem_aggs(agg_exprs))
            else:
                result = self.compliant.__native_namespace__().DataFrame(
                    list(self._grouped.groups), columns=self._keys
                )
        elif self.compliant.native.empty:
            raise empty_results_error()
        else:
            result = self._apply_aggs(exprs)
        # NOTE: Keep `inplace=True` to avoid making a redundant copy.
        # This may need updating, depending on https://github.com/pandas-dev/pandas/pull/51466/files
        result.reset_index(inplace=True)  # noqa: PD002
        return self._select_results(result, agg_exprs)

    def _select_results(
        self, df: pd.DataFrame, /, agg_exprs: Sequence[AggExpr]
    ) -> PandasLikeDataFrame:
        """Responsible for remapping temp column names back to original.

        See `ParseKeysGroupBy`.
        """
        new_names = chain.from_iterable(e.aliases for e in agg_exprs)
        return (
            self.compliant._with_native(df, validate_column_names=False)
            .simple_select(*self._keys, *new_names)
            .rename(dict(zip(self._keys, self._output_key_names)))
        )

    def _getitem_aggs(
        self, exprs: Iterable[AggExpr], /
    ) -> list[pd.DataFrame | pd.Series[Any]]:
        return [e._getitem_aggs(self) for e in exprs]

    def _apply_aggs(self, exprs: Iterable[PandasLikeExpr]) -> pd.DataFrame:
        """Stub issue for `include_groups` [pandas-dev/pandas-stubs#1270].

        - [User guide] mentions `include_groups` 4 times without deprecation.
        - [`DataFrameGroupBy.apply`] doc says the default value of `True` is deprecated since `2.2.0`.
        - `False` is explicitly the only *non-deprecated* option, but entirely omitted since [pandas-dev/pandas-stubs#1268].

        [pandas-dev/pandas-stubs#1270]: https://github.com/pandas-dev/pandas-stubs/issues/1270
        [User guide]: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html
        [`DataFrameGroupBy.apply`]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.DataFrameGroupBy.apply.html
        [pandas-dev/pandas-stubs#1268]: https://github.com/pandas-dev/pandas-stubs/pull/1268
        """
        warn_complex_group_by()
        impl = self.compliant._implementation
        func = self._apply_exprs_function(exprs)
        apply = self._grouped.apply
        if impl.is_pandas() and impl._backend_version() >= (2, 2):
            return apply(func, include_groups=False)  # type: ignore[call-overload]
        return apply(func)  # pragma: no cover

    def _apply_exprs_function(self, exprs: Iterable[PandasLikeExpr]) -> NativeApply:
        ns = self.compliant.__narwhals_namespace__()
        into_series = ns._series.from_iterable

        def fn(df: pd.DataFrame) -> pd.Series[Any]:
            compliant = self.compliant._with_native(df)
            results = (
                (keys.native.iloc[0], keys.name)
                for expr in exprs
                for keys in expr(compliant)
            )
            out_group, out_names = zip_strict(*results) if results else ([], [])
            return into_series(out_group, index=out_names, context=ns).native

        return fn

    def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]:
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message=".*a length 1 tuple will be returned",
                category=FutureWarning,
            )
            with_native = self.compliant._with_native
            for key, group in self._grouped:
                yield (key, with_native(group).simple_select(*self._original_columns))


def empty_results_error() -> ValueError:
    """Don't even attempt this, it's way too inconsistent across pandas versions."""
    msg = (
        "No results for group-by aggregation.\n\n"
        "Hint: you were probably trying to apply a non-elementary aggregation with a "
        "pandas-like API.\n"
        "Please rewrite your query such that group-by aggregations "
        "are elementary. For example, instead of:\n\n"
        "    df.group_by('a').agg(nw.col('b').round(2).mean())\n\n"
        "use:\n\n"
        "    df.with_columns(nw.col('b').round(2)).group_by('a').agg(nw.col('b').mean())\n\n"
    )
    return ValueError(msg)


def warn_complex_group_by() -> None:
    issue_warning(
        "Found complex group-by expression, which can't be expressed efficiently with the "
        "pandas API. If you can, please rewrite your query such that group-by aggregations "
        "are simple (e.g. mean, std, min, max, ...).\n\n"
        "Please see: "
        "https://narwhals-dev.github.io/narwhals/concepts/improve_group_by_operation/",
        UserWarning,
    )
441
lib/python3.11/site-packages/narwhals/_pandas_like/namespace.py
Normal file
@@ -0,0 +1,441 @@
from __future__ import annotations

import operator
import warnings
from functools import reduce
from itertools import chain
from typing import TYPE_CHECKING, Any, Literal, Protocol, overload

from narwhals._compliant import CompliantThen, EagerNamespace, EagerWhen
from narwhals._expression_parsing import (
    combine_alias_output_names,
    combine_evaluate_output_names,
)
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
from narwhals._pandas_like.expr import PandasLikeExpr
from narwhals._pandas_like.selectors import PandasSelectorNamespace
from narwhals._pandas_like.series import PandasLikeSeries
from narwhals._pandas_like.typing import NativeDataFrameT, NativeSeriesT
from narwhals._pandas_like.utils import is_non_nullable_boolean
from narwhals._utils import zip_strict

if TYPE_CHECKING:
    from collections.abc import Iterable, Sequence

    from typing_extensions import TypeAlias

    from narwhals._compliant.typing import ScalarKwargs
    from narwhals._utils import Implementation, Version
    from narwhals.typing import IntoDType, NonNestedLiteral


Incomplete: TypeAlias = Any
"""Escape hatch, but leaving a trace that this isn't ideal."""


_Vertical: TypeAlias = Literal[0]
_Horizontal: TypeAlias = Literal[1]
Axis: TypeAlias = Literal[_Vertical, _Horizontal]

VERTICAL: _Vertical = 0
HORIZONTAL: _Horizontal = 1

class PandasLikeNamespace(
    EagerNamespace[
        PandasLikeDataFrame,
        PandasLikeSeries,
        PandasLikeExpr,
        NativeDataFrameT,
        NativeSeriesT,
    ]
):
    @property
    def _dataframe(self) -> type[PandasLikeDataFrame]:
        return PandasLikeDataFrame

    @property
    def _expr(self) -> type[PandasLikeExpr]:
        return PandasLikeExpr

    @property
    def _series(self) -> type[PandasLikeSeries]:
        return PandasLikeSeries

    @property
    def selectors(self) -> PandasSelectorNamespace:
        return PandasSelectorNamespace.from_namespace(self)

    def __init__(self, implementation: Implementation, version: Version) -> None:
        self._implementation = implementation
        self._version = version

    def coalesce(self, *exprs: PandasLikeExpr) -> PandasLikeExpr:
        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            align = self._series._align_full_broadcast
            series = align(*(s for _expr in exprs for s in _expr(df)))
            return [
                reduce(lambda x, y: x.fill_null(y, strategy=None, limit=None), series)
            ]

        return self._expr._from_callable(
            func=func,
            depth=max(x._depth for x in exprs) + 1,
            function_name="coalesce",
            evaluate_output_names=combine_evaluate_output_names(*exprs),
            alias_output_names=combine_alias_output_names(*exprs),
            context=self,
        )
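The `reduce` above left-folds `fill_null`, which behaves like SQL `COALESCE`; in plain pandas terms (hypothetical data):

```python
import pandas as pd
from functools import reduce

a = pd.Series([1.0, None, None])
b = pd.Series([None, 2.0, None])
c = pd.Series([9.0, 9.0, 9.0])
# Each step keeps x where present and takes y otherwise.
print(reduce(lambda x, y: x.fillna(y), [a, b, c]).tolist())  # [1.0, 2.0, 9.0]
```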

    def lit(self, value: NonNestedLiteral, dtype: IntoDType | None) -> PandasLikeExpr:
        def _lit_pandas_series(df: PandasLikeDataFrame) -> PandasLikeSeries:
            pandas_series = self._series.from_iterable(
                data=[value],
                name="literal",
                index=df._native_frame.index[0:1],
                context=self,
            )
            if dtype:
                return pandas_series.cast(dtype)
            return pandas_series

        return PandasLikeExpr(
            lambda df: [_lit_pandas_series(df)],
            depth=0,
            function_name="lit",
            evaluate_output_names=lambda _df: ["literal"],
            alias_output_names=None,
            implementation=self._implementation,
            version=self._version,
        )

    def len(self) -> PandasLikeExpr:
        return PandasLikeExpr(
            lambda df: [
                self._series.from_iterable(
                    [len(df._native_frame)], name="len", index=[0], context=self
                )
            ],
            depth=0,
            function_name="len",
            evaluate_output_names=lambda _df: ["len"],
            alias_output_names=None,
            implementation=self._implementation,
            version=self._version,
        )

    # --- horizontal ---
    def sum_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr:
        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            align = self._series._align_full_broadcast
            it = chain.from_iterable(expr(df) for expr in exprs)
            series = align(*it)
            native_series = (s.fill_null(0, None, None) for s in series)
            return [reduce(operator.add, native_series)]

        return self._expr._from_callable(
            func=func,
            depth=max(x._depth for x in exprs) + 1,
            function_name="sum_horizontal",
            evaluate_output_names=combine_evaluate_output_names(*exprs),
            alias_output_names=combine_alias_output_names(*exprs),
            context=self,
        )

    def all_horizontal(
        self, *exprs: PandasLikeExpr, ignore_nulls: bool
    ) -> PandasLikeExpr:
        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            align = self._series._align_full_broadcast
            series = [s for _expr in exprs for s in _expr(df)]
            if not ignore_nulls and any(
                s.native.dtype == "object" and s.is_null().any() for s in series
            ):
                # classical NumPy boolean columns don't support missing values, so
                # only do the full scan with `is_null` if we have `object` dtype.
                msg = "Cannot use `ignore_nulls=False` in `all_horizontal` for non-nullable NumPy-backed pandas Series when nulls are present."
                raise ValueError(msg)
            it = (
                (
                    # NumPy-backed 'bool' dtype can't contain nulls so doesn't need filling.
                    s if is_non_nullable_boolean(s) else s.fill_null(True, None, None)
                    for s in series
                )
                if ignore_nulls
                else iter(series)
            )
            return [reduce(operator.and_, align(*it))]

        return self._expr._from_callable(
            func=func,
            depth=max(x._depth for x in exprs) + 1,
            function_name="all_horizontal",
            evaluate_output_names=combine_evaluate_output_names(*exprs),
            alias_output_names=combine_alias_output_names(*exprs),
            context=self,
        )
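A small sketch of the `ignore_nulls=True` path above, on nullable pandas Series (hypothetical data): filling nulls with `True` before the AND-fold ensures missing values cannot flip a result to `<NA>`:

```python
import pandas as pd

a = pd.Series([True, False, None], dtype="boolean")
b = pd.Series([True, True, True], dtype="boolean")
# Nulls are treated as True (the identity of AND) before folding.
print((a.fillna(True) & b).tolist())  # [True, False, True]
```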

    def any_horizontal(
        self, *exprs: PandasLikeExpr, ignore_nulls: bool
    ) -> PandasLikeExpr:
        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            align = self._series._align_full_broadcast
            series = [s for _expr in exprs for s in _expr(df)]
            if not ignore_nulls and any(
                s.native.dtype == "object" and s.is_null().any() for s in series
            ):
                # classical NumPy boolean columns don't support missing values, so
                # only do the full scan with `is_null` if we have `object` dtype.
                msg = "Cannot use `ignore_nulls=False` in `any_horizontal` for non-nullable NumPy-backed pandas Series when nulls are present."
                raise ValueError(msg)
            it = (
                (
                    # NumPy-backed 'bool' dtype can't contain nulls so doesn't need filling.
                    s if is_non_nullable_boolean(s) else s.fill_null(False, None, None)
                    for s in series
                )
                if ignore_nulls
                else iter(series)
            )
            return [reduce(operator.or_, align(*it))]

        return self._expr._from_callable(
            func=func,
            depth=max(x._depth for x in exprs) + 1,
            function_name="any_horizontal",
            evaluate_output_names=combine_evaluate_output_names(*exprs),
            alias_output_names=combine_alias_output_names(*exprs),
            context=self,
        )

    def mean_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr:
        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            expr_results = [s for _expr in exprs for s in _expr(df)]
            align = self._series._align_full_broadcast
            series = align(
                *(s.fill_null(0, strategy=None, limit=None) for s in expr_results)
            )
            non_na = align(*(1 - s.is_null() for s in expr_results))
            return [reduce(operator.add, series) / reduce(operator.add, non_na)]

        return self._expr._from_callable(
            func=func,
            depth=max(x._depth for x in exprs) + 1,
            function_name="mean_horizontal",
            evaluate_output_names=combine_evaluate_output_names(*exprs),
            alias_output_names=combine_alias_output_names(*exprs),
            context=self,
        )

    def min_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr:
        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            it = chain.from_iterable(expr(df) for expr in exprs)
            align = self._series._align_full_broadcast
            series = align(*it)

            return [
                PandasLikeSeries(
                    self.concat(
                        (s.to_frame() for s in series), how="horizontal"
                    )._native_frame.min(axis=1),
                    implementation=self._implementation,
                    version=self._version,
                ).alias(series[0].name)
            ]

        return self._expr._from_callable(
            func=func,
            depth=max(x._depth for x in exprs) + 1,
            function_name="min_horizontal",
            evaluate_output_names=combine_evaluate_output_names(*exprs),
            alias_output_names=combine_alias_output_names(*exprs),
            context=self,
        )

    def max_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr:
        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            it = chain.from_iterable(expr(df) for expr in exprs)
            align = self._series._align_full_broadcast
            series = align(*it)

            return [
                PandasLikeSeries(
                    self.concat(
                        (s.to_frame() for s in series), how="horizontal"
                    ).native.max(axis=1),
                    implementation=self._implementation,
                    version=self._version,
                ).alias(series[0].name)
            ]

        return self._expr._from_callable(
            func=func,
            depth=max(x._depth for x in exprs) + 1,
            function_name="max_horizontal",
            evaluate_output_names=combine_evaluate_output_names(*exprs),
            alias_output_names=combine_alias_output_names(*exprs),
            context=self,
        )

    @property
    def _concat(self) -> _NativeConcat[NativeDataFrameT, NativeSeriesT]:
        """Concatenate pandas objects along a particular axis.

        Return the **native** equivalent of `pd.concat`.
        """
        return self._implementation.to_native_namespace().concat

    def _concat_diagonal(self, dfs: Sequence[NativeDataFrameT], /) -> NativeDataFrameT:
        if self._implementation.is_pandas() and self._backend_version < (3,):
            return self._concat(dfs, axis=VERTICAL, copy=False)
        return self._concat(dfs, axis=VERTICAL)

    def _concat_horizontal(
        self, dfs: Sequence[NativeDataFrameT | NativeSeriesT], /
    ) -> NativeDataFrameT:
        if self._implementation.is_cudf():
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="The behavior of array concatenation with empty entries is deprecated",
                    category=FutureWarning,
                )
                return self._concat(dfs, axis=HORIZONTAL)
        elif self._implementation.is_pandas() and self._backend_version < (3,):
            return self._concat(dfs, axis=HORIZONTAL, copy=False)
        return self._concat(dfs, axis=HORIZONTAL)

    def _concat_vertical(self, dfs: Sequence[NativeDataFrameT], /) -> NativeDataFrameT:
        cols_0 = dfs[0].columns
        for i, df in enumerate(dfs[1:], start=1):
            cols_current = df.columns
            if not (
                (len(cols_current) == len(cols_0)) and (cols_current == cols_0).all()
            ):
                msg = (
                    "unable to vstack, column names don't match:\n"
                    f"   - dataframe 0: {cols_0.to_list()}\n"
                    f"   - dataframe {i}: {cols_current.to_list()}\n"
                )
                raise TypeError(msg)
        if self._implementation.is_pandas() and self._backend_version < (3,):
            return self._concat(dfs, axis=VERTICAL, copy=False)
        return self._concat(dfs, axis=VERTICAL)
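For reference, the two axes these helpers dispatch on (plain pandas, hypothetical frames); `copy=False` is only passed on pandas < 3.0, where the keyword still exists:

```python
import pandas as pd

left = pd.DataFrame({"a": [1, 2]})
right = pd.DataFrame({"b": [3, 4]})
pd.concat([left, right], axis=1)  # horizontal: 2 rows x 2 columns
pd.concat([left, left], axis=0)   # vertical: 4 rows x 1 column
```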

    def when(self, predicate: PandasLikeExpr) -> PandasWhen[NativeSeriesT]:
        return PandasWhen[NativeSeriesT].from_expr(predicate, context=self)

    def concat_str(
        self, *exprs: PandasLikeExpr, separator: str, ignore_nulls: bool
    ) -> PandasLikeExpr:
        string = self._version.dtypes.String()

        def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
            expr_results = [s for _expr in exprs for s in _expr(df)]
            align = self._series._align_full_broadcast
            series = align(*(s.cast(string) for s in expr_results))
            null_mask = align(*(s.is_null() for s in expr_results))

            if not ignore_nulls:
                null_mask_result = reduce(operator.or_, null_mask)
                result = reduce(lambda x, y: x + separator + y, series).zip_with(
                    ~null_mask_result, None
                )
            else:
                # NOTE: Trying to help `mypy` later
                # error: Cannot determine type of "values"  [has-type]
                values: list[PandasLikeSeries]
                init_value, *values = [
                    s.zip_with(~nm, "") for s, nm in zip_strict(series, null_mask)
                ]

                sep_array = init_value.from_iterable(
                    data=[separator] * len(init_value),
                    name="sep",
                    index=init_value.native.index,
                    context=self,
                )
                separators = (sep_array.zip_with(~nm, "") for nm in null_mask[:-1])
                result = reduce(
                    operator.add,
                    (s + v for s, v in zip_strict(separators, values)),
                    init_value,
                )

            return [result]

        return self._expr._from_callable(
            func=func,
            depth=max(x._depth for x in exprs) + 1,
            function_name="concat_str",
            evaluate_output_names=combine_evaluate_output_names(*exprs),
            alias_output_names=combine_alias_output_names(*exprs),
            context=self,
        )


class _NativeConcat(Protocol[NativeDataFrameT, NativeSeriesT]):
    @overload
    def __call__(
        self,
        objs: Iterable[NativeDataFrameT],
        *,
        axis: _Vertical,
        copy: bool | None = ...,
    ) -> NativeDataFrameT: ...
    @overload
    def __call__(
        self, objs: Iterable[NativeSeriesT], *, axis: _Vertical, copy: bool | None = ...
    ) -> NativeSeriesT: ...
    @overload
    def __call__(
        self,
        objs: Iterable[NativeDataFrameT | NativeSeriesT],
        *,
        axis: _Horizontal,
        copy: bool | None = ...,
    ) -> NativeDataFrameT: ...
    @overload
    def __call__(
        self,
        objs: Iterable[NativeDataFrameT | NativeSeriesT],
        *,
        axis: Axis,
        copy: bool | None = ...,
    ) -> NativeDataFrameT | NativeSeriesT: ...

    def __call__(
        self,
        objs: Iterable[NativeDataFrameT | NativeSeriesT],
        *,
        axis: Axis,
        copy: bool | None = None,
    ) -> NativeDataFrameT | NativeSeriesT: ...


class PandasWhen(
    EagerWhen[PandasLikeDataFrame, PandasLikeSeries, PandasLikeExpr, NativeSeriesT]
):
    @property
    # Signature of "_then" incompatible with supertype "CompliantWhen"
    # ArrowWhen seems to follow the same pattern, but no mypy complaint there?
    def _then(self) -> type[PandasThen]:  # type: ignore[override]
        return PandasThen

    def _if_then_else(
        self,
        when: NativeSeriesT,
        then: NativeSeriesT,
        otherwise: NativeSeriesT | NonNestedLiteral,
    ) -> NativeSeriesT:
        where: Incomplete = then.where
        return where(when) if otherwise is None else where(when, otherwise)


class PandasThen(
    CompliantThen[PandasLikeDataFrame, PandasLikeSeries, PandasLikeExpr, PandasWhen],
    PandasLikeExpr,
):
    _depth: int = 0
    _scalar_kwargs: ScalarKwargs = {}  # noqa: RUF012
    _function_name: str = "whenthen"
38
lib/python3.11/site-packages/narwhals/_pandas_like/selectors.py
Normal file
@@ -0,0 +1,38 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from narwhals._compliant import CompliantSelector, EagerSelectorNamespace
from narwhals._pandas_like.expr import PandasLikeExpr

if TYPE_CHECKING:
    from narwhals._compliant.typing import ScalarKwargs
    from narwhals._pandas_like.dataframe import PandasLikeDataFrame  # noqa: F401
    from narwhals._pandas_like.series import PandasLikeSeries  # noqa: F401


class PandasSelectorNamespace(
    EagerSelectorNamespace["PandasLikeDataFrame", "PandasLikeSeries"]
):
    @property
    def _selector(self) -> type[PandasSelector]:
        return PandasSelector


class PandasSelector(  # type: ignore[misc]
    CompliantSelector["PandasLikeDataFrame", "PandasLikeSeries"], PandasLikeExpr
):
    _depth: int = 0
    _scalar_kwargs: ScalarKwargs = {}  # noqa: RUF012
    _function_name: str = "selector"

    def _to_expr(self) -> PandasLikeExpr:
        return PandasLikeExpr(
            self._call,
            depth=self._depth,
            function_name=self._function_name,
            evaluate_output_names=self._evaluate_output_names,
            alias_output_names=self._alias_output_names,
            implementation=self._implementation,
            version=self._version,
        )
1160
lib/python3.11/site-packages/narwhals/_pandas_like/series.py
Normal file
File diff suppressed because it is too large
17
lib/python3.11/site-packages/narwhals/_pandas_like/series_cat.py
Normal file
@@ -0,0 +1,17 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from narwhals._compliant.any_namespace import CatNamespace
from narwhals._pandas_like.utils import PandasLikeSeriesNamespace

if TYPE_CHECKING:
    from narwhals._pandas_like.series import PandasLikeSeries


class PandasLikeSeriesCatNamespace(
    PandasLikeSeriesNamespace, CatNamespace["PandasLikeSeries"]
):
    def get_categories(self) -> PandasLikeSeries:
        s = self.native
        return self.with_native(type(s)(s.cat.categories, name=s.name))
290
lib/python3.11/site-packages/narwhals/_pandas_like/series_dt.py
Normal file
@@ -0,0 +1,290 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from narwhals._compliant.any_namespace import DateTimeNamespace
from narwhals._constants import (
    EPOCH_YEAR,
    MS_PER_SECOND,
    NS_PER_SECOND,
    SECONDS_PER_DAY,
    US_PER_SECOND,
)
from narwhals._duration import Interval
from narwhals._pandas_like.utils import (
    ALIAS_DICT,
    UNITS_DICT,
    PandasLikeSeriesNamespace,
    calculate_timestamp_date,
    calculate_timestamp_datetime,
    get_dtype_backend,
    int_dtype_mapper,
    is_dtype_pyarrow,
)

if TYPE_CHECKING:
    from datetime import timedelta

    import pandas as pd

    from narwhals._pandas_like.series import PandasLikeSeries
    from narwhals.typing import TimeUnit


class PandasLikeSeriesDateTimeNamespace(
    PandasLikeSeriesNamespace, DateTimeNamespace["PandasLikeSeries"]
):
    def date(self) -> PandasLikeSeries:
        result = self.with_native(self.native.dt.date)
        if str(result.dtype).lower() == "object":
            msg = (
                "Accessing `date` on the default pandas backend "
                "will return a Series of type `object`."
                "\nThis differs from polars API and will prevent `.dt` chaining. "
                "Please switch to the `pyarrow` backend:"
                '\ndf.convert_dtypes(dtype_backend="pyarrow")'
            )
            raise NotImplementedError(msg)
        return result

    def year(self) -> PandasLikeSeries:
        return self.with_native(self.native.dt.year)

    def month(self) -> PandasLikeSeries:
        return self.with_native(self.native.dt.month)

    def day(self) -> PandasLikeSeries:
        return self.with_native(self.native.dt.day)

    def hour(self) -> PandasLikeSeries:
        return self.with_native(self.native.dt.hour)

    def minute(self) -> PandasLikeSeries:
        return self.with_native(self.native.dt.minute)

    def second(self) -> PandasLikeSeries:
        return self.with_native(self.native.dt.second)

    def millisecond(self) -> PandasLikeSeries:
        return self.microsecond() // 1000

    def microsecond(self) -> PandasLikeSeries:
        if self.backend_version < (3, 0, 0) and self._is_pyarrow():
            # crazy workaround for https://github.com/pandas-dev/pandas/issues/59154
            import pyarrow.compute as pc  # ignore-banned-import()

            from narwhals._arrow.utils import lit

            arr_ns = self.native.array
            arr = arr_ns.__arrow_array__()
            result_arr = pc.add(
                pc.multiply(pc.millisecond(arr), lit(1_000)), pc.microsecond(arr)
            )
            result = type(self.native)(type(arr_ns)(result_arr), name=self.native.name)
            return self.with_native(result)

        return self.with_native(self.native.dt.microsecond)

    def nanosecond(self) -> PandasLikeSeries:
        return self.microsecond() * 1_000 + self.native.dt.nanosecond

    def ordinal_day(self) -> PandasLikeSeries:
        year_start = self.native.dt.year
        result = (
            self.native.to_numpy().astype("datetime64[D]")
            - (year_start.to_numpy() - EPOCH_YEAR).astype("datetime64[Y]")
        ).astype("int32") + 1
        dtype = "Int64[pyarrow]" if self._is_pyarrow() else "int32"
        return self.with_native(
            type(self.native)(result, dtype=dtype, name=year_start.name)
        )

    def weekday(self) -> PandasLikeSeries:
        # Pandas is 0-6 while Polars is 1-7
        return self.with_native(self.native.dt.weekday) + 1

    def _is_pyarrow(self) -> bool:
        return is_dtype_pyarrow(self.native.dtype)

    def _get_total_seconds(self) -> Any:
        if hasattr(self.native.dt, "total_seconds"):
            return self.native.dt.total_seconds()
        return (  # pragma: no cover
            self.native.dt.days * SECONDS_PER_DAY
            + self.native.dt.seconds
            + (self.native.dt.microseconds / US_PER_SECOND)
            + (self.native.dt.nanoseconds / NS_PER_SECOND)
        )

    def total_minutes(self) -> PandasLikeSeries:
        s = self._get_total_seconds()
        # this calculates the sign of each series element
        s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1
        s_abs = s.abs() // 60
        if ~s.isna().any():
            s_abs = s_abs.astype(int_dtype_mapper(s.dtype))
        return self.with_native(s_abs * s_sign)

    def total_seconds(self) -> PandasLikeSeries:
        s = self._get_total_seconds()
        # this calculates the sign of each series element
        s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1
        s_abs = s.abs() // 1
        if ~s.isna().any():
            s_abs = s_abs.astype(int_dtype_mapper(s.dtype))
        return self.with_native(s_abs * s_sign)

    def total_milliseconds(self) -> PandasLikeSeries:
        s = self._get_total_seconds() * MS_PER_SECOND
        # this calculates the sign of each series element
        s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1
        s_abs = s.abs() // 1
        if ~s.isna().any():
            s_abs = s_abs.astype(int_dtype_mapper(s.dtype))
        return self.with_native(s_abs * s_sign)

    def total_microseconds(self) -> PandasLikeSeries:
        s = self._get_total_seconds() * US_PER_SECOND
        # this calculates the sign of each series element
        s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1
        s_abs = s.abs() // 1
        if ~s.isna().any():
            s_abs = s_abs.astype(int_dtype_mapper(s.dtype))
        return self.with_native(s_abs * s_sign)

    def total_nanoseconds(self) -> PandasLikeSeries:
        s = self._get_total_seconds() * NS_PER_SECOND
        # this calculates the sign of each series element
        s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1
        s_abs = s.abs() // 1
        if ~s.isna().any():
            s_abs = s_abs.astype(int_dtype_mapper(s.dtype))
        return self.with_native(s_abs * s_sign)
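The repeated `2 * (s > 0) - 1` pattern above makes floor division truncate toward zero, matching Polars, rather than rounding toward negative infinity; a small numeric sketch:

```python
import pandas as pd

s = pd.Series([-90.0, 90.0])  # durations in seconds
print((s // 60).tolist())     # [-2.0, 1.0] - plain floor division rounds -90s down
s_sign = 2 * (s > 0).astype("int64") - 1
print((s.abs() // 60 * s_sign).tolist())  # [-1.0, 1.0] - truncation toward zero
```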

    def to_string(self, format: str) -> PandasLikeSeries:
        # Polars' parser treats `'%.f'` as pandas does `'.%f'`
        # PyArrow interprets `'%S'` as "seconds, plus fractional seconds"
        # and doesn't support `%f`
        if not self._is_pyarrow():
            format = format.replace("%S%.f", "%S.%f")
        else:
            format = format.replace("%S.%f", "%S").replace("%S%.f", "%S")
        return self.with_native(self.native.dt.strftime(format))

    def replace_time_zone(self, time_zone: str | None) -> PandasLikeSeries:
        de_zone = self.native.dt.tz_localize(None)
        result = de_zone.dt.tz_localize(time_zone) if time_zone is not None else de_zone
        return self.with_native(result)

    def convert_time_zone(self, time_zone: str) -> PandasLikeSeries:
        if self.compliant.dtype.time_zone is None:  # type: ignore[attr-defined]
            result = self.native.dt.tz_localize("UTC").dt.tz_convert(time_zone)
        else:
            result = self.native.dt.tz_convert(time_zone)
        return self.with_native(result)

    def timestamp(self, time_unit: TimeUnit) -> PandasLikeSeries:
        s = self.native
        dtype = self.compliant.dtype
        mask_na = s.isna()
        dtypes = self.version.dtypes
        if dtype == dtypes.Date:
            # Date is only supported in pandas dtypes if pyarrow-backed
            s_cast = s.astype("Int32[pyarrow]")
            result = calculate_timestamp_date(s_cast, time_unit)
        elif isinstance(dtype, dtypes.Datetime):
            fn = (
                s.view
                if (self.implementation.is_pandas() and self.backend_version < (2,))
                else s.astype
            )
            s_cast = fn("Int64[pyarrow]") if self._is_pyarrow() else fn("int64")
            result = calculate_timestamp_datetime(s_cast, dtype.time_unit, time_unit)
        else:
            msg = "Input should be either of Date or Datetime type"
            raise TypeError(msg)
        result[mask_na] = None
        return self.with_native(result)

    def truncate(self, every: str) -> PandasLikeSeries:
        interval = Interval.parse(every)
        multiple, unit = interval.multiple, interval.unit
        native = self.native
        if self.implementation.is_cudf():
            if multiple != 1:
                msg = f"Only multiple `1` is supported for cuDF, got: {multiple}."
                raise NotImplementedError(msg)
            return self.with_native(self.native.dt.floor(ALIAS_DICT.get(unit, unit)))
        dtype_backend = get_dtype_backend(native.dtype, self.compliant._implementation)
        if unit in {"mo", "q", "y"}:
            if self.implementation.is_cudf():
                msg = f"Truncating to {unit} is not supported yet for cuDF."
                raise NotImplementedError(msg)
            if dtype_backend == "pyarrow":
                import pyarrow.compute as pc  # ignore-banned-import

                ca = native.array._pa_array
                result_arr = pc.floor_temporal(ca, multiple, UNITS_DICT[unit])
            else:
                if unit == "q":
                    multiple *= 3
                    np_unit = "M"
                elif unit == "mo":
                    np_unit = "M"
                else:
                    np_unit = "Y"
                arr = native.values  # noqa: PD011
                arr_dtype = arr.dtype
                result_arr = arr.astype(f"datetime64[{multiple}{np_unit}]").astype(
                    arr_dtype
                )
            result_native = type(native)(
                result_arr, dtype=native.dtype, index=native.index, name=native.name
            )
            return self.with_native(result_native)
        return self.with_native(
            self.native.dt.floor(f"{multiple}{ALIAS_DICT.get(unit, unit)}")
        )
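The NumPy fallback above exploits the fact that casting to a coarse `datetime64` unit floors each value to the start of its period; a minimal sketch (hypothetical dates):

```python
import numpy as np

arr = np.array(["2024-02-15", "2024-08-20"], dtype="datetime64[ns]")
# Casting to a 3-month unit floors each value to its quarter start;
# casting back restores the original precision.
print(arr.astype("datetime64[3M]").astype("datetime64[ns]"))
# ['2024-01-01' '2024-07-01']
```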
|
||||
|
||||
def offset_by(self, by: str) -> PandasLikeSeries:
|
||||
native = self.native
|
||||
pdx = self.compliant.__native_namespace__()
|
||||
if self._is_pyarrow():
|
||||
import pyarrow as pa # ignore-banned-import
|
||||
|
||||
compliant = self.compliant
|
||||
ca = pa.chunked_array([compliant.to_arrow()]) # type: ignore[arg-type]
|
||||
result = (
|
||||
compliant._version.namespace.from_backend("pyarrow")
|
||||
.compliant.from_native(ca)
|
||||
.dt.offset_by(by)
|
||||
.native
|
||||
)
|
||||
result_pd = native.__class__(
|
||||
result, dtype=native.dtype, index=native.index, name=native.name
|
||||
)
|
||||
else:
|
||||
interval = Interval.parse_no_constraints(by)
|
||||
multiple, unit = interval.multiple, interval.unit
|
||||
if unit == "q":
|
||||
multiple *= 3
|
||||
unit = "mo"
|
||||
offset: pd.DateOffset | timedelta
|
||||
if unit == "y":
|
||||
offset = pdx.DateOffset(years=multiple)
|
||||
elif unit == "mo":
|
||||
offset = pdx.DateOffset(months=multiple)
|
||||
elif unit == "ns":
|
||||
offset = pdx.Timedelta(multiple, unit=UNITS_DICT[unit])
|
||||
else:
|
||||
offset = interval.to_timedelta()
|
||||
dtype = self.compliant.dtype
|
||||
datetime_dtype = self.version.dtypes.Datetime
|
||||
if unit == "d" and isinstance(dtype, datetime_dtype) and dtype.time_zone:
|
||||
native_without_timezone = native.dt.tz_localize(None)
|
||||
result_pd = native_without_timezone + offset
|
||||
result_pd = result_pd.dt.tz_localize(dtype.time_zone)
|
||||
else:
|
||||
result_pd = native + offset
|
||||
|
||||
return self.with_native(result_pd)
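
    # Editor's note: a minimal usage sketch (not part of the library source,
    # hypothetical data) of how the three methods above surface through the
    # public narwhals API:
    #
    #     import pandas as pd
    #     import narwhals as nw
    #
    #     s = nw.from_native(
    #         pd.Series(pd.to_datetime(["2024-01-15", "2024-06-30"])), series_only=True
    #     )
    #     s.dt.truncate("1mo")   # floor each value to the start of its month
    #     s.dt.offset_by("2d")   # shift each value forward by two days
    #     s.dt.timestamp("ms")   # integer milliseconds since the Unix epoch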
@ -0,0 +1,42 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from narwhals._compliant.any_namespace import ListNamespace
from narwhals._pandas_like.utils import (
    PandasLikeSeriesNamespace,
    get_dtype_backend,
    narwhals_to_native_dtype,
)
from narwhals._utils import not_implemented

if TYPE_CHECKING:
    from narwhals._pandas_like.series import PandasLikeSeries


class PandasLikeSeriesListNamespace(
    PandasLikeSeriesNamespace, ListNamespace["PandasLikeSeries"]
):
    def len(self) -> PandasLikeSeries:
        result = self.native.list.len()
        implementation = self.implementation
        backend_version = self.backend_version
        if implementation.is_pandas() and backend_version < (3, 0):  # pragma: no cover
            # `result` is a new object so it's safe to do this inplace.
            result.index = self.native.index
        dtype = narwhals_to_native_dtype(
            self.version.dtypes.UInt32(),
            get_dtype_backend(result.dtype, implementation),
            implementation,
            self.version,
        )
        return self.with_native(result.astype(dtype)).alias(self.native.name)

    unique = not_implemented()

    contains = not_implemented()

    def get(self, index: int) -> PandasLikeSeries:
        result = self.native.list[index]
        result.name = self.native.name
        return self.with_native(result)
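
    # Editor's note: a usage sketch (hypothetical data; assumes pandas 2.x with
    # pyarrow installed, since list dtypes here are pyarrow-backed):
    #
    #     import pandas as pd, pyarrow as pa
    #     import narwhals as nw
    #
    #     s = pd.Series([[1, 2], [3]], dtype=pd.ArrowDtype(pa.list_(pa.int64())))
    #     nw_s = nw.from_native(s, series_only=True)
    #     nw_s.list.len()   # lengths per row: [2, 1]
    #     nw_s.list.get(0)  # first element per row: [1, 3]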
@ -0,0 +1,92 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from narwhals._compliant.any_namespace import StringNamespace
from narwhals._pandas_like.utils import PandasLikeSeriesNamespace, is_dtype_pyarrow

if TYPE_CHECKING:
    from narwhals._pandas_like.series import PandasLikeSeries


class PandasLikeSeriesStringNamespace(
    PandasLikeSeriesNamespace, StringNamespace["PandasLikeSeries"]
):
    def len_chars(self) -> PandasLikeSeries:
        return self.with_native(self.native.str.len())

    def replace(
        self, pattern: str, value: str, *, literal: bool, n: int
    ) -> PandasLikeSeries:
        try:
            series = self.native.str.replace(
                pat=pattern, repl=value, n=n, regex=not literal
            )
        except TypeError as e:
            if not isinstance(value, str):
                msg = f"{self.compliant._implementation} backed `.str.replace` only supports str replacement values"
                raise TypeError(msg) from e
            raise
        return self.with_native(series)

    def replace_all(self, pattern: str, value: str, *, literal: bool) -> PandasLikeSeries:
        return self.replace(pattern, value, literal=literal, n=-1)

    def strip_chars(self, characters: str | None) -> PandasLikeSeries:
        return self.with_native(self.native.str.strip(characters))

    def starts_with(self, prefix: str) -> PandasLikeSeries:
        return self.with_native(self.native.str.startswith(prefix))

    def ends_with(self, suffix: str) -> PandasLikeSeries:
        return self.with_native(self.native.str.endswith(suffix))

    def contains(self, pattern: str, *, literal: bool) -> PandasLikeSeries:
        return self.with_native(self.native.str.contains(pat=pattern, regex=not literal))

    def slice(self, offset: int, length: int | None) -> PandasLikeSeries:
        stop = offset + length if length else None
        return self.with_native(self.native.str.slice(start=offset, stop=stop))

    def split(self, by: str) -> PandasLikeSeries:
        implementation = self.implementation
        if not implementation.is_cudf() and not is_dtype_pyarrow(self.native.dtype):
            msg = (
                "This operation requires a pyarrow-backed series. "
                "Please refer to https://narwhals-dev.github.io/narwhals/api-reference/narwhals/#narwhals.maybe_convert_dtypes "
                "and ensure you are using dtype_backend='pyarrow'. "
                "Additionally, make sure you have pandas version 1.5+ and pyarrow installed."
            )
            raise TypeError(msg)
        return self.with_native(self.native.str.split(pat=by))

    def to_datetime(self, format: str | None) -> PandasLikeSeries:
        # If we know inputs are timezone-aware, we can pass `utc=True` for better performance.
        if format and any(x in format for x in ("%z", "Z")):
            return self.with_native(self._to_datetime(format, utc=True))
        result = self.with_native(self._to_datetime(format, utc=False))
        if (tz := getattr(result.dtype, "time_zone", None)) and tz != "UTC":
            return result.dt.convert_time_zone("UTC")
        return result

    def _to_datetime(self, format: str | None, *, utc: bool) -> Any:
        result = self.implementation.to_native_namespace().to_datetime(
            self.native, format=format, utc=utc
        )
        return (
            result.convert_dtypes(dtype_backend="pyarrow")
            if is_dtype_pyarrow(self.native.dtype)
            else result
        )

    def to_date(self, format: str | None) -> PandasLikeSeries:
        return self.to_datetime(format=format).dt.date()

    def to_uppercase(self) -> PandasLikeSeries:
        return self.with_native(self.native.str.upper())

    def to_lowercase(self) -> PandasLikeSeries:
        return self.with_native(self.native.str.lower())

    def zfill(self, width: int) -> PandasLikeSeries:
        return self.with_native(self.native.str.zfill(width))
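
    # Editor's note: a usage sketch (hypothetical data) of the string namespace
    # through the public narwhals API:
    #
    #     import pandas as pd
    #     import narwhals as nw
    #
    #     s = nw.from_native(pd.Series(["foo", "barbaz"]), series_only=True)
    #     s.str.len_chars()                          # [3, 6]
    #     s.str.contains("ba", literal=True)         # [False, True]
    #     s.str.replace_all("a", "o", literal=True)  # ["foo", "borboz"]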
@ -0,0 +1,16 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from narwhals._compliant.any_namespace import StructNamespace
from narwhals._pandas_like.utils import PandasLikeSeriesNamespace

if TYPE_CHECKING:
    from narwhals._pandas_like.series import PandasLikeSeries


class PandasLikeSeriesStructNamespace(
    PandasLikeSeriesNamespace, StructNamespace["PandasLikeSeries"]
):
    def field(self, name: str) -> PandasLikeSeries:
        return self.with_native(self.native.struct.field(name)).alias(name)
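
    # Editor's note: a usage sketch (hypothetical data; assumes a pyarrow-backed
    # struct column, which is what pandas' `.struct` accessor requires):
    #
    #     import pandas as pd, pyarrow as pa
    #     import narwhals as nw
    #
    #     dtype = pd.ArrowDtype(pa.struct([("a", pa.int64()), ("b", pa.string())]))
    #     s = pd.Series([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}], dtype=dtype)
    #     nw.from_native(s, series_only=True).struct.field("a")  # [1, 2], named "a"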
43
lib/python3.11/site-packages/narwhals/_pandas_like/typing.py
Normal file
@ -0,0 +1,43 @@
from __future__ import annotations  # pragma: no cover

from typing import TYPE_CHECKING  # pragma: no cover

from narwhals._typing_compat import TypeVar

if TYPE_CHECKING:
    from typing import Any

    import pandas as pd
    from typing_extensions import TypeAlias

    from narwhals._namespace import (
        _CuDFDataFrame,
        _CuDFSeries,
        _ModinDataFrame,
        _ModinSeries,
        _NativePandasLikeDataFrame,
    )
    from narwhals._pandas_like.expr import PandasLikeExpr
    from narwhals._pandas_like.series import PandasLikeSeries

    IntoPandasLikeExpr: TypeAlias = "PandasLikeExpr | PandasLikeSeries"

NativeSeriesT = TypeVar(
    "NativeSeriesT",
    "pd.Series[Any]",
    "_CuDFSeries",
    "_ModinSeries",
    default="pd.Series[Any]",
)
NativeDataFrameT = TypeVar(
    "NativeDataFrameT", bound="_NativePandasLikeDataFrame", default="pd.DataFrame"
)
NativeNDFrameT = TypeVar(
    "NativeNDFrameT",
    "pd.DataFrame",
    "pd.Series[Any]",
    "_CuDFDataFrame",
    "_CuDFSeries",
    "_ModinDataFrame",
    "_ModinSeries",
)
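
# Editor's note: a sketch of how these TypeVars are meant to be consumed; a
# generic helper keeps the concrete native type across the call. `identity`
# below is a hypothetical example, not part of this module:
#
#     def identity(native: NativeSeriesT) -> NativeSeriesT:
#         # pd.Series in -> pd.Series out; cuDF/modin likewise.
#         return native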
668
lib/python3.11/site-packages/narwhals/_pandas_like/utils.py
Normal file
@ -0,0 +1,668 @@
from __future__ import annotations

import functools
import operator
import re
from typing import TYPE_CHECKING, Any, Callable, Literal, TypeVar

import pandas as pd

from narwhals._compliant import EagerSeriesNamespace
from narwhals._constants import (
    MS_PER_SECOND,
    NS_PER_MICROSECOND,
    NS_PER_MILLISECOND,
    NS_PER_SECOND,
    SECONDS_PER_DAY,
    US_PER_SECOND,
)
from narwhals._utils import (
    Implementation,
    Version,
    _DeferredIterable,
    check_columns_exist,
    isinstance_or_issubclass,
)
from narwhals.exceptions import ShapeError

if TYPE_CHECKING:
    from collections.abc import Iterable, Iterator, Mapping
    from types import ModuleType

    from pandas._typing import Dtype as PandasDtype
    from pandas.core.dtypes.dtypes import BaseMaskedDtype
    from typing_extensions import TypeAlias, TypeIs

    from narwhals._duration import IntervalUnit
    from narwhals._pandas_like.expr import PandasLikeExpr
    from narwhals._pandas_like.series import PandasLikeSeries
    from narwhals._pandas_like.typing import (
        NativeDataFrameT,
        NativeNDFrameT,
        NativeSeriesT,
    )
    from narwhals.dtypes import DType
    from narwhals.typing import DTypeBackend, IntoDType, TimeUnit, _1DArray

    ExprT = TypeVar("ExprT", bound=PandasLikeExpr)
    UnitCurrent: TypeAlias = TimeUnit
    UnitTarget: TypeAlias = TimeUnit
    BinOpBroadcast: TypeAlias = Callable[[Any, int], Any]
    IntoRhs: TypeAlias = int

PANDAS_LIKE_IMPLEMENTATION = {
    Implementation.PANDAS,
    Implementation.CUDF,
    Implementation.MODIN,
}
PD_DATETIME_RGX = r"""^
    datetime64\[
        (?P<time_unit>s|ms|us|ns)        # Match time unit: s, ms, us, or ns
        (?:,                             # Begin non-capturing group for optional timezone
            \s*                          # Optional whitespace after comma
            (?P<time_zone>               # Start named group for timezone
                [a-zA-Z\/]+              # Match timezone name, e.g., UTC, America/New_York
                (?:[+-]\d{2}:\d{2})?     # Optional offset in format +HH:MM or -HH:MM
                |                        # OR
                pytz\.FixedOffset\(\d+\) # Match pytz.FixedOffset with integer offset in parentheses
            )                            # End time_zone group
        )?                               # End optional timezone group
    \]                                   # Closing bracket for datetime64
$"""
PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE)
PA_DATETIME_RGX = r"""^
    timestamp\[
        (?P<time_unit>s|ms|us|ns)        # Match time unit: s, ms, us, or ns
        (?:,                             # Begin non-capturing group for optional timezone
            \s?tz=                       # Match "tz=" prefix
            (?P<time_zone>               # Start named group for timezone
                [a-zA-Z\/]*              # Match timezone name (e.g., UTC, America/New_York)
                (?:                      # Begin optional non-capturing group for offset
                    [+-]\d{2}:\d{2}      # Match offset in format +HH:MM or -HH:MM
                )?                       # End optional offset group
            )                            # End time_zone group
        )?                               # End optional timezone group
    \]                                   # Closing bracket for timestamp
    \[pyarrow\]                          # Literal string "[pyarrow]"
$"""
PATTERN_PA_DATETIME = re.compile(PA_DATETIME_RGX, re.VERBOSE)
PD_DURATION_RGX = r"""^
    timedelta64\[
        (?P<time_unit>s|ms|us|ns)        # Match time unit: s, ms, us, or ns
    \]                                   # Closing bracket for timedelta64
$"""

PATTERN_PD_DURATION = re.compile(PD_DURATION_RGX, re.VERBOSE)
PA_DURATION_RGX = r"""^
    duration\[
        (?P<time_unit>s|ms|us|ns)        # Match time unit: s, ms, us, or ns
    \]                                   # Closing bracket for duration
    \[pyarrow\]                          # Literal string "[pyarrow]"
$"""
PATTERN_PA_DURATION = re.compile(PA_DURATION_RGX, re.VERBOSE)

NativeIntervalUnit: TypeAlias = Literal[
    "year",
    "quarter",
    "month",
    "week",
    "day",
    "hour",
    "minute",
    "second",
    "millisecond",
    "microsecond",
    "nanosecond",
]
ALIAS_DICT = {"d": "D", "m": "min"}
UNITS_DICT: Mapping[IntervalUnit, NativeIntervalUnit] = {
    "y": "year",
    "q": "quarter",
    "mo": "month",
    "d": "day",
    "h": "hour",
    "m": "minute",
    "s": "second",
    "ms": "millisecond",
    "us": "microsecond",
    "ns": "nanosecond",
}

PANDAS_VERSION = Implementation.PANDAS._backend_version()
"""Static backend version for `pandas`.

Always available if we reached here, due to a module-level import.
"""

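# Editor's note: a few spot-checks (verified against the patterns as written)
# of the dtype-string regexes above:
#
#     PATTERN_PD_DATETIME.match("datetime64[ns, UTC]")             # unit "ns", tz "UTC"
#     PATTERN_PA_DATETIME.match("timestamp[us, tz=UTC][pyarrow]")  # unit "us", tz "UTC"
#     PATTERN_PD_DURATION.match("timedelta64[ms]")                 # unit "ms"
#     PATTERN_PA_DURATION.match("duration[s][pyarrow]")            # unit "s"
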
def is_pandas_or_modin(implementation: Implementation) -> bool:
    return implementation in {Implementation.PANDAS, Implementation.MODIN}


def align_and_extract_native(
    lhs: PandasLikeSeries, rhs: PandasLikeSeries | object
) -> tuple[pd.Series[Any] | object, pd.Series[Any] | object]:
    """Validate RHS of binary operation.

    If the comparison isn't supported, return `NotImplemented` so that the
    "right-hand-side" operation (e.g. `__radd__`) can be tried.
    """
    from narwhals._pandas_like.series import PandasLikeSeries

    lhs_index = lhs.native.index

    if lhs._broadcast and isinstance(rhs, PandasLikeSeries) and not rhs._broadcast:
        return lhs.native.iloc[0], rhs.native

    if isinstance(rhs, PandasLikeSeries):
        if rhs._broadcast:
            return (lhs.native, rhs.native.iloc[0])
        if rhs.native.index is not lhs_index:
            return (
                lhs.native,
                set_index(rhs.native, lhs_index, implementation=rhs._implementation),
            )
        return (lhs.native, rhs.native)

    if isinstance(rhs, list):
        msg = "Expected Series or scalar, got list."
        raise TypeError(msg)
    # `rhs` must be scalar, so just leave it as-is
    return lhs.native, rhs

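# Editor's note: a sketch of the alignment problem the function above solves.
# pandas binary ops align on the index, while Narwhals wants positional
# semantics, hence the RHS is re-indexed to the LHS index (hypothetical data):
#
#     import pandas as pd
#
#     lhs = pd.Series([1, 2], index=[0, 1])
#     rhs = pd.Series([10, 20], index=[1, 0])
#     lhs + rhs                              # index-aligned: [21, 12]
#     lhs + rhs.set_axis(lhs.index, axis=0)  # positional:    [11, 22]
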
def set_index(
    obj: NativeNDFrameT, index: Any, *, implementation: Implementation
) -> NativeNDFrameT:
    """Wrapper around pandas' set_axis to set object index.

    We can set `copy` / `inplace` based on implementation/version.
    """
    if isinstance(index, implementation.to_native_namespace().Index) and (
        expected_len := len(index)
    ) != (actual_len := len(obj)):
        msg = f"Expected object of length {expected_len}, got length: {actual_len}"
        raise ShapeError(msg)
    if implementation is Implementation.CUDF:
        obj = obj.copy(deep=False)
        obj.index = index
        return obj
    if implementation is Implementation.PANDAS and (
        (1, 5) <= implementation._backend_version() < (3,)
    ):  # pragma: no cover
        return obj.set_axis(index, axis=0, copy=False)
    return obj.set_axis(index, axis=0)  # pragma: no cover


def rename(
    obj: NativeNDFrameT, *args: Any, implementation: Implementation, **kwargs: Any
) -> NativeNDFrameT:
    """Wrapper around pandas' rename so that we can set `copy` based on implementation/version."""
    if implementation is Implementation.PANDAS and (
        implementation._backend_version() >= (3,)
    ):  # pragma: no cover
        return obj.rename(*args, **kwargs, inplace=False)
    return obj.rename(*args, **kwargs, copy=False, inplace=False)

@functools.lru_cache(maxsize=16)
def non_object_native_to_narwhals_dtype(native_dtype: Any, version: Version) -> DType:  # noqa: C901, PLR0912
    dtype = str(native_dtype)

    dtypes = version.dtypes
    if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}:
        return dtypes.Int64()
    if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}:
        return dtypes.Int32()
    if dtype in {"int16", "Int16", "Int16[pyarrow]", "int16[pyarrow]"}:
        return dtypes.Int16()
    if dtype in {"int8", "Int8", "Int8[pyarrow]", "int8[pyarrow]"}:
        return dtypes.Int8()
    if dtype in {"uint64", "UInt64", "UInt64[pyarrow]", "uint64[pyarrow]"}:
        return dtypes.UInt64()
    if dtype in {"uint32", "UInt32", "UInt32[pyarrow]", "uint32[pyarrow]"}:
        return dtypes.UInt32()
    if dtype in {"uint16", "UInt16", "UInt16[pyarrow]", "uint16[pyarrow]"}:
        return dtypes.UInt16()
    if dtype in {"uint8", "UInt8", "UInt8[pyarrow]", "uint8[pyarrow]"}:
        return dtypes.UInt8()
    if dtype in {
        "float64",
        "Float64",
        "Float64[pyarrow]",
        "float64[pyarrow]",
        "double[pyarrow]",
    }:
        return dtypes.Float64()
    if dtype in {
        "float32",
        "Float32",
        "Float32[pyarrow]",
        "float32[pyarrow]",
        "float[pyarrow]",
    }:
        return dtypes.Float32()
    if dtype in {
        # "there is no problem which can't be solved by adding an extra string type" pandas
        "string",
        "string[python]",
        "string[pyarrow]",
        "string[pyarrow_numpy]",
        "large_string[pyarrow]",
        "str",
    }:
        return dtypes.String()
    if dtype in {"bool", "boolean", "boolean[pyarrow]", "bool[pyarrow]"}:
        return dtypes.Boolean()
    if dtype.startswith("dictionary<"):
        return dtypes.Categorical()
    if dtype == "category":
        return native_categorical_to_narwhals_dtype(native_dtype, version)
    if (match_ := PATTERN_PD_DATETIME.match(dtype)) or (
        match_ := PATTERN_PA_DATETIME.match(dtype)
    ):
        dt_time_unit: TimeUnit = match_.group("time_unit")  # type: ignore[assignment]
        dt_time_zone: str | None = match_.group("time_zone")
        return dtypes.Datetime(dt_time_unit, dt_time_zone)
    if (match_ := PATTERN_PD_DURATION.match(dtype)) or (
        match_ := PATTERN_PA_DURATION.match(dtype)
    ):
        du_time_unit: TimeUnit = match_.group("time_unit")  # type: ignore[assignment]
        return dtypes.Duration(du_time_unit)
    if dtype == "date32[day][pyarrow]":
        return dtypes.Date()
    if dtype.startswith("decimal") and dtype.endswith("[pyarrow]"):
        return dtypes.Decimal()
    if dtype.startswith("time") and dtype.endswith("[pyarrow]"):
        return dtypes.Time()
    if dtype.startswith("binary") and dtype.endswith("[pyarrow]"):
        return dtypes.Binary()
    return dtypes.Unknown()  # pragma: no cover

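# Editor's note: a spot-check of the mapping above; the numpy, numpy-nullable,
# and pyarrow spellings of one logical type all land on the same narwhals dtype:
#
#     non_object_native_to_narwhals_dtype(pd.Series([1]).dtype, Version.MAIN)  # Int64()
#     # the "int64", "Int64", and "Int64[pyarrow]" strings all map to dtypes.Int64()
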
def object_native_to_narwhals_dtype(
    series: PandasLikeSeries | None, version: Version, implementation: Implementation
) -> DType:
    dtypes = version.dtypes
    if implementation is Implementation.CUDF:
        # Per conversations with their maintainers, they don't support arbitrary
        # objects, so we can just return String.
        return dtypes.String()

    infer = pd.api.types.infer_dtype
    # Arbitrary limit of 100 elements to use to sniff dtype.
    inferred_dtype = "empty" if series is None else infer(series.head(100), skipna=True)
    if inferred_dtype == "string":
        return dtypes.String()
    if inferred_dtype == "empty" and version is not Version.V1:
        # Default to String for empty Series.
        return dtypes.String()
    if inferred_dtype == "empty":
        # But preserve returning Object in V1.
        return dtypes.Object()
    return dtypes.Object()

def native_categorical_to_narwhals_dtype(
    native_dtype: pd.CategoricalDtype,
    version: Version,
    implementation: Literal[Implementation.CUDF] | None = None,
) -> DType:
    dtypes = version.dtypes
    if version is Version.V1:
        return dtypes.Categorical()
    if native_dtype.ordered:
        into_iter = (
            _cudf_categorical_to_list(native_dtype)
            if implementation is Implementation.CUDF
            else native_dtype.categories.to_list
        )
        return dtypes.Enum(_DeferredIterable(into_iter))
    return dtypes.Categorical()


def _cudf_categorical_to_list(
    native_dtype: Any,
) -> Callable[[], list[Any]]:  # pragma: no cover
    # NOTE: https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/api/cudf.core.dtypes.categoricaldtype/#cudf.core.dtypes.CategoricalDtype
    def fn() -> list[Any]:
        return native_dtype.categories.to_arrow().to_pylist()

    return fn

def native_to_narwhals_dtype(
    native_dtype: Any,
    version: Version,
    implementation: Implementation,
    *,
    allow_object: bool = False,
) -> DType:
    str_dtype = str(native_dtype)

    if str_dtype.startswith(("large_list", "list", "struct", "fixed_size_list")):
        from narwhals._arrow.utils import (
            native_to_narwhals_dtype as arrow_native_to_narwhals_dtype,
        )

        if hasattr(native_dtype, "to_arrow"):  # pragma: no cover
            # cudf, cudf.pandas
            return arrow_native_to_narwhals_dtype(native_dtype.to_arrow(), version)
        return arrow_native_to_narwhals_dtype(native_dtype.pyarrow_dtype, version)
    if str_dtype == "category" and implementation.is_cudf():
        # https://github.com/rapidsai/cudf/issues/18536
        # https://github.com/rapidsai/cudf/issues/14027
        return native_categorical_to_narwhals_dtype(
            native_dtype, version, Implementation.CUDF
        )
    if str_dtype != "object":
        return non_object_native_to_narwhals_dtype(native_dtype, version)
    if implementation is Implementation.DASK:
        # Per conversations with their maintainers, they don't support arbitrary
        # objects, so we can just return String.
        return version.dtypes.String()
    if allow_object:
        return object_native_to_narwhals_dtype(None, version, implementation)
    msg = (
        "Unreachable code, object dtype should be handled separately"  # pragma: no cover
    )
    raise AssertionError(msg)

if Implementation.PANDAS._backend_version() >= (1, 2):

    def is_dtype_numpy_nullable(dtype: Any) -> TypeIs[BaseMaskedDtype]:
        """Return `True` if `dtype` is `"numpy_nullable"`."""
        # NOTE: We need a sentinel as the positive case is `BaseMaskedDtype.base = None`
        # See https://github.com/narwhals-dev/narwhals/pull/2740#discussion_r2171667055
        sentinel = object()
        return (
            isinstance(dtype, pd.api.extensions.ExtensionDtype)
            and getattr(dtype, "base", sentinel) is None
        )

else:  # pragma: no cover

    def is_dtype_numpy_nullable(dtype: Any) -> TypeIs[BaseMaskedDtype]:
        # NOTE: `base` attribute was added between 1.1-1.2
        # Checking by isinstance requires using an import path that is no longer valid
        # `1.1`: https://github.com/pandas-dev/pandas/blob/b5958ee1999e9aead1938c0bba2b674378807b3d/pandas/core/arrays/masked.py#L37
        # `1.2`: https://github.com/pandas-dev/pandas/blob/7c48ff4409c622c582c56a5702373f726de08e96/pandas/core/arrays/masked.py#L41
        # `1.5`: https://github.com/pandas-dev/pandas/blob/35b0d1dcadf9d60722c055ee37442dc76a29e64c/pandas/core/dtypes/dtypes.py#L1609
        if isinstance(dtype, pd.api.extensions.ExtensionDtype):
            from pandas.core.arrays.masked import (  # type: ignore[attr-defined]
                BaseMaskedDtype as OldBaseMaskedDtype,  # pyright: ignore[reportAttributeAccessIssue]
            )

            return isinstance(dtype, OldBaseMaskedDtype)
        return False

def get_dtype_backend(dtype: Any, implementation: Implementation) -> DTypeBackend:
    """Get dtype backend for pandas type.

    Matches pandas' `dtype_backend` argument in `convert_dtypes`.
    """
    if implementation is Implementation.CUDF:
        return None
    if is_dtype_pyarrow(dtype):
        return "pyarrow"
    return "numpy_nullable" if is_dtype_numpy_nullable(dtype) else None


# NOTE: Use this to avoid annotating inline
def iter_dtype_backends(
    dtypes: Iterable[Any], implementation: Implementation
) -> Iterator[DTypeBackend]:
    """Yield a `DTypeBackend` per-dtype.

    Matches pandas' `dtype_backend` argument in `convert_dtypes`.
    """
    return (get_dtype_backend(dtype, implementation) for dtype in dtypes)


@functools.lru_cache(maxsize=16)
def is_dtype_pyarrow(dtype: Any) -> TypeIs[pd.ArrowDtype]:
    return hasattr(pd, "ArrowDtype") and isinstance(dtype, pd.ArrowDtype)

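# Editor's note: spot-checks for `get_dtype_backend` (assumes pandas 2.x with
# pyarrow installed):
#
#     get_dtype_backend(pd.Series([1], dtype="int64").dtype, Implementation.PANDAS)           # None
#     get_dtype_backend(pd.Series([1], dtype="Int64").dtype, Implementation.PANDAS)           # "numpy_nullable"
#     get_dtype_backend(pd.Series([1], dtype="int64[pyarrow]").dtype, Implementation.PANDAS)  # "pyarrow"
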
dtypes = Version.MAIN.dtypes
NW_TO_PD_DTYPES_INVARIANT: Mapping[type[DType], str] = {
    # TODO(Unassigned): is there no pyarrow-backed categorical?
    # or at least, convert_dtypes(dtype_backend='pyarrow') doesn't
    # convert to it?
    dtypes.Categorical: "category",
    dtypes.Object: "object",
}
NW_TO_PD_DTYPES_BACKEND: Mapping[type[DType], Mapping[DTypeBackend, str | type[Any]]] = {
    dtypes.Float64: {
        "pyarrow": "Float64[pyarrow]",
        "numpy_nullable": "Float64",
        None: "float64",
    },
    dtypes.Float32: {
        "pyarrow": "Float32[pyarrow]",
        "numpy_nullable": "Float32",
        None: "float32",
    },
    dtypes.Int64: {"pyarrow": "Int64[pyarrow]", "numpy_nullable": "Int64", None: "int64"},
    dtypes.Int32: {"pyarrow": "Int32[pyarrow]", "numpy_nullable": "Int32", None: "int32"},
    dtypes.Int16: {"pyarrow": "Int16[pyarrow]", "numpy_nullable": "Int16", None: "int16"},
    dtypes.Int8: {"pyarrow": "Int8[pyarrow]", "numpy_nullable": "Int8", None: "int8"},
    dtypes.UInt64: {
        "pyarrow": "UInt64[pyarrow]",
        "numpy_nullable": "UInt64",
        None: "uint64",
    },
    dtypes.UInt32: {
        "pyarrow": "UInt32[pyarrow]",
        "numpy_nullable": "UInt32",
        None: "uint32",
    },
    dtypes.UInt16: {
        "pyarrow": "UInt16[pyarrow]",
        "numpy_nullable": "UInt16",
        None: "uint16",
    },
    dtypes.UInt8: {"pyarrow": "UInt8[pyarrow]", "numpy_nullable": "UInt8", None: "uint8"},
    dtypes.String: {"pyarrow": "string[pyarrow]", "numpy_nullable": "string", None: str},
    dtypes.Boolean: {
        "pyarrow": "boolean[pyarrow]",
        "numpy_nullable": "boolean",
        None: "bool",
    },
}
UNSUPPORTED_DTYPES = (dtypes.Decimal,)

def narwhals_to_native_dtype(  # noqa: C901, PLR0912
    dtype: IntoDType,
    dtype_backend: DTypeBackend,
    implementation: Implementation,
    version: Version,
) -> str | PandasDtype:
    if dtype_backend not in {None, "pyarrow", "numpy_nullable"}:
        msg = f"Expected one of {{None, 'pyarrow', 'numpy_nullable'}}, got: '{dtype_backend}'"
        raise ValueError(msg)
    dtypes = version.dtypes
    base_type = dtype.base_type()
    if pd_type := NW_TO_PD_DTYPES_INVARIANT.get(base_type):
        return pd_type
    if into_pd_type := NW_TO_PD_DTYPES_BACKEND.get(base_type):
        return into_pd_type[dtype_backend]
    if isinstance_or_issubclass(dtype, dtypes.Datetime):
        # Pandas does not support "ms" or "us" time units before version 2.0
        if is_pandas_or_modin(implementation) and PANDAS_VERSION < (
            2,
        ):  # pragma: no cover
            dt_time_unit = "ns"
        else:
            dt_time_unit = dtype.time_unit

        if dtype_backend == "pyarrow":
            tz_part = f", tz={tz}" if (tz := dtype.time_zone) else ""
            return f"timestamp[{dt_time_unit}{tz_part}][pyarrow]"
        tz_part = f", {tz}" if (tz := dtype.time_zone) else ""
        return f"datetime64[{dt_time_unit}{tz_part}]"
    if isinstance_or_issubclass(dtype, dtypes.Duration):
        if is_pandas_or_modin(implementation) and PANDAS_VERSION < (
            2,
        ):  # pragma: no cover
            du_time_unit = "ns"
        else:
            du_time_unit = dtype.time_unit
        return (
            f"duration[{du_time_unit}][pyarrow]"
            if dtype_backend == "pyarrow"
            else f"timedelta64[{du_time_unit}]"
        )
    if isinstance_or_issubclass(dtype, dtypes.Date):
        try:
            import pyarrow as pa  # ignore-banned-import # noqa: F401
        except ModuleNotFoundError as exc:  # pragma: no cover
            # BUG: Never re-raised?
            msg = "'pyarrow>=13.0.0' is required for `Date` dtype."
            raise ModuleNotFoundError(msg) from exc
        return "date32[pyarrow]"
    if isinstance_or_issubclass(dtype, dtypes.Enum):
        if version is Version.V1:
            msg = "Converting to Enum is not supported in narwhals.stable.v1"
            raise NotImplementedError(msg)
        if isinstance(dtype, dtypes.Enum):
            ns = implementation.to_native_namespace()
            return ns.CategoricalDtype(dtype.categories, ordered=True)
        msg = "Can not cast / initialize Enum without categories present"
        raise ValueError(msg)
    if issubclass(
        base_type, (dtypes.Struct, dtypes.Array, dtypes.List, dtypes.Time, dtypes.Binary)
    ):
        return narwhals_to_native_arrow_dtype(dtype, implementation, version)
    if issubclass(base_type, UNSUPPORTED_DTYPES):
        msg = f"Converting to {base_type.__name__} dtype is not supported for {implementation}."
        raise NotImplementedError(msg)
    msg = f"Unknown dtype: {dtype}"  # pragma: no cover
    raise AssertionError(msg)

def narwhals_to_native_arrow_dtype(
    dtype: IntoDType, implementation: Implementation, version: Version
) -> pd.ArrowDtype:
    if is_pandas_or_modin(implementation) and PANDAS_VERSION >= (2, 2):
        try:
            import pyarrow as pa  # ignore-banned-import # noqa: F401
        except ImportError as exc:  # pragma: no cover
            msg = f"Unable to convert to {dtype} due to the following exception: {exc.msg}"
            raise ImportError(msg) from exc
        from narwhals._arrow.utils import narwhals_to_native_dtype as _to_arrow_dtype

        return pd.ArrowDtype(_to_arrow_dtype(dtype, version))
    msg = (  # pragma: no cover
        f"Converting to {dtype} dtype is not supported for implementation "
        f"{implementation} and version {version}."
    )
    raise NotImplementedError(msg)

def int_dtype_mapper(dtype: Any) -> str:
    if "pyarrow" in str(dtype):
        return "Int64[pyarrow]"
    if str(dtype).lower() != str(dtype):  # pragma: no cover
        return "Int64"
    return "int64"


_TIMESTAMP_DATETIME_OP_FACTOR: Mapping[
    tuple[UnitCurrent, UnitTarget], tuple[BinOpBroadcast, IntoRhs]
] = {
    ("ns", "us"): (operator.floordiv, 1_000),
    ("ns", "ms"): (operator.floordiv, 1_000_000),
    ("us", "ns"): (operator.mul, NS_PER_MICROSECOND),
    ("us", "ms"): (operator.floordiv, 1_000),
    ("ms", "ns"): (operator.mul, NS_PER_MILLISECOND),
    ("ms", "us"): (operator.mul, 1_000),
    ("s", "ns"): (operator.mul, NS_PER_SECOND),
    ("s", "us"): (operator.mul, US_PER_SECOND),
    ("s", "ms"): (operator.mul, MS_PER_SECOND),
}

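# Editor's note: a worked example of the factor table above. Upscaling
# multiplies, downscaling floor-divides (values chosen for illustration):
#
#     fn, factor = _TIMESTAMP_DATETIME_OP_FACTOR[("ms", "us")]
#     fn(1_700_000_000_123, factor)  # operator.mul -> 1_700_000_000_123_000
#
#     fn, factor = _TIMESTAMP_DATETIME_OP_FACTOR[("ns", "ms")]
#     fn(1_500_000_000, factor)      # operator.floordiv -> 1_500 (1.5s of ns -> ms)
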
def calculate_timestamp_datetime(
    s: NativeSeriesT, current: TimeUnit, time_unit: TimeUnit
) -> NativeSeriesT:
    if current == time_unit:
        return s
    if item := _TIMESTAMP_DATETIME_OP_FACTOR.get((current, time_unit)):
        fn, factor = item
        return fn(s, factor)
    msg = (  # pragma: no cover
        f"unexpected time unit {current}, please report an issue at "
        "https://github.com/narwhals-dev/narwhals"
    )
    raise AssertionError(msg)


_TIMESTAMP_DATE_FACTOR: Mapping[TimeUnit, int] = {
    "ns": NS_PER_SECOND,
    "us": US_PER_SECOND,
    "ms": MS_PER_SECOND,
    "s": 1,
}


def calculate_timestamp_date(s: NativeSeriesT, time_unit: TimeUnit) -> NativeSeriesT:
    return s * SECONDS_PER_DAY * _TIMESTAMP_DATE_FACTOR[time_unit]

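# Editor's note: dates are stored as whole days since the Unix epoch, so the
# function above is plain arithmetic. For 2021-01-01, which is day 18_628:
#
#     calculate_timestamp_date(18_628, "ms")
#     # 18_628 * 86_400 * 1_000 = 1_609_459_200_000 ms since the epoch
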
def select_columns_by_name(
    df: NativeDataFrameT,
    column_names: list[str] | _1DArray,  # NOTE: Cannot be a tuple!
    implementation: Implementation,
) -> NativeDataFrameT | Any:
    """Select columns by name.

    Prefer this over `df.loc[:, column_names]` as it's
    generally more performant.
    """
    if len(column_names) == df.shape[1] and (df.columns == column_names).all():
        return df
    if (df.columns.dtype.kind == "b") or (
        implementation is Implementation.PANDAS
        and implementation._backend_version() < (1, 5)
    ):
        # See https://github.com/narwhals-dev/narwhals/issues/1349#issuecomment-2470118122
        # for why we need this
        if error := check_columns_exist(column_names, available=df.columns.tolist()):
            raise error
        return df.loc[:, column_names]
    try:
        return df[column_names]
    except KeyError as e:
        if error := check_columns_exist(column_names, available=df.columns.tolist()):
            raise error from e
        raise

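# Editor's note: a usage sketch (hypothetical frame). This helper is preferred
# over `df.loc[:, cols]` for speed and gives a clearer error on missing columns:
#
#     df = pd.DataFrame({"a": [1], "b": [2]})
#     select_columns_by_name(df, ["b"], Implementation.PANDAS)  # same result as df[["b"]]
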
def is_non_nullable_boolean(s: PandasLikeSeries) -> bool:
    # cuDF booleans are nullable but the native dtype is still 'bool'.
    return (
        s._implementation
        in {Implementation.PANDAS, Implementation.MODIN, Implementation.DASK}
        and s.native.dtype == "bool"
    )

def import_array_module(implementation: Implementation, /) -> ModuleType:
    """Returns numpy or cupy module depending on the given implementation."""
    if implementation in {Implementation.PANDAS, Implementation.MODIN}:
        import numpy as np

        return np
    if implementation is Implementation.CUDF:
        import cupy as cp  # ignore-banned-import # cuDF dependency.

        return cp
    msg = f"Expected pandas/modin/cudf, got: {implementation}"  # pragma: no cover
    raise AssertionError(msg)


class PandasLikeSeriesNamespace(EagerSeriesNamespace["PandasLikeSeries", Any]): ...