done
lib/python3.11/site-packages/pandas/io/parsers/__init__.py
		| @@ -0,0 +1,9 @@ | ||||
| from pandas.io.parsers.readers import ( | ||||
|     TextFileReader, | ||||
|     TextParser, | ||||
|     read_csv, | ||||
|     read_fwf, | ||||
|     read_table, | ||||
| ) | ||||
|  | ||||
| __all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"] | ||||
lib/python3.11/site-packages/pandas/io/parsers/arrow_parser_wrapper.py
| @@ -0,0 +1,295 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from typing import TYPE_CHECKING | ||||
| import warnings | ||||
|  | ||||
| from pandas._libs import lib | ||||
| from pandas.compat._optional import import_optional_dependency | ||||
| from pandas.errors import ( | ||||
|     ParserError, | ||||
|     ParserWarning, | ||||
| ) | ||||
| from pandas.util._exceptions import find_stack_level | ||||
|  | ||||
| from pandas.core.dtypes.common import pandas_dtype | ||||
| from pandas.core.dtypes.inference import is_integer | ||||
|  | ||||
| from pandas.io._util import arrow_table_to_pandas | ||||
| from pandas.io.parsers.base_parser import ParserBase | ||||
|  | ||||
| if TYPE_CHECKING: | ||||
|     from pandas._typing import ReadBuffer | ||||
|  | ||||
|     from pandas import DataFrame | ||||
|  | ||||
|  | ||||
| class ArrowParserWrapper(ParserBase): | ||||
|     """ | ||||
|     Wrapper for the pyarrow engine for read_csv() | ||||
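|  | ||||
|     Typically created via ``pd.read_csv(..., engine="pyarrow")`` rather | ||||
|     than constructed directly. | ||||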
|     """ | ||||
|  | ||||
|     def __init__(self, src: ReadBuffer[bytes], **kwds) -> None: | ||||
|         super().__init__(kwds) | ||||
|         self.kwds = kwds | ||||
|         self.src = src | ||||
|  | ||||
|         self._parse_kwds() | ||||
|  | ||||
|     def _parse_kwds(self) -> None: | ||||
|         """ | ||||
|         Validates keywords before passing to pyarrow. | ||||
|         """ | ||||
|         encoding: str | None = self.kwds.get("encoding") | ||||
|         self.encoding = "utf-8" if encoding is None else encoding | ||||
|  | ||||
|         na_values = self.kwds["na_values"] | ||||
|         if isinstance(na_values, dict): | ||||
|             raise ValueError( | ||||
|                 "The pyarrow engine doesn't support passing a dict for na_values" | ||||
|             ) | ||||
|         self.na_values = list(na_values) | ||||
|  | ||||
|     def _get_pyarrow_options(self) -> None: | ||||
|         """ | ||||
|         Rename pandas arguments to their pyarrow equivalents. | ||||
|         """ | ||||
|         mapping = { | ||||
|             "usecols": "include_columns", | ||||
|             "na_values": "null_values", | ||||
|             "escapechar": "escape_char", | ||||
|             "skip_blank_lines": "ignore_empty_lines", | ||||
|             "decimal": "decimal_point", | ||||
|             "quotechar": "quote_char", | ||||
|         } | ||||
|         for pandas_name, pyarrow_name in mapping.items(): | ||||
|             if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: | ||||
|                 self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) | ||||
|  | ||||
|         # Date format handling | ||||
|         # If we get a string, we need to convert it into a list for pyarrow | ||||
|         # If we get a dict, we want to parse those separately | ||||
|         date_format = self.date_format | ||||
|         if isinstance(date_format, str): | ||||
|             date_format = [date_format] | ||||
|         else: | ||||
|             # In the case of a dict, we don't want to propagate it through, | ||||
|             # so just fall back to the pyarrow default of None. | ||||
|  | ||||
|             # Ideally, in the future we would disable pyarrow dtype inference | ||||
|             # (read everything in as string) to prevent misreads. | ||||
|             date_format = None | ||||
|         self.kwds["timestamp_parsers"] = date_format | ||||
|  | ||||
|         self.parse_options = { | ||||
|             option_name: option_value | ||||
|             for option_name, option_value in self.kwds.items() | ||||
|             if option_value is not None | ||||
|             and option_name | ||||
|             in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") | ||||
|         } | ||||
|  | ||||
|         on_bad_lines = self.kwds.get("on_bad_lines") | ||||
|         if on_bad_lines is not None: | ||||
|             if callable(on_bad_lines): | ||||
|                 self.parse_options["invalid_row_handler"] = on_bad_lines | ||||
|             elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR: | ||||
|                 self.parse_options[ | ||||
|                     "invalid_row_handler" | ||||
|                 ] = None  # PyArrow raises an exception by default | ||||
|             elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: | ||||
|  | ||||
|                 def handle_warning(invalid_row) -> str: | ||||
|                     warnings.warn( | ||||
|                         f"Expected {invalid_row.expected_columns} columns, but found " | ||||
|                         f"{invalid_row.actual_columns}: {invalid_row.text}", | ||||
|                         ParserWarning, | ||||
|                         stacklevel=find_stack_level(), | ||||
|                     ) | ||||
|                     return "skip" | ||||
|  | ||||
|                 self.parse_options["invalid_row_handler"] = handle_warning | ||||
|             elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP: | ||||
|                 self.parse_options["invalid_row_handler"] = lambda _: "skip" | ||||
|  | ||||
|         self.convert_options = { | ||||
|             option_name: option_value | ||||
|             for option_name, option_value in self.kwds.items() | ||||
|             if option_value is not None | ||||
|             and option_name | ||||
|             in ( | ||||
|                 "include_columns", | ||||
|                 "null_values", | ||||
|                 "true_values", | ||||
|                 "false_values", | ||||
|                 "decimal_point", | ||||
|                 "timestamp_parsers", | ||||
|             ) | ||||
|         } | ||||
|         self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] | ||||
|         # autogenerated column names are prefixed with 'f' in pyarrow.csv | ||||
|         if self.header is None and "include_columns" in self.convert_options: | ||||
|             self.convert_options["include_columns"] = [ | ||||
|                 f"f{n}" for n in self.convert_options["include_columns"] | ||||
|             ] | ||||
|  | ||||
|         self.read_options = { | ||||
|             "autogenerate_column_names": self.header is None, | ||||
|             "skip_rows": self.header | ||||
|             if self.header is not None | ||||
|             else self.kwds["skiprows"], | ||||
|             "encoding": self.encoding, | ||||
|         } | ||||
|  | ||||
|     def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: | ||||
|         """ | ||||
|         Processes the data read in according to the kwargs passed to the constructor. | ||||
|  | ||||
|         Parameters | ||||
|         ---------- | ||||
|         frame : DataFrame | ||||
|             The DataFrame to process. | ||||
|  | ||||
|         Returns | ||||
|         ------- | ||||
|         DataFrame | ||||
|             The processed DataFrame. | ||||
|         """ | ||||
|         num_cols = len(frame.columns) | ||||
|         multi_index_named = True | ||||
|         if self.header is None: | ||||
|             if self.names is None: | ||||
|                 self.names = range(num_cols) | ||||
|             if len(self.names) != num_cols: | ||||
|                 # usecols is passed through to pyarrow; we only handle index_col here. | ||||
|                 # The only way self.names is not the same length as the number of | ||||
|                 # cols is if we have an int index_col. We should just pad the names | ||||
|                 # (they will get removed anyway) to the expected length then. | ||||
|                 columns_prefix = [str(x) for x in range(num_cols - len(self.names))] | ||||
|                 self.names = columns_prefix + self.names | ||||
|                 multi_index_named = False | ||||
|             frame.columns = self.names | ||||
|         # we only need the frame, not the names | ||||
|         _, frame = self._do_date_conversions(frame.columns, frame) | ||||
|         if self.index_col is not None: | ||||
|             index_to_set = self.index_col.copy() | ||||
|             for i, item in enumerate(self.index_col): | ||||
|                 if is_integer(item): | ||||
|                     index_to_set[i] = frame.columns[item] | ||||
|                 # String case | ||||
|                 elif item not in frame.columns: | ||||
|                     raise ValueError(f"Index {item} invalid") | ||||
|  | ||||
|                 # Process dtype for index_col and drop from dtypes | ||||
|                 if self.dtype is not None: | ||||
|                     key, new_dtype = ( | ||||
|                         (item, self.dtype.get(item)) | ||||
|                         if self.dtype.get(item) is not None | ||||
|                         else (frame.columns[item], self.dtype.get(frame.columns[item])) | ||||
|                     ) | ||||
|                     if new_dtype is not None: | ||||
|                         frame[key] = frame[key].astype(new_dtype) | ||||
|                         del self.dtype[key] | ||||
|  | ||||
|             frame.set_index(index_to_set, drop=True, inplace=True) | ||||
|             # Clear names if headerless and no name given | ||||
|             if self.header is None and not multi_index_named: | ||||
|                 frame.index.names = [None] * len(frame.index.names) | ||||
|  | ||||
|         if self.dtype is not None: | ||||
|             # Ignore non-existent columns from dtype mapping | ||||
|             # like other parsers do | ||||
|             if isinstance(self.dtype, dict): | ||||
|                 self.dtype = { | ||||
|                     k: pandas_dtype(v) | ||||
|                     for k, v in self.dtype.items() | ||||
|                     if k in frame.columns | ||||
|                 } | ||||
|             else: | ||||
|                 self.dtype = pandas_dtype(self.dtype) | ||||
|             try: | ||||
|                 frame = frame.astype(self.dtype) | ||||
|             except TypeError as e: | ||||
|                 # GH#44901 reraise to keep api consistent | ||||
|                 raise ValueError(e) | ||||
|         return frame | ||||
|  | ||||
|     def _validate_usecols(self, usecols) -> None: | ||||
|         if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): | ||||
|             raise ValueError( | ||||
|                 "The pyarrow engine does not allow 'usecols' to be integer " | ||||
|                 "column positions. Pass a list of string column names instead." | ||||
|             ) | ||||
|         elif callable(usecols): | ||||
|             raise ValueError( | ||||
|                 "The pyarrow engine does not allow 'usecols' to be a callable." | ||||
|             ) | ||||
|  | ||||
|     def read(self) -> DataFrame: | ||||
|         """ | ||||
|         Reads the contents of a CSV file into a DataFrame and | ||||
|         processes it according to the kwargs passed in the | ||||
|         constructor. | ||||
|  | ||||
|         Returns | ||||
|         ------- | ||||
|         DataFrame | ||||
|             The DataFrame created from the CSV file. | ||||
|         """ | ||||
|         pa = import_optional_dependency("pyarrow") | ||||
|         pyarrow_csv = import_optional_dependency("pyarrow.csv") | ||||
|         self._get_pyarrow_options() | ||||
|  | ||||
|         try: | ||||
|             convert_options = pyarrow_csv.ConvertOptions(**self.convert_options) | ||||
|         except TypeError: | ||||
|             include = self.convert_options.get("include_columns", None) | ||||
|             if include is not None: | ||||
|                 self._validate_usecols(include) | ||||
|  | ||||
|             nulls = self.convert_options.get("null_values", set()) | ||||
|             if not lib.is_list_like(nulls) or not all( | ||||
|                 isinstance(x, str) for x in nulls | ||||
|             ): | ||||
|                 raise TypeError( | ||||
|                     "The 'pyarrow' engine requires all na_values to be strings" | ||||
|                 ) | ||||
|  | ||||
|             raise | ||||
|  | ||||
|         try: | ||||
|             table = pyarrow_csv.read_csv( | ||||
|                 self.src, | ||||
|                 read_options=pyarrow_csv.ReadOptions(**self.read_options), | ||||
|                 parse_options=pyarrow_csv.ParseOptions(**self.parse_options), | ||||
|                 convert_options=convert_options, | ||||
|             ) | ||||
|         except pa.ArrowInvalid as e: | ||||
|             raise ParserError(e) from e | ||||
|  | ||||
|         dtype_backend = self.kwds["dtype_backend"] | ||||
|  | ||||
|         # Convert all pa.null() cols -> float64 (non-nullable); in the | ||||
|         # nullable case they become Int64 via null_to_int64=True below. | ||||
|         if dtype_backend is lib.no_default: | ||||
|             new_schema = table.schema | ||||
|             new_type = pa.float64() | ||||
|             for i, arrow_type in enumerate(table.schema.types): | ||||
|                 if pa.types.is_null(arrow_type): | ||||
|                     new_schema = new_schema.set( | ||||
|                         i, new_schema.field(i).with_type(new_type) | ||||
|                     ) | ||||
|  | ||||
|             table = table.cast(new_schema) | ||||
|  | ||||
|         with warnings.catch_warnings(): | ||||
|             warnings.filterwarnings( | ||||
|                 "ignore", | ||||
|                 "make_block is deprecated", | ||||
|                 DeprecationWarning, | ||||
|             ) | ||||
|             frame = arrow_table_to_pandas( | ||||
|                 table, dtype_backend=dtype_backend, null_to_int64=True | ||||
|             ) | ||||
|  | ||||
|         return self._finalize_pandas_output(frame) | ||||
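The wrapper above is what pd.read_csv(..., engine="pyarrow") drives: pandas keyword names are remapped to pyarrow's ReadOptions/ParseOptions/ConvertOptions, and on_bad_lines becomes an invalid_row_handler. A usage sketch, assuming pyarrow is installed and a pandas build matching this source (the CSV bytes are invented for illustration):

    import io

    import pandas as pd

    data = b"a,b,c\n1,2,3\n4,5\n6,7,8\n"  # second data row is too short

    # on_bad_lines="skip" maps to a pyarrow invalid_row_handler that returns
    # "skip", so the malformed row is dropped instead of raising ParserError.
    df = pd.read_csv(io.BytesIO(data), engine="pyarrow", on_bad_lines="skip")
    print(df["a"].tolist())  # [1, 6]

    # usecols must be string labels with this engine; integer positions or a
    # callable raise ValueError (see _validate_usecols above).
    df = pd.read_csv(
        io.BytesIO(data), engine="pyarrow", usecols=["a", "c"], on_bad_lines="skip"
    )
    print(list(df.columns))  # ['a', 'c']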
lib/python3.11/site-packages/pandas/io/parsers/base_parser.py | 1462 | Normal file | File diff suppressed because it is too large
lib/python3.11/site-packages/pandas/io/parsers/c_parser_wrapper.py
| @@ -0,0 +1,410 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from collections import defaultdict | ||||
| from typing import TYPE_CHECKING | ||||
| import warnings | ||||
|  | ||||
| import numpy as np | ||||
|  | ||||
| from pandas._libs import ( | ||||
|     lib, | ||||
|     parsers, | ||||
| ) | ||||
| from pandas.compat._optional import import_optional_dependency | ||||
| from pandas.errors import DtypeWarning | ||||
| from pandas.util._exceptions import find_stack_level | ||||
|  | ||||
| from pandas.core.dtypes.common import pandas_dtype | ||||
| from pandas.core.dtypes.concat import ( | ||||
|     concat_compat, | ||||
|     union_categoricals, | ||||
| ) | ||||
| from pandas.core.dtypes.dtypes import CategoricalDtype | ||||
|  | ||||
| from pandas.core.indexes.api import ensure_index_from_sequences | ||||
|  | ||||
| from pandas.io.common import ( | ||||
|     dedup_names, | ||||
|     is_potential_multi_index, | ||||
| ) | ||||
| from pandas.io.parsers.base_parser import ( | ||||
|     ParserBase, | ||||
|     ParserError, | ||||
|     is_index_col, | ||||
| ) | ||||
|  | ||||
| if TYPE_CHECKING: | ||||
|     from collections.abc import ( | ||||
|         Hashable, | ||||
|         Mapping, | ||||
|         Sequence, | ||||
|     ) | ||||
|  | ||||
|     from pandas._typing import ( | ||||
|         ArrayLike, | ||||
|         DtypeArg, | ||||
|         DtypeObj, | ||||
|         ReadCsvBuffer, | ||||
|     ) | ||||
|  | ||||
|     from pandas import ( | ||||
|         Index, | ||||
|         MultiIndex, | ||||
|     ) | ||||
|  | ||||
|  | ||||
| class CParserWrapper(ParserBase): | ||||
|     low_memory: bool | ||||
|     _reader: parsers.TextReader | ||||
|  | ||||
|     def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: | ||||
|         super().__init__(kwds) | ||||
|         self.kwds = kwds | ||||
|         kwds = kwds.copy() | ||||
|  | ||||
|         self.low_memory = kwds.pop("low_memory", False) | ||||
|  | ||||
|         # GH#2442 | ||||
|         # error: Cannot determine type of 'index_col' | ||||
|         kwds["allow_leading_cols"] = ( | ||||
|             self.index_col is not False  # type: ignore[has-type] | ||||
|         ) | ||||
|  | ||||
|         # GH20529, validate usecol arg before TextReader | ||||
|         kwds["usecols"] = self.usecols | ||||
|  | ||||
|         # Have to pass int, would break tests using TextReader directly otherwise :( | ||||
|         kwds["on_bad_lines"] = self.on_bad_lines.value | ||||
|  | ||||
|         for key in ( | ||||
|             "storage_options", | ||||
|             "encoding", | ||||
|             "memory_map", | ||||
|             "compression", | ||||
|         ): | ||||
|             kwds.pop(key, None) | ||||
|  | ||||
|         kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None)) | ||||
|         if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default: | ||||
|             kwds["dtype_backend"] = "numpy" | ||||
|         if kwds["dtype_backend"] == "pyarrow": | ||||
|             # Fail here loudly instead of in cython after reading | ||||
|             import_optional_dependency("pyarrow") | ||||
|         self._reader = parsers.TextReader(src, **kwds) | ||||
|  | ||||
|         self.unnamed_cols = self._reader.unnamed_cols | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         passed_names = self.names is None  # type: ignore[has-type] | ||||
|  | ||||
|         if self._reader.header is None: | ||||
|             self.names = None | ||||
|         else: | ||||
|             # error: Cannot determine type of 'names' | ||||
|             # error: Cannot determine type of 'index_names' | ||||
|             ( | ||||
|                 self.names,  # type: ignore[has-type] | ||||
|                 self.index_names, | ||||
|                 self.col_names, | ||||
|                 passed_names, | ||||
|             ) = self._extract_multi_indexer_columns( | ||||
|                 self._reader.header, | ||||
|                 self.index_names,  # type: ignore[has-type] | ||||
|                 passed_names, | ||||
|             ) | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         if self.names is None:  # type: ignore[has-type] | ||||
|             self.names = list(range(self._reader.table_width)) | ||||
|  | ||||
|         # gh-9755 | ||||
|         # | ||||
|         # need to set orig_names here first | ||||
|         # so that proper indexing can be done | ||||
|         # with _set_noconvert_columns | ||||
|         # | ||||
|         # once names has been filtered, we will | ||||
|         # then set orig_names again to names | ||||
|         # error: Cannot determine type of 'names' | ||||
|         self.orig_names = self.names[:]  # type: ignore[has-type] | ||||
|  | ||||
|         if self.usecols: | ||||
|             usecols = self._evaluate_usecols(self.usecols, self.orig_names) | ||||
|  | ||||
|             # GH 14671 | ||||
|             # assert for mypy, orig_names is List or None, None would error in issubset | ||||
|             assert self.orig_names is not None | ||||
|             if self.usecols_dtype == "string" and not set(usecols).issubset( | ||||
|                 self.orig_names | ||||
|             ): | ||||
|                 self._validate_usecols_names(usecols, self.orig_names) | ||||
|  | ||||
|             # error: Cannot determine type of 'names' | ||||
|             if len(self.names) > len(usecols):  # type: ignore[has-type] | ||||
|                 # error: Cannot determine type of 'names' | ||||
|                 self.names = [  # type: ignore[has-type] | ||||
|                     n | ||||
|                     # error: Cannot determine type of 'names' | ||||
|                     for i, n in enumerate(self.names)  # type: ignore[has-type] | ||||
|                     if (i in usecols or n in usecols) | ||||
|                 ] | ||||
|  | ||||
|             # error: Cannot determine type of 'names' | ||||
|             if len(self.names) < len(usecols):  # type: ignore[has-type] | ||||
|                 # error: Cannot determine type of 'names' | ||||
|                 self._validate_usecols_names( | ||||
|                     usecols, | ||||
|                     self.names,  # type: ignore[has-type] | ||||
|                 ) | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         self._validate_parse_dates_presence(self.names)  # type: ignore[has-type] | ||||
|         self._set_noconvert_columns() | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         self.orig_names = self.names  # type: ignore[has-type] | ||||
|  | ||||
|         if not self._has_complex_date_col: | ||||
|             # error: Cannot determine type of 'index_col' | ||||
|             if self._reader.leading_cols == 0 and is_index_col( | ||||
|                 self.index_col  # type: ignore[has-type] | ||||
|             ): | ||||
|                 self._name_processed = True | ||||
|                 ( | ||||
|                     index_names, | ||||
|                     # error: Cannot determine type of 'names' | ||||
|                     self.names,  # type: ignore[has-type] | ||||
|                     self.index_col, | ||||
|                 ) = self._clean_index_names( | ||||
|                     # error: Cannot determine type of 'names' | ||||
|                     self.names,  # type: ignore[has-type] | ||||
|                     # error: Cannot determine type of 'index_col' | ||||
|                     self.index_col,  # type: ignore[has-type] | ||||
|                 ) | ||||
|  | ||||
|                 if self.index_names is None: | ||||
|                     self.index_names = index_names | ||||
|  | ||||
|             if self._reader.header is None and not passed_names: | ||||
|                 assert self.index_names is not None | ||||
|                 self.index_names = [None] * len(self.index_names) | ||||
|  | ||||
|         self._implicit_index = self._reader.leading_cols > 0 | ||||
|  | ||||
|     def close(self) -> None: | ||||
|         # close handles opened by C parser | ||||
|         try: | ||||
|             self._reader.close() | ||||
|         except ValueError: | ||||
|             pass | ||||
|  | ||||
|     def _set_noconvert_columns(self) -> None: | ||||
|         """ | ||||
|         Set the columns that should not undergo dtype conversions. | ||||
|  | ||||
|         Currently, any column that is involved with date parsing will not | ||||
|         undergo such conversions. | ||||
|         """ | ||||
|         assert self.orig_names is not None | ||||
|  | ||||
|         # much faster than using orig_names.index(x) xref GH#44106 | ||||
|         names_dict = {x: i for i, x in enumerate(self.orig_names)} | ||||
|         # error: Cannot determine type of 'names' | ||||
|         col_indices = [names_dict[x] for x in self.names]  # type: ignore[has-type] | ||||
|         # error: Cannot determine type of 'names' | ||||
|         noconvert_columns = self._set_noconvert_dtype_columns( | ||||
|             col_indices, | ||||
|             self.names,  # type: ignore[has-type] | ||||
|         ) | ||||
|         for col in noconvert_columns: | ||||
|             self._reader.set_noconvert(col) | ||||
|  | ||||
|     def read( | ||||
|         self, | ||||
|         nrows: int | None = None, | ||||
|     ) -> tuple[ | ||||
|         Index | MultiIndex | None, | ||||
|         Sequence[Hashable] | MultiIndex, | ||||
|         Mapping[Hashable, ArrayLike], | ||||
|     ]: | ||||
|         index: Index | MultiIndex | None | ||||
|         column_names: Sequence[Hashable] | MultiIndex | ||||
|         try: | ||||
|             if self.low_memory: | ||||
|                 chunks = self._reader.read_low_memory(nrows) | ||||
|                 # destructive to chunks | ||||
|                 data = _concatenate_chunks(chunks) | ||||
|  | ||||
|             else: | ||||
|                 data = self._reader.read(nrows) | ||||
|         except StopIteration: | ||||
|             if self._first_chunk: | ||||
|                 self._first_chunk = False | ||||
|                 names = dedup_names( | ||||
|                     self.orig_names, | ||||
|                     is_potential_multi_index(self.orig_names, self.index_col), | ||||
|                 ) | ||||
|                 index, columns, col_dict = self._get_empty_meta( | ||||
|                     names, | ||||
|                     dtype=self.dtype, | ||||
|                 ) | ||||
|                 columns = self._maybe_make_multi_index_columns(columns, self.col_names) | ||||
|  | ||||
|                 if self.usecols is not None: | ||||
|                     columns = self._filter_usecols(columns) | ||||
|  | ||||
|                 col_dict = {k: v for k, v in col_dict.items() if k in columns} | ||||
|  | ||||
|                 return index, columns, col_dict | ||||
|  | ||||
|             else: | ||||
|                 self.close() | ||||
|                 raise | ||||
|  | ||||
|         # Done with first read, next time raise StopIteration | ||||
|         self._first_chunk = False | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         names = self.names  # type: ignore[has-type] | ||||
|  | ||||
|         if self._reader.leading_cols: | ||||
|             if self._has_complex_date_col: | ||||
|                 raise NotImplementedError("file structure not yet supported") | ||||
|  | ||||
|             # implicit index, no index names | ||||
|             arrays = [] | ||||
|  | ||||
|             if self.index_col and self._reader.leading_cols != len(self.index_col): | ||||
|                 raise ParserError( | ||||
|                     "Could not construct index. Requested to use " | ||||
|                     f"{len(self.index_col)} number of columns, but " | ||||
|                     f"{self._reader.leading_cols} left to parse." | ||||
|                 ) | ||||
|  | ||||
|             for i in range(self._reader.leading_cols): | ||||
|                 if self.index_col is None: | ||||
|                     values = data.pop(i) | ||||
|                 else: | ||||
|                     values = data.pop(self.index_col[i]) | ||||
|  | ||||
|                 values = self._maybe_parse_dates(values, i, try_parse_dates=True) | ||||
|                 arrays.append(values) | ||||
|  | ||||
|             index = ensure_index_from_sequences(arrays) | ||||
|  | ||||
|             if self.usecols is not None: | ||||
|                 names = self._filter_usecols(names) | ||||
|  | ||||
|             names = dedup_names(names, is_potential_multi_index(names, self.index_col)) | ||||
|  | ||||
|             # rename dict keys | ||||
|             data_tups = sorted(data.items()) | ||||
|             data = {k: v for k, (i, v) in zip(names, data_tups)} | ||||
|  | ||||
|             column_names, date_data = self._do_date_conversions(names, data) | ||||
|  | ||||
|             # maybe create a mi on the columns | ||||
|             column_names = self._maybe_make_multi_index_columns( | ||||
|                 column_names, self.col_names | ||||
|             ) | ||||
|  | ||||
|         else: | ||||
|             # rename dict keys | ||||
|             data_tups = sorted(data.items()) | ||||
|  | ||||
|             # ugh, mutation | ||||
|  | ||||
|             # assert for mypy, orig_names is List or None, None would error in list(...) | ||||
|             assert self.orig_names is not None | ||||
|             names = list(self.orig_names) | ||||
|             names = dedup_names(names, is_potential_multi_index(names, self.index_col)) | ||||
|  | ||||
|             if self.usecols is not None: | ||||
|                 names = self._filter_usecols(names) | ||||
|  | ||||
|             # columns as list | ||||
|             alldata = [x[1] for x in data_tups] | ||||
|             if self.usecols is None: | ||||
|                 self._check_data_length(names, alldata) | ||||
|  | ||||
|             data = {k: v for k, (i, v) in zip(names, data_tups)} | ||||
|  | ||||
|             names, date_data = self._do_date_conversions(names, data) | ||||
|             index, column_names = self._make_index(date_data, alldata, names) | ||||
|  | ||||
|         return index, column_names, date_data | ||||
|  | ||||
|     def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]: | ||||
|         # hackish | ||||
|         usecols = self._evaluate_usecols(self.usecols, names) | ||||
|         if usecols is not None and len(names) != len(usecols): | ||||
|             names = [ | ||||
|                 name for i, name in enumerate(names) if i in usecols or name in usecols | ||||
|             ] | ||||
|         return names | ||||
|  | ||||
|     def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): | ||||
|         if try_parse_dates and self._should_parse_dates(index): | ||||
|             values = self._date_conv( | ||||
|                 values, | ||||
|                 col=self.index_names[index] if self.index_names is not None else None, | ||||
|             ) | ||||
|         return values | ||||
|  | ||||
|  | ||||
| def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: | ||||
|     """ | ||||
|     Concatenate chunks of data read with low_memory=True. | ||||
|  | ||||
|     The tricky part is handling Categoricals, where different chunks | ||||
|     may have different inferred categories. | ||||
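|  | ||||
|     Examples | ||||
|     -------- | ||||
|     A minimal sketch (each chunk maps column position to an array): | ||||
|  | ||||
|     >>> import numpy as np | ||||
|     >>> _concatenate_chunks([{0: np.array([1, 2])}, {0: np.array([3, 4])}]) | ||||
|     {0: array([1, 2, 3, 4])} | ||||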
|     """ | ||||
|     names = list(chunks[0].keys()) | ||||
|     warning_columns = [] | ||||
|  | ||||
|     result: dict = {} | ||||
|     for name in names: | ||||
|         arrs = [chunk.pop(name) for chunk in chunks] | ||||
|         # Check each arr for consistent types. | ||||
|         dtypes = {a.dtype for a in arrs} | ||||
|         non_cat_dtypes = {x for x in dtypes if not isinstance(x, CategoricalDtype)} | ||||
|  | ||||
|         dtype = dtypes.pop() | ||||
|         if isinstance(dtype, CategoricalDtype): | ||||
|             result[name] = union_categoricals(arrs, sort_categories=False) | ||||
|         else: | ||||
|             result[name] = concat_compat(arrs) | ||||
|             if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object): | ||||
|                 warning_columns.append(str(name)) | ||||
|  | ||||
|     if warning_columns: | ||||
|         warning_names = ",".join(warning_columns) | ||||
|         warning_message = ( | ||||
|             f"Columns ({warning_names}) have mixed types. " | ||||
|             "Specify dtype option on import or set low_memory=False." | ||||
|         ) | ||||
|         warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level()) | ||||
|     return result | ||||
|  | ||||
|  | ||||
| def ensure_dtype_objs( | ||||
|     dtype: DtypeArg | dict[Hashable, DtypeArg] | None | ||||
| ) -> DtypeObj | dict[Hashable, DtypeObj] | None: | ||||
|     """ | ||||
|     Ensure we have either None, a dtype object, or a dictionary mapping to | ||||
|     dtype objects. | ||||
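|  | ||||
|     Examples | ||||
|     -------- | ||||
|     >>> ensure_dtype_objs({"a": "int64", "b": "float"}) | ||||
|     {'a': dtype('int64'), 'b': dtype('float64')} | ||||
|     >>> ensure_dtype_objs(None) is None | ||||
|     True | ||||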
|     """ | ||||
|     if isinstance(dtype, defaultdict): | ||||
|         # "None" not callable  [misc] | ||||
|         default_dtype = pandas_dtype(dtype.default_factory())  # type: ignore[misc] | ||||
|         dtype_converted: defaultdict = defaultdict(lambda: default_dtype) | ||||
|         for key in dtype.keys(): | ||||
|             dtype_converted[key] = pandas_dtype(dtype[key]) | ||||
|         return dtype_converted | ||||
|     elif isinstance(dtype, dict): | ||||
|         return {k: pandas_dtype(dtype[k]) for k in dtype} | ||||
|     elif dtype is not None: | ||||
|         return pandas_dtype(dtype) | ||||
|     return dtype | ||||
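_concatenate_chunks above leans on union_categoricals so that Categorical columns whose chunks inferred different category sets combine losslessly under low_memory=True. A standalone sketch of that building block, using only public pandas API rather than the parser internals:

    import pandas as pd
    from pandas.api.types import union_categoricals

    # Two chunks of one column can infer different category sets.
    chunk1 = pd.Categorical(["a", "b"])
    chunk2 = pd.Categorical(["b", "c"])

    merged = union_categoricals([chunk1, chunk2], sort_categories=False)
    print(list(merged))             # ['a', 'b', 'b', 'c']
    print(list(merged.categories))  # ['a', 'b', 'c']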
lib/python3.11/site-packages/pandas/io/parsers/python_parser.py | 1387 | Normal file | File diff suppressed because it is too large
lib/python3.11/site-packages/pandas/io/parsers/readers.py | 2383 | Normal file | File diff suppressed because it is too large