done
lib/python3.11/site-packages/pandas/io/parsers/__init__.py
		| @@ -0,0 +1,9 @@ | ||||
| from pandas.io.parsers.readers import ( | ||||
|     TextFileReader, | ||||
|     TextParser, | ||||
|     read_csv, | ||||
|     read_fwf, | ||||
|     read_table, | ||||
| ) | ||||
|  | ||||
| __all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"] | ||||
lib/python3.11/site-packages/pandas/io/parsers/arrow_parser_wrapper.py
| @@ -0,0 +1,295 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from typing import TYPE_CHECKING | ||||
| import warnings | ||||
|  | ||||
| from pandas._libs import lib | ||||
| from pandas.compat._optional import import_optional_dependency | ||||
| from pandas.errors import ( | ||||
|     ParserError, | ||||
|     ParserWarning, | ||||
| ) | ||||
| from pandas.util._exceptions import find_stack_level | ||||
|  | ||||
| from pandas.core.dtypes.common import pandas_dtype | ||||
| from pandas.core.dtypes.inference import is_integer | ||||
|  | ||||
| from pandas.io._util import arrow_table_to_pandas | ||||
| from pandas.io.parsers.base_parser import ParserBase | ||||
|  | ||||
| if TYPE_CHECKING: | ||||
|     from pandas._typing import ReadBuffer | ||||
|  | ||||
|     from pandas import DataFrame | ||||
|  | ||||
|  | ||||
| class ArrowParserWrapper(ParserBase): | ||||
|     """ | ||||
|     Wrapper for the pyarrow engine for read_csv() | ||||
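|  | ||||
|     Typically created via ``pd.read_csv(..., engine="pyarrow")`` rather | ||||
|     than constructed directly. | ||||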
|     """ | ||||
|  | ||||
|     def __init__(self, src: ReadBuffer[bytes], **kwds) -> None: | ||||
|         super().__init__(kwds) | ||||
|         self.kwds = kwds | ||||
|         self.src = src | ||||
|  | ||||
|         self._parse_kwds() | ||||
|  | ||||
|     def _parse_kwds(self) -> None: | ||||
|         """ | ||||
|         Validates keywords before passing to pyarrow. | ||||
|         """ | ||||
|         encoding: str | None = self.kwds.get("encoding") | ||||
|         self.encoding = "utf-8" if encoding is None else encoding | ||||
|  | ||||
|         na_values = self.kwds["na_values"] | ||||
|         if isinstance(na_values, dict): | ||||
|             raise ValueError( | ||||
|                 "The pyarrow engine doesn't support passing a dict for na_values" | ||||
|             ) | ||||
|         self.na_values = list(na_values) | ||||
|  | ||||
|     def _get_pyarrow_options(self) -> None: | ||||
|         """ | ||||
|         Rename pandas arguments to their pyarrow equivalents. | ||||
|         """ | ||||
|         mapping = { | ||||
|             "usecols": "include_columns", | ||||
|             "na_values": "null_values", | ||||
|             "escapechar": "escape_char", | ||||
|             "skip_blank_lines": "ignore_empty_lines", | ||||
|             "decimal": "decimal_point", | ||||
|             "quotechar": "quote_char", | ||||
|         } | ||||
|         for pandas_name, pyarrow_name in mapping.items(): | ||||
|             if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: | ||||
|                 self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) | ||||
|  | ||||
|         # Date format handling | ||||
|         # If we get a string, we need to convert it into a list for pyarrow | ||||
|         # If we get a dict, we want to parse those separately | ||||
|         date_format = self.date_format | ||||
|         if isinstance(date_format, str): | ||||
|             date_format = [date_format] | ||||
|         else: | ||||
|             # In the case of a dict, we don't want to propagate it through, | ||||
|             # so just fall back to the pyarrow default of None. | ||||
|  | ||||
|             # Ideally, in the future we would disable pyarrow dtype inference | ||||
|             # (read everything in as string) to prevent misreads. | ||||
|             date_format = None | ||||
|         self.kwds["timestamp_parsers"] = date_format | ||||
|  | ||||
|         self.parse_options = { | ||||
|             option_name: option_value | ||||
|             for option_name, option_value in self.kwds.items() | ||||
|             if option_value is not None | ||||
|             and option_name | ||||
|             in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") | ||||
|         } | ||||
|  | ||||
|         on_bad_lines = self.kwds.get("on_bad_lines") | ||||
|         if on_bad_lines is not None: | ||||
|             if callable(on_bad_lines): | ||||
|                 self.parse_options["invalid_row_handler"] = on_bad_lines | ||||
|             elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR: | ||||
|                 self.parse_options[ | ||||
|                     "invalid_row_handler" | ||||
|                 ] = None  # PyArrow raises an exception by default | ||||
|             elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: | ||||
|  | ||||
|                 def handle_warning(invalid_row) -> str: | ||||
|                     warnings.warn( | ||||
|                         f"Expected {invalid_row.expected_columns} columns, but found " | ||||
|                         f"{invalid_row.actual_columns}: {invalid_row.text}", | ||||
|                         ParserWarning, | ||||
|                         stacklevel=find_stack_level(), | ||||
|                     ) | ||||
|                     return "skip" | ||||
|  | ||||
|                 self.parse_options["invalid_row_handler"] = handle_warning | ||||
|             elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP: | ||||
|                 self.parse_options["invalid_row_handler"] = lambda _: "skip" | ||||
|  | ||||
|         self.convert_options = { | ||||
|             option_name: option_value | ||||
|             for option_name, option_value in self.kwds.items() | ||||
|             if option_value is not None | ||||
|             and option_name | ||||
|             in ( | ||||
|                 "include_columns", | ||||
|                 "null_values", | ||||
|                 "true_values", | ||||
|                 "false_values", | ||||
|                 "decimal_point", | ||||
|                 "timestamp_parsers", | ||||
|             ) | ||||
|         } | ||||
|         self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] | ||||
|         # autogenerated column names are prefixed with 'f' in pyarrow.csv | ||||
|         if self.header is None and "include_columns" in self.convert_options: | ||||
|             self.convert_options["include_columns"] = [ | ||||
|                 f"f{n}" for n in self.convert_options["include_columns"] | ||||
|             ] | ||||
|  | ||||
|         self.read_options = { | ||||
|             "autogenerate_column_names": self.header is None, | ||||
|             "skip_rows": self.header | ||||
|             if self.header is not None | ||||
|             else self.kwds["skiprows"], | ||||
|             "encoding": self.encoding, | ||||
|         } | ||||
|  | ||||
|     def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: | ||||
|         """ | ||||
|         Processes the data read in according to the kwargs passed to the constructor. | ||||
|  | ||||
|         Parameters | ||||
|         ---------- | ||||
|         frame : DataFrame | ||||
|             The DataFrame to process. | ||||
|  | ||||
|         Returns | ||||
|         ------- | ||||
|         DataFrame | ||||
|             The processed DataFrame. | ||||
|         """ | ||||
|         num_cols = len(frame.columns) | ||||
|         multi_index_named = True | ||||
|         if self.header is None: | ||||
|             if self.names is None: | ||||
|                 self.names = range(num_cols) | ||||
|             if len(self.names) != num_cols: | ||||
|                 # usecols is passed through to pyarrow; we only handle index_col here. | ||||
|                 # The only way self.names is not the same length as the number of | ||||
|                 # cols is if we have an int index_col. We should just pad the names | ||||
|                 # (they will get removed anyway) to the expected length then. | ||||
|                 columns_prefix = [str(x) for x in range(num_cols - len(self.names))] | ||||
|                 self.names = columns_prefix + self.names | ||||
|                 multi_index_named = False | ||||
|             frame.columns = self.names | ||||
|         # we only need the frame, not the names | ||||
|         _, frame = self._do_date_conversions(frame.columns, frame) | ||||
|         if self.index_col is not None: | ||||
|             index_to_set = self.index_col.copy() | ||||
|             for i, item in enumerate(self.index_col): | ||||
|                 if is_integer(item): | ||||
|                     index_to_set[i] = frame.columns[item] | ||||
|                 # String case | ||||
|                 elif item not in frame.columns: | ||||
|                     raise ValueError(f"Index {item} invalid") | ||||
|  | ||||
|                 # Process dtype for index_col and drop from dtypes | ||||
|                 if self.dtype is not None: | ||||
|                     key, new_dtype = ( | ||||
|                         (item, self.dtype.get(item)) | ||||
|                         if self.dtype.get(item) is not None | ||||
|                         else (frame.columns[item], self.dtype.get(frame.columns[item])) | ||||
|                     ) | ||||
|                     if new_dtype is not None: | ||||
|                         frame[key] = frame[key].astype(new_dtype) | ||||
|                         del self.dtype[key] | ||||
|  | ||||
|             frame.set_index(index_to_set, drop=True, inplace=True) | ||||
|             # Clear names if headerless and no name given | ||||
|             if self.header is None and not multi_index_named: | ||||
|                 frame.index.names = [None] * len(frame.index.names) | ||||
|  | ||||
|         if self.dtype is not None: | ||||
|             # Ignore non-existent columns from dtype mapping | ||||
|             # like other parsers do | ||||
|             if isinstance(self.dtype, dict): | ||||
|                 self.dtype = { | ||||
|                     k: pandas_dtype(v) | ||||
|                     for k, v in self.dtype.items() | ||||
|                     if k in frame.columns | ||||
|                 } | ||||
|             else: | ||||
|                 self.dtype = pandas_dtype(self.dtype) | ||||
|             try: | ||||
|                 frame = frame.astype(self.dtype) | ||||
|             except TypeError as e: | ||||
|                 # GH#44901 reraise to keep api consistent | ||||
|                 raise ValueError(e) | ||||
|         return frame | ||||
|  | ||||
|     def _validate_usecols(self, usecols) -> None: | ||||
|         if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): | ||||
|             raise ValueError( | ||||
|                 "The pyarrow engine does not allow 'usecols' to be integer " | ||||
|                 "column positions. Pass a list of string column names instead." | ||||
|             ) | ||||
|         elif callable(usecols): | ||||
|             raise ValueError( | ||||
|                 "The pyarrow engine does not allow 'usecols' to be a callable." | ||||
|             ) | ||||
|  | ||||
|     def read(self) -> DataFrame: | ||||
|         """ | ||||
|         Reads the contents of a CSV file into a DataFrame and | ||||
|         processes it according to the kwargs passed in the | ||||
|         constructor. | ||||
|  | ||||
|         Returns | ||||
|         ------- | ||||
|         DataFrame | ||||
|             The DataFrame created from the CSV file. | ||||
|         """ | ||||
|         pa = import_optional_dependency("pyarrow") | ||||
|         pyarrow_csv = import_optional_dependency("pyarrow.csv") | ||||
|         self._get_pyarrow_options() | ||||
|  | ||||
|         try: | ||||
|             convert_options = pyarrow_csv.ConvertOptions(**self.convert_options) | ||||
|         except TypeError: | ||||
|             include = self.convert_options.get("include_columns", None) | ||||
|             if include is not None: | ||||
|                 self._validate_usecols(include) | ||||
|  | ||||
|             nulls = self.convert_options.get("null_values", set()) | ||||
|             if not lib.is_list_like(nulls) or not all( | ||||
|                 isinstance(x, str) for x in nulls | ||||
|             ): | ||||
|                 raise TypeError( | ||||
|                     "The 'pyarrow' engine requires all na_values to be strings" | ||||
|                 ) | ||||
|  | ||||
|             raise | ||||
|  | ||||
|         try: | ||||
|             table = pyarrow_csv.read_csv( | ||||
|                 self.src, | ||||
|                 read_options=pyarrow_csv.ReadOptions(**self.read_options), | ||||
|                 parse_options=pyarrow_csv.ParseOptions(**self.parse_options), | ||||
|                 convert_options=convert_options, | ||||
|             ) | ||||
|         except pa.ArrowInvalid as e: | ||||
|             raise ParserError(e) from e | ||||
|  | ||||
|         dtype_backend = self.kwds["dtype_backend"] | ||||
|  | ||||
|         # Convert all pa.null() cols -> float64 (non-nullable); in the | ||||
|         # nullable case they become Int64 via null_to_int64=True below. | ||||
|         if dtype_backend is lib.no_default: | ||||
|             new_schema = table.schema | ||||
|             new_type = pa.float64() | ||||
|             for i, arrow_type in enumerate(table.schema.types): | ||||
|                 if pa.types.is_null(arrow_type): | ||||
|                     new_schema = new_schema.set( | ||||
|                         i, new_schema.field(i).with_type(new_type) | ||||
|                     ) | ||||
|  | ||||
|             table = table.cast(new_schema) | ||||
|  | ||||
|         with warnings.catch_warnings(): | ||||
|             warnings.filterwarnings( | ||||
|                 "ignore", | ||||
|                 "make_block is deprecated", | ||||
|                 DeprecationWarning, | ||||
|             ) | ||||
|             frame = arrow_table_to_pandas( | ||||
|                 table, dtype_backend=dtype_backend, null_to_int64=True | ||||
|             ) | ||||
|  | ||||
|         return self._finalize_pandas_output(frame) | ||||
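The wrapper above is what pd.read_csv(..., engine="pyarrow") drives: pandas keyword names are remapped to pyarrow's ReadOptions/ParseOptions/ConvertOptions, and on_bad_lines becomes an invalid_row_handler. A usage sketch, assuming pyarrow is installed and a pandas build matching this source (the CSV bytes are invented for illustration):

    import io

    import pandas as pd

    data = b"a,b,c\n1,2,3\n4,5\n6,7,8\n"  # second data row is too short

    # on_bad_lines="skip" maps to a pyarrow invalid_row_handler that returns
    # "skip", so the malformed row is dropped instead of raising ParserError.
    df = pd.read_csv(io.BytesIO(data), engine="pyarrow", on_bad_lines="skip")
    print(df["a"].tolist())  # [1, 6]

    # usecols must be string labels with this engine; integer positions or a
    # callable raise ValueError (see _validate_usecols above).
    df = pd.read_csv(
        io.BytesIO(data), engine="pyarrow", usecols=["a", "c"], on_bad_lines="skip"
    )
    print(list(df.columns))  # ['a', 'c']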
lib/python3.11/site-packages/pandas/io/parsers/base_parser.py | 1462 | Normal file | File diff suppressed because it is too large
lib/python3.11/site-packages/pandas/io/parsers/c_parser_wrapper.py
| @@ -0,0 +1,410 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from collections import defaultdict | ||||
| from typing import TYPE_CHECKING | ||||
| import warnings | ||||
|  | ||||
| import numpy as np | ||||
|  | ||||
| from pandas._libs import ( | ||||
|     lib, | ||||
|     parsers, | ||||
| ) | ||||
| from pandas.compat._optional import import_optional_dependency | ||||
| from pandas.errors import DtypeWarning | ||||
| from pandas.util._exceptions import find_stack_level | ||||
|  | ||||
| from pandas.core.dtypes.common import pandas_dtype | ||||
| from pandas.core.dtypes.concat import ( | ||||
|     concat_compat, | ||||
|     union_categoricals, | ||||
| ) | ||||
| from pandas.core.dtypes.dtypes import CategoricalDtype | ||||
|  | ||||
| from pandas.core.indexes.api import ensure_index_from_sequences | ||||
|  | ||||
| from pandas.io.common import ( | ||||
|     dedup_names, | ||||
|     is_potential_multi_index, | ||||
| ) | ||||
| from pandas.io.parsers.base_parser import ( | ||||
|     ParserBase, | ||||
|     ParserError, | ||||
|     is_index_col, | ||||
| ) | ||||
|  | ||||
| if TYPE_CHECKING: | ||||
|     from collections.abc import ( | ||||
|         Hashable, | ||||
|         Mapping, | ||||
|         Sequence, | ||||
|     ) | ||||
|  | ||||
|     from pandas._typing import ( | ||||
|         ArrayLike, | ||||
|         DtypeArg, | ||||
|         DtypeObj, | ||||
|         ReadCsvBuffer, | ||||
|     ) | ||||
|  | ||||
|     from pandas import ( | ||||
|         Index, | ||||
|         MultiIndex, | ||||
|     ) | ||||
|  | ||||
|  | ||||
| class CParserWrapper(ParserBase): | ||||
|     low_memory: bool | ||||
|     _reader: parsers.TextReader | ||||
|  | ||||
|     def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: | ||||
|         super().__init__(kwds) | ||||
|         self.kwds = kwds | ||||
|         kwds = kwds.copy() | ||||
|  | ||||
|         self.low_memory = kwds.pop("low_memory", False) | ||||
|  | ||||
|         # GH#2442 | ||||
|         # error: Cannot determine type of 'index_col' | ||||
|         kwds["allow_leading_cols"] = ( | ||||
|             self.index_col is not False  # type: ignore[has-type] | ||||
|         ) | ||||
|  | ||||
|         # GH20529, validate usecol arg before TextReader | ||||
|         kwds["usecols"] = self.usecols | ||||
|  | ||||
|         # Have to pass int, would break tests using TextReader directly otherwise :( | ||||
|         kwds["on_bad_lines"] = self.on_bad_lines.value | ||||
|  | ||||
|         for key in ( | ||||
|             "storage_options", | ||||
|             "encoding", | ||||
|             "memory_map", | ||||
|             "compression", | ||||
|         ): | ||||
|             kwds.pop(key, None) | ||||
|  | ||||
|         kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None)) | ||||
|         if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default: | ||||
|             kwds["dtype_backend"] = "numpy" | ||||
|         if kwds["dtype_backend"] == "pyarrow": | ||||
|             # Fail here loudly instead of in cython after reading | ||||
|             import_optional_dependency("pyarrow") | ||||
|         self._reader = parsers.TextReader(src, **kwds) | ||||
|  | ||||
|         self.unnamed_cols = self._reader.unnamed_cols | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         passed_names = self.names is None  # type: ignore[has-type] | ||||
|  | ||||
|         if self._reader.header is None: | ||||
|             self.names = None | ||||
|         else: | ||||
|             # error: Cannot determine type of 'names' | ||||
|             # error: Cannot determine type of 'index_names' | ||||
|             ( | ||||
|                 self.names,  # type: ignore[has-type] | ||||
|                 self.index_names, | ||||
|                 self.col_names, | ||||
|                 passed_names, | ||||
|             ) = self._extract_multi_indexer_columns( | ||||
|                 self._reader.header, | ||||
|                 self.index_names,  # type: ignore[has-type] | ||||
|                 passed_names, | ||||
|             ) | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         if self.names is None:  # type: ignore[has-type] | ||||
|             self.names = list(range(self._reader.table_width)) | ||||
|  | ||||
|         # gh-9755 | ||||
|         # | ||||
|         # need to set orig_names here first | ||||
|         # so that proper indexing can be done | ||||
|         # with _set_noconvert_columns | ||||
|         # | ||||
|         # once names has been filtered, we will | ||||
|         # then set orig_names again to names | ||||
|         # error: Cannot determine type of 'names' | ||||
|         self.orig_names = self.names[:]  # type: ignore[has-type] | ||||
|  | ||||
|         if self.usecols: | ||||
|             usecols = self._evaluate_usecols(self.usecols, self.orig_names) | ||||
|  | ||||
|             # GH 14671 | ||||
|             # assert for mypy, orig_names is List or None, None would error in issubset | ||||
|             assert self.orig_names is not None | ||||
|             if self.usecols_dtype == "string" and not set(usecols).issubset( | ||||
|                 self.orig_names | ||||
|             ): | ||||
|                 self._validate_usecols_names(usecols, self.orig_names) | ||||
|  | ||||
|             # error: Cannot determine type of 'names' | ||||
|             if len(self.names) > len(usecols):  # type: ignore[has-type] | ||||
|                 # error: Cannot determine type of 'names' | ||||
|                 self.names = [  # type: ignore[has-type] | ||||
|                     n | ||||
|                     # error: Cannot determine type of 'names' | ||||
|                     for i, n in enumerate(self.names)  # type: ignore[has-type] | ||||
|                     if (i in usecols or n in usecols) | ||||
|                 ] | ||||
|  | ||||
|             # error: Cannot determine type of 'names' | ||||
|             if len(self.names) < len(usecols):  # type: ignore[has-type] | ||||
|                 # error: Cannot determine type of 'names' | ||||
|                 self._validate_usecols_names( | ||||
|                     usecols, | ||||
|                     self.names,  # type: ignore[has-type] | ||||
|                 ) | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         self._validate_parse_dates_presence(self.names)  # type: ignore[has-type] | ||||
|         self._set_noconvert_columns() | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         self.orig_names = self.names  # type: ignore[has-type] | ||||
|  | ||||
|         if not self._has_complex_date_col: | ||||
|             # error: Cannot determine type of 'index_col' | ||||
|             if self._reader.leading_cols == 0 and is_index_col( | ||||
|                 self.index_col  # type: ignore[has-type] | ||||
|             ): | ||||
|                 self._name_processed = True | ||||
|                 ( | ||||
|                     index_names, | ||||
|                     # error: Cannot determine type of 'names' | ||||
|                     self.names,  # type: ignore[has-type] | ||||
|                     self.index_col, | ||||
|                 ) = self._clean_index_names( | ||||
|                     # error: Cannot determine type of 'names' | ||||
|                     self.names,  # type: ignore[has-type] | ||||
|                     # error: Cannot determine type of 'index_col' | ||||
|                     self.index_col,  # type: ignore[has-type] | ||||
|                 ) | ||||
|  | ||||
|                 if self.index_names is None: | ||||
|                     self.index_names = index_names | ||||
|  | ||||
|             if self._reader.header is None and not passed_names: | ||||
|                 assert self.index_names is not None | ||||
|                 self.index_names = [None] * len(self.index_names) | ||||
|  | ||||
|         self._implicit_index = self._reader.leading_cols > 0 | ||||
|  | ||||
|     def close(self) -> None: | ||||
|         # close handles opened by C parser | ||||
|         try: | ||||
|             self._reader.close() | ||||
|         except ValueError: | ||||
|             pass | ||||
|  | ||||
|     def _set_noconvert_columns(self) -> None: | ||||
|         """ | ||||
|         Set the columns that should not undergo dtype conversions. | ||||
|  | ||||
|         Currently, any column that is involved with date parsing will not | ||||
|         undergo such conversions. | ||||
|         """ | ||||
|         assert self.orig_names is not None | ||||
|  | ||||
|         # much faster than using orig_names.index(x) xref GH#44106 | ||||
|         names_dict = {x: i for i, x in enumerate(self.orig_names)} | ||||
|         # error: Cannot determine type of 'names' | ||||
|         col_indices = [names_dict[x] for x in self.names]  # type: ignore[has-type] | ||||
|         # error: Cannot determine type of 'names' | ||||
|         noconvert_columns = self._set_noconvert_dtype_columns( | ||||
|             col_indices, | ||||
|             self.names,  # type: ignore[has-type] | ||||
|         ) | ||||
|         for col in noconvert_columns: | ||||
|             self._reader.set_noconvert(col) | ||||
|  | ||||
|     def read( | ||||
|         self, | ||||
|         nrows: int | None = None, | ||||
|     ) -> tuple[ | ||||
|         Index | MultiIndex | None, | ||||
|         Sequence[Hashable] | MultiIndex, | ||||
|         Mapping[Hashable, ArrayLike], | ||||
|     ]: | ||||
|         index: Index | MultiIndex | None | ||||
|         column_names: Sequence[Hashable] | MultiIndex | ||||
|         try: | ||||
|             if self.low_memory: | ||||
|                 chunks = self._reader.read_low_memory(nrows) | ||||
|                 # destructive to chunks | ||||
|                 data = _concatenate_chunks(chunks) | ||||
|  | ||||
|             else: | ||||
|                 data = self._reader.read(nrows) | ||||
|         except StopIteration: | ||||
|             if self._first_chunk: | ||||
|                 self._first_chunk = False | ||||
|                 names = dedup_names( | ||||
|                     self.orig_names, | ||||
|                     is_potential_multi_index(self.orig_names, self.index_col), | ||||
|                 ) | ||||
|                 index, columns, col_dict = self._get_empty_meta( | ||||
|                     names, | ||||
|                     dtype=self.dtype, | ||||
|                 ) | ||||
|                 columns = self._maybe_make_multi_index_columns(columns, self.col_names) | ||||
|  | ||||
|                 if self.usecols is not None: | ||||
|                     columns = self._filter_usecols(columns) | ||||
|  | ||||
|                 col_dict = {k: v for k, v in col_dict.items() if k in columns} | ||||
|  | ||||
|                 return index, columns, col_dict | ||||
|  | ||||
|             else: | ||||
|                 self.close() | ||||
|                 raise | ||||
|  | ||||
|         # Done with first read, next time raise StopIteration | ||||
|         self._first_chunk = False | ||||
|  | ||||
|         # error: Cannot determine type of 'names' | ||||
|         names = self.names  # type: ignore[has-type] | ||||
|  | ||||
|         if self._reader.leading_cols: | ||||
|             if self._has_complex_date_col: | ||||
|                 raise NotImplementedError("file structure not yet supported") | ||||
|  | ||||
|             # implicit index, no index names | ||||
|             arrays = [] | ||||
|  | ||||
|             if self.index_col and self._reader.leading_cols != len(self.index_col): | ||||
|                 raise ParserError( | ||||
|                     "Could not construct index. Requested to use " | ||||
|                     f"{len(self.index_col)} number of columns, but " | ||||
|                     f"{self._reader.leading_cols} left to parse." | ||||
|                 ) | ||||
|  | ||||
|             for i in range(self._reader.leading_cols): | ||||
|                 if self.index_col is None: | ||||
|                     values = data.pop(i) | ||||
|                 else: | ||||
|                     values = data.pop(self.index_col[i]) | ||||
|  | ||||
|                 values = self._maybe_parse_dates(values, i, try_parse_dates=True) | ||||
|                 arrays.append(values) | ||||
|  | ||||
|             index = ensure_index_from_sequences(arrays) | ||||
|  | ||||
|             if self.usecols is not None: | ||||
|                 names = self._filter_usecols(names) | ||||
|  | ||||
|             names = dedup_names(names, is_potential_multi_index(names, self.index_col)) | ||||
|  | ||||
|             # rename dict keys | ||||
|             data_tups = sorted(data.items()) | ||||
|             data = {k: v for k, (i, v) in zip(names, data_tups)} | ||||
|  | ||||
|             column_names, date_data = self._do_date_conversions(names, data) | ||||
|  | ||||
|             # maybe create a mi on the columns | ||||
|             column_names = self._maybe_make_multi_index_columns( | ||||
|                 column_names, self.col_names | ||||
|             ) | ||||
|  | ||||
|         else: | ||||
|             # rename dict keys | ||||
|             data_tups = sorted(data.items()) | ||||
|  | ||||
|             # ugh, mutation | ||||
|  | ||||
|             # assert for mypy, orig_names is List or None, None would error in list(...) | ||||
|             assert self.orig_names is not None | ||||
|             names = list(self.orig_names) | ||||
|             names = dedup_names(names, is_potential_multi_index(names, self.index_col)) | ||||
|  | ||||
|             if self.usecols is not None: | ||||
|                 names = self._filter_usecols(names) | ||||
|  | ||||
|             # columns as list | ||||
|             alldata = [x[1] for x in data_tups] | ||||
|             if self.usecols is None: | ||||
|                 self._check_data_length(names, alldata) | ||||
|  | ||||
|             data = {k: v for k, (i, v) in zip(names, data_tups)} | ||||
|  | ||||
|             names, date_data = self._do_date_conversions(names, data) | ||||
|             index, column_names = self._make_index(date_data, alldata, names) | ||||
|  | ||||
|         return index, column_names, date_data | ||||
|  | ||||
|     def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]: | ||||
|         # hackish | ||||
|         usecols = self._evaluate_usecols(self.usecols, names) | ||||
|         if usecols is not None and len(names) != len(usecols): | ||||
|             names = [ | ||||
|                 name for i, name in enumerate(names) if i in usecols or name in usecols | ||||
|             ] | ||||
|         return names | ||||
|  | ||||
|     def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): | ||||
|         if try_parse_dates and self._should_parse_dates(index): | ||||
|             values = self._date_conv( | ||||
|                 values, | ||||
|                 col=self.index_names[index] if self.index_names is not None else None, | ||||
|             ) | ||||
|         return values | ||||
|  | ||||
|  | ||||
| def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: | ||||
|     """ | ||||
|     Concatenate chunks of data read with low_memory=True. | ||||
|  | ||||
|     The tricky part is handling Categoricals, where different chunks | ||||
|     may have different inferred categories. | ||||
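|  | ||||
|     Examples | ||||
|     -------- | ||||
|     A minimal sketch (each chunk maps column position to an array): | ||||
|  | ||||
|     >>> import numpy as np | ||||
|     >>> _concatenate_chunks([{0: np.array([1, 2])}, {0: np.array([3, 4])}]) | ||||
|     {0: array([1, 2, 3, 4])} | ||||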
|     """ | ||||
|     names = list(chunks[0].keys()) | ||||
|     warning_columns = [] | ||||
|  | ||||
|     result: dict = {} | ||||
|     for name in names: | ||||
|         arrs = [chunk.pop(name) for chunk in chunks] | ||||
|         # Check each arr for consistent types. | ||||
|         dtypes = {a.dtype for a in arrs} | ||||
|         non_cat_dtypes = {x for x in dtypes if not isinstance(x, CategoricalDtype)} | ||||
|  | ||||
|         dtype = dtypes.pop() | ||||
|         if isinstance(dtype, CategoricalDtype): | ||||
|             result[name] = union_categoricals(arrs, sort_categories=False) | ||||
|         else: | ||||
|             result[name] = concat_compat(arrs) | ||||
|             if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object): | ||||
|                 warning_columns.append(str(name)) | ||||
|  | ||||
|     if warning_columns: | ||||
|         warning_names = ",".join(warning_columns) | ||||
|         warning_message = ( | ||||
|             f"Columns ({warning_names}) have mixed types. " | ||||
|             "Specify dtype option on import or set low_memory=False." | ||||
|         ) | ||||
|         warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level()) | ||||
|     return result | ||||
|  | ||||
|  | ||||
| def ensure_dtype_objs( | ||||
|     dtype: DtypeArg | dict[Hashable, DtypeArg] | None | ||||
| ) -> DtypeObj | dict[Hashable, DtypeObj] | None: | ||||
|     """ | ||||
|     Ensure we have either None, a dtype object, or a dictionary mapping to | ||||
|     dtype objects. | ||||
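|  | ||||
|     Examples | ||||
|     -------- | ||||
|     >>> ensure_dtype_objs({"a": "int64", "b": "float"}) | ||||
|     {'a': dtype('int64'), 'b': dtype('float64')} | ||||
|     >>> ensure_dtype_objs(None) is None | ||||
|     True | ||||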
|     """ | ||||
|     if isinstance(dtype, defaultdict): | ||||
|         # "None" not callable  [misc] | ||||
|         default_dtype = pandas_dtype(dtype.default_factory())  # type: ignore[misc] | ||||
|         dtype_converted: defaultdict = defaultdict(lambda: default_dtype) | ||||
|         for key in dtype.keys(): | ||||
|             dtype_converted[key] = pandas_dtype(dtype[key]) | ||||
|         return dtype_converted | ||||
|     elif isinstance(dtype, dict): | ||||
|         return {k: pandas_dtype(dtype[k]) for k in dtype} | ||||
|     elif dtype is not None: | ||||
|         return pandas_dtype(dtype) | ||||
|     return dtype | ||||
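_concatenate_chunks above leans on union_categoricals so that Categorical columns whose chunks inferred different category sets combine losslessly under low_memory=True. A standalone sketch of that building block, using only public pandas API rather than the parser internals:

    import pandas as pd
    from pandas.api.types import union_categoricals

    # Two chunks of one column can infer different category sets.
    chunk1 = pd.Categorical(["a", "b"])
    chunk2 = pd.Categorical(["b", "c"])

    merged = union_categoricals([chunk1, chunk2], sort_categories=False)
    print(list(merged))             # ['a', 'b', 'b', 'c']
    print(list(merged.categories))  # ['a', 'b', 'c']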
lib/python3.11/site-packages/pandas/io/parsers/python_parser.py | 1387 | Normal file | File diff suppressed because it is too large
lib/python3.11/site-packages/pandas/io/parsers/readers.py | 2383 | Normal file | File diff suppressed because it is too large