@@ -0,0 +1,382 @@
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from io import StringIO | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas._libs import parsers as libparsers | ||||
| from pandas.errors import DtypeWarning | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     concat, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
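
# A note on the pattern exercised throughout this module: passing ``chunksize``
# to ``read_csv`` makes it return a ``TextFileReader`` instead of a DataFrame.
# The reader is used as a context manager and either iterated or advanced with
# ``get_chunk``; the pyarrow engine rejects the option with a ValueError.
# A minimal sketch of the non-pyarrow usage (the file name is illustrative):
#
#     with pd.read_csv("data.csv", chunksize=2) as reader:
#         for chunk in reader:
#             ...  # each chunk is a DataFrame of at most two rows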


@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    expected = DataFrame(
        [
            ["foo", 2, 3, 4, 5],
            ["bar", 7, 8, 9, 10],
            ["baz", 12, 13, 14, 15],
            ["qux", 12, 13, 14, 15],
            ["foo2", 12, 13, 14, 15],
            ["bar2", 12, 13, 14, 15],
        ],
        columns=["index", "A", "B", "C", "D"],
    )
    expected = expected.set_index("index")

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
                list(reader)
        return

    with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
        chunks = list(reader)
    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    msg = r"'chunksize' must be an integer >=1"
    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"

    with pytest.raises(ValueError, match=msg):
        with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
            pass


@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
    # see gh-15755
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
        return

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
        tm.assert_frame_equal(concat(reader), expected)


def test_read_chunksize_and_nrows_changing_size(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
        return

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
        tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
        tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])

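        # nrows=5 caps the reader at five rows; 2 + 3 were consumed above
        # (the request for four yielded only three), so the next request
        # raises StopIteration.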
        with pytest.raises(StopIteration, match=""):
            reader.get_chunk(size=3)


def test_get_chunk_passed_chunksize(all_parsers):
    parser = all_parsers
    data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(StringIO(data), chunksize=2) as reader:
                reader.get_chunk()
        return

    with parser.read_csv(StringIO(data), chunksize=2) as reader:
        result = reader.get_chunk()

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
    # see gh-12185
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
                concat(reader)
        return

    with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
        via_reader = concat(reader)
    tm.assert_frame_equal(via_reader, result)


def test_read_chunksize_jagged_names(all_parsers):
    # see gh-23509
    parser = all_parsers
    data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
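    # Seven one-field rows followed by a single ten-field row.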

    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(
                StringIO(data), names=range(10), chunksize=4
            ) as reader:
                concat(reader)
        return

    with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
        result = concat(reader)
    tm.assert_frame_equal(result, expected)


def test_chunk_begins_with_newline_whitespace(all_parsers):
    # see gh-10022
    parser = all_parsers
    data = "\n hello\nworld\n"

    result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame([" hello", "world"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.slow
def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
    # mainly an issue with the C parser
    heuristic = 2**3
    parser = all_parsers
    integers = [str(i) for i in range(heuristic - 1)]
    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

    # Coercions should work without warnings.
    with monkeypatch.context() as m:
        m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
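        # Shrinking the buffer heuristic makes the C parser consume this tiny
        # input in several internal chunks, exercising the cross-chunk dtype
        # coercion path without needing a huge file.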
        result = parser.read_csv(StringIO(data))

    assert type(result.a[0]) is np.float64
    assert result.a.dtype == float


def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
    warning_type = None
    parser = all_parsers
    size = 10000

    # see gh-3866: if chunks are different types and can't
    # be coerced using numerical types, then issue warning.
    if parser.engine == "c" and parser.low_memory:
        warning_type = DtypeWarning
        # Use larger size to hit warning path
        size = 499999

    integers = [str(i) for i in range(size)]
    data = "a\n" + "\n".join(integers + ["a", "b"] + integers)

    buf = StringIO(data)

    if parser.engine == "pyarrow":
        df = parser.read_csv(
            buf,
        )
    else:
        df = parser.read_csv_check_warnings(
            warning_type,
            r"Columns \(0\) have mixed types. "
            "Specify dtype option on import or set low_memory=False.",
            buf,
        )
    if parser.engine == "c" and parser.low_memory:
        assert df.a.dtype == object
    elif using_infer_string:
        assert df.a.dtype == "str"
    else:
        assert df.a.dtype == object


@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
    # see gh-9535
    parser = all_parsers
    expected = DataFrame(columns=["foo", "bar"])

    nrows = 10
    data = StringIO("foo,bar\n")

    if parser.engine == "pyarrow":
        msg = (
            "The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=msg):
            if iterator:
                with parser.read_csv(data, chunksize=nrows) as reader:
                    next(iter(reader))
            else:
                parser.read_csv(data, nrows=nrows)
        return

    if iterator:
        with parser.read_csv(data, chunksize=nrows) as reader:
            result = next(iter(reader))
    else:
        result = parser.read_csv(data, nrows=nrows)

    tm.assert_frame_equal(result, expected)


def test_read_csv_memory_growth_chunksize(all_parsers):
    # see gh-24805
    #
    # Let's just make sure that we don't crash
    # as we iteratively process all chunks.
    parser = all_parsers

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as f:
            for i in range(1000):
                f.write(str(i) + "\n")

        if parser.engine == "pyarrow":
            msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                with parser.read_csv(path, chunksize=20) as result:
                    for _ in result:
                        pass
            return

        with parser.read_csv(path, chunksize=20) as result:
            for _ in result:
                pass


def test_chunksize_with_usecols_second_block_shorter(all_parsers):
    # GH#21211
    parser = all_parsers
    data = """1,2,3,4
5,6,7,8
9,10,11
"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                names=["a", "b"],
                chunksize=2,
                usecols=[0, 1],
                header=None,
            )
        return

    result_chunks = parser.read_csv(
        StringIO(data),
        names=["a", "b"],
        chunksize=2,
        usecols=[0, 1],
        header=None,
    )

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6]}),
        DataFrame({"a": [9], "b": [10]}, index=[2]),
    ]

    for i, result in enumerate(result_chunks):
        tm.assert_frame_equal(result, expected_frames[i])


def test_chunksize_second_block_shorter(all_parsers):
    # GH#21211
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), chunksize=2)
        return

    result_chunks = parser.read_csv(StringIO(data), chunksize=2)

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
        DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
    ]

    for i, result in enumerate(result_chunks):
        tm.assert_frame_equal(result, expected_frames[i])
@@ -0,0 +1,983 @@
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from datetime import datetime | ||||
| from inspect import signature | ||||
| from io import StringIO | ||||
| import os | ||||
| from pathlib import Path | ||||
| import sys | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas._config import using_string_dtype | ||||
|  | ||||
| from pandas.compat import HAS_PYARROW | ||||
| from pandas.errors import ( | ||||
|     EmptyDataError, | ||||
|     ParserError, | ||||
|     ParserWarning, | ||||
| ) | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Timestamp, | ||||
|     compat, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| from pandas.io.parsers import TextFileReader | ||||
| from pandas.io.parsers.c_parser_wrapper import CParserWrapper | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
| xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") | ||||
| skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") | ||||
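
# ``xfail_pyarrow`` / ``skip_pyarrow`` apply the ``pyarrow_xfail`` and
# ``pyarrow_skip`` fixtures (presumably defined in this test package's
# conftest), which xfail or skip a test when ``all_parsers`` supplies the
# pyarrow engine.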


def test_override_set_noconvert_columns():
    # see gh-17351
    #
    # Usecols needs to be sorted in _set_noconvert_columns based
    # on the test_usecols_with_parse_dates test from test_usecols.py
    class MyTextFileReader(TextFileReader):
        def __init__(self) -> None:
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered
                # but in practice, a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before GH 17351 would cause the wrong columns to be
                # converted via the parse_dates parameter
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""

    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    parser = MyTextFileReader()
    parser.options = {
        "usecols": [0, 2, 3],
        "parse_dates": parse_dates,
        "delimiter": ",",
    }
    parser.engine = "c"
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)


def test_read_csv_local(all_parsers, csv1):
    prefix = "file:///" if compat.is_platform_windows() else "file://"
    parser = all_parsers

    fname = prefix + str(os.path.abspath(csv1))
    result = parser.read_csv(fname, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if parser.engine == "pyarrow":
        result.index = result.index.as_unit("ns")
    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738],
            [1.047916, -0.041232, -0.16181208307, 0.212549],
            [0.498581, 0.731168, -0.537677223318, 1.346270],
            [1.120202, 1.567621, 0.00364077397681, 0.675253],
            [-0.487094, 0.571455, -1.6116394093, 0.103469],
            [0.836649, 0.246462, 0.588542635376, 1.062782],
            [-0.157161, 1.340307, 1.1957779562, -1.097007],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
                datetime(2000, 1, 10),
                datetime(2000, 1, 11),
            ],
            name="index",
        ),
    )
    tm.assert_frame_equal(result, expected)


def test_1000_sep(all_parsers):
    parser = all_parsers
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})

    if parser.engine == "pyarrow":
        msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep="|", thousands=",")
        return

    result = parser.read_csv(StringIO(data), sep="|", thousands=",")
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_unnamed_columns(all_parsers):
    data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
        dtype=np.int64,
        columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"],
    )
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_csv_mixed_type(all_parsers):
    data = """A,B,C
a,1,2
b,3,4
c,4,5
"""
    parser = all_parsers
    expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]})
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_read_csv_low_memory_no_rows_with_index(all_parsers):
    # see gh-21141
    parser = all_parsers

    if not parser.low_memory:
        pytest.skip("This is a low-memory specific test")

    data = """A,B,C
1,1,1,2
2,2,3,4
3,3,4,5
"""

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
        return

    result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
    expected = DataFrame(columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


def test_read_csv_dataframe(all_parsers, csv1):
    parser = all_parsers
    result = parser.read_csv(csv1, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if parser.engine == "pyarrow":
        result.index = result.index.as_unit("ns")
    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738],
            [1.047916, -0.041232, -0.16181208307, 0.212549],
            [0.498581, 0.731168, -0.537677223318, 1.346270],
            [1.120202, 1.567621, 0.00364077397681, 0.675253],
            [-0.487094, 0.571455, -1.6116394093, 0.103469],
            [0.836649, 0.246462, 0.588542635376, 1.062782],
            [-0.157161, 1.340307, 1.1957779562, -1.097007],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
                datetime(2000, 1, 10),
                datetime(2000, 1, 11),
            ],
            name="index",
        ),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
    # see gh-10476
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    expected = DataFrame(
        [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]],
        columns=["index", "A", "B", "C", "D"],
    )
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), nrows=nrows)
        return

    result = parser.read_csv(StringIO(data), nrows=nrows)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    msg = r"'nrows' must be an integer >=0"
    parser = all_parsers
    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), nrows=nrows)


def test_nrows_skipfooter_errors(all_parsers):
    msg = "'skipfooter' not supported with 'nrows'"
    data = "a\n1\n2\n3\n4\n5\n6"
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=1, nrows=5)


@skip_pyarrow
def test_missing_trailing_delimiters(all_parsers):
    parser = all_parsers
    data = """A,B,C,D
1,2,3,4
1,3,3,
1,4,5"""

    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]],
        columns=["A", "B", "C", "D"],
    )
    tm.assert_frame_equal(result, expected)


def test_skip_initial_space(all_parsers):
    data = (
        '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
        "1.00361,  1.12551, 330.65659, 0355626618.16711,  73.48821, "
        "314.11625,  1917.09447,   179.71425,  80.000, 240.000, -350,  "
        "70.06056, 344.98370, 1,   1, -0.689265, -0.692787,  "
        "0.212036,    14.7674,   41.605,   -9999.0,   -9999.0,   "
        "-9999.0,   -9999.0,   -9999.0,  -9999.0, 000, 012, 128"
    )
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                names=list(range(33)),
                header=None,
                na_values=["-9999.0"],
                skipinitialspace=True,
            )
        return

    result = parser.read_csv(
        StringIO(data),
        names=list(range(33)),
        header=None,
        na_values=["-9999.0"],
        skipinitialspace=True,
    )
    expected = DataFrame(
        [
            [
                "09-Apr-2012",
                "01:10:18.300",
                2456026.548822908,
                12849,
                1.00361,
                1.12551,
                330.65659,
                355626618.16711,
                73.48821,
                314.11625,
                1917.09447,
                179.71425,
                80.0,
                240.0,
                -350,
                70.06056,
                344.9837,
                1,
                1,
                -0.689265,
                -0.692787,
                0.212036,
                14.7674,
                41.605,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                0,
                12,
                128,
            ]
        ]
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_trailing_delimiters(all_parsers):
    # see gh-2442
    data = """A,B,C
1,2,3,
4,5,6,
7,8,9,"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=False)

    expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
    tm.assert_frame_equal(result, expected)


def test_escapechar(all_parsers):
    # https://stackoverflow.com/questions/13824840/feature-request-for-
    # pandas-read-csv
    data = '''SEARCH_TERM,ACTUAL_URL
"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''

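    # The \\" sequences above are backslash-escaped quotes inside a quoted
    # field; escapechar="\\" lets the parser keep them as literal quotes.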
    parser = all_parsers
    result = parser.read_csv(
        StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8"
    )

    assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'

    tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))


def test_ignore_leading_whitespace(all_parsers):
    # see gh-3374, gh-6607
    parser = all_parsers
    data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"

    if parser.engine == "pyarrow":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=r"\s+")
        return
    result = parser.read_csv(StringIO(data), sep=r"\s+")

    expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
def test_uneven_lines_with_usecols(all_parsers, usecols):
    # see gh-12203
    parser = all_parsers
    data = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10"""

    if usecols is None:
        # Make sure that an error is still raised
        # when the "usecols" parameter is not provided.
        msg = r"Expected \d+ fields in line \d+, saw \d+"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data))
    else:
        expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})

        result = parser.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # First, check to see that the response of parser when faced with no
        # provided columns raises the correct error, with or without usecols.
        ("", {}, None),
        ("", {"usecols": ["X"]}, None),
        (
            ",,",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"], index=[0], dtype=np.float64),
        ),
        (
            "",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"]),
        ),
    ],
)
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
    # see gh-12493
    parser = all_parsers

    if expected is None:
        msg = "No columns to parse from file"
        with pytest.raises(EmptyDataError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,expected",
    [
        # gh-8661, gh-8679: this should ignore six lines, including
        # lines with trailing whitespace and blank lines.
        (
            {
                "header": None,
                "delim_whitespace": True,
                "skiprows": [0, 1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
        ),
        # gh-8983: test skipping set of rows after a row with trailing spaces.
        (
            {
                "delim_whitespace": True,
                "skiprows": [1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
        ),
    ],
)
def test_trailing_spaces(all_parsers, kwargs, expected):
    data = "A B C  \nrandom line with trailing spaces    \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n   \n5.1,NaN,10.0\n"  # noqa: E501
    parser = all_parsers

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
    tm.assert_frame_equal(result, expected)


def test_raise_on_sep_with_delim_whitespace(all_parsers):
    # see gh-6607
    data = "a b c\n1 2 3"
    parser = all_parsers

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with pytest.raises(ValueError, match="you can only specify one"):
        with tm.assert_produces_warning(
            FutureWarning, match=depr_msg, check_stacklevel=False
        ):
            parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)


def test_read_filepath_or_buffer(all_parsers):
    # see gh-43366
    parser = all_parsers

    with pytest.raises(TypeError, match="Expected file path name or file-like"):
        parser.read_csv(filepath_or_buffer=b"input")


@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
    # see gh-9710
    parser = all_parsers
    data = """\
MyColumn
a
b
a
b\n"""

    expected = DataFrame({"MyColumn": list("abab")})
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(
                    StringIO(data),
                    skipinitialspace=True,
                    delim_whitespace=delim_whitespace,
                )
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
        )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "sep,skip_blank_lines,exp_data",
    [
        (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (
            ",",
            False,
            [
                [1.0, 2.0, 4.0],
                [np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan],
                [5.0, np.nan, 10.0],
                [np.nan, np.nan, np.nan],
                [-70.0, 0.4, 1.0],
            ],
        ),
    ],
)
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request):
    parser = all_parsers
    data = """\
A,B,C
1,2.,4.


5.,NaN,10.0

-70,.4,1
"""

    if sep == r"\s+":
        data = data.replace(",", "  ")

        if parser.engine == "pyarrow":
            msg = "the 'pyarrow' engine does not support regex separators"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(
                    StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines
                )
            return

    result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
    expected = DataFrame(exp_data, columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_whitespace_lines(all_parsers):
    parser = all_parsers
    data = """

\t  \t\t
\t
A,B,C
\t    1,2.,4.
5.,NaN,10.0
"""
    expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"])
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,expected",
    [
        (
            """   A   B   C   D
a   1   2   3   4
b   1   2   3   4
c   1   2   3   4
""",
            DataFrame(
                [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
                columns=["A", "B", "C", "D"],
                index=["a", "b", "c"],
            ),
        ),
        (
            "    a b c\n1 2 3 \n4 5  6\n 7 8 9",
            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
        ),
    ],
)
def test_whitespace_regex_separator(all_parsers, data, expected):
    # see gh-6607
    parser = all_parsers
    if parser.engine == "pyarrow":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=r"\s+")
        return

    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


def test_sub_character(all_parsers, csv_dir_path):
    # see gh-16893
    filename = os.path.join(csv_dir_path, "sub_char.csv")
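    # The second column header of sub_char.csv embeds "\x1a", the SUB
    # control character.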
    expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])

    parser = all_parsers
    result = parser.read_csv(filename)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
def test_filename_with_special_chars(all_parsers, filename):
    # see gh-15086.
    parser = all_parsers
    df = DataFrame({"a": [1, 2, 3]})

    with tm.ensure_clean(filename) as path:
        df.to_csv(path, index=False)

        result = parser.read_csv(path)
        tm.assert_frame_equal(result, df)


def test_read_table_same_signature_as_read_csv(all_parsers):
    # GH-34976
    parser = all_parsers

    table_sign = signature(parser.read_table)
    csv_sign = signature(parser.read_csv)

    assert table_sign.parameters.keys() == csv_sign.parameters.keys()
    assert table_sign.return_annotation == csv_sign.return_annotation

    for key, csv_param in csv_sign.parameters.items():
        table_param = table_sign.parameters[key]
        if key == "sep":
            assert csv_param.default == ","
            assert table_param.default == "\t"
            assert table_param.annotation == csv_param.annotation
            assert table_param.kind == csv_param.kind
            continue

        assert table_param == csv_param


def test_read_table_equivalency_to_read_csv(all_parsers):
    # see gh-21948
    # As of 0.25.0, read_table is undeprecated
    parser = all_parsers
    data = "a\tb\n1\t2\n3\t4"
    expected = parser.read_csv(StringIO(data), sep="\t")
    result = parser.read_table(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
    # GH#41069
    parser = all_parsers
    data = "a b\n0 1"

    sys.setprofile(lambda *a, **k: None)
    result = getattr(parser, read_func)(StringIO(data))
    sys.setprofile(None)

    expected = DataFrame({"a b": ["0 1"]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_first_row_bom(all_parsers):
    # see gh-26545
    parser = all_parsers
    data = '''\ufeff"Head1"\t"Head2"\t"Head3"'''

    result = parser.read_csv(StringIO(data), delimiter="\t")
    expected = DataFrame(columns=["Head1", "Head2", "Head3"])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_first_row_bom_unquoted(all_parsers):
    # see gh-36343
    parser = all_parsers
    data = """\ufeffHead1\tHead2\tHead3"""

    result = parser.read_csv(StringIO(data), delimiter="\t")
    expected = DataFrame(columns=["Head1", "Head2", "Head3"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", range(1, 6))
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
    # GH 28071
    ref = DataFrame(
        [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
        columns=list("ab"),
    )
    csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False
            )
        return

    df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
    tm.assert_frame_equal(df, ref[:nrows])


@skip_pyarrow
def test_no_header_two_extra_columns(all_parsers):
    # GH 26218
    column_names = ["one", "two", "three"]
    ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
    stream = StringIO("foo,bar,baz,bam,blah")
    parser = all_parsers
    df = parser.read_csv_check_warnings(
        ParserWarning,
        "Length of header or names does not match length of data. "
        "This leads to a loss of data with index_col=False.",
        stream,
        header=None,
        names=column_names,
        index_col=False,
    )
    tm.assert_frame_equal(df, ref)


def test_read_csv_names_not_accepting_sets(all_parsers):
    # GH 34946
    data = """\
    1,2,3
    4,5,6\n"""
    parser = all_parsers
    with pytest.raises(ValueError, match="Names should be an ordered collection."):
        parser.read_csv(StringIO(data), names=set("QAZ"))


def test_read_table_delim_whitespace_default_sep(all_parsers):
    # GH: 35958
    f = StringIO("a  b  c\n1 -2 -3\n4  5   6")
    parser = all_parsers

    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_table(f, delim_whitespace=True)
        return
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_table(f, delim_whitespace=True)
    expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
    # GH: 35958
    f = StringIO("a  b  c\n1 -2 -3\n4  5   6")
    parser = all_parsers
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(f, delim_whitespace=True, sep=delimiter)

        with pytest.raises(ValueError, match=msg):
            parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)


def test_read_csv_delimiter_and_sep_no_default(all_parsers):
    # GH#39823
    f = StringIO("a,b\n1,2")
    parser = all_parsers
    msg = "Specified a sep and a delimiter; you can only specify one."
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(f, sep=" ", delimiter=".")


@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
def test_read_csv_line_break_as_separator(kwargs, all_parsers):
    # GH#43528
    parser = all_parsers
    data = """a,b,c
1,2,3
    """
    msg = (
        r"Specified \\n as separator or delimiter. This forces the python engine "
        r"which does not accept a line terminator. Hence it is not allowed to use "
        r"the line terminator as separator."
    )
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)


@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
    # GH: 35958
    f = StringIO("a  b  c\n1 -2 -3\n4  5   6")
    parser = all_parsers
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        with pytest.raises(ValueError, match=msg):
            parser.read_table(f, delim_whitespace=True, sep=delimiter)

        with pytest.raises(ValueError, match=msg):
            parser.read_table(f, delim_whitespace=True, delimiter=delimiter)


@skip_pyarrow
def test_dict_keys_as_names(all_parsers):
    # GH: 36928
    data = "1,2"

    keys = {"a": int, "b": int}.keys()
    parser = all_parsers

    result = parser.read_csv(StringIO(data), names=keys)
    expected = DataFrame({"a": [1], "b": [2]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@xfail_pyarrow  # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
def test_encoding_surrogatepass(all_parsers):
    # GH39017
    parser = all_parsers
    content = b"\xed\xbd\xbf"
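    # b"\xed\xbd\xbf" is the UTF-8 encoding of U+DF7F, a lone surrogate:
    # invalid under strict UTF-8 but decodable with errors="surrogatepass".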
    decoded = content.decode("utf-8", errors="surrogatepass")
    expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
    expected.index.name = decoded * 2

    with tm.ensure_clean() as path:
        Path(path).write_bytes(
            content * 2 + b"," + content + b"\n" + content * 2 + b"," + content
        )
        df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
        tm.assert_frame_equal(df, expected)
        with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
            parser.read_csv(path)


def test_malformed_second_line(all_parsers):
    # see GH14782
    parser = all_parsers
    data = "\na\nb\n"
    result = parser.read_csv(StringIO(data), skip_blank_lines=False, header=1)
    expected = DataFrame({"a": ["b"]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_short_single_line(all_parsers):
    # GH 47566
    parser = all_parsers
    columns = ["a", "b", "c"]
    data = "1,2"
    result = parser.read_csv(StringIO(data), header=None, names=columns)
    expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]})
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Length mismatch: Expected axis has 2 elements
def test_short_multi_line(all_parsers):
    # GH 47566
    parser = all_parsers
    columns = ["a", "b", "c"]
    data = "1,2\n1,2"
    result = parser.read_csv(StringIO(data), header=None, names=columns)
    expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]})
    tm.assert_frame_equal(result, expected)


def test_read_seek(all_parsers):
    # GH48646
    parser = all_parsers
    prefix = "### DATA\n"
    content = "nkey,value\ntables,rectangular\n"
    with tm.ensure_clean() as path:
        Path(path).write_text(prefix + content, encoding="utf-8")
        with open(path, encoding="utf-8") as file:
            file.readline()
            actual = parser.read_csv(file)
        expected = parser.read_csv(StringIO(content))
    tm.assert_frame_equal(actual, expected)
@@ -0,0 +1,91 @@
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| import csv | ||||
| from io import StringIO | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| from pandas import DataFrame | ||||
| import pandas._testing as tm | ||||
|  | ||||
| from pandas.io.parsers import TextParser | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
| xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") | ||||
|  | ||||
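# ``TextParser`` accepts rows that have already been split into lists of
# strings and otherwise mirrors ``read_csv``; the tests below feed it
# pre-tokenized data and compare against a plain ``read_csv`` of the same
# content.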

@xfail_pyarrow
def test_read_data_list(all_parsers):
    parser = all_parsers
    kwargs = {"index_col": 0}
    data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"

    data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
    expected = parser.read_csv(StringIO(data), **kwargs)

    with TextParser(data_list, chunksize=2, **kwargs) as parser:
        result = parser.read()

    tm.assert_frame_equal(result, expected)


def test_reader_list(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    lines = list(csv.reader(StringIO(data)))
    with TextParser(lines, chunksize=2, **kwargs) as reader:
        chunks = list(reader)

    expected = parser.read_csv(StringIO(data), **kwargs)

    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


def test_reader_list_skiprows(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    lines = list(csv.reader(StringIO(data)))
    with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
        chunks = list(reader)

    expected = parser.read_csv(StringIO(data), **kwargs)

|     tm.assert_frame_equal(chunks[0], expected[1:3]) | ||||
|  | ||||
|  | ||||
| def test_read_csv_parse_simple_list(all_parsers): | ||||
|     parser = all_parsers | ||||
|     data = """foo | ||||
| bar baz | ||||
| qux foo | ||||
| foo | ||||
| bar""" | ||||
|  | ||||
|     result = parser.read_csv(StringIO(data), header=None) | ||||
|     expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,72 @@ | ||||
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from io import StringIO | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| from pandas import DataFrame | ||||
| import pandas._testing as tm | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "data,thousands,decimal", | ||||
|     [ | ||||
|         ( | ||||
|             """A|B|C | ||||
| 1|2,334.01|5 | ||||
| 10|13|10. | ||||
| """, | ||||
|             ",", | ||||
|             ".", | ||||
|         ), | ||||
|         ( | ||||
|             """A|B|C | ||||
| 1|2.334,01|5 | ||||
| 10|13|10, | ||||
| """, | ||||
|             ".", | ||||
|             ",", | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): | ||||
|     parser = all_parsers | ||||
|     expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'thousands' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             parser.read_csv( | ||||
|                 StringIO(data), sep="|", thousands=thousands, decimal=decimal | ||||
|             ) | ||||
|         return | ||||
|  | ||||
|     result = parser.read_csv( | ||||
|         StringIO(data), sep="|", thousands=thousands, decimal=decimal | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
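|  | ||||
|  | ||||
| # Illustrative sketch (editor's addition): the same number parses under | ||||
| # either convention as long as `thousands` and `decimal` are a consistent, | ||||
| # disjoint pair of single characters. | ||||
| def _demo_thousands_decimal():  # hypothetical helper name | ||||
|     import pandas as pd | ||||
|     from io import StringIO | ||||
|  | ||||
|     us = pd.read_csv(StringIO("x\n1,234.5"), sep="|", thousands=",", decimal=".") | ||||
|     eu = pd.read_csv(StringIO("x\n1.234,5"), sep="|", thousands=".", decimal=",") | ||||
|     assert us.loc[0, "x"] == eu.loc[0, "x"] == 1234.5 | ||||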
|  | ||||
|  | ||||
| def test_euro_decimal_format(all_parsers): | ||||
|     parser = all_parsers | ||||
|     data = """Id;Number1;Number2;Text1;Text2;Number3 | ||||
| 1;1521,1541;187101,9543;ABC;poi;4,738797819 | ||||
| 2;121,12;14897,76;DEF;uyt;0,377320872 | ||||
| 3;878,158;108013,434;GHI;rez;2,735694704""" | ||||
|  | ||||
|     result = parser.read_csv(StringIO(data), sep=";", decimal=",") | ||||
|     expected = DataFrame( | ||||
|         [ | ||||
|             [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], | ||||
|             [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], | ||||
|             [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], | ||||
|         ], | ||||
|         columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,478 @@ | ||||
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from io import ( | ||||
|     BytesIO, | ||||
|     StringIO, | ||||
| ) | ||||
| import os | ||||
| import platform | ||||
| from urllib.error import URLError | ||||
| import uuid | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.errors import ( | ||||
|     EmptyDataError, | ||||
|     ParserError, | ||||
| ) | ||||
| import pandas.util._test_decorators as td | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
| xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") | ||||
| skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") | ||||
|  | ||||
|  | ||||
| @pytest.mark.network | ||||
| @pytest.mark.single_cpu | ||||
| def test_url(all_parsers, csv_dir_path, httpserver): | ||||
|     parser = all_parsers | ||||
|     kwargs = {"sep": "\t"} | ||||
|  | ||||
|     local_path = os.path.join(csv_dir_path, "salaries.csv") | ||||
|     with open(local_path, encoding="utf-8") as f: | ||||
|         httpserver.serve_content(content=f.read()) | ||||
|  | ||||
|     url_result = parser.read_csv(httpserver.url, **kwargs) | ||||
|  | ||||
|     local_result = parser.read_csv(local_path, **kwargs) | ||||
|     tm.assert_frame_equal(url_result, local_result) | ||||
|  | ||||
|  | ||||
| @pytest.mark.slow | ||||
| def test_local_file(all_parsers, csv_dir_path): | ||||
|     parser = all_parsers | ||||
|     kwargs = {"sep": "\t"} | ||||
|  | ||||
|     local_path = os.path.join(csv_dir_path, "salaries.csv") | ||||
|     local_result = parser.read_csv(local_path, **kwargs) | ||||
|     url = "file://localhost/" + local_path | ||||
|  | ||||
|     try: | ||||
|         url_result = parser.read_csv(url, **kwargs) | ||||
|         tm.assert_frame_equal(url_result, local_result) | ||||
|     except URLError: | ||||
|         # Fails on some systems. | ||||
|         pytest.skip("Failing on: " + " ".join(platform.uname())) | ||||
|  | ||||
|  | ||||
| @xfail_pyarrow  # AssertionError: DataFrame.index are different | ||||
| def test_path_path_lib(all_parsers): | ||||
|     parser = all_parsers | ||||
|     df = DataFrame( | ||||
|         1.1 * np.arange(120).reshape((30, 4)), | ||||
|         columns=Index(list("ABCD")), | ||||
|         index=Index([f"i-{i}" for i in range(30)]), | ||||
|     ) | ||||
|     result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) | ||||
|     tm.assert_frame_equal(df, result) | ||||
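|  | ||||
|  | ||||
| # Illustrative sketch (editor's addition): read_csv accepts os.PathLike | ||||
| # objects directly, which is what the pathlib round trip above relies on. | ||||
| # Call with any writable directory, e.g. pytest's tmp_path fixture. | ||||
| def _demo_pathlib_roundtrip(tmp_path):  # hypothetical helper name | ||||
|     import pandas as pd | ||||
|  | ||||
|     df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"]) | ||||
|     target = tmp_path / "roundtrip.csv" | ||||
|     df.to_csv(target) | ||||
|     result = pd.read_csv(target, index_col=0) | ||||
|     pd.testing.assert_frame_equal(result, df) | ||||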
|  | ||||
|  | ||||
| @xfail_pyarrow  # AssertionError: DataFrame.index are different | ||||
| def test_path_local_path(all_parsers): | ||||
|     parser = all_parsers | ||||
|     df = DataFrame( | ||||
|         1.1 * np.arange(120).reshape((30, 4)), | ||||
|         columns=Index(list("ABCD")), | ||||
|         index=Index([f"i-{i}" for i in range(30)]), | ||||
|     ) | ||||
|     result = tm.round_trip_localpath( | ||||
|         df.to_csv, lambda p: parser.read_csv(p, index_col=0) | ||||
|     ) | ||||
|     tm.assert_frame_equal(df, result) | ||||
|  | ||||
|  | ||||
| def test_nonexistent_path(all_parsers): | ||||
|     # gh-2428: must not segfault | ||||
|     # gh-14086: raise more helpful FileNotFoundError | ||||
|     # GH#29233 "File foo" instead of "File b'foo'" | ||||
|     parser = all_parsers | ||||
|     path = f"{uuid.uuid4()}.csv" | ||||
|  | ||||
|     msg = r"\[Errno 2\]" | ||||
|     with pytest.raises(FileNotFoundError, match=msg) as e: | ||||
|         parser.read_csv(path) | ||||
|     assert path == e.value.filename | ||||
|  | ||||
|  | ||||
| @td.skip_if_windows  # os.chmod does not work on Windows | ||||
| def test_no_permission(all_parsers): | ||||
|     # GH 23784 | ||||
|     parser = all_parsers | ||||
|  | ||||
|     msg = r"\[Errno 13\]" | ||||
|     with tm.ensure_clean() as path: | ||||
|         os.chmod(path, 0)  # make file unreadable | ||||
|  | ||||
|         # verify that this process cannot open the file (not running as sudo) | ||||
|         try: | ||||
|             with open(path, encoding="utf-8"): | ||||
|                 pass | ||||
|             pytest.skip("Running as sudo.") | ||||
|         except PermissionError: | ||||
|             pass | ||||
|  | ||||
|         with pytest.raises(PermissionError, match=msg) as e: | ||||
|             parser.read_csv(path) | ||||
|         assert path == e.value.filename | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "data,kwargs,expected,msg", | ||||
|     [ | ||||
|         # gh-10728: WHITESPACE_LINE | ||||
|         ( | ||||
|             "a,b,c\n4,5,6\n ", | ||||
|             {}, | ||||
|             DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), | ||||
|             None, | ||||
|         ), | ||||
|         # gh-10548: EAT_LINE_COMMENT | ||||
|         ( | ||||
|             "a,b,c\n4,5,6\n#comment", | ||||
|             {"comment": "#"}, | ||||
|             DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), | ||||
|             None, | ||||
|         ), | ||||
|         # EAT_CRNL_NOP | ||||
|         ( | ||||
|             "a,b,c\n4,5,6\n\r", | ||||
|             {}, | ||||
|             DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), | ||||
|             None, | ||||
|         ), | ||||
|         # EAT_COMMENT | ||||
|         ( | ||||
|             "a,b,c\n4,5,6#comment", | ||||
|             {"comment": "#"}, | ||||
|             DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), | ||||
|             None, | ||||
|         ), | ||||
|         # SKIP_LINE | ||||
|         ( | ||||
|             "a,b,c\n4,5,6\nskipme", | ||||
|             {"skiprows": [2]}, | ||||
|             DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), | ||||
|             None, | ||||
|         ), | ||||
|         # EAT_LINE_COMMENT | ||||
|         ( | ||||
|             "a,b,c\n4,5,6\n#comment", | ||||
|             {"comment": "#", "skip_blank_lines": False}, | ||||
|             DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), | ||||
|             None, | ||||
|         ), | ||||
|         # IN_FIELD | ||||
|         ( | ||||
|             "a,b,c\n4,5,6\n ", | ||||
|             {"skip_blank_lines": False}, | ||||
|             DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), | ||||
|             None, | ||||
|         ), | ||||
|         # EAT_CRNL | ||||
|         ( | ||||
|             "a,b,c\n4,5,6\n\r", | ||||
|             {"skip_blank_lines": False}, | ||||
|             DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), | ||||
|             None, | ||||
|         ), | ||||
|         # ESCAPED_CHAR | ||||
|         ( | ||||
|             "a,b,c\n4,5,6\n\\", | ||||
|             {"escapechar": "\\"}, | ||||
|             None, | ||||
|             "(EOF following escape character)|(unexpected end of data)", | ||||
|         ), | ||||
|         # ESCAPE_IN_QUOTED_FIELD | ||||
|         ( | ||||
|             'a,b,c\n4,5,6\n"\\', | ||||
|             {"escapechar": "\\"}, | ||||
|             None, | ||||
|             "(EOF inside string starting at row 2)|(unexpected end of data)", | ||||
|         ), | ||||
|         # IN_QUOTED_FIELD | ||||
|         ( | ||||
|             'a,b,c\n4,5,6\n"', | ||||
|             {"escapechar": "\\"}, | ||||
|             None, | ||||
|             "(EOF inside string starting at row 2)|(unexpected end of data)", | ||||
|         ), | ||||
|     ], | ||||
|     ids=[ | ||||
|         "whitespace-line", | ||||
|         "eat-line-comment", | ||||
|         "eat-crnl-nop", | ||||
|         "eat-comment", | ||||
|         "skip-line", | ||||
|         "eat-line-comment", | ||||
|         "in-field", | ||||
|         "eat-crnl", | ||||
|         "escaped-char", | ||||
|         "escape-in-quoted-field", | ||||
|         "in-quoted-field", | ||||
|     ], | ||||
| ) | ||||
| def test_eof_states(all_parsers, data, kwargs, expected, msg, request): | ||||
|     # see gh-10728, gh-10548 | ||||
|     parser = all_parsers | ||||
|  | ||||
|     if parser.engine == "pyarrow" and "comment" in kwargs: | ||||
|         msg = "The 'comment' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             parser.read_csv(StringIO(data), **kwargs) | ||||
|         return | ||||
|  | ||||
|     if parser.engine == "pyarrow" and "\r" not in data: | ||||
|         # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1: | ||||
|         # ValueError: skiprows argument must be an integer when using engine='pyarrow' | ||||
|         # AssertionError: Regex pattern did not match. | ||||
|         pytest.skip(reason="https://github.com/apache/arrow/issues/38676") | ||||
|  | ||||
|     if expected is None: | ||||
|         with pytest.raises(ParserError, match=msg): | ||||
|             parser.read_csv(StringIO(data), **kwargs) | ||||
|     else: | ||||
|         result = parser.read_csv(StringIO(data), **kwargs) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
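|  | ||||
|  | ||||
| # Illustrative sketch (editor's addition): the tokenizer EOF states above | ||||
| # surface to users as behaviour like this -- a trailing comment line | ||||
| # contributes no row, with or without a terminating newline. | ||||
| def _demo_trailing_comment():  # hypothetical helper name | ||||
|     import pandas as pd | ||||
|     from io import StringIO | ||||
|  | ||||
|     df = pd.read_csv(StringIO("a,b,c\n4,5,6\n#comment"), comment="#") | ||||
|     assert df.shape == (1, 3) | ||||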
|  | ||||
|  | ||||
| def test_temporary_file(all_parsers): | ||||
|     # see gh-13398 | ||||
|     parser = all_parsers | ||||
|     data = "0 0" | ||||
|  | ||||
|     with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: | ||||
|         new_file.write(data) | ||||
|         new_file.flush() | ||||
|         new_file.seek(0) | ||||
|  | ||||
|         if parser.engine == "pyarrow": | ||||
|             msg = "the 'pyarrow' engine does not support regex separators" | ||||
|             with pytest.raises(ValueError, match=msg): | ||||
|                 parser.read_csv(new_file, sep=r"\s+", header=None) | ||||
|             return | ||||
|  | ||||
|         result = parser.read_csv(new_file, sep=r"\s+", header=None) | ||||
|  | ||||
|         expected = DataFrame([[0, 0]]) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_internal_eof_byte(all_parsers): | ||||
|     # see gh-5500 | ||||
|     parser = all_parsers | ||||
|     data = "a,b\n1\x1a,2" | ||||
|  | ||||
|     expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) | ||||
|     result = parser.read_csv(StringIO(data)) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_internal_eof_byte_to_file(all_parsers): | ||||
|     # see gh-16559 | ||||
|     parser = all_parsers | ||||
|     data = b'c1,c2\r\n"test \x1a    test", test\r\n' | ||||
|     expected = DataFrame([["test \x1a    test", " test"]], columns=["c1", "c2"]) | ||||
|     path = f"__{uuid.uuid4()}__.csv" | ||||
|  | ||||
|     with tm.ensure_clean(path) as path: | ||||
|         with open(path, "wb") as f: | ||||
|             f.write(data) | ||||
|  | ||||
|         result = parser.read_csv(path) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_file_handle_string_io(all_parsers): | ||||
|     # gh-14418 | ||||
|     # | ||||
|     # Don't close user-provided file handles. | ||||
|     parser = all_parsers | ||||
|     data = "a,b\n1,2" | ||||
|  | ||||
|     fh = StringIO(data) | ||||
|     parser.read_csv(fh) | ||||
|     assert not fh.closed | ||||
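|  | ||||
|  | ||||
| # Illustrative sketch (editor's addition): because the handle is left open, | ||||
| # the caller can rewind and reuse it after read_csv returns. | ||||
| def _demo_reuse_open_handle():  # hypothetical helper name | ||||
|     import pandas as pd | ||||
|     from io import StringIO | ||||
|  | ||||
|     fh = StringIO("a,b\n1,2") | ||||
|     first = pd.read_csv(fh) | ||||
|     fh.seek(0)  # still open, so it can be rewound and parsed again | ||||
|     second = pd.read_csv(fh) | ||||
|     pd.testing.assert_frame_equal(first, second) | ||||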
|  | ||||
|  | ||||
| def test_file_handles_with_open(all_parsers, csv1): | ||||
|     # gh-14418 | ||||
|     # | ||||
|     # Don't close user-provided file handles. | ||||
|     parser = all_parsers | ||||
|  | ||||
|     for mode in ["r", "rb"]: | ||||
|         with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f: | ||||
|             parser.read_csv(f) | ||||
|             assert not f.closed | ||||
|  | ||||
|  | ||||
| def test_invalid_file_buffer_class(all_parsers): | ||||
|     # see gh-15337 | ||||
|     class InvalidBuffer: | ||||
|         pass | ||||
|  | ||||
|     parser = all_parsers | ||||
|     msg = "Invalid file path or buffer object type" | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         parser.read_csv(InvalidBuffer()) | ||||
|  | ||||
|  | ||||
| def test_invalid_file_buffer_mock(all_parsers): | ||||
|     # see gh-15337 | ||||
|     parser = all_parsers | ||||
|     msg = "Invalid file path or buffer object type" | ||||
|  | ||||
|     class Foo: | ||||
|         pass | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         parser.read_csv(Foo()) | ||||
|  | ||||
|  | ||||
| def test_valid_file_buffer_seems_invalid(all_parsers): | ||||
|     # gh-16135: we want to ensure that "tell" and "seek" | ||||
|     # aren't actually being used when we call `read_csv` | ||||
|     # | ||||
|     # Thus, while the object may look "invalid" (these | ||||
|     # methods are attributes of the `StringIO` class), | ||||
|     # it is still a valid file-object for our purposes. | ||||
|     class NoSeekTellBuffer(StringIO): | ||||
|         def tell(self): | ||||
|             raise AttributeError("No tell method") | ||||
|  | ||||
|         def seek(self, pos, whence=0): | ||||
|             raise AttributeError("No seek method") | ||||
|  | ||||
|     data = "a\n1" | ||||
|     parser = all_parsers | ||||
|     expected = DataFrame({"a": [1]}) | ||||
|  | ||||
|     result = parser.read_csv(NoSeekTellBuffer(data)) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("io_class", [StringIO, BytesIO]) | ||||
| @pytest.mark.parametrize("encoding", [None, "utf-8"]) | ||||
| def test_read_csv_file_handle(all_parsers, io_class, encoding): | ||||
|     """ | ||||
|     Test that read_csv does not close user-provided file handles. | ||||
|  | ||||
|     GH 36980 | ||||
|     """ | ||||
|     parser = all_parsers | ||||
|     expected = DataFrame({"a": [1], "b": [2]}) | ||||
|  | ||||
|     content = "a,b\n1,2" | ||||
|     handle = io_class(content.encode("utf-8") if io_class == BytesIO else content) | ||||
|  | ||||
|     tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) | ||||
|     assert not handle.closed | ||||
|  | ||||
|  | ||||
| def test_memory_map_compression(all_parsers, compression): | ||||
|     """ | ||||
|     Support memory map for compressed files. | ||||
|  | ||||
|     GH 37621 | ||||
|     """ | ||||
|     parser = all_parsers | ||||
|     expected = DataFrame({"a": [1], "b": [2]}) | ||||
|  | ||||
|     with tm.ensure_clean() as path: | ||||
|         expected.to_csv(path, index=False, compression=compression) | ||||
|  | ||||
|         if parser.engine == "pyarrow": | ||||
|             msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" | ||||
|             with pytest.raises(ValueError, match=msg): | ||||
|                 parser.read_csv(path, memory_map=True, compression=compression) | ||||
|             return | ||||
|  | ||||
|         result = parser.read_csv(path, memory_map=True, compression=compression) | ||||
|  | ||||
|     tm.assert_frame_equal( | ||||
|         result, | ||||
|         expected, | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def test_context_manager(all_parsers, datapath): | ||||
|     # make sure that opened files are closed | ||||
|     parser = all_parsers | ||||
|  | ||||
|     path = datapath("io", "data", "csv", "iris.csv") | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             parser.read_csv(path, chunksize=1) | ||||
|         return | ||||
|  | ||||
|     reader = parser.read_csv(path, chunksize=1) | ||||
|     assert not reader.handles.handle.closed | ||||
|     try: | ||||
|         with reader: | ||||
|             next(reader) | ||||
|             assert False | ||||
|     except AssertionError: | ||||
|         assert reader.handles.handle.closed | ||||
|  | ||||
|  | ||||
| def test_context_manager_user_provided(all_parsers, datapath): | ||||
|     # make sure that user-provided handles are not closed | ||||
|     parser = all_parsers | ||||
|  | ||||
|     with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path: | ||||
|         if parser.engine == "pyarrow": | ||||
|             msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" | ||||
|             with pytest.raises(ValueError, match=msg): | ||||
|                 parser.read_csv(path, chunksize=1) | ||||
|             return | ||||
|  | ||||
|         reader = parser.read_csv(path, chunksize=1) | ||||
|         assert not reader.handles.handle.closed | ||||
|         try: | ||||
|             with reader: | ||||
|                 next(reader) | ||||
|                 assert False | ||||
|         except AssertionError: | ||||
|             assert not reader.handles.handle.closed | ||||
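|  | ||||
|  | ||||
| # Illustrative sketch (editor's addition): the TextFileReader returned for | ||||
| # chunksize/iterator reads is itself a context manager, which is the | ||||
| # simplest way to guarantee any handle it opened gets closed. | ||||
| def _demo_reader_context_manager():  # hypothetical helper name | ||||
|     import pandas as pd | ||||
|     from io import StringIO | ||||
|  | ||||
|     with pd.read_csv(StringIO("a\n1\n2\n3"), chunksize=2) as reader: | ||||
|         sizes = [len(chunk) for chunk in reader] | ||||
|     assert sizes == [2, 1] | ||||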
|  | ||||
|  | ||||
| @skip_pyarrow  # ParserError: Empty CSV file | ||||
| def test_file_descriptor_leak(all_parsers, using_copy_on_write): | ||||
|     # GH 31488 | ||||
|     parser = all_parsers | ||||
|     with tm.ensure_clean() as path: | ||||
|         with pytest.raises(EmptyDataError, match="No columns to parse from file"): | ||||
|             parser.read_csv(path) | ||||
|  | ||||
|  | ||||
| def test_memory_map(all_parsers, csv_dir_path): | ||||
|     mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") | ||||
|     parser = all_parsers | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} | ||||
|     ) | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             parser.read_csv(mmap_file, memory_map=True) | ||||
|         return | ||||
|  | ||||
|     result = parser.read_csv(mmap_file, memory_map=True) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,79 @@ | ||||
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from io import StringIO | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.compat import is_platform_linux | ||||
|  | ||||
| from pandas import DataFrame | ||||
| import pandas._testing as tm | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
| xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") | ||||
| skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") | ||||
|  | ||||
|  | ||||
| @skip_pyarrow  # ParserError: CSV parse error: Empty CSV file or block | ||||
| def test_float_parser(all_parsers): | ||||
|     # see gh-9565 | ||||
|     parser = all_parsers | ||||
|     data = "45e-1,4.5,45.,inf,-inf" | ||||
|     result = parser.read_csv(StringIO(data), header=None) | ||||
|  | ||||
|     expected = DataFrame([[float(s) for s in data.split(",")]]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_scientific_no_exponent(all_parsers_all_precisions): | ||||
|     # see gh-12215 | ||||
|     df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) | ||||
|     data = df.to_csv(index=False) | ||||
|     parser, precision = all_parsers_all_precisions | ||||
|  | ||||
|     df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) | ||||
|     tm.assert_frame_equal(df_roundtrip, df) | ||||
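|  | ||||
|  | ||||
| # Illustrative sketch (editor's addition): float_precision="round_trip" | ||||
| # selects the slower converter whose result matches Python's own float() | ||||
| # exactly, which is the property the precision tests below lean on. | ||||
| def _demo_round_trip_precision():  # hypothetical helper name | ||||
|     import pandas as pd | ||||
|     from io import StringIO | ||||
|  | ||||
|     val = "0.1000000000000000055511151231257827" | ||||
|     df = pd.read_csv(StringIO("x\n" + val), float_precision="round_trip") | ||||
|     assert df.loc[0, "x"] == float(val) | ||||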
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "neg_exp", | ||||
|     [ | ||||
|         -617, | ||||
|         -100000, | ||||
|         pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan), | ||||
|     ], | ||||
| ) | ||||
| def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): | ||||
|     # GH#38753 | ||||
|     parser, precision = all_parsers_all_precisions | ||||
|  | ||||
|     data = f"data\n10E{neg_exp}" | ||||
|     result = parser.read_csv(StringIO(data), float_precision=precision) | ||||
|     expected = DataFrame({"data": [0.0]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.skip_ubsan | ||||
| @xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different | ||||
| @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) | ||||
| def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): | ||||
|     # GH#38753 | ||||
|     parser, precision = all_parsers_all_precisions | ||||
|     data = f"data\n10E{exp}" | ||||
|     result = parser.read_csv(StringIO(data), float_precision=precision) | ||||
|     if precision == "round_trip": | ||||
|         if exp == 999999999999999999 and is_platform_linux(): | ||||
|             mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") | ||||
|             request.applymarker(mark) | ||||
|  | ||||
|         value = np.inf if exp > 0 else 0.0 | ||||
|         expected = DataFrame({"data": [value]}) | ||||
|     else: | ||||
|         expected = DataFrame({"data": [f"10E{exp}"]}) | ||||
|  | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,304 @@ | ||||
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from datetime import datetime | ||||
| from io import StringIO | ||||
| import os | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     MultiIndex, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
| xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") | ||||
| skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "data,kwargs,expected", | ||||
|     [ | ||||
|         ( | ||||
|             """foo,2,3,4,5 | ||||
| bar,7,8,9,10 | ||||
| baz,12,13,14,15 | ||||
| qux,12,13,14,15 | ||||
| foo2,12,13,14,15 | ||||
| bar2,12,13,14,15 | ||||
| """, | ||||
|             {"index_col": 0, "names": ["index", "A", "B", "C", "D"]}, | ||||
|             DataFrame( | ||||
|                 [ | ||||
|                     [2, 3, 4, 5], | ||||
|                     [7, 8, 9, 10], | ||||
|                     [12, 13, 14, 15], | ||||
|                     [12, 13, 14, 15], | ||||
|                     [12, 13, 14, 15], | ||||
|                     [12, 13, 14, 15], | ||||
|                 ], | ||||
|                 index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"), | ||||
|                 columns=["A", "B", "C", "D"], | ||||
|             ), | ||||
|         ), | ||||
|         ( | ||||
|             """foo,one,2,3,4,5 | ||||
| foo,two,7,8,9,10 | ||||
| foo,three,12,13,14,15 | ||||
| bar,one,12,13,14,15 | ||||
| bar,two,12,13,14,15 | ||||
| """, | ||||
|             {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]}, | ||||
|             DataFrame( | ||||
|                 [ | ||||
|                     [2, 3, 4, 5], | ||||
|                     [7, 8, 9, 10], | ||||
|                     [12, 13, 14, 15], | ||||
|                     [12, 13, 14, 15], | ||||
|                     [12, 13, 14, 15], | ||||
|                 ], | ||||
|                 index=MultiIndex.from_tuples( | ||||
|                     [ | ||||
|                         ("foo", "one"), | ||||
|                         ("foo", "two"), | ||||
|                         ("foo", "three"), | ||||
|                         ("bar", "one"), | ||||
|                         ("bar", "two"), | ||||
|                     ], | ||||
|                     names=["index1", "index2"], | ||||
|                 ), | ||||
|                 columns=["A", "B", "C", "D"], | ||||
|             ), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_pass_names_with_index(all_parsers, data, kwargs, expected): | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(data), **kwargs) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) | ||||
| def test_multi_index_no_level_names( | ||||
|     request, all_parsers, index_col, using_infer_string | ||||
| ): | ||||
|     data = """index1,index2,A,B,C,D | ||||
| foo,one,2,3,4,5 | ||||
| foo,two,7,8,9,10 | ||||
| foo,three,12,13,14,15 | ||||
| bar,one,12,13,14,15 | ||||
| bar,two,12,13,14,15 | ||||
| """ | ||||
|     headless_data = "\n".join(data.split("\n")[1:]) | ||||
|  | ||||
|     names = ["A", "B", "C", "D"] | ||||
|     parser = all_parsers | ||||
|  | ||||
|     result = parser.read_csv( | ||||
|         StringIO(headless_data), index_col=index_col, header=None, names=names | ||||
|     ) | ||||
|     expected = parser.read_csv(StringIO(data), index_col=index_col) | ||||
|  | ||||
|     # No index names in headless data. | ||||
|     expected.index.names = [None] * 2 | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @skip_pyarrow | ||||
| def test_multi_index_no_level_names_implicit(all_parsers): | ||||
|     parser = all_parsers | ||||
|     data = """A,B,C,D | ||||
| foo,one,2,3,4,5 | ||||
| foo,two,7,8,9,10 | ||||
| foo,three,12,13,14,15 | ||||
| bar,one,12,13,14,15 | ||||
| bar,two,12,13,14,15 | ||||
| """ | ||||
|  | ||||
|     result = parser.read_csv(StringIO(data)) | ||||
|     expected = DataFrame( | ||||
|         [ | ||||
|             [2, 3, 4, 5], | ||||
|             [7, 8, 9, 10], | ||||
|             [12, 13, 14, 15], | ||||
|             [12, 13, 14, 15], | ||||
|             [12, 13, 14, 15], | ||||
|         ], | ||||
|         columns=["A", "B", "C", "D"], | ||||
|         index=MultiIndex.from_tuples( | ||||
|             [ | ||||
|                 ("foo", "one"), | ||||
|                 ("foo", "two"), | ||||
|                 ("foo", "three"), | ||||
|                 ("bar", "one"), | ||||
|                 ("bar", "two"), | ||||
|             ] | ||||
|         ), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @xfail_pyarrow  # TypeError: an integer is required | ||||
| @pytest.mark.parametrize( | ||||
|     "data,expected,header", | ||||
|     [ | ||||
|         ("a,b", DataFrame(columns=["a", "b"]), [0]), | ||||
|         ( | ||||
|             "a,b\nc,d", | ||||
|             DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), | ||||
|             [0, 1], | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize("round_trip", [True, False]) | ||||
| def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): | ||||
|     # see gh-14545 | ||||
|     parser = all_parsers | ||||
|     data = expected.to_csv(index=False) if round_trip else data | ||||
|  | ||||
|     result = parser.read_csv(StringIO(data), header=header) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @xfail_pyarrow  # AssertionError: DataFrame.columns are different | ||||
| def test_no_unnamed_index(all_parsers): | ||||
|     parser = all_parsers | ||||
|     data = """ id c0 c1 c2 | ||||
| 0 1 0 a b | ||||
| 1 2 0 c d | ||||
| 2 2 2 e f | ||||
| """ | ||||
|     result = parser.read_csv(StringIO(data), sep=" ") | ||||
|     expected = DataFrame( | ||||
|         [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]], | ||||
|         columns=["Unnamed: 0", "id", "c0", "c1", "c2"], | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_read_duplicate_index_explicit(all_parsers): | ||||
|     data = """index,A,B,C,D | ||||
| foo,2,3,4,5 | ||||
| bar,7,8,9,10 | ||||
| baz,12,13,14,15 | ||||
| qux,12,13,14,15 | ||||
| foo,12,13,14,15 | ||||
| bar,12,13,14,15 | ||||
| """ | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(data), index_col=0) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         [ | ||||
|             [2, 3, 4, 5], | ||||
|             [7, 8, 9, 10], | ||||
|             [12, 13, 14, 15], | ||||
|             [12, 13, 14, 15], | ||||
|             [12, 13, 14, 15], | ||||
|             [12, 13, 14, 15], | ||||
|         ], | ||||
|         columns=["A", "B", "C", "D"], | ||||
|         index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @skip_pyarrow | ||||
| def test_read_duplicate_index_implicit(all_parsers): | ||||
|     data = """A,B,C,D | ||||
| foo,2,3,4,5 | ||||
| bar,7,8,9,10 | ||||
| baz,12,13,14,15 | ||||
| qux,12,13,14,15 | ||||
| foo,12,13,14,15 | ||||
| bar,12,13,14,15 | ||||
| """ | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(data)) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         [ | ||||
|             [2, 3, 4, 5], | ||||
|             [7, 8, 9, 10], | ||||
|             [12, 13, 14, 15], | ||||
|             [12, 13, 14, 15], | ||||
|             [12, 13, 14, 15], | ||||
|             [12, 13, 14, 15], | ||||
|         ], | ||||
|         columns=["A", "B", "C", "D"], | ||||
|         index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @skip_pyarrow | ||||
| def test_read_csv_no_index_name(all_parsers, csv_dir_path): | ||||
|     parser = all_parsers | ||||
|     csv2 = os.path.join(csv_dir_path, "test2.csv") | ||||
|     result = parser.read_csv(csv2, index_col=0, parse_dates=True) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         [ | ||||
|             [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"], | ||||
|             [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"], | ||||
|             [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"], | ||||
|             [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"], | ||||
|             [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"], | ||||
|         ], | ||||
|         columns=["A", "B", "C", "D", "E"], | ||||
|         index=Index( | ||||
|             [ | ||||
|                 datetime(2000, 1, 3), | ||||
|                 datetime(2000, 1, 4), | ||||
|                 datetime(2000, 1, 5), | ||||
|                 datetime(2000, 1, 6), | ||||
|                 datetime(2000, 1, 7), | ||||
|             ] | ||||
|         ), | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @skip_pyarrow | ||||
| def test_empty_with_index(all_parsers): | ||||
|     # see gh-10184 | ||||
|     data = "x,y" | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(data), index_col=0) | ||||
|  | ||||
|     expected = DataFrame(columns=["y"], index=Index([], name="x")) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| # CSV parse error: Empty CSV file or block: cannot infer number of columns | ||||
| @skip_pyarrow | ||||
| def test_empty_with_multi_index(all_parsers): | ||||
|     # see gh-10467 | ||||
|     data = "x,y,z" | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(data), index_col=["x", "y"]) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| # CSV parse error: Empty CSV file or block: cannot infer number of columns | ||||
| @skip_pyarrow | ||||
| def test_empty_with_reversed_multi_index(all_parsers): | ||||
|     data = "x,y,z" | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(data), index_col=[1, 0]) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,78 @@ | ||||
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from io import StringIO | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     option_context, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
| xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") | ||||
|  | ||||
|  | ||||
| @xfail_pyarrow  # AssertionError: DataFrame.index are different | ||||
| @pytest.mark.parametrize("na_filter", [True, False]) | ||||
| def test_inf_parsing(all_parsers, na_filter): | ||||
|     parser = all_parsers | ||||
|     data = """\ | ||||
| ,A | ||||
| a,inf | ||||
| b,-inf | ||||
| c,+Inf | ||||
| d,-Inf | ||||
| e,INF | ||||
| f,-INF | ||||
| g,+INf | ||||
| h,-INf | ||||
| i,inF | ||||
| j,-inF""" | ||||
|     expected = DataFrame( | ||||
|         {"A": [float("inf"), float("-inf")] * 5}, | ||||
|         index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], | ||||
|     ) | ||||
|     result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @xfail_pyarrow  # AssertionError: DataFrame.index are different | ||||
| @pytest.mark.parametrize("na_filter", [True, False]) | ||||
| def test_infinity_parsing(all_parsers, na_filter): | ||||
|     parser = all_parsers | ||||
|     data = """\ | ||||
| ,A | ||||
| a,Infinity | ||||
| b,-Infinity | ||||
| c,+Infinity | ||||
| """ | ||||
|     expected = DataFrame( | ||||
|         {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, | ||||
|         index=["a", "b", "c"], | ||||
|     ) | ||||
|     result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_read_csv_with_use_inf_as_na(all_parsers): | ||||
|     # https://github.com/pandas-dev/pandas/issues/35493 | ||||
|     parser = all_parsers | ||||
|     data = "1.0\nNaN\n3.0" | ||||
|     msg = "use_inf_as_na option is deprecated" | ||||
|     warn = FutureWarning | ||||
|     if parser.engine == "pyarrow": | ||||
|         warn = (FutureWarning, DeprecationWarning) | ||||
|  | ||||
|     with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): | ||||
|         with option_context("use_inf_as_na", True): | ||||
|             result = parser.read_csv(StringIO(data), header=None) | ||||
|     expected = DataFrame([1.0, np.nan, 3.0]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,231 @@ | ||||
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from io import StringIO | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Series, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
| xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") | ||||
| skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") | ||||
|  | ||||
|  | ||||
| def test_int_conversion(all_parsers): | ||||
|     data = """A,B | ||||
| 1.0,1 | ||||
| 2.0,2 | ||||
| 3.0,3 | ||||
| """ | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(data)) | ||||
|  | ||||
|     expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "data,kwargs,expected", | ||||
|     [ | ||||
|         ( | ||||
|             "A,B\nTrue,1\nFalse,2\nTrue,3", | ||||
|             {}, | ||||
|             DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), | ||||
|         ), | ||||
|         ( | ||||
|             "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", | ||||
|             {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]}, | ||||
|             DataFrame( | ||||
|                 [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], | ||||
|                 columns=["A", "B"], | ||||
|             ), | ||||
|         ), | ||||
|         ( | ||||
|             "A,B\nTRUE,1\nFALSE,2\nTRUE,3", | ||||
|             {}, | ||||
|             DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), | ||||
|         ), | ||||
|         ( | ||||
|             "A,B\nfoo,bar\nbar,foo", | ||||
|             {"true_values": ["foo"], "false_values": ["bar"]}, | ||||
|             DataFrame([[True, False], [False, True]], columns=["A", "B"]), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_parse_bool(all_parsers, data, kwargs, expected): | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(data), **kwargs) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
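|  | ||||
|  | ||||
| # Illustrative sketch (editor's addition): true_values/false_values extend | ||||
| # boolean inference beyond the built-in True/TRUE/False/FALSE tokens; the | ||||
| # column only becomes bool when every value maps into one of the two sets. | ||||
| def _demo_custom_bool_tokens():  # hypothetical helper name | ||||
|     import pandas as pd | ||||
|     from io import StringIO | ||||
|  | ||||
|     data = "flag\nyes\nno\nyes" | ||||
|     df = pd.read_csv(StringIO(data), true_values=["yes"], false_values=["no"]) | ||||
|     assert df["flag"].dtype == bool | ||||
|     assert df["flag"].tolist() == [True, False, True] | ||||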
|  | ||||
|  | ||||
| def test_parse_integers_above_fp_precision(all_parsers): | ||||
|     data = """Numbers | ||||
| 17007000002000191 | ||||
| 17007000002000191 | ||||
| 17007000002000191 | ||||
| 17007000002000191 | ||||
| 17007000002000192 | ||||
| 17007000002000192 | ||||
| 17007000002000192 | ||||
| 17007000002000192 | ||||
| 17007000002000192 | ||||
| 17007000002000194""" | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(data)) | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "Numbers": [ | ||||
|                 17007000002000191, | ||||
|                 17007000002000191, | ||||
|                 17007000002000191, | ||||
|                 17007000002000191, | ||||
|                 17007000002000192, | ||||
|                 17007000002000192, | ||||
|                 17007000002000192, | ||||
|                 17007000002000192, | ||||
|                 17007000002000192, | ||||
|                 17007000002000194, | ||||
|             ] | ||||
|         } | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
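|  | ||||
|  | ||||
| # Illustrative sketch (editor's addition): the values above differ only | ||||
| # below float64's 53-bit mantissa, so it is int64 parsing that keeps | ||||
| # them distinct. | ||||
| def _demo_int_beyond_float_precision():  # hypothetical helper name | ||||
|     import pandas as pd | ||||
|     from io import StringIO | ||||
|  | ||||
|     df = pd.read_csv(StringIO("n\n17007000002000191\n17007000002000192")) | ||||
|     assert df["n"].dtype == "int64" | ||||
|     assert df["n"].nunique() == 2  # 17007000002000191 > 2**53 | ||||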
|  | ||||
|  | ||||
| @pytest.mark.parametrize("sep", [" ", r"\s+"]) | ||||
| def test_integer_overflow_bug(all_parsers, sep): | ||||
|     # see gh-2601 | ||||
|     data = "65248E10 11\n55555E55 22\n" | ||||
|     parser = all_parsers | ||||
|     if parser.engine == "pyarrow" and sep != " ": | ||||
|         msg = "the 'pyarrow' engine does not support regex separators" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             parser.read_csv(StringIO(data), header=None, sep=sep) | ||||
|         return | ||||
|  | ||||
|     result = parser.read_csv(StringIO(data), header=None, sep=sep) | ||||
|     expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_int64_min_issues(all_parsers): | ||||
|     # see gh-2599 | ||||
|     parser = all_parsers | ||||
|     data = "A,B\n0,0\n0," | ||||
|     result = parser.read_csv(StringIO(data)) | ||||
|  | ||||
|     expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) | ||||
| def test_int64_overflow(all_parsers, conv, request): | ||||
|     data = """ID | ||||
| 00013007854817840016671868 | ||||
| 00013007854817840016749251 | ||||
| 00013007854817840016754630 | ||||
| 00013007854817840016781876 | ||||
| 00013007854817840017028824 | ||||
| 00013007854817840017963235 | ||||
| 00013007854817840018860166""" | ||||
|     parser = all_parsers | ||||
|  | ||||
|     if conv is None: | ||||
|         # 13007854817840016671868 > UINT64_MAX, so this | ||||
|         # will overflow and return object as the dtype. | ||||
|         if parser.engine == "pyarrow": | ||||
|             mark = pytest.mark.xfail(reason="parses to float64") | ||||
|             request.applymarker(mark) | ||||
|  | ||||
|         result = parser.read_csv(StringIO(data)) | ||||
|         expected = DataFrame( | ||||
|             [ | ||||
|                 "00013007854817840016671868", | ||||
|                 "00013007854817840016749251", | ||||
|                 "00013007854817840016754630", | ||||
|                 "00013007854817840016781876", | ||||
|                 "00013007854817840017028824", | ||||
|                 "00013007854817840017963235", | ||||
|                 "00013007854817840018860166", | ||||
|             ], | ||||
|             columns=["ID"], | ||||
|         ) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|     else: | ||||
|         # 13007854817840016671868 > UINT64_MAX, so attempts | ||||
|         # to cast to either int64 or uint64 will result in | ||||
|         # an OverflowError being raised. | ||||
|         msg = "|".join( | ||||
|             [ | ||||
|                 "Python int too large to convert to C long", | ||||
|                 "long too big to convert", | ||||
|                 "int too big to convert", | ||||
|             ] | ||||
|         ) | ||||
|         err = OverflowError | ||||
|         if parser.engine == "pyarrow": | ||||
|             err = ValueError | ||||
|             msg = "The 'converters' option is not supported with the 'pyarrow' engine" | ||||
|  | ||||
|         with pytest.raises(err, match=msg): | ||||
|             parser.read_csv(StringIO(data), converters={"ID": conv}) | ||||
|  | ||||
|  | ||||
| @skip_pyarrow  # CSV parse error: Empty CSV file or block | ||||
| @pytest.mark.parametrize( | ||||
|     "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] | ||||
| ) | ||||
| def test_int64_uint64_range(all_parsers, val): | ||||
|     # These numbers fall right inside the int64-uint64 | ||||
|     # range, so they can still be parsed as integers without overflow. | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(str(val)), header=None) | ||||
|  | ||||
|     expected = DataFrame([val]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @skip_pyarrow  # CSV parse error: Empty CSV file or block | ||||
| @pytest.mark.parametrize( | ||||
|     "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] | ||||
| ) | ||||
| def test_outside_int64_uint64_range(all_parsers, val): | ||||
|     # These numbers fall just outside the int64-uint64 | ||||
|     # range, so they should be parsed as string. | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(str(val)), header=None) | ||||
|  | ||||
|     expected = DataFrame([str(val)]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @xfail_pyarrow  # gets float64 dtype instead of object | ||||
| @pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]]) | ||||
| def test_numeric_range_too_wide(all_parsers, exp_data): | ||||
|     # No numerical dtype can hold both negative and uint64 | ||||
|     # values, so they should be cast as string. | ||||
|     parser = all_parsers | ||||
|     data = "\n".join(exp_data) | ||||
|     expected = DataFrame(exp_data) | ||||
|  | ||||
|     result = parser.read_csv(StringIO(data), header=None) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_integer_precision(all_parsers): | ||||
|     # Gh 7072 | ||||
|     s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 | ||||
| 5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" | ||||
|     parser = all_parsers | ||||
|     result = parser.read_csv(StringIO(s), header=None)[4] | ||||
|     expected = Series([4321583677327450765, 4321113141090630389], name=4) | ||||
|     tm.assert_series_equal(result, expected) | ||||
| @ -0,0 +1,134 @@ | ||||
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from io import StringIO | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     concat, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
|  | ||||
| def test_iterator(all_parsers): | ||||
|     # see gh-6607 | ||||
|     data = """index,A,B,C,D | ||||
| foo,2,3,4,5 | ||||
| bar,7,8,9,10 | ||||
| baz,12,13,14,15 | ||||
| qux,12,13,14,15 | ||||
| foo2,12,13,14,15 | ||||
| bar2,12,13,14,15 | ||||
| """ | ||||
|     parser = all_parsers | ||||
|     kwargs = {"index_col": 0} | ||||
|  | ||||
|     expected = parser.read_csv(StringIO(data), **kwargs) | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'iterator' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             parser.read_csv(StringIO(data), iterator=True, **kwargs) | ||||
|         return | ||||
|  | ||||
|     with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: | ||||
|         first_chunk = reader.read(3) | ||||
|         tm.assert_frame_equal(first_chunk, expected[:3]) | ||||
|  | ||||
|         last_chunk = reader.read(5) | ||||
|     tm.assert_frame_equal(last_chunk, expected[3:]) | ||||
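|  | ||||
|  | ||||
| # Illustrative sketch (editor's addition): with iterator=True the returned | ||||
| # reader supports both read(n) and get_chunk(n) for pulling rows on demand. | ||||
| def _demo_iterator_reads():  # hypothetical helper name | ||||
|     import pandas as pd | ||||
|     from io import StringIO | ||||
|  | ||||
|     with pd.read_csv(StringIO("a\n1\n2\n3\n4"), iterator=True) as reader: | ||||
|         head = reader.read(1) | ||||
|         rest = reader.get_chunk(3) | ||||
|     assert len(head) == 1 and len(rest) == 3 | ||||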
|  | ||||
|  | ||||
| def test_iterator2(all_parsers): | ||||
|     parser = all_parsers | ||||
|     data = """A,B,C | ||||
| foo,1,2,3 | ||||
| bar,4,5,6 | ||||
| baz,7,8,9 | ||||
| """ | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'iterator' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             parser.read_csv(StringIO(data), iterator=True) | ||||
|         return | ||||
|  | ||||
|     with parser.read_csv(StringIO(data), iterator=True) as reader: | ||||
|         result = list(reader) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         [[1, 2, 3], [4, 5, 6], [7, 8, 9]], | ||||
|         index=["foo", "bar", "baz"], | ||||
|         columns=["A", "B", "C"], | ||||
|     ) | ||||
|     tm.assert_frame_equal(result[0], expected) | ||||
|  | ||||
|  | ||||
| def test_iterator_stop_on_chunksize(all_parsers): | ||||
|     # gh-3967: stopping iteration when chunksize is specified | ||||
|     parser = all_parsers | ||||
|     data = """A,B,C | ||||
| foo,1,2,3 | ||||
| bar,4,5,6 | ||||
| baz,7,8,9 | ||||
| """ | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             parser.read_csv(StringIO(data), chunksize=1) | ||||
|         return | ||||
|  | ||||
|     with parser.read_csv(StringIO(data), chunksize=1) as reader: | ||||
|         result = list(reader) | ||||
|  | ||||
|     assert len(result) == 3 | ||||
|     expected = DataFrame( | ||||
|         [[1, 2, 3], [4, 5, 6], [7, 8, 9]], | ||||
|         index=["foo", "bar", "baz"], | ||||
|         columns=["A", "B", "C"], | ||||
|     ) | ||||
|     tm.assert_frame_equal(concat(result), expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize( | ||||
|     "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] | ||||
| ) | ||||
| def test_iterator_skipfooter_errors(all_parsers, kwargs): | ||||
|     msg = "'skipfooter' not supported for iteration" | ||||
|     parser = all_parsers | ||||
|     data = "a\n1\n2" | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = ( | ||||
|             "The '(chunksize|iterator)' option is not supported with the " | ||||
|             "'pyarrow' engine" | ||||
|         ) | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _: | ||||
|             pass | ||||
|  | ||||
|  | ||||
| def test_iteration_open_handle(all_parsers): | ||||
|     parser = all_parsers | ||||
|     kwargs = {"header": None} | ||||
|  | ||||
|     with tm.ensure_clean() as path: | ||||
|         with open(path, "w", encoding="utf-8") as f: | ||||
|             f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") | ||||
|  | ||||
|         with open(path, encoding="utf-8") as f: | ||||
|             for line in f: | ||||
|                 if "CCC" in line: | ||||
|                     break | ||||
|  | ||||
|             result = parser.read_csv(f, **kwargs) | ||||
|             expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]}) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,320 @@ | ||||
| """ | ||||
| Tests that work on the Python, C and PyArrow engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| import codecs | ||||
| import csv | ||||
| from io import StringIO | ||||
| import os | ||||
| from pathlib import Path | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas.compat import PY311 | ||||
| from pandas.errors import ( | ||||
|     EmptyDataError, | ||||
|     ParserError, | ||||
|     ParserWarning, | ||||
| ) | ||||
|  | ||||
| from pandas import DataFrame | ||||
| import pandas._testing as tm | ||||
|  | ||||
| xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") | ||||
| skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") | ||||
|  | ||||
|  | ||||
| def test_empty_decimal_marker(all_parsers): | ||||
|     data = """A|B|C | ||||
| 1|2,334|5 | ||||
| 10|13|10. | ||||
| """ | ||||
|     # Parsers support only length-1 decimals | ||||
|     msg = "Only length-1 decimal markers supported" | ||||
|     parser = all_parsers | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = ( | ||||
|             "only single character unicode strings can be " | ||||
|             "converted to Py_UCS4, got length 0" | ||||
|         ) | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         parser.read_csv(StringIO(data), decimal="") | ||||
|  | ||||
|  | ||||
| def test_bad_stream_exception(all_parsers, csv_dir_path): | ||||
|     # see gh-13652 | ||||
|     # | ||||
|     # This test validates that both the Python and C engines raise | ||||
|     # UnicodeDecodeError instead of the C engine raising ParserError | ||||
|     # and swallowing the exception that caused the read to fail. | ||||
|     path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") | ||||
|     utf8 = codecs.lookup("utf-8") | ||||
|     parser = all_parsers | ||||
|     msg = "'utf-8' codec can't decode byte" | ||||
|  | ||||
|     # Stream must be binary UTF8. | ||||
|     with open(path, "rb") as handle, codecs.StreamRecoder( | ||||
|         handle, utf8.encode, utf8.decode, utf8.streamreader, utf8.streamwriter | ||||
|     ) as stream: | ||||
|         with pytest.raises(UnicodeDecodeError, match=msg): | ||||
|             parser.read_csv(stream) | ||||
|  | ||||
|  | ||||
| def test_malformed(all_parsers): | ||||
|     # see gh-6607 | ||||
|     parser = all_parsers | ||||
|     data = """ignore | ||||
| A,B,C | ||||
| 1,2,3 # comment | ||||
| 1,2,3,4,5 | ||||
| 2,3,4 | ||||
| """ | ||||
|     msg = "Expected 3 fields in line 4, saw 5" | ||||
|     err = ParserError | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'comment' option is not supported with the 'pyarrow' engine" | ||||
|         err = ValueError | ||||
|     with pytest.raises(err, match=msg): | ||||
|         parser.read_csv(StringIO(data), header=1, comment="#") | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("nrows", [5, 3, None]) | ||||
| def test_malformed_chunks(all_parsers, nrows): | ||||
|     data = """ignore | ||||
| A,B,C | ||||
| skip | ||||
| 1,2,3 | ||||
| 3,5,10 # comment | ||||
| 1,2,3,4,5 | ||||
| 2,3,4 | ||||
| """ | ||||
|     parser = all_parsers | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'iterator' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             parser.read_csv( | ||||
|                 StringIO(data), | ||||
|                 header=1, | ||||
|                 comment="#", | ||||
|                 iterator=True, | ||||
|                 chunksize=1, | ||||
|                 skiprows=[2], | ||||
|             ) | ||||
|         return | ||||
|  | ||||
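|     # The reported line number counts physical lines of the input, so the | ||||
|     # bad row "1,2,3,4,5" is line 6 even though earlier rows were skipped. | ||||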
|     msg = "Expected 3 fields in line 6, saw 5" | ||||
|     with parser.read_csv( | ||||
|         StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] | ||||
|     ) as reader: | ||||
|         with pytest.raises(ParserError, match=msg): | ||||
|             reader.read(nrows) | ||||
|  | ||||
|  | ||||
| @xfail_pyarrow  # does not raise | ||||
| def test_catch_too_many_names(all_parsers): | ||||
|     # see gh-5156 | ||||
|     data = """\ | ||||
| 1,2,3 | ||||
| 4,,6 | ||||
| 7,8,9 | ||||
| 10,11,12\n""" | ||||
|     parser = all_parsers | ||||
|     msg = ( | ||||
|         "Too many columns specified: expected 4 and found 3" | ||||
|         if parser.engine == "c" | ||||
|         else "Number of passed names did not match " | ||||
|         "number of header fields in the file" | ||||
|     ) | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) | ||||
|  | ||||
|  | ||||
| @skip_pyarrow  # CSV parse error: Empty CSV file or block | ||||
| @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) | ||||
| def test_raise_on_no_columns(all_parsers, nrows): | ||||
|     parser = all_parsers | ||||
|     data = "\n" * nrows | ||||
|  | ||||
|     msg = "No columns to parse from file" | ||||
|     with pytest.raises(EmptyDataError, match=msg): | ||||
|         parser.read_csv(StringIO(data)) | ||||
|  | ||||
|  | ||||
| def test_unexpected_keyword_parameter_exception(all_parsers): | ||||
|     # GH-34976 | ||||
|     parser = all_parsers | ||||
|  | ||||
|     msg = "{}\\(\\) got an unexpected keyword argument 'foo'" | ||||
|     with pytest.raises(TypeError, match=msg.format("read_csv")): | ||||
|         parser.read_csv("foo.csv", foo=1) | ||||
|     with pytest.raises(TypeError, match=msg.format("read_table")): | ||||
|         parser.read_table("foo.tsv", foo=1) | ||||
|  | ||||
|  | ||||
| def test_suppress_error_output(all_parsers): | ||||
|     # see gh-15925 | ||||
|     parser = all_parsers | ||||
|     data = "a\n1\n1,2,3\n4\n5,6,7" | ||||
|     expected = DataFrame({"a": [1, 4]}) | ||||
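|     # The malformed rows ("1,2,3" and "5,6,7") are dropped silently when | ||||
|     # on_bad_lines="skip". | ||||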
|  | ||||
|     result = parser.read_csv(StringIO(data), on_bad_lines="skip") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_error_bad_lines(all_parsers): | ||||
|     # see gh-15925 | ||||
|     parser = all_parsers | ||||
|     data = "a\n1\n1,2,3\n4\n5,6,7" | ||||
|  | ||||
|     msg = "Expected 1 fields in line 3, saw 3" | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         # "CSV parse error: Expected 1 columns, got 3: 1,2,3" | ||||
|         pytest.skip(reason="https://github.com/apache/arrow/issues/38676") | ||||
|  | ||||
|     with pytest.raises(ParserError, match=msg): | ||||
|         parser.read_csv(StringIO(data), on_bad_lines="error") | ||||
|  | ||||
|  | ||||
| def test_warn_bad_lines(all_parsers): | ||||
|     # see gh-15925 | ||||
|     parser = all_parsers | ||||
|     data = "a\n1\n1,2,3\n4\n5,6,7" | ||||
|     expected = DataFrame({"a": [1, 4]}) | ||||
|     match_msg = "Skipping line" | ||||
|  | ||||
|     expected_warning = ParserWarning | ||||
|     if parser.engine == "pyarrow": | ||||
|         match_msg = "Expected 1 columns, but found 3: 1,2,3" | ||||
|         expected_warning = (ParserWarning, DeprecationWarning) | ||||
|  | ||||
|     with tm.assert_produces_warning( | ||||
|         expected_warning, match=match_msg, check_stacklevel=False | ||||
|     ): | ||||
|         result = parser.read_csv(StringIO(data), on_bad_lines="warn") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_read_csv_wrong_num_columns(all_parsers): | ||||
|     # The second data row has too many fields for the six-column header. | ||||
|     data = """A,B,C,D,E,F | ||||
| 1,2,3,4,5,6 | ||||
| 6,7,8,9,10,11,12 | ||||
| 11,12,13,14,15,16 | ||||
| """ | ||||
|     parser = all_parsers | ||||
|     msg = "Expected 6 fields in line 3, saw 7" | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         # Expected 6 columns, got 7: 6,7,8,9,10,11,12 | ||||
|         pytest.skip(reason="https://github.com/apache/arrow/issues/38676") | ||||
|  | ||||
|     with pytest.raises(ParserError, match=msg): | ||||
|         parser.read_csv(StringIO(data)) | ||||
|  | ||||
|  | ||||
| def test_null_byte_char(request, all_parsers): | ||||
|     # see gh-2741 | ||||
|     data = "\x00,foo" | ||||
|     names = ["a", "b"] | ||||
|     parser = all_parsers | ||||
|  | ||||
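|     # The C engine parses the NUL byte as a missing value. On Python | ||||
|     # 3.11+ the python engine reads it as an empty string (hence the | ||||
|     # xfail); on older Pythons it raises "NULL byte detected". | ||||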
|     if parser.engine == "c" or (parser.engine == "python" and PY311): | ||||
|         if parser.engine == "python" and PY311: | ||||
|             request.applymarker( | ||||
|                 pytest.mark.xfail( | ||||
|                     reason="In Python 3.11, this is read as an empty string, not null" | ||||
|                 ) | ||||
|             ) | ||||
|         expected = DataFrame([[np.nan, "foo"]], columns=names) | ||||
|         out = parser.read_csv(StringIO(data), names=names) | ||||
|         tm.assert_frame_equal(out, expected) | ||||
|     else: | ||||
|         if parser.engine == "pyarrow": | ||||
|             # "CSV parse error: Empty CSV file or block: | ||||
|             # cannot infer number of columns" | ||||
|             pytest.skip(reason="https://github.com/apache/arrow/issues/38676") | ||||
|         else: | ||||
|             msg = "NULL byte detected" | ||||
|         with pytest.raises(ParserError, match=msg): | ||||
|             parser.read_csv(StringIO(data), names=names) | ||||
|  | ||||
|  | ||||
| @pytest.mark.filterwarnings("always::ResourceWarning") | ||||
| def test_open_file(request, all_parsers): | ||||
|     # GH 39024 | ||||
|     parser = all_parsers | ||||
|  | ||||
|     msg = "Could not determine delimiter" | ||||
|     err = csv.Error | ||||
|     if parser.engine == "c": | ||||
|         msg = "the 'c' engine does not support sep=None with delim_whitespace=False" | ||||
|         err = ValueError | ||||
|     elif parser.engine == "pyarrow": | ||||
|         msg = ( | ||||
|             "the 'pyarrow' engine does not support sep=None with delim_whitespace=False" | ||||
|         ) | ||||
|         err = ValueError | ||||
|  | ||||
|     with tm.ensure_clean() as path: | ||||
|         file = Path(path) | ||||
|         file.write_bytes(b"\xe4\na\n1") | ||||
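|         # b"\xe4" is not valid UTF-8 on its own, and sep=None makes every | ||||
|         # engine raise; the file handle must still be closed afterwards. | ||||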
|  | ||||
|         with tm.assert_produces_warning(None): | ||||
|             # should not trigger a ResourceWarning | ||||
|             with pytest.raises(err, match=msg): | ||||
|                 parser.read_csv(file, sep=None, encoding_errors="replace") | ||||
|  | ||||
|  | ||||
| def test_invalid_on_bad_line(all_parsers): | ||||
|     parser = all_parsers | ||||
|     data = "a\n1\n1,2,3\n4\n5,6,7" | ||||
|     with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"): | ||||
|         parser.read_csv(StringIO(data), on_bad_lines="abc") | ||||
|  | ||||
|  | ||||
| def test_bad_header_uniform_error(all_parsers): | ||||
|     parser = all_parsers | ||||
|     data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n" | ||||
|     msg = "Expected 2 fields in line 2, saw 4" | ||||
|     if parser.engine == "c": | ||||
|         msg = ( | ||||
|             "Could not construct index. Requested to use 1 " | ||||
|             "number of columns, but 3 left to parse." | ||||
|         ) | ||||
|     elif parser.engine == "pyarrow": | ||||
|         # "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4" | ||||
|         pytest.skip(reason="https://github.com/apache/arrow/issues/38676") | ||||
|  | ||||
|     with pytest.raises(ParserError, match=msg): | ||||
|         parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") | ||||
|  | ||||
|  | ||||
| def test_on_bad_lines_warn_correct_formatting(all_parsers): | ||||
|     # see gh-15925 | ||||
|     parser = all_parsers | ||||
|     data = """1,2 | ||||
| a,b | ||||
| a,b,c | ||||
| a,b,d | ||||
| a,b | ||||
| """ | ||||
|     expected = DataFrame({"1": "a", "2": ["b"] * 2}) | ||||
|     match_msg = "Skipping line" | ||||
|  | ||||
|     expected_warning = ParserWarning | ||||
|     if parser.engine == "pyarrow": | ||||
|         match_msg = "Expected 2 columns, but found 3: a,b,c" | ||||
|         expected_warning = (ParserWarning, DeprecationWarning) | ||||
|  | ||||
|     with tm.assert_produces_warning( | ||||
|         expected_warning, match=match_msg, check_stacklevel=False | ||||
|     ): | ||||
|         result = parser.read_csv(StringIO(data), on_bad_lines="warn") | ||||
|     tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,81 @@ | ||||
| """ | ||||
| Tests that work on both the Python and C engines but do not have a | ||||
| specific classification into the other test modules. | ||||
| """ | ||||
| from io import StringIO | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| import pandas._testing as tm | ||||
|  | ||||
| depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" | ||||
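| # Each call below passes the deprecated keyword, so it is wrapped in | ||||
| # assert_produces_warning to catch the FutureWarning as well. | ||||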
|  | ||||
|  | ||||
| def test_verbose_read(all_parsers, capsys): | ||||
|     parser = all_parsers | ||||
|     data = """a,b,c,d | ||||
| one,1,2,3 | ||||
| one,1,2,3 | ||||
| ,1,2,3 | ||||
| one,1,2,3 | ||||
| ,1,2,3 | ||||
| ,1,2,3 | ||||
| one,1,2,3 | ||||
| two,1,2,3""" | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'verbose' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             with tm.assert_produces_warning( | ||||
|                 FutureWarning, match=depr_msg, check_stacklevel=False | ||||
|             ): | ||||
|                 parser.read_csv(StringIO(data), verbose=True) | ||||
|         return | ||||
|  | ||||
|     # Engines are verbose in different ways: the C engine prints timing | ||||
|     # diagnostics, while the python engine reports filled NA counts. | ||||
|     with tm.assert_produces_warning( | ||||
|         FutureWarning, match=depr_msg, check_stacklevel=False | ||||
|     ): | ||||
|         parser.read_csv(StringIO(data), verbose=True) | ||||
|     captured = capsys.readouterr() | ||||
|  | ||||
|     if parser.engine == "c": | ||||
|         assert "Tokenization took:" in captured.out | ||||
|         assert "Parser memory cleanup took:" in captured.out | ||||
|     else:  # Python engine | ||||
|         assert captured.out == "Filled 3 NA values in column a\n" | ||||
|  | ||||
|  | ||||
| def test_verbose_read2(all_parsers, capsys): | ||||
|     parser = all_parsers | ||||
|     data = """a,b,c,d | ||||
| one,1,2,3 | ||||
| two,1,2,3 | ||||
| three,1,2,3 | ||||
| four,1,2,3 | ||||
| five,1,2,3 | ||||
| ,1,2,3 | ||||
| seven,1,2,3 | ||||
| eight,1,2,3""" | ||||
|  | ||||
|     if parser.engine == "pyarrow": | ||||
|         msg = "The 'verbose' option is not supported with the 'pyarrow' engine" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             with tm.assert_produces_warning( | ||||
|                 FutureWarning, match=depr_msg, check_stacklevel=False | ||||
|             ): | ||||
|                 parser.read_csv(StringIO(data), verbose=True, index_col=0) | ||||
|         return | ||||
|  | ||||
|     with tm.assert_produces_warning( | ||||
|         FutureWarning, match=depr_msg, check_stacklevel=False | ||||
|     ): | ||||
|         parser.read_csv(StringIO(data), verbose=True, index_col=0) | ||||
|     captured = capsys.readouterr() | ||||
|  | ||||
|     # Engines are verbose in different ways. | ||||
|     if parser.engine == "c": | ||||
|         assert "Tokenization took:" in captured.out | ||||
|         assert "Parser memory cleanup took:" in captured.out | ||||
|     else:  # Python engine | ||||
|         assert captured.out == "Filled 1 NA values in column a\n" | ||||